1 /* 2 * Copyright © 2011,2012,2014 Google, Inc. 3 * 4 * This is part of HarfBuzz, a text shaping library. 5 * 6 * Permission is hereby granted, without written agreement and without 7 * license or royalty fees, to use, copy, modify, and distribute this 8 * software and its documentation for any purpose, provided that the 9 * above copyright notice and the following two paragraphs appear in 10 * all copies of this software. 11 * 12 * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR 13 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES 14 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN 15 * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 16 * DAMAGE. 17 * 18 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, 19 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 20 * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS 21 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO 22 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 23 * 24 * Google Author(s): Behdad Esfahbod 25 */ 26 27 #ifndef HB_UTF_PRIVATE_HH 28 #define HB_UTF_PRIVATE_HH 29 30 #include "hb-private.hh" 31 32 33 struct hb_utf8_t 34 { 35 typedef uint8_t codepoint_t; 36 37 static inline const uint8_t * nexthb_utf8_t38 next (const uint8_t *text, 39 const uint8_t *end, 40 hb_codepoint_t *unicode, 41 hb_codepoint_t replacement) 42 { 43 /* Written to only accept well-formed sequences. 44 * Based on ideas from ICU's U8_NEXT. 45 * Generates one "replacement" for each ill-formed byte. */ 46 47 hb_codepoint_t c = *text++; 48 49 if (c > 0x7Fu) 50 { 51 if (hb_in_range (c, 0xC2u, 0xDFu)) /* Two-byte */ 52 { 53 unsigned int t1; 54 if (likely (text < end && 55 (t1 = text[0] - 0x80u) <= 0x3Fu)) 56 { 57 c = ((c&0x1Fu)<<6) | t1; 58 text++; 59 } 60 else 61 goto error; 62 } 63 else if (hb_in_range (c, 0xE0u, 0xEFu)) /* Three-byte */ 64 { 65 unsigned int t1, t2; 66 if (likely (1 < end - text && 67 (t1 = text[0] - 0x80u) <= 0x3Fu && 68 (t2 = text[1] - 0x80u) <= 0x3Fu)) 69 { 70 c = ((c&0xFu)<<12) | (t1<<6) | t2; 71 if (unlikely (c < 0x0800u || hb_in_range (c, 0xD800u, 0xDFFFu))) 72 goto error; 73 text += 2; 74 } 75 else 76 goto error; 77 } 78 else if (hb_in_range (c, 0xF0u, 0xF4u)) /* Four-byte */ 79 { 80 unsigned int t1, t2, t3; 81 if (likely (2 < end - text && 82 (t1 = text[0] - 0x80u) <= 0x3Fu && 83 (t2 = text[1] - 0x80u) <= 0x3Fu && 84 (t3 = text[2] - 0x80u) <= 0x3Fu)) 85 { 86 c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3; 87 if (unlikely (!hb_in_range (c, 0x10000u, 0x10FFFFu))) 88 goto error; 89 text += 3; 90 } 91 else 92 goto error; 93 } 94 else 95 goto error; 96 } 97 98 *unicode = c; 99 return text; 100 101 error: 102 *unicode = replacement; 103 return text; 104 } 105 106 static inline const uint8_t * prevhb_utf8_t107 prev (const uint8_t *text, 108 const uint8_t *start, 109 hb_codepoint_t *unicode, 110 hb_codepoint_t replacement) 111 { 112 const uint8_t *end = text--; 113 while (start < text && (*text & 0xc0) == 0x80 && end - text < 4) 114 text--; 115 116 if (likely (next (text, end, unicode, replacement) == end)) 117 return text; 118 119 *unicode = replacement; 120 return end - 1; 121 } 122 123 static inline unsigned int strlenhb_utf8_t124 strlen (const uint8_t *text) 125 { 126 return ::strlen ((const char *) text); 127 } 128 }; 129 130 131 struct hb_utf16_t 132 { 133 typedef uint16_t codepoint_t; 134 135 static inline const uint16_t * nexthb_utf16_t136 next (const uint16_t *text, 137 const uint16_t *end, 138 hb_codepoint_t *unicode, 139 hb_codepoint_t replacement) 140 { 141 hb_codepoint_t c = *text++; 142 143 if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu))) 144 { 145 *unicode = c; 146 return text; 147 } 148 149 if (likely (c <= 0xDBFFu && text < end)) 150 { 151 /* High-surrogate in c */ 152 hb_codepoint_t l = *text; 153 if (likely (hb_in_range (l, 0xDC00u, 0xDFFFu))) 154 { 155 /* Low-surrogate in l */ 156 *unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u); 157 text++; 158 return text; 159 } 160 } 161 162 /* Lonely / out-of-order surrogate. */ 163 *unicode = replacement; 164 return text; 165 } 166 167 static inline const uint16_t * prevhb_utf16_t168 prev (const uint16_t *text, 169 const uint16_t *start, 170 hb_codepoint_t *unicode, 171 hb_codepoint_t replacement) 172 { 173 hb_codepoint_t c = *--text; 174 175 if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu))) 176 { 177 *unicode = c; 178 return text; 179 } 180 181 if (likely (c >= 0xDC00u && start < text)) 182 { 183 /* Low-surrogate in c */ 184 hb_codepoint_t h = text[-1]; 185 if (likely (hb_in_range (h, 0xD800u, 0xDBFFu))) 186 { 187 /* High-surrogate in h */ 188 *unicode = (h << 10) + c - ((0xD800u << 10) - 0x10000u + 0xDC00u); 189 text--; 190 return text; 191 } 192 } 193 194 /* Lonely / out-of-order surrogate. */ 195 *unicode = replacement; 196 return text; 197 } 198 199 200 static inline unsigned int strlenhb_utf16_t201 strlen (const uint16_t *text) 202 { 203 unsigned int l = 0; 204 while (*text++) l++; 205 return l; 206 } 207 }; 208 209 210 template <bool validate=true> 211 struct hb_utf32_t 212 { 213 typedef uint32_t codepoint_t; 214 215 static inline const uint32_t * nexthb_utf32_t216 next (const uint32_t *text, 217 const uint32_t *end HB_UNUSED, 218 hb_codepoint_t *unicode, 219 hb_codepoint_t replacement) 220 { 221 hb_codepoint_t c = *unicode = *text++; 222 if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu))) 223 *unicode = replacement; 224 return text; 225 } 226 227 static inline const uint32_t * prevhb_utf32_t228 prev (const uint32_t *text, 229 const uint32_t *start HB_UNUSED, 230 hb_codepoint_t *unicode, 231 hb_codepoint_t replacement) 232 { 233 hb_codepoint_t c = *unicode = *--text; 234 if (validate && unlikely (c >= 0xD800u && (c <= 0xDFFFu || c > 0x10FFFFu))) 235 *unicode = replacement; 236 return text; 237 } 238 239 static inline unsigned int strlenhb_utf32_t240 strlen (const uint32_t *text) 241 { 242 unsigned int l = 0; 243 while (*text++) l++; 244 return l; 245 } 246 }; 247 248 249 struct hb_latin1_t 250 { 251 typedef uint8_t codepoint_t; 252 253 static inline const uint8_t * nexthb_latin1_t254 next (const uint8_t *text, 255 const uint8_t *end HB_UNUSED, 256 hb_codepoint_t *unicode, 257 hb_codepoint_t replacement HB_UNUSED) 258 { 259 *unicode = *text++; 260 return text; 261 } 262 263 static inline const uint8_t * prevhb_latin1_t264 prev (const uint8_t *text, 265 const uint8_t *start HB_UNUSED, 266 hb_codepoint_t *unicode, 267 hb_codepoint_t replacement) 268 { 269 *unicode = *--text; 270 return text; 271 } 272 273 static inline unsigned int strlenhb_latin1_t274 strlen (const uint8_t *text) 275 { 276 unsigned int l = 0; 277 while (*text++) l++; 278 return l; 279 } 280 }; 281 282 #endif /* HB_UTF_PRIVATE_HH */ 283