1 /* 2 * Copyright © 2011,2012,2014 Google, Inc. 3 * 4 * This is part of HarfBuzz, a text shaping library. 5 * 6 * Permission is hereby granted, without written agreement and without 7 * license or royalty fees, to use, copy, modify, and distribute this 8 * software and its documentation for any purpose, provided that the 9 * above copyright notice and the following two paragraphs appear in 10 * all copies of this software. 11 * 12 * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR 13 * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES 14 * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN 15 * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH 16 * DAMAGE. 17 * 18 * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, 19 * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 20 * FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS 21 * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO 22 * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 23 * 24 * Google Author(s): Behdad Esfahbod 25 */ 26 27 #ifndef HB_UTF_PRIVATE_HH 28 #define HB_UTF_PRIVATE_HH 29 30 #include "hb-private.hh" 31 32 template <typename T, bool validate=true> struct hb_utf_t; 33 34 35 /* UTF-8 */ 36 37 template <> 38 struct hb_utf_t<uint8_t, true> 39 { 40 static inline const uint8_t * nexthb_utf_t41 next (const uint8_t *text, 42 const uint8_t *end, 43 hb_codepoint_t *unicode, 44 hb_codepoint_t replacement) 45 { 46 /* Written to only accept well-formed sequences. 47 * Based on ideas from ICU's U8_NEXT. 48 * Generates one "replacement" for each ill-formed byte. */ 49 50 hb_codepoint_t c = *text++; 51 52 if (c > 0x7Fu) 53 { 54 if (hb_in_range (c, 0xC2u, 0xDFu)) /* Two-byte */ 55 { 56 unsigned int t1; 57 if (likely (text < end && 58 (t1 = text[0] - 0x80u) <= 0x3Fu)) 59 { 60 c = ((c&0x1Fu)<<6) | t1; 61 text++; 62 } 63 else 64 goto error; 65 } 66 else if (hb_in_range (c, 0xE0u, 0xEFu)) /* Three-byte */ 67 { 68 unsigned int t1, t2; 69 if (likely (1 < end - text && 70 (t1 = text[0] - 0x80u) <= 0x3Fu && 71 (t2 = text[1] - 0x80u) <= 0x3Fu)) 72 { 73 c = ((c&0xFu)<<12) | (t1<<6) | t2; 74 if (unlikely (c < 0x0800u || hb_in_range (c, 0xD800u, 0xDFFFu))) 75 goto error; 76 text += 2; 77 } 78 else 79 goto error; 80 } 81 else if (hb_in_range (c, 0xF0u, 0xF4u)) /* Four-byte */ 82 { 83 unsigned int t1, t2, t3; 84 if (likely (2 < end - text && 85 (t1 = text[0] - 0x80u) <= 0x3Fu && 86 (t2 = text[1] - 0x80u) <= 0x3Fu && 87 (t3 = text[2] - 0x80u) <= 0x3Fu)) 88 { 89 c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3; 90 if (unlikely (!hb_in_range (c, 0x10000u, 0x10FFFFu))) 91 goto error; 92 text += 3; 93 } 94 else 95 goto error; 96 } 97 else 98 goto error; 99 } 100 101 *unicode = c; 102 return text; 103 104 error: 105 *unicode = replacement; 106 return text; 107 } 108 109 static inline const uint8_t * prevhb_utf_t110 prev (const uint8_t *text, 111 const uint8_t *start, 112 hb_codepoint_t *unicode, 113 hb_codepoint_t replacement) 114 { 115 const uint8_t *end = text--; 116 while (start < text && (*text & 0xc0) == 0x80 && end - text < 4) 117 text--; 118 119 if (likely (next (text, end, unicode, replacement) == end)) 120 return text; 121 122 *unicode = replacement; 123 return end - 1; 124 } 125 126 static inline unsigned int strlenhb_utf_t127 strlen (const uint8_t *text) 128 { 129 return ::strlen ((const char *) text); 130 } 131 }; 132 133 134 /* UTF-16 */ 135 136 template <> 137 struct hb_utf_t<uint16_t, true> 138 { 139 static inline const uint16_t * nexthb_utf_t140 next (const uint16_t *text, 141 const uint16_t *end, 142 hb_codepoint_t *unicode, 143 hb_codepoint_t replacement) 144 { 145 hb_codepoint_t c = *text++; 146 147 if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu))) 148 { 149 *unicode = c; 150 return text; 151 } 152 153 if (likely (hb_in_range (c, 0xD800u, 0xDBFFu))) 154 { 155 /* High-surrogate in c */ 156 hb_codepoint_t l; 157 if (text < end && ((l = *text), likely (hb_in_range (l, 0xDC00u, 0xDFFFu)))) 158 { 159 /* Low-surrogate in l */ 160 *unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u); 161 text++; 162 return text; 163 } 164 } 165 166 /* Lonely / out-of-order surrogate. */ 167 *unicode = replacement; 168 return text; 169 } 170 171 static inline const uint16_t * prevhb_utf_t172 prev (const uint16_t *text, 173 const uint16_t *start, 174 hb_codepoint_t *unicode, 175 hb_codepoint_t replacement) 176 { 177 const uint16_t *end = text--; 178 hb_codepoint_t c = *text; 179 180 if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu))) 181 { 182 *unicode = c; 183 return text; 184 } 185 186 if (likely (start < text && hb_in_range (c, 0xDC00u, 0xDFFFu))) 187 text--; 188 189 if (likely (next (text, end, unicode, replacement) == end)) 190 return text; 191 192 *unicode = replacement; 193 return end - 1; 194 } 195 196 197 static inline unsigned int strlenhb_utf_t198 strlen (const uint16_t *text) 199 { 200 unsigned int l = 0; 201 while (*text++) l++; 202 return l; 203 } 204 }; 205 206 207 /* UTF-32 */ 208 209 template <bool validate> 210 struct hb_utf_t<uint32_t, validate> 211 { 212 static inline const uint32_t * nexthb_utf_t213 next (const uint32_t *text, 214 const uint32_t *end HB_UNUSED, 215 hb_codepoint_t *unicode, 216 hb_codepoint_t replacement) 217 { 218 hb_codepoint_t c = *text++; 219 if (validate && unlikely (c > 0x10FFFFu || hb_in_range (c, 0xD800u, 0xDFFFu))) 220 goto error; 221 *unicode = c; 222 return text; 223 224 error: 225 *unicode = replacement; 226 return text; 227 } 228 229 static inline const uint32_t * prevhb_utf_t230 prev (const uint32_t *text, 231 const uint32_t *start HB_UNUSED, 232 hb_codepoint_t *unicode, 233 hb_codepoint_t replacement) 234 { 235 next (text - 1, text, unicode, replacement); 236 return text - 1; 237 } 238 239 static inline unsigned int strlenhb_utf_t240 strlen (const uint32_t *text) 241 { 242 unsigned int l = 0; 243 while (*text++) l++; 244 return l; 245 } 246 }; 247 248 249 #endif /* HB_UTF_PRIVATE_HH */ 250