1 /*
2  * Copyright © 2011,2012,2014  Google, Inc.
3  *
4  *  This is part of HarfBuzz, a text shaping library.
5  *
6  * Permission is hereby granted, without written agreement and without
7  * license or royalty fees, to use, copy, modify, and distribute this
8  * software and its documentation for any purpose, provided that the
9  * above copyright notice and the following two paragraphs appear in
10  * all copies of this software.
11  *
12  * IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE TO ANY PARTY FOR
13  * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
14  * ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN
15  * IF THE COPYRIGHT HOLDER HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
16  * DAMAGE.
17  *
18  * THE COPYRIGHT HOLDER SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING,
19  * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
20  * FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS
21  * ON AN "AS IS" BASIS, AND THE COPYRIGHT HOLDER HAS NO OBLIGATION TO
22  * PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
23  *
24  * Google Author(s): Behdad Esfahbod
25  */
26 
27 #ifndef HB_UTF_PRIVATE_HH
28 #define HB_UTF_PRIVATE_HH
29 
30 #include "hb-private.hh"
31 
/* Primary template for the UTF decoding helpers; only declared here.
 * T is the code-unit type of the encoding; this file specializes it for
 * uint8_t (UTF-8), uint16_t (UTF-16) and uint32_t (UTF-32).
 * "validate" defaults to true.  The UTF-8 and UTF-16 specializations in
 * this file exist only for validate=true; the UTF-32 one accepts either
 * value.  Each specialization provides static next(), prev() and strlen(). */
template <typename T, bool validate=true> struct hb_utf_t;
33 
34 
35 /* UTF-8 */
36 
37 template <>
38 struct hb_utf_t<uint8_t, true>
39 {
40   static inline const uint8_t *
nexthb_utf_t41   next (const uint8_t *text,
42 	const uint8_t *end,
43 	hb_codepoint_t *unicode,
44 	hb_codepoint_t replacement)
45   {
46     /* Written to only accept well-formed sequences.
47      * Based on ideas from ICU's U8_NEXT.
48      * Generates one "replacement" for each ill-formed byte. */
49 
50     hb_codepoint_t c = *text++;
51 
52     if (c > 0x7Fu)
53     {
54       if (hb_in_range (c, 0xC2u, 0xDFu)) /* Two-byte */
55       {
56 	unsigned int t1;
57 	if (likely (text < end &&
58 		    (t1 = text[0] - 0x80u) <= 0x3Fu))
59 	{
60 	  c = ((c&0x1Fu)<<6) | t1;
61 	  text++;
62 	}
63 	else
64 	  goto error;
65       }
66       else if (hb_in_range (c, 0xE0u, 0xEFu)) /* Three-byte */
67       {
68 	unsigned int t1, t2;
69 	if (likely (1 < end - text &&
70 		    (t1 = text[0] - 0x80u) <= 0x3Fu &&
71 		    (t2 = text[1] - 0x80u) <= 0x3Fu))
72 	{
73 	  c = ((c&0xFu)<<12) | (t1<<6) | t2;
74 	  if (unlikely (c < 0x0800u || hb_in_range (c, 0xD800u, 0xDFFFu)))
75 	    goto error;
76 	  text += 2;
77 	}
78 	else
79 	  goto error;
80       }
81       else if (hb_in_range (c, 0xF0u, 0xF4u)) /* Four-byte */
82       {
83 	unsigned int t1, t2, t3;
84 	if (likely (2 < end - text &&
85 		    (t1 = text[0] - 0x80u) <= 0x3Fu &&
86 		    (t2 = text[1] - 0x80u) <= 0x3Fu &&
87 		    (t3 = text[2] - 0x80u) <= 0x3Fu))
88 	{
89 	  c = ((c&0x7u)<<18) | (t1<<12) | (t2<<6) | t3;
90 	  if (unlikely (!hb_in_range (c, 0x10000u, 0x10FFFFu)))
91 	    goto error;
92 	  text += 3;
93 	}
94 	else
95 	  goto error;
96       }
97       else
98 	goto error;
99     }
100 
101     *unicode = c;
102     return text;
103 
104   error:
105     *unicode = replacement;
106     return text;
107   }
108 
109   static inline const uint8_t *
prevhb_utf_t110   prev (const uint8_t *text,
111 	const uint8_t *start,
112 	hb_codepoint_t *unicode,
113 	hb_codepoint_t replacement)
114   {
115     const uint8_t *end = text--;
116     while (start < text && (*text & 0xc0) == 0x80 && end - text < 4)
117       text--;
118 
119     if (likely (next (text, end, unicode, replacement) == end))
120       return text;
121 
122     *unicode = replacement;
123     return end - 1;
124   }
125 
126   static inline unsigned int
strlenhb_utf_t127   strlen (const uint8_t *text)
128   {
129     return ::strlen ((const char *) text);
130   }
131 };
132 
133 
134 /* UTF-16 */
135 
136 template <>
137 struct hb_utf_t<uint16_t, true>
138 {
139   static inline const uint16_t *
nexthb_utf_t140   next (const uint16_t *text,
141 	const uint16_t *end,
142 	hb_codepoint_t *unicode,
143 	hb_codepoint_t replacement)
144   {
145     hb_codepoint_t c = *text++;
146 
147     if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu)))
148     {
149       *unicode = c;
150       return text;
151     }
152 
153     if (likely (hb_in_range (c, 0xD800u, 0xDBFFu)))
154     {
155       /* High-surrogate in c */
156       hb_codepoint_t l;
157       if (text < end && ((l = *text), likely (hb_in_range (l, 0xDC00u, 0xDFFFu))))
158       {
159 	/* Low-surrogate in l */
160 	*unicode = (c << 10) + l - ((0xD800u << 10) - 0x10000u + 0xDC00u);
161 	 text++;
162 	 return text;
163       }
164     }
165 
166     /* Lonely / out-of-order surrogate. */
167     *unicode = replacement;
168     return text;
169   }
170 
171   static inline const uint16_t *
prevhb_utf_t172   prev (const uint16_t *text,
173 	const uint16_t *start,
174 	hb_codepoint_t *unicode,
175 	hb_codepoint_t replacement)
176   {
177     const uint16_t *end = text--;
178     hb_codepoint_t c = *text;
179 
180     if (likely (!hb_in_range (c, 0xD800u, 0xDFFFu)))
181     {
182       *unicode = c;
183       return text;
184     }
185 
186     if (likely (start < text && hb_in_range (c, 0xDC00u, 0xDFFFu)))
187       text--;
188 
189     if (likely (next (text, end, unicode, replacement) == end))
190       return text;
191 
192     *unicode = replacement;
193     return end - 1;
194   }
195 
196 
197   static inline unsigned int
strlenhb_utf_t198   strlen (const uint16_t *text)
199   {
200     unsigned int l = 0;
201     while (*text++) l++;
202     return l;
203   }
204 };
205 
206 
207 /* UTF-32 */
208 
209 template <bool validate>
210 struct hb_utf_t<uint32_t, validate>
211 {
212   static inline const uint32_t *
nexthb_utf_t213   next (const uint32_t *text,
214 	const uint32_t *end HB_UNUSED,
215 	hb_codepoint_t *unicode,
216 	hb_codepoint_t replacement)
217   {
218     hb_codepoint_t c = *text++;
219     if (validate && unlikely (c > 0x10FFFFu || hb_in_range (c, 0xD800u, 0xDFFFu)))
220       goto error;
221     *unicode = c;
222     return text;
223 
224   error:
225     *unicode = replacement;
226     return text;
227   }
228 
229   static inline const uint32_t *
prevhb_utf_t230   prev (const uint32_t *text,
231 	const uint32_t *start HB_UNUSED,
232 	hb_codepoint_t *unicode,
233 	hb_codepoint_t replacement)
234   {
235     next (text - 1, text, unicode, replacement);
236     return text - 1;
237   }
238 
239   static inline unsigned int
strlenhb_utf_t240   strlen (const uint32_t *text)
241   {
242     unsigned int l = 0;
243     while (*text++) l++;
244     return l;
245   }
246 };
247 
248 
249 #endif /* HB_UTF_PRIVATE_HH */
250