1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ******************************************************************************
5 *
6 *   Copyright (C) 1999-2015, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 ******************************************************************************
10 */
11 
12 #ifndef BASE_THIRD_PARTY_ICU_ICU_UTF_H_
13 #define BASE_THIRD_PARTY_ICU_ICU_UTF_H_
14 
15 #include <stdint.h>
16 
17 namespace base_icu {
18 
19 // source/common/unicode/umachine.h
20 
/**
 * Boolean type used by the ICU-derived code in this header.
 * It is an 8-bit signed integer rather than the built-in bool.
 * @stable ICU 2.0
 */
using UBool = int8_t;
23 
/**
 * Type for a single Unicode code point: a signed 32-bit integer
 * (the same type as int32_t).
 *
 * Valid code points are 0..0x10ffff.
 * Every other value (negative or >=0x110000) is not a Unicode code point
 * and may be used as a sentinel for "done", "error" or a similar
 * non-code point condition.
 *
 * History: before ICU 2.4 (Jitterbug 2146) the definition was
 * platform-dependent: wchar_t where that is 32 bits wide (signed or
 * unsigned), uint32_t otherwise.
 *
 * @see CBU_SENTINEL
 * @stable ICU 2.4
 */
using UChar32 = int32_t;
42 
/**
 * Sentinel value for APIs that take or return single code points (UChar32).
 * The value -1 lies outside of the Unicode code point range 0..0x10ffff,
 * so it cannot collide with real text.
 *
 * For example, a "done" or "error" value in a new API
 * could be indicated with CBU_SENTINEL.
 *
 * ICU APIs designed before ICU 2.4 usually define service-specific "done"
 * values, mostly 0xffff.
 * Those may need to be distinguished from
 * actual U+ffff text contents by calling functions like
 * CharacterIterator::hasNext() or UnicodeString::length().
 *
 * @return -1
 * @see UChar32
 * @stable ICU 2.4
 */
#define CBU_SENTINEL (-1)
62 
63 // source/common/unicode/utf.h
64 
/**
 * Tests whether a code point is a Unicode noncharacter:
 * U+fdd0..U+fdef, or one of the last two code points of a plane
 * (U+__fffe and U+__ffff), up to and including U+10ffff.
 * The argument is evaluated more than once.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU_IS_UNICODE_NONCHAR(c) \
    (0xfdd0<=(c) && \
     (0xfdef>=(c) || 0xfffe==((c)&0xfffe)) && \
     0x10ffff>=(c))
74 
/**
 * Tests whether c is a Unicode code point value (0..U+10ffff)
 * to which a character can be assigned.
 *
 * Values rejected by this test:
 * - single surrogate code points (U+d800..U+dfff, 2048 code points)
 * - the last two code points on each plane (U+__fffe and U+__ffff, 34 code points)
 * - U+fdd0..U+fdef (new with Unicode 3.1, 32 code points)
 * - anything above the highest Unicode code point value U+10ffff
 *
 * Every code point below U+d800 is a character code point, so that
 * boundary is checked first for performance. The unsigned cast makes
 * negative inputs fail that first check.
 * The argument is evaluated more than once.
 *
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU_IS_UNICODE_CHAR(c) \
    (0xd800>(uint32_t)(c) || \
        ((c)>0xdfff && 0x10ffff>=(c) && !CBU_IS_UNICODE_NONCHAR(c)))
95 
/**
 * Tests whether a code point is a surrogate (U+d800..U+dfff).
 * Masks off the low 11 bits and compares the rest against 0xd800.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU_IS_SURROGATE(c) (0xd800==((c)&0xfffff800))
103 
/**
 * Given that c is already known to be a surrogate code point
 * (CBU_IS_SURROGATE(c)), tests whether it is a lead surrogate.
 * Only bit 10 (0x400) is inspected: it is clear for lead surrogates.
 * @param c 32-bit code point
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU_IS_SURROGATE_LEAD(c) (!((c)&0x400))
112 
113 // source/common/unicode/utf8.h
114 
/**
 * Internal bit vector for 3-byte UTF-8 validity check, for use in
 * CBU8_IS_VALID_LEAD3_AND_T1.
 * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
 * Lead byte E0..EF bits 3..0 are used as byte index,
 * first trail byte bits 7..5 are used as bit index into that byte.
 *
 * Reading the table:
 * - index 0 (lead E0) is 0x20: only bit 5 set, i.e. first trail byte A0..BF
 * - index 13 (lead ED) is 0x10: only bit 4 set, i.e. first trail byte 80..9F
 * - every other index is 0x30: bits 4 and 5 set, i.e. first trail byte 80..BF
 *
 * @see CBU8_IS_VALID_LEAD3_AND_T1
 * @internal
 */
#define CBU8_LEAD3_T1_BITS "\x20\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x30\x10\x30\x30"
124 
/**
 * Internal 3-byte UTF-8 validity check.
 * Looks up the byte of CBU8_LEAD3_T1_BITS selected by the low four bits of
 * the lead byte and tests the bit selected by the top three bits of t1.
 * Non-zero if lead byte E0..EF and first trail byte 00..FF start a valid sequence.
 * @internal
 */
#define CBU8_IS_VALID_LEAD3_AND_T1(lead, t1) \
    ((1<<((uint8_t)(t1)>>5)) & CBU8_LEAD3_T1_BITS[(lead)&0xf])
131 
/**
 * Internal bit vector for 4-byte UTF-8 validity check, for use in
 * CBU8_IS_VALID_LEAD4_AND_T1.
 * Each bit indicates whether one lead byte + first trail byte pair starts a valid sequence.
 * First trail byte bits 7..4 are used as byte index,
 * lead byte F0..F4 bits 2..0 are used as bit index into that byte.
 *
 * Reading the table:
 * - index 8 (first trail byte 80..8F) is 0x1E: bits 1..4 set, i.e. lead F1..F4
 * - indexes 9..11 (first trail byte 90..BF) are 0x0F: bits 0..3 set, i.e. lead F0..F3
 * - every other index is 0x00: no lead byte is accepted
 *
 * @see CBU8_IS_VALID_LEAD4_AND_T1
 * @internal
 */
#define CBU8_LEAD4_T1_BITS "\x00\x00\x00\x00\x00\x00\x00\x00\x1E\x0F\x0F\x0F\x00\x00\x00\x00"
141 
/**
 * Internal 4-byte UTF-8 validity check.
 * Looks up the byte of CBU8_LEAD4_T1_BITS selected by the top four bits of
 * t1 and tests the bit selected by the low three bits of the lead byte.
 * Non-zero if lead byte F0..F4 and first trail byte 00..FF start a valid sequence.
 * @internal
 */
#define CBU8_IS_VALID_LEAD4_AND_T1(lead, t1) \
    ((1<<((lead)&7)) & CBU8_LEAD4_T1_BITS[(uint8_t)(t1)>>4])
148 
149 /**
150  * Function for handling "next code point" with error-checking.
151  *
152  * This is internal since it is not meant to be called directly by external clie
153 nts;
154  * however it is U_STABLE (not U_INTERNAL) since it is called by public macros i
155 n this
156  * file and thus must remain stable, and should not be hidden when other interna
157 l
158  * functions are hidden (otherwise public macros would fail to compile).
159  * @internal
160  */
161 UChar32
162 utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, ::base_icu::UChar32 c, ::base_icu::UBool strict);
163 
/**
 * Tests whether a code unit (byte) encodes a code point by itself
 * (US-ASCII 0..0x7f), i.e. whether its high bit 0x80 is clear.
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU8_IS_SINGLE(c) (!((c)&0x80))
171 
/**
 * Tests whether a code unit (byte) is a UTF-8 lead byte (0xC2..0xF4).
 * Subtracting 0xc2 and truncating to 8 bits turns the range check into
 * a single unsigned comparison against 0x32 (= 0xf4 - 0xc2).
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU8_IS_LEAD(c) (0x32>=(uint8_t)((c)-0xc2))
179 
/**
 * Tests whether a code unit (byte) is a UTF-8 trail byte (0x80..0xBF).
 * Reinterpreted as a signed 8-bit value, exactly those bytes are
 * less than -0x40.
 * @param c 8-bit code unit (byte)
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU8_IS_TRAIL(c) (-0x40>(int8_t)(c))
187 
/**
 * Number of code units (bytes) needed for the UTF-8 encoding
 * of a Unicode code point.
 * The checks run from the smallest range upward; the unsigned cast makes
 * negative inputs land in the "not a code point" case.
 * The argument is evaluated more than once.
 * @param c 32-bit code point
 * @return 1..4, or 0 if c is a surrogate or not a Unicode code point
 * @stable ICU 2.4
 */
#define CBU8_LENGTH(c) \
    ((uint32_t)(c)<=0x7f ? 1 : \
     (uint32_t)(c)<=0x7ff ? 2 : \
     (uint32_t)(c)<=0xd7ff ? 3 : \
     ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff) ? 0 : \
     (uint32_t)(c)<=0xffff ? 3 : 4)
205 
/**
 * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff).
 * This matches the largest value CBU8_LENGTH can yield.
 * @return 4
 * @stable ICU 2.4
 */
#define CBU8_MAX_LENGTH 4
212 
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, checks for illegal sequences and for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * The offset may point to the lead byte of a multi-byte sequence,
 * in which case the macro will read the whole sequence.
 * If the offset points to a trail byte or an illegal UTF-8 sequence, then
 * c is set to a negative value.
 *
 * Two- and three-byte sequences are decoded inline; everything else
 * (four-byte sequences, truncated input, illegal bytes) is handed to
 * ::base_icu::utf8_nextCharSafeBody.
 *
 * The macro expands to a brace-enclosed block and evaluates its arguments
 * more than once, so arguments must not have side effects.
 *
 * @param s const uint8_t * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @param c output UChar32 variable, set to <0 in case of an error
 * @see U8_NEXT_UNSAFE
 * @stable ICU 2.4
 */
#define CBU8_NEXT(s, i, length, c) { \
    (c)=(uint8_t)(s)[(i)++]; \
    if(!CBU8_IS_SINGLE(c)) { \
        uint8_t __t1, __t2; \
        if( /* handle U+0800..U+FFFF inline */ \
                (0xe0<=(c) && (c)<0xf0) && \
                (((i)+1)<(length) || (length)<0) && \
                CBU8_IS_VALID_LEAD3_AND_T1((c), __t1=(s)[i]) && \
                (__t2=(s)[(i)+1]-0x80)<=0x3f) { \
            (c)=(((c)&0xf)<<12)|((__t1&0x3f)<<6)|__t2; \
            (i)+=2; \
        } else if( /* handle U+0080..U+07FF inline */ \
                ((c)<0xe0 && (c)>=0xc2) && \
                ((i)!=(length)) && \
                (__t1=(s)[i]-0x80)<=0x3f) { \
            (c)=(((c)&0x1f)<<6)|__t1; \
            ++(i); \
        } else { \
            /* function call for "complicated" and error cases; */ \
            /* s and c are parenthesized so the cast and the argument */ \
            /* apply to the caller's whole expression */ \
            (c)=::base_icu::utf8_nextCharSafeBody((const uint8_t *)(s), &(i), (length), (c), -1); \
        } \
    } \
}
256 
/**
 * Append a code point to a string, overwriting 1 to 4 bytes.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment) once per byte written.
 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
 * Otherwise, the result is undefined.
 *
 * Each branch below writes one complete encoding: 1 byte up to U+007f,
 * 2 bytes up to U+07ff, 3 bytes up to U+ffff, 4 bytes above that.
 * The arguments are evaluated more than once.
 *
 * @param s uint8_t * string buffer (written to)
 * @param i string offset
 * @param c code point to append
 * @see U8_APPEND
 * @stable ICU 2.4
 */
#define CBU8_APPEND_UNSAFE(s, i, c) { \
    if((uint32_t)(c)<=0x7f) { \
        (s)[(i)++]=(uint8_t)(c); \
    } else if((uint32_t)(c)<=0x7ff) { \
        (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } else if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \
        (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } else { \
        (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \
        (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \
        (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \
    } \
}
288 
289 // source/common/unicode/utf16.h
290 
/**
 * Tests whether a code unit alone encodes a code point
 * (BMP, not a surrogate). This is the negation of CBU_IS_SURROGATE.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU16_IS_SINGLE(c) (!CBU_IS_SURROGATE(c))
298 
/**
 * Tests whether a code unit is a lead surrogate (U+d800..U+dbff).
 * Masks off the low 10 bits and compares the rest against 0xd800.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU16_IS_LEAD(c) (0xd800==((c)&0xfffffc00))
306 
/**
 * Tests whether a code unit is a trail surrogate (U+dc00..U+dfff).
 * Masks off the low 10 bits and compares the rest against 0xdc00.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU16_IS_TRAIL(c) (0xdc00==((c)&0xfffffc00))
314 
/**
 * Is this code unit a surrogate (U+d800..U+dfff)?
 * Forwards directly to CBU_IS_SURROGATE, so both lead and trail
 * surrogates match.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU16_IS_SURROGATE(c) CBU_IS_SURROGATE(c)
322 
/**
 * Given that c is already known to be a surrogate code unit
 * (CBU16_IS_SURROGATE(c)), tests whether it is a lead surrogate.
 * Only bit 10 (0x400) is inspected: it is clear for lead surrogates.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define CBU16_IS_SURROGATE_LEAD(c) (!((c)&0x400))
331 
/**
 * Helper constant for CBU16_GET_SUPPLEMENTARY.
 * It combines the lowest lead surrogate shifted into place, the lowest
 * trail surrogate, and the 0x10000 base of the supplementary range, so
 * that a single subtraction yields the code point (value 0x35fdc00).
 * @internal
 */
#define CBU16_SURROGATE_OFFSET (0xdc00-0x10000+(0xd800<<10))
337 
/**
 * Combine a lead and a trail surrogate into the supplementary
 * code point (U+10000..U+10ffff) that they encode.
 * Both inputs are widened to UChar32 before the arithmetic.
 * The result is undefined if the input values are not
 * lead and trail surrogates.
 *
 * @param lead lead surrogate (U+d800..U+dbff)
 * @param trail trail surrogate (U+dc00..U+dfff)
 * @return supplementary code point (U+10000..U+10ffff)
 * @stable ICU 2.4
 */
#define CBU16_GET_SUPPLEMENTARY(lead, trail) \
    ( ((::base_icu::UChar32)(lead) << 10) \
      + (::base_icu::UChar32)(trail) \
      - CBU16_SURROGATE_OFFSET )
351 
/**
 * Get the lead surrogate (0xd800..0xdbff) for a
 * supplementary code point (0x10000..0x10ffff).
 * The result is cast to uint16_t, the same 16-bit code unit type that
 * CBU16_APPEND_UNSAFE stores; the whole expansion is parenthesized so it
 * can be used inside larger expressions.
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return lead surrogate (U+d800..U+dbff) for supplementary
 * @stable ICU 2.4
 */
#define CBU16_LEAD(supplementary) ((uint16_t)(((supplementary)>>10)+0xd7c0))
360 
/**
 * Get the trail surrogate (0xdc00..0xdfff) for a
 * supplementary code point (0x10000..0x10ffff).
 * The result is cast to uint16_t, the same 16-bit code unit type that
 * CBU16_APPEND_UNSAFE stores; the whole expansion is parenthesized so it
 * can be used inside larger expressions.
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return trail surrogate (U+dc00..U+dfff) for supplementary
 * @stable ICU 2.4
 */
#define CBU16_TRAIL(supplementary) ((uint16_t)(((supplementary)&0x3ff)|0xdc00))
369 
/**
 * Number of 16-bit code units used to encode a Unicode code point (1 or 2):
 * 2 for values above 0xffff, 1 otherwise.
 * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
 * @param c 32-bit code point
 * @return 1 or 2
 * @stable ICU 2.4
 */
#define CBU16_LENGTH(c) (0xffff<(uint32_t)(c) ? 2 : 1)
378 
/**
 * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
 * This matches the largest value CBU16_LENGTH can yield.
 * @return 2
 * @stable ICU 2.4
 */
#define CBU16_MAX_LENGTH 2
385 
/**
 * Read the code point that starts at a code point boundary offset
 * and move the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * When the offset is on a lead surrogate that is followed by a trail
 * surrogate, both units are consumed and c receives the supplementary
 * code point.
 * When the offset is on a trail surrogate, or on a lead surrogate that has
 * no trail surrogate after it, only that one unit is consumed and c is set
 * to the unpaired surrogate.
 *
 * The macro expands to a brace-enclosed block and evaluates its arguments
 * more than once, so arguments must not have side effects.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_NEXT_UNSAFE
 * @stable ICU 2.4
 */
#define CBU16_NEXT(s, i, length, c) { \
    (c)=(s)[(i)++]; \
    if(CBU16_IS_LEAD(c) && (i)!=(length)) { \
        /* only peek at the next unit when one is available */ \
        uint16_t __c2=(s)[(i)]; \
        if(CBU16_IS_TRAIL(__c2)) { \
            ++(i); \
            (c)=CBU16_GET_SUPPLEMENTARY((c), __c2); \
        } \
    } \
}
417 
/**
 * Append a code point to a string, overwriting 1 or 2 code units.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment) once per code unit written.
 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
 * Otherwise, the result is undefined.
 *
 * Code points above U+ffff are written as a lead/trail surrogate pair;
 * everything else is written as a single unit.
 * The arguments are evaluated more than once.
 *
 * @param s UChar * string buffer (written to)
 * @param i string offset
 * @param c code point to append
 * @see U16_APPEND
 * @stable ICU 2.4
 */
#define CBU16_APPEND_UNSAFE(s, i, c) { \
    if((uint32_t)(c)>0xffff) { \
        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } else { \
        (s)[(i)++]=(uint16_t)(c); \
    } \
}
439 
}  // namespace base_icu
441 
442 #endif  // BASE_THIRD_PARTY_ICU_ICU_UTF_H_
443