1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 1999-2012, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  utf16.h
11 *   encoding:   US-ASCII
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 1999sep09
16 *   created by: Markus W. Scherer
17 */
18 
19 /**
20  * \file
21  * \brief C API: 16-bit Unicode handling macros
22  *
23  * This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
24  *
25  * For more information see utf.h and the ICU User Guide Strings chapter
26  * (http://userguide.icu-project.org/strings).
27  *
28  * <em>Usage:</em>
 * ICU coding guidelines for if() statements should be followed when using these macros.
 * Compound statements (curly braces {}) must be used for if-else-while...
 * bodies, and all macro statements should be terminated with a semicolon.
32  */
33 
34 #ifndef __UTF16_H__
35 #define __UTF16_H__
36 
37 #include "unicode/umachine.h"
38 #ifndef __UTF_H__
39 #   include "unicode/utf.h"
40 #endif
41 
42 /* single-code point definitions -------------------------------------------- */
43 
/**
 * Tests whether one 16-bit code unit is a complete code point on its own,
 * i.e. a BMP code unit that is not a surrogate.
 * This is the logical negation of U_IS_SURROGATE(c).
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SINGLE(c) (!U_IS_SURROGATE(c))
51 
/**
 * Tests whether a code unit is a lead (high) surrogate, U+d800..U+dbff.
 * Every bit except the low ten must match 0xd800, so a 32-bit value
 * above 0xffff never passes the test.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_LEAD(c) ((0xfffffc00&(c))==0xd800)
59 
/**
 * Tests whether a code unit is a trail (low) surrogate, U+dc00..U+dfff.
 * Every bit except the low ten must match 0xdc00, so a 32-bit value
 * above 0xffff never passes the test.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_TRAIL(c) ((0xfffffc00&(c))==0xdc00)
67 
/**
 * Tests whether a code unit is any surrogate, lead or trail (U+d800..U+dfff).
 * UTF-16 spelling of U_IS_SURROGATE(c), to which it forwards.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SURROGATE(c) (U_IS_SURROGATE(c))
75 
/**
 * Given that c is already known to be a surrogate (U16_IS_SURROGATE(c)),
 * tells whether it is the lead half of a pair.
 * Only bit 0x400 is inspected, so the answer is meaningless for
 * non-surrogate input.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SURROGATE_LEAD(c) (!((c)&0x400))
84 
/**
 * Given that c is already known to be a surrogate (U16_IS_SURROGATE(c)),
 * tells whether it is the trail half of a pair.
 * Only bit 0x400 is inspected, so the answer is meaningless for
 * non-surrogate input.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 4.2
 */
#define U16_IS_SURROGATE_TRAIL(c) (((c)&0x400)==0x400)
93 
/**
 * Constant subtracted by U16_GET_SUPPLEMENTARY: the value that
 * (lead<<10)+trail yields for the first surrogate pair (d800, dc00),
 * minus 0x10000, so that this pair maps to U+10000.
 * @internal
 */
#define U16_SURROGATE_OFFSET (0xdc00-0x10000+(0xd800<<10UL))
99 
/**
 * Combines a lead and a trail surrogate into the supplementary
 * code point (U+10000..U+10ffff) that the pair encodes.
 * No validation is done: the result is undefined unless lead really is a
 * lead surrogate and trail really is a trail surrogate.
 *
 * @param lead lead surrogate (U+d800..U+dbff)
 * @param trail trail surrogate (U+dc00..U+dfff)
 * @return supplementary code point (U+10000..U+10ffff)
 * @stable ICU 2.4
 */
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    ((UChar32)(trail)-U16_SURROGATE_OFFSET+((UChar32)(lead)<<10UL))
113 
114 
/**
 * Get the lead surrogate (0xd800..0xdbff) for a
 * supplementary code point (0x10000..0x10ffff).
 * The input is not range-checked; the computed value is cast to UChar.
 * The expansion is fully parenthesized so that it behaves as a single
 * operand in any expression context (for example under sizeof).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return lead surrogate (U+d800..U+dbff) for supplementary
 * @stable ICU 2.4
 */
#define U16_LEAD(supplementary) ((UChar)(((supplementary)>>10)+0xd7c0))
123 
/**
 * Get the trail surrogate (0xdc00..0xdfff) for a
 * supplementary code point (0x10000..0x10ffff).
 * Only the low ten bits of the input are used; the result is cast to UChar.
 * The expansion is fully parenthesized so that it behaves as a single
 * operand in any expression context (for example under sizeof).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return trail surrogate (U+dc00..U+dfff) for supplementary
 * @stable ICU 2.4
 */
#define U16_TRAIL(supplementary) ((UChar)(((supplementary)&0x3ff)|0xdc00))
132 
/**
 * Number of 16-bit code units needed to encode a code point in UTF-16:
 * 1 for values up to 0xffff, otherwise 2.
 * The argument is compared as uint32_t and is not validated, so the result
 * is not meaningful if c is not a Unicode code point (U+0000..U+10ffff).
 * @param c 32-bit code point
 * @return 1 or 2
 * @stable ICU 2.4
 */
#define U16_LENGTH(c) (0xffff<(uint32_t)(c) ? 2 : 1)
141 
/**
 * Upper bound on the number of 16-bit code units that encode a single
 * Unicode code point (U+0000..U+10ffff); usable as a buffer size.
 * @return 2
 * @stable ICU 2.4
 */
#define U16_MAX_LENGTH 2
148 
/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The offset may point to either the lead or trail surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the adjacent matching surrogate as well (s[i+1] after a lead,
 * s[i-1] before a trail) without any bounds check.
 * The result is undefined if the offset points to a single, unpaired surrogate.
 * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
 *
 * Expands to a single do { } while(0) statement, so it can be used as the
 * unbraced body of an if/else; the invocation must end with a semicolon.
 * Macro arguments may be evaluated more than once; avoid side effects.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_GET
 * @stable ICU 2.4
 */
#define U16_GET_UNSAFE(s, i, c) do { \
    (c)=(s)[i]; \
    if(U16_IS_SURROGATE(c)) { \
        if(U16_IS_SURROGATE_LEAD(c)) { \
            (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \
        } else { \
            (c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \
        } \
    } \
} while(0)
176 
/**
 * Get a code point from a string at a random-access offset,
 * without changing the offset.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset may point to either the lead or trail surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the adjacent matching surrogate as well.
 * A lead surrogate is only combined if i+1 is not equal to length and
 * s[i+1] is a trail surrogate; a trail surrogate is only combined if
 * i>start and s[i-1] is a lead surrogate.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * If the offset points to a single, unpaired surrogate, then that itself
 * will be returned as the code point.
 * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
 *
 * Expands to a single do { } while(0) statement, so it can be used as the
 * unbraced body of an if/else; the invocation must end with a semicolon.
 * Macro arguments may be evaluated more than once; avoid side effects.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<=i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_GET_UNSAFE
 * @stable ICU 2.4
 */
#define U16_GET(s, start, i, length, c) do { \
    (c)=(s)[i]; \
    if(U16_IS_SURROGATE(c)) { \
        uint16_t __c2; \
        if(U16_IS_SURROGATE_LEAD(c)) { \
            if((i)+1!=(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
                (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
            } \
        } else { \
            if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
                (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
            } \
        } \
    } \
} while(0)
215 
216 /* definitions with forward iteration --------------------------------------- */
217 
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well, without any bounds check.
 * If the offset points to a trail surrogate, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset points to a single, unpaired lead surrogate.
 *
 * Expands to a single do { } while(0) statement, so it can be used as the
 * unbraced body of an if/else; the invocation must end with a semicolon.
 * Macro arguments may be evaluated more than once; avoid side effects.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_NEXT
 * @stable ICU 2.4
 */
#define U16_NEXT_UNSAFE(s, i, c) do { \
    (c)=(s)[(i)++]; \
    if(U16_IS_LEAD(c)) { \
        (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
    } \
} while(0)
243 
/**
 * Get a code point from a string at a code point boundary offset,
 * and advance the offset to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * The offset may point to the lead surrogate unit
 * for a supplementary code point, in which case the macro will read
 * the following trail surrogate as well, provided the advanced offset
 * is not equal to length.
 * If the offset points to a trail surrogate or
 * to a single, unpaired lead surrogate, then that itself
 * will be returned as the code point.
 *
 * Expands to a single do { } while(0) statement, so it can be used as the
 * unbraced body of an if/else; the invocation must end with a semicolon.
 * Macro arguments may be evaluated more than once; avoid side effects.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_NEXT_UNSAFE
 * @stable ICU 2.4
 */
#define U16_NEXT(s, i, length, c) do { \
    (c)=(s)[(i)++]; \
    if(U16_IS_LEAD(c)) { \
        uint16_t __c2; \
        if((i)!=(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
            ++(i); \
            (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
        } \
    } \
} while(0)
276 
/**
 * Append a code point to a string, overwriting 1 or 2 code units.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
 * Otherwise, the result is undefined.
 *
 * Values up to 0xffff are stored as one unit; anything larger is stored as
 * a lead/trail surrogate pair.
 *
 * Expands to a single do { } while(0) statement, so it can be used as the
 * unbraced body of an if/else; the invocation must end with a semicolon.
 * Macro arguments may be evaluated more than once; avoid side effects.
 *
 * @param s UChar * string buffer (written to)
 * @param i string offset
 * @param c code point to append
 * @see U16_APPEND
 * @stable ICU 2.4
 */
#define U16_APPEND_UNSAFE(s, i, c) do { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else { \
        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } \
} while(0)
298 
/**
 * Append a code point to a string, overwriting 1 or 2 code units.
 * The offset points to the current end of the string contents
 * and is advanced (post-increment).
 * "Safe" macro, checks for a valid code point.
 * If a surrogate pair is written, checks for sufficient space in the string.
 * If the code point is not valid (greater than 0x10ffff) or a trail
 * surrogate does not fit, then isError is set to TRUE and neither the
 * buffer nor the offset is changed.
 * A value up to 0xffff is stored without a capacity check, which is why
 * i<capacity is a precondition.
 *
 * Expands to a single do { } while(0) statement, so it can be used as the
 * unbraced body of an if/else; the invocation must end with a semicolon.
 * Macro arguments may be evaluated more than once; avoid side effects.
 *
 * @param s UChar * string buffer (written to)
 * @param i string offset, must be i<capacity
 * @param capacity size of the string buffer
 * @param c code point to append
 * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
 * @see U16_APPEND_UNSAFE
 * @stable ICU 2.4
 */
#define U16_APPEND(s, i, capacity, c, isError) do { \
    if((uint32_t)(c)<=0xffff) { \
        (s)[(i)++]=(uint16_t)(c); \
    } else if((uint32_t)(c)<=0x10ffff && (i)+1<(capacity)) { \
        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } else /* c>0x10ffff or not enough space */ { \
        (isError)=TRUE; \
    } \
} while(0)
326 
/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 * The offset moves by two units if the unit at the offset is a lead
 * surrogate, otherwise by one; the following unit is not inspected.
 *
 * Expands to a single do { } while(0) statement, so it can be used as the
 * unbraced body of an if/else; the invocation must end with a semicolon.
 * Macro arguments may be evaluated more than once; avoid side effects.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_FWD_1
 * @stable ICU 2.4
 */
#define U16_FWD_1_UNSAFE(s, i) do { \
    if(U16_IS_LEAD((s)[(i)++])) { \
        ++(i); \
    } \
} while(0)
342 
/**
 * Advance the string offset from one code point boundary to the next.
 * (Post-incrementing iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 * The offset moves by two units only if the unit at the offset is a lead
 * surrogate, the advanced offset is not equal to length, and the next unit
 * is a trail surrogate; otherwise it moves by one.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * Expands to a single do { } while(0) statement, so it can be used as the
 * unbraced body of an if/else; the invocation must end with a semicolon.
 * Macro arguments may be evaluated more than once; avoid side effects.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length
 * @param length string length
 * @see U16_FWD_1_UNSAFE
 * @stable ICU 2.4
 */
#define U16_FWD_1(s, i, length) do { \
    if(U16_IS_LEAD((s)[(i)++]) && (i)!=(length) && U16_IS_TRAIL((s)[i])) { \
        ++(i); \
    } \
} while(0)
361 
/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 * Applies U16_FWD_1_UNSAFE n times with no end-of-string check; n is read
 * once into a local int32_t counter, and nothing happens if n is not positive.
 *
 * Expands to a single do { } while(0) statement, so it can be used as the
 * unbraced body of an if/else; the invocation must end with a semicolon.
 * The s and i arguments may be evaluated more than once; avoid side effects.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U16_FWD_N
 * @stable ICU 2.4
 */
#define U16_FWD_N_UNSAFE(s, i, n) do { \
    int32_t __N=(n); \
    while(__N>0) { \
        U16_FWD_1_UNSAFE(s, i); \
        --__N; \
    } \
} while(0)
381 
/**
 * Advance the string offset from one code point boundary to the n-th next one,
 * i.e., move forward by n code points.
 * (Post-incrementing iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 * Stops early when the offset reaches length or, for a negative length,
 * when the unit at the offset is NUL. n is read once into a local int32_t
 * counter, and nothing happens if n is not positive.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * Expands to a single do { } while(0) statement, so it can be used as the
 * unbraced body of an if/else; the invocation must end with a semicolon.
 * The s, i and length arguments may be evaluated more than once; avoid
 * side effects.
 *
 * @param s const UChar * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @param n number of code points to skip
 * @see U16_FWD_N_UNSAFE
 * @stable ICU 2.4
 */
#define U16_FWD_N(s, i, length, n) do { \
    int32_t __N=(n); \
    while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \
        U16_FWD_1(s, i, length); \
        --__N; \
    } \
} while(0)
404 
/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to the trail surrogate of a surrogate pair,
 * then the offset is decremented.
 * Otherwise, it is not modified.
 * "Unsafe" macro, assumes well-formed UTF-16: only the unit at the offset
 * is inspected, the preceding unit is not checked.
 *
 * Expands to a single do { } while(0) statement, so it can be used as the
 * unbraced body of an if/else; the invocation must end with a semicolon.
 * Macro arguments may be evaluated more than once; avoid side effects.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_SET_CP_START
 * @stable ICU 2.4
 */
#define U16_SET_CP_START_UNSAFE(s, i) do { \
    if(U16_IS_TRAIL((s)[i])) { \
        --(i); \
    } \
} while(0)
423 
/**
 * Adjust a random-access offset to a code point boundary
 * at the start of a code point.
 * If the offset points to the trail surrogate of a surrogate pair,
 * then the offset is decremented.
 * Otherwise, it is not modified.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries:
 * the offset is only decremented if i>start and the preceding unit is a
 * lead surrogate.
 *
 * Expands to a single do { } while(0) statement, so it can be used as the
 * unbraced body of an if/else; the invocation must end with a semicolon.
 * Macro arguments may be evaluated more than once; avoid side effects.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<=i
 * @see U16_SET_CP_START_UNSAFE
 * @stable ICU 2.4
 */
#define U16_SET_CP_START(s, start, i) do { \
    if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
        --(i); \
    } \
} while(0)
443 
444 /* definitions with backward iteration -------------------------------------- */
445 
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a trail surrogate unit
 * for a supplementary code point, then the macro will read
 * the preceding lead surrogate as well, without any bounds check.
 * If the offset is behind a lead surrogate, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset is behind a single, unpaired trail surrogate.
 *
 * Expands to a single do { } while(0) statement, so it can be used as the
 * unbraced body of an if/else; the invocation must end with a semicolon.
 * Macro arguments may be evaluated more than once; avoid side effects.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_PREV
 * @stable ICU 2.4
 */
#define U16_PREV_UNSAFE(s, i, c) do { \
    (c)=(s)[--(i)]; \
    if(U16_IS_TRAIL(c)) { \
        (c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \
    } \
} while(0)
472 
/**
 * Move the string offset from one code point boundary to the previous one
 * and get the code point between them.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the offset is behind a trail surrogate unit
 * for a supplementary code point, then the macro will read
 * the preceding lead surrogate as well, provided the decremented offset
 * is still greater than start.
 * If the offset is behind a lead surrogate or behind a single, unpaired
 * trail surrogate, then that itself
 * will be returned as the code point.
 *
 * Expands to a single do { } while(0) statement, so it can be used as the
 * unbraced body of an if/else; the invocation must end with a semicolon.
 * Macro arguments may be evaluated more than once; avoid side effects.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @param c output UChar32 variable
 * @see U16_PREV_UNSAFE
 * @stable ICU 2.4
 */
#define U16_PREV(s, start, i, c) do { \
    (c)=(s)[--(i)]; \
    if(U16_IS_TRAIL(c)) { \
        uint16_t __c2; \
        if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
            --(i); \
            (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
        } \
    } \
} while(0)
504 
/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16.
 * The offset moves back by two units if the unit before it is a trail
 * surrogate, otherwise by one; the unit before that is not inspected.
 *
 * Expands to a single do { } while(0) statement, so it can be used as the
 * unbraced body of an if/else; the invocation must end with a semicolon.
 * Macro arguments may be evaluated more than once; avoid side effects.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_BACK_1
 * @stable ICU 2.4
 */
#define U16_BACK_1_UNSAFE(s, i) do { \
    if(U16_IS_TRAIL((s)[--(i)])) { \
        --(i); \
    } \
} while(0)
521 
/**
 * Move the string offset from one code point boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 * The offset moves back by two units only if the unit before it is a trail
 * surrogate, the decremented offset is still greater than start, and the
 * unit before that is a lead surrogate; otherwise it moves back by one.
 *
 * Expands to a single do { } while(0) statement, so it can be used as the
 * unbraced body of an if/else; the invocation must end with a semicolon.
 * Macro arguments may be evaluated more than once; avoid side effects.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @see U16_BACK_1_UNSAFE
 * @stable ICU 2.4
 */
#define U16_BACK_1(s, start, i) do { \
    if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
        --(i); \
    } \
} while(0)
539 
/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16.
 * Applies U16_BACK_1_UNSAFE n times with no start-of-string check; n is read
 * once into a local int32_t counter, and nothing happens if n is not positive.
 *
 * Expands to a single do { } while(0) statement, so it can be used as the
 * unbraced body of an if/else; the invocation must end with a semicolon.
 * The s and i arguments may be evaluated more than once; avoid side effects.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U16_BACK_N
 * @stable ICU 2.4
 */
#define U16_BACK_N_UNSAFE(s, i, n) do { \
    int32_t __N=(n); \
    while(__N>0) { \
        U16_BACK_1_UNSAFE(s, i); \
        --__N; \
    } \
} while(0)
560 
/**
 * Move the string offset from one code point boundary to the n-th one before it,
 * i.e., move backward by n code points.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 * Stops early once the offset is no longer greater than start. n is read
 * once into a local int32_t counter, and nothing happens if n is not positive.
 *
 * Expands to a single do { } while(0) statement, so it can be used as the
 * unbraced body of an if/else; the invocation must end with a semicolon.
 * The s, start and i arguments may be evaluated more than once; avoid
 * side effects.
 *
 * @param s const UChar * string
 * @param start start of string
 * @param i string offset, must be start<=i (nothing is done when i equals start)
 * @param n number of code points to skip
 * @see U16_BACK_N_UNSAFE
 * @stable ICU 2.4
 */
#define U16_BACK_N(s, start, i, n) do { \
    int32_t __N=(n); \
    while(__N>0 && (i)>(start)) { \
        U16_BACK_1(s, start, i); \
        --__N; \
    } \
} while(0)
582 
/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind the lead surrogate of a surrogate pair,
 * then the offset is incremented.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16: only the unit before the
 * offset (s[i-1]) is inspected, with no check that i is greater than the
 * start of the string.
 *
 * Expands to a single do { } while(0) statement, so it can be used as the
 * unbraced body of an if/else; the invocation must end with a semicolon.
 * Macro arguments may be evaluated more than once; avoid side effects.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_SET_CP_LIMIT
 * @stable ICU 2.4
 */
#define U16_SET_CP_LIMIT_UNSAFE(s, i) do { \
    if(U16_IS_LEAD((s)[(i)-1])) { \
        ++(i); \
    } \
} while(0)
601 
/**
 * Adjust a random-access offset to a code point boundary after a code point.
 * If the offset is behind the lead surrogate of a surrogate pair,
 * then the offset is incremented.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries:
 * the offset is only incremented if start<i, i is before length (or length
 * is negative), the unit before the offset is a lead surrogate, and the
 * unit at the offset is a trail surrogate.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * Expands to a single do { } while(0) statement, so it can be used as the
 * unbraced body of an if/else; the invocation must end with a semicolon.
 * Macro arguments may be evaluated more than once; avoid side effects.
 *
 * @param s const UChar * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, start<=i<=length
 * @param length int32_t string length
 * @see U16_SET_CP_LIMIT_UNSAFE
 * @stable ICU 2.4
 */
#define U16_SET_CP_LIMIT(s, start, i, length) do { \
    if((start)<(i) && ((i)<(length) || (length)<0) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \
        ++(i); \
    } \
} while(0)
624 
625 #endif
626