1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 1999-2012, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  utf16.h
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 1999sep09
14 *   created by: Markus W. Scherer
15 */
16 
17 /**
18  * \file
19  * \brief C API: 16-bit Unicode handling macros
20  *
21  * This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
22  *
23  * For more information see utf.h and the ICU User Guide Strings chapter
24  * (http://userguide.icu-project.org/strings).
25  *
26  * <em>Usage:</em>
 * ICU coding guidelines for if() statements should be followed when using these macros.
 * Compound statements (curly braces {}) must be used for if-else-while...
 * bodies, and all macro statements should be terminated with a semicolon.
30  */
31 
32 #ifndef __UTF16_H__
33 #define __UTF16_H__
34 
35 #include "unicode/umachine.h"
36 #ifndef __UTF_H__
37 #   include "unicode/utf.h"
38 #endif
39 
40 /* single-code point definitions -------------------------------------------- */
41 
/**
 * Tests whether a 16-bit code unit encodes a code point all by itself,
 * i.e. whether it is a BMP code unit that is not a surrogate.
 * Implemented as the negation of U_IS_SURROGATE(c).
 * The expansion is wrapped in parentheses so that the macro can be used
 * safely as an operand inside any larger expression.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SINGLE(c) (!U_IS_SURROGATE(c))
49 
/**
 * Tests whether a code unit is a lead (high) surrogate, U+d800..U+dbff.
 * All bits above the low ten are compared against 0xd800, so values with
 * any higher bits set are rejected as well.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_LEAD(c) (0xd800==((c)&0xfffffc00))
57 
/**
 * Tests whether a code unit is a trail (low) surrogate, U+dc00..U+dfff.
 * All bits above the low ten are compared against 0xdc00, so values with
 * any higher bits set are rejected as well.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_TRAIL(c) (0xdc00==((c)&0xfffffc00))
65 
/**
 * Tests whether a code unit is any surrogate (U+d800..U+dfff), lead or trail.
 * This is the UTF-16 spelling of U_IS_SURROGATE(c) and simply forwards to it.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SURROGATE(c) (U_IS_SURROGATE(c))
73 
/**
 * For a code unit already known to be a surrogate (U16_IS_SURROGATE(c)),
 * tells whether it is the lead surrogate.
 * Only bit 0x400 is inspected, so the answer is meaningless for
 * non-surrogate input.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 2.4
 */
#define U16_IS_SURROGATE_LEAD(c) (0==((c)&0x400))
82 
/**
 * For a code unit already known to be a surrogate (U16_IS_SURROGATE(c)),
 * tells whether it is the trail surrogate.
 * Only bit 0x400 is inspected, so the answer is meaningless for
 * non-surrogate input.
 * @param c 16-bit code unit
 * @return TRUE or FALSE
 * @stable ICU 4.2
 */
#define U16_IS_SURROGATE_TRAIL(c) (0!=((c)&0x400))
91 
/**
 * Helper constant for U16_GET_SUPPLEMENTARY.
 * It is the value of (lead<<10)+trail for the first surrogate pair
 * (0xd800, 0xdc00) minus 0x10000, the code point which that pair encodes;
 * subtracting it from (lead<<10)+trail yields the supplementary code point.
 * @internal
 */
#define U16_SURROGATE_OFFSET (0xdc00-0x10000+(0xd800<<10UL))
97 
/**
 * Combine a lead and a trail surrogate into the supplementary code point
 * (U+10000..U+10ffff) that the pair encodes.
 * Both arguments are converted to UChar32 before the arithmetic.
 * No validation is performed: if the inputs are not a lead surrogate and
 * a trail surrogate, respectively, the result is undefined.
 *
 * @param lead lead surrogate (U+d800..U+dbff)
 * @param trail trail surrogate (U+dc00..U+dfff)
 * @return supplementary code point (U+10000..U+10ffff)
 * @stable ICU 2.4
 */
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    ((((UChar32)(lead))<<10UL)+((UChar32)(trail))-U16_SURROGATE_OFFSET)
111 
112 
/**
 * Compute the lead surrogate (0xd800..0xdbff) of the surrogate pair that
 * encodes a supplementary code point (0x10000..0x10ffff).
 * The result is cast to UChar; the whole expansion is parenthesized so the
 * cast cannot interact with operators at the point of use
 * (for example sizeof or a following postfix operator).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return lead surrogate (U+d800..U+dbff) for supplementary
 * @stable ICU 2.4
 */
#define U16_LEAD(supplementary) ((UChar)(((supplementary)>>10)+0xd7c0))
121 
/**
 * Compute the trail surrogate (0xdc00..0xdfff) of the surrogate pair that
 * encodes a supplementary code point (0x10000..0x10ffff).
 * The result is cast to UChar; the whole expansion is parenthesized so the
 * cast cannot interact with operators at the point of use
 * (for example sizeof or a following postfix operator).
 * @param supplementary 32-bit code point (U+10000..U+10ffff)
 * @return trail surrogate (U+dc00..U+dfff) for supplementary
 * @stable ICU 2.4
 */
#define U16_TRAIL(supplementary) ((UChar)(((supplementary)&0x3ff)|0xdc00))
130 
/**
 * Number of 16-bit code units (1 or 2) needed to encode a code point.
 * The value is compared as uint32_t: anything up to 0xffff needs one unit,
 * everything else is reported as two.
 * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
 * @param c 32-bit code point
 * @return 1 or 2
 * @stable ICU 2.4
 */
#define U16_LENGTH(c) ((uint32_t)(c)>0xffff ? 2 : 1)
139 
/**
 * Upper bound on the number of 16-bit code units that a single Unicode
 * code point (U+0000..U+10ffff) can occupy in UTF-16:
 * a surrogate pair, i.e. two units.
 * @return 2
 * @stable ICU 2.4
 */
#define U16_MAX_LENGTH 2
146 
/**
 * Random-access read of the code point at (or around) offset i.
 * The offset itself is left unchanged.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * When the unit at i is a surrogate, the other half of the pair is fetched
 * without any checks: the unit at i-1 if i is on a trail surrogate,
 * the unit at i+1 if i is on a lead surrogate.
 * The result is undefined if the offset points to a single, unpaired surrogate.
 * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_GET
 * @stable ICU 2.4
 */
#define U16_GET_UNSAFE(s, i, c) { \
    (c)=(s)[i]; \
    if(U16_IS_SURROGATE(c)) { \
        if(U16_IS_SURROGATE_TRAIL(c)) { \
            /* on the second half: the lead unit is the one before */ \
            (c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \
        } else { \
            /* on the first half: the trail unit is the one after */ \
            (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \
        } \
    } \
}
174 
/**
 * Random-access read of the code point at (or around) offset i.
 * The offset itself is left unchanged.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * When the unit at i is a surrogate, the macro looks for its partner:
 * - on a trail surrogate, the unit at i-1 is examined only if i>start;
 * - on a lead surrogate, the unit at i+1 is examined only if i+1!=length.
 * If the partner exists and is of the matching kind, c receives the
 * supplementary code point; otherwise c keeps the unpaired surrogate itself.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<=i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_GET_UNSAFE
 * @stable ICU 2.4
 */
#define U16_GET(s, start, i, length, c) { \
    (c)=(s)[i]; \
    if(U16_IS_SURROGATE(c)) { \
        if(U16_IS_SURROGATE_TRAIL(c)) { \
            if((i)>(start)) { \
                uint16_t __c2=(s)[(i)-1]; \
                if(U16_IS_LEAD(__c2)) { \
                    (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
                } \
            } \
        } else if((i)+1!=(length)) { \
            uint16_t __c2=(s)[(i)+1]; \
            if(U16_IS_TRAIL(__c2)) { \
                (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
            } \
        } \
    } \
}
213 
214 /* definitions with forward iteration --------------------------------------- */
215 
/**
 * Read the code point that starts at offset i and move i past it,
 * to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * If the unit at i is a lead surrogate, the following unit is consumed
 * unconditionally as its trail surrogate and both are combined.
 * If the offset points to a trail surrogate, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset points to a single, unpaired lead surrogate.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_NEXT
 * @stable ICU 2.4
 */
#define U16_NEXT_UNSAFE(s, i, c) { \
    (c)=(s)[(i)++]; \
    if(U16_IS_LEAD(c)) { \
        (c)=U16_GET_SUPPLEMENTARY((c), (s)[i]); \
        ++(i); \
    } \
}
241 
/**
 * Read the code point that starts at offset i and move i past it,
 * to the next code point boundary.
 * (Post-incrementing forward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * If the unit at i is a lead surrogate, the next unit is examined only
 * when the advanced offset differs from length; if it is a trail surrogate,
 * it is consumed as well and the pair is combined.
 * If the offset points to a trail surrogate or
 * to a single, unpaired lead surrogate, then that itself
 * will be returned as the code point.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length
 * @param length string length
 * @param c output UChar32 variable
 * @see U16_NEXT_UNSAFE
 * @stable ICU 2.4
 */
#define U16_NEXT(s, i, length, c) { \
    (c)=(s)[(i)++]; \
    if(U16_IS_LEAD(c) && (i)!=(length)) { \
        uint16_t __c2=(s)[(i)]; \
        if(U16_IS_TRAIL(__c2)) { \
            ++(i); \
            (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
        } \
    } \
}
274 
/**
 * Write a code point into a string buffer at offset i, storing either one
 * code unit or a surrogate pair, and advance i past what was written
 * (post-increment).
 * The offset points to the current end of the string contents.
 * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
 * Otherwise, the result is undefined.
 *
 * @param s UChar * string buffer (written to)
 * @param i string offset
 * @param c code point to append
 * @see U16_APPEND
 * @stable ICU 2.4
 */
#define U16_APPEND_UNSAFE(s, i, c) { \
    if((uint32_t)(c)>0xffff) { \
        /* supplementary: lead surrogate first, then trail surrogate */ \
        (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
        (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
    } else { \
        (s)[(i)++]=(uint16_t)(c); \
    } \
}
296 
/**
 * Write a code point into a string buffer at offset i, storing either one
 * code unit or a surrogate pair, and advance i past what was written
 * (post-increment).
 * The offset points to the current end of the string contents.
 * "Safe" macro, checks for a valid code point.
 *
 * A value up to 0xffff is stored as a single unit without a capacity check
 * (the caller guarantees i<capacity).
 * A value up to 0x10ffff is stored as a surrogate pair, but only if both
 * units fit, i.e. i+1<capacity.
 * If the code point is above 0x10ffff or the pair does not fit,
 * nothing is written, i is not changed, and isError is set to TRUE.
 *
 * @param s UChar * string buffer (written to)
 * @param i string offset, must be i<capacity
 * @param capacity size of the string buffer
 * @param c code point to append
 * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
 * @see U16_APPEND_UNSAFE
 * @stable ICU 2.4
 */
#define U16_APPEND(s, i, capacity, c, isError) { \
    if((uint32_t)(c)>0xffff) { \
        if((uint32_t)(c)>0x10ffff || (i)+1>=(capacity)) { \
            /* c>0x10ffff or not enough space */ \
            (isError)=TRUE; \
        } else { \
            (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
            (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
        } \
    } else { \
        (s)[(i)++]=(uint16_t)(c); \
    } \
}
324 
/**
 * Step the string offset forward over one code point, from one code point
 * boundary to the next: by two units if the unit at i is a lead surrogate,
 * by one unit otherwise.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_FWD_1
 * @stable ICU 2.4
 */
#define U16_FWD_1_UNSAFE(s, i) { \
    if(U16_IS_LEAD((s)[i])) { \
        ++(i); \
    } \
    ++(i); \
}
340 
/**
 * Step the string offset forward over one code point, from one code point
 * boundary to the next.
 * (Post-incrementing iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset always advances by one unit. It advances by a second unit only
 * if the first unit was a lead surrogate, the advanced offset differs from
 * length, and the unit there is a trail surrogate.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * @param s const UChar * string
 * @param i string offset, must be i<length
 * @param length string length
 * @see U16_FWD_1_UNSAFE
 * @stable ICU 2.4
 */
#define U16_FWD_1(s, i, length) { \
    if(U16_IS_LEAD((s)[(i)++])) { \
        if((i)!=(length) && U16_IS_TRAIL((s)[i])) { \
            ++(i); \
        } \
    } \
}
359 
/**
 * Step the string offset forward over n code points, i.e. from one code
 * point boundary to the n-th next one, by applying U16_FWD_1_UNSAFE n times.
 * Nothing happens if n is zero or negative.
 * (Post-incrementing iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U16_FWD_N
 * @stable ICU 2.4
 */
#define U16_FWD_N_UNSAFE(s, i, n) { \
    int32_t __N; \
    for(__N=(n); __N>0; --__N) { \
        U16_FWD_1_UNSAFE(s, i); \
    } \
}
379 
/**
 * Step the string offset forward over up to n code points, i.e. from one
 * code point boundary to the n-th next one, by applying U16_FWD_1 repeatedly.
 * (Post-incrementing iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * Stepping stops early when the end of the string is reached: when i is no
 * longer less than length or, for a negative length, when the unit at i is NUL.
 * Nothing happens if n is zero or negative.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * @param s const UChar * string
 * @param i int32_t string offset, must be i<length
 * @param length int32_t string length
 * @param n number of code points to skip
 * @see U16_FWD_N_UNSAFE
 * @stable ICU 2.4
 */
#define U16_FWD_N(s, i, length, n) { \
    int32_t __N; \
    for(__N=(n); __N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0)); --__N) { \
        U16_FWD_1(s, i, length); \
    } \
}
402 
/**
 * Snap a random-access offset back to the start of the code point it
 * falls into.
 * If the unit at i is a trail surrogate, the offset is decremented by one
 * (no check is made that a lead surrogate precedes it).
 * Otherwise, it is not modified.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_SET_CP_START
 * @stable ICU 2.4
 */
#define U16_SET_CP_START_UNSAFE(s, i) { \
    if(U16_IS_TRAIL((s)[(i)])) { \
        (i)--; \
    } \
}
421 
/**
 * Snap a random-access offset back to the start of the code point it
 * falls into.
 * The offset is decremented by one only if the unit at i is a trail
 * surrogate, i is greater than start, and the unit at i-1 is a lead
 * surrogate. Otherwise, it is not modified.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<=i
 * @see U16_SET_CP_START_UNSAFE
 * @stable ICU 2.4
 */
#define U16_SET_CP_START(s, start, i) { \
    if(U16_IS_TRAIL((s)[i])) { \
        if((i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
            --(i); \
        } \
    } \
}
441 
442 /* definitions with backward iteration -------------------------------------- */
443 
/**
 * Move the string offset back to the previous code point boundary and
 * read the code point that lies between the two boundaries.
 * (Pre-decrementing backward iteration.)
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * The input offset may be the same as the string length.
 * If the unit just before the offset is a trail surrogate, the unit before
 * that one is consumed unconditionally as its lead surrogate and both
 * are combined.
 * If the offset is behind a lead surrogate, then that itself
 * will be returned as the code point.
 * The result is undefined if the offset is behind a single, unpaired trail surrogate.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param c output UChar32 variable
 * @see U16_PREV
 * @stable ICU 2.4
 */
#define U16_PREV_UNSAFE(s, i, c) { \
    --(i); \
    (c)=(s)[i]; \
    if(U16_IS_TRAIL(c)) { \
        --(i); \
        (c)=U16_GET_SUPPLEMENTARY((s)[i], (c)); \
    } \
}
470 
/**
 * Move the string offset back to the previous code point boundary and
 * read the code point that lies between the two boundaries.
 * (Pre-decrementing backward iteration.)
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The input offset may be the same as the string length.
 * If the unit just before the offset is a trail surrogate, the unit before
 * that one is examined only if it lies at or after start; if it is a lead
 * surrogate, it is consumed as well and the pair is combined.
 * If the offset is behind a lead surrogate or behind a single, unpaired
 * trail surrogate, then that itself
 * will be returned as the code point.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @param c output UChar32 variable
 * @see U16_PREV_UNSAFE
 * @stable ICU 2.4
 */
#define U16_PREV(s, start, i, c) { \
    (c)=(s)[--(i)]; \
    if(U16_IS_TRAIL(c) && (i)>(start)) { \
        uint16_t __c2=(s)[(i)-1]; \
        if(U16_IS_LEAD(__c2)) { \
            --(i); \
            (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
        } \
    } \
}
502 
/**
 * Step the string offset backward over one code point, from one code point
 * boundary to the previous one: by two units if the unit just before the
 * offset is a trail surrogate, by one unit otherwise.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_BACK_1
 * @stable ICU 2.4
 */
#define U16_BACK_1_UNSAFE(s, i) { \
    --(i); \
    if(U16_IS_TRAIL((s)[i])) { \
        --(i); \
    } \
}
519 
/**
 * Step the string offset backward over one code point, from one code point
 * boundary to the previous one.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The offset always moves back by one unit. It moves back by a second unit
 * only if the unit now at i is a trail surrogate, i is still greater than
 * start, and the unit at i-1 is a lead surrogate.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @see U16_BACK_1_UNSAFE
 * @stable ICU 2.4
 */
#define U16_BACK_1(s, start, i) { \
    --(i); \
    if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
        --(i); \
    } \
}
537 
/**
 * Step the string offset backward over n code points, i.e. from one code
 * point boundary to the n-th one before it, by applying U16_BACK_1_UNSAFE
 * n times. Nothing happens if n is zero or negative.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * @param s const UChar * string
 * @param i string offset
 * @param n number of code points to skip
 * @see U16_BACK_N
 * @stable ICU 2.4
 */
#define U16_BACK_N_UNSAFE(s, i, n) { \
    int32_t __N; \
    for(__N=(n); __N>0; --__N) { \
        U16_BACK_1_UNSAFE(s, i); \
    } \
}
558 
/**
 * Step the string offset backward over up to n code points, i.e. from one
 * code point boundary to the n-th one before it, by applying U16_BACK_1
 * repeatedly.
 * (Pre-decrementing backward iteration.)
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * Stepping stops early once i is no longer greater than start.
 * Nothing happens if n is zero or negative.
 *
 * @param s const UChar * string
 * @param start starting string offset (usually 0)
 * @param i string offset, must be start<i
 * @param n number of code points to skip
 * @see U16_BACK_N_UNSAFE
 * @stable ICU 2.4
 */
#define U16_BACK_N(s, start, i, n) { \
    int32_t __N; \
    for(__N=(n); __N>0 && (i)>(start); --__N) { \
        U16_BACK_1(s, start, i); \
    } \
}
580 
/**
 * Snap a random-access offset forward to the boundary after the code point
 * it falls into.
 * If the unit just before the offset is a lead surrogate, the offset is
 * incremented by one (no check is made that a trail surrogate follows).
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Unsafe" macro, assumes well-formed UTF-16.
 *
 * @param s const UChar * string
 * @param i string offset
 * @see U16_SET_CP_LIMIT
 * @stable ICU 2.4
 */
#define U16_SET_CP_LIMIT_UNSAFE(s, i) { \
    if(U16_IS_LEAD((s)[(i)-1])) { \
        (i)++; \
    } \
}
599 
/**
 * Snap a random-access offset forward to the boundary after the code point
 * it falls into.
 * The offset is incremented by one only if all of the following hold:
 * start<i; i<length or length is negative; the unit at i-1 is a lead
 * surrogate; and the unit at i is a trail surrogate.
 * Otherwise, it is not modified.
 * The input offset may be the same as the string length.
 * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
 *
 * The length can be negative for a NUL-terminated string.
 *
 * @param s const UChar * string
 * @param start int32_t starting string offset (usually 0)
 * @param i int32_t string offset, start<=i<=length
 * @param length int32_t string length
 * @see U16_SET_CP_LIMIT_UNSAFE
 * @stable ICU 2.4
 */
#define U16_SET_CP_LIMIT(s, start, i, length) { \
    if((start)<(i) && ((i)<(length) || (length)<0)) { \
        /* both neighbors are inside the string: is i in the middle of a pair? */ \
        if(U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \
            ++(i); \
        } \
    } \
}
622 
623 #endif
624