1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 1999-2014, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: utf8.h 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 1999sep13 14 * created by: Markus W. Scherer 15 */ 16 17 /** 18 * \file 19 * \brief C API: 8-bit Unicode handling macros 20 * 21 * This file defines macros to deal with 8-bit Unicode (UTF-8) code units (bytes) and strings. 22 * 23 * For more information see utf.h and the ICU User Guide Strings chapter 24 * (http://userguide.icu-project.org/strings). 25 * 26 * <em>Usage:</em> 27 * ICU coding guidelines for if() statements should be followed when using these macros. 28 * Compound statements (curly braces {}) must be used for if-else-while... 29 * bodies and all macro statements should be terminated with semicolon. 30 */ 31 32 #ifndef __UTF8_H__ 33 #define __UTF8_H__ 34 35 #include "unicode/umachine.h" 36 #ifndef __UTF_H__ 37 # include "unicode/utf.h" 38 #endif 39 40 /* internal definitions ----------------------------------------------------- */ 41 42 /** 43 * \var utf8_countTrailBytes 44 * Internal array with numbers of trail bytes for any given byte used in 45 * lead byte position. 46 * 47 * This is internal since it is not meant to be called directly by external clients; 48 * however it is called by public macros in this file and thus must remain stable, 49 * and should not be hidden when other internal functions are hidden (otherwise 50 * public macros would fail to compile). 51 * @internal 52 */ 53 #ifdef U_UTF8_IMPL 54 U_EXPORT const uint8_t 55 #elif defined(U_STATIC_IMPLEMENTATION) || defined(U_COMMON_IMPLEMENTATION) 56 U_CFUNC const uint8_t 57 #else 58 U_CFUNC U_IMPORT const uint8_t /* U_IMPORT2? */ /*U_IMPORT*/ 59 #endif 60 utf8_countTrailBytes[256]; 61 62 /** 63 * Counts the trail bytes for a UTF-8 lead byte. 64 * Returns 0 for 0..0xbf as well as for 0xfe and 0xff. 65 * 66 * This is internal since it is not meant to be called directly by external clients; 67 * however it is called by public macros in this file and thus must remain stable. 68 * 69 * Note: Beginning with ICU 50, the implementation uses a multi-condition expression 70 * which was shown in 2012 (on x86-64) to compile to fast, branch-free code. 71 * leadByte is evaluated multiple times. 72 * 73 * The pre-ICU 50 implementation used the exported array utf8_countTrailBytes: 74 * #define U8_COUNT_TRAIL_BYTES(leadByte) (utf8_countTrailBytes[leadByte]) 75 * leadByte was evaluated exactly once. 76 * 77 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. 78 * @internal 79 */ 80 #define U8_COUNT_TRAIL_BYTES(leadByte) \ 81 ((uint8_t)(leadByte)<0xf0 ? \ 82 ((uint8_t)(leadByte)>=0xc0)+((uint8_t)(leadByte)>=0xe0) : \ 83 (uint8_t)(leadByte)<0xfe ? 3+((uint8_t)(leadByte)>=0xf8)+((uint8_t)(leadByte)>=0xfc) : 0) 84 85 /** 86 * Counts the trail bytes for a UTF-8 lead byte of a valid UTF-8 sequence. 87 * The maximum supported lead byte is 0xf4 corresponding to U+10FFFF. 88 * leadByte might be evaluated multiple times. 89 * 90 * This is internal since it is not meant to be called directly by external clients; 91 * however it is called by public macros in this file and thus must remain stable. 92 * 93 * @param leadByte The first byte of a UTF-8 sequence. Must be 0..0xff. 94 * @internal 95 */ 96 #define U8_COUNT_TRAIL_BYTES_UNSAFE(leadByte) \ 97 (((leadByte)>=0xc0)+((leadByte)>=0xe0)+((leadByte)>=0xf0)) 98 99 /** 100 * Mask a UTF-8 lead byte, leave only the lower bits that form part of the code point value. 101 * 102 * This is internal since it is not meant to be called directly by external clients; 103 * however it is called by public macros in this file and thus must remain stable. 104 * @internal 105 */ 106 #define U8_MASK_LEAD_BYTE(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) 107 108 /** 109 * Function for handling "next code point" with error-checking. 110 * 111 * This is internal since it is not meant to be called directly by external clients; 112 * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this 113 * file and thus must remain stable, and should not be hidden when other internal 114 * functions are hidden (otherwise public macros would fail to compile). 115 * @internal 116 */ 117 U_STABLE UChar32 U_EXPORT2 118 utf8_nextCharSafeBody(const uint8_t *s, int32_t *pi, int32_t length, UChar32 c, UBool strict); 119 120 /** 121 * Function for handling "append code point" with error-checking. 122 * 123 * This is internal since it is not meant to be called directly by external clients; 124 * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this 125 * file and thus must remain stable, and should not be hidden when other internal 126 * functions are hidden (otherwise public macros would fail to compile). 127 * @internal 128 */ 129 U_STABLE int32_t U_EXPORT2 130 utf8_appendCharSafeBody(uint8_t *s, int32_t i, int32_t length, UChar32 c, UBool *pIsError); 131 132 /** 133 * Function for handling "previous code point" with error-checking. 134 * 135 * This is internal since it is not meant to be called directly by external clients; 136 * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this 137 * file and thus must remain stable, and should not be hidden when other internal 138 * functions are hidden (otherwise public macros would fail to compile). 139 * @internal 140 */ 141 U_STABLE UChar32 U_EXPORT2 142 utf8_prevCharSafeBody(const uint8_t *s, int32_t start, int32_t *pi, UChar32 c, UBool strict); 143 144 /** 145 * Function for handling "skip backward one code point" with error-checking. 146 * 147 * This is internal since it is not meant to be called directly by external clients; 148 * however it is U_STABLE (not U_INTERNAL) since it is called by public macros in this 149 * file and thus must remain stable, and should not be hidden when other internal 150 * functions are hidden (otherwise public macros would fail to compile). 151 * @internal 152 */ 153 U_STABLE int32_t U_EXPORT2 154 utf8_back1SafeBody(const uint8_t *s, int32_t start, int32_t i); 155 156 /* single-code point definitions -------------------------------------------- */ 157 158 /** 159 * Does this code unit (byte) encode a code point by itself (US-ASCII 0..0x7f)? 160 * @param c 8-bit code unit (byte) 161 * @return TRUE or FALSE 162 * @stable ICU 2.4 163 */ 164 #define U8_IS_SINGLE(c) (((c)&0x80)==0) 165 166 /** 167 * Is this code unit (byte) a UTF-8 lead byte? 168 * @param c 8-bit code unit (byte) 169 * @return TRUE or FALSE 170 * @stable ICU 2.4 171 */ 172 #define U8_IS_LEAD(c) ((uint8_t)((c)-0xc0)<0x3e) 173 174 /** 175 * Is this code unit (byte) a UTF-8 trail byte? 176 * @param c 8-bit code unit (byte) 177 * @return TRUE or FALSE 178 * @stable ICU 2.4 179 */ 180 #define U8_IS_TRAIL(c) (((c)&0xc0)==0x80) 181 182 /** 183 * How many code units (bytes) are used for the UTF-8 encoding 184 * of this Unicode code point? 185 * @param c 32-bit code point 186 * @return 1..4, or 0 if c is a surrogate or not a Unicode code point 187 * @stable ICU 2.4 188 */ 189 #define U8_LENGTH(c) \ 190 ((uint32_t)(c)<=0x7f ? 1 : \ 191 ((uint32_t)(c)<=0x7ff ? 2 : \ 192 ((uint32_t)(c)<=0xd7ff ? 3 : \ 193 ((uint32_t)(c)<=0xdfff || (uint32_t)(c)>0x10ffff ? 0 : \ 194 ((uint32_t)(c)<=0xffff ? 3 : 4)\ 195 ) \ 196 ) \ 197 ) \ 198 ) 199 200 /** 201 * The maximum number of UTF-8 code units (bytes) per Unicode code point (U+0000..U+10ffff). 202 * @return 4 203 * @stable ICU 2.4 204 */ 205 #define U8_MAX_LENGTH 4 206 207 /** 208 * Get a code point from a string at a random-access offset, 209 * without changing the offset. 210 * The offset may point to either the lead byte or one of the trail bytes 211 * for a code point, in which case the macro will read all of the bytes 212 * for the code point. 213 * The result is undefined if the offset points to an illegal UTF-8 214 * byte sequence. 215 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT. 216 * 217 * @param s const uint8_t * string 218 * @param i string offset 219 * @param c output UChar32 variable 220 * @see U8_GET 221 * @stable ICU 2.4 222 */ 223 #define U8_GET_UNSAFE(s, i, c) { \ 224 int32_t _u8_get_unsafe_index=(int32_t)(i); \ 225 U8_SET_CP_START_UNSAFE(s, _u8_get_unsafe_index); \ 226 U8_NEXT_UNSAFE(s, _u8_get_unsafe_index, c); \ 227 } 228 229 /** 230 * Get a code point from a string at a random-access offset, 231 * without changing the offset. 232 * The offset may point to either the lead byte or one of the trail bytes 233 * for a code point, in which case the macro will read all of the bytes 234 * for the code point. 235 * 236 * The length can be negative for a NUL-terminated string. 237 * 238 * If the offset points to an illegal UTF-8 byte sequence, then 239 * c is set to a negative value. 240 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT. 241 * 242 * @param s const uint8_t * string 243 * @param start int32_t starting string offset 244 * @param i int32_t string offset, must be start<=i<length 245 * @param length int32_t string length 246 * @param c output UChar32 variable, set to <0 in case of an error 247 * @see U8_GET_UNSAFE 248 * @stable ICU 2.4 249 */ 250 #define U8_GET(s, start, i, length, c) { \ 251 int32_t _u8_get_index=(i); \ 252 U8_SET_CP_START(s, start, _u8_get_index); \ 253 U8_NEXT(s, _u8_get_index, length, c); \ 254 } 255 256 /** 257 * Get a code point from a string at a random-access offset, 258 * without changing the offset. 259 * The offset may point to either the lead byte or one of the trail bytes 260 * for a code point, in which case the macro will read all of the bytes 261 * for the code point. 262 * 263 * The length can be negative for a NUL-terminated string. 264 * 265 * If the offset points to an illegal UTF-8 byte sequence, then 266 * c is set to U+FFFD. 267 * Iteration through a string is more efficient with U8_NEXT_UNSAFE or U8_NEXT_OR_FFFD. 268 * 269 * This macro does not distinguish between a real U+FFFD in the text 270 * and U+FFFD returned for an ill-formed sequence. 271 * Use U8_GET() if that distinction is important. 272 * 273 * @param s const uint8_t * string 274 * @param start int32_t starting string offset 275 * @param i int32_t string offset, must be start<=i<length 276 * @param length int32_t string length 277 * @param c output UChar32 variable, set to U+FFFD in case of an error 278 * @see U8_GET 279 * @stable ICU 51 280 */ 281 #define U8_GET_OR_FFFD(s, start, i, length, c) { \ 282 int32_t _u8_get_index=(i); \ 283 U8_SET_CP_START(s, start, _u8_get_index); \ 284 U8_NEXT_OR_FFFD(s, _u8_get_index, length, c); \ 285 } 286 287 /* definitions with forward iteration --------------------------------------- */ 288 289 /** 290 * Get a code point from a string at a code point boundary offset, 291 * and advance the offset to the next code point boundary. 292 * (Post-incrementing forward iteration.) 293 * "Unsafe" macro, assumes well-formed UTF-8. 294 * 295 * The offset may point to the lead byte of a multi-byte sequence, 296 * in which case the macro will read the whole sequence. 297 * The result is undefined if the offset points to a trail byte 298 * or an illegal UTF-8 sequence. 299 * 300 * @param s const uint8_t * string 301 * @param i string offset 302 * @param c output UChar32 variable 303 * @see U8_NEXT 304 * @stable ICU 2.4 305 */ 306 #define U8_NEXT_UNSAFE(s, i, c) { \ 307 (c)=(uint8_t)(s)[(i)++]; \ 308 if((c)>=0x80) { \ 309 if((c)<0xe0) { \ 310 (c)=(((c)&0x1f)<<6)|((s)[(i)++]&0x3f); \ 311 } else if((c)<0xf0) { \ 312 /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \ 313 (c)=(UChar)(((c)<<12)|(((s)[i]&0x3f)<<6)|((s)[(i)+1]&0x3f)); \ 314 (i)+=2; \ 315 } else { \ 316 (c)=(((c)&7)<<18)|(((s)[i]&0x3f)<<12)|(((s)[(i)+1]&0x3f)<<6)|((s)[(i)+2]&0x3f); \ 317 (i)+=3; \ 318 } \ 319 } \ 320 } 321 322 /** 323 * Get a code point from a string at a code point boundary offset, 324 * and advance the offset to the next code point boundary. 325 * (Post-incrementing forward iteration.) 326 * "Safe" macro, checks for illegal sequences and for string boundaries. 327 * 328 * The length can be negative for a NUL-terminated string. 329 * 330 * The offset may point to the lead byte of a multi-byte sequence, 331 * in which case the macro will read the whole sequence. 332 * If the offset points to a trail byte or an illegal UTF-8 sequence, then 333 * c is set to a negative value. 334 * 335 * @param s const uint8_t * string 336 * @param i int32_t string offset, must be i<length 337 * @param length int32_t string length 338 * @param c output UChar32 variable, set to <0 in case of an error 339 * @see U8_NEXT_UNSAFE 340 * @stable ICU 2.4 341 */ 342 #define U8_NEXT(s, i, length, c) { \ 343 (c)=(uint8_t)(s)[(i)++]; \ 344 if((c)>=0x80) { \ 345 uint8_t __t1, __t2; \ 346 if( /* handle U+1000..U+CFFF inline */ \ 347 (0xe0<(c) && (c)<=0xec) && \ 348 (((i)+1)<(length) || (length)<0) && \ 349 (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \ 350 (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \ 351 ) { \ 352 /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \ 353 (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \ 354 (i)+=2; \ 355 } else if( /* handle U+0080..U+07FF inline */ \ 356 ((c)<0xe0 && (c)>=0xc2) && \ 357 ((i)!=(length)) && \ 358 (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \ 359 ) { \ 360 (c)=(((c)&0x1f)<<6)|__t1; \ 361 ++(i); \ 362 } else { \ 363 /* function call for "complicated" and error cases */ \ 364 (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -1); \ 365 } \ 366 } \ 367 } 368 369 /** 370 * Get a code point from a string at a code point boundary offset, 371 * and advance the offset to the next code point boundary. 372 * (Post-incrementing forward iteration.) 373 * "Safe" macro, checks for illegal sequences and for string boundaries. 374 * 375 * The length can be negative for a NUL-terminated string. 376 * 377 * The offset may point to the lead byte of a multi-byte sequence, 378 * in which case the macro will read the whole sequence. 379 * If the offset points to a trail byte or an illegal UTF-8 sequence, then 380 * c is set to U+FFFD. 381 * 382 * This macro does not distinguish between a real U+FFFD in the text 383 * and U+FFFD returned for an ill-formed sequence. 384 * Use U8_NEXT() if that distinction is important. 385 * 386 * @param s const uint8_t * string 387 * @param i int32_t string offset, must be i<length 388 * @param length int32_t string length 389 * @param c output UChar32 variable, set to U+FFFD in case of an error 390 * @see U8_NEXT 391 * @stable ICU 51 392 */ 393 #define U8_NEXT_OR_FFFD(s, i, length, c) { \ 394 (c)=(uint8_t)(s)[(i)++]; \ 395 if((c)>=0x80) { \ 396 uint8_t __t1, __t2; \ 397 if( /* handle U+1000..U+CFFF inline */ \ 398 (0xe0<(c) && (c)<=0xec) && \ 399 (((i)+1)<(length) || (length)<0) && \ 400 (__t1=(uint8_t)((s)[i]-0x80))<=0x3f && \ 401 (__t2=(uint8_t)((s)[(i)+1]-0x80))<= 0x3f \ 402 ) { \ 403 /* no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ \ 404 (c)=(UChar)(((c)<<12)|(__t1<<6)|__t2); \ 405 (i)+=2; \ 406 } else if( /* handle U+0080..U+07FF inline */ \ 407 ((c)<0xe0 && (c)>=0xc2) && \ 408 ((i)!=(length)) && \ 409 (__t1=(uint8_t)((s)[i]-0x80))<=0x3f \ 410 ) { \ 411 (c)=(((c)&0x1f)<<6)|__t1; \ 412 ++(i); \ 413 } else { \ 414 /* function call for "complicated" and error cases */ \ 415 (c)=utf8_nextCharSafeBody((const uint8_t *)s, &(i), (length), c, -3); \ 416 } \ 417 } \ 418 } 419 420 /** 421 * Append a code point to a string, overwriting 1 to 4 bytes. 422 * The offset points to the current end of the string contents 423 * and is advanced (post-increment). 424 * "Unsafe" macro, assumes a valid code point and sufficient space in the string. 425 * Otherwise, the result is undefined. 426 * 427 * @param s const uint8_t * string buffer 428 * @param i string offset 429 * @param c code point to append 430 * @see U8_APPEND 431 * @stable ICU 2.4 432 */ 433 #define U8_APPEND_UNSAFE(s, i, c) { \ 434 if((uint32_t)(c)<=0x7f) { \ 435 (s)[(i)++]=(uint8_t)(c); \ 436 } else { \ 437 if((uint32_t)(c)<=0x7ff) { \ 438 (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \ 439 } else { \ 440 if((uint32_t)(c)<=0xffff) { \ 441 (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \ 442 } else { \ 443 (s)[(i)++]=(uint8_t)(((c)>>18)|0xf0); \ 444 (s)[(i)++]=(uint8_t)((((c)>>12)&0x3f)|0x80); \ 445 } \ 446 (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \ 447 } \ 448 (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \ 449 } \ 450 } 451 452 /** 453 * Append a code point to a string, overwriting 1 to 4 bytes. 454 * The offset points to the current end of the string contents 455 * and is advanced (post-increment). 456 * "Safe" macro, checks for a valid code point. 457 * If a non-ASCII code point is written, checks for sufficient space in the string. 458 * If the code point is not valid or trail bytes do not fit, 459 * then isError is set to TRUE. 460 * 461 * @param s const uint8_t * string buffer 462 * @param i int32_t string offset, must be i<capacity 463 * @param capacity int32_t size of the string buffer 464 * @param c UChar32 code point to append 465 * @param isError output UBool set to TRUE if an error occurs, otherwise not modified 466 * @see U8_APPEND_UNSAFE 467 * @stable ICU 2.4 468 */ 469 #define U8_APPEND(s, i, capacity, c, isError) { \ 470 if((uint32_t)(c)<=0x7f) { \ 471 (s)[(i)++]=(uint8_t)(c); \ 472 } else if((uint32_t)(c)<=0x7ff && (i)+1<(capacity)) { \ 473 (s)[(i)++]=(uint8_t)(((c)>>6)|0xc0); \ 474 (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \ 475 } else if((uint32_t)(c)<=0xd7ff && (i)+2<(capacity)) { \ 476 (s)[(i)++]=(uint8_t)(((c)>>12)|0xe0); \ 477 (s)[(i)++]=(uint8_t)((((c)>>6)&0x3f)|0x80); \ 478 (s)[(i)++]=(uint8_t)(((c)&0x3f)|0x80); \ 479 } else { \ 480 (i)=utf8_appendCharSafeBody(s, (i), (capacity), c, &(isError)); \ 481 } \ 482 } 483 484 /** 485 * Advance the string offset from one code point boundary to the next. 486 * (Post-incrementing iteration.) 487 * "Unsafe" macro, assumes well-formed UTF-8. 488 * 489 * @param s const uint8_t * string 490 * @param i string offset 491 * @see U8_FWD_1 492 * @stable ICU 2.4 493 */ 494 #define U8_FWD_1_UNSAFE(s, i) { \ 495 (i)+=1+U8_COUNT_TRAIL_BYTES_UNSAFE((uint8_t)(s)[i]); \ 496 } 497 498 /** 499 * Advance the string offset from one code point boundary to the next. 500 * (Post-incrementing iteration.) 501 * "Safe" macro, checks for illegal sequences and for string boundaries. 502 * 503 * The length can be negative for a NUL-terminated string. 504 * 505 * @param s const uint8_t * string 506 * @param i int32_t string offset, must be i<length 507 * @param length int32_t string length 508 * @see U8_FWD_1_UNSAFE 509 * @stable ICU 2.4 510 */ 511 #define U8_FWD_1(s, i, length) { \ 512 uint8_t __b=(uint8_t)(s)[(i)++]; \ 513 if(U8_IS_LEAD(__b)) { \ 514 uint8_t __count=U8_COUNT_TRAIL_BYTES(__b); \ 515 if((i)+__count>(length) && (length)>=0) { \ 516 __count=(uint8_t)((length)-(i)); \ 517 } \ 518 while(__count>0 && U8_IS_TRAIL((s)[i])) { \ 519 ++(i); \ 520 --__count; \ 521 } \ 522 } \ 523 } 524 525 /** 526 * Advance the string offset from one code point boundary to the n-th next one, 527 * i.e., move forward by n code points. 528 * (Post-incrementing iteration.) 529 * "Unsafe" macro, assumes well-formed UTF-8. 530 * 531 * @param s const uint8_t * string 532 * @param i string offset 533 * @param n number of code points to skip 534 * @see U8_FWD_N 535 * @stable ICU 2.4 536 */ 537 #define U8_FWD_N_UNSAFE(s, i, n) { \ 538 int32_t __N=(n); \ 539 while(__N>0) { \ 540 U8_FWD_1_UNSAFE(s, i); \ 541 --__N; \ 542 } \ 543 } 544 545 /** 546 * Advance the string offset from one code point boundary to the n-th next one, 547 * i.e., move forward by n code points. 548 * (Post-incrementing iteration.) 549 * "Safe" macro, checks for illegal sequences and for string boundaries. 550 * 551 * The length can be negative for a NUL-terminated string. 552 * 553 * @param s const uint8_t * string 554 * @param i int32_t string offset, must be i<length 555 * @param length int32_t string length 556 * @param n number of code points to skip 557 * @see U8_FWD_N_UNSAFE 558 * @stable ICU 2.4 559 */ 560 #define U8_FWD_N(s, i, length, n) { \ 561 int32_t __N=(n); \ 562 while(__N>0 && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \ 563 U8_FWD_1(s, i, length); \ 564 --__N; \ 565 } \ 566 } 567 568 /** 569 * Adjust a random-access offset to a code point boundary 570 * at the start of a code point. 571 * If the offset points to a UTF-8 trail byte, 572 * then the offset is moved backward to the corresponding lead byte. 573 * Otherwise, it is not modified. 574 * "Unsafe" macro, assumes well-formed UTF-8. 575 * 576 * @param s const uint8_t * string 577 * @param i string offset 578 * @see U8_SET_CP_START 579 * @stable ICU 2.4 580 */ 581 #define U8_SET_CP_START_UNSAFE(s, i) { \ 582 while(U8_IS_TRAIL((s)[i])) { --(i); } \ 583 } 584 585 /** 586 * Adjust a random-access offset to a code point boundary 587 * at the start of a code point. 588 * If the offset points to a UTF-8 trail byte, 589 * then the offset is moved backward to the corresponding lead byte. 590 * Otherwise, it is not modified. 591 * "Safe" macro, checks for illegal sequences and for string boundaries. 592 * 593 * @param s const uint8_t * string 594 * @param start int32_t starting string offset (usually 0) 595 * @param i int32_t string offset, must be start<=i 596 * @see U8_SET_CP_START_UNSAFE 597 * @stable ICU 2.4 598 */ 599 #define U8_SET_CP_START(s, start, i) { \ 600 if(U8_IS_TRAIL((s)[(i)])) { \ 601 (i)=utf8_back1SafeBody(s, start, (i)); \ 602 } \ 603 } 604 605 /* definitions with backward iteration -------------------------------------- */ 606 607 /** 608 * Move the string offset from one code point boundary to the previous one 609 * and get the code point between them. 610 * (Pre-decrementing backward iteration.) 611 * "Unsafe" macro, assumes well-formed UTF-8. 612 * 613 * The input offset may be the same as the string length. 614 * If the offset is behind a multi-byte sequence, then the macro will read 615 * the whole sequence. 616 * If the offset is behind a lead byte, then that itself 617 * will be returned as the code point. 618 * The result is undefined if the offset is behind an illegal UTF-8 sequence. 619 * 620 * @param s const uint8_t * string 621 * @param i string offset 622 * @param c output UChar32 variable 623 * @see U8_PREV 624 * @stable ICU 2.4 625 */ 626 #define U8_PREV_UNSAFE(s, i, c) { \ 627 (c)=(uint8_t)(s)[--(i)]; \ 628 if(U8_IS_TRAIL(c)) { \ 629 uint8_t __b, __count=1, __shift=6; \ 630 \ 631 /* c is a trail byte */ \ 632 (c)&=0x3f; \ 633 for(;;) { \ 634 __b=(uint8_t)(s)[--(i)]; \ 635 if(__b>=0xc0) { \ 636 U8_MASK_LEAD_BYTE(__b, __count); \ 637 (c)|=(UChar32)__b<<__shift; \ 638 break; \ 639 } else { \ 640 (c)|=(UChar32)(__b&0x3f)<<__shift; \ 641 ++__count; \ 642 __shift+=6; \ 643 } \ 644 } \ 645 } \ 646 } 647 648 /** 649 * Move the string offset from one code point boundary to the previous one 650 * and get the code point between them. 651 * (Pre-decrementing backward iteration.) 652 * "Safe" macro, checks for illegal sequences and for string boundaries. 653 * 654 * The input offset may be the same as the string length. 655 * If the offset is behind a multi-byte sequence, then the macro will read 656 * the whole sequence. 657 * If the offset is behind a lead byte, then that itself 658 * will be returned as the code point. 659 * If the offset is behind an illegal UTF-8 sequence, then c is set to a negative value. 660 * 661 * @param s const uint8_t * string 662 * @param start int32_t starting string offset (usually 0) 663 * @param i int32_t string offset, must be start<i 664 * @param c output UChar32 variable, set to <0 in case of an error 665 * @see U8_PREV_UNSAFE 666 * @stable ICU 2.4 667 */ 668 #define U8_PREV(s, start, i, c) { \ 669 (c)=(uint8_t)(s)[--(i)]; \ 670 if((c)>=0x80) { \ 671 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -1); \ 672 } \ 673 } 674 675 /** 676 * Move the string offset from one code point boundary to the previous one 677 * and get the code point between them. 678 * (Pre-decrementing backward iteration.) 679 * "Safe" macro, checks for illegal sequences and for string boundaries. 680 * 681 * The input offset may be the same as the string length. 682 * If the offset is behind a multi-byte sequence, then the macro will read 683 * the whole sequence. 684 * If the offset is behind a lead byte, then that itself 685 * will be returned as the code point. 686 * If the offset is behind an illegal UTF-8 sequence, then c is set to U+FFFD. 687 * 688 * This macro does not distinguish between a real U+FFFD in the text 689 * and U+FFFD returned for an ill-formed sequence. 690 * Use U8_PREV() if that distinction is important. 691 * 692 * @param s const uint8_t * string 693 * @param start int32_t starting string offset (usually 0) 694 * @param i int32_t string offset, must be start<i 695 * @param c output UChar32 variable, set to U+FFFD in case of an error 696 * @see U8_PREV 697 * @stable ICU 51 698 */ 699 #define U8_PREV_OR_FFFD(s, start, i, c) { \ 700 (c)=(uint8_t)(s)[--(i)]; \ 701 if((c)>=0x80) { \ 702 (c)=utf8_prevCharSafeBody((const uint8_t *)s, start, &(i), c, -3); \ 703 } \ 704 } 705 706 /** 707 * Move the string offset from one code point boundary to the previous one. 708 * (Pre-decrementing backward iteration.) 709 * The input offset may be the same as the string length. 710 * "Unsafe" macro, assumes well-formed UTF-8. 711 * 712 * @param s const uint8_t * string 713 * @param i string offset 714 * @see U8_BACK_1 715 * @stable ICU 2.4 716 */ 717 #define U8_BACK_1_UNSAFE(s, i) { \ 718 while(U8_IS_TRAIL((s)[--(i)])) {} \ 719 } 720 721 /** 722 * Move the string offset from one code point boundary to the previous one. 723 * (Pre-decrementing backward iteration.) 724 * The input offset may be the same as the string length. 725 * "Safe" macro, checks for illegal sequences and for string boundaries. 726 * 727 * @param s const uint8_t * string 728 * @param start int32_t starting string offset (usually 0) 729 * @param i int32_t string offset, must be start<i 730 * @see U8_BACK_1_UNSAFE 731 * @stable ICU 2.4 732 */ 733 #define U8_BACK_1(s, start, i) { \ 734 if(U8_IS_TRAIL((s)[--(i)])) { \ 735 (i)=utf8_back1SafeBody(s, start, (i)); \ 736 } \ 737 } 738 739 /** 740 * Move the string offset from one code point boundary to the n-th one before it, 741 * i.e., move backward by n code points. 742 * (Pre-decrementing backward iteration.) 743 * The input offset may be the same as the string length. 744 * "Unsafe" macro, assumes well-formed UTF-8. 745 * 746 * @param s const uint8_t * string 747 * @param i string offset 748 * @param n number of code points to skip 749 * @see U8_BACK_N 750 * @stable ICU 2.4 751 */ 752 #define U8_BACK_N_UNSAFE(s, i, n) { \ 753 int32_t __N=(n); \ 754 while(__N>0) { \ 755 U8_BACK_1_UNSAFE(s, i); \ 756 --__N; \ 757 } \ 758 } 759 760 /** 761 * Move the string offset from one code point boundary to the n-th one before it, 762 * i.e., move backward by n code points. 763 * (Pre-decrementing backward iteration.) 764 * The input offset may be the same as the string length. 765 * "Safe" macro, checks for illegal sequences and for string boundaries. 766 * 767 * @param s const uint8_t * string 768 * @param start int32_t index of the start of the string 769 * @param i int32_t string offset, must be start<i 770 * @param n number of code points to skip 771 * @see U8_BACK_N_UNSAFE 772 * @stable ICU 2.4 773 */ 774 #define U8_BACK_N(s, start, i, n) { \ 775 int32_t __N=(n); \ 776 while(__N>0 && (i)>(start)) { \ 777 U8_BACK_1(s, start, i); \ 778 --__N; \ 779 } \ 780 } 781 782 /** 783 * Adjust a random-access offset to a code point boundary after a code point. 784 * If the offset is behind a partial multi-byte sequence, 785 * then the offset is incremented to behind the whole sequence. 786 * Otherwise, it is not modified. 787 * The input offset may be the same as the string length. 788 * "Unsafe" macro, assumes well-formed UTF-8. 789 * 790 * @param s const uint8_t * string 791 * @param i string offset 792 * @see U8_SET_CP_LIMIT 793 * @stable ICU 2.4 794 */ 795 #define U8_SET_CP_LIMIT_UNSAFE(s, i) { \ 796 U8_BACK_1_UNSAFE(s, i); \ 797 U8_FWD_1_UNSAFE(s, i); \ 798 } 799 800 /** 801 * Adjust a random-access offset to a code point boundary after a code point. 802 * If the offset is behind a partial multi-byte sequence, 803 * then the offset is incremented to behind the whole sequence. 804 * Otherwise, it is not modified. 805 * The input offset may be the same as the string length. 806 * "Safe" macro, checks for illegal sequences and for string boundaries. 807 * 808 * The length can be negative for a NUL-terminated string. 809 * 810 * @param s const uint8_t * string 811 * @param start int32_t starting string offset (usually 0) 812 * @param i int32_t string offset, must be start<=i<=length 813 * @param length int32_t string length 814 * @see U8_SET_CP_LIMIT_UNSAFE 815 * @stable ICU 2.4 816 */ 817 #define U8_SET_CP_LIMIT(s, start, i, length) { \ 818 if((start)<(i) && ((i)<(length) || ((length)<0 && (s)[i]!=0))) { \ 819 U8_BACK_1(s, start, i); \ 820 U8_FWD_1(s, i, length); \ 821 } \ 822 } 823 824 #endif 825