1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2002-2011 International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: uiter.h 11 * encoding: US-ASCII 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2002jan18 16 * created by: Markus W. Scherer 17 */ 18 19 #ifndef __UITER_H__ 20 #define __UITER_H__ 21 22 /** 23 * \file 24 * \brief C API: Unicode Character Iteration 25 * 26 * @see UCharIterator 27 */ 28 29 #include "unicode/utypes.h" 30 31 #if U_SHOW_CPLUSPLUS_API 32 U_NAMESPACE_BEGIN 33 34 class CharacterIterator; 35 class Replaceable; 36 37 U_NAMESPACE_END 38 #endif 39 40 U_CDECL_BEGIN 41 42 struct UCharIterator; 43 typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */ 44 45 /** 46 * Origin constants for UCharIterator.getIndex() and UCharIterator.move(). 47 * @see UCharIteratorMove 48 * @see UCharIterator 49 * @stable ICU 2.1 50 */ 51 typedef enum UCharIteratorOrigin { 52 UITER_START, UITER_CURRENT, UITER_LIMIT, UITER_ZERO, UITER_LENGTH 53 } UCharIteratorOrigin; 54 55 /** Constants for UCharIterator. @stable ICU 2.6 */ 56 enum { 57 /** 58 * Constant value that may be returned by UCharIteratorMove 59 * indicating that the final UTF-16 index is not known, but that the move succeeded. 60 * This can occur when moving relative to limit or length, or 61 * when moving relative to the current index after a setState() 62 * when the current UTF-16 index is not known. 63 * 64 * It would be very inefficient to have to count from the beginning of the text 65 * just to get the current/limit/length index after moving relative to it. 66 * The actual index can be determined with getIndex(UITER_CURRENT) 67 * which will count the UChars if necessary. 68 * 69 * @stable ICU 2.6 70 */ 71 UITER_UNKNOWN_INDEX=-2 72 }; 73 74 75 /** 76 * Constant for UCharIterator getState() indicating an error or 77 * an unknown state. 78 * Returned by uiter_getState()/UCharIteratorGetState 79 * when an error occurs. 80 * Also, some UCharIterator implementations may not be able to return 81 * a valid state for each position. This will be clearly documented 82 * for each such iterator (none of the public ones here). 83 * 84 * @stable ICU 2.6 85 */ 86 #define UITER_NO_STATE ((uint32_t)0xffffffff) 87 88 /** 89 * Function type declaration for UCharIterator.getIndex(). 90 * 91 * Gets the current position, or the start or limit of the 92 * iteration range. 93 * 94 * This function may perform slowly for UITER_CURRENT after setState() was called, 95 * or for UITER_LENGTH, because an iterator implementation may have to count 96 * UChars if the underlying storage is not UTF-16. 97 * 98 * @param iter the UCharIterator structure ("this pointer") 99 * @param origin get the 0, start, limit, length, or current index 100 * @return the requested index, or U_SENTINEL in an error condition 101 * 102 * @see UCharIteratorOrigin 103 * @see UCharIterator 104 * @stable ICU 2.1 105 */ 106 typedef int32_t U_CALLCONV 107 UCharIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin); 108 109 /** 110 * Function type declaration for UCharIterator.move(). 111 * 112 * Use iter->move(iter, index, UITER_ZERO) like CharacterIterator::setIndex(index). 113 * 114 * Moves the current position relative to the start or limit of the 115 * iteration range, or relative to the current position itself. 116 * The movement is expressed in numbers of code units forward 117 * or backward by specifying a positive or negative delta. 118 * Out of bounds movement will be pinned to the start or limit. 119 * 120 * This function may perform slowly for moving relative to UITER_LENGTH 121 * because an iterator implementation may have to count the rest of the 122 * UChars if the native storage is not UTF-16. 123 * 124 * When moving relative to the limit or length, or 125 * relative to the current position after setState() was called, 126 * move() may return UITER_UNKNOWN_INDEX (-2) to avoid an inefficient 127 * determination of the actual UTF-16 index. 128 * The actual index can be determined with getIndex(UITER_CURRENT) 129 * which will count the UChars if necessary. 130 * See UITER_UNKNOWN_INDEX for details. 131 * 132 * @param iter the UCharIterator structure ("this pointer") 133 * @param delta can be positive, zero, or negative 134 * @param origin move relative to the 0, start, limit, length, or current index 135 * @return the new index, or U_SENTINEL on an error condition, 136 * or UITER_UNKNOWN_INDEX when the index is not known. 137 * 138 * @see UCharIteratorOrigin 139 * @see UCharIterator 140 * @see UITER_UNKNOWN_INDEX 141 * @stable ICU 2.1 142 */ 143 typedef int32_t U_CALLCONV 144 UCharIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin); 145 146 /** 147 * Function type declaration for UCharIterator.hasNext(). 148 * 149 * Check if current() and next() can still 150 * return another code unit. 151 * 152 * @param iter the UCharIterator structure ("this pointer") 153 * @return boolean value for whether current() and next() can still return another code unit 154 * 155 * @see UCharIterator 156 * @stable ICU 2.1 157 */ 158 typedef UBool U_CALLCONV 159 UCharIteratorHasNext(UCharIterator *iter); 160 161 /** 162 * Function type declaration for UCharIterator.hasPrevious(). 163 * 164 * Check if previous() can still return another code unit. 165 * 166 * @param iter the UCharIterator structure ("this pointer") 167 * @return boolean value for whether previous() can still return another code unit 168 * 169 * @see UCharIterator 170 * @stable ICU 2.1 171 */ 172 typedef UBool U_CALLCONV 173 UCharIteratorHasPrevious(UCharIterator *iter); 174 175 /** 176 * Function type declaration for UCharIterator.current(). 177 * 178 * Return the code unit at the current position, 179 * or U_SENTINEL if there is none (index is at the limit). 180 * 181 * @param iter the UCharIterator structure ("this pointer") 182 * @return the current code unit 183 * 184 * @see UCharIterator 185 * @stable ICU 2.1 186 */ 187 typedef UChar32 U_CALLCONV 188 UCharIteratorCurrent(UCharIterator *iter); 189 190 /** 191 * Function type declaration for UCharIterator.next(). 192 * 193 * Return the code unit at the current index and increment 194 * the index (post-increment, like s[i++]), 195 * or return U_SENTINEL if there is none (index is at the limit). 196 * 197 * @param iter the UCharIterator structure ("this pointer") 198 * @return the current code unit (and post-increment the current index) 199 * 200 * @see UCharIterator 201 * @stable ICU 2.1 202 */ 203 typedef UChar32 U_CALLCONV 204 UCharIteratorNext(UCharIterator *iter); 205 206 /** 207 * Function type declaration for UCharIterator.previous(). 208 * 209 * Decrement the index and return the code unit from there 210 * (pre-decrement, like s[--i]), 211 * or return U_SENTINEL if there is none (index is at the start). 212 * 213 * @param iter the UCharIterator structure ("this pointer") 214 * @return the previous code unit (after pre-decrementing the current index) 215 * 216 * @see UCharIterator 217 * @stable ICU 2.1 218 */ 219 typedef UChar32 U_CALLCONV 220 UCharIteratorPrevious(UCharIterator *iter); 221 222 /** 223 * Function type declaration for UCharIterator.reservedFn(). 224 * Reserved for future use. 225 * 226 * @param iter the UCharIterator structure ("this pointer") 227 * @param something some integer argument 228 * @return some integer 229 * 230 * @see UCharIterator 231 * @stable ICU 2.1 232 */ 233 typedef int32_t U_CALLCONV 234 UCharIteratorReserved(UCharIterator *iter, int32_t something); 235 236 /** 237 * Function type declaration for UCharIterator.getState(). 238 * 239 * Get the "state" of the iterator in the form of a single 32-bit word. 240 * It is recommended that the state value be calculated to be as small as 241 * is feasible. For strings with limited lengths, fewer than 32 bits may 242 * be sufficient. 243 * 244 * This is used together with setState()/UCharIteratorSetState 245 * to save and restore the iterator position more efficiently than with 246 * getIndex()/move(). 247 * 248 * The iterator state is defined as a uint32_t value because it is designed 249 * for use in ucol_nextSortKeyPart() which provides 32 bits to store the state 250 * of the character iterator. 251 * 252 * With some UCharIterator implementations (e.g., UTF-8), 253 * getting and setting the UTF-16 index with existing functions 254 * (getIndex(UITER_CURRENT) followed by move(pos, UITER_ZERO)) is possible but 255 * relatively slow because the iterator has to "walk" from a known index 256 * to the requested one. 257 * This takes more time the farther it needs to go. 258 * 259 * An opaque state value allows an iterator implementation to provide 260 * an internal index (UTF-8: the source byte array index) for 261 * fast, constant-time restoration. 262 * 263 * After calling setState(), a getIndex(UITER_CURRENT) may be slow because 264 * the UTF-16 index may not be restored as well, but the iterator can deliver 265 * the correct text contents and move relative to the current position 266 * without performance degradation. 267 * 268 * Some UCharIterator implementations may not be able to return 269 * a valid state for each position, in which case they return UITER_NO_STATE instead. 270 * This will be clearly documented for each such iterator (none of the public ones here). 271 * 272 * @param iter the UCharIterator structure ("this pointer") 273 * @return the state word 274 * 275 * @see UCharIterator 276 * @see UCharIteratorSetState 277 * @see UITER_NO_STATE 278 * @stable ICU 2.6 279 */ 280 typedef uint32_t U_CALLCONV 281 UCharIteratorGetState(const UCharIterator *iter); 282 283 /** 284 * Function type declaration for UCharIterator.setState(). 285 * 286 * Restore the "state" of the iterator using a state word from a getState() call. 287 * The iterator object need not be the same one as for which getState() was called, 288 * but it must be of the same type (set up using the same uiter_setXYZ function) 289 * and it must iterate over the same string 290 * (binary identical regardless of memory address). 291 * For more about the state word see UCharIteratorGetState. 292 * 293 * After calling setState(), a getIndex(UITER_CURRENT) may be slow because 294 * the UTF-16 index may not be restored as well, but the iterator can deliver 295 * the correct text contents and move relative to the current position 296 * without performance degradation. 297 * 298 * @param iter the UCharIterator structure ("this pointer") 299 * @param state the state word from a getState() call 300 * on a same-type, same-string iterator 301 * @param pErrorCode Must be a valid pointer to an error code value, 302 * which must not indicate a failure before the function call. 303 * 304 * @see UCharIterator 305 * @see UCharIteratorGetState 306 * @stable ICU 2.6 307 */ 308 typedef void U_CALLCONV 309 UCharIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode); 310 311 312 /** 313 * C API for code unit iteration. 314 * This can be used as a C wrapper around 315 * CharacterIterator, Replaceable, or implemented using simple strings, etc. 316 * 317 * There are two roles for using UCharIterator: 318 * 319 * A "provider" sets the necessary function pointers and controls the "protected" 320 * fields of the UCharIterator structure. A "provider" passes a UCharIterator 321 * into C APIs that need a UCharIterator as an abstract, flexible string interface. 322 * 323 * Implementations of such C APIs are "callers" of UCharIterator functions; 324 * they only use the "public" function pointers and never access the "protected" 325 * fields directly. 326 * 327 * The current() and next() functions only check the current index against the 328 * limit, and previous() only checks the current index against the start, 329 * to see if the iterator already reached the end of the iteration range. 330 * 331 * The assumption - in all iterators - is that the index is moved via the API, 332 * which means it won't go out of bounds, or the index is modified by 333 * user code that knows enough about the iterator implementation to set valid 334 * index values. 335 * 336 * UCharIterator functions return code unit values 0..0xffff, 337 * or U_SENTINEL if the iteration bounds are reached. 338 * 339 * @stable ICU 2.1 340 */ 341 struct UCharIterator { 342 /** 343 * (protected) Pointer to string or wrapped object or similar. 344 * Not used by caller. 345 * @stable ICU 2.1 346 */ 347 const void *context; 348 349 /** 350 * (protected) Length of string or similar. 351 * Not used by caller. 352 * @stable ICU 2.1 353 */ 354 int32_t length; 355 356 /** 357 * (protected) Start index or similar. 358 * Not used by caller. 359 * @stable ICU 2.1 360 */ 361 int32_t start; 362 363 /** 364 * (protected) Current index or similar. 365 * Not used by caller. 366 * @stable ICU 2.1 367 */ 368 int32_t index; 369 370 /** 371 * (protected) Limit index or similar. 372 * Not used by caller. 373 * @stable ICU 2.1 374 */ 375 int32_t limit; 376 377 /** 378 * (protected) Used by UTF-8 iterators and possibly others. 379 * @stable ICU 2.1 380 */ 381 int32_t reservedField; 382 383 /** 384 * (public) Returns the current position or the 385 * start or limit index of the iteration range. 386 * 387 * @see UCharIteratorGetIndex 388 * @stable ICU 2.1 389 */ 390 UCharIteratorGetIndex *getIndex; 391 392 /** 393 * (public) Moves the current position relative to the start or limit of the 394 * iteration range, or relative to the current position itself. 395 * The movement is expressed in numbers of code units forward 396 * or backward by specifying a positive or negative delta. 397 * 398 * @see UCharIteratorMove 399 * @stable ICU 2.1 400 */ 401 UCharIteratorMove *move; 402 403 /** 404 * (public) Check if current() and next() can still 405 * return another code unit. 406 * 407 * @see UCharIteratorHasNext 408 * @stable ICU 2.1 409 */ 410 UCharIteratorHasNext *hasNext; 411 412 /** 413 * (public) Check if previous() can still return another code unit. 414 * 415 * @see UCharIteratorHasPrevious 416 * @stable ICU 2.1 417 */ 418 UCharIteratorHasPrevious *hasPrevious; 419 420 /** 421 * (public) Return the code unit at the current position, 422 * or U_SENTINEL if there is none (index is at the limit). 423 * 424 * @see UCharIteratorCurrent 425 * @stable ICU 2.1 426 */ 427 UCharIteratorCurrent *current; 428 429 /** 430 * (public) Return the code unit at the current index and increment 431 * the index (post-increment, like s[i++]), 432 * or return U_SENTINEL if there is none (index is at the limit). 433 * 434 * @see UCharIteratorNext 435 * @stable ICU 2.1 436 */ 437 UCharIteratorNext *next; 438 439 /** 440 * (public) Decrement the index and return the code unit from there 441 * (pre-decrement, like s[--i]), 442 * or return U_SENTINEL if there is none (index is at the start). 443 * 444 * @see UCharIteratorPrevious 445 * @stable ICU 2.1 446 */ 447 UCharIteratorPrevious *previous; 448 449 /** 450 * (public) Reserved for future use. Currently NULL. 451 * 452 * @see UCharIteratorReserved 453 * @stable ICU 2.1 454 */ 455 UCharIteratorReserved *reservedFn; 456 457 /** 458 * (public) Return the state of the iterator, to be restored later with setState(). 459 * This function pointer is NULL if the iterator does not implement it. 460 * 461 * @see UCharIteratorGet 462 * @stable ICU 2.6 463 */ 464 UCharIteratorGetState *getState; 465 466 /** 467 * (public) Restore the iterator state from the state word from a call 468 * to getState(). 469 * This function pointer is NULL if the iterator does not implement it. 470 * 471 * @see UCharIteratorSet 472 * @stable ICU 2.6 473 */ 474 UCharIteratorSetState *setState; 475 }; 476 477 /** 478 * Helper function for UCharIterator to get the code point 479 * at the current index. 480 * 481 * Return the code point that includes the code unit at the current position, 482 * or U_SENTINEL if there is none (index is at the limit). 483 * If the current code unit is a lead or trail surrogate, 484 * then the following or preceding surrogate is used to form 485 * the code point value. 486 * 487 * @param iter the UCharIterator structure ("this pointer") 488 * @return the current code point 489 * 490 * @see UCharIterator 491 * @see U16_GET 492 * @see UnicodeString::char32At() 493 * @stable ICU 2.1 494 */ 495 U_STABLE UChar32 U_EXPORT2 496 uiter_current32(UCharIterator *iter); 497 498 /** 499 * Helper function for UCharIterator to get the next code point. 500 * 501 * Return the code point at the current index and increment 502 * the index (post-increment, like s[i++]), 503 * or return U_SENTINEL if there is none (index is at the limit). 504 * 505 * @param iter the UCharIterator structure ("this pointer") 506 * @return the current code point (and post-increment the current index) 507 * 508 * @see UCharIterator 509 * @see U16_NEXT 510 * @stable ICU 2.1 511 */ 512 U_STABLE UChar32 U_EXPORT2 513 uiter_next32(UCharIterator *iter); 514 515 /** 516 * Helper function for UCharIterator to get the previous code point. 517 * 518 * Decrement the index and return the code point from there 519 * (pre-decrement, like s[--i]), 520 * or return U_SENTINEL if there is none (index is at the start). 521 * 522 * @param iter the UCharIterator structure ("this pointer") 523 * @return the previous code point (after pre-decrementing the current index) 524 * 525 * @see UCharIterator 526 * @see U16_PREV 527 * @stable ICU 2.1 528 */ 529 U_STABLE UChar32 U_EXPORT2 530 uiter_previous32(UCharIterator *iter); 531 532 /** 533 * Get the "state" of the iterator in the form of a single 32-bit word. 534 * This is a convenience function that calls iter->getState(iter) 535 * if iter->getState is not NULL; 536 * if it is NULL or any other error occurs, then UITER_NO_STATE is returned. 537 * 538 * Some UCharIterator implementations may not be able to return 539 * a valid state for each position, in which case they return UITER_NO_STATE instead. 540 * This will be clearly documented for each such iterator (none of the public ones here). 541 * 542 * @param iter the UCharIterator structure ("this pointer") 543 * @return the state word 544 * 545 * @see UCharIterator 546 * @see UCharIteratorGetState 547 * @see UITER_NO_STATE 548 * @stable ICU 2.6 549 */ 550 U_STABLE uint32_t U_EXPORT2 551 uiter_getState(const UCharIterator *iter); 552 553 /** 554 * Restore the "state" of the iterator using a state word from a getState() call. 555 * This is a convenience function that calls iter->setState(iter, state, pErrorCode) 556 * if iter->setState is not NULL; if it is NULL, then U_UNSUPPORTED_ERROR is set. 557 * 558 * @param iter the UCharIterator structure ("this pointer") 559 * @param state the state word from a getState() call 560 * on a same-type, same-string iterator 561 * @param pErrorCode Must be a valid pointer to an error code value, 562 * which must not indicate a failure before the function call. 563 * 564 * @see UCharIterator 565 * @see UCharIteratorSetState 566 * @stable ICU 2.6 567 */ 568 U_STABLE void U_EXPORT2 569 uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode); 570 571 /** 572 * Set up a UCharIterator to iterate over a string. 573 * 574 * Sets the UCharIterator function pointers for iteration over the string s 575 * with iteration boundaries start=index=0 and length=limit=string length. 576 * The "provider" may set the start, index, and limit values at any time 577 * within the range 0..length. 578 * The length field will be ignored. 579 * 580 * The string pointer s is set into UCharIterator.context without copying 581 * or reallocating the string contents. 582 * 583 * getState() simply returns the current index. 584 * move() will always return the final index. 585 * 586 * @param iter UCharIterator structure to be set for iteration 587 * @param s String to iterate over 588 * @param length Length of s, or -1 if NUL-terminated 589 * 590 * @see UCharIterator 591 * @stable ICU 2.1 592 */ 593 U_STABLE void U_EXPORT2 594 uiter_setString(UCharIterator *iter, const UChar *s, int32_t length); 595 596 /** 597 * Set up a UCharIterator to iterate over a UTF-16BE string 598 * (byte vector with a big-endian pair of bytes per UChar). 599 * 600 * Everything works just like with a normal UChar iterator (uiter_setString), 601 * except that UChars are assembled from byte pairs, 602 * and that the length argument here indicates an even number of bytes. 603 * 604 * getState() simply returns the current index. 605 * move() will always return the final index. 606 * 607 * @param iter UCharIterator structure to be set for iteration 608 * @param s UTF-16BE string to iterate over 609 * @param length Length of s as an even number of bytes, or -1 if NUL-terminated 610 * (NUL means pair of 0 bytes at even index from s) 611 * 612 * @see UCharIterator 613 * @see uiter_setString 614 * @stable ICU 2.6 615 */ 616 U_STABLE void U_EXPORT2 617 uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length); 618 619 /** 620 * Set up a UCharIterator to iterate over a UTF-8 string. 621 * 622 * Sets the UCharIterator function pointers for iteration over the UTF-8 string s 623 * with UTF-8 iteration boundaries 0 and length. 624 * The implementation counts the UTF-16 index on the fly and 625 * lazily evaluates the UTF-16 length of the text. 626 * 627 * The start field is used as the UTF-8 offset, the limit field as the UTF-8 length. 628 * When the reservedField is not 0, then it contains a supplementary code point 629 * and the UTF-16 index is between the two corresponding surrogates. 630 * At that point, the UTF-8 index is behind that code point. 631 * 632 * The UTF-8 string pointer s is set into UCharIterator.context without copying 633 * or reallocating the string contents. 634 * 635 * getState() returns a state value consisting of 636 * - the current UTF-8 source byte index (bits 31..1) 637 * - a flag (bit 0) that indicates whether the UChar position is in the middle 638 * of a surrogate pair 639 * (from a 4-byte UTF-8 sequence for the corresponding supplementary code point) 640 * 641 * getState() cannot also encode the UTF-16 index in the state value. 642 * move(relative to limit or length), or 643 * move(relative to current) after setState(), may return UITER_UNKNOWN_INDEX. 644 * 645 * @param iter UCharIterator structure to be set for iteration 646 * @param s UTF-8 string to iterate over 647 * @param length Length of s in bytes, or -1 if NUL-terminated 648 * 649 * @see UCharIterator 650 * @stable ICU 2.6 651 */ 652 U_STABLE void U_EXPORT2 653 uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length); 654 655 #if U_SHOW_CPLUSPLUS_API 656 657 /** 658 * Set up a UCharIterator to wrap around a C++ CharacterIterator. 659 * 660 * Sets the UCharIterator function pointers for iteration using the 661 * CharacterIterator charIter. 662 * 663 * The CharacterIterator pointer charIter is set into UCharIterator.context 664 * without copying or cloning the CharacterIterator object. 665 * The other "protected" UCharIterator fields are set to 0 and will be ignored. 666 * The iteration index and boundaries are controlled by the CharacterIterator. 667 * 668 * getState() simply returns the current index. 669 * move() will always return the final index. 670 * 671 * @param iter UCharIterator structure to be set for iteration 672 * @param charIter CharacterIterator to wrap 673 * 674 * @see UCharIterator 675 * @stable ICU 2.1 676 */ 677 U_STABLE void U_EXPORT2 678 uiter_setCharacterIterator(UCharIterator *iter, icu::CharacterIterator *charIter); 679 680 /** 681 * Set up a UCharIterator to iterate over a C++ Replaceable. 682 * 683 * Sets the UCharIterator function pointers for iteration over the 684 * Replaceable rep with iteration boundaries start=index=0 and 685 * length=limit=rep->length(). 686 * The "provider" may set the start, index, and limit values at any time 687 * within the range 0..length=rep->length(). 688 * The length field will be ignored. 689 * 690 * The Replaceable pointer rep is set into UCharIterator.context without copying 691 * or cloning/reallocating the Replaceable object. 692 * 693 * getState() simply returns the current index. 694 * move() will always return the final index. 695 * 696 * @param iter UCharIterator structure to be set for iteration 697 * @param rep Replaceable to iterate over 698 * 699 * @see UCharIterator 700 * @stable ICU 2.1 701 */ 702 U_STABLE void U_EXPORT2 703 uiter_setReplaceable(UCharIterator *iter, const icu::Replaceable *rep); 704 705 #endif 706 707 U_CDECL_END 708 709 #endif 710