1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2010-2012, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * file name: ucharstrie.h 9 * encoding: UTF-8 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2010nov14 14 * created by: Markus W. Scherer 15 */ 16 17 #ifndef __UCHARSTRIE_H__ 18 #define __UCHARSTRIE_H__ 19 20 /** 21 * \file 22 * \brief C++ API: Trie for mapping Unicode strings (or 16-bit-unit sequences) 23 * to integer values. 24 */ 25 26 #include "unicode/utypes.h" 27 28 #if U_SHOW_CPLUSPLUS_API 29 30 #include "unicode/unistr.h" 31 #include "unicode/uobject.h" 32 #include "unicode/ustringtrie.h" 33 34 U_NAMESPACE_BEGIN 35 36 class Appendable; 37 class UCharsTrieBuilder; 38 class UVector32; 39 40 /** 41 * Light-weight, non-const reader class for a UCharsTrie. 42 * Traverses a char16_t-serialized data structure with minimal state, 43 * for mapping strings (16-bit-unit sequences) to non-negative integer values. 44 * 45 * This class owns the serialized trie data only if it was constructed by 46 * the builder's build() method. 47 * The public constructor and the copy constructor only alias the data (only copy the pointer). 48 * There is no assignment operator. 49 * 50 * This class is not intended for public subclassing. 51 * @stable ICU 4.8 52 */ 53 class U_COMMON_API UCharsTrie : public UMemory { 54 public: 55 /** 56 * Constructs a UCharsTrie reader instance. 57 * 58 * The trieUChars must contain a copy of a char16_t sequence from the UCharsTrieBuilder, 59 * starting with the first char16_t of that sequence. 60 * The UCharsTrie object will not read more char16_ts than 61 * the UCharsTrieBuilder generated in the corresponding build() call. 62 * 63 * The array is not copied/cloned and must not be modified while 64 * the UCharsTrie object is in use. 65 * 66 * @param trieUChars The char16_t array that contains the serialized trie. 67 * @stable ICU 4.8 68 */ UCharsTrie(ConstChar16Ptr trieUChars)69 UCharsTrie(ConstChar16Ptr trieUChars) 70 : ownedArray_(NULL), uchars_(trieUChars), 71 pos_(uchars_), remainingMatchLength_(-1) {} 72 73 /** 74 * Destructor. 75 * @stable ICU 4.8 76 */ 77 ~UCharsTrie(); 78 79 /** 80 * Copy constructor, copies the other trie reader object and its state, 81 * but not the char16_t array which will be shared. (Shallow copy.) 82 * @param other Another UCharsTrie object. 83 * @stable ICU 4.8 84 */ UCharsTrie(const UCharsTrie & other)85 UCharsTrie(const UCharsTrie &other) 86 : ownedArray_(NULL), uchars_(other.uchars_), 87 pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {} 88 89 /** 90 * Resets this trie to its initial state. 91 * @return *this 92 * @stable ICU 4.8 93 */ reset()94 UCharsTrie &reset() { 95 pos_=uchars_; 96 remainingMatchLength_=-1; 97 return *this; 98 } 99 100 /** 101 * Returns the state of this trie as a 64-bit integer. 102 * The state value is never 0. 103 * 104 * @return opaque state value 105 * @see resetToState64 106 * @stable ICU 65 107 */ getState64()108 uint64_t getState64() const { 109 return (static_cast<uint64_t>(remainingMatchLength_ + 2) << kState64RemainingShift) | 110 (uint64_t)(pos_ - uchars_); 111 } 112 113 /** 114 * Resets this trie to the saved state. 115 * Unlike resetToState(State), the 64-bit state value 116 * must be from getState64() from the same trie object or 117 * from one initialized the exact same way. 118 * Because of no validation, this method is faster. 119 * 120 * @param state The opaque trie state value from getState64(). 121 * @return *this 122 * @see getState64 123 * @see resetToState 124 * @see reset 125 * @stable ICU 65 126 */ resetToState64(uint64_t state)127 UCharsTrie &resetToState64(uint64_t state) { 128 remainingMatchLength_ = static_cast<int32_t>(state >> kState64RemainingShift) - 2; 129 pos_ = uchars_ + (state & kState64PosMask); 130 return *this; 131 } 132 133 /** 134 * UCharsTrie state object, for saving a trie's current state 135 * and resetting the trie back to this state later. 136 * @stable ICU 4.8 137 */ 138 class State : public UMemory { 139 public: 140 /** 141 * Constructs an empty State. 142 * @stable ICU 4.8 143 */ State()144 State() { uchars=NULL; } 145 private: 146 friend class UCharsTrie; 147 148 const char16_t *uchars; 149 const char16_t *pos; 150 int32_t remainingMatchLength; 151 }; 152 153 /** 154 * Saves the state of this trie. 155 * @param state The State object to hold the trie's state. 156 * @return *this 157 * @see resetToState 158 * @stable ICU 4.8 159 */ saveState(State & state)160 const UCharsTrie &saveState(State &state) const { 161 state.uchars=uchars_; 162 state.pos=pos_; 163 state.remainingMatchLength=remainingMatchLength_; 164 return *this; 165 } 166 167 /** 168 * Resets this trie to the saved state. 169 * If the state object contains no state, or the state of a different trie, 170 * then this trie remains unchanged. 171 * @param state The State object which holds a saved trie state. 172 * @return *this 173 * @see saveState 174 * @see reset 175 * @stable ICU 4.8 176 */ resetToState(const State & state)177 UCharsTrie &resetToState(const State &state) { 178 if(uchars_==state.uchars && uchars_!=NULL) { 179 pos_=state.pos; 180 remainingMatchLength_=state.remainingMatchLength; 181 } 182 return *this; 183 } 184 185 /** 186 * Determines whether the string so far matches, whether it has a value, 187 * and whether another input char16_t can continue a matching string. 188 * @return The match/value Result. 189 * @stable ICU 4.8 190 */ 191 UStringTrieResult current() const; 192 193 /** 194 * Traverses the trie from the initial state for this input char16_t. 195 * Equivalent to reset().next(uchar). 196 * @param uchar Input char value. Values below 0 and above 0xffff will never match. 197 * @return The match/value Result. 198 * @stable ICU 4.8 199 */ first(int32_t uchar)200 inline UStringTrieResult first(int32_t uchar) { 201 remainingMatchLength_=-1; 202 return nextImpl(uchars_, uchar); 203 } 204 205 /** 206 * Traverses the trie from the initial state for the 207 * one or two UTF-16 code units for this input code point. 208 * Equivalent to reset().nextForCodePoint(cp). 209 * @param cp A Unicode code point 0..0x10ffff. 210 * @return The match/value Result. 211 * @stable ICU 4.8 212 */ 213 UStringTrieResult firstForCodePoint(UChar32 cp); 214 215 /** 216 * Traverses the trie from the current state for this input char16_t. 217 * @param uchar Input char value. Values below 0 and above 0xffff will never match. 218 * @return The match/value Result. 219 * @stable ICU 4.8 220 */ 221 UStringTrieResult next(int32_t uchar); 222 223 /** 224 * Traverses the trie from the current state for the 225 * one or two UTF-16 code units for this input code point. 226 * @param cp A Unicode code point 0..0x10ffff. 227 * @return The match/value Result. 228 * @stable ICU 4.8 229 */ 230 UStringTrieResult nextForCodePoint(UChar32 cp); 231 232 /** 233 * Traverses the trie from the current state for this string. 234 * Equivalent to 235 * \code 236 * Result result=current(); 237 * for(each c in s) 238 * if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH; 239 * result=next(c); 240 * return result; 241 * \endcode 242 * @param s A string. Can be NULL if length is 0. 243 * @param length The length of the string. Can be -1 if NUL-terminated. 244 * @return The match/value Result. 245 * @stable ICU 4.8 246 */ 247 UStringTrieResult next(ConstChar16Ptr s, int32_t length); 248 249 /** 250 * Returns a matching string's value if called immediately after 251 * current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE. 252 * getValue() can be called multiple times. 253 * 254 * Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE! 255 * @return The value for the string so far. 256 * @stable ICU 4.8 257 */ getValue()258 inline int32_t getValue() const { 259 const char16_t *pos=pos_; 260 int32_t leadUnit=*pos++; 261 // U_ASSERT(leadUnit>=kMinValueLead); 262 return leadUnit&kValueIsFinal ? 263 readValue(pos, leadUnit&0x7fff) : readNodeValue(pos, leadUnit); 264 } 265 266 /** 267 * Determines whether all strings reachable from the current state 268 * map to the same value. 269 * @param uniqueValue Receives the unique value, if this function returns true. 270 * (output-only) 271 * @return true if all strings reachable from the current state 272 * map to the same value. 273 * @stable ICU 4.8 274 */ hasUniqueValue(int32_t & uniqueValue)275 inline UBool hasUniqueValue(int32_t &uniqueValue) const { 276 const char16_t *pos=pos_; 277 // Skip the rest of a pending linear-match node. 278 return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, false, uniqueValue); 279 } 280 281 /** 282 * Finds each char16_t which continues the string from the current state. 283 * That is, each char16_t c for which it would be next(c)!=USTRINGTRIE_NO_MATCH now. 284 * @param out Each next char16_t is appended to this object. 285 * @return the number of char16_ts which continue the string from here 286 * @stable ICU 4.8 287 */ 288 int32_t getNextUChars(Appendable &out) const; 289 290 /** 291 * Iterator for all of the (string, value) pairs in a UCharsTrie. 292 * @stable ICU 4.8 293 */ 294 class U_COMMON_API Iterator : public UMemory { 295 public: 296 /** 297 * Iterates from the root of a char16_t-serialized UCharsTrie. 298 * @param trieUChars The trie char16_ts. 299 * @param maxStringLength If 0, the iterator returns full strings. 300 * Otherwise, the iterator returns strings with this maximum length. 301 * @param errorCode Standard ICU error code. Its input value must 302 * pass the U_SUCCESS() test, or else the function returns 303 * immediately. Check for U_FAILURE() on output or use with 304 * function chaining. (See User Guide for details.) 305 * @stable ICU 4.8 306 */ 307 Iterator(ConstChar16Ptr trieUChars, int32_t maxStringLength, UErrorCode &errorCode); 308 309 /** 310 * Iterates from the current state of the specified UCharsTrie. 311 * @param trie The trie whose state will be copied for iteration. 312 * @param maxStringLength If 0, the iterator returns full strings. 313 * Otherwise, the iterator returns strings with this maximum length. 314 * @param errorCode Standard ICU error code. Its input value must 315 * pass the U_SUCCESS() test, or else the function returns 316 * immediately. Check for U_FAILURE() on output or use with 317 * function chaining. (See User Guide for details.) 318 * @stable ICU 4.8 319 */ 320 Iterator(const UCharsTrie &trie, int32_t maxStringLength, UErrorCode &errorCode); 321 322 /** 323 * Destructor. 324 * @stable ICU 4.8 325 */ 326 ~Iterator(); 327 328 /** 329 * Resets this iterator to its initial state. 330 * @return *this 331 * @stable ICU 4.8 332 */ 333 Iterator &reset(); 334 335 /** 336 * @return true if there are more elements. 337 * @stable ICU 4.8 338 */ 339 UBool hasNext() const; 340 341 /** 342 * Finds the next (string, value) pair if there is one. 343 * 344 * If the string is truncated to the maximum length and does not 345 * have a real value, then the value is set to -1. 346 * In this case, this "not a real value" is indistinguishable from 347 * a real value of -1. 348 * @param errorCode Standard ICU error code. Its input value must 349 * pass the U_SUCCESS() test, or else the function returns 350 * immediately. Check for U_FAILURE() on output or use with 351 * function chaining. (See User Guide for details.) 352 * @return true if there is another element. 353 * @stable ICU 4.8 354 */ 355 UBool next(UErrorCode &errorCode); 356 357 /** 358 * @return The string for the last successful next(). 359 * @stable ICU 4.8 360 */ getString()361 const UnicodeString &getString() const { return str_; } 362 /** 363 * @return The value for the last successful next(). 364 * @stable ICU 4.8 365 */ getValue()366 int32_t getValue() const { return value_; } 367 368 private: truncateAndStop()369 UBool truncateAndStop() { 370 pos_=NULL; 371 value_=-1; // no real value for str 372 return true; 373 } 374 375 const char16_t *branchNext(const char16_t *pos, int32_t length, UErrorCode &errorCode); 376 377 const char16_t *uchars_; 378 const char16_t *pos_; 379 const char16_t *initialPos_; 380 int32_t remainingMatchLength_; 381 int32_t initialRemainingMatchLength_; 382 UBool skipValue_; // Skip intermediate value which was already delivered. 383 384 UnicodeString str_; 385 int32_t maxLength_; 386 int32_t value_; 387 388 // The stack stores pairs of integers for backtracking to another 389 // outbound edge of a branch node. 390 // The first integer is an offset from uchars_. 391 // The second integer has the str_.length() from before the node in bits 15..0, 392 // and the remaining branch length in bits 31..16. 393 // (We could store the remaining branch length minus 1 in bits 30..16 and not use the sign bit, 394 // but the code looks more confusing that way.) 395 UVector32 *stack_; 396 }; 397 398 private: 399 friend class UCharsTrieBuilder; 400 401 /** 402 * Constructs a UCharsTrie reader instance. 403 * Unlike the public constructor which just aliases an array, 404 * this constructor adopts the builder's array. 405 * This constructor is only called by the builder. 406 */ UCharsTrie(char16_t * adoptUChars,const char16_t * trieUChars)407 UCharsTrie(char16_t *adoptUChars, const char16_t *trieUChars) 408 : ownedArray_(adoptUChars), uchars_(trieUChars), 409 pos_(uchars_), remainingMatchLength_(-1) {} 410 411 // No assignment operator. 412 UCharsTrie &operator=(const UCharsTrie &other); 413 stop()414 inline void stop() { 415 pos_=NULL; 416 } 417 418 // Reads a compact 32-bit integer. 419 // pos is already after the leadUnit, and the lead unit has bit 15 reset. readValue(const char16_t * pos,int32_t leadUnit)420 static inline int32_t readValue(const char16_t *pos, int32_t leadUnit) { 421 int32_t value; 422 if(leadUnit<kMinTwoUnitValueLead) { 423 value=leadUnit; 424 } else if(leadUnit<kThreeUnitValueLead) { 425 value=((leadUnit-kMinTwoUnitValueLead)<<16)|*pos; 426 } else { 427 value=(pos[0]<<16)|pos[1]; 428 } 429 return value; 430 } skipValue(const char16_t * pos,int32_t leadUnit)431 static inline const char16_t *skipValue(const char16_t *pos, int32_t leadUnit) { 432 if(leadUnit>=kMinTwoUnitValueLead) { 433 if(leadUnit<kThreeUnitValueLead) { 434 ++pos; 435 } else { 436 pos+=2; 437 } 438 } 439 return pos; 440 } skipValue(const char16_t * pos)441 static inline const char16_t *skipValue(const char16_t *pos) { 442 int32_t leadUnit=*pos++; 443 return skipValue(pos, leadUnit&0x7fff); 444 } 445 readNodeValue(const char16_t * pos,int32_t leadUnit)446 static inline int32_t readNodeValue(const char16_t *pos, int32_t leadUnit) { 447 // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal); 448 int32_t value; 449 if(leadUnit<kMinTwoUnitNodeValueLead) { 450 value=(leadUnit>>6)-1; 451 } else if(leadUnit<kThreeUnitNodeValueLead) { 452 value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|*pos; 453 } else { 454 value=(pos[0]<<16)|pos[1]; 455 } 456 return value; 457 } skipNodeValue(const char16_t * pos,int32_t leadUnit)458 static inline const char16_t *skipNodeValue(const char16_t *pos, int32_t leadUnit) { 459 // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal); 460 if(leadUnit>=kMinTwoUnitNodeValueLead) { 461 if(leadUnit<kThreeUnitNodeValueLead) { 462 ++pos; 463 } else { 464 pos+=2; 465 } 466 } 467 return pos; 468 } 469 jumpByDelta(const char16_t * pos)470 static inline const char16_t *jumpByDelta(const char16_t *pos) { 471 int32_t delta=*pos++; 472 if(delta>=kMinTwoUnitDeltaLead) { 473 if(delta==kThreeUnitDeltaLead) { 474 delta=(pos[0]<<16)|pos[1]; 475 pos+=2; 476 } else { 477 delta=((delta-kMinTwoUnitDeltaLead)<<16)|*pos++; 478 } 479 } 480 return pos+delta; 481 } 482 skipDelta(const char16_t * pos)483 static const char16_t *skipDelta(const char16_t *pos) { 484 int32_t delta=*pos++; 485 if(delta>=kMinTwoUnitDeltaLead) { 486 if(delta==kThreeUnitDeltaLead) { 487 pos+=2; 488 } else { 489 ++pos; 490 } 491 } 492 return pos; 493 } 494 valueResult(int32_t node)495 static inline UStringTrieResult valueResult(int32_t node) { 496 return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node>>15)); 497 } 498 499 // Handles a branch node for both next(uchar) and next(string). 500 UStringTrieResult branchNext(const char16_t *pos, int32_t length, int32_t uchar); 501 502 // Requires remainingLength_<0. 503 UStringTrieResult nextImpl(const char16_t *pos, int32_t uchar); 504 505 // Helper functions for hasUniqueValue(). 506 // Recursively finds a unique value (or whether there is not a unique one) 507 // from a branch. 508 static const char16_t *findUniqueValueFromBranch(const char16_t *pos, int32_t length, 509 UBool haveUniqueValue, int32_t &uniqueValue); 510 // Recursively finds a unique value (or whether there is not a unique one) 511 // starting from a position on a node lead unit. 512 static UBool findUniqueValue(const char16_t *pos, UBool haveUniqueValue, int32_t &uniqueValue); 513 514 // Helper functions for getNextUChars(). 515 // getNextUChars() when pos is on a branch node. 516 static void getNextBranchUChars(const char16_t *pos, int32_t length, Appendable &out); 517 518 // UCharsTrie data structure 519 // 520 // The trie consists of a series of char16_t-serialized nodes for incremental 521 // Unicode string/char16_t sequence matching. (char16_t=16-bit unsigned integer) 522 // The root node is at the beginning of the trie data. 523 // 524 // Types of nodes are distinguished by their node lead unit ranges. 525 // After each node, except a final-value node, another node follows to 526 // encode match values or continue matching further units. 527 // 528 // Node types: 529 // - Final-value node: Stores a 32-bit integer in a compact, variable-length format. 530 // The value is for the string/char16_t sequence so far. 531 // - Match node, optionally with an intermediate value in a different compact format. 532 // The value, if present, is for the string/char16_t sequence so far. 533 // 534 // Aside from the value, which uses the node lead unit's high bits: 535 // 536 // - Linear-match node: Matches a number of units. 537 // - Branch node: Branches to other nodes according to the current input unit. 538 // The node unit is the length of the branch (number of units to select from) 539 // minus 1. It is followed by a sub-node: 540 // - If the length is at most kMaxBranchLinearSubNodeLength, then 541 // there are length-1 (key, value) pairs and then one more comparison unit. 542 // If one of the key units matches, then the value is either a final value for 543 // the string so far, or a "jump" delta to the next node. 544 // If the last unit matches, then matching continues with the next node. 545 // (Values have the same encoding as final-value nodes.) 546 // - If the length is greater than kMaxBranchLinearSubNodeLength, then 547 // there is one unit and one "jump" delta. 548 // If the input unit is less than the sub-node unit, then "jump" by delta to 549 // the next sub-node which will have a length of length/2. 550 // (The delta has its own compact encoding.) 551 // Otherwise, skip the "jump" delta to the next sub-node 552 // which will have a length of length-length/2. 553 554 // Match-node lead unit values, after masking off intermediate-value bits: 555 556 // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise 557 // the length is one more than the next unit. 558 559 // For a branch sub-node with at most this many entries, we drop down 560 // to a linear search. 561 static const int32_t kMaxBranchLinearSubNodeLength=5; 562 563 // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node. 564 static const int32_t kMinLinearMatch=0x30; 565 static const int32_t kMaxLinearMatchLength=0x10; 566 567 // Match-node lead unit bits 14..6 for the optional intermediate value. 568 // If these bits are 0, then there is no intermediate value. 569 // Otherwise, see the *NodeValue* constants below. 570 static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x0040 571 static const int32_t kNodeTypeMask=kMinValueLead-1; // 0x003f 572 573 // A final-value node has bit 15 set. 574 static const int32_t kValueIsFinal=0x8000; 575 576 // Compact value: After testing and masking off bit 15, use the following thresholds. 577 static const int32_t kMaxOneUnitValue=0x3fff; 578 579 static const int32_t kMinTwoUnitValueLead=kMaxOneUnitValue+1; // 0x4000 580 static const int32_t kThreeUnitValueLead=0x7fff; 581 582 static const int32_t kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1; // 0x3ffeffff 583 584 // Compact intermediate-value integer, lead unit shared with a branch or linear-match node. 585 static const int32_t kMaxOneUnitNodeValue=0xff; 586 static const int32_t kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6); // 0x4040 587 static const int32_t kThreeUnitNodeValueLead=0x7fc0; 588 589 static const int32_t kMaxTwoUnitNodeValue= 590 ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1; // 0xfdffff 591 592 // Compact delta integers. 593 static const int32_t kMaxOneUnitDelta=0xfbff; 594 static const int32_t kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1; // 0xfc00 595 static const int32_t kThreeUnitDeltaLead=0xffff; 596 597 static const int32_t kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff 598 599 // For getState64(): 600 // The remainingMatchLength_ is -1..14=(kMaxLinearMatchLength=0x10)-2 601 // so we need at least 5 bits for that. 602 // We add 2 to store it as a positive value 1..16=kMaxLinearMatchLength. 603 static constexpr int32_t kState64RemainingShift = 59; 604 static constexpr uint64_t kState64PosMask = (UINT64_C(1) << kState64RemainingShift) - 1; 605 606 char16_t *ownedArray_; 607 608 // Fixed value referencing the UCharsTrie words. 609 const char16_t *uchars_; 610 611 // Iterator variables. 612 613 // Pointer to next trie unit to read. NULL if no more matches. 614 const char16_t *pos_; 615 // Remaining length of a linear-match node, minus 1. Negative if not in such a node. 616 int32_t remainingMatchLength_; 617 }; 618 619 U_NAMESPACE_END 620 621 #endif /* U_SHOW_CPLUSPLUS_API */ 622 623 #endif // __UCHARSTRIE_H__ 624