1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2011-2014, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * created on: 2011jan06 9 * created by: Markus W. Scherer 10 * ported from ICU4C ucharstrie.h/.cpp 11 */ 12 13 package com.ibm.icu.util; 14 15 import java.io.IOException; 16 import java.util.ArrayList; 17 import java.util.NoSuchElementException; 18 19 import com.ibm.icu.text.UTF16; 20 import com.ibm.icu.util.BytesTrie.Result; 21 22 /** 23 * Light-weight, non-const reader class for a CharsTrie. 24 * Traverses a char-serialized data structure with minimal state, 25 * for mapping strings (16-bit-unit sequences) to non-negative integer values. 26 * 27 * <p>This class is not intended for public subclassing. 28 * 29 * @stable ICU 4.8 30 * @author Markus W. Scherer 31 */ 32 public final class CharsTrie implements Cloneable, Iterable<CharsTrie.Entry> { 33 /** 34 * Constructs a CharsTrie reader instance. 35 * 36 * <p>The CharSequence must contain a copy of a char sequence from the CharsTrieBuilder, 37 * with the offset indicating the first char of that sequence. 38 * The CharsTrie object will not read more chars than 39 * the CharsTrieBuilder generated in the corresponding build() call. 40 * 41 * <p>The CharSequence is not copied/cloned and must not be modified while 42 * the CharsTrie object is in use. 43 * 44 * @param trieChars CharSequence that contains the serialized trie. 45 * @param offset Root offset of the trie in the CharSequence. 46 * @stable ICU 4.8 47 */ CharsTrie(CharSequence trieChars, int offset)48 public CharsTrie(CharSequence trieChars, int offset) { 49 chars_=trieChars; 50 pos_=root_=offset; 51 remainingMatchLength_=-1; 52 } 53 54 /** 55 * Clones this trie reader object and its state, 56 * but not the char array which will be shared. 57 * @return A shallow clone of this trie. 58 * @stable ICU 4.8 59 */ 60 @Override clone()61 public Object clone() throws CloneNotSupportedException { 62 return super.clone(); // A shallow copy is just what we need. 63 } 64 65 /** 66 * Resets this trie to its initial state. 67 * @return this 68 * @stable ICU 4.8 69 */ reset()70 public CharsTrie reset() { 71 pos_=root_; 72 remainingMatchLength_=-1; 73 return this; 74 } 75 76 /** 77 * CharsTrie state object, for saving a trie's current state 78 * and resetting the trie back to this state later. 79 * @stable ICU 4.8 80 */ 81 public static final class State { 82 /** 83 * Constructs an empty State. 84 * @stable ICU 4.8 85 */ State()86 public State() {} 87 private CharSequence chars; 88 private int root; 89 private int pos; 90 private int remainingMatchLength; 91 } 92 93 /** 94 * Saves the state of this trie. 95 * @param state The State object to hold the trie's state. 96 * @return this 97 * @see #resetToState 98 * @stable ICU 4.8 99 */ saveState(State state)100 public CharsTrie saveState(State state) /*const*/ { 101 state.chars=chars_; 102 state.root=root_; 103 state.pos=pos_; 104 state.remainingMatchLength=remainingMatchLength_; 105 return this; 106 } 107 108 /** 109 * Resets this trie to the saved state. 110 * @param state The State object which holds a saved trie state. 111 * @return this 112 * @throws IllegalArgumentException if the state object contains no state, 113 * or the state of a different trie 114 * @see #saveState 115 * @see #reset 116 * @stable ICU 4.8 117 */ resetToState(State state)118 public CharsTrie resetToState(State state) { 119 if(chars_==state.chars && chars_!=null && root_==state.root) { 120 pos_=state.pos; 121 remainingMatchLength_=state.remainingMatchLength; 122 } else { 123 throw new IllegalArgumentException("incompatible trie state"); 124 } 125 return this; 126 } 127 128 /** 129 * Determines whether the string so far matches, whether it has a value, 130 * and whether another input char can continue a matching string. 131 * @return The match/value Result. 132 * @stable ICU 4.8 133 */ current()134 public Result current() /*const*/ { 135 int pos=pos_; 136 if(pos<0) { 137 return Result.NO_MATCH; 138 } else { 139 int node; 140 return (remainingMatchLength_<0 && (node=chars_.charAt(pos))>=kMinValueLead) ? 141 valueResults_[node>>15] : Result.NO_VALUE; 142 } 143 } 144 145 /** 146 * Traverses the trie from the initial state for this input char. 147 * Equivalent to reset().next(inUnit). 148 * @param inUnit Input char value. Values below 0 and above 0xffff will never match. 149 * @return The match/value Result. 150 * @stable ICU 4.8 151 */ first(int inUnit)152 public Result first(int inUnit) { 153 remainingMatchLength_=-1; 154 return nextImpl(root_, inUnit); 155 } 156 157 /** 158 * Traverses the trie from the initial state for the 159 * one or two UTF-16 code units for this input code point. 160 * Equivalent to reset().nextForCodePoint(cp). 161 * @param cp A Unicode code point 0..0x10ffff. 162 * @return The match/value Result. 163 * @stable ICU 4.8 164 */ firstForCodePoint(int cp)165 public Result firstForCodePoint(int cp) { 166 return cp<=0xffff ? 167 first(cp) : 168 (first(UTF16.getLeadSurrogate(cp)).hasNext() ? 169 next(UTF16.getTrailSurrogate(cp)) : 170 Result.NO_MATCH); 171 } 172 173 /** 174 * Traverses the trie from the current state for this input char. 175 * @param inUnit Input char value. Values below 0 and above 0xffff will never match. 176 * @return The match/value Result. 177 * @stable ICU 4.8 178 */ next(int inUnit)179 public Result next(int inUnit) { 180 int pos=pos_; 181 if(pos<0) { 182 return Result.NO_MATCH; 183 } 184 int length=remainingMatchLength_; // Actual remaining match length minus 1. 185 if(length>=0) { 186 // Remaining part of a linear-match node. 187 if(inUnit==chars_.charAt(pos++)) { 188 remainingMatchLength_=--length; 189 pos_=pos; 190 int node; 191 return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ? 192 valueResults_[node>>15] : Result.NO_VALUE; 193 } else { 194 stop(); 195 return Result.NO_MATCH; 196 } 197 } 198 return nextImpl(pos, inUnit); 199 } 200 201 /** 202 * Traverses the trie from the current state for the 203 * one or two UTF-16 code units for this input code point. 204 * @param cp A Unicode code point 0..0x10ffff. 205 * @return The match/value Result. 206 * @stable ICU 4.8 207 */ nextForCodePoint(int cp)208 public Result nextForCodePoint(int cp) { 209 return cp<=0xffff ? 210 next(cp) : 211 (next(UTF16.getLeadSurrogate(cp)).hasNext() ? 212 next(UTF16.getTrailSurrogate(cp)) : 213 Result.NO_MATCH); 214 } 215 216 /** 217 * Traverses the trie from the current state for this string. 218 * Equivalent to 219 * <pre> 220 * Result result=current(); 221 * for(each c in s) 222 * if(!result.hasNext()) return Result.NO_MATCH; 223 * result=next(c); 224 * return result; 225 * </pre> 226 * @param s Contains a string. 227 * @param sIndex The start index of the string in s. 228 * @param sLimit The (exclusive) end index of the string in s. 229 * @return The match/value Result. 230 * @stable ICU 4.8 231 */ next(CharSequence s, int sIndex, int sLimit)232 public Result next(CharSequence s, int sIndex, int sLimit) { 233 if(sIndex>=sLimit) { 234 // Empty input. 235 return current(); 236 } 237 int pos=pos_; 238 if(pos<0) { 239 return Result.NO_MATCH; 240 } 241 int length=remainingMatchLength_; // Actual remaining match length minus 1. 242 for(;;) { 243 // Fetch the next input unit, if there is one. 244 // Continue a linear-match node. 245 char inUnit; 246 for(;;) { 247 if(sIndex==sLimit) { 248 remainingMatchLength_=length; 249 pos_=pos; 250 int node; 251 return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ? 252 valueResults_[node>>15] : Result.NO_VALUE; 253 } 254 inUnit=s.charAt(sIndex++); 255 if(length<0) { 256 remainingMatchLength_=length; 257 break; 258 } 259 if(inUnit!=chars_.charAt(pos)) { 260 stop(); 261 return Result.NO_MATCH; 262 } 263 ++pos; 264 --length; 265 } 266 int node=chars_.charAt(pos++); 267 for(;;) { 268 if(node<kMinLinearMatch) { 269 Result result=branchNext(pos, node, inUnit); 270 if(result==Result.NO_MATCH) { 271 return Result.NO_MATCH; 272 } 273 // Fetch the next input unit, if there is one. 274 if(sIndex==sLimit) { 275 return result; 276 } 277 if(result==Result.FINAL_VALUE) { 278 // No further matching units. 279 stop(); 280 return Result.NO_MATCH; 281 } 282 inUnit=s.charAt(sIndex++); 283 pos=pos_; // branchNext() advanced pos and wrote it to pos_ . 284 node=chars_.charAt(pos++); 285 } else if(node<kMinValueLead) { 286 // Match length+1 units. 287 length=node-kMinLinearMatch; // Actual match length minus 1. 288 if(inUnit!=chars_.charAt(pos)) { 289 stop(); 290 return Result.NO_MATCH; 291 } 292 ++pos; 293 --length; 294 break; 295 } else if((node&kValueIsFinal)!=0) { 296 // No further matching units. 297 stop(); 298 return Result.NO_MATCH; 299 } else { 300 // Skip intermediate value. 301 pos=skipNodeValue(pos, node); 302 node&=kNodeTypeMask; 303 } 304 } 305 } 306 } 307 308 /** 309 * Returns a matching string's value if called immediately after 310 * current()/first()/next() returned Result.INTERMEDIATE_VALUE or Result.FINAL_VALUE. 311 * getValue() can be called multiple times. 312 * 313 * Do not call getValue() after Result.NO_MATCH or Result.NO_VALUE! 314 * @return The value for the string so far. 315 * @stable ICU 4.8 316 */ getValue()317 public int getValue() /*const*/ { 318 int pos=pos_; 319 int leadUnit=chars_.charAt(pos++); 320 assert(leadUnit>=kMinValueLead); 321 return (leadUnit&kValueIsFinal)!=0 ? 322 readValue(chars_, pos, leadUnit&0x7fff) : readNodeValue(chars_, pos, leadUnit); 323 } 324 325 /** 326 * Determines whether all strings reachable from the current state 327 * map to the same value, and if so, returns that value. 328 * @return The unique value in bits 32..1 with bit 0 set, 329 * if all strings reachable from the current state 330 * map to the same value; otherwise returns 0. 331 * @stable ICU 4.8 332 */ getUniqueValue()333 public long getUniqueValue() /*const*/ { 334 int pos=pos_; 335 if(pos<0) { 336 return 0; 337 } 338 // Skip the rest of a pending linear-match node. 339 long uniqueValue=findUniqueValue(chars_, pos+remainingMatchLength_+1, 0); 340 // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32. 341 return (uniqueValue<<31)>>31; 342 } 343 344 /** 345 * Finds each char which continues the string from the current state. 346 * That is, each char c for which it would be next(c)!=Result.NO_MATCH now. 347 * @param out Each next char is appended to this object. 348 * (Only uses the out.append(c) method.) 349 * @return The number of chars which continue the string from here. 350 * @stable ICU 4.8 351 */ getNextChars(Appendable out)352 public int getNextChars(Appendable out) /*const*/ { 353 int pos=pos_; 354 if(pos<0) { 355 return 0; 356 } 357 if(remainingMatchLength_>=0) { 358 append(out, chars_.charAt(pos)); // Next unit of a pending linear-match node. 359 return 1; 360 } 361 int node=chars_.charAt(pos++); 362 if(node>=kMinValueLead) { 363 if((node&kValueIsFinal)!=0) { 364 return 0; 365 } else { 366 pos=skipNodeValue(pos, node); 367 node&=kNodeTypeMask; 368 } 369 } 370 if(node<kMinLinearMatch) { 371 if(node==0) { 372 node=chars_.charAt(pos++); 373 } 374 getNextBranchChars(chars_, pos, ++node, out); 375 return node; 376 } else { 377 // First unit of the linear-match node. 378 append(out, chars_.charAt(pos)); 379 return 1; 380 } 381 } 382 383 /** 384 * Iterates from the current state of this trie. 385 * @return A new CharsTrie.Iterator. 386 * @stable ICU 4.8 387 */ 388 @Override iterator()389 public Iterator iterator() { 390 return new Iterator(chars_, pos_, remainingMatchLength_, 0); 391 } 392 393 /** 394 * Iterates from the current state of this trie. 395 * @param maxStringLength If 0, the iterator returns full strings. 396 * Otherwise, the iterator returns strings with this maximum length. 397 * @return A new CharsTrie.Iterator. 398 * @stable ICU 4.8 399 */ iterator(int maxStringLength)400 public Iterator iterator(int maxStringLength) { 401 return new Iterator(chars_, pos_, remainingMatchLength_, maxStringLength); 402 } 403 404 /** 405 * Iterates from the root of a char-serialized BytesTrie. 406 * @param trieChars CharSequence that contains the serialized trie. 407 * @param offset Root offset of the trie in the CharSequence. 408 * @param maxStringLength If 0, the iterator returns full strings. 409 * Otherwise, the iterator returns strings with this maximum length. 410 * @return A new CharsTrie.Iterator. 411 * @stable ICU 4.8 412 */ iterator(CharSequence trieChars, int offset, int maxStringLength)413 public static Iterator iterator(CharSequence trieChars, int offset, int maxStringLength) { 414 return new Iterator(trieChars, offset, -1, maxStringLength); 415 } 416 417 /** 418 * Return value type for the Iterator. 419 * @stable ICU 4.8 420 */ 421 public static final class Entry { 422 /** 423 * The string. 424 * @stable ICU 4.8 425 */ 426 public CharSequence chars; 427 /** 428 * The value associated with the string. 429 * @stable ICU 4.8 430 */ 431 public int value; 432 Entry()433 private Entry() { 434 } 435 } 436 437 /** 438 * Iterator for all of the (string, value) pairs in a CharsTrie. 439 * @stable ICU 4.8 440 */ 441 public static final class Iterator implements java.util.Iterator<Entry> { Iterator(CharSequence trieChars, int offset, int remainingMatchLength, int maxStringLength)442 private Iterator(CharSequence trieChars, int offset, int remainingMatchLength, int maxStringLength) { 443 chars_=trieChars; 444 pos_=initialPos_=offset; 445 remainingMatchLength_=initialRemainingMatchLength_=remainingMatchLength; 446 maxLength_=maxStringLength; 447 int length=remainingMatchLength_; // Actual remaining match length minus 1. 448 if(length>=0) { 449 // Pending linear-match node, append remaining bytes to str_. 450 ++length; 451 if(maxLength_>0 && length>maxLength_) { 452 length=maxLength_; // This will leave remainingMatchLength>=0 as a signal. 453 } 454 str_.append(chars_, pos_, pos_+length); 455 pos_+=length; 456 remainingMatchLength_-=length; 457 } 458 } 459 460 /** 461 * Resets this iterator to its initial state. 462 * @return this 463 * @stable ICU 4.8 464 */ reset()465 public Iterator reset() { 466 pos_=initialPos_; 467 remainingMatchLength_=initialRemainingMatchLength_; 468 skipValue_=false; 469 int length=remainingMatchLength_+1; // Remaining match length. 470 if(maxLength_>0 && length>maxLength_) { 471 length=maxLength_; 472 } 473 str_.setLength(length); 474 pos_+=length; 475 remainingMatchLength_-=length; 476 stack_.clear(); 477 return this; 478 } 479 480 /** 481 * @return true if there are more elements. 482 * @stable ICU 4.8 483 */ 484 @Override hasNext()485 public boolean hasNext() /*const*/ { return pos_>=0 || !stack_.isEmpty(); } 486 487 /** 488 * Finds the next (string, value) pair if there is one. 489 * 490 * If the string is truncated to the maximum length and does not 491 * have a real value, then the value is set to -1. 492 * In this case, this "not a real value" is indistinguishable from 493 * a real value of -1. 494 * @return An Entry with the string and value of the next element. 495 * @throws NoSuchElementException - iteration has no more elements. 496 * @stable ICU 4.8 497 */ 498 @Override next()499 public Entry next() { 500 int pos=pos_; 501 if(pos<0) { 502 if(stack_.isEmpty()) { 503 throw new NoSuchElementException(); 504 } 505 // Pop the state off the stack and continue with the next outbound edge of 506 // the branch node. 507 long top=stack_.remove(stack_.size()-1); 508 int length=(int)top; 509 pos=(int)(top>>32); 510 str_.setLength(length&0xffff); 511 length>>>=16; 512 if(length>1) { 513 pos=branchNext(pos, length); 514 if(pos<0) { 515 return entry_; // Reached a final value. 516 } 517 } else { 518 str_.append(chars_.charAt(pos++)); 519 } 520 } 521 if(remainingMatchLength_>=0) { 522 // We only get here if we started in a pending linear-match node 523 // with more than maxLength remaining units. 524 return truncateAndStop(); 525 } 526 for(;;) { 527 int node=chars_.charAt(pos++); 528 if(node>=kMinValueLead) { 529 if(skipValue_) { 530 pos=skipNodeValue(pos, node); 531 node&=kNodeTypeMask; 532 skipValue_=false; 533 } else { 534 // Deliver value for the string so far. 535 boolean isFinal=(node&kValueIsFinal)!=0; 536 if(isFinal) { 537 entry_.value=readValue(chars_, pos, node&0x7fff); 538 } else { 539 entry_.value=readNodeValue(chars_, pos, node); 540 } 541 if(isFinal || (maxLength_>0 && str_.length()==maxLength_)) { 542 pos_=-1; 543 } else { 544 // We cannot skip the value right here because it shares its 545 // lead unit with a match node which we have to evaluate 546 // next time. 547 // Instead, keep pos_ on the node lead unit itself. 548 pos_=pos-1; 549 skipValue_=true; 550 } 551 entry_.chars=str_; 552 return entry_; 553 } 554 } 555 if(maxLength_>0 && str_.length()==maxLength_) { 556 return truncateAndStop(); 557 } 558 if(node<kMinLinearMatch) { 559 if(node==0) { 560 node=chars_.charAt(pos++); 561 } 562 pos=branchNext(pos, node+1); 563 if(pos<0) { 564 return entry_; // Reached a final value. 565 } 566 } else { 567 // Linear-match node, append length units to str_. 568 int length=node-kMinLinearMatch+1; 569 if(maxLength_>0 && str_.length()+length>maxLength_) { 570 str_.append(chars_, pos, pos+maxLength_-str_.length()); 571 return truncateAndStop(); 572 } 573 str_.append(chars_, pos, pos+length); 574 pos+=length; 575 } 576 } 577 } 578 579 /** 580 * Iterator.remove() is not supported. 581 * @throws UnsupportedOperationException (always) 582 * @stable ICU 4.8 583 */ 584 @Override remove()585 public void remove() { 586 throw new UnsupportedOperationException(); 587 } 588 truncateAndStop()589 private Entry truncateAndStop() { 590 pos_=-1; 591 // We reset entry_.chars every time we return entry_ 592 // just because the caller might have modified the Entry. 593 entry_.chars=str_; 594 entry_.value=-1; // no real value for str 595 return entry_; 596 } 597 branchNext(int pos, int length)598 private int branchNext(int pos, int length) { 599 while(length>kMaxBranchLinearSubNodeLength) { 600 ++pos; // ignore the comparison unit 601 // Push state for the greater-or-equal edge. 602 stack_.add(((long)skipDelta(chars_, pos)<<32)|((length-(length>>1))<<16)|str_.length()); 603 // Follow the less-than edge. 604 length>>=1; 605 pos=jumpByDelta(chars_, pos); 606 } 607 // List of key-value pairs where values are either final values or jump deltas. 608 // Read the first (key, value) pair. 609 char trieUnit=chars_.charAt(pos++); 610 int node=chars_.charAt(pos++); 611 boolean isFinal=(node&kValueIsFinal)!=0; 612 int value=readValue(chars_, pos, node&=0x7fff); 613 pos=skipValue(pos, node); 614 stack_.add(((long)pos<<32)|((length-1)<<16)|str_.length()); 615 str_.append(trieUnit); 616 if(isFinal) { 617 pos_=-1; 618 entry_.chars=str_; 619 entry_.value=value; 620 return -1; 621 } else { 622 return pos+value; 623 } 624 } 625 626 private CharSequence chars_; 627 private int pos_; 628 private int initialPos_; 629 private int remainingMatchLength_; 630 private int initialRemainingMatchLength_; 631 private boolean skipValue_; // Skip intermediate value which was already delivered. 632 633 private StringBuilder str_=new StringBuilder(); 634 private int maxLength_; 635 private Entry entry_=new Entry(); 636 637 // The stack stores longs for backtracking to another 638 // outbound edge of a branch node. 639 // Each long has the offset in chars_ in bits 62..32, 640 // the str_.length() from before the node in bits 15..0, 641 // and the remaining branch length in bits 31..16. 642 // (We could store the remaining branch length minus 1 in bits 30..16 and not use bit 31, 643 // but the code looks more confusing that way.) 644 private ArrayList<Long> stack_=new ArrayList<Long>(); 645 } 646 stop()647 private void stop() { 648 pos_=-1; 649 } 650 651 // Reads a compact 32-bit integer. 652 // pos is already after the leadUnit, and the lead unit has bit 15 reset. readValue(CharSequence chars, int pos, int leadUnit)653 private static int readValue(CharSequence chars, int pos, int leadUnit) { 654 int value; 655 if(leadUnit<kMinTwoUnitValueLead) { 656 value=leadUnit; 657 } else if(leadUnit<kThreeUnitValueLead) { 658 value=((leadUnit-kMinTwoUnitValueLead)<<16)|chars.charAt(pos); 659 } else { 660 value=(chars.charAt(pos)<<16)|chars.charAt(pos+1); 661 } 662 return value; 663 } skipValue(int pos, int leadUnit)664 private static int skipValue(int pos, int leadUnit) { 665 if(leadUnit>=kMinTwoUnitValueLead) { 666 if(leadUnit<kThreeUnitValueLead) { 667 ++pos; 668 } else { 669 pos+=2; 670 } 671 } 672 return pos; 673 } skipValue(CharSequence chars, int pos)674 private static int skipValue(CharSequence chars, int pos) { 675 int leadUnit=chars.charAt(pos++); 676 return skipValue(pos, leadUnit&0x7fff); 677 } 678 readNodeValue(CharSequence chars, int pos, int leadUnit)679 private static int readNodeValue(CharSequence chars, int pos, int leadUnit) { 680 assert(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal); 681 int value; 682 if(leadUnit<kMinTwoUnitNodeValueLead) { 683 value=(leadUnit>>6)-1; 684 } else if(leadUnit<kThreeUnitNodeValueLead) { 685 value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|chars.charAt(pos); 686 } else { 687 value=(chars.charAt(pos)<<16)|chars.charAt(pos+1); 688 } 689 return value; 690 } skipNodeValue(int pos, int leadUnit)691 private static int skipNodeValue(int pos, int leadUnit) { 692 assert(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal); 693 if(leadUnit>=kMinTwoUnitNodeValueLead) { 694 if(leadUnit<kThreeUnitNodeValueLead) { 695 ++pos; 696 } else { 697 pos+=2; 698 } 699 } 700 return pos; 701 } 702 jumpByDelta(CharSequence chars, int pos)703 private static int jumpByDelta(CharSequence chars, int pos) { 704 int delta=chars.charAt(pos++); 705 if(delta>=kMinTwoUnitDeltaLead) { 706 if(delta==kThreeUnitDeltaLead) { 707 delta=(chars.charAt(pos)<<16)|chars.charAt(pos+1); 708 pos+=2; 709 } else { 710 delta=((delta-kMinTwoUnitDeltaLead)<<16)|chars.charAt(pos++); 711 } 712 } 713 return pos+delta; 714 } 715 skipDelta(CharSequence chars, int pos)716 private static int skipDelta(CharSequence chars, int pos) { 717 int delta=chars.charAt(pos++); 718 if(delta>=kMinTwoUnitDeltaLead) { 719 if(delta==kThreeUnitDeltaLead) { 720 pos+=2; 721 } else { 722 ++pos; 723 } 724 } 725 return pos; 726 } 727 728 private static Result[] valueResults_={ Result.INTERMEDIATE_VALUE, Result.FINAL_VALUE }; 729 730 // Handles a branch node for both next(unit) and next(string). branchNext(int pos, int length, int inUnit)731 private Result branchNext(int pos, int length, int inUnit) { 732 // Branch according to the current unit. 733 if(length==0) { 734 length=chars_.charAt(pos++); 735 } 736 ++length; 737 // The length of the branch is the number of units to select from. 738 // The data structure encodes a binary search. 739 while(length>kMaxBranchLinearSubNodeLength) { 740 if(inUnit<chars_.charAt(pos++)) { 741 length>>=1; 742 pos=jumpByDelta(chars_, pos); 743 } else { 744 length=length-(length>>1); 745 pos=skipDelta(chars_, pos); 746 } 747 } 748 // Drop down to linear search for the last few units. 749 // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3 750 // and divides length by 2. 751 do { 752 if(inUnit==chars_.charAt(pos++)) { 753 Result result; 754 int node=chars_.charAt(pos); 755 if((node&kValueIsFinal)!=0) { 756 // Leave the final value for getValue() to read. 757 result=Result.FINAL_VALUE; 758 } else { 759 // Use the non-final value as the jump delta. 760 ++pos; 761 // int delta=readValue(pos, node); 762 int delta; 763 if(node<kMinTwoUnitValueLead) { 764 delta=node; 765 } else if(node<kThreeUnitValueLead) { 766 delta=((node-kMinTwoUnitValueLead)<<16)|chars_.charAt(pos++); 767 } else { 768 delta=(chars_.charAt(pos)<<16)|chars_.charAt(pos+1); 769 pos+=2; 770 } 771 // end readValue() 772 pos+=delta; 773 node=chars_.charAt(pos); 774 result= node>=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE; 775 } 776 pos_=pos; 777 return result; 778 } 779 --length; 780 pos=skipValue(chars_, pos); 781 } while(length>1); 782 if(inUnit==chars_.charAt(pos++)) { 783 pos_=pos; 784 int node=chars_.charAt(pos); 785 return node>=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE; 786 } else { 787 stop(); 788 return Result.NO_MATCH; 789 } 790 } 791 792 // Requires remainingLength_<0. nextImpl(int pos, int inUnit)793 private Result nextImpl(int pos, int inUnit) { 794 int node=chars_.charAt(pos++); 795 for(;;) { 796 if(node<kMinLinearMatch) { 797 return branchNext(pos, node, inUnit); 798 } else if(node<kMinValueLead) { 799 // Match the first of length+1 units. 800 int length=node-kMinLinearMatch; // Actual match length minus 1. 801 if(inUnit==chars_.charAt(pos++)) { 802 remainingMatchLength_=--length; 803 pos_=pos; 804 return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ? 805 valueResults_[node>>15] : Result.NO_VALUE; 806 } else { 807 // No match. 808 break; 809 } 810 } else if((node&kValueIsFinal)!=0) { 811 // No further matching units. 812 break; 813 } else { 814 // Skip intermediate value. 815 pos=skipNodeValue(pos, node); 816 node&=kNodeTypeMask; 817 } 818 } 819 stop(); 820 return Result.NO_MATCH; 821 } 822 823 // Helper functions for getUniqueValue(). 824 // Recursively finds a unique value (or whether there is not a unique one) 825 // from a branch. 826 // uniqueValue: On input, same as for getUniqueValue()/findUniqueValue(). 827 // On return, if not 0, then bits 63..33 contain the updated non-negative pos. findUniqueValueFromBranch(CharSequence chars, int pos, int length, long uniqueValue)828 private static long findUniqueValueFromBranch(CharSequence chars, int pos, int length, 829 long uniqueValue) { 830 while(length>kMaxBranchLinearSubNodeLength) { 831 ++pos; // ignore the comparison unit 832 uniqueValue=findUniqueValueFromBranch(chars, jumpByDelta(chars, pos), length>>1, uniqueValue); 833 if(uniqueValue==0) { 834 return 0; 835 } 836 length=length-(length>>1); 837 pos=skipDelta(chars, pos); 838 } 839 do { 840 ++pos; // ignore a comparison unit 841 // handle its value 842 int node=chars.charAt(pos++); 843 boolean isFinal=(node&kValueIsFinal)!=0; 844 node&=0x7fff; 845 int value=readValue(chars, pos, node); 846 pos=skipValue(pos, node); 847 if(isFinal) { 848 if(uniqueValue!=0) { 849 if(value!=(int)(uniqueValue>>1)) { 850 return 0; 851 } 852 } else { 853 uniqueValue=((long)value<<1)|1; 854 } 855 } else { 856 uniqueValue=findUniqueValue(chars, pos+value, uniqueValue); 857 if(uniqueValue==0) { 858 return 0; 859 } 860 } 861 } while(--length>1); 862 // ignore the last comparison byte 863 return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL); 864 } 865 // Recursively finds a unique value (or whether there is not a unique one) 866 // starting from a position on a node lead unit. 867 // uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set. 868 // Otherwise, uniqueValue is 0. Bits 63..33 are ignored. findUniqueValue(CharSequence chars, int pos, long uniqueValue)869 private static long findUniqueValue(CharSequence chars, int pos, long uniqueValue) { 870 int node=chars.charAt(pos++); 871 for(;;) { 872 if(node<kMinLinearMatch) { 873 if(node==0) { 874 node=chars.charAt(pos++); 875 } 876 uniqueValue=findUniqueValueFromBranch(chars, pos, node+1, uniqueValue); 877 if(uniqueValue==0) { 878 return 0; 879 } 880 pos=(int)(uniqueValue>>>33); 881 node=chars.charAt(pos++); 882 } else if(node<kMinValueLead) { 883 // linear-match node 884 pos+=node-kMinLinearMatch+1; // Ignore the match units. 885 node=chars.charAt(pos++); 886 } else { 887 boolean isFinal=(node&kValueIsFinal)!=0; 888 int value; 889 if(isFinal) { 890 value=readValue(chars, pos, node&0x7fff); 891 } else { 892 value=readNodeValue(chars, pos, node); 893 } 894 if(uniqueValue!=0) { 895 if(value!=(int)(uniqueValue>>1)) { 896 return 0; 897 } 898 } else { 899 uniqueValue=((long)value<<1)|1; 900 } 901 if(isFinal) { 902 return uniqueValue; 903 } 904 pos=skipNodeValue(pos, node); 905 node&=kNodeTypeMask; 906 } 907 } 908 } 909 910 // Helper functions for getNextChars(). 911 // getNextChars() when pos is on a branch node. getNextBranchChars(CharSequence chars, int pos, int length, Appendable out)912 private static void getNextBranchChars(CharSequence chars, int pos, int length, Appendable out) { 913 while(length>kMaxBranchLinearSubNodeLength) { 914 ++pos; // ignore the comparison unit 915 getNextBranchChars(chars, jumpByDelta(chars, pos), length>>1, out); 916 length=length-(length>>1); 917 pos=skipDelta(chars, pos); 918 } 919 do { 920 append(out, chars.charAt(pos++)); 921 pos=skipValue(chars, pos); 922 } while(--length>1); 923 append(out, chars.charAt(pos)); 924 } append(Appendable out, int c)925 private static void append(Appendable out, int c) { 926 try { 927 out.append((char)c); 928 } catch(IOException e) { 929 throw new ICUUncheckedIOException(e); 930 } 931 } 932 933 // CharsTrie data structure 934 // 935 // The trie consists of a series of char-serialized nodes for incremental 936 // Unicode string/char sequence matching. (char=16-bit unsigned integer) 937 // The root node is at the beginning of the trie data. 938 // 939 // Types of nodes are distinguished by their node lead unit ranges. 940 // After each node, except a final-value node, another node follows to 941 // encode match values or continue matching further units. 942 // 943 // Node types: 944 // - Final-value node: Stores a 32-bit integer in a compact, variable-length format. 945 // The value is for the string/char sequence so far. 946 // - Match node, optionally with an intermediate value in a different compact format. 947 // The value, if present, is for the string/char sequence so far. 948 // 949 // Aside from the value, which uses the node lead unit's high bits: 950 // 951 // - Linear-match node: Matches a number of units. 952 // - Branch node: Branches to other nodes according to the current input unit. 953 // The node unit is the length of the branch (number of units to select from) 954 // minus 1. It is followed by a sub-node: 955 // - If the length is at most kMaxBranchLinearSubNodeLength, then 956 // there are length-1 (key, value) pairs and then one more comparison unit. 957 // If one of the key units matches, then the value is either a final value for 958 // the string so far, or a "jump" delta to the next node. 959 // If the last unit matches, then matching continues with the next node. 960 // (Values have the same encoding as final-value nodes.) 961 // - If the length is greater than kMaxBranchLinearSubNodeLength, then 962 // there is one unit and one "jump" delta. 963 // If the input unit is less than the sub-node unit, then "jump" by delta to 964 // the next sub-node which will have a length of length/2. 965 // (The delta has its own compact encoding.) 966 // Otherwise, skip the "jump" delta to the next sub-node 967 // which will have a length of length-length/2. 968 969 // Match-node lead unit values, after masking off intermediate-value bits: 970 971 // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise 972 // the length is one more than the next unit. 973 974 // For a branch sub-node with at most this many entries, we drop down 975 // to a linear search. 976 /*package*/ static final int kMaxBranchLinearSubNodeLength=5; 977 978 // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node. 979 /*package*/ static final int kMinLinearMatch=0x30; 980 /*package*/ static final int kMaxLinearMatchLength=0x10; 981 982 // Match-node lead unit bits 14..6 for the optional intermediate value. 983 // If these bits are 0, then there is no intermediate value. 984 // Otherwise, see the *NodeValue* constants below. 985 /*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x0040 986 /*package*/ static final int kNodeTypeMask=kMinValueLead-1; // 0x003f 987 988 // A final-value node has bit 15 set. 989 /*package*/ static final int kValueIsFinal=0x8000; 990 991 // Compact value: After testing and masking off bit 15, use the following thresholds. 992 /*package*/ static final int kMaxOneUnitValue=0x3fff; 993 994 /*package*/ static final int kMinTwoUnitValueLead=kMaxOneUnitValue+1; // 0x4000 995 /*package*/ static final int kThreeUnitValueLead=0x7fff; 996 997 /*package*/ static final int kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1; // 0x3ffeffff 998 999 // Compact intermediate-value integer, lead unit shared with a branch or linear-match node. 1000 /*package*/ static final int kMaxOneUnitNodeValue=0xff; 1001 /*package*/ static final int kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6); // 0x4040 1002 /*package*/ static final int kThreeUnitNodeValueLead=0x7fc0; 1003 1004 /*package*/ static final int kMaxTwoUnitNodeValue= 1005 ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1; // 0xfdffff 1006 1007 // Compact delta integers. 1008 /*package*/ static final int kMaxOneUnitDelta=0xfbff; 1009 /*package*/ static final int kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1; // 0xfc00 1010 /*package*/ static final int kThreeUnitDeltaLead=0xffff; 1011 1012 /*package*/ static final int kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff 1013 1014 // Fixed value referencing the CharsTrie words. 1015 private CharSequence chars_; 1016 private int root_; 1017 1018 // Iterator variables. 1019 1020 // Pointer to next trie unit to read. NULL if no more matches. 1021 private int pos_; 1022 // Remaining length of a linear-match node, minus 1. Negative if not in such a node. 1023 private int remainingMatchLength_; 1024 } 1025