1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4 *******************************************************************************
5 *   Copyright (C) 2011-2014, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 *   created on: 2011jan06
9 *   created by: Markus W. Scherer
10 *   ported from ICU4C ucharstrie.h/.cpp
11 */
12 
13 package com.ibm.icu.util;
14 
15 import java.io.IOException;
16 import java.util.ArrayList;
17 import java.util.NoSuchElementException;
18 
19 import com.ibm.icu.text.UTF16;
20 import com.ibm.icu.util.BytesTrie.Result;
21 
22 /**
23  * Light-weight, non-const reader class for a CharsTrie.
24  * Traverses a char-serialized data structure with minimal state,
25  * for mapping strings (16-bit-unit sequences) to non-negative integer values.
26  *
27  * <p>This class is not intended for public subclassing.
28  *
29  * @stable ICU 4.8
30  * @author Markus W. Scherer
31  */
32 public final class CharsTrie implements Cloneable, Iterable<CharsTrie.Entry> {
33     /**
34      * Constructs a CharsTrie reader instance.
35      *
36      * <p>The CharSequence must contain a copy of a char sequence from the CharsTrieBuilder,
37      * with the offset indicating the first char of that sequence.
38      * The CharsTrie object will not read more chars than
39      * the CharsTrieBuilder generated in the corresponding build() call.
40      *
41      * <p>The CharSequence is not copied/cloned and must not be modified while
42      * the CharsTrie object is in use.
43      *
44      * @param trieChars CharSequence that contains the serialized trie.
45      * @param offset Root offset of the trie in the CharSequence.
46      * @stable ICU 4.8
47      */
CharsTrie(CharSequence trieChars, int offset)48     public CharsTrie(CharSequence trieChars, int offset) {
49         chars_=trieChars;
50         pos_=root_=offset;
51         remainingMatchLength_=-1;
52     }
53 
54     /**
55      * Clones this trie reader object and its state,
56      * but not the char array which will be shared.
57      * @return A shallow clone of this trie.
58      * @stable ICU 4.8
59      */
60     @Override
clone()61     public Object clone() throws CloneNotSupportedException {
62         return super.clone();  // A shallow copy is just what we need.
63     }
64 
65     /**
66      * Resets this trie to its initial state.
67      * @return this
68      * @stable ICU 4.8
69      */
reset()70     public CharsTrie reset() {
71         pos_=root_;
72         remainingMatchLength_=-1;
73         return this;
74     }
75 
76     /**
77      * CharsTrie state object, for saving a trie's current state
78      * and resetting the trie back to this state later.
79      * @stable ICU 4.8
80      */
81     public static final class State {
82         /**
83          * Constructs an empty State.
84          * @stable ICU 4.8
85          */
State()86         public State() {}
87         private CharSequence chars;
88         private int root;
89         private int pos;
90         private int remainingMatchLength;
91     }
92 
93     /**
94      * Saves the state of this trie.
95      * @param state The State object to hold the trie's state.
96      * @return this
97      * @see #resetToState
98      * @stable ICU 4.8
99      */
saveState(State state)100     public CharsTrie saveState(State state) /*const*/ {
101         state.chars=chars_;
102         state.root=root_;
103         state.pos=pos_;
104         state.remainingMatchLength=remainingMatchLength_;
105         return this;
106     }
107 
108     /**
109      * Resets this trie to the saved state.
110      * @param state The State object which holds a saved trie state.
111      * @return this
112      * @throws IllegalArgumentException if the state object contains no state,
113      *         or the state of a different trie
114      * @see #saveState
115      * @see #reset
116      * @stable ICU 4.8
117      */
resetToState(State state)118     public CharsTrie resetToState(State state) {
119         if(chars_==state.chars && chars_!=null && root_==state.root) {
120             pos_=state.pos;
121             remainingMatchLength_=state.remainingMatchLength;
122         } else {
123             throw new IllegalArgumentException("incompatible trie state");
124         }
125         return this;
126     }
127 
128     /**
129      * Determines whether the string so far matches, whether it has a value,
130      * and whether another input char can continue a matching string.
131      * @return The match/value Result.
132      * @stable ICU 4.8
133      */
current()134     public Result current() /*const*/ {
135         int pos=pos_;
136         if(pos<0) {
137             return Result.NO_MATCH;
138         } else {
139             int node;
140             return (remainingMatchLength_<0 && (node=chars_.charAt(pos))>=kMinValueLead) ?
141                     valueResults_[node>>15] : Result.NO_VALUE;
142         }
143     }
144 
145     /**
146      * Traverses the trie from the initial state for this input char.
147      * Equivalent to reset().next(inUnit).
148      * @param inUnit Input char value. Values below 0 and above 0xffff will never match.
149      * @return The match/value Result.
150      * @stable ICU 4.8
151      */
first(int inUnit)152     public Result first(int inUnit) {
153         remainingMatchLength_=-1;
154         return nextImpl(root_, inUnit);
155     }
156 
157     /**
158      * Traverses the trie from the initial state for the
159      * one or two UTF-16 code units for this input code point.
160      * Equivalent to reset().nextForCodePoint(cp).
161      * @param cp A Unicode code point 0..0x10ffff.
162      * @return The match/value Result.
163      * @stable ICU 4.8
164      */
firstForCodePoint(int cp)165     public Result firstForCodePoint(int cp) {
166         return cp<=0xffff ?
167             first(cp) :
168             (first(UTF16.getLeadSurrogate(cp)).hasNext() ?
169                 next(UTF16.getTrailSurrogate(cp)) :
170                 Result.NO_MATCH);
171     }
172 
173     /**
174      * Traverses the trie from the current state for this input char.
175      * @param inUnit Input char value. Values below 0 and above 0xffff will never match.
176      * @return The match/value Result.
177      * @stable ICU 4.8
178      */
next(int inUnit)179     public Result next(int inUnit) {
180         int pos=pos_;
181         if(pos<0) {
182             return Result.NO_MATCH;
183         }
184         int length=remainingMatchLength_;  // Actual remaining match length minus 1.
185         if(length>=0) {
186             // Remaining part of a linear-match node.
187             if(inUnit==chars_.charAt(pos++)) {
188                 remainingMatchLength_=--length;
189                 pos_=pos;
190                 int node;
191                 return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ?
192                         valueResults_[node>>15] : Result.NO_VALUE;
193             } else {
194                 stop();
195                 return Result.NO_MATCH;
196             }
197         }
198         return nextImpl(pos, inUnit);
199     }
200 
201     /**
202      * Traverses the trie from the current state for the
203      * one or two UTF-16 code units for this input code point.
204      * @param cp A Unicode code point 0..0x10ffff.
205      * @return The match/value Result.
206      * @stable ICU 4.8
207      */
nextForCodePoint(int cp)208     public Result nextForCodePoint(int cp) {
209         return cp<=0xffff ?
210             next(cp) :
211             (next(UTF16.getLeadSurrogate(cp)).hasNext() ?
212                 next(UTF16.getTrailSurrogate(cp)) :
213                 Result.NO_MATCH);
214     }
215 
216     /**
217      * Traverses the trie from the current state for this string.
218      * Equivalent to
219      * <pre>
220      * Result result=current();
221      * for(each c in s)
222      *   if(!result.hasNext()) return Result.NO_MATCH;
223      *   result=next(c);
224      * return result;
225      * </pre>
226      * @param s Contains a string.
227      * @param sIndex The start index of the string in s.
228      * @param sLimit The (exclusive) end index of the string in s.
229      * @return The match/value Result.
230      * @stable ICU 4.8
231      */
next(CharSequence s, int sIndex, int sLimit)232     public Result next(CharSequence s, int sIndex, int sLimit) {
233         if(sIndex>=sLimit) {
234             // Empty input.
235             return current();
236         }
237         int pos=pos_;
238         if(pos<0) {
239             return Result.NO_MATCH;
240         }
241         int length=remainingMatchLength_;  // Actual remaining match length minus 1.
242         for(;;) {
243             // Fetch the next input unit, if there is one.
244             // Continue a linear-match node.
245             char inUnit;
246             for(;;) {
247                 if(sIndex==sLimit) {
248                     remainingMatchLength_=length;
249                     pos_=pos;
250                     int node;
251                     return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ?
252                             valueResults_[node>>15] : Result.NO_VALUE;
253                 }
254                 inUnit=s.charAt(sIndex++);
255                 if(length<0) {
256                     remainingMatchLength_=length;
257                     break;
258                 }
259                 if(inUnit!=chars_.charAt(pos)) {
260                     stop();
261                     return Result.NO_MATCH;
262                 }
263                 ++pos;
264                 --length;
265             }
266             int node=chars_.charAt(pos++);
267             for(;;) {
268                 if(node<kMinLinearMatch) {
269                     Result result=branchNext(pos, node, inUnit);
270                     if(result==Result.NO_MATCH) {
271                         return Result.NO_MATCH;
272                     }
273                     // Fetch the next input unit, if there is one.
274                     if(sIndex==sLimit) {
275                         return result;
276                     }
277                     if(result==Result.FINAL_VALUE) {
278                         // No further matching units.
279                         stop();
280                         return Result.NO_MATCH;
281                     }
282                     inUnit=s.charAt(sIndex++);
283                     pos=pos_;  // branchNext() advanced pos and wrote it to pos_ .
284                     node=chars_.charAt(pos++);
285                 } else if(node<kMinValueLead) {
286                     // Match length+1 units.
287                     length=node-kMinLinearMatch;  // Actual match length minus 1.
288                     if(inUnit!=chars_.charAt(pos)) {
289                         stop();
290                         return Result.NO_MATCH;
291                     }
292                     ++pos;
293                     --length;
294                     break;
295                 } else if((node&kValueIsFinal)!=0) {
296                     // No further matching units.
297                     stop();
298                     return Result.NO_MATCH;
299                 } else {
300                     // Skip intermediate value.
301                     pos=skipNodeValue(pos, node);
302                     node&=kNodeTypeMask;
303                 }
304             }
305         }
306     }
307 
308     /**
309      * Returns a matching string's value if called immediately after
310      * current()/first()/next() returned Result.INTERMEDIATE_VALUE or Result.FINAL_VALUE.
311      * getValue() can be called multiple times.
312      *
313      * Do not call getValue() after Result.NO_MATCH or Result.NO_VALUE!
314      * @return The value for the string so far.
315      * @stable ICU 4.8
316      */
getValue()317     public int getValue() /*const*/ {
318         int pos=pos_;
319         int leadUnit=chars_.charAt(pos++);
320         assert(leadUnit>=kMinValueLead);
321         return (leadUnit&kValueIsFinal)!=0 ?
322             readValue(chars_, pos, leadUnit&0x7fff) : readNodeValue(chars_, pos, leadUnit);
323     }
324 
325     /**
326      * Determines whether all strings reachable from the current state
327      * map to the same value, and if so, returns that value.
328      * @return The unique value in bits 32..1 with bit 0 set,
329      *         if all strings reachable from the current state
330      *         map to the same value; otherwise returns 0.
331      * @stable ICU 4.8
332      */
getUniqueValue()333     public long getUniqueValue() /*const*/ {
334         int pos=pos_;
335         if(pos<0) {
336             return 0;
337         }
338         // Skip the rest of a pending linear-match node.
339         long uniqueValue=findUniqueValue(chars_, pos+remainingMatchLength_+1, 0);
340         // Ignore internally used bits 63..33; extend the actual value's sign bit from bit 32.
341         return (uniqueValue<<31)>>31;
342     }
343 
344     /**
345      * Finds each char which continues the string from the current state.
346      * That is, each char c for which it would be next(c)!=Result.NO_MATCH now.
347      * @param out Each next char is appended to this object.
348      *            (Only uses the out.append(c) method.)
349      * @return The number of chars which continue the string from here.
350      * @stable ICU 4.8
351      */
getNextChars(Appendable out)352     public int getNextChars(Appendable out) /*const*/ {
353         int pos=pos_;
354         if(pos<0) {
355             return 0;
356         }
357         if(remainingMatchLength_>=0) {
358             append(out, chars_.charAt(pos));  // Next unit of a pending linear-match node.
359             return 1;
360         }
361         int node=chars_.charAt(pos++);
362         if(node>=kMinValueLead) {
363             if((node&kValueIsFinal)!=0) {
364                 return 0;
365             } else {
366                 pos=skipNodeValue(pos, node);
367                 node&=kNodeTypeMask;
368             }
369         }
370         if(node<kMinLinearMatch) {
371             if(node==0) {
372                 node=chars_.charAt(pos++);
373             }
374             getNextBranchChars(chars_, pos, ++node, out);
375             return node;
376         } else {
377             // First unit of the linear-match node.
378             append(out, chars_.charAt(pos));
379             return 1;
380         }
381     }
382 
383     /**
384      * Iterates from the current state of this trie.
385      * @return A new CharsTrie.Iterator.
386      * @stable ICU 4.8
387      */
388     @Override
iterator()389     public Iterator iterator() {
390         return new Iterator(chars_, pos_, remainingMatchLength_, 0);
391     }
392 
393     /**
394      * Iterates from the current state of this trie.
395      * @param maxStringLength If 0, the iterator returns full strings.
396      *                        Otherwise, the iterator returns strings with this maximum length.
397      * @return A new CharsTrie.Iterator.
398      * @stable ICU 4.8
399      */
iterator(int maxStringLength)400     public Iterator iterator(int maxStringLength) {
401         return new Iterator(chars_, pos_, remainingMatchLength_, maxStringLength);
402     }
403 
404     /**
405      * Iterates from the root of a char-serialized BytesTrie.
406      * @param trieChars CharSequence that contains the serialized trie.
407      * @param offset Root offset of the trie in the CharSequence.
408      * @param maxStringLength If 0, the iterator returns full strings.
409      *                        Otherwise, the iterator returns strings with this maximum length.
410      * @return A new CharsTrie.Iterator.
411      * @stable ICU 4.8
412      */
iterator(CharSequence trieChars, int offset, int maxStringLength)413     public static Iterator iterator(CharSequence trieChars, int offset, int maxStringLength) {
414         return new Iterator(trieChars, offset, -1, maxStringLength);
415     }
416 
417     /**
418      * Return value type for the Iterator.
419      * @stable ICU 4.8
420      */
421     public static final class Entry {
422         /**
423          * The string.
424          * @stable ICU 4.8
425          */
426         public CharSequence chars;
427         /**
428          * The value associated with the string.
429          * @stable ICU 4.8
430          */
431         public int value;
432 
Entry()433         private Entry() {
434         }
435     }
436 
437     /**
438      * Iterator for all of the (string, value) pairs in a CharsTrie.
439      * @stable ICU 4.8
440      */
441     public static final class Iterator implements java.util.Iterator<Entry> {
Iterator(CharSequence trieChars, int offset, int remainingMatchLength, int maxStringLength)442         private Iterator(CharSequence trieChars, int offset, int remainingMatchLength, int maxStringLength) {
443             chars_=trieChars;
444             pos_=initialPos_=offset;
445             remainingMatchLength_=initialRemainingMatchLength_=remainingMatchLength;
446             maxLength_=maxStringLength;
447             int length=remainingMatchLength_;  // Actual remaining match length minus 1.
448             if(length>=0) {
449                 // Pending linear-match node, append remaining bytes to str_.
450                 ++length;
451                 if(maxLength_>0 && length>maxLength_) {
452                     length=maxLength_;  // This will leave remainingMatchLength>=0 as a signal.
453                 }
454                 str_.append(chars_, pos_, pos_+length);
455                 pos_+=length;
456                 remainingMatchLength_-=length;
457             }
458         }
459 
460         /**
461          * Resets this iterator to its initial state.
462          * @return this
463          * @stable ICU 4.8
464          */
reset()465         public Iterator reset() {
466             pos_=initialPos_;
467             remainingMatchLength_=initialRemainingMatchLength_;
468             skipValue_=false;
469             int length=remainingMatchLength_+1;  // Remaining match length.
470             if(maxLength_>0 && length>maxLength_) {
471                 length=maxLength_;
472             }
473             str_.setLength(length);
474             pos_+=length;
475             remainingMatchLength_-=length;
476             stack_.clear();
477             return this;
478         }
479 
480         /**
481          * @return true if there are more elements.
482          * @stable ICU 4.8
483          */
484         @Override
hasNext()485         public boolean hasNext() /*const*/ { return pos_>=0 || !stack_.isEmpty(); }
486 
487         /**
488          * Finds the next (string, value) pair if there is one.
489          *
490          * If the string is truncated to the maximum length and does not
491          * have a real value, then the value is set to -1.
492          * In this case, this "not a real value" is indistinguishable from
493          * a real value of -1.
494          * @return An Entry with the string and value of the next element.
495          * @throws NoSuchElementException - iteration has no more elements.
496          * @stable ICU 4.8
497          */
498         @Override
next()499         public Entry next() {
500             int pos=pos_;
501             if(pos<0) {
502                 if(stack_.isEmpty()) {
503                     throw new NoSuchElementException();
504                 }
505                 // Pop the state off the stack and continue with the next outbound edge of
506                 // the branch node.
507                 long top=stack_.remove(stack_.size()-1);
508                 int length=(int)top;
509                 pos=(int)(top>>32);
510                 str_.setLength(length&0xffff);
511                 length>>>=16;
512                 if(length>1) {
513                     pos=branchNext(pos, length);
514                     if(pos<0) {
515                         return entry_;  // Reached a final value.
516                     }
517                 } else {
518                     str_.append(chars_.charAt(pos++));
519                 }
520             }
521             if(remainingMatchLength_>=0) {
522                 // We only get here if we started in a pending linear-match node
523                 // with more than maxLength remaining units.
524                 return truncateAndStop();
525             }
526             for(;;) {
527                 int node=chars_.charAt(pos++);
528                 if(node>=kMinValueLead) {
529                     if(skipValue_) {
530                         pos=skipNodeValue(pos, node);
531                         node&=kNodeTypeMask;
532                         skipValue_=false;
533                     } else {
534                         // Deliver value for the string so far.
535                         boolean isFinal=(node&kValueIsFinal)!=0;
536                         if(isFinal) {
537                             entry_.value=readValue(chars_, pos, node&0x7fff);
538                         } else {
539                             entry_.value=readNodeValue(chars_, pos, node);
540                         }
541                         if(isFinal || (maxLength_>0 && str_.length()==maxLength_)) {
542                             pos_=-1;
543                         } else {
544                             // We cannot skip the value right here because it shares its
545                             // lead unit with a match node which we have to evaluate
546                             // next time.
547                             // Instead, keep pos_ on the node lead unit itself.
548                             pos_=pos-1;
549                             skipValue_=true;
550                         }
551                         entry_.chars=str_;
552                         return entry_;
553                     }
554                 }
555                 if(maxLength_>0 && str_.length()==maxLength_) {
556                     return truncateAndStop();
557                 }
558                 if(node<kMinLinearMatch) {
559                     if(node==0) {
560                         node=chars_.charAt(pos++);
561                     }
562                     pos=branchNext(pos, node+1);
563                     if(pos<0) {
564                         return entry_;  // Reached a final value.
565                     }
566                 } else {
567                     // Linear-match node, append length units to str_.
568                     int length=node-kMinLinearMatch+1;
569                     if(maxLength_>0 && str_.length()+length>maxLength_) {
570                         str_.append(chars_, pos, pos+maxLength_-str_.length());
571                         return truncateAndStop();
572                     }
573                     str_.append(chars_, pos, pos+length);
574                     pos+=length;
575                 }
576             }
577         }
578 
579         /**
580          * Iterator.remove() is not supported.
581          * @throws UnsupportedOperationException (always)
582          * @stable ICU 4.8
583          */
584         @Override
remove()585         public void remove() {
586             throw new UnsupportedOperationException();
587         }
588 
truncateAndStop()589         private Entry truncateAndStop() {
590             pos_=-1;
591             // We reset entry_.chars every time we return entry_
592             // just because the caller might have modified the Entry.
593             entry_.chars=str_;
594             entry_.value=-1;  // no real value for str
595             return entry_;
596         }
597 
branchNext(int pos, int length)598         private int branchNext(int pos, int length) {
599             while(length>kMaxBranchLinearSubNodeLength) {
600                 ++pos;  // ignore the comparison unit
601                 // Push state for the greater-or-equal edge.
602                 stack_.add(((long)skipDelta(chars_, pos)<<32)|((length-(length>>1))<<16)|str_.length());
603                 // Follow the less-than edge.
604                 length>>=1;
605                 pos=jumpByDelta(chars_, pos);
606             }
607             // List of key-value pairs where values are either final values or jump deltas.
608             // Read the first (key, value) pair.
609             char trieUnit=chars_.charAt(pos++);
610             int node=chars_.charAt(pos++);
611             boolean isFinal=(node&kValueIsFinal)!=0;
612             int value=readValue(chars_, pos, node&=0x7fff);
613             pos=skipValue(pos, node);
614             stack_.add(((long)pos<<32)|((length-1)<<16)|str_.length());
615             str_.append(trieUnit);
616             if(isFinal) {
617                 pos_=-1;
618                 entry_.chars=str_;
619                 entry_.value=value;
620                 return -1;
621             } else {
622                 return pos+value;
623             }
624         }
625 
626         private CharSequence chars_;
627         private int pos_;
628         private int initialPos_;
629         private int remainingMatchLength_;
630         private int initialRemainingMatchLength_;
631         private boolean skipValue_;  // Skip intermediate value which was already delivered.
632 
633         private StringBuilder str_=new StringBuilder();
634         private int maxLength_;
635         private Entry entry_=new Entry();
636 
637         // The stack stores longs for backtracking to another
638         // outbound edge of a branch node.
639         // Each long has the offset in chars_ in bits 62..32,
640         // the str_.length() from before the node in bits 15..0,
641         // and the remaining branch length in bits 31..16.
642         // (We could store the remaining branch length minus 1 in bits 30..16 and not use bit 31,
643         // but the code looks more confusing that way.)
644         private ArrayList<Long> stack_=new ArrayList<Long>();
645     }
646 
stop()647     private void stop() {
648         pos_=-1;
649     }
650 
651     // Reads a compact 32-bit integer.
652     // pos is already after the leadUnit, and the lead unit has bit 15 reset.
readValue(CharSequence chars, int pos, int leadUnit)653     private static int readValue(CharSequence chars, int pos, int leadUnit) {
654         int value;
655         if(leadUnit<kMinTwoUnitValueLead) {
656             value=leadUnit;
657         } else if(leadUnit<kThreeUnitValueLead) {
658             value=((leadUnit-kMinTwoUnitValueLead)<<16)|chars.charAt(pos);
659         } else {
660             value=(chars.charAt(pos)<<16)|chars.charAt(pos+1);
661         }
662         return value;
663     }
skipValue(int pos, int leadUnit)664     private static int skipValue(int pos, int leadUnit) {
665         if(leadUnit>=kMinTwoUnitValueLead) {
666             if(leadUnit<kThreeUnitValueLead) {
667                 ++pos;
668             } else {
669                 pos+=2;
670             }
671         }
672         return pos;
673     }
skipValue(CharSequence chars, int pos)674     private static int skipValue(CharSequence chars, int pos) {
675         int leadUnit=chars.charAt(pos++);
676         return skipValue(pos, leadUnit&0x7fff);
677     }
678 
readNodeValue(CharSequence chars, int pos, int leadUnit)679     private static int readNodeValue(CharSequence chars, int pos, int leadUnit) {
680         assert(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
681         int value;
682         if(leadUnit<kMinTwoUnitNodeValueLead) {
683             value=(leadUnit>>6)-1;
684         } else if(leadUnit<kThreeUnitNodeValueLead) {
685             value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|chars.charAt(pos);
686         } else {
687             value=(chars.charAt(pos)<<16)|chars.charAt(pos+1);
688         }
689         return value;
690     }
skipNodeValue(int pos, int leadUnit)691     private static int skipNodeValue(int pos, int leadUnit) {
692         assert(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
693         if(leadUnit>=kMinTwoUnitNodeValueLead) {
694             if(leadUnit<kThreeUnitNodeValueLead) {
695                 ++pos;
696             } else {
697                 pos+=2;
698             }
699         }
700         return pos;
701     }
702 
jumpByDelta(CharSequence chars, int pos)703     private static int jumpByDelta(CharSequence chars, int pos) {
704         int delta=chars.charAt(pos++);
705         if(delta>=kMinTwoUnitDeltaLead) {
706             if(delta==kThreeUnitDeltaLead) {
707                 delta=(chars.charAt(pos)<<16)|chars.charAt(pos+1);
708                 pos+=2;
709             } else {
710                 delta=((delta-kMinTwoUnitDeltaLead)<<16)|chars.charAt(pos++);
711             }
712         }
713         return pos+delta;
714     }
715 
skipDelta(CharSequence chars, int pos)716     private static int skipDelta(CharSequence chars, int pos) {
717         int delta=chars.charAt(pos++);
718         if(delta>=kMinTwoUnitDeltaLead) {
719             if(delta==kThreeUnitDeltaLead) {
720                 pos+=2;
721             } else {
722                 ++pos;
723             }
724         }
725         return pos;
726     }
727 
728     private static Result[] valueResults_={ Result.INTERMEDIATE_VALUE, Result.FINAL_VALUE };
729 
730     // Handles a branch node for both next(unit) and next(string).
branchNext(int pos, int length, int inUnit)731     private Result branchNext(int pos, int length, int inUnit) {
732         // Branch according to the current unit.
733         if(length==0) {
734             length=chars_.charAt(pos++);
735         }
736         ++length;
737         // The length of the branch is the number of units to select from.
738         // The data structure encodes a binary search.
739         while(length>kMaxBranchLinearSubNodeLength) {
740             if(inUnit<chars_.charAt(pos++)) {
741                 length>>=1;
742                 pos=jumpByDelta(chars_, pos);
743             } else {
744                 length=length-(length>>1);
745                 pos=skipDelta(chars_, pos);
746             }
747         }
748         // Drop down to linear search for the last few units.
749         // length>=2 because the loop body above sees length>kMaxBranchLinearSubNodeLength>=3
750         // and divides length by 2.
751         do {
752             if(inUnit==chars_.charAt(pos++)) {
753                 Result result;
754                 int node=chars_.charAt(pos);
755                 if((node&kValueIsFinal)!=0) {
756                     // Leave the final value for getValue() to read.
757                     result=Result.FINAL_VALUE;
758                 } else {
759                     // Use the non-final value as the jump delta.
760                     ++pos;
761                     // int delta=readValue(pos, node);
762                     int delta;
763                     if(node<kMinTwoUnitValueLead) {
764                         delta=node;
765                     } else if(node<kThreeUnitValueLead) {
766                         delta=((node-kMinTwoUnitValueLead)<<16)|chars_.charAt(pos++);
767                     } else {
768                         delta=(chars_.charAt(pos)<<16)|chars_.charAt(pos+1);
769                         pos+=2;
770                     }
771                     // end readValue()
772                     pos+=delta;
773                     node=chars_.charAt(pos);
774                     result= node>=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE;
775                 }
776                 pos_=pos;
777                 return result;
778             }
779             --length;
780             pos=skipValue(chars_, pos);
781         } while(length>1);
782         if(inUnit==chars_.charAt(pos++)) {
783             pos_=pos;
784             int node=chars_.charAt(pos);
785             return node>=kMinValueLead ? valueResults_[node>>15] : Result.NO_VALUE;
786         } else {
787             stop();
788             return Result.NO_MATCH;
789         }
790     }
791 
792     // Requires remainingLength_<0.
nextImpl(int pos, int inUnit)793     private Result nextImpl(int pos, int inUnit) {
794         int node=chars_.charAt(pos++);
795         for(;;) {
796             if(node<kMinLinearMatch) {
797                 return branchNext(pos, node, inUnit);
798             } else if(node<kMinValueLead) {
799                 // Match the first of length+1 units.
800                 int length=node-kMinLinearMatch;  // Actual match length minus 1.
801                 if(inUnit==chars_.charAt(pos++)) {
802                     remainingMatchLength_=--length;
803                     pos_=pos;
804                     return (length<0 && (node=chars_.charAt(pos))>=kMinValueLead) ?
805                             valueResults_[node>>15] : Result.NO_VALUE;
806                 } else {
807                     // No match.
808                     break;
809                 }
810             } else if((node&kValueIsFinal)!=0) {
811                 // No further matching units.
812                 break;
813             } else {
814                 // Skip intermediate value.
815                 pos=skipNodeValue(pos, node);
816                 node&=kNodeTypeMask;
817             }
818         }
819         stop();
820         return Result.NO_MATCH;
821     }
822 
823     // Helper functions for getUniqueValue().
824     // Recursively finds a unique value (or whether there is not a unique one)
825     // from a branch.
826     // uniqueValue: On input, same as for getUniqueValue()/findUniqueValue().
827     // On return, if not 0, then bits 63..33 contain the updated non-negative pos.
findUniqueValueFromBranch(CharSequence chars, int pos, int length, long uniqueValue)828     private static long findUniqueValueFromBranch(CharSequence chars, int pos, int length,
829                                                   long uniqueValue) {
830         while(length>kMaxBranchLinearSubNodeLength) {
831             ++pos;  // ignore the comparison unit
832             uniqueValue=findUniqueValueFromBranch(chars, jumpByDelta(chars, pos), length>>1, uniqueValue);
833             if(uniqueValue==0) {
834                 return 0;
835             }
836             length=length-(length>>1);
837             pos=skipDelta(chars, pos);
838         }
839         do {
840             ++pos;  // ignore a comparison unit
841             // handle its value
842             int node=chars.charAt(pos++);
843             boolean isFinal=(node&kValueIsFinal)!=0;
844             node&=0x7fff;
845             int value=readValue(chars, pos, node);
846             pos=skipValue(pos, node);
847             if(isFinal) {
848                 if(uniqueValue!=0) {
849                     if(value!=(int)(uniqueValue>>1)) {
850                         return 0;
851                     }
852                 } else {
853                     uniqueValue=((long)value<<1)|1;
854                 }
855             } else {
856                 uniqueValue=findUniqueValue(chars, pos+value, uniqueValue);
857                 if(uniqueValue==0) {
858                     return 0;
859                 }
860             }
861         } while(--length>1);
862         // ignore the last comparison byte
863         return ((long)(pos+1)<<33)|(uniqueValue&0x1ffffffffL);
864     }
865     // Recursively finds a unique value (or whether there is not a unique one)
866     // starting from a position on a node lead unit.
867     // uniqueValue: If there is one, then bits 32..1 contain the value and bit 0 is set.
868     // Otherwise, uniqueValue is 0. Bits 63..33 are ignored.
findUniqueValue(CharSequence chars, int pos, long uniqueValue)869     private static long findUniqueValue(CharSequence chars, int pos, long uniqueValue) {
870         int node=chars.charAt(pos++);
871         for(;;) {
872             if(node<kMinLinearMatch) {
873                 if(node==0) {
874                     node=chars.charAt(pos++);
875                 }
876                 uniqueValue=findUniqueValueFromBranch(chars, pos, node+1, uniqueValue);
877                 if(uniqueValue==0) {
878                     return 0;
879                 }
880                 pos=(int)(uniqueValue>>>33);
881                 node=chars.charAt(pos++);
882             } else if(node<kMinValueLead) {
883                 // linear-match node
884                 pos+=node-kMinLinearMatch+1;  // Ignore the match units.
885                 node=chars.charAt(pos++);
886             } else {
887                 boolean isFinal=(node&kValueIsFinal)!=0;
888                 int value;
889                 if(isFinal) {
890                     value=readValue(chars, pos, node&0x7fff);
891                 } else {
892                     value=readNodeValue(chars, pos, node);
893                 }
894                 if(uniqueValue!=0) {
895                     if(value!=(int)(uniqueValue>>1)) {
896                         return 0;
897                     }
898                 } else {
899                     uniqueValue=((long)value<<1)|1;
900                 }
901                 if(isFinal) {
902                     return uniqueValue;
903                 }
904                 pos=skipNodeValue(pos, node);
905                 node&=kNodeTypeMask;
906             }
907         }
908     }
909 
910     // Helper functions for getNextChars().
911     // getNextChars() when pos is on a branch node.
getNextBranchChars(CharSequence chars, int pos, int length, Appendable out)912     private static void getNextBranchChars(CharSequence chars, int pos, int length, Appendable out) {
913         while(length>kMaxBranchLinearSubNodeLength) {
914             ++pos;  // ignore the comparison unit
915             getNextBranchChars(chars, jumpByDelta(chars, pos), length>>1, out);
916             length=length-(length>>1);
917             pos=skipDelta(chars, pos);
918         }
919         do {
920             append(out, chars.charAt(pos++));
921             pos=skipValue(chars, pos);
922         } while(--length>1);
923         append(out, chars.charAt(pos));
924     }
append(Appendable out, int c)925     private static void append(Appendable out, int c) {
926         try {
927             out.append((char)c);
928         } catch(IOException e) {
929             throw new ICUUncheckedIOException(e);
930         }
931     }
932 
933     // CharsTrie data structure
934     //
935     // The trie consists of a series of char-serialized nodes for incremental
936     // Unicode string/char sequence matching. (char=16-bit unsigned integer)
937     // The root node is at the beginning of the trie data.
938     //
939     // Types of nodes are distinguished by their node lead unit ranges.
940     // After each node, except a final-value node, another node follows to
941     // encode match values or continue matching further units.
942     //
943     // Node types:
944     //  - Final-value node: Stores a 32-bit integer in a compact, variable-length format.
945     //    The value is for the string/char sequence so far.
946     //  - Match node, optionally with an intermediate value in a different compact format.
947     //    The value, if present, is for the string/char sequence so far.
948     //
949     //  Aside from the value, which uses the node lead unit's high bits:
950     //
951     //  - Linear-match node: Matches a number of units.
952     //  - Branch node: Branches to other nodes according to the current input unit.
953     //    The node unit is the length of the branch (number of units to select from)
954     //    minus 1. It is followed by a sub-node:
955     //    - If the length is at most kMaxBranchLinearSubNodeLength, then
956     //      there are length-1 (key, value) pairs and then one more comparison unit.
957     //      If one of the key units matches, then the value is either a final value for
958     //      the string so far, or a "jump" delta to the next node.
959     //      If the last unit matches, then matching continues with the next node.
960     //      (Values have the same encoding as final-value nodes.)
961     //    - If the length is greater than kMaxBranchLinearSubNodeLength, then
962     //      there is one unit and one "jump" delta.
963     //      If the input unit is less than the sub-node unit, then "jump" by delta to
964     //      the next sub-node which will have a length of length/2.
965     //      (The delta has its own compact encoding.)
966     //      Otherwise, skip the "jump" delta to the next sub-node
967     //      which will have a length of length-length/2.
968 
969     // Match-node lead unit values, after masking off intermediate-value bits:
970 
971     // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise
972     // the length is one more than the next unit.
973 
974     // For a branch sub-node with at most this many entries, we drop down
975     // to a linear search.
976     /*package*/ static final int kMaxBranchLinearSubNodeLength=5;
977 
978     // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
979     /*package*/ static final int kMinLinearMatch=0x30;
980     /*package*/ static final int kMaxLinearMatchLength=0x10;
981 
982     // Match-node lead unit bits 14..6 for the optional intermediate value.
983     // If these bits are 0, then there is no intermediate value.
984     // Otherwise, see the *NodeValue* constants below.
985     /*package*/ static final int kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength;  // 0x0040
986     /*package*/ static final int kNodeTypeMask=kMinValueLead-1;  // 0x003f
987 
988     // A final-value node has bit 15 set.
989     /*package*/ static final int kValueIsFinal=0x8000;
990 
991     // Compact value: After testing and masking off bit 15, use the following thresholds.
992     /*package*/ static final int kMaxOneUnitValue=0x3fff;
993 
994     /*package*/ static final int kMinTwoUnitValueLead=kMaxOneUnitValue+1;  // 0x4000
995     /*package*/ static final int kThreeUnitValueLead=0x7fff;
996 
997     /*package*/ static final int kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1;  // 0x3ffeffff
998 
999     // Compact intermediate-value integer, lead unit shared with a branch or linear-match node.
1000     /*package*/ static final int kMaxOneUnitNodeValue=0xff;
1001     /*package*/ static final int kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6);  // 0x4040
1002     /*package*/ static final int kThreeUnitNodeValueLead=0x7fc0;
1003 
1004     /*package*/ static final int kMaxTwoUnitNodeValue=
1005         ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1;  // 0xfdffff
1006 
1007     // Compact delta integers.
1008     /*package*/ static final int kMaxOneUnitDelta=0xfbff;
1009     /*package*/ static final int kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1;  // 0xfc00
1010     /*package*/ static final int kThreeUnitDeltaLead=0xffff;
1011 
1012     /*package*/ static final int kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1;  // 0x03feffff
1013 
1014     // Fixed value referencing the CharsTrie words.
1015     private CharSequence chars_;
1016     private int root_;
1017 
1018     // Iterator variables.
1019 
1020     // Pointer to next trie unit to read. NULL if no more matches.
1021     private int pos_;
1022     // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
1023     private int remainingMatchLength_;
1024 }
1025