1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4 *******************************************************************************
5 * Copyright (C) 2010-2014, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * CollationIterator.java, ported from collationiterator.h/.cpp
9 *
10 * C++ version created on: 2010oct27
11 * created by: Markus W. Scherer
12 */
13 
14 package com.ibm.icu.impl.coll;
15 
16 import com.ibm.icu.impl.Normalizer2Impl.Hangul;
17 import com.ibm.icu.impl.Trie2_32;
18 import com.ibm.icu.util.BytesTrie;
19 import com.ibm.icu.util.CharsTrie;
20 import com.ibm.icu.util.ICUException;
21 
22 /**
23  * Collation element iterator and abstract character iterator.
24  *
25  * When a method returns a code point value, it must be in 0..10FFFF,
26  * except it can be negative as a sentinel value.
27  */
28 public abstract class CollationIterator {
29     private static final class CEBuffer {
30         /** Large enough for CEs of most short strings. */
31         private static final int INITIAL_CAPACITY = 40;
32 
CEBuffer()33         CEBuffer() {}
34 
append(long ce)35         void append(long ce) {
36             if(length >= INITIAL_CAPACITY) {
37                 ensureAppendCapacity(1);
38             }
39             buffer[length++] = ce;
40         }
41 
appendUnsafe(long ce)42         void appendUnsafe(long ce) {
43             buffer[length++] = ce;
44         }
45 
ensureAppendCapacity(int appCap)46         void ensureAppendCapacity(int appCap) {
47             int capacity = buffer.length;
48             if((length + appCap) <= capacity) { return; }
49             do {
50                 if(capacity < 1000) {
51                     capacity *= 4;
52                 } else {
53                     capacity *= 2;
54                 }
55             } while(capacity < (length + appCap));
56             long[] newBuffer = new long[capacity];
57             System.arraycopy(buffer, 0, newBuffer, 0, length);
58             buffer = newBuffer;
59         }
60 
incLength()61         void incLength() {
62             // Use INITIAL_CAPACITY for a very simple fastpath.
63             // (Rather than buffer.getCapacity().)
64             if(length >= INITIAL_CAPACITY) {
65                 ensureAppendCapacity(1);
66             }
67             ++length;
68         }
69 
set(int i, long ce)70         long set(int i, long ce) {
71             return buffer[i] = ce;
72         }
get(int i)73         long get(int i) { return buffer[i]; }
74 
getCEs()75         long[] getCEs() { return buffer; }
76 
77         int length = 0;
78 
79         private long[] buffer = new long[INITIAL_CAPACITY];
80     }
81 
82     // State of combining marks skipped in discontiguous contraction.
83     // We create a state object on first use and keep it around deactivated between uses.
84     private static final class SkippedState {
85         // Born active but empty.
SkippedState()86         SkippedState() {}
clear()87         void clear() {
88             oldBuffer.setLength(0);
89             pos = 0;
90             // The newBuffer is reset by setFirstSkipped().
91         }
92 
isEmpty()93         boolean isEmpty() { return oldBuffer.length() == 0; }
94 
hasNext()95         boolean hasNext() { return pos < oldBuffer.length(); }
96 
97         // Requires hasNext().
next()98         int next() {
99             int c = oldBuffer.codePointAt(pos);
100             pos += Character.charCount(c);
101             return c;
102         }
103 
104         // Accounts for one more input code point read beyond the end of the marks buffer.
incBeyond()105         void incBeyond() {
106             assert(!hasNext());
107             ++pos;
108         }
109 
110         // Goes backward through the skipped-marks buffer.
111         // Returns the number of code points read beyond the skipped marks
112         // that need to be backtracked through normal input.
backwardNumCodePoints(int n)113         int backwardNumCodePoints(int n) {
114             int length = oldBuffer.length();
115             int beyond = pos - length;
116             if(beyond > 0) {
117                 if(beyond >= n) {
118                     // Not back far enough to re-enter the oldBuffer.
119                     pos -= n;
120                     return n;
121                 } else {
122                     // Back out all beyond-oldBuffer code points and re-enter the buffer.
123                     pos = oldBuffer.offsetByCodePoints(length, beyond - n);
124                     return beyond;
125                 }
126             } else {
127                 // Go backwards from inside the oldBuffer.
128                 pos = oldBuffer.offsetByCodePoints(pos, -n);
129                 return 0;
130             }
131         }
132 
setFirstSkipped(int c)133         void setFirstSkipped(int c) {
134             skipLengthAtMatch = 0;
135             newBuffer.setLength(0);
136             newBuffer.appendCodePoint(c);
137         }
138 
skip(int c)139         void skip(int c) {
140             newBuffer.appendCodePoint(c);
141         }
142 
recordMatch()143         void recordMatch() { skipLengthAtMatch = newBuffer.length(); }
144 
145         // Replaces the characters we consumed with the newly skipped ones.
replaceMatch()146         void replaceMatch() {
147             // Note: UnicodeString.replace() pins pos to at most length().
148             int oldLength = oldBuffer.length();
149             if(pos > oldLength) { pos = oldLength; }
150             oldBuffer.delete(0, pos).insert(0, newBuffer, 0, skipLengthAtMatch);
151             pos = 0;
152         }
153 
saveTrieState(CharsTrie trie)154         void saveTrieState(CharsTrie trie) { trie.saveState(state); }
resetToTrieState(CharsTrie trie)155         void resetToTrieState(CharsTrie trie) { trie.resetToState(state); }
156 
157         // Combining marks skipped in previous discontiguous-contraction matching.
158         // After that discontiguous contraction was completed, we start reading them from here.
159         private final StringBuilder oldBuffer = new StringBuilder();
160         // Combining marks newly skipped in current discontiguous-contraction matching.
161         // These might have been read from the normal text or from the oldBuffer.
162         private final StringBuilder newBuffer = new StringBuilder();
163         // Reading index in oldBuffer,
164         // or counter for how many code points have been read beyond oldBuffer (pos-oldBuffer.length()).
165         private int pos;
166         // newBuffer.length() at the time of the last matching character.
167         // When a partial match fails, we back out skipped and partial-matching input characters.
168         private int skipLengthAtMatch;
169         // We save the trie state before we attempt to match a character,
170         // so that we can skip it and try the next one.
171         private CharsTrie.State state = new CharsTrie.State();
172     };
173 
174     /**
175      * Partially constructs the iterator.
176      * In Java, we cache partially constructed iterators
177      * and finish their setup when starting to work on text
178      * (via reset(boolean) and the setText(numeric, ...) methods of subclasses).
179      * This avoids memory allocations for iterators that remain unused.
180      *
181      * <p>In C++, there is only one constructor, and iterators are
182      * stack-allocated as needed.
183      */
CollationIterator(CollationData d)184     public CollationIterator(CollationData d) {
185         trie = d.trie;
186         data = d;
187         numCpFwd = -1;
188         isNumeric = false;
189         ceBuffer = null;
190     }
191 
CollationIterator(CollationData d, boolean numeric)192     public CollationIterator(CollationData d, boolean numeric) {
193         trie = d.trie;
194         data = d;
195         numCpFwd = -1;
196         isNumeric = numeric;
197         ceBuffer = new CEBuffer();
198     }
199 
200     @Override
equals(Object other)201     public boolean equals(Object other) {
202         // Subclasses: Call this method and then add more specific checks.
203         // Compare the iterator state but not the collation data (trie & data fields):
204         // Assume that the caller compares the data.
205         // Ignore skipped since that should be unused between calls to nextCE().
206         // (It only stays around to avoid another memory allocation.)
207         if(other == null) { return false; }
208         if(!this.getClass().equals(other.getClass())) { return false; }
209         CollationIterator o = (CollationIterator)other;
210         if(!(ceBuffer.length == o.ceBuffer.length &&
211                 cesIndex == o.cesIndex &&
212                 numCpFwd == o.numCpFwd &&
213                 isNumeric == o.isNumeric)) {
214             return false;
215         }
216         for(int i = 0; i < ceBuffer.length; ++i) {
217             if(ceBuffer.get(i) != o.ceBuffer.get(i)) { return false; }
218         }
219         return true;
220     }
221 
222     @Override
hashCode()223     public int hashCode() {
224         // Dummy return to prevent compile warnings.
225         return 0;
226     }
227 
228     /**
229      * Resets the iterator state and sets the position to the specified offset.
230      * Subclasses must implement, and must call the parent class method,
231      * or CollationIterator.reset().
232      */
resetToOffset(int newOffset)233     public abstract void resetToOffset(int newOffset);
234 
getOffset()235     public abstract int getOffset();
236 
237     /**
238      * Returns the next collation element.
239      */
nextCE()240     public final long nextCE() {
241         if(cesIndex < ceBuffer.length) {
242             // Return the next buffered CE.
243             return ceBuffer.get(cesIndex++);
244         }
245         assert cesIndex == ceBuffer.length;
246         ceBuffer.incLength();
247         long cAndCE32 = handleNextCE32();
248         int c = (int)(cAndCE32 >> 32);
249         int ce32 = (int)cAndCE32;
250         int t = ce32 & 0xff;
251         if(t < Collation.SPECIAL_CE32_LOW_BYTE) {  // Forced-inline of isSpecialCE32(ce32).
252             // Normal CE from the main data.
253             // Forced-inline of ceFromSimpleCE32(ce32).
254             return ceBuffer.set(cesIndex++,
255                     ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8));
256         }
257         CollationData d;
258         // The compiler should be able to optimize the previous and the following
259         // comparisons of t with the same constant.
260         if(t == Collation.SPECIAL_CE32_LOW_BYTE) {
261             if(c < 0) {
262                 return ceBuffer.set(cesIndex++, Collation.NO_CE);
263             }
264             d = data.base;
265             ce32 = d.getCE32(c);
266             t = ce32 & 0xff;
267             if(t < Collation.SPECIAL_CE32_LOW_BYTE) {
268                 // Normal CE from the base data.
269                 return ceBuffer.set(cesIndex++,
270                         ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8));
271             }
272         } else {
273             d = data;
274         }
275         if(t == Collation.LONG_PRIMARY_CE32_LOW_BYTE) {
276             // Forced-inline of ceFromLongPrimaryCE32(ce32).
277             return ceBuffer.set(cesIndex++,
278                     ((long)(ce32 - t) << 32) | Collation.COMMON_SEC_AND_TER_CE);
279         }
280         return nextCEFromCE32(d, c, ce32);
281     }
282 
283     /**
284      * Fetches all CEs.
285      * @return getCEsLength()
286      */
fetchCEs()287     public final int fetchCEs() {
288         while(nextCE() != Collation.NO_CE) {
289             // No need to loop for each expansion CE.
290             cesIndex = ceBuffer.length;
291         }
292         return ceBuffer.length;
293     }
294 
295     /**
296      * Overwrites the current CE (the last one returned by nextCE()).
297      */
setCurrentCE(long ce)298     final void setCurrentCE(long ce) {
299         assert cesIndex > 0;
300         ceBuffer.set(cesIndex - 1, ce);
301     }
302 
303     /**
304      * Returns the previous collation element.
305      */
previousCE(UVector32 offsets)306     public final long previousCE(UVector32 offsets) {
307         if(ceBuffer.length > 0) {
308             // Return the previous buffered CE.
309             return ceBuffer.get(--ceBuffer.length);
310         }
311         offsets.removeAllElements();
312         int limitOffset = getOffset();
313         int c = previousCodePoint();
314         if(c < 0) { return Collation.NO_CE; }
315         if(data.isUnsafeBackward(c, isNumeric)) {
316             return previousCEUnsafe(c, offsets);
317         }
318         // Simple, safe-backwards iteration:
319         // Get a CE going backwards, handle prefixes but no contractions.
320         int ce32 = data.getCE32(c);
321         CollationData d;
322         if(ce32 == Collation.FALLBACK_CE32) {
323             d = data.base;
324             ce32 = d.getCE32(c);
325         } else {
326             d = data;
327         }
328         if(Collation.isSimpleOrLongCE32(ce32)) {
329             return Collation.ceFromCE32(ce32);
330         }
331         appendCEsFromCE32(d, c, ce32, false);
332         if(ceBuffer.length > 1) {
333             offsets.addElement(getOffset());
334             // For an expansion, the offset of each non-initial CE is the limit offset,
335             // consistent with forward iteration.
336             while(offsets.size() <= ceBuffer.length) {
337                 offsets.addElement(limitOffset);
338             };
339         }
340         return ceBuffer.get(--ceBuffer.length);
341     }
342 
getCEsLength()343     public final int getCEsLength() {
344         return ceBuffer.length;
345     }
346 
getCE(int i)347     public final long getCE(int i) {
348         return ceBuffer.get(i);
349     }
350 
getCEs()351     public final long[] getCEs() {
352         return ceBuffer.getCEs();
353     }
354 
clearCEs()355     final void clearCEs() {
356         cesIndex = ceBuffer.length = 0;
357     }
358 
clearCEsIfNoneRemaining()359     public final void clearCEsIfNoneRemaining() {
360         if(cesIndex == ceBuffer.length) { clearCEs(); }
361     }
362 
363     /**
364      * Returns the next code point (with post-increment).
365      * Public for identical-level comparison and for testing.
366      */
nextCodePoint()367     public abstract int nextCodePoint();
368 
369     /**
370      * Returns the previous code point (with pre-decrement).
371      * Public for identical-level comparison and for testing.
372      */
previousCodePoint()373     public abstract int previousCodePoint();
374 
reset()375     protected final void reset() {
376         cesIndex = ceBuffer.length = 0;
377         if(skipped != null) { skipped.clear(); }
378     }
379     /**
380      * Resets the state as well as the numeric setting,
381      * and completes the initialization.
382      * Only exists in Java where we reset cached CollationIterator instances
383      * rather than stack-allocating temporary ones.
384      * (See also the constructor comments.)
385      */
reset(boolean numeric)386     protected final void reset(boolean numeric) {
387         if(ceBuffer == null) {
388             ceBuffer = new CEBuffer();
389         }
390         reset();
391         isNumeric = numeric;
392     }
393 
394     /**
395      * Returns the next code point and its local CE32 value.
396      * Returns Collation.FALLBACK_CE32 at the end of the text (c<0)
397      * or when c's CE32 value is to be looked up in the base data (fallback).
398      *
399      * The code point is used for fallbacks, context and implicit weights.
400      * It is ignored when the returned CE32 is not special (e.g., FFFD_CE32).
401      *
402      * Returns the code point in bits 63..32 (signed) and the CE32 in bits 31..0.
403      */
handleNextCE32()404     protected long handleNextCE32() {
405         int c = nextCodePoint();
406         if(c < 0) { return NO_CP_AND_CE32; }
407         return makeCodePointAndCE32Pair(c, data.getCE32(c));
408     }
makeCodePointAndCE32Pair(int c, int ce32)409     protected long makeCodePointAndCE32Pair(int c, int ce32) {
410         return ((long)c << 32) | (ce32 & 0xffffffffL);
411     }
412     protected static final long NO_CP_AND_CE32 = (-1L << 32) | (Collation.FALLBACK_CE32 & 0xffffffffL);
413 
414     /**
415      * Called when handleNextCE32() returns a LEAD_SURROGATE_TAG for a lead surrogate code unit.
416      * Returns the trail surrogate in that case and advances past it,
417      * if a trail surrogate follows the lead surrogate.
418      * Otherwise returns any other code unit and does not advance.
419      */
handleGetTrailSurrogate()420     protected char handleGetTrailSurrogate() {
421         return 0;
422     }
423 
424     /**
425      * Called when handleNextCE32() returns with c==0, to see whether it is a NUL terminator.
426      * (Not needed in Java.)
427      */
428     /*protected boolean foundNULTerminator() {
429         return false;
430     }*/
431 
432     /**
433      * @return false if surrogate code points U+D800..U+DFFF
434      *         map to their own implicit primary weights (for UTF-16),
435      *         or true if they map to CE(U+FFFD) (for UTF-8)
436      */
forbidSurrogateCodePoints()437     protected boolean forbidSurrogateCodePoints() {
438         return false;
439     }
440 
forwardNumCodePoints(int num)441     protected abstract void forwardNumCodePoints(int num);
442 
backwardNumCodePoints(int num)443     protected abstract void backwardNumCodePoints(int num);
444 
445     /**
446      * Returns the CE32 from the data trie.
447      * Normally the same as data.getCE32(), but overridden in the builder.
448      * Call this only when the faster data.getCE32() cannot be used.
449      */
getDataCE32(int c)450     protected int getDataCE32(int c) {
451         return data.getCE32(c);
452     }
453 
getCE32FromBuilderData(int ce32)454     protected int getCE32FromBuilderData(int ce32) {
455         throw new ICUException("internal program error: should be unreachable");
456     }
457 
appendCEsFromCE32(CollationData d, int c, int ce32, boolean forward)458     protected final void appendCEsFromCE32(CollationData d, int c, int ce32,
459                            boolean forward) {
460         while(Collation.isSpecialCE32(ce32)) {
461             switch(Collation.tagFromCE32(ce32)) {
462             case Collation.FALLBACK_TAG:
463             case Collation.RESERVED_TAG_3:
464                 throw new ICUException("internal program error: should be unreachable");
465             case Collation.LONG_PRIMARY_TAG:
466                 ceBuffer.append(Collation.ceFromLongPrimaryCE32(ce32));
467                 return;
468             case Collation.LONG_SECONDARY_TAG:
469                 ceBuffer.append(Collation.ceFromLongSecondaryCE32(ce32));
470                 return;
471             case Collation.LATIN_EXPANSION_TAG:
472                 ceBuffer.ensureAppendCapacity(2);
473                 ceBuffer.set(ceBuffer.length, Collation.latinCE0FromCE32(ce32));
474                 ceBuffer.set(ceBuffer.length + 1, Collation.latinCE1FromCE32(ce32));
475                 ceBuffer.length += 2;
476                 return;
477             case Collation.EXPANSION32_TAG: {
478                 int index = Collation.indexFromCE32(ce32);
479                 int length = Collation.lengthFromCE32(ce32);
480                 ceBuffer.ensureAppendCapacity(length);
481                 do {
482                     ceBuffer.appendUnsafe(Collation.ceFromCE32(d.ce32s[index++]));
483                 } while(--length > 0);
484                 return;
485             }
486             case Collation.EXPANSION_TAG: {
487                 int index = Collation.indexFromCE32(ce32);
488                 int length = Collation.lengthFromCE32(ce32);
489                 ceBuffer.ensureAppendCapacity(length);
490                 do {
491                     ceBuffer.appendUnsafe(d.ces[index++]);
492                 } while(--length > 0);
493                 return;
494             }
495             case Collation.BUILDER_DATA_TAG:
496                 ce32 = getCE32FromBuilderData(ce32);
497                 if(ce32 == Collation.FALLBACK_CE32) {
498                     d = data.base;
499                     ce32 = d.getCE32(c);
500                 }
501                 break;
502             case Collation.PREFIX_TAG:
503                 if(forward) { backwardNumCodePoints(1); }
504                 ce32 = getCE32FromPrefix(d, ce32);
505                 if(forward) { forwardNumCodePoints(1); }
506                 break;
507             case Collation.CONTRACTION_TAG: {
508                 int index = Collation.indexFromCE32(ce32);
509                 int defaultCE32 = d.getCE32FromContexts(index);  // Default if no suffix match.
510                 if(!forward) {
511                     // Backward contractions are handled by previousCEUnsafe().
512                     // c has contractions but they were not found.
513                     ce32 = defaultCE32;
514                     break;
515                 }
516                 int nextCp;
517                 if(skipped == null && numCpFwd < 0) {
518                     // Some portion of nextCE32FromContraction() pulled out here as an ASCII fast path,
519                     // avoiding the function call and the nextSkippedCodePoint() overhead.
520                     nextCp = nextCodePoint();
521                     if(nextCp < 0) {
522                         // No more text.
523                         ce32 = defaultCE32;
524                         break;
525                     } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 &&
526                             !CollationFCD.mayHaveLccc(nextCp)) {
527                         // All contraction suffixes start with characters with lccc!=0
528                         // but the next code point has lccc==0.
529                         backwardNumCodePoints(1);
530                         ce32 = defaultCE32;
531                         break;
532                     }
533                 } else {
534                     nextCp = nextSkippedCodePoint();
535                     if(nextCp < 0) {
536                         // No more text.
537                         ce32 = defaultCE32;
538                         break;
539                     } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 &&
540                             !CollationFCD.mayHaveLccc(nextCp)) {
541                         // All contraction suffixes start with characters with lccc!=0
542                         // but the next code point has lccc==0.
543                         backwardNumSkipped(1);
544                         ce32 = defaultCE32;
545                         break;
546                     }
547                 }
548                 ce32 = nextCE32FromContraction(d, ce32, d.contexts, index + 2, defaultCE32, nextCp);
549                 if(ce32 == Collation.NO_CE32) {
550                     // CEs from a discontiguous contraction plus the skipped combining marks
551                     // have been appended already.
552                     return;
553                 }
554                 break;
555             }
556             case Collation.DIGIT_TAG:
557                 if(isNumeric) {
558                     appendNumericCEs(ce32, forward);
559                     return;
560                 } else {
561                     // Fetch the non-numeric-collation CE32 and continue.
562                     ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
563                     break;
564                 }
565             case Collation.U0000_TAG:
566                 assert(c == 0);
567                 // NUL-terminated input not supported in Java.
568                 // Fetch the normal ce32 for U+0000 and continue.
569                 ce32 = d.ce32s[0];
570                 break;
571             case Collation.HANGUL_TAG: {
572                 int[] jamoCE32s = d.jamoCE32s;
573                 c -= Hangul.HANGUL_BASE;
574                 int t = c % Hangul.JAMO_T_COUNT;
575                 c /= Hangul.JAMO_T_COUNT;
576                 int v = c % Hangul.JAMO_V_COUNT;
577                 c /= Hangul.JAMO_V_COUNT;
578                 if((ce32 & Collation.HANGUL_NO_SPECIAL_JAMO) != 0) {
579                     // None of the Jamo CE32s are isSpecialCE32().
580                     // Avoid recursive function calls and per-Jamo tests.
581                     ceBuffer.ensureAppendCapacity(t == 0 ? 2 : 3);
582                     ceBuffer.set(ceBuffer.length, Collation.ceFromCE32(jamoCE32s[c]));
583                     ceBuffer.set(ceBuffer.length + 1, Collation.ceFromCE32(jamoCE32s[19 + v]));
584                     ceBuffer.length += 2;
585                     if(t != 0) {
586                         ceBuffer.appendUnsafe(Collation.ceFromCE32(jamoCE32s[39 + t]));
587                     }
588                     return;
589                 } else {
590                     // We should not need to compute each Jamo code point.
591                     // In particular, there should be no offset or implicit ce32.
592                     appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[c], forward);
593                     appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[19 + v], forward);
594                     if(t == 0) { return; }
595                     // offset 39 = 19 + 21 - 1:
596                     // 19 = JAMO_L_COUNT
597                     // 21 = JAMO_T_COUNT
598                     // -1 = omit t==0
599                     ce32 = jamoCE32s[39 + t];
600                     c = Collation.SENTINEL_CP;
601                     break;
602                 }
603             }
604             case Collation.LEAD_SURROGATE_TAG: {
605                 assert(forward);  // Backward iteration should never see lead surrogate code _unit_ data.
606                 assert(isLeadSurrogate(c));
607                 char trail;
608                 if(Character.isLowSurrogate(trail = handleGetTrailSurrogate())) {
609                     c = Character.toCodePoint((char)c, trail);
610                     ce32 &= Collation.LEAD_TYPE_MASK;
611                     if(ce32 == Collation.LEAD_ALL_UNASSIGNED) {
612                         ce32 = Collation.UNASSIGNED_CE32;  // unassigned-implicit
613                     } else if(ce32 == Collation.LEAD_ALL_FALLBACK ||
614                             (ce32 = d.getCE32FromSupplementary(c)) == Collation.FALLBACK_CE32) {
615                         // fall back to the base data
616                         d = d.base;
617                         ce32 = d.getCE32FromSupplementary(c);
618                     }
619                 } else {
620                     // c is an unpaired surrogate.
621                     ce32 = Collation.UNASSIGNED_CE32;
622                 }
623                 break;
624             }
625             case Collation.OFFSET_TAG:
626                 assert(c >= 0);
627                 ceBuffer.append(d.getCEFromOffsetCE32(c, ce32));
628                 return;
629             case Collation.IMPLICIT_TAG:
630                 assert(c >= 0);
631                 if(isSurrogate(c) && forbidSurrogateCodePoints()) {
632                     ce32 = Collation.FFFD_CE32;
633                     break;
634                 } else {
635                     ceBuffer.append(Collation.unassignedCEFromCodePoint(c));
636                     return;
637                 }
638             }
639         }
640         ceBuffer.append(Collation.ceFromSimpleCE32(ce32));
641     }
642 
643     // TODO: Propose widening the UTF16 method.
isSurrogate(int c)644     private static final boolean isSurrogate(int c) {
645         return (c & 0xfffff800) == 0xd800;
646     }
647 
648     // TODO: Propose widening the UTF16 method.
isLeadSurrogate(int c)649     protected static final boolean isLeadSurrogate(int c) {
650         return (c & 0xfffffc00) == 0xd800;
651     }
652 
653     // TODO: Propose widening the UTF16 method.
isTrailSurrogate(int c)654     protected static final boolean isTrailSurrogate(int c) {
655         return (c & 0xfffffc00) == 0xdc00;
656     }
657 
658     // Main lookup trie of the data object.
659     protected final Trie2_32 trie;
660     protected final CollationData data;
661 
nextCEFromCE32(CollationData d, int c, int ce32)662     private final long nextCEFromCE32(CollationData d, int c, int ce32) {
663         --ceBuffer.length;  // Undo ceBuffer.incLength().
664         appendCEsFromCE32(d, c, ce32, true);
665         return ceBuffer.get(cesIndex++);
666     }
667 
getCE32FromPrefix(CollationData d, int ce32)668     private final int getCE32FromPrefix(CollationData d, int ce32) {
669         int index = Collation.indexFromCE32(ce32);
670         ce32 = d.getCE32FromContexts(index);  // Default if no prefix match.
671         index += 2;
672         // Number of code points read before the original code point.
673         int lookBehind = 0;
674         CharsTrie prefixes = new CharsTrie(d.contexts, index);
675         for(;;) {
676             int c = previousCodePoint();
677             if(c < 0) { break; }
678             ++lookBehind;
679             BytesTrie.Result match = prefixes.nextForCodePoint(c);
680             if(match.hasValue()) {
681                 ce32 = prefixes.getValue();
682             }
683             if(!match.hasNext()) { break; }
684         }
685         forwardNumCodePoints(lookBehind);
686         return ce32;
687     }
688 
nextSkippedCodePoint()689     private final int nextSkippedCodePoint() {
690         if(skipped != null && skipped.hasNext()) { return skipped.next(); }
691         if(numCpFwd == 0) { return Collation.SENTINEL_CP; }
692         int c = nextCodePoint();
693         if(skipped != null && !skipped.isEmpty() && c >= 0) { skipped.incBeyond(); }
694         if(numCpFwd > 0 && c >= 0) { --numCpFwd; }
695         return c;
696     }
697 
backwardNumSkipped(int n)698     private final void backwardNumSkipped(int n) {
699         if(skipped != null && !skipped.isEmpty()) {
700             n = skipped.backwardNumCodePoints(n);
701         }
702         backwardNumCodePoints(n);
703         if(numCpFwd >= 0) { numCpFwd += n; }
704     }
705 
nextCE32FromContraction( CollationData d, int contractionCE32, CharSequence trieChars, int trieOffset, int ce32, int c)706     private final int nextCE32FromContraction(
707             CollationData d, int contractionCE32,
708             CharSequence trieChars, int trieOffset, int ce32, int c) {
709         // c: next code point after the original one
710 
711         // Number of code points read beyond the original code point.
712         // Needed for discontiguous contraction matching.
713         int lookAhead = 1;
714         // Number of code points read since the last match (initially only c).
715         int sinceMatch = 1;
716         // Normally we only need a contiguous match,
717         // and therefore need not remember the suffixes state from before a mismatch for retrying.
718         // If we are already processing skipped combining marks, then we do track the state.
719         CharsTrie suffixes = new CharsTrie(trieChars, trieOffset);
720         if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); }
721         BytesTrie.Result match = suffixes.firstForCodePoint(c);
722         for(;;) {
723             int nextCp;
724             if(match.hasValue()) {
725                 ce32 = suffixes.getValue();
726                 if(!match.hasNext() || (c = nextSkippedCodePoint()) < 0) {
727                     return ce32;
728                 }
729                 if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); }
730                 sinceMatch = 1;
731             } else if(match == BytesTrie.Result.NO_MATCH || (nextCp = nextSkippedCodePoint()) < 0) {
732                 // No match for c, or partial match (BytesTrie.Result.NO_VALUE) and no further text.
733                 // Back up if necessary, and try a discontiguous contraction.
734                 if((contractionCE32 & Collation.CONTRACT_TRAILING_CCC) != 0 &&
735                         // Discontiguous contraction matching extends an existing match.
736                         // If there is no match yet, then there is nothing to do.
737                         ((contractionCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) == 0 ||
738                             sinceMatch < lookAhead)) {
739                     // The last character of at least one suffix has lccc!=0,
740                     // allowing for discontiguous contractions.
741                     // UCA S2.1.1 only processes non-starters immediately following
742                     // "a match in the table" (sinceMatch=1).
743                     if(sinceMatch > 1) {
744                         // Return to the state after the last match.
745                         // (Return to sinceMatch=0 and re-fetch the first partially-matched character.)
746                         backwardNumSkipped(sinceMatch);
747                         c = nextSkippedCodePoint();
748                         lookAhead -= sinceMatch - 1;
749                         sinceMatch = 1;
750                     }
751                     if(d.getFCD16(c) > 0xff) {
752                         return nextCE32FromDiscontiguousContraction(
753                             d, suffixes, ce32, lookAhead, c);
754                     }
755                 }
756                 break;
757             } else {
758                 // Continue after partial match (BytesTrie.Result.NO_VALUE) for c.
759                 // It does not have a result value, therefore it is not itself "a match in the table".
760                 // If a partially-matched c has ccc!=0 then
761                 // it might be skipped in discontiguous contraction.
762                 c = nextCp;
763                 ++sinceMatch;
764             }
765             ++lookAhead;
766             match = suffixes.nextForCodePoint(c);
767         }
768         backwardNumSkipped(sinceMatch);
769         return ce32;
770     }
771 
nextCE32FromDiscontiguousContraction( CollationData d, CharsTrie suffixes, int ce32, int lookAhead, int c)772     private final int nextCE32FromDiscontiguousContraction(
773             CollationData d, CharsTrie suffixes, int ce32,
774             int lookAhead, int c) {
775         // UCA section 3.3.2 Contractions:
776         // Contractions that end with non-starter characters
777         // are known as discontiguous contractions.
778         // ... discontiguous contractions must be detected in input text
779         // whenever the final sequence of non-starter characters could be rearranged
780         // so as to make a contiguous matching sequence that is canonically equivalent.
781 
782         // UCA: http://www.unicode.org/reports/tr10/#S2.1
783         // S2.1 Find the longest initial substring S at each point that has a match in the table.
784         // S2.1.1 If there are any non-starters following S, process each non-starter C.
785         // S2.1.2 If C is not blocked from S, find if S + C has a match in the table.
786         //     Note: A non-starter in a string is called blocked
787         //     if there is another non-starter of the same canonical combining class or zero
788         //     between it and the last character of canonical combining class 0.
789         // S2.1.3 If there is a match, replace S by S + C, and remove C.
790 
791         // First: Is a discontiguous contraction even possible?
792         int fcd16 = d.getFCD16(c);
793         assert(fcd16 > 0xff);  // The caller checked this already, as a shortcut.
794         int nextCp = nextSkippedCodePoint();
795         if(nextCp < 0) {
796             // No further text.
797             backwardNumSkipped(1);
798             return ce32;
799         }
800         ++lookAhead;
801         int prevCC = fcd16 & 0xff;
802         fcd16 = d.getFCD16(nextCp);
803         if(fcd16 <= 0xff) {
804             // The next code point after c is a starter (S2.1.1 "process each non-starter").
805             backwardNumSkipped(2);
806             return ce32;
807         }
808 
809         // We have read and matched (lookAhead-2) code points,
810         // read non-matching c and peeked ahead at nextCp.
811         // Return to the state before the mismatch and continue matching with nextCp.
812         if(skipped == null || skipped.isEmpty()) {
813             if(skipped == null) {
814                 skipped = new SkippedState();
815             }
816             suffixes.reset();
817             if(lookAhead > 2) {
818                 // Replay the partial match so far.
819                 backwardNumCodePoints(lookAhead);
820                 suffixes.firstForCodePoint(nextCodePoint());
821                 for(int i = 3; i < lookAhead; ++i) {
822                     suffixes.nextForCodePoint(nextCodePoint());
823                 }
824                 // Skip c (which did not match) and nextCp (which we will try now).
825                 forwardNumCodePoints(2);
826             }
827             skipped.saveTrieState(suffixes);
828         } else {
829             // Reset to the trie state before the failed match of c.
830             skipped.resetToTrieState(suffixes);
831         }
832 
833         skipped.setFirstSkipped(c);
834         // Number of code points read since the last match (at this point: c and nextCp).
835         int sinceMatch = 2;
836         c = nextCp;
837         for(;;) {
838             BytesTrie.Result match;
839             // "If C is not blocked from S, find if S + C has a match in the table." (S2.1.2)
840             if(prevCC < (fcd16 >> 8) && (match = suffixes.nextForCodePoint(c)).hasValue()) {
841                 // "If there is a match, replace S by S + C, and remove C." (S2.1.3)
842                 // Keep prevCC unchanged.
843                 ce32 = suffixes.getValue();
844                 sinceMatch = 0;
845                 skipped.recordMatch();
846                 if(!match.hasNext()) { break; }
847                 skipped.saveTrieState(suffixes);
848             } else {
849                 // No match for "S + C", skip C.
850                 skipped.skip(c);
851                 skipped.resetToTrieState(suffixes);
852                 prevCC = fcd16 & 0xff;
853             }
854             if((c = nextSkippedCodePoint()) < 0) { break; }
855             ++sinceMatch;
856             fcd16 = d.getFCD16(c);
857             if(fcd16 <= 0xff) {
858                 // The next code point after c is a starter (S2.1.1 "process each non-starter").
859                 break;
860             }
861         }
862         backwardNumSkipped(sinceMatch);
863         boolean isTopDiscontiguous = skipped.isEmpty();
864         skipped.replaceMatch();
865         if(isTopDiscontiguous && !skipped.isEmpty()) {
866             // We did get a match after skipping one or more combining marks,
867             // and we are not in a recursive discontiguous contraction.
868             // Append CEs from the contraction ce32
869             // and then from the combining marks that we skipped before the match.
870             c = Collation.SENTINEL_CP;
871             for(;;) {
872                 appendCEsFromCE32(d, c, ce32, true);
873                 // Fetch CE32s for skipped combining marks from the normal data, with fallback,
874                 // rather than from the CollationData where we found the contraction.
875                 if(!skipped.hasNext()) { break; }
876                 c = skipped.next();
877                 ce32 = getDataCE32(c);
878                 if(ce32 == Collation.FALLBACK_CE32) {
879                     d = data.base;
880                     ce32 = d.getCE32(c);
881                 } else {
882                     d = data;
883                 }
884                 // Note: A nested discontiguous-contraction match
885                 // replaces consumed combining marks with newly skipped ones
886                 // and resets the reading position to the beginning.
887             }
888             skipped.clear();
889             ce32 = Collation.NO_CE32;  // Signal to the caller that the result is in the ceBuffer.
890         }
891         return ce32;
892     }
893 
894     /**
895      * Returns the previous CE when data.isUnsafeBackward(c, isNumeric).
896      */
previousCEUnsafe(int c, UVector32 offsets)897     private final long previousCEUnsafe(int c, UVector32 offsets) {
898         // We just move through the input counting safe and unsafe code points
899         // without collecting the unsafe-backward substring into a buffer and
900         // switching to it.
901         // This is to keep the logic simple. Otherwise we would have to handle
902         // prefix matching going before the backward buffer, switching
903         // to iteration and back, etc.
904         // In the most important case of iterating over a normal string,
905         // reading from the string itself is already maximally fast.
906         // The only drawback there is that after getting the CEs we always
907         // skip backward to the safe character rather than switching out
908         // of a backwardBuffer.
909         // But this should not be the common case for previousCE(),
910         // and correctness and maintainability are more important than
911         // complex optimizations.
912         // Find the first safe character before c.
913         int numBackward = 1;
914         while((c = previousCodePoint()) >= 0) {
915             ++numBackward;
916             if(!data.isUnsafeBackward(c, isNumeric)) {
917                 break;
918             }
919         }
920         // Set the forward iteration limit.
921         // Note: This counts code points.
922         // We cannot enforce a limit in the middle of a surrogate pair or similar.
923         numCpFwd = numBackward;
924         // Reset the forward iterator.
925         cesIndex = 0;
926         assert(ceBuffer.length == 0);
927         // Go forward and collect the CEs.
928         int offset = getOffset();
929         while(numCpFwd > 0) {
930             // nextCE() normally reads one code point.
931             // Contraction matching and digit specials read more and check numCpFwd.
932             --numCpFwd;
933             // Append one or more CEs to the ceBuffer.
934             nextCE();
935             assert(ceBuffer.get(ceBuffer.length - 1) != Collation.NO_CE);
936             // No need to loop for getting each expansion CE from nextCE().
937             cesIndex = ceBuffer.length;
938             // However, we need to write an offset for each CE.
939             // This is for CollationElementIterator.getOffset() to return
940             // intermediate offsets from the unsafe-backwards segment.
941             assert(offsets.size() < ceBuffer.length);
942             offsets.addElement(offset);
943             // For an expansion, the offset of each non-initial CE is the limit offset,
944             // consistent with forward iteration.
945             offset = getOffset();
946             while(offsets.size() < ceBuffer.length) {
947                 offsets.addElement(offset);
948             };
949         }
950         assert(offsets.size() == ceBuffer.length);
951         // End offset corresponding to just after the unsafe-backwards segment.
952         offsets.addElement(offset);
953         // Reset the forward iteration limit
954         // and move backward to before the segment for which we fetched CEs.
955         numCpFwd = -1;
956         backwardNumCodePoints(numBackward);
957         // Use the collected CEs and return the last one.
958         cesIndex = 0;  // Avoid cesIndex > ceBuffer.length when that gets decremented.
959         return ceBuffer.get(--ceBuffer.length);
960     }
961 
962     /**
963      * Turns a string of digits (bytes 0..9)
964      * into a sequence of CEs that will sort in numeric order.
965      *
966      * Starts from this ce32's digit value and consumes the following/preceding digits.
967      * The digits string must not be empty and must not have leading zeros.
968      */
969     private final void appendNumericCEs(int ce32, boolean forward) {
970         // Collect digits.
971         // TODO: Use some kind of a byte buffer? We only store values 0..9.
972         StringBuilder digits = new StringBuilder();
973         if(forward) {
974             for(;;) {
975                 char digit = Collation.digitFromCE32(ce32);
976                 digits.append(digit);
977                 if(numCpFwd == 0) { break; }
978                 int c = nextCodePoint();
979                 if(c < 0) { break; }
980                 ce32 = data.getCE32(c);
981                 if(ce32 == Collation.FALLBACK_CE32) {
982                     ce32 = data.base.getCE32(c);
983                 }
984                 if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) {
985                     backwardNumCodePoints(1);
986                     break;
987                 }
988                 if(numCpFwd > 0) { --numCpFwd; }
989             }
990         } else {
991             for(;;) {
992                 char digit = Collation.digitFromCE32(ce32);
993                 digits.append(digit);
994                 int c = previousCodePoint();
995                 if(c < 0) { break; }
996                 ce32 = data.getCE32(c);
997                 if(ce32 == Collation.FALLBACK_CE32) {
998                     ce32 = data.base.getCE32(c);
999                 }
1000                 if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) {
1001                     forwardNumCodePoints(1);
1002                     break;
1003                 }
1004             }
1005             // Reverse the digit string.
1006             digits.reverse();
1007         }
1008         int pos = 0;
1009         do {
1010             // Skip leading zeros.
1011             while(pos < (digits.length() - 1) && digits.charAt(pos) == 0) { ++pos; }
1012             // Write a sequence of CEs for at most 254 digits at a time.
1013             int segmentLength = digits.length() - pos;
1014             if(segmentLength > 254) { segmentLength = 254; }
1015             appendNumericSegmentCEs(digits.subSequence(pos, pos + segmentLength));
1016             pos += segmentLength;
1017         } while(pos < digits.length());
1018     }
1019 
1020     /**
1021      * Turns 1..254 digits into a sequence of CEs.
1022      * Called by appendNumericCEs() for each segment of at most 254 digits.
1023      */
1024     private final void appendNumericSegmentCEs(CharSequence digits) {
1025         int length = digits.length();
1026         assert(1 <= length && length <= 254);
1027         assert(length == 1 || digits.charAt(0) != 0);
1028         long numericPrimary = data.numericPrimary;
1029         // Note: We use primary byte values 2..255: digits are not compressible.
1030         if(length <= 7) {
1031             // Very dense encoding for small numbers.
1032             int value = digits.charAt(0);
1033             for(int i = 1; i < length; ++i) {
1034                 value = value * 10 + digits.charAt(i);
1035             }
1036             // Primary weight second byte values:
1037             //     74 byte values   2.. 75 for small numbers in two-byte primary weights.
1038             //     40 byte values  76..115 for medium numbers in three-byte primary weights.
1039             //     16 byte values 116..131 for large numbers in four-byte primary weights.
1040             //    124 byte values 132..255 for very large numbers with 4..127 digit pairs.
1041             int firstByte = 2;
1042             int numBytes = 74;
1043             if(value < numBytes) {
1044                 // Two-byte primary for 0..73, good for day & month numbers etc.
1045                 long primary = numericPrimary | ((firstByte + value) << 16);
1046                 ceBuffer.append(Collation.makeCE(primary));
1047                 return;
1048             }
1049             value -= numBytes;
1050             firstByte += numBytes;
1051             numBytes = 40;
1052             if(value < numBytes * 254) {
1053                 // Three-byte primary for 74..10233=74+40*254-1, good for year numbers and more.
1054                 long primary = numericPrimary |
1055                     ((firstByte + value / 254) << 16) | ((2 + value % 254) << 8);
1056                 ceBuffer.append(Collation.makeCE(primary));
1057                 return;
1058             }
1059             value -= numBytes * 254;
1060             firstByte += numBytes;
1061             numBytes = 16;
1062             if(value < numBytes * 254 * 254) {
1063                 // Four-byte primary for 10234..1042489=10234+16*254*254-1.
1064                 long primary = numericPrimary | (2 + value % 254);
1065                 value /= 254;
1066                 primary |= (2 + value % 254) << 8;
1067                 value /= 254;
1068                 primary |= (firstByte + value % 254) << 16;
1069                 ceBuffer.append(Collation.makeCE(primary));
1070                 return;
1071             }
1072             // original value > 1042489
1073         }
1074         assert(length >= 7);
1075 
1076         // The second primary byte value 132..255 indicates the number of digit pairs (4..127),
1077         // then we generate primary bytes with those pairs.
1078         // Omit trailing 00 pairs.
1079         // Decrement the value for the last pair.
1080 
1081         // Set the exponent. 4 pairs.132, 5 pairs.133, ..., 127 pairs.255.
1082         int numPairs = (length + 1) / 2;
1083         long primary = numericPrimary | ((132 - 4 + numPairs) << 16);
1084         // Find the length without trailing 00 pairs.
1085         while(digits.charAt(length - 1) == 0 && digits.charAt(length - 2) == 0) {
1086             length -= 2;
1087         }
1088         // Read the first pair.
1089         int pair;
1090         int pos;
1091         if((length & 1) != 0) {
1092             // Only "half a pair" if we have an odd number of digits.
1093             pair = digits.charAt(0);
1094             pos = 1;
1095         } else {
1096             pair = digits.charAt(0) * 10 + digits.charAt(1);
1097             pos = 2;
1098         }
1099         pair = 11 + 2 * pair;
1100         // Add the pairs of digits between pos and length.
1101         int shift = 8;
1102         while(pos < length) {
1103             if(shift == 0) {
1104                 // Every three pairs/bytes we need to store a 4-byte-primary CE
1105                 // and start with a new CE with the '0' primary lead byte.
1106                 primary |= pair;
1107                 ceBuffer.append(Collation.makeCE(primary));
1108                 primary = numericPrimary;
1109                 shift = 16;
1110             } else {
1111                 primary |= pair << shift;
1112                 shift -= 8;
1113             }
1114             pair = 11 + 2 * (digits.charAt(pos) * 10 + digits.charAt(pos + 1));
1115             pos += 2;
1116         }
1117         primary |= (pair - 1) << shift;
1118         ceBuffer.append(Collation.makeCE(primary));
1119     }
1120 
1121     private CEBuffer ceBuffer;
1122     private int cesIndex;
1123 
1124     private SkippedState skipped;
1125 
1126     // Number of code points to read forward, or -1.
1127     // Used as a forward iteration limit in previousCEUnsafe().
1128     private int numCpFwd;
1129     // Numeric collation (CollationSettings.NUMERIC).
1130     private boolean isNumeric;
1131 }
1132