1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4 *******************************************************************************
5 * Copyright (C) 2013-2015, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * CollationFastLatin.java, ported from collationfastlatin.h/.cpp
9 *
10 * C++ version created on: 2013aug09
11 * created by: Markus W. Scherer
12 */
13 
14 package com.ibm.icu.impl.coll;
15 
16 import com.ibm.icu.lang.UScript;
17 import com.ibm.icu.text.Collator;
18 
19 public final class CollationFastLatin /* all static */ {
20     /**
21      * Fast Latin format version (one byte 1..FF).
22      * Must be incremented for any runtime-incompatible changes,
23      * in particular, for changes to any of the following constants.
24      *
25      * When the major version number of the main data format changes,
26      * we can reset this fast Latin version to 1.
27      */
28     public static final int VERSION = 2;
29 
30     public static final int LATIN_MAX = 0x17f;
31     public static final int LATIN_LIMIT = LATIN_MAX + 1;
32 
33     static final int LATIN_MAX_UTF8_LEAD = 0xc5;  // UTF-8 lead byte of LATIN_MAX
34 
35     static final int PUNCT_START = 0x2000;
36     static final int PUNCT_LIMIT = 0x2040;
37 
38     // excludes U+FFFE & U+FFFF
39     static final int NUM_FAST_CHARS = LATIN_LIMIT + (PUNCT_LIMIT - PUNCT_START);
40 
41     // Note on the supported weight ranges:
42     // Analysis of UCA 6.3 and CLDR 23 non-search tailorings shows that
43     // the CEs for characters in the above ranges, excluding expansions with length >2,
44     // excluding contractions of >2 characters, and other restrictions
45     // (see the builder's getCEsFromCE32()),
46     // use at most about 150 primary weights,
47     // where about 94 primary weights are possibly-variable (space/punct/symbol/currency),
48     // at most 4 secondary before-common weights,
49     // at most 4 secondary after-common weights,
50     // at most 16 secondary high weights (in secondary CEs), and
51     // at most 4 tertiary after-common weights.
52     // The following ranges are designed to support slightly more weights than that.
53     // (en_US_POSIX is unusual: It creates about 64 variable + 116 Latin primaries.)
54 
55     // Digits may use long primaries (preserving more short ones)
56     // or short primaries (faster) without changing this data structure.
57     // (If we supported numeric collation, then digits would have to have long primaries
58     // so that special handling does not affect the fast path.)
59 
60     static final int SHORT_PRIMARY_MASK = 0xfc00;  // bits 15..10
61     static final int INDEX_MASK = 0x3ff;  // bits 9..0 for expansions & contractions
62     static final int SECONDARY_MASK = 0x3e0;  // bits 9..5
63     static final int CASE_MASK = 0x18;  // bits 4..3
64     static final int LONG_PRIMARY_MASK = 0xfff8;  // bits 15..3
65     static final int TERTIARY_MASK = 7;  // bits 2..0
66     static final int CASE_AND_TERTIARY_MASK = CASE_MASK | TERTIARY_MASK;
67 
68     static final int TWO_SHORT_PRIMARIES_MASK =
69             (SHORT_PRIMARY_MASK << 16) | SHORT_PRIMARY_MASK;  // 0xfc00fc00
70     static final int TWO_LONG_PRIMARIES_MASK =
71             (LONG_PRIMARY_MASK << 16) | LONG_PRIMARY_MASK;  // 0xfff8fff8
72     static final int TWO_SECONDARIES_MASK =
73             (SECONDARY_MASK << 16) | SECONDARY_MASK;  // 0x3e003e0
74     static final int TWO_CASES_MASK =
75             (CASE_MASK << 16) | CASE_MASK;  // 0x180018
76     static final int TWO_TERTIARIES_MASK =
77             (TERTIARY_MASK << 16) | TERTIARY_MASK;  // 0x70007
78 
79     /**
80      * Contraction with one fast Latin character.
81      * Use INDEX_MASK to find the start of the contraction list after the fixed table.
82      * The first entry contains the default mapping.
83      * Otherwise use CONTR_CHAR_MASK for the contraction character index
84      * (in ascending order).
85      * Use CONTR_LENGTH_SHIFT for the length of the entry
86      * (1=BAIL_OUT, 2=one CE, 3=two CEs).
87      *
88      * Also, U+0000 maps to a contraction entry, so that the fast path need not
89      * check for NUL termination.
90      * It usually maps to a contraction list with only the completely ignorable default value.
91      */
92     static final int CONTRACTION = 0x400;
93     /**
94      * An expansion encodes two CEs.
95      * Use INDEX_MASK to find the pair of CEs after the fixed table.
96      *
97      * The higher a mini CE value, the easier it is to process.
98      * For expansions and higher, no context needs to be considered.
99      */
100     static final int EXPANSION = 0x800;
101     /**
102      * Encodes one CE with a long/low mini primary (there are 128).
103      * All potentially-variable primaries must be in this range,
104      * to make the short-primary path as fast as possible.
105      */
106     static final int MIN_LONG = 0xc00;
107     static final int LONG_INC = 8;
108     static final int MAX_LONG = 0xff8;
109     /**
110      * Encodes one CE with a short/high primary (there are 60),
111      * plus a secondary CE if the secondary weight is high.
112      * Fast handling: At least all letter primaries should be in this range.
113      */
114     static final int MIN_SHORT = 0x1000;
115     static final int SHORT_INC = 0x400;
116     /** The highest primary weight is reserved for U+FFFF. */
117     static final int MAX_SHORT = SHORT_PRIMARY_MASK;
118 
119     static final int MIN_SEC_BEFORE = 0;  // must add SEC_OFFSET
120     static final int SEC_INC = 0x20;
121     static final int MAX_SEC_BEFORE = MIN_SEC_BEFORE + 4 * SEC_INC;  // 5 before common
122     static final int COMMON_SEC = MAX_SEC_BEFORE + SEC_INC;
123     static final int MIN_SEC_AFTER = COMMON_SEC + SEC_INC;
124     static final int MAX_SEC_AFTER = MIN_SEC_AFTER + 5 * SEC_INC;  // 6 after common
125     static final int MIN_SEC_HIGH = MAX_SEC_AFTER + SEC_INC;  // 20 high secondaries
126     static final int MAX_SEC_HIGH = SECONDARY_MASK;
127 
128     /**
129      * Lookup: Add this offset to secondary weights, except for completely ignorable CEs.
130      * Must be greater than any special value, e.g., MERGE_WEIGHT.
131      * The exact value is not relevant for the format version.
132      */
133     static final int SEC_OFFSET = SEC_INC;
134     static final int COMMON_SEC_PLUS_OFFSET = COMMON_SEC + SEC_OFFSET;
135 
136     static final int TWO_SEC_OFFSETS =
137             (SEC_OFFSET << 16) | SEC_OFFSET;  // 0x200020
138     static final int TWO_COMMON_SEC_PLUS_OFFSET =
139             (COMMON_SEC_PLUS_OFFSET << 16) | COMMON_SEC_PLUS_OFFSET;
140 
141     static final int LOWER_CASE = 8;  // case bits include this offset
142     static final int TWO_LOWER_CASES = (LOWER_CASE << 16) | LOWER_CASE;  // 0x80008
143 
144     static final int COMMON_TER = 0;  // must add TER_OFFSET
145     static final int MAX_TER_AFTER = 7;  // 7 after common
146 
147     /**
148      * Lookup: Add this offset to tertiary weights, except for completely ignorable CEs.
149      * Must be greater than any special value, e.g., MERGE_WEIGHT.
150      * Must be greater than case bits as well, so that with combined case+tertiary weights
151      * plus the offset the tertiary bits does not spill over into the case bits.
152      * The exact value is not relevant for the format version.
153      */
154     static final int TER_OFFSET = SEC_OFFSET;
155     static final int COMMON_TER_PLUS_OFFSET = COMMON_TER + TER_OFFSET;
156 
157     static final int TWO_TER_OFFSETS = (TER_OFFSET << 16) | TER_OFFSET;
158     static final int TWO_COMMON_TER_PLUS_OFFSET =
159             (COMMON_TER_PLUS_OFFSET << 16) | COMMON_TER_PLUS_OFFSET;
160 
161     static final int MERGE_WEIGHT = 3;
162     static final int EOS = 2;  // end of string
163     static final int BAIL_OUT = 1;
164 
165     /**
166      * Contraction result first word bits 8..0 contain the
167      * second contraction character, as a char index 0..NUM_FAST_CHARS-1.
168      * Each contraction list is terminated with a word containing CONTR_CHAR_MASK.
169      */
170     static final int CONTR_CHAR_MASK = 0x1ff;
171     /**
172      * Contraction result first word bits 10..9 contain the result length:
173      * 1=bail out, 2=one mini CE, 3=two mini CEs
174      */
175     static final int CONTR_LENGTH_SHIFT = 9;
176 
177     /**
178      * Comparison return value when the regular comparison must be used.
179      * The exact value is not relevant for the format version.
180      */
181     public static final int BAIL_OUT_RESULT = -2;
182 
getCharIndex(char c)183     static int getCharIndex(char c) {
184         if(c <= LATIN_MAX) {
185             return c;
186         } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
187             return c - (PUNCT_START - LATIN_LIMIT);
188         } else {
189             // Not a fast Latin character.
190             // Note: U+FFFE & U+FFFF are forbidden in tailorings
191             // and thus do not occur in any contractions.
192             return -1;
193         }
194     }
195 
196     /**
197      * Computes the options value for the compare functions
198      * and writes the precomputed primary weights.
199      * Returns -1 if the Latin fastpath is not supported for the data and settings.
200      * The capacity must be LATIN_LIMIT.
201      */
getOptions(CollationData data, CollationSettings settings, char[] primaries)202     public static int getOptions(CollationData data, CollationSettings settings,
203             char[] primaries) {
204         char[] header = data.fastLatinTableHeader;
205         if(header == null) { return -1; }
206         assert((header[0] >> 8) == VERSION);
207         if(primaries.length != LATIN_LIMIT) {
208             assert false;
209             return -1;
210         }
211 
212         int miniVarTop;
213         if((settings.options & CollationSettings.ALTERNATE_MASK) == 0) {
214             // No mini primaries are variable, set a variableTop just below the
215             // lowest long mini primary.
216             miniVarTop = MIN_LONG - 1;
217         } else {
218             int headerLength = header[0] & 0xff;
219             int i = 1 + settings.getMaxVariable();
220             if(i >= headerLength) {
221                 return -1;  // variableTop >= digits, should not occur
222             }
223             miniVarTop = header[i];
224         }
225 
226         boolean digitsAreReordered = false;
227         if(settings.hasReordering()) {
228             long prevStart = 0;
229             long beforeDigitStart = 0;
230             long digitStart = 0;
231             long afterDigitStart = 0;
232             for(int group = Collator.ReorderCodes.FIRST;
233                     group < Collator.ReorderCodes.FIRST + CollationData.MAX_NUM_SPECIAL_REORDER_CODES;
234                     ++group) {
235                 long start = data.getFirstPrimaryForGroup(group);
236                 start = settings.reorder(start);
237                 if(group == Collator.ReorderCodes.DIGIT) {
238                     beforeDigitStart = prevStart;
239                     digitStart = start;
240                 } else if(start != 0) {
241                     if(start < prevStart) {
242                         // The permutation affects the groups up to Latin.
243                         return -1;
244                     }
245                     // In the future, there might be a special group between digits & Latin.
246                     if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) {
247                         afterDigitStart = start;
248                     }
249                     prevStart = start;
250                 }
251             }
252             long latinStart = data.getFirstPrimaryForGroup(UScript.LATIN);
253             latinStart = settings.reorder(latinStart);
254             if(latinStart < prevStart) {
255                 return -1;
256             }
257             if(afterDigitStart == 0) {
258                 afterDigitStart = latinStart;
259             }
260             if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) {
261                 digitsAreReordered = true;
262             }
263         }
264 
265         char[] table = data.fastLatinTable;  // skip the header
266         for(int c = 0; c < LATIN_LIMIT; ++c) {
267             int p = table[c];
268             if(p >= MIN_SHORT) {
269                 p &= SHORT_PRIMARY_MASK;
270             } else if(p > miniVarTop) {
271                 p &= LONG_PRIMARY_MASK;
272             } else {
273                 p = 0;
274             }
275             primaries[c] = (char)p;
276         }
277         if(digitsAreReordered || (settings.options & CollationSettings.NUMERIC) != 0) {
278             // Bail out for digits.
279             for(int c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; }
280         }
281 
282         // Shift the miniVarTop above other options.
283         return (miniVarTop << 16) | settings.options;
284     }
285 
compareUTF16(char[] table, char[] primaries, int options, CharSequence left, CharSequence right, int startIndex)286     public static int compareUTF16(char[] table, char[] primaries, int options,
287             CharSequence left, CharSequence right, int startIndex) {
288         // This is a modified copy of CollationCompare.compareUpToQuaternary(),
289         // optimized for common Latin text.
290         // Keep them in sync!
291 
292         int variableTop = options >> 16;  // see getOptions()
293         options &= 0xffff;  // needed for CollationSettings.getStrength() to work
294 
295         // Check for supported characters, fetch mini CEs, and compare primaries.
296         int leftIndex = startIndex, rightIndex = startIndex;
297         /**
298          * Single mini CE or a pair.
299          * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
300          * If there is only one, then it is in the lower bits, and the upper bits are 0.
301          */
302         int leftPair = 0, rightPair = 0;
303         for(;;) {
304             // We fetch CEs until we get a non-ignorable primary or reach the end.
305             while(leftPair == 0) {
306                 if(leftIndex == left.length()) {
307                     leftPair = EOS;
308                     break;
309                 }
310                 int c = left.charAt(leftIndex++);
311                 if(c <= LATIN_MAX) {
312                     leftPair = primaries[c];
313                     if(leftPair != 0) { break; }
314                     if(c <= 0x39 && c >= 0x30 && (options & CollationSettings.NUMERIC) != 0) {
315                         return BAIL_OUT_RESULT;
316                     }
317                     leftPair = table[c];
318                 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
319                     leftPair = table[c - PUNCT_START + LATIN_LIMIT];
320                 } else {
321                     leftPair = lookup(table, c);
322                 }
323                 if(leftPair >= MIN_SHORT) {
324                     leftPair &= SHORT_PRIMARY_MASK;
325                     break;
326                 } else if(leftPair > variableTop) {
327                     leftPair &= LONG_PRIMARY_MASK;
328                     break;
329                 } else {
330                     long pairAndInc = nextPair(table, c, leftPair, left, leftIndex);
331                     if(pairAndInc < 0) {
332                         ++leftIndex;
333                         pairAndInc = ~pairAndInc;
334                     }
335                     leftPair = (int)pairAndInc;
336                     if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
337                     leftPair = getPrimaries(variableTop, leftPair);
338                 }
339             }
340 
341             while(rightPair == 0) {
342                 if(rightIndex == right.length()) {
343                     rightPair = EOS;
344                     break;
345                 }
346                 int c = right.charAt(rightIndex++);
347                 if(c <= LATIN_MAX) {
348                     rightPair = primaries[c];
349                     if(rightPair != 0) { break; }
350                     if(c <= 0x39 && c >= 0x30 && (options & CollationSettings.NUMERIC) != 0) {
351                         return BAIL_OUT_RESULT;
352                     }
353                     rightPair = table[c];
354                 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
355                     rightPair = table[c - PUNCT_START + LATIN_LIMIT];
356                 } else {
357                     rightPair = lookup(table, c);
358                 }
359                 if(rightPair >= MIN_SHORT) {
360                     rightPair &= SHORT_PRIMARY_MASK;
361                     break;
362                 } else if(rightPair > variableTop) {
363                     rightPair &= LONG_PRIMARY_MASK;
364                     break;
365                 } else {
366                     long pairAndInc = nextPair(table, c, rightPair, right, rightIndex);
367                     if(pairAndInc < 0) {
368                         ++rightIndex;
369                         pairAndInc = ~pairAndInc;
370                     }
371                     rightPair = (int)pairAndInc;
372                     if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
373                     rightPair = getPrimaries(variableTop, rightPair);
374                 }
375             }
376 
377             if(leftPair == rightPair) {
378                 if(leftPair == EOS) { break; }
379                 leftPair = rightPair = 0;
380                 continue;
381             }
382             int leftPrimary = leftPair & 0xffff;
383             int rightPrimary = rightPair & 0xffff;
384             if(leftPrimary != rightPrimary) {
385                 // Return the primary difference.
386                 return (leftPrimary < rightPrimary) ? Collation.LESS : Collation.GREATER;
387             }
388             if(leftPair == EOS) { break; }
389             leftPair >>>= 16;
390             rightPair >>>= 16;
391         }
392         // In the following, we need to re-fetch each character because we did not buffer the CEs,
393         // but we know that the string is well-formed and
394         // only contains supported characters and mappings.
395 
396         // We might skip the secondary level but continue with the case level
397         // which is turned on separately.
398         if(CollationSettings.getStrength(options) >= Collator.SECONDARY) {
399             leftIndex = rightIndex = startIndex;
400             leftPair = rightPair = 0;
401             for(;;) {
402                 while(leftPair == 0) {
403                     if(leftIndex == left.length()) {
404                         leftPair = EOS;
405                         break;
406                     }
407                     int c = left.charAt(leftIndex++);
408                     if(c <= LATIN_MAX) {
409                         leftPair = table[c];
410                     } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
411                         leftPair = table[c - PUNCT_START + LATIN_LIMIT];
412                     } else {
413                         leftPair = lookup(table, c);
414                     }
415                     if(leftPair >= MIN_SHORT) {
416                         leftPair = getSecondariesFromOneShortCE(leftPair);
417                         break;
418                     } else if(leftPair > variableTop) {
419                         leftPair = COMMON_SEC_PLUS_OFFSET;
420                         break;
421                     } else {
422                         long pairAndInc = nextPair(table, c, leftPair, left, leftIndex);
423                         if(pairAndInc < 0) {
424                             ++leftIndex;
425                             pairAndInc = ~pairAndInc;
426                         }
427                         leftPair = getSecondaries(variableTop, (int)pairAndInc);
428                     }
429                 }
430 
431                 while(rightPair == 0) {
432                     if(rightIndex == right.length()) {
433                         rightPair = EOS;
434                         break;
435                     }
436                     int c = right.charAt(rightIndex++);
437                     if(c <= LATIN_MAX) {
438                         rightPair = table[c];
439                     } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
440                         rightPair = table[c - PUNCT_START + LATIN_LIMIT];
441                     } else {
442                         rightPair = lookup(table, c);
443                     }
444                     if(rightPair >= MIN_SHORT) {
445                         rightPair = getSecondariesFromOneShortCE(rightPair);
446                         break;
447                     } else if(rightPair > variableTop) {
448                         rightPair = COMMON_SEC_PLUS_OFFSET;
449                         break;
450                     } else {
451                         long pairAndInc = nextPair(table, c, rightPair, right, rightIndex);
452                         if(pairAndInc < 0) {
453                             ++rightIndex;
454                             pairAndInc = ~pairAndInc;
455                         }
456                         rightPair = getSecondaries(variableTop, (int)pairAndInc);
457                     }
458                 }
459 
460                 if(leftPair == rightPair) {
461                     if(leftPair == EOS) { break; }
462                     leftPair = rightPair = 0;
463                     continue;
464                 }
465                 int leftSecondary = leftPair & 0xffff;
466                 int rightSecondary = rightPair & 0xffff;
467                 if(leftSecondary != rightSecondary) {
468                     if((options & CollationSettings.BACKWARD_SECONDARY) != 0) {
469                         // Full support for backwards secondary requires backwards contraction matching
470                         // and moving backwards between merge separators.
471                         return BAIL_OUT_RESULT;
472                     }
473                     return (leftSecondary < rightSecondary) ? Collation.LESS : Collation.GREATER;
474                 }
475                 if(leftPair == EOS) { break; }
476                 leftPair >>>= 16;
477                 rightPair >>>= 16;
478             }
479         }
480 
481         if((options & CollationSettings.CASE_LEVEL) != 0) {
482             boolean strengthIsPrimary = CollationSettings.getStrength(options) == Collator.PRIMARY;
483             leftIndex = rightIndex = startIndex;
484             leftPair = rightPair = 0;
485             for(;;) {
486                 while(leftPair == 0) {
487                     if(leftIndex == left.length()) {
488                         leftPair = EOS;
489                         break;
490                     }
491                     int c = left.charAt(leftIndex++);
492                     leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
493                     if(leftPair < MIN_LONG) {
494                         long pairAndInc = nextPair(table, c, leftPair, left, leftIndex);
495                         if(pairAndInc < 0) {
496                             ++leftIndex;
497                             pairAndInc = ~pairAndInc;
498                         }
499                         leftPair = (int)pairAndInc;
500                     }
501                     leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
502                 }
503 
504                 while(rightPair == 0) {
505                     if(rightIndex == right.length()) {
506                         rightPair = EOS;
507                         break;
508                     }
509                     int c = right.charAt(rightIndex++);
510                     rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
511                     if(rightPair < MIN_LONG) {
512                         long pairAndInc = nextPair(table, c, rightPair, right, rightIndex);
513                         if(pairAndInc < 0) {
514                             ++rightIndex;
515                             pairAndInc = ~pairAndInc;
516                         }
517                         rightPair = (int)pairAndInc;
518                     }
519                     rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
520                 }
521 
522                 if(leftPair == rightPair) {
523                     if(leftPair == EOS) { break; }
524                     leftPair = rightPair = 0;
525                     continue;
526                 }
527                 int leftCase = leftPair & 0xffff;
528                 int rightCase = rightPair & 0xffff;
529                 if(leftCase != rightCase) {
530                     if((options & CollationSettings.UPPER_FIRST) == 0) {
531                         return (leftCase < rightCase) ? Collation.LESS : Collation.GREATER;
532                     } else {
533                         return (leftCase < rightCase) ? Collation.GREATER : Collation.LESS;
534                     }
535                 }
536                 if(leftPair == EOS) { break; }
537                 leftPair >>>= 16;
538                 rightPair >>>= 16;
539             }
540         }
541         if(CollationSettings.getStrength(options) <= Collator.SECONDARY) { return Collation.EQUAL; }
542 
543         // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
544         boolean withCaseBits = CollationSettings.isTertiaryWithCaseBits(options);
545 
546         leftIndex = rightIndex = startIndex;
547         leftPair = rightPair = 0;
548         for(;;) {
549             while(leftPair == 0) {
550                 if(leftIndex == left.length()) {
551                     leftPair = EOS;
552                     break;
553                 }
554                 int c = left.charAt(leftIndex++);
555                 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
556                 if(leftPair < MIN_LONG) {
557                     long pairAndInc = nextPair(table, c, leftPair, left, leftIndex);
558                     if(pairAndInc < 0) {
559                         ++leftIndex;
560                         pairAndInc = ~pairAndInc;
561                     }
562                     leftPair = (int)pairAndInc;
563                 }
564                 leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
565             }
566 
567             while(rightPair == 0) {
568                 if(rightIndex == right.length()) {
569                     rightPair = EOS;
570                     break;
571                 }
572                 int c = right.charAt(rightIndex++);
573                 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
574                 if(rightPair < MIN_LONG) {
575                     long pairAndInc = nextPair(table, c, rightPair, right, rightIndex);
576                     if(pairAndInc < 0) {
577                         ++rightIndex;
578                         pairAndInc = ~pairAndInc;
579                     }
580                     rightPair = (int)pairAndInc;
581                 }
582                 rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
583             }
584 
585             if(leftPair == rightPair) {
586                 if(leftPair == EOS) { break; }
587                 leftPair = rightPair = 0;
588                 continue;
589             }
590             int leftTertiary = leftPair & 0xffff;
591             int rightTertiary = rightPair & 0xffff;
592             if(leftTertiary != rightTertiary) {
593                 if(CollationSettings.sortsTertiaryUpperCaseFirst(options)) {
594                     // Pass through EOS and MERGE_WEIGHT
595                     // and keep real tertiary weights larger than the MERGE_WEIGHT.
596                     // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
597                     if(leftTertiary > MERGE_WEIGHT) {
598                         leftTertiary ^= CASE_MASK;
599                     }
600                     if(rightTertiary > MERGE_WEIGHT) {
601                         rightTertiary ^= CASE_MASK;
602                     }
603                 }
604                 return (leftTertiary < rightTertiary) ? Collation.LESS : Collation.GREATER;
605             }
606             if(leftPair == EOS) { break; }
607             leftPair >>>= 16;
608             rightPair >>>= 16;
609         }
610         if(CollationSettings.getStrength(options) <= Collator.TERTIARY) { return Collation.EQUAL; }
611 
612         leftIndex = rightIndex = startIndex;
613         leftPair = rightPair = 0;
614         for(;;) {
615             while(leftPair == 0) {
616                 if(leftIndex == left.length()) {
617                     leftPair = EOS;
618                     break;
619                 }
620                 int c = left.charAt(leftIndex++);
621                 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
622                 if(leftPair < MIN_LONG) {
623                     long pairAndInc = nextPair(table, c, leftPair, left, leftIndex);
624                     if(pairAndInc < 0) {
625                         ++leftIndex;
626                         pairAndInc = ~pairAndInc;
627                     }
628                     leftPair = (int)pairAndInc;
629                 }
630                 leftPair = getQuaternaries(variableTop, leftPair);
631             }
632 
633             while(rightPair == 0) {
634                 if(rightIndex == right.length()) {
635                     rightPair = EOS;
636                     break;
637                 }
638                 int c = right.charAt(rightIndex++);
639                 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
640                 if(rightPair < MIN_LONG) {
641                     long pairAndInc = nextPair(table, c, rightPair, right, rightIndex);
642                     if(pairAndInc < 0) {
643                         ++rightIndex;
644                         pairAndInc = ~pairAndInc;
645                     }
646                     rightPair = (int)pairAndInc;
647                 }
648                 rightPair = getQuaternaries(variableTop, rightPair);
649             }
650 
651             if(leftPair == rightPair) {
652                 if(leftPair == EOS) { break; }
653                 leftPair = rightPair = 0;
654                 continue;
655             }
656             int leftQuaternary = leftPair & 0xffff;
657             int rightQuaternary = rightPair & 0xffff;
658             if(leftQuaternary != rightQuaternary) {
659                 return (leftQuaternary < rightQuaternary) ? Collation.LESS : Collation.GREATER;
660             }
661             if(leftPair == EOS) { break; }
662             leftPair >>>= 16;
663             rightPair >>>= 16;
664         }
665         return Collation.EQUAL;
666     }
667 
lookup(char[] table, int c)668     private static int lookup(char[] table, int c) {
669         assert(c > LATIN_MAX);
670         if(PUNCT_START <= c && c < PUNCT_LIMIT) {
671             return table[c - PUNCT_START + LATIN_LIMIT];
672         } else if(c == 0xfffe) {
673             return MERGE_WEIGHT;
674         } else if(c == 0xffff) {
675             return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;
676         } else {
677             return BAIL_OUT;
678         }
679     }
680 
681     /**
682      * Java returns a negative result (use the '~' operator) if sIndex is to be incremented.
683      * C++ modifies sIndex.
684      */
nextPair(char[] table, int c, int ce, CharSequence s16, int sIndex)685     private static long nextPair(char[] table, int c, int ce, CharSequence s16, int sIndex) {
686         if(ce >= MIN_LONG || ce < CONTRACTION) {
687             return ce;  // simple or special mini CE
688         } else if(ce >= EXPANSION) {
689             int index = NUM_FAST_CHARS + (ce & INDEX_MASK);
690             return ((long)table[index + 1] << 16) | table[index];
691         } else /* ce >= CONTRACTION */ {
692             // Contraction list: Default mapping followed by
693             // 0 or more single-character contraction suffix mappings.
694             int index = NUM_FAST_CHARS + (ce & INDEX_MASK);
695             boolean inc = false;  // true if the next char is consumed.
696             if(sIndex != s16.length()) {
697                 // Read the next character.
698                 int c2;
699                 int nextIndex = sIndex;
700                 c2 = s16.charAt(nextIndex++);
701                 if(c2 > LATIN_MAX) {
702                     if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) {
703                         c2 = c2 - PUNCT_START + LATIN_LIMIT;  // 2000..203F -> 0180..01BF
704                     } else if(c2 == 0xfffe || c2 == 0xffff) {
705                         c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions.
706                     } else {
707                         return BAIL_OUT;
708                     }
709                 }
710                 // Look for the next character in the contraction suffix list,
711                 // which is in ascending order of single suffix characters.
712                 int i = index;
713                 int head = table[i];  // first skip the default mapping
714                 int x;
715                 do {
716                     i += head >> CONTR_LENGTH_SHIFT;
717                     head = table[i];
718                     x = head & CONTR_CHAR_MASK;
719                 } while(x < c2);
720                 if(x == c2) {
721                     index = i;
722                     inc = true;
723                 }
724             }
725             // Return the CE or CEs for the default or contraction mapping.
726             int length = table[index] >> CONTR_LENGTH_SHIFT;
727             if(length == 1) {
728                 return BAIL_OUT;
729             }
730             ce = table[index + 1];
731             long result;
732             if(length == 2) {
733                 result = ce;
734             } else {
735                 result = ((long)table[index + 2] << 16) | ce;
736             }
737             return inc ? ~result : result;
738         }
739     }
740 
getPrimaries(int variableTop, int pair)741     private static int getPrimaries(int variableTop, int pair) {
742         int ce = pair & 0xffff;
743         if(ce >= MIN_SHORT) { return pair & TWO_SHORT_PRIMARIES_MASK; }
744         if(ce > variableTop) { return pair & TWO_LONG_PRIMARIES_MASK; }
745         if(ce >= MIN_LONG) { return 0; }  // variable
746         return pair;  // special mini CE
747     }
748 
getSecondariesFromOneShortCE(int ce)749     private static int getSecondariesFromOneShortCE(int ce) {
750         ce &= SECONDARY_MASK;
751         if(ce < MIN_SEC_HIGH) {
752             return ce + SEC_OFFSET;
753         } else {
754             return ((ce + SEC_OFFSET) << 16) | COMMON_SEC_PLUS_OFFSET;
755         }
756     }
757 
getSecondaries(int variableTop, int pair)758     private static int getSecondaries(int variableTop, int pair) {
759         if(pair <= 0xffff) {
760             // one mini CE
761             if(pair >= MIN_SHORT) {
762                 pair = getSecondariesFromOneShortCE(pair);
763             } else if(pair > variableTop) {
764                 pair = COMMON_SEC_PLUS_OFFSET;
765             } else if(pair >= MIN_LONG) {
766                 pair = 0;  // variable
767             }
768             // else special mini CE
769         } else {
770             int ce = pair & 0xffff;
771             if(ce >= MIN_SHORT) {
772                 pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS;
773             } else if(ce > variableTop) {
774                 pair = TWO_COMMON_SEC_PLUS_OFFSET;
775             } else {
776                 assert(ce >= MIN_LONG);
777                 pair = 0;  // variable
778             }
779         }
780         return pair;
781     }
782 
getCases(int variableTop, boolean strengthIsPrimary, int pair)783     private static int getCases(int variableTop, boolean strengthIsPrimary, int pair) {
784         // Primary+caseLevel: Ignore case level weights of primary ignorables.
785         // Otherwise: Ignore case level weights of secondary ignorables.
786         // For details see the comments in the CollationCompare class.
787         // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
788         if(pair <= 0xffff) {
789             // one mini CE
790             if(pair >= MIN_SHORT) {
791                 // A high secondary weight means we really have two CEs,
792                 // a primary CE and a secondary CE.
793                 int ce = pair;
794                 pair &= CASE_MASK;  // explicit weight of primary CE
795                 if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
796                     pair |= LOWER_CASE << 16;  // implied weight of secondary CE
797                 }
798             } else if(pair > variableTop) {
799                 pair = LOWER_CASE;
800             } else if(pair >= MIN_LONG) {
801                 pair = 0;  // variable
802             }
803             // else special mini CE
804         } else {
805             // two mini CEs, same primary groups, neither expands like above
806             int ce = pair & 0xffff;
807             if(ce >= MIN_SHORT) {
808                 if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) {
809                     pair &= CASE_MASK;
810                 } else {
811                     pair &= TWO_CASES_MASK;
812                 }
813             } else if(ce > variableTop) {
814                 pair = TWO_LOWER_CASES;
815             } else {
816                 assert(ce >= MIN_LONG);
817                 pair = 0;  // variable
818             }
819         }
820         return pair;
821     }
822 
getTertiaries(int variableTop, boolean withCaseBits, int pair)823     private static int getTertiaries(int variableTop, boolean withCaseBits, int pair) {
824         if(pair <= 0xffff) {
825             // one mini CE
826             if(pair >= MIN_SHORT) {
827                 // A high secondary weight means we really have two CEs,
828                 // a primary CE and a secondary CE.
829                 int ce = pair;
830                 if(withCaseBits) {
831                     pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET;
832                     if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
833                         pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16;
834                     }
835                 } else {
836                     pair = (pair & TERTIARY_MASK) + TER_OFFSET;
837                     if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
838                         pair |= COMMON_TER_PLUS_OFFSET << 16;
839                     }
840                 }
841             } else if(pair > variableTop) {
842                 pair = (pair & TERTIARY_MASK) + TER_OFFSET;
843                 if(withCaseBits) {
844                     pair |= LOWER_CASE;
845                 }
846             } else if(pair >= MIN_LONG) {
847                 pair = 0;  // variable
848             }
849             // else special mini CE
850         } else {
851             // two mini CEs, same primary groups, neither expands like above
852             int ce = pair & 0xffff;
853             if(ce >= MIN_SHORT) {
854                 if(withCaseBits) {
855                     pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK;
856                 } else {
857                     pair &= TWO_TERTIARIES_MASK;
858                 }
859                 pair += TWO_TER_OFFSETS;
860             } else if(ce > variableTop) {
861                 pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS;
862                 if(withCaseBits) {
863                     pair |= TWO_LOWER_CASES;
864                 }
865             } else {
866                 assert(ce >= MIN_LONG);
867                 pair = 0;  // variable
868             }
869         }
870         return pair;
871     }
872 
getQuaternaries(int variableTop, int pair)873     private static int getQuaternaries(int variableTop, int pair) {
874         // Return the primary weight of a variable CE,
875         // or the maximum primary weight for a non-variable, not-completely-ignorable CE.
876         if(pair <= 0xffff) {
877             // one mini CE
878             if(pair >= MIN_SHORT) {
879                 // A high secondary weight means we really have two CEs,
880                 // a primary CE and a secondary CE.
881                 if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) {
882                     pair = TWO_SHORT_PRIMARIES_MASK;
883                 } else {
884                     pair = SHORT_PRIMARY_MASK;
885                 }
886             } else if(pair > variableTop) {
887                 pair = SHORT_PRIMARY_MASK;
888             } else if(pair >= MIN_LONG) {
889                 pair &= LONG_PRIMARY_MASK;  // variable
890             }
891             // else special mini CE
892         } else {
893             // two mini CEs, same primary groups, neither expands like above
894             int ce = pair & 0xffff;
895             if(ce > variableTop) {
896                 pair = TWO_SHORT_PRIMARIES_MASK;
897             } else {
898                 assert(ce >= MIN_LONG);
899                 pair &= TWO_LONG_PRIMARIES_MASK;  // variable
900             }
901         }
902         return pair;
903     }
904 
CollationFastLatin()905     private CollationFastLatin() {}  // no constructor
906 }
907