1 /*
2 *******************************************************************************
3 * Copyright (C) 2013-2015, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 * collationfastlatin.cpp
7 *
8 * created on: 2013aug18
9 * created by: Markus W. Scherer
10 */
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_COLLATION
15 
16 #include "unicode/ucol.h"
17 #include "collationdata.h"
18 #include "collationfastlatin.h"
19 #include "collationsettings.h"
20 #include "putilimp.h"  // U_ALIGN_CODE
21 #include "uassert.h"
22 
23 U_NAMESPACE_BEGIN
24 
25 int32_t
getOptions(const CollationData * data,const CollationSettings & settings,uint16_t * primaries,int32_t capacity)26 CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings,
27                                uint16_t *primaries, int32_t capacity) {
28     const uint16_t *table = data->fastLatinTable;
29     if(table == NULL) { return -1; }
30     U_ASSERT(capacity == LATIN_LIMIT);
31     if(capacity != LATIN_LIMIT) { return -1; }
32 
33     uint32_t miniVarTop;
34     if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) {
35         // No mini primaries are variable, set a variableTop just below the
36         // lowest long mini primary.
37         miniVarTop = MIN_LONG - 1;
38     } else {
39         int32_t headerLength = *table & 0xff;
40         int32_t i = 1 + settings.getMaxVariable();
41         if(i >= headerLength) {
42             return -1;  // variableTop >= digits, should not occur
43         }
44         miniVarTop = table[i];
45     }
46 
47     UBool digitsAreReordered = FALSE;
48     if(settings.hasReordering()) {
49         uint32_t prevStart = 0;
50         uint32_t beforeDigitStart = 0;
51         uint32_t digitStart = 0;
52         uint32_t afterDigitStart = 0;
53         for(int32_t group = UCOL_REORDER_CODE_FIRST;
54                 group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL_REORDER_CODES;
55                 ++group) {
56             uint32_t start = data->getFirstPrimaryForGroup(group);
57             start = settings.reorder(start);
58             if(group == UCOL_REORDER_CODE_DIGIT) {
59                 beforeDigitStart = prevStart;
60                 digitStart = start;
61             } else if(start != 0) {
62                 if(start < prevStart) {
63                     // The permutation affects the groups up to Latin.
64                     return -1;
65                 }
66                 // In the future, there might be a special group between digits & Latin.
67                 if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) {
68                     afterDigitStart = start;
69                 }
70                 prevStart = start;
71             }
72         }
73         uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN);
74         latinStart = settings.reorder(latinStart);
75         if(latinStart < prevStart) {
76             return -1;
77         }
78         if(afterDigitStart == 0) {
79             afterDigitStart = latinStart;
80         }
81         if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) {
82             digitsAreReordered = TRUE;
83         }
84     }
85 
86     table += (table[0] & 0xff);  // skip the header
87     for(UChar32 c = 0; c < LATIN_LIMIT; ++c) {
88         uint32_t p = table[c];
89         if(p >= MIN_SHORT) {
90             p &= SHORT_PRIMARY_MASK;
91         } else if(p > miniVarTop) {
92             p &= LONG_PRIMARY_MASK;
93         } else {
94             p = 0;
95         }
96         primaries[c] = (uint16_t)p;
97     }
98     if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) != 0) {
99         // Bail out for digits.
100         for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; }
101     }
102 
103     // Shift the miniVarTop above other options.
104     return ((int32_t)miniVarTop << 16) | settings.options;
105 }
106 
107 int32_t
compareUTF16(const uint16_t * table,const uint16_t * primaries,int32_t options,const UChar * left,int32_t leftLength,const UChar * right,int32_t rightLength)108 CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options,
109                                  const UChar *left, int32_t leftLength,
110                                  const UChar *right, int32_t rightLength) {
111     // This is a modified copy of CollationCompare::compareUpToQuaternary(),
112     // optimized for common Latin text.
113     // Keep them in sync!
114     // Keep compareUTF16() and compareUTF8() in sync very closely!
115 
116     U_ASSERT((table[0] >> 8) == VERSION);
117     table += (table[0] & 0xff);  // skip the header
118     uint32_t variableTop = (uint32_t)options >> 16;  // see getOptions()
119     options &= 0xffff;  // needed for CollationSettings::getStrength() to work
120 
121     // Check for supported characters, fetch mini CEs, and compare primaries.
122     U_ALIGN_CODE(16);
123     int32_t leftIndex = 0, rightIndex = 0;
124     /**
125      * Single mini CE or a pair.
126      * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
127      * If there is only one, then it is in the lower bits, and the upper bits are 0.
128      */
129     uint32_t leftPair = 0, rightPair = 0;
130     for(;;) {
131         // We fetch CEs until we get a non-ignorable primary or reach the end.
132         while(leftPair == 0) {
133             if(leftIndex == leftLength) {
134                 leftPair = EOS;
135                 break;
136             }
137             UChar32 c = left[leftIndex++];
138             if(c <= LATIN_MAX) {
139                 leftPair = primaries[c];
140                 if(leftPair != 0) { break; }
141                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
142                     return BAIL_OUT_RESULT;
143                 }
144                 leftPair = table[c];
145             } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
146                 leftPair = table[c - PUNCT_START + LATIN_LIMIT];
147             } else {
148                 leftPair = lookup(table, c);
149             }
150             if(leftPair >= MIN_SHORT) {
151                 leftPair &= SHORT_PRIMARY_MASK;
152                 break;
153             } else if(leftPair > variableTop) {
154                 leftPair &= LONG_PRIMARY_MASK;
155                 break;
156             } else {
157                 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
158                 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
159                 leftPair = getPrimaries(variableTop, leftPair);
160             }
161         }
162 
163         while(rightPair == 0) {
164             if(rightIndex == rightLength) {
165                 rightPair = EOS;
166                 break;
167             }
168             UChar32 c = right[rightIndex++];
169             if(c <= LATIN_MAX) {
170                 rightPair = primaries[c];
171                 if(rightPair != 0) { break; }
172                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
173                     return BAIL_OUT_RESULT;
174                 }
175                 rightPair = table[c];
176             } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
177                 rightPair = table[c - PUNCT_START + LATIN_LIMIT];
178             } else {
179                 rightPair = lookup(table, c);
180             }
181             if(rightPair >= MIN_SHORT) {
182                 rightPair &= SHORT_PRIMARY_MASK;
183                 break;
184             } else if(rightPair > variableTop) {
185                 rightPair &= LONG_PRIMARY_MASK;
186                 break;
187             } else {
188                 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
189                 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
190                 rightPair = getPrimaries(variableTop, rightPair);
191             }
192         }
193 
194         if(leftPair == rightPair) {
195             if(leftPair == EOS) { break; }
196             leftPair = rightPair = 0;
197             continue;
198         }
199         uint32_t leftPrimary = leftPair & 0xffff;
200         uint32_t rightPrimary = rightPair & 0xffff;
201         if(leftPrimary != rightPrimary) {
202             // Return the primary difference.
203             return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
204         }
205         if(leftPair == EOS) { break; }
206         leftPair >>= 16;
207         rightPair >>= 16;
208     }
209     // In the following, we need to re-fetch each character because we did not buffer the CEs,
210     // but we know that the string is well-formed and
211     // only contains supported characters and mappings.
212 
213     // We might skip the secondary level but continue with the case level
214     // which is turned on separately.
215     if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
216         leftIndex = rightIndex = 0;
217         leftPair = rightPair = 0;
218         for(;;) {
219             while(leftPair == 0) {
220                 if(leftIndex == leftLength) {
221                     leftPair = EOS;
222                     break;
223                 }
224                 UChar32 c = left[leftIndex++];
225                 if(c <= LATIN_MAX) {
226                     leftPair = table[c];
227                 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
228                     leftPair = table[c - PUNCT_START + LATIN_LIMIT];
229                 } else {
230                     leftPair = lookup(table, c);
231                 }
232                 if(leftPair >= MIN_SHORT) {
233                     leftPair = getSecondariesFromOneShortCE(leftPair);
234                     break;
235                 } else if(leftPair > variableTop) {
236                     leftPair = COMMON_SEC_PLUS_OFFSET;
237                     break;
238                 } else {
239                     leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
240                     leftPair = getSecondaries(variableTop, leftPair);
241                 }
242             }
243 
244             while(rightPair == 0) {
245                 if(rightIndex == rightLength) {
246                     rightPair = EOS;
247                     break;
248                 }
249                 UChar32 c = right[rightIndex++];
250                 if(c <= LATIN_MAX) {
251                     rightPair = table[c];
252                 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
253                     rightPair = table[c - PUNCT_START + LATIN_LIMIT];
254                 } else {
255                     rightPair = lookup(table, c);
256                 }
257                 if(rightPair >= MIN_SHORT) {
258                     rightPair = getSecondariesFromOneShortCE(rightPair);
259                     break;
260                 } else if(rightPair > variableTop) {
261                     rightPair = COMMON_SEC_PLUS_OFFSET;
262                     break;
263                 } else {
264                     rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
265                     rightPair = getSecondaries(variableTop, rightPair);
266                 }
267             }
268 
269             if(leftPair == rightPair) {
270                 if(leftPair == EOS) { break; }
271                 leftPair = rightPair = 0;
272                 continue;
273             }
274             uint32_t leftSecondary = leftPair & 0xffff;
275             uint32_t rightSecondary = rightPair & 0xffff;
276             if(leftSecondary != rightSecondary) {
277                 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
278                     // Full support for backwards secondary requires backwards contraction matching
279                     // and moving backwards between merge separators.
280                     return BAIL_OUT_RESULT;
281                 }
282                 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
283             }
284             if(leftPair == EOS) { break; }
285             leftPair >>= 16;
286             rightPair >>= 16;
287         }
288     }
289 
290     if((options & CollationSettings::CASE_LEVEL) != 0) {
291         UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
292         leftIndex = rightIndex = 0;
293         leftPair = rightPair = 0;
294         for(;;) {
295             while(leftPair == 0) {
296                 if(leftIndex == leftLength) {
297                     leftPair = EOS;
298                     break;
299                 }
300                 UChar32 c = left[leftIndex++];
301                 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
302                 if(leftPair < MIN_LONG) {
303                     leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
304                 }
305                 leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
306             }
307 
308             while(rightPair == 0) {
309                 if(rightIndex == rightLength) {
310                     rightPair = EOS;
311                     break;
312                 }
313                 UChar32 c = right[rightIndex++];
314                 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
315                 if(rightPair < MIN_LONG) {
316                     rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
317                 }
318                 rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
319             }
320 
321             if(leftPair == rightPair) {
322                 if(leftPair == EOS) { break; }
323                 leftPair = rightPair = 0;
324                 continue;
325             }
326             uint32_t leftCase = leftPair & 0xffff;
327             uint32_t rightCase = rightPair & 0xffff;
328             if(leftCase != rightCase) {
329                 if((options & CollationSettings::UPPER_FIRST) == 0) {
330                     return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
331                 } else {
332                     return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
333                 }
334             }
335             if(leftPair == EOS) { break; }
336             leftPair >>= 16;
337             rightPair >>= 16;
338         }
339     }
340     if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
341 
342     // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
343     UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
344 
345     leftIndex = rightIndex = 0;
346     leftPair = rightPair = 0;
347     for(;;) {
348         while(leftPair == 0) {
349             if(leftIndex == leftLength) {
350                 leftPair = EOS;
351                 break;
352             }
353             UChar32 c = left[leftIndex++];
354             leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
355             if(leftPair < MIN_LONG) {
356                 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
357             }
358             leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
359         }
360 
361         while(rightPair == 0) {
362             if(rightIndex == rightLength) {
363                 rightPair = EOS;
364                 break;
365             }
366             UChar32 c = right[rightIndex++];
367             rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
368             if(rightPair < MIN_LONG) {
369                 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
370             }
371             rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
372         }
373 
374         if(leftPair == rightPair) {
375             if(leftPair == EOS) { break; }
376             leftPair = rightPair = 0;
377             continue;
378         }
379         uint32_t leftTertiary = leftPair & 0xffff;
380         uint32_t rightTertiary = rightPair & 0xffff;
381         if(leftTertiary != rightTertiary) {
382             if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
383                 // Pass through EOS and MERGE_WEIGHT
384                 // and keep real tertiary weights larger than the MERGE_WEIGHT.
385                 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
386                 if(leftTertiary > MERGE_WEIGHT) {
387                     leftTertiary ^= CASE_MASK;
388                 }
389                 if(rightTertiary > MERGE_WEIGHT) {
390                     rightTertiary ^= CASE_MASK;
391                 }
392             }
393             return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
394         }
395         if(leftPair == EOS) { break; }
396         leftPair >>= 16;
397         rightPair >>= 16;
398     }
399     if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
400 
401     leftIndex = rightIndex = 0;
402     leftPair = rightPair = 0;
403     for(;;) {
404         while(leftPair == 0) {
405             if(leftIndex == leftLength) {
406                 leftPair = EOS;
407                 break;
408             }
409             UChar32 c = left[leftIndex++];
410             leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
411             if(leftPair < MIN_LONG) {
412                 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
413             }
414             leftPair = getQuaternaries(variableTop, leftPair);
415         }
416 
417         while(rightPair == 0) {
418             if(rightIndex == rightLength) {
419                 rightPair = EOS;
420                 break;
421             }
422             UChar32 c = right[rightIndex++];
423             rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
424             if(rightPair < MIN_LONG) {
425                 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
426             }
427             rightPair = getQuaternaries(variableTop, rightPair);
428         }
429 
430         if(leftPair == rightPair) {
431             if(leftPair == EOS) { break; }
432             leftPair = rightPair = 0;
433             continue;
434         }
435         uint32_t leftQuaternary = leftPair & 0xffff;
436         uint32_t rightQuaternary = rightPair & 0xffff;
437         if(leftQuaternary != rightQuaternary) {
438             return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
439         }
440         if(leftPair == EOS) { break; }
441         leftPair >>= 16;
442         rightPair >>= 16;
443     }
444     return UCOL_EQUAL;
445 }
446 
447 int32_t
compareUTF8(const uint16_t * table,const uint16_t * primaries,int32_t options,const uint8_t * left,int32_t leftLength,const uint8_t * right,int32_t rightLength)448 CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options,
449                                  const uint8_t *left, int32_t leftLength,
450                                  const uint8_t *right, int32_t rightLength) {
451     // Keep compareUTF16() and compareUTF8() in sync very closely!
452 
453     U_ASSERT((table[0] >> 8) == VERSION);
454     table += (table[0] & 0xff);  // skip the header
455     uint32_t variableTop = (uint32_t)options >> 16;  // see RuleBasedCollator::getFastLatinOptions()
456     options &= 0xffff;  // needed for CollationSettings::getStrength() to work
457 
458     // Check for supported characters, fetch mini CEs, and compare primaries.
459     U_ALIGN_CODE(16);
460     int32_t leftIndex = 0, rightIndex = 0;
461     /**
462      * Single mini CE or a pair.
463      * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
464      * If there is only one, then it is in the lower bits, and the upper bits are 0.
465      */
466     uint32_t leftPair = 0, rightPair = 0;
467     // Note: There is no need to assemble the code point.
468     // We only need to look up the table entry for the character,
469     // and nextPair() looks for whether c==0.
470     for(;;) {
471         // We fetch CEs until we get a non-ignorable primary or reach the end.
472         while(leftPair == 0) {
473             if(leftIndex == leftLength) {
474                 leftPair = EOS;
475                 break;
476             }
477             UChar32 c = left[leftIndex++];
478             uint8_t t;
479             if(c <= 0x7f) {
480                 leftPair = primaries[c];
481                 if(leftPair != 0) { break; }
482                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
483                     return BAIL_OUT_RESULT;
484                 }
485                 leftPair = table[c];
486             } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength &&
487                     0x80 <= (t = left[leftIndex]) && t <= 0xbf) {
488                 ++leftIndex;
489                 c = ((c - 0xc2) << 6) + t;
490                 leftPair = primaries[c];
491                 if(leftPair != 0) { break; }
492                 leftPair = table[c];
493             } else {
494                 leftPair = lookupUTF8(table, c, left, leftIndex, leftLength);
495             }
496             if(leftPair >= MIN_SHORT) {
497                 leftPair &= SHORT_PRIMARY_MASK;
498                 break;
499             } else if(leftPair > variableTop) {
500                 leftPair &= LONG_PRIMARY_MASK;
501                 break;
502             } else {
503                 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
504                 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
505                 leftPair = getPrimaries(variableTop, leftPair);
506             }
507         }
508 
509         while(rightPair == 0) {
510             if(rightIndex == rightLength) {
511                 rightPair = EOS;
512                 break;
513             }
514             UChar32 c = right[rightIndex++];
515             uint8_t t;
516             if(c <= 0x7f) {
517                 rightPair = primaries[c];
518                 if(rightPair != 0) { break; }
519                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
520                     return BAIL_OUT_RESULT;
521                 }
522                 rightPair = table[c];
523             } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength &&
524                     0x80 <= (t = right[rightIndex]) && t <= 0xbf) {
525                 ++rightIndex;
526                 c = ((c - 0xc2) << 6) + t;
527                 rightPair = primaries[c];
528                 if(rightPair != 0) { break; }
529                 rightPair = table[c];
530             } else {
531                 rightPair = lookupUTF8(table, c, right, rightIndex, rightLength);
532             }
533             if(rightPair >= MIN_SHORT) {
534                 rightPair &= SHORT_PRIMARY_MASK;
535                 break;
536             } else if(rightPair > variableTop) {
537                 rightPair &= LONG_PRIMARY_MASK;
538                 break;
539             } else {
540                 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
541                 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
542                 rightPair = getPrimaries(variableTop, rightPair);
543             }
544         }
545 
546         if(leftPair == rightPair) {
547             if(leftPair == EOS) { break; }
548             leftPair = rightPair = 0;
549             continue;
550         }
551         uint32_t leftPrimary = leftPair & 0xffff;
552         uint32_t rightPrimary = rightPair & 0xffff;
553         if(leftPrimary != rightPrimary) {
554             // Return the primary difference.
555             return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
556         }
557         if(leftPair == EOS) { break; }
558         leftPair >>= 16;
559         rightPair >>= 16;
560     }
561     // In the following, we need to re-fetch each character because we did not buffer the CEs,
562     // but we know that the string is well-formed and
563     // only contains supported characters and mappings.
564 
565     // We might skip the secondary level but continue with the case level
566     // which is turned on separately.
567     if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
568         leftIndex = rightIndex = 0;
569         leftPair = rightPair = 0;
570         for(;;) {
571             while(leftPair == 0) {
572                 if(leftIndex == leftLength) {
573                     leftPair = EOS;
574                     break;
575                 }
576                 UChar32 c = left[leftIndex++];
577                 if(c <= 0x7f) {
578                     leftPair = table[c];
579                 } else if(c <= LATIN_MAX_UTF8_LEAD) {
580                     leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]];
581                 } else {
582                     leftPair = lookupUTF8Unsafe(table, c, left, leftIndex);
583                 }
584                 if(leftPair >= MIN_SHORT) {
585                     leftPair = getSecondariesFromOneShortCE(leftPair);
586                     break;
587                 } else if(leftPair > variableTop) {
588                     leftPair = COMMON_SEC_PLUS_OFFSET;
589                     break;
590                 } else {
591                     leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
592                     leftPair = getSecondaries(variableTop, leftPair);
593                 }
594             }
595 
596             while(rightPair == 0) {
597                 if(rightIndex == rightLength) {
598                     rightPair = EOS;
599                     break;
600                 }
601                 UChar32 c = right[rightIndex++];
602                 if(c <= 0x7f) {
603                     rightPair = table[c];
604                 } else if(c <= LATIN_MAX_UTF8_LEAD) {
605                     rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]];
606                 } else {
607                     rightPair = lookupUTF8Unsafe(table, c, right, rightIndex);
608                 }
609                 if(rightPair >= MIN_SHORT) {
610                     rightPair = getSecondariesFromOneShortCE(rightPair);
611                     break;
612                 } else if(rightPair > variableTop) {
613                     rightPair = COMMON_SEC_PLUS_OFFSET;
614                     break;
615                 } else {
616                     rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
617                     rightPair = getSecondaries(variableTop, rightPair);
618                 }
619             }
620 
621             if(leftPair == rightPair) {
622                 if(leftPair == EOS) { break; }
623                 leftPair = rightPair = 0;
624                 continue;
625             }
626             uint32_t leftSecondary = leftPair & 0xffff;
627             uint32_t rightSecondary = rightPair & 0xffff;
628             if(leftSecondary != rightSecondary) {
629                 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
630                     // Full support for backwards secondary requires backwards contraction matching
631                     // and moving backwards between merge separators.
632                     return BAIL_OUT_RESULT;
633                 }
634                 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
635             }
636             if(leftPair == EOS) { break; }
637             leftPair >>= 16;
638             rightPair >>= 16;
639         }
640     }
641 
642     if((options & CollationSettings::CASE_LEVEL) != 0) {
643         UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
644         leftIndex = rightIndex = 0;
645         leftPair = rightPair = 0;
646         for(;;) {
647             while(leftPair == 0) {
648                 if(leftIndex == leftLength) {
649                     leftPair = EOS;
650                     break;
651                 }
652                 UChar32 c = left[leftIndex++];
653                 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
654                 if(leftPair < MIN_LONG) {
655                     leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
656                 }
657                 leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
658             }
659 
660             while(rightPair == 0) {
661                 if(rightIndex == rightLength) {
662                     rightPair = EOS;
663                     break;
664                 }
665                 UChar32 c = right[rightIndex++];
666                 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
667                 if(rightPair < MIN_LONG) {
668                     rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
669                 }
670                 rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
671             }
672 
673             if(leftPair == rightPair) {
674                 if(leftPair == EOS) { break; }
675                 leftPair = rightPair = 0;
676                 continue;
677             }
678             uint32_t leftCase = leftPair & 0xffff;
679             uint32_t rightCase = rightPair & 0xffff;
680             if(leftCase != rightCase) {
681                 if((options & CollationSettings::UPPER_FIRST) == 0) {
682                     return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
683                 } else {
684                     return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
685                 }
686             }
687             if(leftPair == EOS) { break; }
688             leftPair >>= 16;
689             rightPair >>= 16;
690         }
691     }
692     if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
693 
694     // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
695     UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
696 
697     leftIndex = rightIndex = 0;
698     leftPair = rightPair = 0;
699     for(;;) {
700         while(leftPair == 0) {
701             if(leftIndex == leftLength) {
702                 leftPair = EOS;
703                 break;
704             }
705             UChar32 c = left[leftIndex++];
706             leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
707             if(leftPair < MIN_LONG) {
708                 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
709             }
710             leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
711         }
712 
713         while(rightPair == 0) {
714             if(rightIndex == rightLength) {
715                 rightPair = EOS;
716                 break;
717             }
718             UChar32 c = right[rightIndex++];
719             rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
720             if(rightPair < MIN_LONG) {
721                 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
722             }
723             rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
724         }
725 
726         if(leftPair == rightPair) {
727             if(leftPair == EOS) { break; }
728             leftPair = rightPair = 0;
729             continue;
730         }
731         uint32_t leftTertiary = leftPair & 0xffff;
732         uint32_t rightTertiary = rightPair & 0xffff;
733         if(leftTertiary != rightTertiary) {
734             if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
735                 // Pass through EOS and MERGE_WEIGHT
736                 // and keep real tertiary weights larger than the MERGE_WEIGHT.
737                 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
738                 if(leftTertiary > MERGE_WEIGHT) {
739                     leftTertiary ^= CASE_MASK;
740                 }
741                 if(rightTertiary > MERGE_WEIGHT) {
742                     rightTertiary ^= CASE_MASK;
743                 }
744             }
745             return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
746         }
747         if(leftPair == EOS) { break; }
748         leftPair >>= 16;
749         rightPair >>= 16;
750     }
751     if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
752 
753     leftIndex = rightIndex = 0;
754     leftPair = rightPair = 0;
755     for(;;) {
756         while(leftPair == 0) {
757             if(leftIndex == leftLength) {
758                 leftPair = EOS;
759                 break;
760             }
761             UChar32 c = left[leftIndex++];
762             leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
763             if(leftPair < MIN_LONG) {
764                 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
765             }
766             leftPair = getQuaternaries(variableTop, leftPair);
767         }
768 
769         while(rightPair == 0) {
770             if(rightIndex == rightLength) {
771                 rightPair = EOS;
772                 break;
773             }
774             UChar32 c = right[rightIndex++];
775             rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
776             if(rightPair < MIN_LONG) {
777                 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
778             }
779             rightPair = getQuaternaries(variableTop, rightPair);
780         }
781 
782         if(leftPair == rightPair) {
783             if(leftPair == EOS) { break; }
784             leftPair = rightPair = 0;
785             continue;
786         }
787         uint32_t leftQuaternary = leftPair & 0xffff;
788         uint32_t rightQuaternary = rightPair & 0xffff;
789         if(leftQuaternary != rightQuaternary) {
790             return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
791         }
792         if(leftPair == EOS) { break; }
793         leftPair >>= 16;
794         rightPair >>= 16;
795     }
796     return UCOL_EQUAL;
797 }
798 
799 uint32_t
lookup(const uint16_t * table,UChar32 c)800 CollationFastLatin::lookup(const uint16_t *table, UChar32 c) {
801     U_ASSERT(c > LATIN_MAX);
802     if(PUNCT_START <= c && c < PUNCT_LIMIT) {
803         return table[c - PUNCT_START + LATIN_LIMIT];
804     } else if(c == 0xfffe) {
805         return MERGE_WEIGHT;
806     } else if(c == 0xffff) {
807         return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;
808     } else {
809         return BAIL_OUT;
810     }
811 }
812 
813 uint32_t
lookupUTF8(const uint16_t * table,UChar32 c,const uint8_t * s8,int32_t & sIndex,int32_t sLength)814 CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c,
815                                const uint8_t *s8, int32_t &sIndex, int32_t sLength) {
816     // The caller handled ASCII and valid/supported Latin.
817     U_ASSERT(c > 0x7f);
818     int32_t i2 = sIndex + 1;
819     if(i2 < sLength || sLength < 0) {
820         uint8_t t1 = s8[sIndex];
821         uint8_t t2 = s8[i2];
822         sIndex += 2;
823         if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) {
824             return table[(LATIN_LIMIT - 0x80) + t2];  // 2000..203F -> 0180..01BF
825         } else if(c == 0xef && t1 == 0xbf) {
826             if(t2 == 0xbe) {
827                 return MERGE_WEIGHT;  // U+FFFE
828             } else if(t2 == 0xbf) {
829                 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;  // U+FFFF
830             }
831         }
832     }
833     return BAIL_OUT;
834 }
835 
836 uint32_t
lookupUTF8Unsafe(const uint16_t * table,UChar32 c,const uint8_t * s8,int32_t & sIndex)837 CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c,
838                                      const uint8_t *s8, int32_t &sIndex) {
839     // The caller handled ASCII.
840     // The string is well-formed and contains only supported characters.
841     U_ASSERT(c > 0x7f);
842     if(c <= LATIN_MAX_UTF8_LEAD) {
843         return table[((c - 0xc2) << 6) + s8[sIndex++]];  // 0080..017F
844     }
845     uint8_t t2 = s8[sIndex + 1];
846     sIndex += 2;
847     if(c == 0xe2) {
848         return table[(LATIN_LIMIT - 0x80) + t2];  // 2000..203F -> 0180..01BF
849     } else if(t2 == 0xbe) {
850         return MERGE_WEIGHT;  // U+FFFE
851     } else {
852         return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;  // U+FFFF
853     }
854 }
855 
856 uint32_t
nextPair(const uint16_t * table,UChar32 c,uint32_t ce,const UChar * s16,const uint8_t * s8,int32_t & sIndex,int32_t & sLength)857 CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce,
858                              const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) {
859     if(ce >= MIN_LONG || ce < CONTRACTION) {
860         return ce;  // simple or special mini CE
861     } else if(ce >= EXPANSION) {
862         int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
863         return ((uint32_t)table[index + 1] << 16) | table[index];
864     } else /* ce >= CONTRACTION */ {
865         if(c == 0 && sLength < 0) {
866             sLength = sIndex - 1;
867             return EOS;
868         }
869         // Contraction list: Default mapping followed by
870         // 0 or more single-character contraction suffix mappings.
871         int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
872         if(sIndex != sLength) {
873             // Read the next character.
874             int32_t c2;
875             int32_t nextIndex = sIndex;
876             if(s16 != NULL) {
877                 c2 = s16[nextIndex++];
878                 if(c2 > LATIN_MAX) {
879                     if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) {
880                         c2 = c2 - PUNCT_START + LATIN_LIMIT;  // 2000..203F -> 0180..01BF
881                     } else if(c2 == 0xfffe || c2 == 0xffff) {
882                         c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions.
883                     } else {
884                         return BAIL_OUT;
885                     }
886                 }
887             } else {
888                 c2 = s8[nextIndex++];
889                 if(c2 > 0x7f) {
890                     uint8_t t;
891                     if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength &&
892                             0x80 <= (t = s8[nextIndex]) && t <= 0xbf) {
893                         c2 = ((c2 - 0xc2) << 6) + t;  // 0080..017F
894                         ++nextIndex;
895                     } else {
896                         int32_t i2 = nextIndex + 1;
897                         if(i2 < sLength || sLength < 0) {
898                             if(c2 == 0xe2 && s8[nextIndex] == 0x80 &&
899                                     0x80 <= (t = s8[i2]) && t <= 0xbf) {
900                                 c2 = (LATIN_LIMIT - 0x80) + t;  // 2000..203F -> 0180..01BF
901                             } else if(c2 == 0xef && s8[nextIndex] == 0xbf &&
902                                     ((t = s8[i2]) == 0xbe || t == 0xbf)) {
903                                 c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions.
904                             } else {
905                                 return BAIL_OUT;
906                             }
907                         } else {
908                             return BAIL_OUT;
909                         }
910                         nextIndex += 2;
911                     }
912                 }
913             }
914             if(c2 == 0 && sLength < 0) {
915                 sLength = sIndex;
916                 c2 = -1;
917             }
918             // Look for the next character in the contraction suffix list,
919             // which is in ascending order of single suffix characters.
920             int32_t i = index;
921             int32_t head = table[i];  // first skip the default mapping
922             int32_t x;
923             do {
924                 i += head >> CONTR_LENGTH_SHIFT;
925                 head = table[i];
926                 x = head & CONTR_CHAR_MASK;
927             } while(x < c2);
928             if(x == c2) {
929                 index = i;
930                 sIndex = nextIndex;
931             }
932         }
933         // Return the CE or CEs for the default or contraction mapping.
934         int32_t length = table[index] >> CONTR_LENGTH_SHIFT;
935         if(length == 1) {
936             return BAIL_OUT;
937         }
938         ce = table[index + 1];
939         if(length == 2) {
940             return ce;
941         } else {
942             return ((uint32_t)table[index + 2] << 16) | ce;
943         }
944     }
945 }
946 
947 uint32_t
getSecondaries(uint32_t variableTop,uint32_t pair)948 CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) {
949     if(pair <= 0xffff) {
950         // one mini CE
951         if(pair >= MIN_SHORT) {
952             pair = getSecondariesFromOneShortCE(pair);
953         } else if(pair > variableTop) {
954             pair = COMMON_SEC_PLUS_OFFSET;
955         } else if(pair >= MIN_LONG) {
956             pair = 0;  // variable
957         }
958         // else special mini CE
959     } else {
960         uint32_t ce = pair & 0xffff;
961         if(ce >= MIN_SHORT) {
962             pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS;
963         } else if(ce > variableTop) {
964             pair = TWO_COMMON_SEC_PLUS_OFFSET;
965         } else {
966             U_ASSERT(ce >= MIN_LONG);
967             pair = 0;  // variable
968         }
969     }
970     return pair;
971 }
972 
973 uint32_t
getCases(uint32_t variableTop,UBool strengthIsPrimary,uint32_t pair)974 CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) {
975     // Primary+caseLevel: Ignore case level weights of primary ignorables.
976     // Otherwise: Ignore case level weights of secondary ignorables.
977     // For details see the comments in the CollationCompare class.
978     // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
979     if(pair <= 0xffff) {
980         // one mini CE
981         if(pair >= MIN_SHORT) {
982             // A high secondary weight means we really have two CEs,
983             // a primary CE and a secondary CE.
984             uint32_t ce = pair;
985             pair &= CASE_MASK;  // explicit weight of primary CE
986             if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
987                 pair |= LOWER_CASE << 16;  // implied weight of secondary CE
988             }
989         } else if(pair > variableTop) {
990             pair = LOWER_CASE;
991         } else if(pair >= MIN_LONG) {
992             pair = 0;  // variable
993         }
994         // else special mini CE
995     } else {
996         // two mini CEs, same primary groups, neither expands like above
997         uint32_t ce = pair & 0xffff;
998         if(ce >= MIN_SHORT) {
999             if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) {
1000                 pair &= CASE_MASK;
1001             } else {
1002                 pair &= TWO_CASES_MASK;
1003             }
1004         } else if(ce > variableTop) {
1005             pair = TWO_LOWER_CASES;
1006         } else {
1007             U_ASSERT(ce >= MIN_LONG);
1008             pair = 0;  // variable
1009         }
1010     }
1011     return pair;
1012 }
1013 
1014 uint32_t
getTertiaries(uint32_t variableTop,UBool withCaseBits,uint32_t pair)1015 CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) {
1016     if(pair <= 0xffff) {
1017         // one mini CE
1018         if(pair >= MIN_SHORT) {
1019             // A high secondary weight means we really have two CEs,
1020             // a primary CE and a secondary CE.
1021             uint32_t ce = pair;
1022             if(withCaseBits) {
1023                 pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET;
1024                 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1025                     pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16;
1026                 }
1027             } else {
1028                 pair = (pair & TERTIARY_MASK) + TER_OFFSET;
1029                 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1030                     pair |= COMMON_TER_PLUS_OFFSET << 16;
1031                 }
1032             }
1033         } else if(pair > variableTop) {
1034             pair = (pair & TERTIARY_MASK) + TER_OFFSET;
1035             if(withCaseBits) {
1036                 pair |= LOWER_CASE;
1037             }
1038         } else if(pair >= MIN_LONG) {
1039             pair = 0;  // variable
1040         }
1041         // else special mini CE
1042     } else {
1043         // two mini CEs, same primary groups, neither expands like above
1044         uint32_t ce = pair & 0xffff;
1045         if(ce >= MIN_SHORT) {
1046             if(withCaseBits) {
1047                 pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK;
1048             } else {
1049                 pair &= TWO_TERTIARIES_MASK;
1050             }
1051             pair += TWO_TER_OFFSETS;
1052         } else if(ce > variableTop) {
1053             pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS;
1054             if(withCaseBits) {
1055                 pair |= TWO_LOWER_CASES;
1056             }
1057         } else {
1058             U_ASSERT(ce >= MIN_LONG);
1059             pair = 0;  // variable
1060         }
1061     }
1062     return pair;
1063 }
1064 
1065 uint32_t
getQuaternaries(uint32_t variableTop,uint32_t pair)1066 CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) {
1067     // Return the primary weight of a variable CE,
1068     // or the maximum primary weight for a non-variable, not-completely-ignorable CE.
1069     if(pair <= 0xffff) {
1070         // one mini CE
1071         if(pair >= MIN_SHORT) {
1072             // A high secondary weight means we really have two CEs,
1073             // a primary CE and a secondary CE.
1074             if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1075                 pair = TWO_SHORT_PRIMARIES_MASK;
1076             } else {
1077                 pair = SHORT_PRIMARY_MASK;
1078             }
1079         } else if(pair > variableTop) {
1080             pair = SHORT_PRIMARY_MASK;
1081         } else if(pair >= MIN_LONG) {
1082             pair &= LONG_PRIMARY_MASK;  // variable
1083         }
1084         // else special mini CE
1085     } else {
1086         // two mini CEs, same primary groups, neither expands like above
1087         uint32_t ce = pair & 0xffff;
1088         if(ce > variableTop) {
1089             pair = TWO_SHORT_PRIMARIES_MASK;
1090         } else {
1091             U_ASSERT(ce >= MIN_LONG);
1092             pair &= TWO_LONG_PRIMARIES_MASK;  // variable
1093         }
1094     }
1095     return pair;
1096 }
1097 
1098 U_NAMESPACE_END
1099 
1100 #endif  // !UCONFIG_NO_COLLATION
1101