• Home
  • History
  • Annotate
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 * Copyright (C) 2013-2015, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 * collationfastlatin.cpp
7 *
8 * created on: 2013aug18
9 * created by: Markus W. Scherer
10 */
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_COLLATION
15 
16 #include "unicode/ucol.h"
17 #include "collationdata.h"
18 #include "collationfastlatin.h"
19 #include "collationsettings.h"
20 #include "uassert.h"
21 
22 U_NAMESPACE_BEGIN
23 
24 int32_t
getOptions(const CollationData * data,const CollationSettings & settings,uint16_t * primaries,int32_t capacity)25 CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings,
26                                uint16_t *primaries, int32_t capacity) {
27     const uint16_t *table = data->fastLatinTable;
28     if(table == NULL) { return -1; }
29     U_ASSERT(capacity == LATIN_LIMIT);
30     if(capacity != LATIN_LIMIT) { return -1; }
31 
32     uint32_t miniVarTop;
33     if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) {
34         // No mini primaries are variable, set a variableTop just below the
35         // lowest long mini primary.
36         miniVarTop = MIN_LONG - 1;
37     } else {
38         int32_t headerLength = *table & 0xff;
39         int32_t i = 1 + settings.getMaxVariable();
40         if(i >= headerLength) {
41             return -1;  // variableTop >= digits, should not occur
42         }
43         miniVarTop = table[i];
44     }
45 
46     UBool digitsAreReordered = FALSE;
47     if(settings.hasReordering()) {
48         uint32_t prevStart = 0;
49         uint32_t beforeDigitStart = 0;
50         uint32_t digitStart = 0;
51         uint32_t afterDigitStart = 0;
52         for(int32_t group = UCOL_REORDER_CODE_FIRST;
53                 group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL_REORDER_CODES;
54                 ++group) {
55             uint32_t start = data->getFirstPrimaryForGroup(group);
56             start = settings.reorder(start);
57             if(group == UCOL_REORDER_CODE_DIGIT) {
58                 beforeDigitStart = prevStart;
59                 digitStart = start;
60             } else if(start != 0) {
61                 if(start < prevStart) {
62                     // The permutation affects the groups up to Latin.
63                     return -1;
64                 }
65                 // In the future, there might be a special group between digits & Latin.
66                 if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) {
67                     afterDigitStart = start;
68                 }
69                 prevStart = start;
70             }
71         }
72         uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN);
73         latinStart = settings.reorder(latinStart);
74         if(latinStart < prevStart) {
75             return -1;
76         }
77         if(afterDigitStart == 0) {
78             afterDigitStart = latinStart;
79         }
80         if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) {
81             digitsAreReordered = TRUE;
82         }
83     }
84 
85     table += (table[0] & 0xff);  // skip the header
86     for(UChar32 c = 0; c < LATIN_LIMIT; ++c) {
87         uint32_t p = table[c];
88         if(p >= MIN_SHORT) {
89             p &= SHORT_PRIMARY_MASK;
90         } else if(p > miniVarTop) {
91             p &= LONG_PRIMARY_MASK;
92         } else {
93             p = 0;
94         }
95         primaries[c] = (uint16_t)p;
96     }
97     if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) != 0) {
98         // Bail out for digits.
99         for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; }
100     }
101 
102     // Shift the miniVarTop above other options.
103     return ((int32_t)miniVarTop << 16) | settings.options;
104 }
105 
106 int32_t
compareUTF16(const uint16_t * table,const uint16_t * primaries,int32_t options,const UChar * left,int32_t leftLength,const UChar * right,int32_t rightLength)107 CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options,
108                                  const UChar *left, int32_t leftLength,
109                                  const UChar *right, int32_t rightLength) {
110     // This is a modified copy of CollationCompare::compareUpToQuaternary(),
111     // optimized for common Latin text.
112     // Keep them in sync!
113     // Keep compareUTF16() and compareUTF8() in sync very closely!
114 
115     U_ASSERT((table[0] >> 8) == VERSION);
116     table += (table[0] & 0xff);  // skip the header
117     uint32_t variableTop = (uint32_t)options >> 16;  // see getOptions()
118     options &= 0xffff;  // needed for CollationSettings::getStrength() to work
119 
120     // Check for supported characters, fetch mini CEs, and compare primaries.
121     int32_t leftIndex = 0, rightIndex = 0;
122     /**
123      * Single mini CE or a pair.
124      * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
125      * If there is only one, then it is in the lower bits, and the upper bits are 0.
126      */
127     uint32_t leftPair = 0, rightPair = 0;
128     for(;;) {
129         // We fetch CEs until we get a non-ignorable primary or reach the end.
130         while(leftPair == 0) {
131             if(leftIndex == leftLength) {
132                 leftPair = EOS;
133                 break;
134             }
135             UChar32 c = left[leftIndex++];
136             if(c <= LATIN_MAX) {
137                 leftPair = primaries[c];
138                 if(leftPair != 0) { break; }
139                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
140                     return BAIL_OUT_RESULT;
141                 }
142                 leftPair = table[c];
143             } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
144                 leftPair = table[c - PUNCT_START + LATIN_LIMIT];
145             } else {
146                 leftPair = lookup(table, c);
147             }
148             if(leftPair >= MIN_SHORT) {
149                 leftPair &= SHORT_PRIMARY_MASK;
150                 break;
151             } else if(leftPair > variableTop) {
152                 leftPair &= LONG_PRIMARY_MASK;
153                 break;
154             } else {
155                 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
156                 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
157                 leftPair = getPrimaries(variableTop, leftPair);
158             }
159         }
160 
161         while(rightPair == 0) {
162             if(rightIndex == rightLength) {
163                 rightPair = EOS;
164                 break;
165             }
166             UChar32 c = right[rightIndex++];
167             if(c <= LATIN_MAX) {
168                 rightPair = primaries[c];
169                 if(rightPair != 0) { break; }
170                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
171                     return BAIL_OUT_RESULT;
172                 }
173                 rightPair = table[c];
174             } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
175                 rightPair = table[c - PUNCT_START + LATIN_LIMIT];
176             } else {
177                 rightPair = lookup(table, c);
178             }
179             if(rightPair >= MIN_SHORT) {
180                 rightPair &= SHORT_PRIMARY_MASK;
181                 break;
182             } else if(rightPair > variableTop) {
183                 rightPair &= LONG_PRIMARY_MASK;
184                 break;
185             } else {
186                 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
187                 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
188                 rightPair = getPrimaries(variableTop, rightPair);
189             }
190         }
191 
192         if(leftPair == rightPair) {
193             if(leftPair == EOS) { break; }
194             leftPair = rightPair = 0;
195             continue;
196         }
197         uint32_t leftPrimary = leftPair & 0xffff;
198         uint32_t rightPrimary = rightPair & 0xffff;
199         if(leftPrimary != rightPrimary) {
200             // Return the primary difference.
201             return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
202         }
203         if(leftPair == EOS) { break; }
204         leftPair >>= 16;
205         rightPair >>= 16;
206     }
207     // In the following, we need to re-fetch each character because we did not buffer the CEs,
208     // but we know that the string is well-formed and
209     // only contains supported characters and mappings.
210 
211     // We might skip the secondary level but continue with the case level
212     // which is turned on separately.
213     if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
214         leftIndex = rightIndex = 0;
215         leftPair = rightPair = 0;
216         for(;;) {
217             while(leftPair == 0) {
218                 if(leftIndex == leftLength) {
219                     leftPair = EOS;
220                     break;
221                 }
222                 UChar32 c = left[leftIndex++];
223                 if(c <= LATIN_MAX) {
224                     leftPair = table[c];
225                 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
226                     leftPair = table[c - PUNCT_START + LATIN_LIMIT];
227                 } else {
228                     leftPair = lookup(table, c);
229                 }
230                 if(leftPair >= MIN_SHORT) {
231                     leftPair = getSecondariesFromOneShortCE(leftPair);
232                     break;
233                 } else if(leftPair > variableTop) {
234                     leftPair = COMMON_SEC_PLUS_OFFSET;
235                     break;
236                 } else {
237                     leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
238                     leftPair = getSecondaries(variableTop, leftPair);
239                 }
240             }
241 
242             while(rightPair == 0) {
243                 if(rightIndex == rightLength) {
244                     rightPair = EOS;
245                     break;
246                 }
247                 UChar32 c = right[rightIndex++];
248                 if(c <= LATIN_MAX) {
249                     rightPair = table[c];
250                 } else if(PUNCT_START <= c && c < PUNCT_LIMIT) {
251                     rightPair = table[c - PUNCT_START + LATIN_LIMIT];
252                 } else {
253                     rightPair = lookup(table, c);
254                 }
255                 if(rightPair >= MIN_SHORT) {
256                     rightPair = getSecondariesFromOneShortCE(rightPair);
257                     break;
258                 } else if(rightPair > variableTop) {
259                     rightPair = COMMON_SEC_PLUS_OFFSET;
260                     break;
261                 } else {
262                     rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
263                     rightPair = getSecondaries(variableTop, rightPair);
264                 }
265             }
266 
267             if(leftPair == rightPair) {
268                 if(leftPair == EOS) { break; }
269                 leftPair = rightPair = 0;
270                 continue;
271             }
272             uint32_t leftSecondary = leftPair & 0xffff;
273             uint32_t rightSecondary = rightPair & 0xffff;
274             if(leftSecondary != rightSecondary) {
275                 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
276                     // Full support for backwards secondary requires backwards contraction matching
277                     // and moving backwards between merge separators.
278                     return BAIL_OUT_RESULT;
279                 }
280                 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
281             }
282             if(leftPair == EOS) { break; }
283             leftPair >>= 16;
284             rightPair >>= 16;
285         }
286     }
287 
288     if((options & CollationSettings::CASE_LEVEL) != 0) {
289         UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
290         leftIndex = rightIndex = 0;
291         leftPair = rightPair = 0;
292         for(;;) {
293             while(leftPair == 0) {
294                 if(leftIndex == leftLength) {
295                     leftPair = EOS;
296                     break;
297                 }
298                 UChar32 c = left[leftIndex++];
299                 leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
300                 if(leftPair < MIN_LONG) {
301                     leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
302                 }
303                 leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
304             }
305 
306             while(rightPair == 0) {
307                 if(rightIndex == rightLength) {
308                     rightPair = EOS;
309                     break;
310                 }
311                 UChar32 c = right[rightIndex++];
312                 rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
313                 if(rightPair < MIN_LONG) {
314                     rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
315                 }
316                 rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
317             }
318 
319             if(leftPair == rightPair) {
320                 if(leftPair == EOS) { break; }
321                 leftPair = rightPair = 0;
322                 continue;
323             }
324             uint32_t leftCase = leftPair & 0xffff;
325             uint32_t rightCase = rightPair & 0xffff;
326             if(leftCase != rightCase) {
327                 if((options & CollationSettings::UPPER_FIRST) == 0) {
328                     return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
329                 } else {
330                     return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
331                 }
332             }
333             if(leftPair == EOS) { break; }
334             leftPair >>= 16;
335             rightPair >>= 16;
336         }
337     }
338     if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
339 
340     // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
341     UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
342 
343     leftIndex = rightIndex = 0;
344     leftPair = rightPair = 0;
345     for(;;) {
346         while(leftPair == 0) {
347             if(leftIndex == leftLength) {
348                 leftPair = EOS;
349                 break;
350             }
351             UChar32 c = left[leftIndex++];
352             leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
353             if(leftPair < MIN_LONG) {
354                 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
355             }
356             leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
357         }
358 
359         while(rightPair == 0) {
360             if(rightIndex == rightLength) {
361                 rightPair = EOS;
362                 break;
363             }
364             UChar32 c = right[rightIndex++];
365             rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
366             if(rightPair < MIN_LONG) {
367                 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
368             }
369             rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
370         }
371 
372         if(leftPair == rightPair) {
373             if(leftPair == EOS) { break; }
374             leftPair = rightPair = 0;
375             continue;
376         }
377         uint32_t leftTertiary = leftPair & 0xffff;
378         uint32_t rightTertiary = rightPair & 0xffff;
379         if(leftTertiary != rightTertiary) {
380             if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
381                 // Pass through EOS and MERGE_WEIGHT
382                 // and keep real tertiary weights larger than the MERGE_WEIGHT.
383                 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
384                 if(leftTertiary > MERGE_WEIGHT) {
385                     leftTertiary ^= CASE_MASK;
386                 }
387                 if(rightTertiary > MERGE_WEIGHT) {
388                     rightTertiary ^= CASE_MASK;
389                 }
390             }
391             return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
392         }
393         if(leftPair == EOS) { break; }
394         leftPair >>= 16;
395         rightPair >>= 16;
396     }
397     if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
398 
399     leftIndex = rightIndex = 0;
400     leftPair = rightPair = 0;
401     for(;;) {
402         while(leftPair == 0) {
403             if(leftIndex == leftLength) {
404                 leftPair = EOS;
405                 break;
406             }
407             UChar32 c = left[leftIndex++];
408             leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
409             if(leftPair < MIN_LONG) {
410                 leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength);
411             }
412             leftPair = getQuaternaries(variableTop, leftPair);
413         }
414 
415         while(rightPair == 0) {
416             if(rightIndex == rightLength) {
417                 rightPair = EOS;
418                 break;
419             }
420             UChar32 c = right[rightIndex++];
421             rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c);
422             if(rightPair < MIN_LONG) {
423                 rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength);
424             }
425             rightPair = getQuaternaries(variableTop, rightPair);
426         }
427 
428         if(leftPair == rightPair) {
429             if(leftPair == EOS) { break; }
430             leftPair = rightPair = 0;
431             continue;
432         }
433         uint32_t leftQuaternary = leftPair & 0xffff;
434         uint32_t rightQuaternary = rightPair & 0xffff;
435         if(leftQuaternary != rightQuaternary) {
436             return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
437         }
438         if(leftPair == EOS) { break; }
439         leftPair >>= 16;
440         rightPair >>= 16;
441     }
442     return UCOL_EQUAL;
443 }
444 
445 int32_t
compareUTF8(const uint16_t * table,const uint16_t * primaries,int32_t options,const uint8_t * left,int32_t leftLength,const uint8_t * right,int32_t rightLength)446 CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options,
447                                  const uint8_t *left, int32_t leftLength,
448                                  const uint8_t *right, int32_t rightLength) {
449     // Keep compareUTF16() and compareUTF8() in sync very closely!
450 
451     U_ASSERT((table[0] >> 8) == VERSION);
452     table += (table[0] & 0xff);  // skip the header
453     uint32_t variableTop = (uint32_t)options >> 16;  // see RuleBasedCollator::getFastLatinOptions()
454     options &= 0xffff;  // needed for CollationSettings::getStrength() to work
455 
456     // Check for supported characters, fetch mini CEs, and compare primaries.
457     int32_t leftIndex = 0, rightIndex = 0;
458     /**
459      * Single mini CE or a pair.
460      * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits.
461      * If there is only one, then it is in the lower bits, and the upper bits are 0.
462      */
463     uint32_t leftPair = 0, rightPair = 0;
464     // Note: There is no need to assemble the code point.
465     // We only need to look up the table entry for the character,
466     // and nextPair() looks for whether c==0.
467     for(;;) {
468         // We fetch CEs until we get a non-ignorable primary or reach the end.
469         while(leftPair == 0) {
470             if(leftIndex == leftLength) {
471                 leftPair = EOS;
472                 break;
473             }
474             UChar32 c = left[leftIndex++];
475             uint8_t t;
476             if(c <= 0x7f) {
477                 leftPair = primaries[c];
478                 if(leftPair != 0) { break; }
479                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
480                     return BAIL_OUT_RESULT;
481                 }
482                 leftPair = table[c];
483             } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength &&
484                     0x80 <= (t = left[leftIndex]) && t <= 0xbf) {
485                 ++leftIndex;
486                 c = ((c - 0xc2) << 6) + t;
487                 leftPair = primaries[c];
488                 if(leftPair != 0) { break; }
489                 leftPair = table[c];
490             } else {
491                 leftPair = lookupUTF8(table, c, left, leftIndex, leftLength);
492             }
493             if(leftPair >= MIN_SHORT) {
494                 leftPair &= SHORT_PRIMARY_MASK;
495                 break;
496             } else if(leftPair > variableTop) {
497                 leftPair &= LONG_PRIMARY_MASK;
498                 break;
499             } else {
500                 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
501                 if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
502                 leftPair = getPrimaries(variableTop, leftPair);
503             }
504         }
505 
506         while(rightPair == 0) {
507             if(rightIndex == rightLength) {
508                 rightPair = EOS;
509                 break;
510             }
511             UChar32 c = right[rightIndex++];
512             uint8_t t;
513             if(c <= 0x7f) {
514                 rightPair = primaries[c];
515                 if(rightPair != 0) { break; }
516                 if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) {
517                     return BAIL_OUT_RESULT;
518                 }
519                 rightPair = table[c];
520             } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength &&
521                     0x80 <= (t = right[rightIndex]) && t <= 0xbf) {
522                 ++rightIndex;
523                 c = ((c - 0xc2) << 6) + t;
524                 rightPair = primaries[c];
525                 if(rightPair != 0) { break; }
526                 rightPair = table[c];
527             } else {
528                 rightPair = lookupUTF8(table, c, right, rightIndex, rightLength);
529             }
530             if(rightPair >= MIN_SHORT) {
531                 rightPair &= SHORT_PRIMARY_MASK;
532                 break;
533             } else if(rightPair > variableTop) {
534                 rightPair &= LONG_PRIMARY_MASK;
535                 break;
536             } else {
537                 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
538                 if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; }
539                 rightPair = getPrimaries(variableTop, rightPair);
540             }
541         }
542 
543         if(leftPair == rightPair) {
544             if(leftPair == EOS) { break; }
545             leftPair = rightPair = 0;
546             continue;
547         }
548         uint32_t leftPrimary = leftPair & 0xffff;
549         uint32_t rightPrimary = rightPair & 0xffff;
550         if(leftPrimary != rightPrimary) {
551             // Return the primary difference.
552             return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
553         }
554         if(leftPair == EOS) { break; }
555         leftPair >>= 16;
556         rightPair >>= 16;
557     }
558     // In the following, we need to re-fetch each character because we did not buffer the CEs,
559     // but we know that the string is well-formed and
560     // only contains supported characters and mappings.
561 
562     // We might skip the secondary level but continue with the case level
563     // which is turned on separately.
564     if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
565         leftIndex = rightIndex = 0;
566         leftPair = rightPair = 0;
567         for(;;) {
568             while(leftPair == 0) {
569                 if(leftIndex == leftLength) {
570                     leftPair = EOS;
571                     break;
572                 }
573                 UChar32 c = left[leftIndex++];
574                 if(c <= 0x7f) {
575                     leftPair = table[c];
576                 } else if(c <= LATIN_MAX_UTF8_LEAD) {
577                     leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]];
578                 } else {
579                     leftPair = lookupUTF8Unsafe(table, c, left, leftIndex);
580                 }
581                 if(leftPair >= MIN_SHORT) {
582                     leftPair = getSecondariesFromOneShortCE(leftPair);
583                     break;
584                 } else if(leftPair > variableTop) {
585                     leftPair = COMMON_SEC_PLUS_OFFSET;
586                     break;
587                 } else {
588                     leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
589                     leftPair = getSecondaries(variableTop, leftPair);
590                 }
591             }
592 
593             while(rightPair == 0) {
594                 if(rightIndex == rightLength) {
595                     rightPair = EOS;
596                     break;
597                 }
598                 UChar32 c = right[rightIndex++];
599                 if(c <= 0x7f) {
600                     rightPair = table[c];
601                 } else if(c <= LATIN_MAX_UTF8_LEAD) {
602                     rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]];
603                 } else {
604                     rightPair = lookupUTF8Unsafe(table, c, right, rightIndex);
605                 }
606                 if(rightPair >= MIN_SHORT) {
607                     rightPair = getSecondariesFromOneShortCE(rightPair);
608                     break;
609                 } else if(rightPair > variableTop) {
610                     rightPair = COMMON_SEC_PLUS_OFFSET;
611                     break;
612                 } else {
613                     rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
614                     rightPair = getSecondaries(variableTop, rightPair);
615                 }
616             }
617 
618             if(leftPair == rightPair) {
619                 if(leftPair == EOS) { break; }
620                 leftPair = rightPair = 0;
621                 continue;
622             }
623             uint32_t leftSecondary = leftPair & 0xffff;
624             uint32_t rightSecondary = rightPair & 0xffff;
625             if(leftSecondary != rightSecondary) {
626                 if((options & CollationSettings::BACKWARD_SECONDARY) != 0) {
627                     // Full support for backwards secondary requires backwards contraction matching
628                     // and moving backwards between merge separators.
629                     return BAIL_OUT_RESULT;
630                 }
631                 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
632             }
633             if(leftPair == EOS) { break; }
634             leftPair >>= 16;
635             rightPair >>= 16;
636         }
637     }
638 
639     if((options & CollationSettings::CASE_LEVEL) != 0) {
640         UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY;
641         leftIndex = rightIndex = 0;
642         leftPair = rightPair = 0;
643         for(;;) {
644             while(leftPair == 0) {
645                 if(leftIndex == leftLength) {
646                     leftPair = EOS;
647                     break;
648                 }
649                 UChar32 c = left[leftIndex++];
650                 leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
651                 if(leftPair < MIN_LONG) {
652                     leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
653                 }
654                 leftPair = getCases(variableTop, strengthIsPrimary, leftPair);
655             }
656 
657             while(rightPair == 0) {
658                 if(rightIndex == rightLength) {
659                     rightPair = EOS;
660                     break;
661                 }
662                 UChar32 c = right[rightIndex++];
663                 rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
664                 if(rightPair < MIN_LONG) {
665                     rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
666                 }
667                 rightPair = getCases(variableTop, strengthIsPrimary, rightPair);
668             }
669 
670             if(leftPair == rightPair) {
671                 if(leftPair == EOS) { break; }
672                 leftPair = rightPair = 0;
673                 continue;
674             }
675             uint32_t leftCase = leftPair & 0xffff;
676             uint32_t rightCase = rightPair & 0xffff;
677             if(leftCase != rightCase) {
678                 if((options & CollationSettings::UPPER_FIRST) == 0) {
679                     return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
680                 } else {
681                     return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
682                 }
683             }
684             if(leftPair == EOS) { break; }
685             leftPair >>= 16;
686             rightPair >>= 16;
687         }
688     }
689     if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
690 
691     // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
692     UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options);
693 
694     leftIndex = rightIndex = 0;
695     leftPair = rightPair = 0;
696     for(;;) {
697         while(leftPair == 0) {
698             if(leftIndex == leftLength) {
699                 leftPair = EOS;
700                 break;
701             }
702             UChar32 c = left[leftIndex++];
703             leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
704             if(leftPair < MIN_LONG) {
705                 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
706             }
707             leftPair = getTertiaries(variableTop, withCaseBits, leftPair);
708         }
709 
710         while(rightPair == 0) {
711             if(rightIndex == rightLength) {
712                 rightPair = EOS;
713                 break;
714             }
715             UChar32 c = right[rightIndex++];
716             rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
717             if(rightPair < MIN_LONG) {
718                 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
719             }
720             rightPair = getTertiaries(variableTop, withCaseBits, rightPair);
721         }
722 
723         if(leftPair == rightPair) {
724             if(leftPair == EOS) { break; }
725             leftPair = rightPair = 0;
726             continue;
727         }
728         uint32_t leftTertiary = leftPair & 0xffff;
729         uint32_t rightTertiary = rightPair & 0xffff;
730         if(leftTertiary != rightTertiary) {
731             if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
732                 // Pass through EOS and MERGE_WEIGHT
733                 // and keep real tertiary weights larger than the MERGE_WEIGHT.
734                 // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
735                 if(leftTertiary > MERGE_WEIGHT) {
736                     leftTertiary ^= CASE_MASK;
737                 }
738                 if(rightTertiary > MERGE_WEIGHT) {
739                     rightTertiary ^= CASE_MASK;
740                 }
741             }
742             return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
743         }
744         if(leftPair == EOS) { break; }
745         leftPair >>= 16;
746         rightPair >>= 16;
747     }
748     if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
749 
750     leftIndex = rightIndex = 0;
751     leftPair = rightPair = 0;
752     for(;;) {
753         while(leftPair == 0) {
754             if(leftIndex == leftLength) {
755                 leftPair = EOS;
756                 break;
757             }
758             UChar32 c = left[leftIndex++];
759             leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex);
760             if(leftPair < MIN_LONG) {
761                 leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength);
762             }
763             leftPair = getQuaternaries(variableTop, leftPair);
764         }
765 
766         while(rightPair == 0) {
767             if(rightIndex == rightLength) {
768                 rightPair = EOS;
769                 break;
770             }
771             UChar32 c = right[rightIndex++];
772             rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex);
773             if(rightPair < MIN_LONG) {
774                 rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength);
775             }
776             rightPair = getQuaternaries(variableTop, rightPair);
777         }
778 
779         if(leftPair == rightPair) {
780             if(leftPair == EOS) { break; }
781             leftPair = rightPair = 0;
782             continue;
783         }
784         uint32_t leftQuaternary = leftPair & 0xffff;
785         uint32_t rightQuaternary = rightPair & 0xffff;
786         if(leftQuaternary != rightQuaternary) {
787             return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
788         }
789         if(leftPair == EOS) { break; }
790         leftPair >>= 16;
791         rightPair >>= 16;
792     }
793     return UCOL_EQUAL;
794 }
795 
796 uint32_t
lookup(const uint16_t * table,UChar32 c)797 CollationFastLatin::lookup(const uint16_t *table, UChar32 c) {
798     U_ASSERT(c > LATIN_MAX);
799     if(PUNCT_START <= c && c < PUNCT_LIMIT) {
800         return table[c - PUNCT_START + LATIN_LIMIT];
801     } else if(c == 0xfffe) {
802         return MERGE_WEIGHT;
803     } else if(c == 0xffff) {
804         return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;
805     } else {
806         return BAIL_OUT;
807     }
808 }
809 
810 uint32_t
lookupUTF8(const uint16_t * table,UChar32 c,const uint8_t * s8,int32_t & sIndex,int32_t sLength)811 CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c,
812                                const uint8_t *s8, int32_t &sIndex, int32_t sLength) {
813     // The caller handled ASCII and valid/supported Latin.
814     U_ASSERT(c > 0x7f);
815     int32_t i2 = sIndex + 1;
816     if(i2 < sLength || sLength < 0) {
817         uint8_t t1 = s8[sIndex];
818         uint8_t t2 = s8[i2];
819         sIndex += 2;
820         if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) {
821             return table[(LATIN_LIMIT - 0x80) + t2];  // 2000..203F -> 0180..01BF
822         } else if(c == 0xef && t1 == 0xbf) {
823             if(t2 == 0xbe) {
824                 return MERGE_WEIGHT;  // U+FFFE
825             } else if(t2 == 0xbf) {
826                 return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;  // U+FFFF
827             }
828         }
829     }
830     return BAIL_OUT;
831 }
832 
833 uint32_t
lookupUTF8Unsafe(const uint16_t * table,UChar32 c,const uint8_t * s8,int32_t & sIndex)834 CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c,
835                                      const uint8_t *s8, int32_t &sIndex) {
836     // The caller handled ASCII.
837     // The string is well-formed and contains only supported characters.
838     U_ASSERT(c > 0x7f);
839     if(c <= LATIN_MAX_UTF8_LEAD) {
840         return table[((c - 0xc2) << 6) + s8[sIndex++]];  // 0080..017F
841     }
842     uint8_t t2 = s8[sIndex + 1];
843     sIndex += 2;
844     if(c == 0xe2) {
845         return table[(LATIN_LIMIT - 0x80) + t2];  // 2000..203F -> 0180..01BF
846     } else if(t2 == 0xbe) {
847         return MERGE_WEIGHT;  // U+FFFE
848     } else {
849         return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER;  // U+FFFF
850     }
851 }
852 
853 uint32_t
nextPair(const uint16_t * table,UChar32 c,uint32_t ce,const UChar * s16,const uint8_t * s8,int32_t & sIndex,int32_t & sLength)854 CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce,
855                              const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) {
856     if(ce >= MIN_LONG || ce < CONTRACTION) {
857         return ce;  // simple or special mini CE
858     } else if(ce >= EXPANSION) {
859         int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
860         return ((uint32_t)table[index + 1] << 16) | table[index];
861     } else /* ce >= CONTRACTION */ {
862         if(c == 0 && sLength < 0) {
863             sLength = sIndex - 1;
864             return EOS;
865         }
866         // Contraction list: Default mapping followed by
867         // 0 or more single-character contraction suffix mappings.
868         int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK);
869         if(sIndex != sLength) {
870             // Read the next character.
871             int32_t c2;
872             int32_t nextIndex = sIndex;
873             if(s16 != NULL) {
874                 c2 = s16[nextIndex++];
875                 if(c2 > LATIN_MAX) {
876                     if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) {
877                         c2 = c2 - PUNCT_START + LATIN_LIMIT;  // 2000..203F -> 0180..01BF
878                     } else if(c2 == 0xfffe || c2 == 0xffff) {
879                         c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions.
880                     } else {
881                         return BAIL_OUT;
882                     }
883                 }
884             } else {
885                 c2 = s8[nextIndex++];
886                 if(c2 > 0x7f) {
887                     uint8_t t;
888                     if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength &&
889                             0x80 <= (t = s8[nextIndex]) && t <= 0xbf) {
890                         c2 = ((c2 - 0xc2) << 6) + t;  // 0080..017F
891                         ++nextIndex;
892                     } else {
893                         int32_t i2 = nextIndex + 1;
894                         if(i2 < sLength || sLength < 0) {
895                             if(c2 == 0xe2 && s8[nextIndex] == 0x80 &&
896                                     0x80 <= (t = s8[i2]) && t <= 0xbf) {
897                                 c2 = (LATIN_LIMIT - 0x80) + t;  // 2000..203F -> 0180..01BF
898                             } else if(c2 == 0xef && s8[nextIndex] == 0xbf &&
899                                     ((t = s8[i2]) == 0xbe || t == 0xbf)) {
900                                 c2 = -1;  // U+FFFE & U+FFFF cannot occur in contractions.
901                             } else {
902                                 return BAIL_OUT;
903                             }
904                         } else {
905                             return BAIL_OUT;
906                         }
907                         nextIndex += 2;
908                     }
909                 }
910             }
911             if(c2 == 0 && sLength < 0) {
912                 sLength = sIndex;
913                 c2 = -1;
914             }
915             // Look for the next character in the contraction suffix list,
916             // which is in ascending order of single suffix characters.
917             int32_t i = index;
918             int32_t head = table[i];  // first skip the default mapping
919             int32_t x;
920             do {
921                 i += head >> CONTR_LENGTH_SHIFT;
922                 head = table[i];
923                 x = head & CONTR_CHAR_MASK;
924             } while(x < c2);
925             if(x == c2) {
926                 index = i;
927                 sIndex = nextIndex;
928             }
929         }
930         // Return the CE or CEs for the default or contraction mapping.
931         int32_t length = table[index] >> CONTR_LENGTH_SHIFT;
932         if(length == 1) {
933             return BAIL_OUT;
934         }
935         ce = table[index + 1];
936         if(length == 2) {
937             return ce;
938         } else {
939             return ((uint32_t)table[index + 2] << 16) | ce;
940         }
941     }
942 }
943 
944 uint32_t
getSecondaries(uint32_t variableTop,uint32_t pair)945 CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) {
946     if(pair <= 0xffff) {
947         // one mini CE
948         if(pair >= MIN_SHORT) {
949             pair = getSecondariesFromOneShortCE(pair);
950         } else if(pair > variableTop) {
951             pair = COMMON_SEC_PLUS_OFFSET;
952         } else if(pair >= MIN_LONG) {
953             pair = 0;  // variable
954         }
955         // else special mini CE
956     } else {
957         uint32_t ce = pair & 0xffff;
958         if(ce >= MIN_SHORT) {
959             pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS;
960         } else if(ce > variableTop) {
961             pair = TWO_COMMON_SEC_PLUS_OFFSET;
962         } else {
963             U_ASSERT(ce >= MIN_LONG);
964             pair = 0;  // variable
965         }
966     }
967     return pair;
968 }
969 
970 uint32_t
getCases(uint32_t variableTop,UBool strengthIsPrimary,uint32_t pair)971 CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) {
972     // Primary+caseLevel: Ignore case level weights of primary ignorables.
973     // Otherwise: Ignore case level weights of secondary ignorables.
974     // For details see the comments in the CollationCompare class.
975     // Tertiary CEs (secondary ignorables) are not supported in fast Latin.
976     if(pair <= 0xffff) {
977         // one mini CE
978         if(pair >= MIN_SHORT) {
979             // A high secondary weight means we really have two CEs,
980             // a primary CE and a secondary CE.
981             uint32_t ce = pair;
982             pair &= CASE_MASK;  // explicit weight of primary CE
983             if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
984                 pair |= LOWER_CASE << 16;  // implied weight of secondary CE
985             }
986         } else if(pair > variableTop) {
987             pair = LOWER_CASE;
988         } else if(pair >= MIN_LONG) {
989             pair = 0;  // variable
990         }
991         // else special mini CE
992     } else {
993         // two mini CEs, same primary groups, neither expands like above
994         uint32_t ce = pair & 0xffff;
995         if(ce >= MIN_SHORT) {
996             if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) {
997                 pair &= CASE_MASK;
998             } else {
999                 pair &= TWO_CASES_MASK;
1000             }
1001         } else if(ce > variableTop) {
1002             pair = TWO_LOWER_CASES;
1003         } else {
1004             U_ASSERT(ce >= MIN_LONG);
1005             pair = 0;  // variable
1006         }
1007     }
1008     return pair;
1009 }
1010 
1011 uint32_t
getTertiaries(uint32_t variableTop,UBool withCaseBits,uint32_t pair)1012 CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) {
1013     if(pair <= 0xffff) {
1014         // one mini CE
1015         if(pair >= MIN_SHORT) {
1016             // A high secondary weight means we really have two CEs,
1017             // a primary CE and a secondary CE.
1018             uint32_t ce = pair;
1019             if(withCaseBits) {
1020                 pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET;
1021                 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1022                     pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16;
1023                 }
1024             } else {
1025                 pair = (pair & TERTIARY_MASK) + TER_OFFSET;
1026                 if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1027                     pair |= COMMON_TER_PLUS_OFFSET << 16;
1028                 }
1029             }
1030         } else if(pair > variableTop) {
1031             pair = (pair & TERTIARY_MASK) + TER_OFFSET;
1032             if(withCaseBits) {
1033                 pair |= LOWER_CASE;
1034             }
1035         } else if(pair >= MIN_LONG) {
1036             pair = 0;  // variable
1037         }
1038         // else special mini CE
1039     } else {
1040         // two mini CEs, same primary groups, neither expands like above
1041         uint32_t ce = pair & 0xffff;
1042         if(ce >= MIN_SHORT) {
1043             if(withCaseBits) {
1044                 pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK;
1045             } else {
1046                 pair &= TWO_TERTIARIES_MASK;
1047             }
1048             pair += TWO_TER_OFFSETS;
1049         } else if(ce > variableTop) {
1050             pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS;
1051             if(withCaseBits) {
1052                 pair |= TWO_LOWER_CASES;
1053             }
1054         } else {
1055             U_ASSERT(ce >= MIN_LONG);
1056             pair = 0;  // variable
1057         }
1058     }
1059     return pair;
1060 }
1061 
1062 uint32_t
getQuaternaries(uint32_t variableTop,uint32_t pair)1063 CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) {
1064     // Return the primary weight of a variable CE,
1065     // or the maximum primary weight for a non-variable, not-completely-ignorable CE.
1066     if(pair <= 0xffff) {
1067         // one mini CE
1068         if(pair >= MIN_SHORT) {
1069             // A high secondary weight means we really have two CEs,
1070             // a primary CE and a secondary CE.
1071             if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) {
1072                 pair = TWO_SHORT_PRIMARIES_MASK;
1073             } else {
1074                 pair = SHORT_PRIMARY_MASK;
1075             }
1076         } else if(pair > variableTop) {
1077             pair = SHORT_PRIMARY_MASK;
1078         } else if(pair >= MIN_LONG) {
1079             pair &= LONG_PRIMARY_MASK;  // variable
1080         }
1081         // else special mini CE
1082     } else {
1083         // two mini CEs, same primary groups, neither expands like above
1084         uint32_t ce = pair & 0xffff;
1085         if(ce > variableTop) {
1086             pair = TWO_SHORT_PRIMARIES_MASK;
1087         } else {
1088             U_ASSERT(ce >= MIN_LONG);
1089             pair &= TWO_LONG_PRIMARIES_MASK;  // variable
1090         }
1091     }
1092     return pair;
1093 }
1094 
1095 U_NAMESPACE_END
1096 
1097 #endif  // !UCONFIG_NO_COLLATION
1098