1 /*
2 *******************************************************************************
3 * Copyright (C) 1996-2015, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 * collationcompare.cpp
7 *
8 * created on: 2012feb14 with new and old collation code
9 * created by: Markus W. Scherer
10 */
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_COLLATION
15 
16 #include "unicode/ucol.h"
17 #include "cmemory.h"
18 #include "collation.h"
19 #include "collationcompare.h"
20 #include "collationiterator.h"
21 #include "collationsettings.h"
22 #include "uassert.h"
23 
24 U_NAMESPACE_BEGIN
25 
26 UCollationResult
compareUpToQuaternary(CollationIterator & left,CollationIterator & right,const CollationSettings & settings,UErrorCode & errorCode)27 CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right,
28                                         const CollationSettings &settings,
29                                         UErrorCode &errorCode) {
30     if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
31 
32     int32_t options = settings.options;
33     uint32_t variableTop;
34     if((options & CollationSettings::ALTERNATE_MASK) == 0) {
35         variableTop = 0;
36     } else {
37         // +1 so that we can use "<" and primary ignorables test out early.
38         variableTop = settings.variableTop + 1;
39     }
40     UBool anyVariable = FALSE;
41 
42     // Fetch CEs, compare primaries, store secondary & tertiary weights.
43     U_ALIGN_CODE(16);
44     for(;;) {
45         // We fetch CEs until we get a non-ignorable primary or reach the end.
46         uint32_t leftPrimary;
47         do {
48             int64_t ce = left.nextCE(errorCode);
49             leftPrimary = (uint32_t)(ce >> 32);
50             if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
51                 // Variable CE, shift it to quaternary level.
52                 // Ignore all following primary ignorables, and shift further variable CEs.
53                 anyVariable = TRUE;
54                 do {
55                     // Store only the primary of the variable CE.
56                     left.setCurrentCE(ce & INT64_C(0xffffffff00000000));
57                     for(;;) {
58                         ce = left.nextCE(errorCode);
59                         leftPrimary = (uint32_t)(ce >> 32);
60                         if(leftPrimary == 0) {
61                             left.setCurrentCE(0);
62                         } else {
63                             break;
64                         }
65                     }
66                 } while(leftPrimary < variableTop &&
67                         leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
68             }
69         } while(leftPrimary == 0);
70 
71         uint32_t rightPrimary;
72         do {
73             int64_t ce = right.nextCE(errorCode);
74             rightPrimary = (uint32_t)(ce >> 32);
75             if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
76                 // Variable CE, shift it to quaternary level.
77                 // Ignore all following primary ignorables, and shift further variable CEs.
78                 anyVariable = TRUE;
79                 do {
80                     // Store only the primary of the variable CE.
81                     right.setCurrentCE(ce & INT64_C(0xffffffff00000000));
82                     for(;;) {
83                         ce = right.nextCE(errorCode);
84                         rightPrimary = (uint32_t)(ce >> 32);
85                         if(rightPrimary == 0) {
86                             right.setCurrentCE(0);
87                         } else {
88                             break;
89                         }
90                     }
91                 } while(rightPrimary < variableTop &&
92                         rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
93             }
94         } while(rightPrimary == 0);
95 
96         if(leftPrimary != rightPrimary) {
97             // Return the primary difference, with script reordering.
98             if(settings.hasReordering()) {
99                 leftPrimary = settings.reorder(leftPrimary);
100                 rightPrimary = settings.reorder(rightPrimary);
101             }
102             return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
103         }
104         if(leftPrimary == Collation::NO_CE_PRIMARY) { break; }
105     }
106     if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
107 
108     // Compare the buffered secondary & tertiary weights.
109     // We might skip the secondary level but continue with the case level
110     // which is turned on separately.
111     if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
112         if((options & CollationSettings::BACKWARD_SECONDARY) == 0) {
113             int32_t leftIndex = 0;
114             int32_t rightIndex = 0;
115             for(;;) {
116                 uint32_t leftSecondary;
117                 do {
118                     leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16;
119                 } while(leftSecondary == 0);
120 
121                 uint32_t rightSecondary;
122                 do {
123                     rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16;
124                 } while(rightSecondary == 0);
125 
126                 if(leftSecondary != rightSecondary) {
127                     return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
128                 }
129                 if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; }
130             }
131         } else {
132             // The backwards secondary level compares secondary weights backwards
133             // within segments separated by the merge separator (U+FFFE, weight 02).
134             int32_t leftStart = 0;
135             int32_t rightStart = 0;
136             for(;;) {
137                 // Find the merge separator or the NO_CE terminator.
138                 uint32_t p;
139                 int32_t leftLimit = leftStart;
140                 while((p = (uint32_t)(left.getCE(leftLimit) >> 32)) >
141                             Collation::MERGE_SEPARATOR_PRIMARY ||
142                         p == 0) {
143                     ++leftLimit;
144                 }
145                 int32_t rightLimit = rightStart;
146                 while((p = (uint32_t)(right.getCE(rightLimit) >> 32)) >
147                             Collation::MERGE_SEPARATOR_PRIMARY ||
148                         p == 0) {
149                     ++rightLimit;
150                 }
151 
152                 // Compare the segments.
153                 int32_t leftIndex = leftLimit;
154                 int32_t rightIndex = rightLimit;
155                 for(;;) {
156                     int32_t leftSecondary = 0;
157                     while(leftSecondary == 0 && leftIndex > leftStart) {
158                         leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 16;
159                     }
160 
161                     int32_t rightSecondary = 0;
162                     while(rightSecondary == 0 && rightIndex > rightStart) {
163                         rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >> 16;
164                     }
165 
166                     if(leftSecondary != rightSecondary) {
167                         return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
168                     }
169                     if(leftSecondary == 0) { break; }
170                 }
171 
172                 // Did we reach the end of either string?
173                 // Both strings have the same number of merge separators,
174                 // or else there would have been a primary-level difference.
175                 U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit));
176                 if(p == Collation::NO_CE_PRIMARY) { break; }
177                 // Skip both merge separators and continue.
178                 leftStart = leftLimit + 1;
179                 rightStart = rightLimit + 1;
180             }
181         }
182     }
183 
184     if((options & CollationSettings::CASE_LEVEL) != 0) {
185         int32_t strength = CollationSettings::getStrength(options);
186         int32_t leftIndex = 0;
187         int32_t rightIndex = 0;
188         for(;;) {
189             uint32_t leftCase, leftLower32, rightCase;
190             if(strength == UCOL_PRIMARY) {
191                 // Primary+caseLevel: Ignore case level weights of primary ignorables.
192                 // Otherwise we would get a-umlaut > a
193                 // which is not desirable for accent-insensitive sorting.
194                 // Check for (lower 32 bits) == 0 as well because variable CEs are stored
195                 // with only primary weights.
196                 int64_t ce;
197                 do {
198                     ce = left.getCE(leftIndex++);
199                     leftCase = (uint32_t)ce;
200                 } while((uint32_t)(ce >> 32) == 0 || leftCase == 0);
201                 leftLower32 = leftCase;
202                 leftCase &= 0xc000;
203 
204                 do {
205                     ce = right.getCE(rightIndex++);
206                     rightCase = (uint32_t)ce;
207                 } while((uint32_t)(ce >> 32) == 0 || rightCase == 0);
208                 rightCase &= 0xc000;
209             } else {
210                 // Secondary+caseLevel: By analogy with the above,
211                 // ignore case level weights of secondary ignorables.
212                 //
213                 // Note: A tertiary CE has uppercase case bits (0.0.ut)
214                 // to keep tertiary+caseFirst well-formed.
215                 //
216                 // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
217                 // Otherwise a tertiary CE's uppercase would be no greater than
218                 // a primary/secondary CE's uppercase.
219                 // (See UCA well-formedness condition 2.)
220                 // We could construct a special case weight higher than uppercase,
221                 // but it's simpler to always ignore case weights of secondary ignorables,
222                 // turning 0.0.ut into 0.0.0.t.
223                 // (See LDML Collation, Case Parameters.)
224                 do {
225                     leftCase = (uint32_t)left.getCE(leftIndex++);
226                 } while(leftCase <= 0xffff);
227                 leftLower32 = leftCase;
228                 leftCase &= 0xc000;
229 
230                 do {
231                     rightCase = (uint32_t)right.getCE(rightIndex++);
232                 } while(rightCase <= 0xffff);
233                 rightCase &= 0xc000;
234             }
235 
236             // No need to handle NO_CE and MERGE_SEPARATOR specially:
237             // There is one case weight for each previous-level weight,
238             // so level length differences were handled there.
239             if(leftCase != rightCase) {
240                 if((options & CollationSettings::UPPER_FIRST) == 0) {
241                     return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
242                 } else {
243                     return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
244                 }
245             }
246             if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; }
247         }
248     }
249     if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
250 
251     uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options);
252 
253     int32_t leftIndex = 0;
254     int32_t rightIndex = 0;
255     uint32_t anyQuaternaries = 0;
256     for(;;) {
257         uint32_t leftLower32, leftTertiary;
258         do {
259             leftLower32 = (uint32_t)left.getCE(leftIndex++);
260             anyQuaternaries |= leftLower32;
261             U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
262                      (leftLower32 & 0xc0c0) == 0);
263             leftTertiary = leftLower32 & tertiaryMask;
264         } while(leftTertiary == 0);
265 
266         uint32_t rightLower32, rightTertiary;
267         do {
268             rightLower32 = (uint32_t)right.getCE(rightIndex++);
269             anyQuaternaries |= rightLower32;
270             U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
271                      (rightLower32 & 0xc0c0) == 0);
272             rightTertiary = rightLower32 & tertiaryMask;
273         } while(rightTertiary == 0);
274 
275         if(leftTertiary != rightTertiary) {
276             if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
277                 // Pass through NO_CE and keep real tertiary weights larger than that.
278                 // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
279                 // to keep tertiary CEs well-formed.
280                 // Their case+tertiary weights must be greater than those of
281                 // primary and secondary CEs.
282                 if(leftTertiary > Collation::NO_CE_WEIGHT16) {
283                     if(leftLower32 > 0xffff) {
284                         leftTertiary ^= 0xc000;
285                     } else {
286                         leftTertiary += 0x4000;
287                     }
288                 }
289                 if(rightTertiary > Collation::NO_CE_WEIGHT16) {
290                     if(rightLower32 > 0xffff) {
291                         rightTertiary ^= 0xc000;
292                     } else {
293                         rightTertiary += 0x4000;
294                     }
295                 }
296             }
297             return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
298         }
299         if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; }
300     }
301     if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
302 
303     if(!anyVariable && (anyQuaternaries & 0xc0) == 0) {
304         // If there are no "variable" CEs and no non-zero quaternary weights,
305         // then there are no quaternary differences.
306         return UCOL_EQUAL;
307     }
308 
309     leftIndex = 0;
310     rightIndex = 0;
311     for(;;) {
312         uint32_t leftQuaternary;
313         do {
314             int64_t ce = left.getCE(leftIndex++);
315             leftQuaternary = (uint32_t)ce & 0xffff;
316             if(leftQuaternary <= Collation::NO_CE_WEIGHT16) {
317                 // Variable primary or completely ignorable or NO_CE.
318                 leftQuaternary = (uint32_t)(ce >> 32);
319             } else {
320                 // Regular CE, not tertiary ignorable.
321                 // Preserve the quaternary weight in bits 7..6.
322                 leftQuaternary |= 0xffffff3f;
323             }
324         } while(leftQuaternary == 0);
325 
326         uint32_t rightQuaternary;
327         do {
328             int64_t ce = right.getCE(rightIndex++);
329             rightQuaternary = (uint32_t)ce & 0xffff;
330             if(rightQuaternary <= Collation::NO_CE_WEIGHT16) {
331                 // Variable primary or completely ignorable or NO_CE.
332                 rightQuaternary = (uint32_t)(ce >> 32);
333             } else {
334                 // Regular CE, not tertiary ignorable.
335                 // Preserve the quaternary weight in bits 7..6.
336                 rightQuaternary |= 0xffffff3f;
337             }
338         } while(rightQuaternary == 0);
339 
340         if(leftQuaternary != rightQuaternary) {
341             // Return the difference, with script reordering.
342             if(settings.hasReordering()) {
343                 leftQuaternary = settings.reorder(leftQuaternary);
344                 rightQuaternary = settings.reorder(rightQuaternary);
345             }
346             return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
347         }
348         if(leftQuaternary == Collation::NO_CE_PRIMARY) { break; }
349     }
350     return UCOL_EQUAL;
351 }
352 
353 U_NAMESPACE_END
354 
355 #endif  // !UCONFIG_NO_COLLATION
356