• Home
  • History
  • Annotate
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 * Copyright (C) 1996-2015, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 * collationcompare.cpp
7 *
8 * created on: 2012feb14 with new and old collation code
9 * created by: Markus W. Scherer
10 */
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_COLLATION
15 
16 #include "unicode/ucol.h"
17 #include "cmemory.h"
18 #include "collation.h"
19 #include "collationcompare.h"
20 #include "collationiterator.h"
21 #include "collationsettings.h"
22 #include "uassert.h"
23 
24 U_NAMESPACE_BEGIN
25 
26 UCollationResult
compareUpToQuaternary(CollationIterator & left,CollationIterator & right,const CollationSettings & settings,UErrorCode & errorCode)27 CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right,
28                                         const CollationSettings &settings,
29                                         UErrorCode &errorCode) {
30     if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
31 
32     int32_t options = settings.options;
33     uint32_t variableTop;
34     if((options & CollationSettings::ALTERNATE_MASK) == 0) {
35         variableTop = 0;
36     } else {
37         // +1 so that we can use "<" and primary ignorables test out early.
38         variableTop = settings.variableTop + 1;
39     }
40     UBool anyVariable = FALSE;
41 
42     // Fetch CEs, compare primaries, store secondary & tertiary weights.
43     for(;;) {
44         // We fetch CEs until we get a non-ignorable primary or reach the end.
45         uint32_t leftPrimary;
46         do {
47             int64_t ce = left.nextCE(errorCode);
48             leftPrimary = (uint32_t)(ce >> 32);
49             if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
50                 // Variable CE, shift it to quaternary level.
51                 // Ignore all following primary ignorables, and shift further variable CEs.
52                 anyVariable = TRUE;
53                 do {
54                     // Store only the primary of the variable CE.
55                     left.setCurrentCE(ce & INT64_C(0xffffffff00000000));
56                     for(;;) {
57                         ce = left.nextCE(errorCode);
58                         leftPrimary = (uint32_t)(ce >> 32);
59                         if(leftPrimary == 0) {
60                             left.setCurrentCE(0);
61                         } else {
62                             break;
63                         }
64                     }
65                 } while(leftPrimary < variableTop &&
66                         leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
67             }
68         } while(leftPrimary == 0);
69 
70         uint32_t rightPrimary;
71         do {
72             int64_t ce = right.nextCE(errorCode);
73             rightPrimary = (uint32_t)(ce >> 32);
74             if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) {
75                 // Variable CE, shift it to quaternary level.
76                 // Ignore all following primary ignorables, and shift further variable CEs.
77                 anyVariable = TRUE;
78                 do {
79                     // Store only the primary of the variable CE.
80                     right.setCurrentCE(ce & INT64_C(0xffffffff00000000));
81                     for(;;) {
82                         ce = right.nextCE(errorCode);
83                         rightPrimary = (uint32_t)(ce >> 32);
84                         if(rightPrimary == 0) {
85                             right.setCurrentCE(0);
86                         } else {
87                             break;
88                         }
89                     }
90                 } while(rightPrimary < variableTop &&
91                         rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY);
92             }
93         } while(rightPrimary == 0);
94 
95         if(leftPrimary != rightPrimary) {
96             // Return the primary difference, with script reordering.
97             if(settings.hasReordering()) {
98                 leftPrimary = settings.reorder(leftPrimary);
99                 rightPrimary = settings.reorder(rightPrimary);
100             }
101             return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER;
102         }
103         if(leftPrimary == Collation::NO_CE_PRIMARY) { break; }
104     }
105     if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
106 
107     // Compare the buffered secondary & tertiary weights.
108     // We might skip the secondary level but continue with the case level
109     // which is turned on separately.
110     if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) {
111         if((options & CollationSettings::BACKWARD_SECONDARY) == 0) {
112             int32_t leftIndex = 0;
113             int32_t rightIndex = 0;
114             for(;;) {
115                 uint32_t leftSecondary;
116                 do {
117                     leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16;
118                 } while(leftSecondary == 0);
119 
120                 uint32_t rightSecondary;
121                 do {
122                     rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16;
123                 } while(rightSecondary == 0);
124 
125                 if(leftSecondary != rightSecondary) {
126                     return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
127                 }
128                 if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; }
129             }
130         } else {
131             // The backwards secondary level compares secondary weights backwards
132             // within segments separated by the merge separator (U+FFFE, weight 02).
133             int32_t leftStart = 0;
134             int32_t rightStart = 0;
135             for(;;) {
136                 // Find the merge separator or the NO_CE terminator.
137                 uint32_t p;
138                 int32_t leftLimit = leftStart;
139                 while((p = (uint32_t)(left.getCE(leftLimit) >> 32)) >
140                             Collation::MERGE_SEPARATOR_PRIMARY ||
141                         p == 0) {
142                     ++leftLimit;
143                 }
144                 int32_t rightLimit = rightStart;
145                 while((p = (uint32_t)(right.getCE(rightLimit) >> 32)) >
146                             Collation::MERGE_SEPARATOR_PRIMARY ||
147                         p == 0) {
148                     ++rightLimit;
149                 }
150 
151                 // Compare the segments.
152                 int32_t leftIndex = leftLimit;
153                 int32_t rightIndex = rightLimit;
154                 for(;;) {
155                     int32_t leftSecondary = 0;
156                     while(leftSecondary == 0 && leftIndex > leftStart) {
157                         leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 16;
158                     }
159 
160                     int32_t rightSecondary = 0;
161                     while(rightSecondary == 0 && rightIndex > rightStart) {
162                         rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >> 16;
163                     }
164 
165                     if(leftSecondary != rightSecondary) {
166                         return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER;
167                     }
168                     if(leftSecondary == 0) { break; }
169                 }
170 
171                 // Did we reach the end of either string?
172                 // Both strings have the same number of merge separators,
173                 // or else there would have been a primary-level difference.
174                 U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit));
175                 if(p == Collation::NO_CE_PRIMARY) { break; }
176                 // Skip both merge separators and continue.
177                 leftStart = leftLimit + 1;
178                 rightStart = rightLimit + 1;
179             }
180         }
181     }
182 
183     if((options & CollationSettings::CASE_LEVEL) != 0) {
184         int32_t strength = CollationSettings::getStrength(options);
185         int32_t leftIndex = 0;
186         int32_t rightIndex = 0;
187         for(;;) {
188             uint32_t leftCase, leftLower32, rightCase;
189             if(strength == UCOL_PRIMARY) {
190                 // Primary+caseLevel: Ignore case level weights of primary ignorables.
191                 // Otherwise we would get a-umlaut > a
192                 // which is not desirable for accent-insensitive sorting.
193                 // Check for (lower 32 bits) == 0 as well because variable CEs are stored
194                 // with only primary weights.
195                 int64_t ce;
196                 do {
197                     ce = left.getCE(leftIndex++);
198                     leftCase = (uint32_t)ce;
199                 } while((uint32_t)(ce >> 32) == 0 || leftCase == 0);
200                 leftLower32 = leftCase;
201                 leftCase &= 0xc000;
202 
203                 do {
204                     ce = right.getCE(rightIndex++);
205                     rightCase = (uint32_t)ce;
206                 } while((uint32_t)(ce >> 32) == 0 || rightCase == 0);
207                 rightCase &= 0xc000;
208             } else {
209                 // Secondary+caseLevel: By analogy with the above,
210                 // ignore case level weights of secondary ignorables.
211                 //
212                 // Note: A tertiary CE has uppercase case bits (0.0.ut)
213                 // to keep tertiary+caseFirst well-formed.
214                 //
215                 // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
216                 // Otherwise a tertiary CE's uppercase would be no greater than
217                 // a primary/secondary CE's uppercase.
218                 // (See UCA well-formedness condition 2.)
219                 // We could construct a special case weight higher than uppercase,
220                 // but it's simpler to always ignore case weights of secondary ignorables,
221                 // turning 0.0.ut into 0.0.0.t.
222                 // (See LDML Collation, Case Parameters.)
223                 do {
224                     leftCase = (uint32_t)left.getCE(leftIndex++);
225                 } while(leftCase <= 0xffff);
226                 leftLower32 = leftCase;
227                 leftCase &= 0xc000;
228 
229                 do {
230                     rightCase = (uint32_t)right.getCE(rightIndex++);
231                 } while(rightCase <= 0xffff);
232                 rightCase &= 0xc000;
233             }
234 
235             // No need to handle NO_CE and MERGE_SEPARATOR specially:
236             // There is one case weight for each previous-level weight,
237             // so level length differences were handled there.
238             if(leftCase != rightCase) {
239                 if((options & CollationSettings::UPPER_FIRST) == 0) {
240                     return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER;
241                 } else {
242                     return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS;
243                 }
244             }
245             if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; }
246         }
247     }
248     if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; }
249 
250     uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options);
251 
252     int32_t leftIndex = 0;
253     int32_t rightIndex = 0;
254     uint32_t anyQuaternaries = 0;
255     for(;;) {
256         uint32_t leftLower32, leftTertiary;
257         do {
258             leftLower32 = (uint32_t)left.getCE(leftIndex++);
259             anyQuaternaries |= leftLower32;
260             U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
261                      (leftLower32 & 0xc0c0) == 0);
262             leftTertiary = leftLower32 & tertiaryMask;
263         } while(leftTertiary == 0);
264 
265         uint32_t rightLower32, rightTertiary;
266         do {
267             rightLower32 = (uint32_t)right.getCE(rightIndex++);
268             anyQuaternaries |= rightLower32;
269             U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 ||
270                      (rightLower32 & 0xc0c0) == 0);
271             rightTertiary = rightLower32 & tertiaryMask;
272         } while(rightTertiary == 0);
273 
274         if(leftTertiary != rightTertiary) {
275             if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) {
276                 // Pass through NO_CE and keep real tertiary weights larger than that.
277                 // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
278                 // to keep tertiary CEs well-formed.
279                 // Their case+tertiary weights must be greater than those of
280                 // primary and secondary CEs.
281                 if(leftTertiary > Collation::NO_CE_WEIGHT16) {
282                     if(leftLower32 > 0xffff) {
283                         leftTertiary ^= 0xc000;
284                     } else {
285                         leftTertiary += 0x4000;
286                     }
287                 }
288                 if(rightTertiary > Collation::NO_CE_WEIGHT16) {
289                     if(rightLower32 > 0xffff) {
290                         rightTertiary ^= 0xc000;
291                     } else {
292                         rightTertiary += 0x4000;
293                     }
294                 }
295             }
296             return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER;
297         }
298         if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; }
299     }
300     if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; }
301 
302     if(!anyVariable && (anyQuaternaries & 0xc0) == 0) {
303         // If there are no "variable" CEs and no non-zero quaternary weights,
304         // then there are no quaternary differences.
305         return UCOL_EQUAL;
306     }
307 
308     leftIndex = 0;
309     rightIndex = 0;
310     for(;;) {
311         uint32_t leftQuaternary;
312         do {
313             int64_t ce = left.getCE(leftIndex++);
314             leftQuaternary = (uint32_t)ce & 0xffff;
315             if(leftQuaternary <= Collation::NO_CE_WEIGHT16) {
316                 // Variable primary or completely ignorable or NO_CE.
317                 leftQuaternary = (uint32_t)(ce >> 32);
318             } else {
319                 // Regular CE, not tertiary ignorable.
320                 // Preserve the quaternary weight in bits 7..6.
321                 leftQuaternary |= 0xffffff3f;
322             }
323         } while(leftQuaternary == 0);
324 
325         uint32_t rightQuaternary;
326         do {
327             int64_t ce = right.getCE(rightIndex++);
328             rightQuaternary = (uint32_t)ce & 0xffff;
329             if(rightQuaternary <= Collation::NO_CE_WEIGHT16) {
330                 // Variable primary or completely ignorable or NO_CE.
331                 rightQuaternary = (uint32_t)(ce >> 32);
332             } else {
333                 // Regular CE, not tertiary ignorable.
334                 // Preserve the quaternary weight in bits 7..6.
335                 rightQuaternary |= 0xffffff3f;
336             }
337         } while(rightQuaternary == 0);
338 
339         if(leftQuaternary != rightQuaternary) {
340             // Return the difference, with script reordering.
341             if(settings.hasReordering()) {
342                 leftQuaternary = settings.reorder(leftQuaternary);
343                 rightQuaternary = settings.reorder(rightQuaternary);
344             }
345             return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER;
346         }
347         if(leftQuaternary == Collation::NO_CE_PRIMARY) { break; }
348     }
349     return UCOL_EQUAL;
350 }
351 
352 U_NAMESPACE_END
353 
354 #endif  // !UCONFIG_NO_COLLATION
355