1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  *******************************************************************************
5  * Copyright (C) 1996-2015, International Business Machines
6  * Corporation and others.  All Rights Reserved.
7  *******************************************************************************
8  * CollationCompare.java, ported from collationcompare.h/.cpp
9  *
10  * C++ version created on: 2012feb14 with new and old collation code
11  * created by: Markus W. Scherer
12  */
13 
14 package com.ibm.icu.impl.coll;
15 
16 import com.ibm.icu.text.Collator;
17 
18 public final class CollationCompare /* all static */ {
compareUpToQuaternary(CollationIterator left, CollationIterator right, CollationSettings settings)19     public static int compareUpToQuaternary(CollationIterator left, CollationIterator right,
20             CollationSettings settings) {
21         int options = settings.options;
22         long variableTop;
23         if ((options & CollationSettings.ALTERNATE_MASK) == 0) {
24             variableTop = 0;
25         } else {
26             // +1 so that we can use "<" and primary ignorables test out early.
27             variableTop = settings.variableTop + 1;
28         }
29         boolean anyVariable = false;
30 
31         // Fetch CEs, compare primaries, store secondary & tertiary weights.
32         for (;;) {
33             // We fetch CEs until we get a non-ignorable primary or reach the end.
34             long leftPrimary;
35             do {
36                 long ce = left.nextCE();
37                 leftPrimary = ce >>> 32;
38                 if (leftPrimary < variableTop && leftPrimary > Collation.MERGE_SEPARATOR_PRIMARY) {
39                     // Variable CE, shift it to quaternary level.
40                     // Ignore all following primary ignorables, and shift further variable CEs.
41                     anyVariable = true;
42                     do {
43                         // Store only the primary of the variable CE.
44                         left.setCurrentCE(ce & 0xffffffff00000000L);
45                         for (;;) {
46                             ce = left.nextCE();
47                             leftPrimary = ce >>> 32;
48                             if (leftPrimary == 0) {
49                                 left.setCurrentCE(0);
50                             } else {
51                                 break;
52                             }
53                         }
54                     } while (leftPrimary < variableTop && leftPrimary > Collation.MERGE_SEPARATOR_PRIMARY);
55                 }
56             } while (leftPrimary == 0);
57 
58             long rightPrimary;
59             do {
60                 long ce = right.nextCE();
61                 rightPrimary = ce >>> 32;
62                 if (rightPrimary < variableTop && rightPrimary > Collation.MERGE_SEPARATOR_PRIMARY) {
63                     // Variable CE, shift it to quaternary level.
64                     // Ignore all following primary ignorables, and shift further variable CEs.
65                     anyVariable = true;
66                     do {
67                         // Store only the primary of the variable CE.
68                         right.setCurrentCE(ce & 0xffffffff00000000L);
69                         for (;;) {
70                             ce = right.nextCE();
71                             rightPrimary = ce >>> 32;
72                             if (rightPrimary == 0) {
73                                 right.setCurrentCE(0);
74                             } else {
75                                 break;
76                             }
77                         }
78                     } while (rightPrimary < variableTop && rightPrimary > Collation.MERGE_SEPARATOR_PRIMARY);
79                 }
80             } while (rightPrimary == 0);
81 
82             if (leftPrimary != rightPrimary) {
83                 // Return the primary difference, with script reordering.
84                 if (settings.hasReordering()) {
85                     leftPrimary = settings.reorder(leftPrimary);
86                     rightPrimary = settings.reorder(rightPrimary);
87                 }
88                 return (leftPrimary < rightPrimary) ? Collation.LESS : Collation.GREATER;
89             }
90             if (leftPrimary == Collation.NO_CE_PRIMARY) {
91                 break;
92             }
93         }
94 
95         // Compare the buffered secondary & tertiary weights.
96         // We might skip the secondary level but continue with the case level
97         // which is turned on separately.
98         if (CollationSettings.getStrength(options) >= Collator.SECONDARY) {
99             if ((options & CollationSettings.BACKWARD_SECONDARY) == 0) {
100                 int leftIndex = 0;
101                 int rightIndex = 0;
102                 for (;;) {
103                     int leftSecondary;
104                     do {
105                         leftSecondary = ((int) left.getCE(leftIndex++)) >>> 16;
106                     } while (leftSecondary == 0);
107 
108                     int rightSecondary;
109                     do {
110                         rightSecondary = ((int) right.getCE(rightIndex++)) >>> 16;
111                     } while (rightSecondary == 0);
112 
113                     if (leftSecondary != rightSecondary) {
114                         return (leftSecondary < rightSecondary) ? Collation.LESS : Collation.GREATER;
115                     }
116                     if (leftSecondary == Collation.NO_CE_WEIGHT16) {
117                         break;
118                     }
119                 }
120             } else {
121                 // The backwards secondary level compares secondary weights backwards
122                 // within segments separated by the merge separator (U+FFFE, weight 02).
123                 int leftStart = 0;
124                 int rightStart = 0;
125                 for (;;) {
126                     // Find the merge separator or the NO_CE terminator.
127                     long p;
128                     int leftLimit = leftStart;
129                     while ((p = left.getCE(leftLimit) >>> 32) > Collation.MERGE_SEPARATOR_PRIMARY
130                             || p == 0) {
131                         ++leftLimit;
132                     }
133                     int rightLimit = rightStart;
134                     while ((p = right.getCE(rightLimit) >>> 32) > Collation.MERGE_SEPARATOR_PRIMARY
135                             || p == 0) {
136                         ++rightLimit;
137                     }
138 
139                     // Compare the segments.
140                     int leftIndex = leftLimit;
141                     int rightIndex = rightLimit;
142                     for (;;) {
143                         int leftSecondary = 0;
144                         while (leftSecondary == 0 && leftIndex > leftStart) {
145                             leftSecondary = ((int) left.getCE(--leftIndex)) >>> 16;
146                         }
147 
148                         int rightSecondary = 0;
149                         while (rightSecondary == 0 && rightIndex > rightStart) {
150                             rightSecondary = ((int) right.getCE(--rightIndex)) >>> 16;
151                         }
152 
153                         if (leftSecondary != rightSecondary) {
154                             return (leftSecondary < rightSecondary) ? Collation.LESS : Collation.GREATER;
155                         }
156                         if (leftSecondary == 0) {
157                             break;
158                         }
159                     }
160 
161                     // Did we reach the end of either string?
162                     // Both strings have the same number of merge separators,
163                     // or else there would have been a primary-level difference.
164                     assert (left.getCE(leftLimit) == right.getCE(rightLimit));
165                     if (p == Collation.NO_CE_PRIMARY) {
166                         break;
167                     }
168                     // Skip both merge separators and continue.
169                     leftStart = leftLimit + 1;
170                     rightStart = rightLimit + 1;
171                 }
172             }
173         }
174 
175         if ((options & CollationSettings.CASE_LEVEL) != 0) {
176             int strength = CollationSettings.getStrength(options);
177             int leftIndex = 0;
178             int rightIndex = 0;
179             for (;;) {
180                 int leftCase, leftLower32, rightCase;
181                 if (strength == Collator.PRIMARY) {
182                     // Primary+caseLevel: Ignore case level weights of primary ignorables.
183                     // Otherwise we would get a-umlaut > a
184                     // which is not desirable for accent-insensitive sorting.
185                     // Check for (lower 32 bits) == 0 as well because variable CEs are stored
186                     // with only primary weights.
187                     long ce;
188                     do {
189                         ce = left.getCE(leftIndex++);
190                         leftCase = (int) ce;
191                     } while ((ce >>> 32) == 0 || leftCase == 0);
192                     leftLower32 = leftCase;
193                     leftCase &= 0xc000;
194 
195                     do {
196                         ce = right.getCE(rightIndex++);
197                         rightCase = (int) ce;
198                     } while ((ce >>> 32) == 0 || rightCase == 0);
199                     rightCase &= 0xc000;
200                 } else {
201                     // Secondary+caseLevel: By analogy with the above,
202                     // ignore case level weights of secondary ignorables.
203                     //
204                     // Note: A tertiary CE has uppercase case bits (0.0.ut)
205                     // to keep tertiary+caseFirst well-formed.
206                     //
207                     // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables.
208                     // Otherwise a tertiary CE's uppercase would be no greater than
209                     // a primary/secondary CE's uppercase.
210                     // (See UCA well-formedness condition 2.)
211                     // We could construct a special case weight higher than uppercase,
212                     // but it's simpler to always ignore case weights of secondary ignorables,
213                     // turning 0.0.ut into 0.0.0.t.
214                     // (See LDML Collation, Case Parameters.)
215                     do {
216                         leftCase = (int) left.getCE(leftIndex++);
217                     } while ((leftCase & 0xffff0000) == 0);
218                     leftLower32 = leftCase;
219                     leftCase &= 0xc000;
220 
221                     do {
222                         rightCase = (int) right.getCE(rightIndex++);
223                     } while ((rightCase & 0xffff0000) == 0);
224                     rightCase &= 0xc000;
225                 }
226 
227                 // No need to handle NO_CE and MERGE_SEPARATOR specially:
228                 // There is one case weight for each previous-level weight,
229                 // so level length differences were handled there.
230                 if (leftCase != rightCase) {
231                     if ((options & CollationSettings.UPPER_FIRST) == 0) {
232                         return (leftCase < rightCase) ? Collation.LESS : Collation.GREATER;
233                     } else {
234                         return (leftCase < rightCase) ? Collation.GREATER : Collation.LESS;
235                     }
236                 }
237                 if ((leftLower32 >>> 16) == Collation.NO_CE_WEIGHT16) {
238                     break;
239                 }
240             }
241         }
242         if (CollationSettings.getStrength(options) <= Collator.SECONDARY) {
243             return Collation.EQUAL;
244         }
245 
246         int tertiaryMask = CollationSettings.getTertiaryMask(options);
247 
248         int leftIndex = 0;
249         int rightIndex = 0;
250         int anyQuaternaries = 0;
251         for (;;) {
252             int leftLower32, leftTertiary;
253             do {
254                 leftLower32 = (int) left.getCE(leftIndex++);
255                 anyQuaternaries |= leftLower32;
256                 assert ((leftLower32 & Collation.ONLY_TERTIARY_MASK) != 0 || (leftLower32 & 0xc0c0) == 0);
257                 leftTertiary = leftLower32 & tertiaryMask;
258             } while (leftTertiary == 0);
259 
260             int rightLower32, rightTertiary;
261             do {
262                 rightLower32 = (int) right.getCE(rightIndex++);
263                 anyQuaternaries |= rightLower32;
264                 assert ((rightLower32 & Collation.ONLY_TERTIARY_MASK) != 0 || (rightLower32 & 0xc0c0) == 0);
265                 rightTertiary = rightLower32 & tertiaryMask;
266             } while (rightTertiary == 0);
267 
268             if (leftTertiary != rightTertiary) {
269                 if (CollationSettings.sortsTertiaryUpperCaseFirst(options)) {
270                     // Pass through NO_CE and keep real tertiary weights larger than that.
271                     // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut),
272                     // to keep tertiary CEs well-formed.
273                     // Their case+tertiary weights must be greater than those of
274                     // primary and secondary CEs.
275                     if (leftTertiary > Collation.NO_CE_WEIGHT16) {
276                         if ((leftLower32 & 0xffff0000) != 0) {
277                             leftTertiary ^= 0xc000;
278                         } else {
279                             leftTertiary += 0x4000;
280                         }
281                     }
282                     if (rightTertiary > Collation.NO_CE_WEIGHT16) {
283                         if ((rightLower32 & 0xffff0000) != 0) {
284                             rightTertiary ^= 0xc000;
285                         } else {
286                             rightTertiary += 0x4000;
287                         }
288                     }
289                 }
290                 return (leftTertiary < rightTertiary) ? Collation.LESS : Collation.GREATER;
291             }
292             if (leftTertiary == Collation.NO_CE_WEIGHT16) {
293                 break;
294             }
295         }
296         if (CollationSettings.getStrength(options) <= Collator.TERTIARY) {
297             return Collation.EQUAL;
298         }
299 
300         if (!anyVariable && (anyQuaternaries & 0xc0) == 0) {
301             // If there are no "variable" CEs and no non-zero quaternary weights,
302             // then there are no quaternary differences.
303             return Collation.EQUAL;
304         }
305 
306         leftIndex = 0;
307         rightIndex = 0;
308         for (;;) {
309             long leftQuaternary;
310             do {
311                 long ce = left.getCE(leftIndex++);
312                 leftQuaternary = ce & 0xffff;
313                 if (leftQuaternary <= Collation.NO_CE_WEIGHT16) {
314                     // Variable primary or completely ignorable or NO_CE.
315                     leftQuaternary = ce >>> 32;
316                 } else {
317                     // Regular CE, not tertiary ignorable.
318                     // Preserve the quaternary weight in bits 7..6.
319                     leftQuaternary |= 0xffffff3fL;
320                 }
321             } while (leftQuaternary == 0);
322 
323             long rightQuaternary;
324             do {
325                 long ce = right.getCE(rightIndex++);
326                 rightQuaternary = ce & 0xffff;
327                 if (rightQuaternary <= Collation.NO_CE_WEIGHT16) {
328                     // Variable primary or completely ignorable or NO_CE.
329                     rightQuaternary = ce >>> 32;
330                 } else {
331                     // Regular CE, not tertiary ignorable.
332                     // Preserve the quaternary weight in bits 7..6.
333                     rightQuaternary |= 0xffffff3fL;
334                 }
335             } while (rightQuaternary == 0);
336 
337             if (leftQuaternary != rightQuaternary) {
338                 // Return the difference, with script reordering.
339                 if (settings.hasReordering()) {
340                     leftQuaternary = settings.reorder(leftQuaternary);
341                     rightQuaternary = settings.reorder(rightQuaternary);
342                 }
343                 return (leftQuaternary < rightQuaternary) ? Collation.LESS : Collation.GREATER;
344             }
345             if (leftQuaternary == Collation.NO_CE_PRIMARY) {
346                 break;
347             }
348         }
349         return Collation.EQUAL;
350     }
351 }
352