1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * collationcompare.cpp 9 * 10 * created on: 2012feb14 with new and old collation code 11 * created by: Markus W. Scherer 12 */ 13 14 #include "unicode/utypes.h" 15 16 #if !UCONFIG_NO_COLLATION 17 18 #include "unicode/ucol.h" 19 #include "cmemory.h" 20 #include "collation.h" 21 #include "collationcompare.h" 22 #include "collationiterator.h" 23 #include "collationsettings.h" 24 #include "uassert.h" 25 26 U_NAMESPACE_BEGIN 27 28 UCollationResult 29 CollationCompare::compareUpToQuaternary(CollationIterator &left, CollationIterator &right, 30 const CollationSettings &settings, 31 UErrorCode &errorCode) { 32 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } 33 34 int32_t options = settings.options; 35 uint32_t variableTop; 36 if((options & CollationSettings::ALTERNATE_MASK) == 0) { 37 variableTop = 0; 38 } else { 39 // +1 so that we can use "<" and primary ignorables test out early. 40 variableTop = settings.variableTop + 1; 41 } 42 UBool anyVariable = FALSE; 43 44 // Fetch CEs, compare primaries, store secondary & tertiary weights. 45 for(;;) { 46 // We fetch CEs until we get a non-ignorable primary or reach the end. 47 uint32_t leftPrimary; 48 do { 49 int64_t ce = left.nextCE(errorCode); 50 leftPrimary = (uint32_t)(ce >> 32); 51 if(leftPrimary < variableTop && leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY) { 52 // Variable CE, shift it to quaternary level. 53 // Ignore all following primary ignorables, and shift further variable CEs. 54 anyVariable = TRUE; 55 do { 56 // Store only the primary of the variable CE. 57 left.setCurrentCE(ce & INT64_C(0xffffffff00000000)); 58 for(;;) { 59 ce = left.nextCE(errorCode); 60 leftPrimary = (uint32_t)(ce >> 32); 61 if(leftPrimary == 0) { 62 left.setCurrentCE(0); 63 } else { 64 break; 65 } 66 } 67 } while(leftPrimary < variableTop && 68 leftPrimary > Collation::MERGE_SEPARATOR_PRIMARY); 69 } 70 } while(leftPrimary == 0); 71 72 uint32_t rightPrimary; 73 do { 74 int64_t ce = right.nextCE(errorCode); 75 rightPrimary = (uint32_t)(ce >> 32); 76 if(rightPrimary < variableTop && rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY) { 77 // Variable CE, shift it to quaternary level. 78 // Ignore all following primary ignorables, and shift further variable CEs. 79 anyVariable = TRUE; 80 do { 81 // Store only the primary of the variable CE. 82 right.setCurrentCE(ce & INT64_C(0xffffffff00000000)); 83 for(;;) { 84 ce = right.nextCE(errorCode); 85 rightPrimary = (uint32_t)(ce >> 32); 86 if(rightPrimary == 0) { 87 right.setCurrentCE(0); 88 } else { 89 break; 90 } 91 } 92 } while(rightPrimary < variableTop && 93 rightPrimary > Collation::MERGE_SEPARATOR_PRIMARY); 94 } 95 } while(rightPrimary == 0); 96 97 if(leftPrimary != rightPrimary) { 98 // Return the primary difference, with script reordering. 99 if(settings.hasReordering()) { 100 leftPrimary = settings.reorder(leftPrimary); 101 rightPrimary = settings.reorder(rightPrimary); 102 } 103 return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; 104 } 105 if(leftPrimary == Collation::NO_CE_PRIMARY) { break; } 106 } 107 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } 108 109 // Compare the buffered secondary & tertiary weights. 110 // We might skip the secondary level but continue with the case level 111 // which is turned on separately. 112 if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { 113 if((options & CollationSettings::BACKWARD_SECONDARY) == 0) { 114 int32_t leftIndex = 0; 115 int32_t rightIndex = 0; 116 for(;;) { 117 uint32_t leftSecondary; 118 do { 119 leftSecondary = ((uint32_t)left.getCE(leftIndex++)) >> 16; 120 } while(leftSecondary == 0); 121 122 uint32_t rightSecondary; 123 do { 124 rightSecondary = ((uint32_t)right.getCE(rightIndex++)) >> 16; 125 } while(rightSecondary == 0); 126 127 if(leftSecondary != rightSecondary) { 128 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; 129 } 130 if(leftSecondary == Collation::NO_CE_WEIGHT16) { break; } 131 } 132 } else { 133 // The backwards secondary level compares secondary weights backwards 134 // within segments separated by the merge separator (U+FFFE, weight 02). 135 int32_t leftStart = 0; 136 int32_t rightStart = 0; 137 for(;;) { 138 // Find the merge separator or the NO_CE terminator. 139 uint32_t p; 140 int32_t leftLimit = leftStart; 141 while((p = (uint32_t)(left.getCE(leftLimit) >> 32)) > 142 Collation::MERGE_SEPARATOR_PRIMARY || 143 p == 0) { 144 ++leftLimit; 145 } 146 int32_t rightLimit = rightStart; 147 while((p = (uint32_t)(right.getCE(rightLimit) >> 32)) > 148 Collation::MERGE_SEPARATOR_PRIMARY || 149 p == 0) { 150 ++rightLimit; 151 } 152 153 // Compare the segments. 154 int32_t leftIndex = leftLimit; 155 int32_t rightIndex = rightLimit; 156 for(;;) { 157 int32_t leftSecondary = 0; 158 while(leftSecondary == 0 && leftIndex > leftStart) { 159 leftSecondary = ((uint32_t)left.getCE(--leftIndex)) >> 16; 160 } 161 162 int32_t rightSecondary = 0; 163 while(rightSecondary == 0 && rightIndex > rightStart) { 164 rightSecondary = ((uint32_t)right.getCE(--rightIndex)) >> 16; 165 } 166 167 if(leftSecondary != rightSecondary) { 168 return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; 169 } 170 if(leftSecondary == 0) { break; } 171 } 172 173 // Did we reach the end of either string? 174 // Both strings have the same number of merge separators, 175 // or else there would have been a primary-level difference. 176 U_ASSERT(left.getCE(leftLimit) == right.getCE(rightLimit)); 177 if(p == Collation::NO_CE_PRIMARY) { break; } 178 // Skip both merge separators and continue. 179 leftStart = leftLimit + 1; 180 rightStart = rightLimit + 1; 181 } 182 } 183 } 184 185 if((options & CollationSettings::CASE_LEVEL) != 0) { 186 int32_t strength = CollationSettings::getStrength(options); 187 int32_t leftIndex = 0; 188 int32_t rightIndex = 0; 189 for(;;) { 190 uint32_t leftCase, leftLower32, rightCase; 191 if(strength == UCOL_PRIMARY) { 192 // Primary+caseLevel: Ignore case level weights of primary ignorables. 193 // Otherwise we would get a-umlaut > a 194 // which is not desirable for accent-insensitive sorting. 195 // Check for (lower 32 bits) == 0 as well because variable CEs are stored 196 // with only primary weights. 197 int64_t ce; 198 do { 199 ce = left.getCE(leftIndex++); 200 leftCase = (uint32_t)ce; 201 } while((uint32_t)(ce >> 32) == 0 || leftCase == 0); 202 leftLower32 = leftCase; 203 leftCase &= 0xc000; 204 205 do { 206 ce = right.getCE(rightIndex++); 207 rightCase = (uint32_t)ce; 208 } while((uint32_t)(ce >> 32) == 0 || rightCase == 0); 209 rightCase &= 0xc000; 210 } else { 211 // Secondary+caseLevel: By analogy with the above, 212 // ignore case level weights of secondary ignorables. 213 // 214 // Note: A tertiary CE has uppercase case bits (0.0.ut) 215 // to keep tertiary+caseFirst well-formed. 216 // 217 // Tertiary+caseLevel: Also ignore case level weights of secondary ignorables. 218 // Otherwise a tertiary CE's uppercase would be no greater than 219 // a primary/secondary CE's uppercase. 220 // (See UCA well-formedness condition 2.) 221 // We could construct a special case weight higher than uppercase, 222 // but it's simpler to always ignore case weights of secondary ignorables, 223 // turning 0.0.ut into 0.0.0.t. 224 // (See LDML Collation, Case Parameters.) 225 do { 226 leftCase = (uint32_t)left.getCE(leftIndex++); 227 } while(leftCase <= 0xffff); 228 leftLower32 = leftCase; 229 leftCase &= 0xc000; 230 231 do { 232 rightCase = (uint32_t)right.getCE(rightIndex++); 233 } while(rightCase <= 0xffff); 234 rightCase &= 0xc000; 235 } 236 237 // No need to handle NO_CE and MERGE_SEPARATOR specially: 238 // There is one case weight for each previous-level weight, 239 // so level length differences were handled there. 240 if(leftCase != rightCase) { 241 if((options & CollationSettings::UPPER_FIRST) == 0) { 242 return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; 243 } else { 244 return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; 245 } 246 } 247 if((leftLower32 >> 16) == Collation::NO_CE_WEIGHT16) { break; } 248 } 249 } 250 if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } 251 252 uint32_t tertiaryMask = CollationSettings::getTertiaryMask(options); 253 254 int32_t leftIndex = 0; 255 int32_t rightIndex = 0; 256 uint32_t anyQuaternaries = 0; 257 for(;;) { 258 uint32_t leftLower32, leftTertiary; 259 do { 260 leftLower32 = (uint32_t)left.getCE(leftIndex++); 261 anyQuaternaries |= leftLower32; 262 U_ASSERT((leftLower32 & Collation::ONLY_TERTIARY_MASK) != 0 || 263 (leftLower32 & 0xc0c0) == 0); 264 leftTertiary = leftLower32 & tertiaryMask; 265 } while(leftTertiary == 0); 266 267 uint32_t rightLower32, rightTertiary; 268 do { 269 rightLower32 = (uint32_t)right.getCE(rightIndex++); 270 anyQuaternaries |= rightLower32; 271 U_ASSERT((rightLower32 & Collation::ONLY_TERTIARY_MASK) != 0 || 272 (rightLower32 & 0xc0c0) == 0); 273 rightTertiary = rightLower32 & tertiaryMask; 274 } while(rightTertiary == 0); 275 276 if(leftTertiary != rightTertiary) { 277 if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { 278 // Pass through NO_CE and keep real tertiary weights larger than that. 279 // Do not change the artificial uppercase weight of a tertiary CE (0.0.ut), 280 // to keep tertiary CEs well-formed. 281 // Their case+tertiary weights must be greater than those of 282 // primary and secondary CEs. 283 if(leftTertiary > Collation::NO_CE_WEIGHT16) { 284 if(leftLower32 > 0xffff) { 285 leftTertiary ^= 0xc000; 286 } else { 287 leftTertiary += 0x4000; 288 } 289 } 290 if(rightTertiary > Collation::NO_CE_WEIGHT16) { 291 if(rightLower32 > 0xffff) { 292 rightTertiary ^= 0xc000; 293 } else { 294 rightTertiary += 0x4000; 295 } 296 } 297 } 298 return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; 299 } 300 if(leftTertiary == Collation::NO_CE_WEIGHT16) { break; } 301 } 302 if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } 303 304 if(!anyVariable && (anyQuaternaries & 0xc0) == 0) { 305 // If there are no "variable" CEs and no non-zero quaternary weights, 306 // then there are no quaternary differences. 307 return UCOL_EQUAL; 308 } 309 310 leftIndex = 0; 311 rightIndex = 0; 312 for(;;) { 313 uint32_t leftQuaternary; 314 do { 315 int64_t ce = left.getCE(leftIndex++); 316 leftQuaternary = (uint32_t)ce & 0xffff; 317 if(leftQuaternary <= Collation::NO_CE_WEIGHT16) { 318 // Variable primary or completely ignorable or NO_CE. 319 leftQuaternary = (uint32_t)(ce >> 32); 320 } else { 321 // Regular CE, not tertiary ignorable. 322 // Preserve the quaternary weight in bits 7..6. 323 leftQuaternary |= 0xffffff3f; 324 } 325 } while(leftQuaternary == 0); 326 327 uint32_t rightQuaternary; 328 do { 329 int64_t ce = right.getCE(rightIndex++); 330 rightQuaternary = (uint32_t)ce & 0xffff; 331 if(rightQuaternary <= Collation::NO_CE_WEIGHT16) { 332 // Variable primary or completely ignorable or NO_CE. 333 rightQuaternary = (uint32_t)(ce >> 32); 334 } else { 335 // Regular CE, not tertiary ignorable. 336 // Preserve the quaternary weight in bits 7..6. 337 rightQuaternary |= 0xffffff3f; 338 } 339 } while(rightQuaternary == 0); 340 341 if(leftQuaternary != rightQuaternary) { 342 // Return the difference, with script reordering. 343 if(settings.hasReordering()) { 344 leftQuaternary = settings.reorder(leftQuaternary); 345 rightQuaternary = settings.reorder(rightQuaternary); 346 } 347 return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; 348 } 349 if(leftQuaternary == Collation::NO_CE_PRIMARY) { break; } 350 } 351 return UCOL_EQUAL; 352 } 353 354 U_NAMESPACE_END 355 356 #endif // !UCONFIG_NO_COLLATION 357