1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2010-2014, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * CollationIterator.java, ported from collationiterator.h/.cpp 9 * 10 * C++ version created on: 2010oct27 11 * created by: Markus W. Scherer 12 */ 13 14 package com.ibm.icu.impl.coll; 15 16 import com.ibm.icu.impl.Normalizer2Impl.Hangul; 17 import com.ibm.icu.impl.Trie2_32; 18 import com.ibm.icu.util.BytesTrie; 19 import com.ibm.icu.util.CharsTrie; 20 import com.ibm.icu.util.ICUException; 21 22 /** 23 * Collation element iterator and abstract character iterator. 24 * 25 * When a method returns a code point value, it must be in 0..10FFFF, 26 * except it can be negative as a sentinel value. 27 */ 28 public abstract class CollationIterator { 29 private static final class CEBuffer { 30 /** Large enough for CEs of most short strings. */ 31 private static final int INITIAL_CAPACITY = 40; 32 CEBuffer()33 CEBuffer() {} 34 append(long ce)35 void append(long ce) { 36 if(length >= INITIAL_CAPACITY) { 37 ensureAppendCapacity(1); 38 } 39 buffer[length++] = ce; 40 } 41 appendUnsafe(long ce)42 void appendUnsafe(long ce) { 43 buffer[length++] = ce; 44 } 45 ensureAppendCapacity(int appCap)46 void ensureAppendCapacity(int appCap) { 47 int capacity = buffer.length; 48 if((length + appCap) <= capacity) { return; } 49 do { 50 if(capacity < 1000) { 51 capacity *= 4; 52 } else { 53 capacity *= 2; 54 } 55 } while(capacity < (length + appCap)); 56 long[] newBuffer = new long[capacity]; 57 System.arraycopy(buffer, 0, newBuffer, 0, length); 58 buffer = newBuffer; 59 } 60 incLength()61 void incLength() { 62 // Use INITIAL_CAPACITY for a very simple fastpath. 63 // (Rather than buffer.getCapacity().) 64 if(length >= INITIAL_CAPACITY) { 65 ensureAppendCapacity(1); 66 } 67 ++length; 68 } 69 set(int i, long ce)70 long set(int i, long ce) { 71 return buffer[i] = ce; 72 } get(int i)73 long get(int i) { return buffer[i]; } 74 getCEs()75 long[] getCEs() { return buffer; } 76 77 int length = 0; 78 79 private long[] buffer = new long[INITIAL_CAPACITY]; 80 } 81 82 // State of combining marks skipped in discontiguous contraction. 83 // We create a state object on first use and keep it around deactivated between uses. 84 private static final class SkippedState { 85 // Born active but empty. SkippedState()86 SkippedState() {} clear()87 void clear() { 88 oldBuffer.setLength(0); 89 pos = 0; 90 // The newBuffer is reset by setFirstSkipped(). 91 } 92 isEmpty()93 boolean isEmpty() { return oldBuffer.length() == 0; } 94 hasNext()95 boolean hasNext() { return pos < oldBuffer.length(); } 96 97 // Requires hasNext(). next()98 int next() { 99 int c = oldBuffer.codePointAt(pos); 100 pos += Character.charCount(c); 101 return c; 102 } 103 104 // Accounts for one more input code point read beyond the end of the marks buffer. incBeyond()105 void incBeyond() { 106 assert(!hasNext()); 107 ++pos; 108 } 109 110 // Goes backward through the skipped-marks buffer. 111 // Returns the number of code points read beyond the skipped marks 112 // that need to be backtracked through normal input. backwardNumCodePoints(int n)113 int backwardNumCodePoints(int n) { 114 int length = oldBuffer.length(); 115 int beyond = pos - length; 116 if(beyond > 0) { 117 if(beyond >= n) { 118 // Not back far enough to re-enter the oldBuffer. 119 pos -= n; 120 return n; 121 } else { 122 // Back out all beyond-oldBuffer code points and re-enter the buffer. 123 pos = oldBuffer.offsetByCodePoints(length, beyond - n); 124 return beyond; 125 } 126 } else { 127 // Go backwards from inside the oldBuffer. 128 pos = oldBuffer.offsetByCodePoints(pos, -n); 129 return 0; 130 } 131 } 132 setFirstSkipped(int c)133 void setFirstSkipped(int c) { 134 skipLengthAtMatch = 0; 135 newBuffer.setLength(0); 136 newBuffer.appendCodePoint(c); 137 } 138 skip(int c)139 void skip(int c) { 140 newBuffer.appendCodePoint(c); 141 } 142 recordMatch()143 void recordMatch() { skipLengthAtMatch = newBuffer.length(); } 144 145 // Replaces the characters we consumed with the newly skipped ones. replaceMatch()146 void replaceMatch() { 147 // Note: UnicodeString.replace() pins pos to at most length(). 148 int oldLength = oldBuffer.length(); 149 if(pos > oldLength) { pos = oldLength; } 150 oldBuffer.delete(0, pos).insert(0, newBuffer, 0, skipLengthAtMatch); 151 pos = 0; 152 } 153 saveTrieState(CharsTrie trie)154 void saveTrieState(CharsTrie trie) { trie.saveState(state); } resetToTrieState(CharsTrie trie)155 void resetToTrieState(CharsTrie trie) { trie.resetToState(state); } 156 157 // Combining marks skipped in previous discontiguous-contraction matching. 158 // After that discontiguous contraction was completed, we start reading them from here. 159 private final StringBuilder oldBuffer = new StringBuilder(); 160 // Combining marks newly skipped in current discontiguous-contraction matching. 161 // These might have been read from the normal text or from the oldBuffer. 162 private final StringBuilder newBuffer = new StringBuilder(); 163 // Reading index in oldBuffer, 164 // or counter for how many code points have been read beyond oldBuffer (pos-oldBuffer.length()). 165 private int pos; 166 // newBuffer.length() at the time of the last matching character. 167 // When a partial match fails, we back out skipped and partial-matching input characters. 168 private int skipLengthAtMatch; 169 // We save the trie state before we attempt to match a character, 170 // so that we can skip it and try the next one. 171 private CharsTrie.State state = new CharsTrie.State(); 172 }; 173 174 /** 175 * Partially constructs the iterator. 176 * In Java, we cache partially constructed iterators 177 * and finish their setup when starting to work on text 178 * (via reset(boolean) and the setText(numeric, ...) methods of subclasses). 179 * This avoids memory allocations for iterators that remain unused. 180 * 181 * <p>In C++, there is only one constructor, and iterators are 182 * stack-allocated as needed. 183 */ CollationIterator(CollationData d)184 public CollationIterator(CollationData d) { 185 trie = d.trie; 186 data = d; 187 numCpFwd = -1; 188 isNumeric = false; 189 ceBuffer = null; 190 } 191 CollationIterator(CollationData d, boolean numeric)192 public CollationIterator(CollationData d, boolean numeric) { 193 trie = d.trie; 194 data = d; 195 numCpFwd = -1; 196 isNumeric = numeric; 197 ceBuffer = new CEBuffer(); 198 } 199 200 @Override equals(Object other)201 public boolean equals(Object other) { 202 // Subclasses: Call this method and then add more specific checks. 203 // Compare the iterator state but not the collation data (trie & data fields): 204 // Assume that the caller compares the data. 205 // Ignore skipped since that should be unused between calls to nextCE(). 206 // (It only stays around to avoid another memory allocation.) 207 if(other == null) { return false; } 208 if(!this.getClass().equals(other.getClass())) { return false; } 209 CollationIterator o = (CollationIterator)other; 210 if(!(ceBuffer.length == o.ceBuffer.length && 211 cesIndex == o.cesIndex && 212 numCpFwd == o.numCpFwd && 213 isNumeric == o.isNumeric)) { 214 return false; 215 } 216 for(int i = 0; i < ceBuffer.length; ++i) { 217 if(ceBuffer.get(i) != o.ceBuffer.get(i)) { return false; } 218 } 219 return true; 220 } 221 222 @Override hashCode()223 public int hashCode() { 224 // Dummy return to prevent compile warnings. 225 return 0; 226 } 227 228 /** 229 * Resets the iterator state and sets the position to the specified offset. 230 * Subclasses must implement, and must call the parent class method, 231 * or CollationIterator.reset(). 232 */ resetToOffset(int newOffset)233 public abstract void resetToOffset(int newOffset); 234 getOffset()235 public abstract int getOffset(); 236 237 /** 238 * Returns the next collation element. 239 */ nextCE()240 public final long nextCE() { 241 if(cesIndex < ceBuffer.length) { 242 // Return the next buffered CE. 243 return ceBuffer.get(cesIndex++); 244 } 245 assert cesIndex == ceBuffer.length; 246 ceBuffer.incLength(); 247 long cAndCE32 = handleNextCE32(); 248 int c = (int)(cAndCE32 >> 32); 249 int ce32 = (int)cAndCE32; 250 int t = ce32 & 0xff; 251 if(t < Collation.SPECIAL_CE32_LOW_BYTE) { // Forced-inline of isSpecialCE32(ce32). 252 // Normal CE from the main data. 253 // Forced-inline of ceFromSimpleCE32(ce32). 254 return ceBuffer.set(cesIndex++, 255 ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8)); 256 } 257 CollationData d; 258 // The compiler should be able to optimize the previous and the following 259 // comparisons of t with the same constant. 260 if(t == Collation.SPECIAL_CE32_LOW_BYTE) { 261 if(c < 0) { 262 return ceBuffer.set(cesIndex++, Collation.NO_CE); 263 } 264 d = data.base; 265 ce32 = d.getCE32(c); 266 t = ce32 & 0xff; 267 if(t < Collation.SPECIAL_CE32_LOW_BYTE) { 268 // Normal CE from the base data. 269 return ceBuffer.set(cesIndex++, 270 ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (t << 8)); 271 } 272 } else { 273 d = data; 274 } 275 if(t == Collation.LONG_PRIMARY_CE32_LOW_BYTE) { 276 // Forced-inline of ceFromLongPrimaryCE32(ce32). 277 return ceBuffer.set(cesIndex++, 278 ((long)(ce32 - t) << 32) | Collation.COMMON_SEC_AND_TER_CE); 279 } 280 return nextCEFromCE32(d, c, ce32); 281 } 282 283 /** 284 * Fetches all CEs. 285 * @return getCEsLength() 286 */ fetchCEs()287 public final int fetchCEs() { 288 while(nextCE() != Collation.NO_CE) { 289 // No need to loop for each expansion CE. 290 cesIndex = ceBuffer.length; 291 } 292 return ceBuffer.length; 293 } 294 295 /** 296 * Overwrites the current CE (the last one returned by nextCE()). 297 */ setCurrentCE(long ce)298 final void setCurrentCE(long ce) { 299 assert cesIndex > 0; 300 ceBuffer.set(cesIndex - 1, ce); 301 } 302 303 /** 304 * Returns the previous collation element. 305 */ previousCE(UVector32 offsets)306 public final long previousCE(UVector32 offsets) { 307 if(ceBuffer.length > 0) { 308 // Return the previous buffered CE. 309 return ceBuffer.get(--ceBuffer.length); 310 } 311 offsets.removeAllElements(); 312 int limitOffset = getOffset(); 313 int c = previousCodePoint(); 314 if(c < 0) { return Collation.NO_CE; } 315 if(data.isUnsafeBackward(c, isNumeric)) { 316 return previousCEUnsafe(c, offsets); 317 } 318 // Simple, safe-backwards iteration: 319 // Get a CE going backwards, handle prefixes but no contractions. 320 int ce32 = data.getCE32(c); 321 CollationData d; 322 if(ce32 == Collation.FALLBACK_CE32) { 323 d = data.base; 324 ce32 = d.getCE32(c); 325 } else { 326 d = data; 327 } 328 if(Collation.isSimpleOrLongCE32(ce32)) { 329 return Collation.ceFromCE32(ce32); 330 } 331 appendCEsFromCE32(d, c, ce32, false); 332 if(ceBuffer.length > 1) { 333 offsets.addElement(getOffset()); 334 // For an expansion, the offset of each non-initial CE is the limit offset, 335 // consistent with forward iteration. 336 while(offsets.size() <= ceBuffer.length) { 337 offsets.addElement(limitOffset); 338 }; 339 } 340 return ceBuffer.get(--ceBuffer.length); 341 } 342 getCEsLength()343 public final int getCEsLength() { 344 return ceBuffer.length; 345 } 346 getCE(int i)347 public final long getCE(int i) { 348 return ceBuffer.get(i); 349 } 350 getCEs()351 public final long[] getCEs() { 352 return ceBuffer.getCEs(); 353 } 354 clearCEs()355 final void clearCEs() { 356 cesIndex = ceBuffer.length = 0; 357 } 358 clearCEsIfNoneRemaining()359 public final void clearCEsIfNoneRemaining() { 360 if(cesIndex == ceBuffer.length) { clearCEs(); } 361 } 362 363 /** 364 * Returns the next code point (with post-increment). 365 * Public for identical-level comparison and for testing. 366 */ nextCodePoint()367 public abstract int nextCodePoint(); 368 369 /** 370 * Returns the previous code point (with pre-decrement). 371 * Public for identical-level comparison and for testing. 372 */ previousCodePoint()373 public abstract int previousCodePoint(); 374 reset()375 protected final void reset() { 376 cesIndex = ceBuffer.length = 0; 377 if(skipped != null) { skipped.clear(); } 378 } 379 /** 380 * Resets the state as well as the numeric setting, 381 * and completes the initialization. 382 * Only exists in Java where we reset cached CollationIterator instances 383 * rather than stack-allocating temporary ones. 384 * (See also the constructor comments.) 385 */ reset(boolean numeric)386 protected final void reset(boolean numeric) { 387 if(ceBuffer == null) { 388 ceBuffer = new CEBuffer(); 389 } 390 reset(); 391 isNumeric = numeric; 392 } 393 394 /** 395 * Returns the next code point and its local CE32 value. 396 * Returns Collation.FALLBACK_CE32 at the end of the text (c<0) 397 * or when c's CE32 value is to be looked up in the base data (fallback). 398 * 399 * The code point is used for fallbacks, context and implicit weights. 400 * It is ignored when the returned CE32 is not special (e.g., FFFD_CE32). 401 * 402 * Returns the code point in bits 63..32 (signed) and the CE32 in bits 31..0. 403 */ handleNextCE32()404 protected long handleNextCE32() { 405 int c = nextCodePoint(); 406 if(c < 0) { return NO_CP_AND_CE32; } 407 return makeCodePointAndCE32Pair(c, data.getCE32(c)); 408 } makeCodePointAndCE32Pair(int c, int ce32)409 protected long makeCodePointAndCE32Pair(int c, int ce32) { 410 return ((long)c << 32) | (ce32 & 0xffffffffL); 411 } 412 protected static final long NO_CP_AND_CE32 = (-1L << 32) | (Collation.FALLBACK_CE32 & 0xffffffffL); 413 414 /** 415 * Called when handleNextCE32() returns a LEAD_SURROGATE_TAG for a lead surrogate code unit. 416 * Returns the trail surrogate in that case and advances past it, 417 * if a trail surrogate follows the lead surrogate. 418 * Otherwise returns any other code unit and does not advance. 419 */ handleGetTrailSurrogate()420 protected char handleGetTrailSurrogate() { 421 return 0; 422 } 423 424 /** 425 * Called when handleNextCE32() returns with c==0, to see whether it is a NUL terminator. 426 * (Not needed in Java.) 427 */ 428 /*protected boolean foundNULTerminator() { 429 return false; 430 }*/ 431 432 /** 433 * @return false if surrogate code points U+D800..U+DFFF 434 * map to their own implicit primary weights (for UTF-16), 435 * or true if they map to CE(U+FFFD) (for UTF-8) 436 */ forbidSurrogateCodePoints()437 protected boolean forbidSurrogateCodePoints() { 438 return false; 439 } 440 forwardNumCodePoints(int num)441 protected abstract void forwardNumCodePoints(int num); 442 backwardNumCodePoints(int num)443 protected abstract void backwardNumCodePoints(int num); 444 445 /** 446 * Returns the CE32 from the data trie. 447 * Normally the same as data.getCE32(), but overridden in the builder. 448 * Call this only when the faster data.getCE32() cannot be used. 449 */ getDataCE32(int c)450 protected int getDataCE32(int c) { 451 return data.getCE32(c); 452 } 453 getCE32FromBuilderData(int ce32)454 protected int getCE32FromBuilderData(int ce32) { 455 throw new ICUException("internal program error: should be unreachable"); 456 } 457 appendCEsFromCE32(CollationData d, int c, int ce32, boolean forward)458 protected final void appendCEsFromCE32(CollationData d, int c, int ce32, 459 boolean forward) { 460 while(Collation.isSpecialCE32(ce32)) { 461 switch(Collation.tagFromCE32(ce32)) { 462 case Collation.FALLBACK_TAG: 463 case Collation.RESERVED_TAG_3: 464 throw new ICUException("internal program error: should be unreachable"); 465 case Collation.LONG_PRIMARY_TAG: 466 ceBuffer.append(Collation.ceFromLongPrimaryCE32(ce32)); 467 return; 468 case Collation.LONG_SECONDARY_TAG: 469 ceBuffer.append(Collation.ceFromLongSecondaryCE32(ce32)); 470 return; 471 case Collation.LATIN_EXPANSION_TAG: 472 ceBuffer.ensureAppendCapacity(2); 473 ceBuffer.set(ceBuffer.length, Collation.latinCE0FromCE32(ce32)); 474 ceBuffer.set(ceBuffer.length + 1, Collation.latinCE1FromCE32(ce32)); 475 ceBuffer.length += 2; 476 return; 477 case Collation.EXPANSION32_TAG: { 478 int index = Collation.indexFromCE32(ce32); 479 int length = Collation.lengthFromCE32(ce32); 480 ceBuffer.ensureAppendCapacity(length); 481 do { 482 ceBuffer.appendUnsafe(Collation.ceFromCE32(d.ce32s[index++])); 483 } while(--length > 0); 484 return; 485 } 486 case Collation.EXPANSION_TAG: { 487 int index = Collation.indexFromCE32(ce32); 488 int length = Collation.lengthFromCE32(ce32); 489 ceBuffer.ensureAppendCapacity(length); 490 do { 491 ceBuffer.appendUnsafe(d.ces[index++]); 492 } while(--length > 0); 493 return; 494 } 495 case Collation.BUILDER_DATA_TAG: 496 ce32 = getCE32FromBuilderData(ce32); 497 if(ce32 == Collation.FALLBACK_CE32) { 498 d = data.base; 499 ce32 = d.getCE32(c); 500 } 501 break; 502 case Collation.PREFIX_TAG: 503 if(forward) { backwardNumCodePoints(1); } 504 ce32 = getCE32FromPrefix(d, ce32); 505 if(forward) { forwardNumCodePoints(1); } 506 break; 507 case Collation.CONTRACTION_TAG: { 508 int index = Collation.indexFromCE32(ce32); 509 int defaultCE32 = d.getCE32FromContexts(index); // Default if no suffix match. 510 if(!forward) { 511 // Backward contractions are handled by previousCEUnsafe(). 512 // c has contractions but they were not found. 513 ce32 = defaultCE32; 514 break; 515 } 516 int nextCp; 517 if(skipped == null && numCpFwd < 0) { 518 // Some portion of nextCE32FromContraction() pulled out here as an ASCII fast path, 519 // avoiding the function call and the nextSkippedCodePoint() overhead. 520 nextCp = nextCodePoint(); 521 if(nextCp < 0) { 522 // No more text. 523 ce32 = defaultCE32; 524 break; 525 } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 && 526 !CollationFCD.mayHaveLccc(nextCp)) { 527 // All contraction suffixes start with characters with lccc!=0 528 // but the next code point has lccc==0. 529 backwardNumCodePoints(1); 530 ce32 = defaultCE32; 531 break; 532 } 533 } else { 534 nextCp = nextSkippedCodePoint(); 535 if(nextCp < 0) { 536 // No more text. 537 ce32 = defaultCE32; 538 break; 539 } else if((ce32 & Collation.CONTRACT_NEXT_CCC) != 0 && 540 !CollationFCD.mayHaveLccc(nextCp)) { 541 // All contraction suffixes start with characters with lccc!=0 542 // but the next code point has lccc==0. 543 backwardNumSkipped(1); 544 ce32 = defaultCE32; 545 break; 546 } 547 } 548 ce32 = nextCE32FromContraction(d, ce32, d.contexts, index + 2, defaultCE32, nextCp); 549 if(ce32 == Collation.NO_CE32) { 550 // CEs from a discontiguous contraction plus the skipped combining marks 551 // have been appended already. 552 return; 553 } 554 break; 555 } 556 case Collation.DIGIT_TAG: 557 if(isNumeric) { 558 appendNumericCEs(ce32, forward); 559 return; 560 } else { 561 // Fetch the non-numeric-collation CE32 and continue. 562 ce32 = d.ce32s[Collation.indexFromCE32(ce32)]; 563 break; 564 } 565 case Collation.U0000_TAG: 566 assert(c == 0); 567 // NUL-terminated input not supported in Java. 568 // Fetch the normal ce32 for U+0000 and continue. 569 ce32 = d.ce32s[0]; 570 break; 571 case Collation.HANGUL_TAG: { 572 int[] jamoCE32s = d.jamoCE32s; 573 c -= Hangul.HANGUL_BASE; 574 int t = c % Hangul.JAMO_T_COUNT; 575 c /= Hangul.JAMO_T_COUNT; 576 int v = c % Hangul.JAMO_V_COUNT; 577 c /= Hangul.JAMO_V_COUNT; 578 if((ce32 & Collation.HANGUL_NO_SPECIAL_JAMO) != 0) { 579 // None of the Jamo CE32s are isSpecialCE32(). 580 // Avoid recursive function calls and per-Jamo tests. 581 ceBuffer.ensureAppendCapacity(t == 0 ? 2 : 3); 582 ceBuffer.set(ceBuffer.length, Collation.ceFromCE32(jamoCE32s[c])); 583 ceBuffer.set(ceBuffer.length + 1, Collation.ceFromCE32(jamoCE32s[19 + v])); 584 ceBuffer.length += 2; 585 if(t != 0) { 586 ceBuffer.appendUnsafe(Collation.ceFromCE32(jamoCE32s[39 + t])); 587 } 588 return; 589 } else { 590 // We should not need to compute each Jamo code point. 591 // In particular, there should be no offset or implicit ce32. 592 appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[c], forward); 593 appendCEsFromCE32(d, Collation.SENTINEL_CP, jamoCE32s[19 + v], forward); 594 if(t == 0) { return; } 595 // offset 39 = 19 + 21 - 1: 596 // 19 = JAMO_L_COUNT 597 // 21 = JAMO_T_COUNT 598 // -1 = omit t==0 599 ce32 = jamoCE32s[39 + t]; 600 c = Collation.SENTINEL_CP; 601 break; 602 } 603 } 604 case Collation.LEAD_SURROGATE_TAG: { 605 assert(forward); // Backward iteration should never see lead surrogate code _unit_ data. 606 assert(isLeadSurrogate(c)); 607 char trail; 608 if(Character.isLowSurrogate(trail = handleGetTrailSurrogate())) { 609 c = Character.toCodePoint((char)c, trail); 610 ce32 &= Collation.LEAD_TYPE_MASK; 611 if(ce32 == Collation.LEAD_ALL_UNASSIGNED) { 612 ce32 = Collation.UNASSIGNED_CE32; // unassigned-implicit 613 } else if(ce32 == Collation.LEAD_ALL_FALLBACK || 614 (ce32 = d.getCE32FromSupplementary(c)) == Collation.FALLBACK_CE32) { 615 // fall back to the base data 616 d = d.base; 617 ce32 = d.getCE32FromSupplementary(c); 618 } 619 } else { 620 // c is an unpaired surrogate. 621 ce32 = Collation.UNASSIGNED_CE32; 622 } 623 break; 624 } 625 case Collation.OFFSET_TAG: 626 assert(c >= 0); 627 ceBuffer.append(d.getCEFromOffsetCE32(c, ce32)); 628 return; 629 case Collation.IMPLICIT_TAG: 630 assert(c >= 0); 631 if(isSurrogate(c) && forbidSurrogateCodePoints()) { 632 ce32 = Collation.FFFD_CE32; 633 break; 634 } else { 635 ceBuffer.append(Collation.unassignedCEFromCodePoint(c)); 636 return; 637 } 638 } 639 } 640 ceBuffer.append(Collation.ceFromSimpleCE32(ce32)); 641 } 642 643 // TODO: Propose widening the UTF16 method. isSurrogate(int c)644 private static final boolean isSurrogate(int c) { 645 return (c & 0xfffff800) == 0xd800; 646 } 647 648 // TODO: Propose widening the UTF16 method. isLeadSurrogate(int c)649 protected static final boolean isLeadSurrogate(int c) { 650 return (c & 0xfffffc00) == 0xd800; 651 } 652 653 // TODO: Propose widening the UTF16 method. isTrailSurrogate(int c)654 protected static final boolean isTrailSurrogate(int c) { 655 return (c & 0xfffffc00) == 0xdc00; 656 } 657 658 // Main lookup trie of the data object. 659 protected final Trie2_32 trie; 660 protected final CollationData data; 661 nextCEFromCE32(CollationData d, int c, int ce32)662 private final long nextCEFromCE32(CollationData d, int c, int ce32) { 663 --ceBuffer.length; // Undo ceBuffer.incLength(). 664 appendCEsFromCE32(d, c, ce32, true); 665 return ceBuffer.get(cesIndex++); 666 } 667 getCE32FromPrefix(CollationData d, int ce32)668 private final int getCE32FromPrefix(CollationData d, int ce32) { 669 int index = Collation.indexFromCE32(ce32); 670 ce32 = d.getCE32FromContexts(index); // Default if no prefix match. 671 index += 2; 672 // Number of code points read before the original code point. 673 int lookBehind = 0; 674 CharsTrie prefixes = new CharsTrie(d.contexts, index); 675 for(;;) { 676 int c = previousCodePoint(); 677 if(c < 0) { break; } 678 ++lookBehind; 679 BytesTrie.Result match = prefixes.nextForCodePoint(c); 680 if(match.hasValue()) { 681 ce32 = prefixes.getValue(); 682 } 683 if(!match.hasNext()) { break; } 684 } 685 forwardNumCodePoints(lookBehind); 686 return ce32; 687 } 688 nextSkippedCodePoint()689 private final int nextSkippedCodePoint() { 690 if(skipped != null && skipped.hasNext()) { return skipped.next(); } 691 if(numCpFwd == 0) { return Collation.SENTINEL_CP; } 692 int c = nextCodePoint(); 693 if(skipped != null && !skipped.isEmpty() && c >= 0) { skipped.incBeyond(); } 694 if(numCpFwd > 0 && c >= 0) { --numCpFwd; } 695 return c; 696 } 697 backwardNumSkipped(int n)698 private final void backwardNumSkipped(int n) { 699 if(skipped != null && !skipped.isEmpty()) { 700 n = skipped.backwardNumCodePoints(n); 701 } 702 backwardNumCodePoints(n); 703 if(numCpFwd >= 0) { numCpFwd += n; } 704 } 705 nextCE32FromContraction( CollationData d, int contractionCE32, CharSequence trieChars, int trieOffset, int ce32, int c)706 private final int nextCE32FromContraction( 707 CollationData d, int contractionCE32, 708 CharSequence trieChars, int trieOffset, int ce32, int c) { 709 // c: next code point after the original one 710 711 // Number of code points read beyond the original code point. 712 // Needed for discontiguous contraction matching. 713 int lookAhead = 1; 714 // Number of code points read since the last match (initially only c). 715 int sinceMatch = 1; 716 // Normally we only need a contiguous match, 717 // and therefore need not remember the suffixes state from before a mismatch for retrying. 718 // If we are already processing skipped combining marks, then we do track the state. 719 CharsTrie suffixes = new CharsTrie(trieChars, trieOffset); 720 if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); } 721 BytesTrie.Result match = suffixes.firstForCodePoint(c); 722 for(;;) { 723 int nextCp; 724 if(match.hasValue()) { 725 ce32 = suffixes.getValue(); 726 if(!match.hasNext() || (c = nextSkippedCodePoint()) < 0) { 727 return ce32; 728 } 729 if(skipped != null && !skipped.isEmpty()) { skipped.saveTrieState(suffixes); } 730 sinceMatch = 1; 731 } else if(match == BytesTrie.Result.NO_MATCH || (nextCp = nextSkippedCodePoint()) < 0) { 732 // No match for c, or partial match (BytesTrie.Result.NO_VALUE) and no further text. 733 // Back up if necessary, and try a discontiguous contraction. 734 if((contractionCE32 & Collation.CONTRACT_TRAILING_CCC) != 0 && 735 // Discontiguous contraction matching extends an existing match. 736 // If there is no match yet, then there is nothing to do. 737 ((contractionCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) == 0 || 738 sinceMatch < lookAhead)) { 739 // The last character of at least one suffix has lccc!=0, 740 // allowing for discontiguous contractions. 741 // UCA S2.1.1 only processes non-starters immediately following 742 // "a match in the table" (sinceMatch=1). 743 if(sinceMatch > 1) { 744 // Return to the state after the last match. 745 // (Return to sinceMatch=0 and re-fetch the first partially-matched character.) 746 backwardNumSkipped(sinceMatch); 747 c = nextSkippedCodePoint(); 748 lookAhead -= sinceMatch - 1; 749 sinceMatch = 1; 750 } 751 if(d.getFCD16(c) > 0xff) { 752 return nextCE32FromDiscontiguousContraction( 753 d, suffixes, ce32, lookAhead, c); 754 } 755 } 756 break; 757 } else { 758 // Continue after partial match (BytesTrie.Result.NO_VALUE) for c. 759 // It does not have a result value, therefore it is not itself "a match in the table". 760 // If a partially-matched c has ccc!=0 then 761 // it might be skipped in discontiguous contraction. 762 c = nextCp; 763 ++sinceMatch; 764 } 765 ++lookAhead; 766 match = suffixes.nextForCodePoint(c); 767 } 768 backwardNumSkipped(sinceMatch); 769 return ce32; 770 } 771 nextCE32FromDiscontiguousContraction( CollationData d, CharsTrie suffixes, int ce32, int lookAhead, int c)772 private final int nextCE32FromDiscontiguousContraction( 773 CollationData d, CharsTrie suffixes, int ce32, 774 int lookAhead, int c) { 775 // UCA section 3.3.2 Contractions: 776 // Contractions that end with non-starter characters 777 // are known as discontiguous contractions. 778 // ... discontiguous contractions must be detected in input text 779 // whenever the final sequence of non-starter characters could be rearranged 780 // so as to make a contiguous matching sequence that is canonically equivalent. 781 782 // UCA: http://www.unicode.org/reports/tr10/#S2.1 783 // S2.1 Find the longest initial substring S at each point that has a match in the table. 784 // S2.1.1 If there are any non-starters following S, process each non-starter C. 785 // S2.1.2 If C is not blocked from S, find if S + C has a match in the table. 786 // Note: A non-starter in a string is called blocked 787 // if there is another non-starter of the same canonical combining class or zero 788 // between it and the last character of canonical combining class 0. 789 // S2.1.3 If there is a match, replace S by S + C, and remove C. 790 791 // First: Is a discontiguous contraction even possible? 792 int fcd16 = d.getFCD16(c); 793 assert(fcd16 > 0xff); // The caller checked this already, as a shortcut. 794 int nextCp = nextSkippedCodePoint(); 795 if(nextCp < 0) { 796 // No further text. 797 backwardNumSkipped(1); 798 return ce32; 799 } 800 ++lookAhead; 801 int prevCC = fcd16 & 0xff; 802 fcd16 = d.getFCD16(nextCp); 803 if(fcd16 <= 0xff) { 804 // The next code point after c is a starter (S2.1.1 "process each non-starter"). 805 backwardNumSkipped(2); 806 return ce32; 807 } 808 809 // We have read and matched (lookAhead-2) code points, 810 // read non-matching c and peeked ahead at nextCp. 811 // Return to the state before the mismatch and continue matching with nextCp. 812 if(skipped == null || skipped.isEmpty()) { 813 if(skipped == null) { 814 skipped = new SkippedState(); 815 } 816 suffixes.reset(); 817 if(lookAhead > 2) { 818 // Replay the partial match so far. 819 backwardNumCodePoints(lookAhead); 820 suffixes.firstForCodePoint(nextCodePoint()); 821 for(int i = 3; i < lookAhead; ++i) { 822 suffixes.nextForCodePoint(nextCodePoint()); 823 } 824 // Skip c (which did not match) and nextCp (which we will try now). 825 forwardNumCodePoints(2); 826 } 827 skipped.saveTrieState(suffixes); 828 } else { 829 // Reset to the trie state before the failed match of c. 830 skipped.resetToTrieState(suffixes); 831 } 832 833 skipped.setFirstSkipped(c); 834 // Number of code points read since the last match (at this point: c and nextCp). 835 int sinceMatch = 2; 836 c = nextCp; 837 for(;;) { 838 BytesTrie.Result match; 839 // "If C is not blocked from S, find if S + C has a match in the table." (S2.1.2) 840 if(prevCC < (fcd16 >> 8) && (match = suffixes.nextForCodePoint(c)).hasValue()) { 841 // "If there is a match, replace S by S + C, and remove C." (S2.1.3) 842 // Keep prevCC unchanged. 843 ce32 = suffixes.getValue(); 844 sinceMatch = 0; 845 skipped.recordMatch(); 846 if(!match.hasNext()) { break; } 847 skipped.saveTrieState(suffixes); 848 } else { 849 // No match for "S + C", skip C. 850 skipped.skip(c); 851 skipped.resetToTrieState(suffixes); 852 prevCC = fcd16 & 0xff; 853 } 854 if((c = nextSkippedCodePoint()) < 0) { break; } 855 ++sinceMatch; 856 fcd16 = d.getFCD16(c); 857 if(fcd16 <= 0xff) { 858 // The next code point after c is a starter (S2.1.1 "process each non-starter"). 859 break; 860 } 861 } 862 backwardNumSkipped(sinceMatch); 863 boolean isTopDiscontiguous = skipped.isEmpty(); 864 skipped.replaceMatch(); 865 if(isTopDiscontiguous && !skipped.isEmpty()) { 866 // We did get a match after skipping one or more combining marks, 867 // and we are not in a recursive discontiguous contraction. 868 // Append CEs from the contraction ce32 869 // and then from the combining marks that we skipped before the match. 870 c = Collation.SENTINEL_CP; 871 for(;;) { 872 appendCEsFromCE32(d, c, ce32, true); 873 // Fetch CE32s for skipped combining marks from the normal data, with fallback, 874 // rather than from the CollationData where we found the contraction. 875 if(!skipped.hasNext()) { break; } 876 c = skipped.next(); 877 ce32 = getDataCE32(c); 878 if(ce32 == Collation.FALLBACK_CE32) { 879 d = data.base; 880 ce32 = d.getCE32(c); 881 } else { 882 d = data; 883 } 884 // Note: A nested discontiguous-contraction match 885 // replaces consumed combining marks with newly skipped ones 886 // and resets the reading position to the beginning. 887 } 888 skipped.clear(); 889 ce32 = Collation.NO_CE32; // Signal to the caller that the result is in the ceBuffer. 890 } 891 return ce32; 892 } 893 894 /** 895 * Returns the previous CE when data.isUnsafeBackward(c, isNumeric). 896 */ previousCEUnsafe(int c, UVector32 offsets)897 private final long previousCEUnsafe(int c, UVector32 offsets) { 898 // We just move through the input counting safe and unsafe code points 899 // without collecting the unsafe-backward substring into a buffer and 900 // switching to it. 901 // This is to keep the logic simple. Otherwise we would have to handle 902 // prefix matching going before the backward buffer, switching 903 // to iteration and back, etc. 904 // In the most important case of iterating over a normal string, 905 // reading from the string itself is already maximally fast. 906 // The only drawback there is that after getting the CEs we always 907 // skip backward to the safe character rather than switching out 908 // of a backwardBuffer. 909 // But this should not be the common case for previousCE(), 910 // and correctness and maintainability are more important than 911 // complex optimizations. 912 // Find the first safe character before c. 913 int numBackward = 1; 914 while((c = previousCodePoint()) >= 0) { 915 ++numBackward; 916 if(!data.isUnsafeBackward(c, isNumeric)) { 917 break; 918 } 919 } 920 // Set the forward iteration limit. 921 // Note: This counts code points. 922 // We cannot enforce a limit in the middle of a surrogate pair or similar. 923 numCpFwd = numBackward; 924 // Reset the forward iterator. 925 cesIndex = 0; 926 assert(ceBuffer.length == 0); 927 // Go forward and collect the CEs. 928 int offset = getOffset(); 929 while(numCpFwd > 0) { 930 // nextCE() normally reads one code point. 931 // Contraction matching and digit specials read more and check numCpFwd. 932 --numCpFwd; 933 // Append one or more CEs to the ceBuffer. 934 nextCE(); 935 assert(ceBuffer.get(ceBuffer.length - 1) != Collation.NO_CE); 936 // No need to loop for getting each expansion CE from nextCE(). 937 cesIndex = ceBuffer.length; 938 // However, we need to write an offset for each CE. 939 // This is for CollationElementIterator.getOffset() to return 940 // intermediate offsets from the unsafe-backwards segment. 941 assert(offsets.size() < ceBuffer.length); 942 offsets.addElement(offset); 943 // For an expansion, the offset of each non-initial CE is the limit offset, 944 // consistent with forward iteration. 945 offset = getOffset(); 946 while(offsets.size() < ceBuffer.length) { 947 offsets.addElement(offset); 948 }; 949 } 950 assert(offsets.size() == ceBuffer.length); 951 // End offset corresponding to just after the unsafe-backwards segment. 952 offsets.addElement(offset); 953 // Reset the forward iteration limit 954 // and move backward to before the segment for which we fetched CEs. 955 numCpFwd = -1; 956 backwardNumCodePoints(numBackward); 957 // Use the collected CEs and return the last one. 958 cesIndex = 0; // Avoid cesIndex > ceBuffer.length when that gets decremented. 959 return ceBuffer.get(--ceBuffer.length); 960 } 961 962 /** 963 * Turns a string of digits (bytes 0..9) 964 * into a sequence of CEs that will sort in numeric order. 965 * 966 * Starts from this ce32's digit value and consumes the following/preceding digits. 967 * The digits string must not be empty and must not have leading zeros. 968 */ 969 private final void appendNumericCEs(int ce32, boolean forward) { 970 // Collect digits. 971 // TODO: Use some kind of a byte buffer? We only store values 0..9. 972 StringBuilder digits = new StringBuilder(); 973 if(forward) { 974 for(;;) { 975 char digit = Collation.digitFromCE32(ce32); 976 digits.append(digit); 977 if(numCpFwd == 0) { break; } 978 int c = nextCodePoint(); 979 if(c < 0) { break; } 980 ce32 = data.getCE32(c); 981 if(ce32 == Collation.FALLBACK_CE32) { 982 ce32 = data.base.getCE32(c); 983 } 984 if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) { 985 backwardNumCodePoints(1); 986 break; 987 } 988 if(numCpFwd > 0) { --numCpFwd; } 989 } 990 } else { 991 for(;;) { 992 char digit = Collation.digitFromCE32(ce32); 993 digits.append(digit); 994 int c = previousCodePoint(); 995 if(c < 0) { break; } 996 ce32 = data.getCE32(c); 997 if(ce32 == Collation.FALLBACK_CE32) { 998 ce32 = data.base.getCE32(c); 999 } 1000 if(!Collation.hasCE32Tag(ce32, Collation.DIGIT_TAG)) { 1001 forwardNumCodePoints(1); 1002 break; 1003 } 1004 } 1005 // Reverse the digit string. 1006 digits.reverse(); 1007 } 1008 int pos = 0; 1009 do { 1010 // Skip leading zeros. 1011 while(pos < (digits.length() - 1) && digits.charAt(pos) == 0) { ++pos; } 1012 // Write a sequence of CEs for at most 254 digits at a time. 1013 int segmentLength = digits.length() - pos; 1014 if(segmentLength > 254) { segmentLength = 254; } 1015 appendNumericSegmentCEs(digits.subSequence(pos, pos + segmentLength)); 1016 pos += segmentLength; 1017 } while(pos < digits.length()); 1018 } 1019 1020 /** 1021 * Turns 1..254 digits into a sequence of CEs. 1022 * Called by appendNumericCEs() for each segment of at most 254 digits. 1023 */ 1024 private final void appendNumericSegmentCEs(CharSequence digits) { 1025 int length = digits.length(); 1026 assert(1 <= length && length <= 254); 1027 assert(length == 1 || digits.charAt(0) != 0); 1028 long numericPrimary = data.numericPrimary; 1029 // Note: We use primary byte values 2..255: digits are not compressible. 1030 if(length <= 7) { 1031 // Very dense encoding for small numbers. 1032 int value = digits.charAt(0); 1033 for(int i = 1; i < length; ++i) { 1034 value = value * 10 + digits.charAt(i); 1035 } 1036 // Primary weight second byte values: 1037 // 74 byte values 2.. 75 for small numbers in two-byte primary weights. 1038 // 40 byte values 76..115 for medium numbers in three-byte primary weights. 1039 // 16 byte values 116..131 for large numbers in four-byte primary weights. 1040 // 124 byte values 132..255 for very large numbers with 4..127 digit pairs. 1041 int firstByte = 2; 1042 int numBytes = 74; 1043 if(value < numBytes) { 1044 // Two-byte primary for 0..73, good for day & month numbers etc. 1045 long primary = numericPrimary | ((firstByte + value) << 16); 1046 ceBuffer.append(Collation.makeCE(primary)); 1047 return; 1048 } 1049 value -= numBytes; 1050 firstByte += numBytes; 1051 numBytes = 40; 1052 if(value < numBytes * 254) { 1053 // Three-byte primary for 74..10233=74+40*254-1, good for year numbers and more. 1054 long primary = numericPrimary | 1055 ((firstByte + value / 254) << 16) | ((2 + value % 254) << 8); 1056 ceBuffer.append(Collation.makeCE(primary)); 1057 return; 1058 } 1059 value -= numBytes * 254; 1060 firstByte += numBytes; 1061 numBytes = 16; 1062 if(value < numBytes * 254 * 254) { 1063 // Four-byte primary for 10234..1042489=10234+16*254*254-1. 1064 long primary = numericPrimary | (2 + value % 254); 1065 value /= 254; 1066 primary |= (2 + value % 254) << 8; 1067 value /= 254; 1068 primary |= (firstByte + value % 254) << 16; 1069 ceBuffer.append(Collation.makeCE(primary)); 1070 return; 1071 } 1072 // original value > 1042489 1073 } 1074 assert(length >= 7); 1075 1076 // The second primary byte value 132..255 indicates the number of digit pairs (4..127), 1077 // then we generate primary bytes with those pairs. 1078 // Omit trailing 00 pairs. 1079 // Decrement the value for the last pair. 1080 1081 // Set the exponent. 4 pairs.132, 5 pairs.133, ..., 127 pairs.255. 1082 int numPairs = (length + 1) / 2; 1083 long primary = numericPrimary | ((132 - 4 + numPairs) << 16); 1084 // Find the length without trailing 00 pairs. 1085 while(digits.charAt(length - 1) == 0 && digits.charAt(length - 2) == 0) { 1086 length -= 2; 1087 } 1088 // Read the first pair. 1089 int pair; 1090 int pos; 1091 if((length & 1) != 0) { 1092 // Only "half a pair" if we have an odd number of digits. 1093 pair = digits.charAt(0); 1094 pos = 1; 1095 } else { 1096 pair = digits.charAt(0) * 10 + digits.charAt(1); 1097 pos = 2; 1098 } 1099 pair = 11 + 2 * pair; 1100 // Add the pairs of digits between pos and length. 1101 int shift = 8; 1102 while(pos < length) { 1103 if(shift == 0) { 1104 // Every three pairs/bytes we need to store a 4-byte-primary CE 1105 // and start with a new CE with the '0' primary lead byte. 1106 primary |= pair; 1107 ceBuffer.append(Collation.makeCE(primary)); 1108 primary = numericPrimary; 1109 shift = 16; 1110 } else { 1111 primary |= pair << shift; 1112 shift -= 8; 1113 } 1114 pair = 11 + 2 * (digits.charAt(pos) * 10 + digits.charAt(pos + 1)); 1115 pos += 2; 1116 } 1117 primary |= (pair - 1) << shift; 1118 ceBuffer.append(Collation.makeCE(primary)); 1119 } 1120 1121 private CEBuffer ceBuffer; 1122 private int cesIndex; 1123 1124 private SkippedState skipped; 1125 1126 // Number of code points to read forward, or -1. 1127 // Used as a forward iteration limit in previousCEUnsafe(). 1128 private int numCpFwd; 1129 // Numeric collation (CollationSettings.NUMERIC). 1130 private boolean isNumeric; 1131 } 1132