1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2012-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * CollationDataBuilder.java, ported from collationdatabuilder.h/.cpp 9 * 10 * C++ version created on: 2012apr01 11 * created by: Markus W. Scherer 12 */ 13 14 package com.ibm.icu.impl.coll; 15 16 import java.util.ArrayList; 17 import java.util.Arrays; 18 import java.util.Iterator; 19 20 import com.ibm.icu.impl.Norm2AllModes; 21 import com.ibm.icu.impl.Normalizer2Impl; 22 import com.ibm.icu.impl.Normalizer2Impl.Hangul; 23 import com.ibm.icu.impl.Trie2; 24 import com.ibm.icu.impl.Trie2Writable; 25 import com.ibm.icu.lang.UCharacter; 26 import com.ibm.icu.text.UnicodeSet; 27 import com.ibm.icu.text.UnicodeSetIterator; 28 import com.ibm.icu.util.CharsTrie; 29 import com.ibm.icu.util.CharsTrieBuilder; 30 import com.ibm.icu.util.StringTrieBuilder; 31 32 /** 33 * Low-level CollationData builder. 34 * Takes (character, CE) pairs and builds them into runtime data structures. 35 * Supports characters with context prefixes and contraction suffixes. 36 */ 37 final class CollationDataBuilder { // not final in C++ 38 /** 39 * Collation element modifier. Interface class for a modifier 40 * that changes a tailoring builder's temporary CEs to final CEs. 41 * Called for every non-special CE32 and every expansion CE. 42 */ 43 interface CEModifier { 44 /** Returns a new CE to replace the non-special input CE32, or else Collation.NO_CE. */ modifyCE32(int ce32)45 long modifyCE32(int ce32); 46 /** Returns a new CE to replace the input CE, or else Collation.NO_CE. */ modifyCE(long ce)47 long modifyCE(long ce); 48 } 49 CollationDataBuilder()50 CollationDataBuilder() { 51 nfcImpl = Norm2AllModes.getNFCInstance().impl; 52 base = null; 53 baseSettings = null; 54 trie = null; 55 ce32s = new UVector32(); 56 ce64s = new UVector64(); 57 conditionalCE32s = new ArrayList<ConditionalCE32>(); 58 modified = false; 59 fastLatinEnabled = false; 60 fastLatinBuilder = null; 61 collIter = null; 62 // Reserve the first CE32 for U+0000. 63 ce32s.addElement(0); 64 } 65 initForTailoring(CollationData b)66 void initForTailoring(CollationData b) { 67 if(trie != null) { 68 throw new IllegalStateException("attempt to reuse a CollationDataBuilder"); 69 } 70 if(b == null) { 71 throw new IllegalArgumentException("null CollationData"); 72 } 73 base = b; 74 75 // For a tailoring, the default is to fall back to the base. 76 trie = new Trie2Writable(Collation.FALLBACK_CE32, Collation.FFFD_CE32); 77 78 // Set the Latin-1 letters block so that it is allocated first in the data array, 79 // to try to improve locality of reference when sorting Latin-1 text. 80 // Do not use utrie2_setRange32() since that will not actually allocate blocks 81 // that are filled with the default value. 82 // ASCII (0..7F) is already preallocated anyway. 83 for(int c = 0xc0; c <= 0xff; ++c) { 84 trie.set(c, Collation.FALLBACK_CE32); 85 } 86 87 // Hangul syllables are not tailorable (except via tailoring Jamos). 88 // Always set the Hangul tag to help performance. 89 // Do this here, rather than in buildMappings(), 90 // so that we see the HANGUL_TAG in various assertions. 91 int hangulCE32 = Collation.makeCE32FromTagAndIndex(Collation.HANGUL_TAG, 0); 92 trie.setRange(Hangul.HANGUL_BASE, Hangul.HANGUL_END, hangulCE32, true); 93 94 // Copy the set contents but don't copy/clone the set as a whole because 95 // that would copy the isFrozen state too. 96 unsafeBackwardSet.addAll(b.unsafeBackwardSet); 97 } 98 isCompressibleLeadByte(int b)99 boolean isCompressibleLeadByte(int b) { 100 return base.isCompressibleLeadByte(b); 101 } 102 isCompressiblePrimary(long p)103 boolean isCompressiblePrimary(long p) { 104 return isCompressibleLeadByte((int)p >>> 24); 105 } 106 107 /** 108 * @return true if this builder has mappings (e.g., add() has been called) 109 */ hasMappings()110 boolean hasMappings() { return modified; } 111 112 /** 113 * @return true if c has CEs in this builder 114 */ isAssigned(int c)115 boolean isAssigned(int c) { 116 return Collation.isAssignedCE32(trie.get(c)); 117 } 118 add(CharSequence prefix, CharSequence s, long ces[], int cesLength)119 void add(CharSequence prefix, CharSequence s, long ces[], int cesLength) { 120 int ce32 = encodeCEs(ces, cesLength); 121 addCE32(prefix, s, ce32); 122 } 123 124 /** 125 * Encodes the ces as either the returned ce32 by itself, 126 * or by storing an expansion, with the returned ce32 referring to that. 127 * 128 * <p>add(p, s, ces, cesLength) = addCE32(p, s, encodeCEs(ces, cesLength)) 129 */ encodeCEs(long ces[], int cesLength)130 int encodeCEs(long ces[], int cesLength) { 131 if(cesLength < 0 || cesLength > Collation.MAX_EXPANSION_LENGTH) { 132 throw new IllegalArgumentException("mapping to too many CEs"); 133 } 134 if(!isMutable()) { 135 throw new IllegalStateException("attempt to add mappings after build()"); 136 } 137 if(cesLength == 0) { 138 // Convenience: We cannot map to nothing, but we can map to a completely ignorable CE. 139 // Do this here so that callers need not do it. 140 return encodeOneCEAsCE32(0); 141 } else if(cesLength == 1) { 142 return encodeOneCE(ces[0]); 143 } else if(cesLength == 2) { 144 // Try to encode two CEs as one CE32. 145 long ce0 = ces[0]; 146 long ce1 = ces[1]; 147 long p0 = ce0 >>> 32; 148 if((ce0 & 0xffffffffff00ffL) == Collation.COMMON_SECONDARY_CE && 149 (ce1 & 0xffffffff00ffffffL) == Collation.COMMON_TERTIARY_CE && 150 p0 != 0) { 151 // Latin mini expansion 152 return 153 (int)p0 | 154 (((int)ce0 & 0xff00) << 8) | 155 (((int)ce1 >> 16) & 0xff00) | 156 Collation.SPECIAL_CE32_LOW_BYTE | 157 Collation.LATIN_EXPANSION_TAG; 158 } 159 } 160 // Try to encode two or more CEs as CE32s. 161 int[] newCE32s = new int[Collation.MAX_EXPANSION_LENGTH]; // TODO: instance field? 162 for(int i = 0;; ++i) { 163 if(i == cesLength) { 164 return encodeExpansion32(newCE32s, 0, cesLength); 165 } 166 int ce32 = encodeOneCEAsCE32(ces[i]); 167 if(ce32 == Collation.NO_CE32) { break; } 168 newCE32s[i] = ce32; 169 } 170 return encodeExpansion(ces, 0, cesLength); 171 } 172 addCE32(CharSequence prefix, CharSequence s, int ce32)173 void addCE32(CharSequence prefix, CharSequence s, int ce32) { 174 if(s.length() == 0) { 175 throw new IllegalArgumentException("mapping from empty string"); 176 } 177 if(!isMutable()) { 178 throw new IllegalStateException("attempt to add mappings after build()"); 179 } 180 int c = Character.codePointAt(s, 0); 181 int cLength = Character.charCount(c); 182 int oldCE32 = trie.get(c); 183 boolean hasContext = prefix.length() != 0|| s.length() > cLength; 184 if(oldCE32 == Collation.FALLBACK_CE32) { 185 // First tailoring for c. 186 // If c has contextual base mappings or if we add a contextual mapping, 187 // then copy the base mappings. 188 // Otherwise we just override the base mapping. 189 int baseCE32 = base.getFinalCE32(base.getCE32(c)); 190 if(hasContext || Collation.ce32HasContext(baseCE32)) { 191 oldCE32 = copyFromBaseCE32(c, baseCE32, true); 192 trie.set(c, oldCE32); 193 } 194 } 195 if(!hasContext) { 196 // No prefix, no contraction. 197 if(!isBuilderContextCE32(oldCE32)) { 198 trie.set(c, ce32); 199 } else { 200 ConditionalCE32 cond = getConditionalCE32ForCE32(oldCE32); 201 cond.builtCE32 = Collation.NO_CE32; 202 cond.ce32 = ce32; 203 } 204 } else { 205 ConditionalCE32 cond; 206 if(!isBuilderContextCE32(oldCE32)) { 207 // Replace the simple oldCE32 with a builder context CE32 208 // pointing to a new ConditionalCE32 list head. 209 int index = addConditionalCE32("\0", oldCE32); 210 int contextCE32 = makeBuilderContextCE32(index); 211 trie.set(c, contextCE32); 212 contextChars.add(c); 213 cond = getConditionalCE32(index); 214 } else { 215 cond = getConditionalCE32ForCE32(oldCE32); 216 cond.builtCE32 = Collation.NO_CE32; 217 } 218 CharSequence suffix = s.subSequence(cLength, s.length()); 219 String context = new StringBuilder().append((char)prefix.length()). 220 append(prefix).append(suffix).toString(); 221 unsafeBackwardSet.addAll(suffix); 222 for(;;) { 223 // invariant: context > cond.context 224 int next = cond.next; 225 if(next < 0) { 226 // Append a new ConditionalCE32 after cond. 227 int index = addConditionalCE32(context, ce32); 228 cond.next = index; 229 break; 230 } 231 ConditionalCE32 nextCond = getConditionalCE32(next); 232 int cmp = context.compareTo(nextCond.context); 233 if(cmp < 0) { 234 // Insert a new ConditionalCE32 between cond and nextCond. 235 int index = addConditionalCE32(context, ce32); 236 cond.next = index; 237 getConditionalCE32(index).next = next; 238 break; 239 } else if(cmp == 0) { 240 // Same context as before, overwrite its ce32. 241 nextCond.ce32 = ce32; 242 break; 243 } 244 cond = nextCond; 245 } 246 } 247 modified = true; 248 } 249 250 /** 251 * Copies all mappings from the src builder, with modifications. 252 * This builder here must not be built yet, and should be empty. 253 */ copyFrom(CollationDataBuilder src, CEModifier modifier)254 void copyFrom(CollationDataBuilder src, CEModifier modifier) { 255 if(!isMutable()) { 256 throw new IllegalStateException("attempt to copyFrom() after build()"); 257 } 258 CopyHelper helper = new CopyHelper(src, this, modifier); 259 Iterator<Trie2.Range> trieIterator = src.trie.iterator(); 260 Trie2.Range range; 261 while(trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) { 262 enumRangeForCopy(range.startCodePoint, range.endCodePoint, range.value, helper); 263 } 264 // Update the contextChars and the unsafeBackwardSet while copying, 265 // in case a character had conditional mappings in the source builder 266 // and they were removed later. 267 modified |= src.modified; 268 } 269 optimize(UnicodeSet set)270 void optimize(UnicodeSet set) { 271 if(set.isEmpty()) { return; } 272 UnicodeSetIterator iter = new UnicodeSetIterator(set); 273 while(iter.next() && iter.codepoint != UnicodeSetIterator.IS_STRING) { 274 int c = iter.codepoint; 275 int ce32 = trie.get(c); 276 if(ce32 == Collation.FALLBACK_CE32) { 277 ce32 = base.getFinalCE32(base.getCE32(c)); 278 ce32 = copyFromBaseCE32(c, ce32, true); 279 trie.set(c, ce32); 280 } 281 } 282 modified = true; 283 } 284 suppressContractions(UnicodeSet set)285 void suppressContractions(UnicodeSet set) { 286 if(set.isEmpty()) { return; } 287 UnicodeSetIterator iter = new UnicodeSetIterator(set); 288 while(iter.next() && iter.codepoint != UnicodeSetIterator.IS_STRING) { 289 int c = iter.codepoint; 290 int ce32 = trie.get(c); 291 if(ce32 == Collation.FALLBACK_CE32) { 292 ce32 = base.getFinalCE32(base.getCE32(c)); 293 if(Collation.ce32HasContext(ce32)) { 294 ce32 = copyFromBaseCE32(c, ce32, false /* without context */); 295 trie.set(c, ce32); 296 } 297 } else if(isBuilderContextCE32(ce32)) { 298 ce32 = getConditionalCE32ForCE32(ce32).ce32; 299 // Simply abandon the list of ConditionalCE32. 300 // The caller will copy this builder in the end, 301 // eliminating unreachable data. 302 trie.set(c, ce32); 303 contextChars.remove(c); 304 } 305 } 306 modified = true; 307 } 308 enableFastLatin()309 void enableFastLatin() { fastLatinEnabled = true; } build(CollationData data)310 void build(CollationData data) { 311 buildMappings(data); 312 if(base != null) { 313 data.numericPrimary = base.numericPrimary; 314 data.compressibleBytes = base.compressibleBytes; 315 data.numScripts = base.numScripts; 316 data.scriptsIndex = base.scriptsIndex; 317 data.scriptStarts = base.scriptStarts; 318 } 319 buildFastLatinTable(data); 320 } 321 322 /** 323 * Looks up CEs for s and appends them to the ces array. 324 * Does not handle normalization: s should be in FCD form. 325 * 326 * Does not write completely ignorable CEs. 327 * Does not write beyond Collation.MAX_EXPANSION_LENGTH. 328 * 329 * @return incremented cesLength 330 */ getCEs(CharSequence s, long ces[], int cesLength)331 int getCEs(CharSequence s, long ces[], int cesLength) { 332 return getCEs(s, 0, ces, cesLength); 333 } 334 getCEs(CharSequence prefix, CharSequence s, long ces[], int cesLength)335 int getCEs(CharSequence prefix, CharSequence s, long ces[], int cesLength) { 336 int prefixLength = prefix.length(); 337 if(prefixLength == 0) { 338 return getCEs(s, 0, ces, cesLength); 339 } else { 340 return getCEs(new StringBuilder(prefix).append(s), prefixLength, ces, cesLength); 341 } 342 } 343 344 /** 345 * Build-time context and CE32 for a code point. 346 * If a code point has contextual mappings, then the default (no-context) mapping 347 * and all conditional mappings are stored in a singly-linked list 348 * of ConditionalCE32, sorted by context strings. 349 * 350 * Context strings sort by prefix length, then by prefix, then by contraction suffix. 351 * Context strings must be unique and in ascending order. 352 */ 353 private static final class ConditionalCE32 { ConditionalCE32(String ct, int ce)354 ConditionalCE32(String ct, int ce) { 355 context = ct; 356 ce32 = ce; 357 defaultCE32 = Collation.NO_CE32; 358 builtCE32 = Collation.NO_CE32; 359 next = -1; 360 } 361 hasContext()362 boolean hasContext() { return context.length() > 1; } prefixLength()363 int prefixLength() { return context.charAt(0); } 364 365 /** 366 * "\0" for the first entry for any code point, with its default CE32. 367 * 368 * Otherwise one unit with the length of the prefix string, 369 * then the prefix string, then the contraction suffix. 370 */ 371 String context; 372 /** 373 * CE32 for the code point and its context. 374 * Can be special (e.g., for an expansion) but not contextual (prefix or contraction tag). 375 */ 376 int ce32; 377 /** 378 * Default CE32 for all contexts with this same prefix. 379 * Initially NO_CE32. Set only while building runtime data structures, 380 * and only on one of the nodes of a sub-list with the same prefix. 381 */ 382 int defaultCE32; 383 /** 384 * CE32 for the built contexts. 385 * When fetching CEs from the builder, the contexts are built into their runtime form 386 * so that the normal collation implementation can process them. 387 * The result is cached in the list head. It is reset when the contexts are modified. 388 */ 389 int builtCE32; 390 /** 391 * Index of the next ConditionalCE32. 392 * Negative for the end of the list. 393 */ 394 int next; 395 } 396 getCE32FromOffsetCE32(boolean fromBase, int c, int ce32)397 protected int getCE32FromOffsetCE32(boolean fromBase, int c, int ce32) { 398 int i = Collation.indexFromCE32(ce32); 399 long dataCE = fromBase ? base.ces[i] : ce64s.elementAti(i); 400 long p = Collation.getThreeBytePrimaryForOffsetData(c, dataCE); 401 return Collation.makeLongPrimaryCE32(p); 402 } 403 addCE(long ce)404 protected int addCE(long ce) { 405 int length = ce64s.size(); 406 for(int i = 0; i < length; ++i) { 407 if(ce == ce64s.elementAti(i)) { return i; } 408 } 409 ce64s.addElement(ce); 410 return length; 411 } 412 addCE32(int ce32)413 protected int addCE32(int ce32) { 414 int length = ce32s.size(); 415 for(int i = 0; i < length; ++i) { 416 if(ce32 == ce32s.elementAti(i)) { return i; } 417 } 418 ce32s.addElement(ce32); 419 return length; 420 } 421 addConditionalCE32(String context, int ce32)422 protected int addConditionalCE32(String context, int ce32) { 423 assert(context.length() != 0); 424 int index = conditionalCE32s.size(); 425 if(index > Collation.MAX_INDEX) { 426 throw new IndexOutOfBoundsException("too many context-sensitive mappings"); 427 // BufferOverflowException is a better fit 428 // but cannot be constructed with a message string. 429 } 430 ConditionalCE32 cond = new ConditionalCE32(context, ce32); 431 conditionalCE32s.add(cond); 432 return index; 433 } 434 getConditionalCE32(int index)435 protected ConditionalCE32 getConditionalCE32(int index) { 436 return conditionalCE32s.get(index); 437 } getConditionalCE32ForCE32(int ce32)438 protected ConditionalCE32 getConditionalCE32ForCE32(int ce32) { 439 return getConditionalCE32(Collation.indexFromCE32(ce32)); 440 } 441 makeBuilderContextCE32(int index)442 protected static int makeBuilderContextCE32(int index) { 443 return Collation.makeCE32FromTagAndIndex(Collation.BUILDER_DATA_TAG, index); 444 } isBuilderContextCE32(int ce32)445 protected static boolean isBuilderContextCE32(int ce32) { 446 return Collation.hasCE32Tag(ce32, Collation.BUILDER_DATA_TAG); 447 } 448 encodeOneCEAsCE32(long ce)449 protected static int encodeOneCEAsCE32(long ce) { 450 long p = ce >>> 32; 451 int lower32 = (int)ce; 452 int t = lower32 & 0xffff; 453 assert((t & 0xc000) != 0xc000); // Impossible case bits 11 mark special CE32s. 454 if((ce & 0xffff00ff00ffL) == 0) { 455 // normal form ppppsstt 456 return (int)p | (lower32 >>> 16) | (t >> 8); 457 } else if((ce & 0xffffffffffL) == Collation.COMMON_SEC_AND_TER_CE) { 458 // long-primary form ppppppC1 459 return Collation.makeLongPrimaryCE32(p); 460 } else if(p == 0 && (t & 0xff) == 0) { 461 // long-secondary form ssssttC2 462 return Collation.makeLongSecondaryCE32(lower32); 463 } 464 return Collation.NO_CE32; 465 } 466 encodeOneCE(long ce)467 protected int encodeOneCE(long ce) { 468 // Try to encode one CE as one CE32. 469 int ce32 = encodeOneCEAsCE32(ce); 470 if(ce32 != Collation.NO_CE32) { return ce32; } 471 int index = addCE(ce); 472 if(index > Collation.MAX_INDEX) { 473 throw new IndexOutOfBoundsException("too many mappings"); 474 // BufferOverflowException is a better fit 475 // but cannot be constructed with a message string. 476 } 477 return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION_TAG, index, 1); 478 } 479 encodeExpansion(long ces[], int start, int length)480 protected int encodeExpansion(long ces[], int start, int length) { 481 // See if this sequence of CEs has already been stored. 482 long first = ces[start]; 483 int ce64sMax = ce64s.size() - length; 484 for(int i = 0; i <= ce64sMax; ++i) { 485 if(first == ce64s.elementAti(i)) { 486 if(i > Collation.MAX_INDEX) { 487 throw new IndexOutOfBoundsException("too many mappings"); 488 // BufferOverflowException is a better fit 489 // but cannot be constructed with a message string. 490 } 491 for(int j = 1;; ++j) { 492 if(j == length) { 493 return Collation.makeCE32FromTagIndexAndLength( 494 Collation.EXPANSION_TAG, i, length); 495 } 496 if(ce64s.elementAti(i + j) != ces[start + j]) { break; } 497 } 498 } 499 } 500 // Store the new sequence. 501 int i = ce64s.size(); 502 if(i > Collation.MAX_INDEX) { 503 throw new IndexOutOfBoundsException("too many mappings"); 504 // BufferOverflowException is a better fit 505 // but cannot be constructed with a message string. 506 } 507 for(int j = 0; j < length; ++j) { 508 ce64s.addElement(ces[start + j]); 509 } 510 return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION_TAG, i, length); 511 } 512 encodeExpansion32(int newCE32s[], int start, int length)513 protected int encodeExpansion32(int newCE32s[], int start, int length) { 514 // See if this sequence of CE32s has already been stored. 515 int first = newCE32s[start]; 516 int ce32sMax = ce32s.size() - length; 517 for(int i = 0; i <= ce32sMax; ++i) { 518 if(first == ce32s.elementAti(i)) { 519 if(i > Collation.MAX_INDEX) { 520 throw new IndexOutOfBoundsException("too many mappings"); 521 // BufferOverflowException is a better fit 522 // but cannot be constructed with a message string. 523 } 524 for(int j = 1;; ++j) { 525 if(j == length) { 526 return Collation.makeCE32FromTagIndexAndLength( 527 Collation.EXPANSION32_TAG, i, length); 528 } 529 if(ce32s.elementAti(i + j) != newCE32s[start + j]) { break; } 530 } 531 } 532 } 533 // Store the new sequence. 534 int i = ce32s.size(); 535 if(i > Collation.MAX_INDEX) { 536 throw new IndexOutOfBoundsException("too many mappings"); 537 // BufferOverflowException is a better fit 538 // but cannot be constructed with a message string. 539 } 540 for(int j = 0; j < length; ++j) { 541 ce32s.addElement(newCE32s[start + j]); 542 } 543 return Collation.makeCE32FromTagIndexAndLength(Collation.EXPANSION32_TAG, i, length); 544 } 545 copyFromBaseCE32(int c, int ce32, boolean withContext)546 protected int copyFromBaseCE32(int c, int ce32, boolean withContext) { 547 if(!Collation.isSpecialCE32(ce32)) { return ce32; } 548 switch(Collation.tagFromCE32(ce32)) { 549 case Collation.LONG_PRIMARY_TAG: 550 case Collation.LONG_SECONDARY_TAG: 551 case Collation.LATIN_EXPANSION_TAG: 552 // copy as is 553 break; 554 case Collation.EXPANSION32_TAG: { 555 int index = Collation.indexFromCE32(ce32); 556 int length = Collation.lengthFromCE32(ce32); 557 ce32 = encodeExpansion32(base.ce32s, index, length); 558 break; 559 } 560 case Collation.EXPANSION_TAG: { 561 int index = Collation.indexFromCE32(ce32); 562 int length = Collation.lengthFromCE32(ce32); 563 ce32 = encodeExpansion(base.ces, index, length); 564 break; 565 } 566 case Collation.PREFIX_TAG: { 567 // Flatten prefixes and nested suffixes (contractions) 568 // into a linear list of ConditionalCE32. 569 int trieIndex = Collation.indexFromCE32(ce32); 570 ce32 = base.getCE32FromContexts(trieIndex); // Default if no prefix match. 571 if(!withContext) { 572 return copyFromBaseCE32(c, ce32, false); 573 } 574 ConditionalCE32 head = new ConditionalCE32("", 0); 575 StringBuilder context = new StringBuilder("\0"); 576 int index; 577 if(Collation.isContractionCE32(ce32)) { 578 index = copyContractionsFromBaseCE32(context, c, ce32, head); 579 } else { 580 ce32 = copyFromBaseCE32(c, ce32, true); 581 head.next = index = addConditionalCE32(context.toString(), ce32); 582 } 583 ConditionalCE32 cond = getConditionalCE32(index); // the last ConditionalCE32 so far 584 CharsTrie.Iterator prefixes = CharsTrie.iterator(base.contexts, trieIndex + 2, 0); 585 while(prefixes.hasNext()) { 586 CharsTrie.Entry entry = prefixes.next(); 587 context.setLength(0); 588 context.append(entry.chars).reverse().insert(0, (char)entry.chars.length()); 589 ce32 = entry.value; 590 if(Collation.isContractionCE32(ce32)) { 591 index = copyContractionsFromBaseCE32(context, c, ce32, cond); 592 } else { 593 ce32 = copyFromBaseCE32(c, ce32, true); 594 cond.next = index = addConditionalCE32(context.toString(), ce32); 595 } 596 cond = getConditionalCE32(index); 597 } 598 ce32 = makeBuilderContextCE32(head.next); 599 contextChars.add(c); 600 break; 601 } 602 case Collation.CONTRACTION_TAG: { 603 if(!withContext) { 604 int index = Collation.indexFromCE32(ce32); 605 ce32 = base.getCE32FromContexts(index); // Default if no suffix match. 606 return copyFromBaseCE32(c, ce32, false); 607 } 608 ConditionalCE32 head = new ConditionalCE32("", 0); 609 StringBuilder context = new StringBuilder("\0"); 610 copyContractionsFromBaseCE32(context, c, ce32, head); 611 ce32 = makeBuilderContextCE32(head.next); 612 contextChars.add(c); 613 break; 614 } 615 case Collation.HANGUL_TAG: 616 throw new UnsupportedOperationException("We forbid tailoring of Hangul syllables."); 617 case Collation.OFFSET_TAG: 618 ce32 = getCE32FromOffsetCE32(true, c, ce32); 619 break; 620 case Collation.IMPLICIT_TAG: 621 ce32 = encodeOneCE(Collation.unassignedCEFromCodePoint(c)); 622 break; 623 default: 624 throw new AssertionError("copyFromBaseCE32(c, ce32, withContext) " + 625 "requires ce32 == base.getFinalCE32(ce32)"); 626 } 627 return ce32; 628 } 629 630 /** 631 * Copies base contractions to a list of ConditionalCE32. 632 * Sets cond.next to the index of the first new item 633 * and returns the index of the last new item. 634 */ copyContractionsFromBaseCE32(StringBuilder context, int c, int ce32, ConditionalCE32 cond)635 protected int copyContractionsFromBaseCE32(StringBuilder context, int c, int ce32, 636 ConditionalCE32 cond) { 637 int trieIndex = Collation.indexFromCE32(ce32); 638 int index; 639 if((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) { 640 // No match on the single code point. 641 // We are underneath a prefix, and the default mapping is just 642 // a fallback to the mappings for a shorter prefix. 643 assert(context.length() > 1); 644 index = -1; 645 } else { 646 ce32 = base.getCE32FromContexts(trieIndex); // Default if no suffix match. 647 assert(!Collation.isContractionCE32(ce32)); 648 ce32 = copyFromBaseCE32(c, ce32, true); 649 cond.next = index = addConditionalCE32(context.toString(), ce32); 650 cond = getConditionalCE32(index); 651 } 652 653 int suffixStart = context.length(); 654 CharsTrie.Iterator suffixes = CharsTrie.iterator(base.contexts, trieIndex + 2, 0); 655 while(suffixes.hasNext()) { 656 CharsTrie.Entry entry = suffixes.next(); 657 context.append(entry.chars); 658 ce32 = copyFromBaseCE32(c, entry.value, true); 659 cond.next = index = addConditionalCE32(context.toString(), ce32); 660 // No need to update the unsafeBackwardSet because the tailoring set 661 // is already a copy of the base set. 662 cond = getConditionalCE32(index); 663 context.setLength(suffixStart); 664 } 665 assert(index >= 0); 666 return index; 667 } 668 669 private static final class CopyHelper { CopyHelper(CollationDataBuilder s, CollationDataBuilder d, CollationDataBuilder.CEModifier m)670 CopyHelper(CollationDataBuilder s, CollationDataBuilder d, 671 CollationDataBuilder.CEModifier m) { 672 src = s; 673 dest = d; 674 modifier = m; 675 } 676 copyRangeCE32(int start, int end, int ce32)677 void copyRangeCE32(int start, int end, int ce32) { 678 ce32 = copyCE32(ce32); 679 dest.trie.setRange(start, end, ce32, true); 680 if(CollationDataBuilder.isBuilderContextCE32(ce32)) { 681 dest.contextChars.add(start, end); 682 } 683 } 684 copyCE32(int ce32)685 int copyCE32(int ce32) { 686 if(!Collation.isSpecialCE32(ce32)) { 687 long ce = modifier.modifyCE32(ce32); 688 if(ce != Collation.NO_CE) { 689 ce32 = dest.encodeOneCE(ce); 690 } 691 } else { 692 int tag = Collation.tagFromCE32(ce32); 693 if(tag == Collation.EXPANSION32_TAG) { 694 int[] srcCE32s = src.ce32s.getBuffer(); 695 int srcIndex = Collation.indexFromCE32(ce32); 696 int length = Collation.lengthFromCE32(ce32); 697 // Inspect the source CE32s. Just copy them if none are modified. 698 // Otherwise copy to modifiedCEs, with modifications. 699 boolean isModified = false; 700 for(int i = 0; i < length; ++i) { 701 ce32 = srcCE32s[srcIndex + i]; 702 long ce; 703 if(Collation.isSpecialCE32(ce32) || 704 (ce = modifier.modifyCE32(ce32)) == Collation.NO_CE) { 705 if(isModified) { 706 modifiedCEs[i] = Collation.ceFromCE32(ce32); 707 } 708 } else { 709 if(!isModified) { 710 for(int j = 0; j < i; ++j) { 711 modifiedCEs[j] = Collation.ceFromCE32(srcCE32s[srcIndex + j]); 712 } 713 isModified = true; 714 } 715 modifiedCEs[i] = ce; 716 } 717 } 718 if(isModified) { 719 ce32 = dest.encodeCEs(modifiedCEs, length); 720 } else { 721 ce32 = dest.encodeExpansion32(srcCE32s, srcIndex, length); 722 } 723 } else if(tag == Collation.EXPANSION_TAG) { 724 long[] srcCEs = src.ce64s.getBuffer(); 725 int srcIndex = Collation.indexFromCE32(ce32); 726 int length = Collation.lengthFromCE32(ce32); 727 // Inspect the source CEs. Just copy them if none are modified. 728 // Otherwise copy to modifiedCEs, with modifications. 729 boolean isModified = false; 730 for(int i = 0; i < length; ++i) { 731 long srcCE = srcCEs[srcIndex + i]; 732 long ce = modifier.modifyCE(srcCE); 733 if(ce == Collation.NO_CE) { 734 if(isModified) { 735 modifiedCEs[i] = srcCE; 736 } 737 } else { 738 if(!isModified) { 739 for(int j = 0; j < i; ++j) { 740 modifiedCEs[j] = srcCEs[srcIndex + j]; 741 } 742 isModified = true; 743 } 744 modifiedCEs[i] = ce; 745 } 746 } 747 if(isModified) { 748 ce32 = dest.encodeCEs(modifiedCEs, length); 749 } else { 750 ce32 = dest.encodeExpansion(srcCEs, srcIndex, length); 751 } 752 } else if(tag == Collation.BUILDER_DATA_TAG) { 753 // Copy the list of ConditionalCE32. 754 ConditionalCE32 cond = src.getConditionalCE32ForCE32(ce32); 755 assert(!cond.hasContext()); 756 int destIndex = dest.addConditionalCE32( 757 cond.context, copyCE32(cond.ce32)); 758 ce32 = CollationDataBuilder.makeBuilderContextCE32(destIndex); 759 while(cond.next >= 0) { 760 cond = src.getConditionalCE32(cond.next); 761 ConditionalCE32 prevDestCond = dest.getConditionalCE32(destIndex); 762 destIndex = dest.addConditionalCE32( 763 cond.context, copyCE32(cond.ce32)); 764 int suffixStart = cond.prefixLength() + 1; 765 dest.unsafeBackwardSet.addAll(cond.context.substring(suffixStart)); 766 prevDestCond.next = destIndex; 767 } 768 } else { 769 // Just copy long CEs and Latin mini expansions (and other expected values) as is, 770 // assuming that the modifier would not modify them. 771 assert(tag == Collation.LONG_PRIMARY_TAG || 772 tag == Collation.LONG_SECONDARY_TAG || 773 tag == Collation.LATIN_EXPANSION_TAG || 774 tag == Collation.HANGUL_TAG); 775 } 776 } 777 return ce32; 778 } 779 780 CollationDataBuilder src; 781 CollationDataBuilder dest; 782 CollationDataBuilder.CEModifier modifier; 783 long[] modifiedCEs = new long[Collation.MAX_EXPANSION_LENGTH]; 784 } 785 786 private static void enumRangeForCopy(int start, int end, int value, CopyHelper helper)787 enumRangeForCopy(int start, int end, int value, CopyHelper helper) { 788 if(value != Collation.UNASSIGNED_CE32 && value != Collation.FALLBACK_CE32) { 789 helper.copyRangeCE32(start, end, value); 790 } 791 } 792 getJamoCE32s(int jamoCE32s[])793 protected boolean getJamoCE32s(int jamoCE32s[]) { 794 boolean anyJamoAssigned = base == null; // always set jamoCE32s in the base data 795 boolean needToCopyFromBase = false; 796 for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) { // Count across Jamo types. 797 int jamo = jamoCpFromIndex(j); 798 boolean fromBase = false; 799 int ce32 = trie.get(jamo); 800 anyJamoAssigned |= Collation.isAssignedCE32(ce32); 801 // TODO: Try to prevent [optimize [Jamo]] from counting as anyJamoAssigned. 802 // (As of CLDR 24 [2013] the Korean tailoring does not optimize conjoining Jamo.) 803 if(ce32 == Collation.FALLBACK_CE32) { 804 fromBase = true; 805 ce32 = base.getCE32(jamo); 806 } 807 if(Collation.isSpecialCE32(ce32)) { 808 switch(Collation.tagFromCE32(ce32)) { 809 case Collation.LONG_PRIMARY_TAG: 810 case Collation.LONG_SECONDARY_TAG: 811 case Collation.LATIN_EXPANSION_TAG: 812 // Copy the ce32 as-is. 813 break; 814 case Collation.EXPANSION32_TAG: 815 case Collation.EXPANSION_TAG: 816 case Collation.PREFIX_TAG: 817 case Collation.CONTRACTION_TAG: 818 if(fromBase) { 819 // Defer copying until we know if anyJamoAssigned. 820 ce32 = Collation.FALLBACK_CE32; 821 needToCopyFromBase = true; 822 } 823 break; 824 case Collation.IMPLICIT_TAG: 825 // An unassigned Jamo should only occur in tests with incomplete bases. 826 assert(fromBase); 827 ce32 = Collation.FALLBACK_CE32; 828 needToCopyFromBase = true; 829 break; 830 case Collation.OFFSET_TAG: 831 ce32 = getCE32FromOffsetCE32(fromBase, jamo, ce32); 832 break; 833 case Collation.FALLBACK_TAG: 834 case Collation.RESERVED_TAG_3: 835 case Collation.BUILDER_DATA_TAG: 836 case Collation.DIGIT_TAG: 837 case Collation.U0000_TAG: 838 case Collation.HANGUL_TAG: 839 case Collation.LEAD_SURROGATE_TAG: 840 throw new AssertionError(String.format("unexpected special tag in ce32=0x%08x", ce32)); 841 } 842 } 843 jamoCE32s[j] = ce32; 844 } 845 if(anyJamoAssigned && needToCopyFromBase) { 846 for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) { 847 if(jamoCE32s[j] == Collation.FALLBACK_CE32) { 848 int jamo = jamoCpFromIndex(j); 849 jamoCE32s[j] = copyFromBaseCE32(jamo, base.getCE32(jamo), 850 /*withContext=*/ true); 851 } 852 } 853 } 854 return anyJamoAssigned; 855 } 856 setDigitTags()857 protected void setDigitTags() { 858 UnicodeSet digits = new UnicodeSet("[:Nd:]"); 859 UnicodeSetIterator iter = new UnicodeSetIterator(digits); 860 while(iter.next()) { 861 assert(iter.codepoint != UnicodeSetIterator.IS_STRING); 862 int c = iter.codepoint; 863 int ce32 = trie.get(c); 864 if(ce32 != Collation.FALLBACK_CE32 && ce32 != Collation.UNASSIGNED_CE32) { 865 int index = addCE32(ce32); 866 if(index > Collation.MAX_INDEX) { 867 throw new IndexOutOfBoundsException("too many mappings"); 868 // BufferOverflowException is a better fit 869 // but cannot be constructed with a message string. 870 } 871 ce32 = Collation.makeCE32FromTagIndexAndLength( 872 Collation.DIGIT_TAG, index, UCharacter.digit(c)); // u_charDigitValue(c) 873 trie.set(c, ce32); 874 } 875 } 876 } 877 setLeadSurrogates()878 protected void setLeadSurrogates() { 879 for(char lead = 0xd800; lead < 0xdc00; ++lead) { 880 int leadValue = -1; 881 // utrie2_enumForLeadSurrogate(trie, lead, null, , &value); 882 Iterator<Trie2.Range> trieIterator = trie.iteratorForLeadSurrogate(lead); 883 while(trieIterator.hasNext()) { 884 Trie2.Range range = trieIterator.next(); 885 // The rest of this loop is equivalent to C++ enumRangeLeadValue(). 886 int value = range.value; 887 if(value == Collation.UNASSIGNED_CE32) { 888 value = Collation.LEAD_ALL_UNASSIGNED; 889 } else if(value == Collation.FALLBACK_CE32) { 890 value = Collation.LEAD_ALL_FALLBACK; 891 } else { 892 leadValue = Collation.LEAD_MIXED; 893 break; 894 } 895 if(leadValue < 0) { 896 leadValue = value; 897 } else if(leadValue != value) { 898 leadValue = Collation.LEAD_MIXED; 899 break; 900 } 901 } 902 trie.setForLeadSurrogateCodeUnit(lead, 903 Collation.makeCE32FromTagAndIndex(Collation.LEAD_SURROGATE_TAG, 0) | leadValue); 904 } 905 } 906 buildMappings(CollationData data)907 protected void buildMappings(CollationData data) { 908 if(!isMutable()) { 909 throw new IllegalStateException("attempt to build() after build()"); 910 } 911 912 buildContexts(); 913 914 int[] jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH]; 915 int jamoIndex = -1; 916 if(getJamoCE32s(jamoCE32s)) { 917 jamoIndex = ce32s.size(); 918 for(int i = 0; i < CollationData.JAMO_CE32S_LENGTH; ++i) { 919 ce32s.addElement(jamoCE32s[i]); 920 } 921 // Small optimization: Use a bit in the Hangul ce32 922 // to indicate that none of the Jamo CE32s are isSpecialCE32() 923 // (as it should be in the root collator). 924 // It allows CollationIterator to avoid recursive function calls and per-Jamo tests. 925 // In order to still have good trie compression and keep this code simple, 926 // we only set this flag if a whole block of 588 Hangul syllables starting with 927 // a common leading consonant (Jamo L) has this property. 928 boolean isAnyJamoVTSpecial = false; 929 for(int i = Hangul.JAMO_L_COUNT; i < CollationData.JAMO_CE32S_LENGTH; ++i) { 930 if(Collation.isSpecialCE32(jamoCE32s[i])) { 931 isAnyJamoVTSpecial = true; 932 break; 933 } 934 } 935 int hangulCE32 = Collation.makeCE32FromTagAndIndex(Collation.HANGUL_TAG, 0); 936 int c = Hangul.HANGUL_BASE; 937 for(int i = 0; i < Hangul.JAMO_L_COUNT; ++i) { // iterate over the Jamo L 938 int ce32 = hangulCE32; 939 if(!isAnyJamoVTSpecial && !Collation.isSpecialCE32(jamoCE32s[i])) { 940 ce32 |= Collation.HANGUL_NO_SPECIAL_JAMO; 941 } 942 int limit = c + Hangul.JAMO_VT_COUNT; 943 trie.setRange(c, limit - 1, ce32, true); 944 c = limit; 945 } 946 } else { 947 // Copy the Hangul CE32s from the base in blocks per Jamo L, 948 // assuming that HANGUL_NO_SPECIAL_JAMO is set or not set for whole blocks. 949 for(int c = Hangul.HANGUL_BASE; c < Hangul.HANGUL_LIMIT;) { 950 int ce32 = base.getCE32(c); 951 assert(Collation.hasCE32Tag(ce32, Collation.HANGUL_TAG)); 952 int limit = c + Hangul.JAMO_VT_COUNT; 953 trie.setRange(c, limit - 1, ce32, true); 954 c = limit; 955 } 956 } 957 958 setDigitTags(); 959 setLeadSurrogates(); 960 961 // For U+0000, move its normal ce32 into CE32s[0] and set U0000_TAG. 962 ce32s.setElementAt(trie.get(0), 0); 963 trie.set(0, Collation.makeCE32FromTagAndIndex(Collation.U0000_TAG, 0)); 964 965 data.trie = trie.toTrie2_32(); 966 967 // Mark each lead surrogate as "unsafe" 968 // if any of its 1024 associated supplementary code points is "unsafe". 969 int c = 0x10000; 970 for(char lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) { 971 if(unsafeBackwardSet.containsSome(c, c + 0x3ff)) { 972 unsafeBackwardSet.add(lead); 973 } 974 } 975 unsafeBackwardSet.freeze(); 976 977 data.ce32s = ce32s.getBuffer(); 978 data.ces = ce64s.getBuffer(); 979 data.contexts = contexts.toString(); 980 981 data.base = base; 982 if(jamoIndex >= 0) { 983 data.jamoCE32s = jamoCE32s; // C++: data.ce32s + jamoIndex 984 } else { 985 data.jamoCE32s = base.jamoCE32s; 986 } 987 data.unsafeBackwardSet = unsafeBackwardSet; 988 } 989 clearContexts()990 protected void clearContexts() { 991 contexts.setLength(0); 992 UnicodeSetIterator iter = new UnicodeSetIterator(contextChars); 993 while(iter.next()) { 994 assert(iter.codepoint != UnicodeSetIterator.IS_STRING); 995 int ce32 = trie.get(iter.codepoint); 996 assert(isBuilderContextCE32(ce32)); 997 getConditionalCE32ForCE32(ce32).builtCE32 = Collation.NO_CE32; 998 } 999 } 1000 buildContexts()1001 protected void buildContexts() { 1002 // Ignore abandoned lists and the cached builtCE32, 1003 // and build all contexts from scratch. 1004 contexts.setLength(0); 1005 UnicodeSetIterator iter = new UnicodeSetIterator(contextChars); 1006 while(iter.next()) { 1007 assert(iter.codepoint != UnicodeSetIterator.IS_STRING); 1008 int c = iter.codepoint; 1009 int ce32 = trie.get(c); 1010 if(!isBuilderContextCE32(ce32)) { 1011 throw new AssertionError("Impossible: No context data for c in contextChars."); 1012 } 1013 ConditionalCE32 cond = getConditionalCE32ForCE32(ce32); 1014 ce32 = buildContext(cond); 1015 trie.set(c, ce32); 1016 } 1017 } 1018 buildContext(ConditionalCE32 head)1019 protected int buildContext(ConditionalCE32 head) { 1020 // The list head must have no context. 1021 assert(!head.hasContext()); 1022 // The list head must be followed by one or more nodes that all do have context. 1023 assert(head.next >= 0); 1024 CharsTrieBuilder prefixBuilder = new CharsTrieBuilder(); 1025 CharsTrieBuilder contractionBuilder = new CharsTrieBuilder(); 1026 for(ConditionalCE32 cond = head;; cond = getConditionalCE32(cond.next)) { 1027 // After the list head, the prefix or suffix can be empty, but not both. 1028 assert(cond == head || cond.hasContext()); 1029 int prefixLength = cond.prefixLength(); 1030 StringBuilder prefix = new StringBuilder().append(cond.context, 0, prefixLength + 1); 1031 String prefixString = prefix.toString(); 1032 // Collect all contraction suffixes for one prefix. 1033 ConditionalCE32 firstCond = cond; 1034 ConditionalCE32 lastCond = cond; 1035 while(cond.next >= 0 && 1036 (cond = getConditionalCE32(cond.next)).context.startsWith(prefixString)) { 1037 lastCond = cond; 1038 } 1039 int ce32; 1040 int suffixStart = prefixLength + 1; // == prefix.length() 1041 if(lastCond.context.length() == suffixStart) { 1042 // One prefix without contraction suffix. 1043 assert(firstCond == lastCond); 1044 ce32 = lastCond.ce32; 1045 cond = lastCond; 1046 } else { 1047 // Build the contractions trie. 1048 contractionBuilder.clear(); 1049 // Entry for an empty suffix, to be stored before the trie. 1050 int emptySuffixCE32 = Collation.NO_CE32; // Will always be set to a real value. 1051 int flags = 0; 1052 if(firstCond.context.length() == suffixStart) { 1053 // There is a mapping for the prefix and the single character c. (p|c) 1054 // If no other suffix matches, then we return this value. 1055 emptySuffixCE32 = firstCond.ce32; 1056 cond = getConditionalCE32(firstCond.next); 1057 } else { 1058 // There is no mapping for the prefix and just the single character. 1059 // (There is no p|c, only p|cd, p|ce etc.) 1060 flags |= Collation.CONTRACT_SINGLE_CP_NO_MATCH; 1061 // When the prefix matches but none of the prefix-specific suffixes, 1062 // then we fall back to the mappings with the next-longest prefix, 1063 // and ultimately to mappings with no prefix. 1064 // Each fallback might be another set of contractions. 1065 // For example, if there are mappings for ch, p|cd, p|ce, but not for p|c, 1066 // then in text "pch" we find the ch contraction. 1067 for(cond = head;; cond = getConditionalCE32(cond.next)) { 1068 int length = cond.prefixLength(); 1069 if(length == prefixLength) { break; } 1070 if(cond.defaultCE32 != Collation.NO_CE32 && 1071 (length==0 || prefixString.regionMatches( 1072 prefix.length() - length, cond.context, 1, length) 1073 /* C++: prefix.endsWith(cond.context, 1, length) */)) { 1074 emptySuffixCE32 = cond.defaultCE32; 1075 } 1076 } 1077 cond = firstCond; 1078 } 1079 // Optimization: Set a flag when 1080 // the first character of every contraction suffix has lccc!=0. 1081 // Short-circuits contraction matching when a normal letter follows. 1082 flags |= Collation.CONTRACT_NEXT_CCC; 1083 // Add all of the non-empty suffixes into the contraction trie. 1084 for(;;) { 1085 String suffix = cond.context.substring(suffixStart); 1086 int fcd16 = nfcImpl.getFCD16(suffix.codePointAt(0)); 1087 if(fcd16 <= 0xff) { 1088 flags &= ~Collation.CONTRACT_NEXT_CCC; 1089 } 1090 fcd16 = nfcImpl.getFCD16(suffix.codePointBefore(suffix.length())); 1091 if(fcd16 > 0xff) { 1092 // The last suffix character has lccc!=0, allowing for discontiguous contractions. 1093 flags |= Collation.CONTRACT_TRAILING_CCC; 1094 } 1095 contractionBuilder.add(suffix, cond.ce32); 1096 if(cond == lastCond) { break; } 1097 cond = getConditionalCE32(cond.next); 1098 } 1099 int index = addContextTrie(emptySuffixCE32, contractionBuilder); 1100 if(index > Collation.MAX_INDEX) { 1101 throw new IndexOutOfBoundsException("too many context-sensitive mappings"); 1102 // BufferOverflowException is a better fit 1103 // but cannot be constructed with a message string. 1104 } 1105 ce32 = Collation.makeCE32FromTagAndIndex(Collation.CONTRACTION_TAG, index) | flags; 1106 } 1107 assert(cond == lastCond); 1108 firstCond.defaultCE32 = ce32; 1109 if(prefixLength == 0) { 1110 if(cond.next < 0) { 1111 // No non-empty prefixes, only contractions. 1112 return ce32; 1113 } 1114 } else { 1115 prefix.delete(0, 1); // Remove the length unit. 1116 prefix.reverse(); 1117 prefixBuilder.add(prefix, ce32); 1118 if(cond.next < 0) { break; } 1119 } 1120 } 1121 assert(head.defaultCE32 != Collation.NO_CE32); 1122 int index = addContextTrie(head.defaultCE32, prefixBuilder); 1123 if(index > Collation.MAX_INDEX) { 1124 throw new IndexOutOfBoundsException("too many context-sensitive mappings"); 1125 // BufferOverflowException is a better fit 1126 // but cannot be constructed with a message string. 1127 } 1128 return Collation.makeCE32FromTagAndIndex(Collation.PREFIX_TAG, index); 1129 } 1130 addContextTrie(int defaultCE32, CharsTrieBuilder trieBuilder)1131 protected int addContextTrie(int defaultCE32, CharsTrieBuilder trieBuilder) { 1132 StringBuilder context = new StringBuilder(); 1133 context.append((char)(defaultCE32 >> 16)).append((char)defaultCE32); 1134 context.append(trieBuilder.buildCharSequence(StringTrieBuilder.Option.SMALL)); 1135 int index = contexts.indexOf(context.toString()); 1136 if(index < 0) { 1137 index = contexts.length(); 1138 contexts.append(context); 1139 } 1140 return index; 1141 } 1142 buildFastLatinTable(CollationData data)1143 protected void buildFastLatinTable(CollationData data) { 1144 if(!fastLatinEnabled) { return; } 1145 1146 fastLatinBuilder = new CollationFastLatinBuilder(); 1147 if(fastLatinBuilder.forData(data)) { 1148 char[] header = fastLatinBuilder.getHeader(); 1149 char[] table = fastLatinBuilder.getTable(); 1150 if(base != null && 1151 Arrays.equals(header, base.fastLatinTableHeader) && 1152 Arrays.equals(table, base.fastLatinTable)) { 1153 // Same fast Latin table as in the base, use that one instead. 1154 fastLatinBuilder = null; 1155 header = base.fastLatinTableHeader; 1156 table = base.fastLatinTable; 1157 } 1158 data.fastLatinTableHeader = header; 1159 data.fastLatinTable = table; 1160 } else { 1161 fastLatinBuilder = null; 1162 } 1163 } 1164 getCEs(CharSequence s, int start, long ces[], int cesLength)1165 protected int getCEs(CharSequence s, int start, long ces[], int cesLength) { 1166 if(collIter == null) { 1167 collIter = new DataBuilderCollationIterator(this, new CollationData(nfcImpl)); 1168 if(collIter == null) { return 0; } 1169 } 1170 return collIter.fetchCEs(s, start, ces, cesLength); 1171 } 1172 jamoCpFromIndex(int i)1173 protected static int jamoCpFromIndex(int i) { 1174 // 0 <= i < CollationData.JAMO_CE32S_LENGTH = 19 + 21 + 27 1175 if(i < Hangul.JAMO_L_COUNT) { return Hangul.JAMO_L_BASE + i; } 1176 i -= Hangul.JAMO_L_COUNT; 1177 if(i < Hangul.JAMO_V_COUNT) { return Hangul.JAMO_V_BASE + i; } 1178 i -= Hangul.JAMO_V_COUNT; 1179 // i < 27 1180 return Hangul.JAMO_T_BASE + 1 + i; 1181 } 1182 1183 /** 1184 * Build-time collation element and character iterator. 1185 * Uses the runtime CollationIterator for fetching CEs for a string 1186 * but reads from the builder's unfinished data structures. 1187 * In particular, this class reads from the unfinished trie 1188 * and has to avoid CollationIterator.nextCE() and redirect other 1189 * calls to data.getCE32() and data.getCE32FromSupplementary(). 1190 * 1191 * We do this so that we need not implement the collation algorithm 1192 * again for the builder and make it behave exactly like the runtime code. 1193 * That would be more difficult to test and maintain than this indirection. 1194 * 1195 * Some CE32 tags (for example, the DIGIT_TAG) do not occur in the builder data, 1196 * so the data accesses from those code paths need not be modified. 1197 * 1198 * This class iterates directly over whole code points 1199 * so that the CollationIterator does not need the finished trie 1200 * for handling the LEAD_SURROGATE_TAG. 1201 */ 1202 private static final class DataBuilderCollationIterator extends CollationIterator { DataBuilderCollationIterator(CollationDataBuilder b, CollationData newData)1203 DataBuilderCollationIterator(CollationDataBuilder b, CollationData newData) { 1204 super(newData, /*numeric=*/ false); 1205 builder = b; 1206 builderData = newData; 1207 builderData.base = builder.base; 1208 // Set all of the jamoCE32s[] to indirection CE32s. 1209 for(int j = 0; j < CollationData.JAMO_CE32S_LENGTH; ++j) { // Count across Jamo types. 1210 int jamo = CollationDataBuilder.jamoCpFromIndex(j); 1211 jamoCE32s[j] = Collation.makeCE32FromTagAndIndex(Collation.BUILDER_DATA_TAG, jamo) | 1212 CollationDataBuilder.IS_BUILDER_JAMO_CE32; 1213 } 1214 builderData.jamoCE32s = jamoCE32s; 1215 } 1216 fetchCEs(CharSequence str, int start, long ces[], int cesLength)1217 int fetchCEs(CharSequence str, int start, long ces[], int cesLength) { 1218 // Set the pointers each time, in case they changed due to reallocation. 1219 builderData.ce32s = builder.ce32s.getBuffer(); 1220 builderData.ces = builder.ce64s.getBuffer(); 1221 builderData.contexts = builder.contexts.toString(); 1222 // Modified copy of CollationIterator.nextCE() and CollationIterator.nextCEFromCE32(). 1223 reset(); 1224 s = str; 1225 pos = start; 1226 while(pos < s.length()) { 1227 // No need to keep all CEs in the iterator buffer. 1228 clearCEs(); 1229 int c = Character.codePointAt(s, pos); 1230 pos += Character.charCount(c); 1231 int ce32 = builder.trie.get(c); 1232 CollationData d; 1233 if(ce32 == Collation.FALLBACK_CE32) { 1234 d = builder.base; 1235 ce32 = builder.base.getCE32(c); 1236 } else { 1237 d = builderData; 1238 } 1239 appendCEsFromCE32(d, c, ce32, /*forward=*/ true); 1240 for(int i = 0; i < getCEsLength(); ++i) { 1241 long ce = getCE(i); 1242 if(ce != 0) { 1243 if(cesLength < Collation.MAX_EXPANSION_LENGTH) { 1244 ces[cesLength] = ce; 1245 } 1246 ++cesLength; 1247 } 1248 } 1249 } 1250 return cesLength; 1251 } 1252 1253 @Override resetToOffset(int newOffset)1254 public void resetToOffset(int newOffset) { 1255 reset(); 1256 pos = newOffset; 1257 } 1258 1259 @Override getOffset()1260 public int getOffset() { 1261 return pos; 1262 } 1263 1264 @Override nextCodePoint()1265 public int nextCodePoint() { 1266 if(pos == s.length()) { 1267 return Collation.SENTINEL_CP; 1268 } 1269 int c = Character.codePointAt(s, pos); 1270 pos += Character.charCount(c); 1271 return c; 1272 } 1273 1274 @Override previousCodePoint()1275 public int previousCodePoint() { 1276 if(pos == 0) { 1277 return Collation.SENTINEL_CP; 1278 } 1279 int c = Character.codePointBefore(s, pos); 1280 pos -= Character.charCount(c); 1281 return c; 1282 } 1283 1284 @Override forwardNumCodePoints(int num)1285 protected void forwardNumCodePoints(int num) { 1286 pos = Character.offsetByCodePoints(s, pos, num); 1287 } 1288 1289 @Override backwardNumCodePoints(int num)1290 protected void backwardNumCodePoints(int num) { 1291 pos = Character.offsetByCodePoints(s, pos, -num); 1292 } 1293 1294 @Override getDataCE32(int c)1295 protected int getDataCE32(int c) { 1296 return builder.trie.get(c); 1297 } 1298 1299 @Override getCE32FromBuilderData(int ce32)1300 protected int getCE32FromBuilderData(int ce32) { 1301 assert(Collation.hasCE32Tag(ce32, Collation.BUILDER_DATA_TAG)); 1302 if((ce32 & CollationDataBuilder.IS_BUILDER_JAMO_CE32) != 0) { 1303 int jamo = Collation.indexFromCE32(ce32); 1304 return builder.trie.get(jamo); 1305 } else { 1306 ConditionalCE32 cond = builder.getConditionalCE32ForCE32(ce32); 1307 if(cond.builtCE32 == Collation.NO_CE32) { 1308 // Build the context-sensitive mappings into their runtime form and cache the result. 1309 try { 1310 cond.builtCE32 = builder.buildContext(cond); 1311 } catch(IndexOutOfBoundsException e) { 1312 builder.clearContexts(); 1313 cond.builtCE32 = builder.buildContext(cond); 1314 } 1315 builderData.contexts = builder.contexts.toString(); 1316 } 1317 return cond.builtCE32; 1318 } 1319 } 1320 1321 protected final CollationDataBuilder builder; 1322 protected final CollationData builderData; 1323 protected final int[] jamoCE32s = new int[CollationData.JAMO_CE32S_LENGTH]; 1324 protected CharSequence s; 1325 protected int pos; 1326 } 1327 isMutable()1328 protected final boolean isMutable() { 1329 // C++ tests !(trie == NULL || utrie2_isFrozen(trie)) 1330 // but Java Trie2Writable does not have an observable isFrozen() state. 1331 return trie != null && unsafeBackwardSet != null && !unsafeBackwardSet.isFrozen(); 1332 } 1333 1334 /** @see Collation.BUILDER_DATA_TAG */ 1335 private static final int IS_BUILDER_JAMO_CE32 = 0x100; 1336 1337 protected Normalizer2Impl nfcImpl; 1338 protected CollationData base; 1339 protected CollationSettings baseSettings; 1340 protected Trie2Writable trie; 1341 protected UVector32 ce32s; 1342 protected UVector64 ce64s; 1343 protected ArrayList<ConditionalCE32> conditionalCE32s; // vector of ConditionalCE32 1344 // Characters that have context (prefixes or contraction suffixes). 1345 protected UnicodeSet contextChars = new UnicodeSet(); 1346 // Serialized UCharsTrie structures for finalized contexts. 1347 protected StringBuilder contexts = new StringBuilder(); 1348 protected UnicodeSet unsafeBackwardSet = new UnicodeSet(); 1349 protected boolean modified; 1350 1351 protected boolean fastLatinEnabled; 1352 protected CollationFastLatinBuilder fastLatinBuilder; 1353 1354 protected DataBuilderCollationIterator collIter; 1355 } 1356