1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2009-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 package com.ibm.icu.impl; 11 12 import java.io.IOException; 13 import java.nio.ByteBuffer; 14 import java.util.ArrayList; 15 16 import com.ibm.icu.text.UTF16; 17 import com.ibm.icu.text.UnicodeSet; 18 import com.ibm.icu.util.CodePointMap; 19 import com.ibm.icu.util.CodePointTrie; 20 import com.ibm.icu.util.ICUUncheckedIOException; 21 import com.ibm.icu.util.MutableCodePointTrie; 22 import com.ibm.icu.util.VersionInfo; 23 24 /** 25 * Low-level implementation of the Unicode Normalization Algorithm. 26 * For the data structure and details see the documentation at the end of 27 * C++ normalizer2impl.h and in the design doc at 28 * http://site.icu-project.org/design/normalization/custom 29 */ 30 public final class Normalizer2Impl { 31 public static final class Hangul { 32 /* Korean Hangul and Jamo constants */ 33 public static final int JAMO_L_BASE=0x1100; /* "lead" jamo */ 34 public static final int JAMO_L_END=0x1112; 35 public static final int JAMO_V_BASE=0x1161; /* "vowel" jamo */ 36 public static final int JAMO_V_END=0x1175; 37 public static final int JAMO_T_BASE=0x11a7; /* "trail" jamo */ 38 public static final int JAMO_T_END=0x11c2; 39 40 public static final int HANGUL_BASE=0xac00; 41 public static final int HANGUL_END=0xd7a3; 42 43 public static final int JAMO_L_COUNT=19; 44 public static final int JAMO_V_COUNT=21; 45 public static final int JAMO_T_COUNT=28; 46 47 public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT; 48 public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT; 49 50 public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT; 51 52 public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT; 53 public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT; 54 isHangul(int c)55 public static boolean isHangul(int c) { 56 return HANGUL_BASE<=c && c<HANGUL_LIMIT; 57 } isHangulLV(int c)58 public static boolean isHangulLV(int c) { 59 c-=HANGUL_BASE; 60 return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0; 61 } isJamoL(int c)62 public static boolean isJamoL(int c) { 63 return JAMO_L_BASE<=c && c<JAMO_L_LIMIT; 64 } isJamoV(int c)65 public static boolean isJamoV(int c) { 66 return JAMO_V_BASE<=c && c<JAMO_V_LIMIT; 67 } isJamoT(int c)68 public static boolean isJamoT(int c) { 69 int t=c-JAMO_T_BASE; 70 return 0<t && t<JAMO_T_COUNT; // not JAMO_T_BASE itself 71 } isJamo(int c)72 public static boolean isJamo(int c) { 73 return JAMO_L_BASE<=c && c<=JAMO_T_END && 74 (c<=JAMO_L_END || (JAMO_V_BASE<=c && c<=JAMO_V_END) || JAMO_T_BASE<c); 75 } 76 77 /** 78 * Decomposes c, which must be a Hangul syllable, into buffer 79 * and returns the length of the decomposition (2 or 3). 80 */ decompose(int c, Appendable buffer)81 public static int decompose(int c, Appendable buffer) { 82 try { 83 c-=HANGUL_BASE; 84 int c2=c%JAMO_T_COUNT; 85 c/=JAMO_T_COUNT; 86 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); 87 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); 88 if(c2==0) { 89 return 2; 90 } else { 91 buffer.append((char)(JAMO_T_BASE+c2)); 92 return 3; 93 } 94 } catch(IOException e) { 95 // Will not occur because we do not write to I/O. 
96 throw new ICUUncheckedIOException(e); 97 } 98 } 99 100 /** 101 * Decomposes c, which must be a Hangul syllable, into buffer. 102 * This is the raw, not recursive, decomposition. Its length is always 2. 103 */ getRawDecomposition(int c, Appendable buffer)104 public static void getRawDecomposition(int c, Appendable buffer) { 105 try { 106 int orig=c; 107 c-=HANGUL_BASE; 108 int c2=c%JAMO_T_COUNT; 109 if(c2==0) { 110 c/=JAMO_T_COUNT; 111 buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT)); 112 buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT)); 113 } else { 114 buffer.append((char)(orig-c2)); // LV syllable 115 buffer.append((char)(JAMO_T_BASE+c2)); 116 } 117 } catch(IOException e) { 118 // Will not occur because we do not write to I/O. 119 throw new ICUUncheckedIOException(e); 120 } 121 } 122 } 123 124 /** 125 * Writable buffer that takes care of canonical ordering. 126 * Its Appendable methods behave like the C++ implementation's 127 * appendZeroCC() methods. 128 * <p> 129 * If dest is a StringBuilder, then the buffer writes directly to it. 130 * Otherwise, the buffer maintains a StringBuilder for intermediate text segments 131 * until no further changes are necessary and whole segments are appended. 132 * append() methods that take combining-class values always write to the StringBuilder. 133 * Other append() methods flush and append to the Appendable. 134 */ 135 public static final class ReorderingBuffer implements Appendable { ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity)136 public ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity) { 137 impl=ni; 138 app=dest; 139 if(app instanceof StringBuilder) { 140 appIsStringBuilder=true; 141 str=(StringBuilder)dest; 142 // In Java, the constructor subsumes public void init(int destCapacity) { 143 str.ensureCapacity(destCapacity); 144 reorderStart=0; 145 if(str.length()==0) { 146 lastCC=0; 147 } else { 148 setIterator(); 149 lastCC=previousCC(); 150 // Set reorderStart after the last code point with cc<=1 if there is one. 151 if(lastCC>1) { 152 while(previousCC()>1) {} 153 } 154 reorderStart=codePointLimit; 155 } 156 } else { 157 appIsStringBuilder=false; 158 str=new StringBuilder(); 159 reorderStart=0; 160 lastCC=0; 161 } 162 } 163 isEmpty()164 public boolean isEmpty() { return str.length()==0; } length()165 public int length() { return str.length(); } getLastCC()166 public int getLastCC() { return lastCC; } 167 getStringBuilder()168 public StringBuilder getStringBuilder() { return str; } 169 equals(CharSequence s, int start, int limit)170 public boolean equals(CharSequence s, int start, int limit) { 171 return UTF16Plus.equal(str, 0, str.length(), s, start, limit); 172 } 173 append(int c, int cc)174 public void append(int c, int cc) { 175 if(lastCC<=cc || cc==0) { 176 str.appendCodePoint(c); 177 lastCC=cc; 178 if(cc<=1) { 179 reorderStart=str.length(); 180 } 181 } else { 182 insert(c, cc); 183 } 184 } append(CharSequence s, int start, int limit, boolean isNFD, int leadCC, int trailCC)185 public void append(CharSequence s, int start, int limit, boolean isNFD, 186 int leadCC, int trailCC) { 187 if(start==limit) { 188 return; 189 } 190 if(lastCC<=leadCC || leadCC==0) { 191 if(trailCC<=1) { 192 reorderStart=str.length()+(limit-start); 193 } else if(leadCC<=1) { 194 reorderStart=str.length()+1; // Ok if not a code point boundary. 
195 } 196 str.append(s, start, limit); 197 lastCC=trailCC; 198 } else { 199 int c=Character.codePointAt(s, start); 200 start+=Character.charCount(c); 201 insert(c, leadCC); // insert first code point 202 while(start<limit) { 203 c=Character.codePointAt(s, start); 204 start+=Character.charCount(c); 205 if(start<limit) { 206 if (isNFD) { 207 leadCC = getCCFromYesOrMaybe(impl.getNorm16(c)); 208 } else { 209 leadCC = impl.getCC(impl.getNorm16(c)); 210 } 211 } else { 212 leadCC=trailCC; 213 } 214 append(c, leadCC); 215 } 216 } 217 } 218 // The following append() methods work like C++ appendZeroCC(). 219 // They assume that the cc or trailCC of their input is 0. 220 // Most of them implement Appendable interface methods. 221 @Override append(char c)222 public ReorderingBuffer append(char c) { 223 str.append(c); 224 lastCC=0; 225 reorderStart=str.length(); 226 return this; 227 } appendZeroCC(int c)228 public void appendZeroCC(int c) { 229 str.appendCodePoint(c); 230 lastCC=0; 231 reorderStart=str.length(); 232 } 233 @Override append(CharSequence s)234 public ReorderingBuffer append(CharSequence s) { 235 if(s.length()!=0) { 236 str.append(s); 237 lastCC=0; 238 reorderStart=str.length(); 239 } 240 return this; 241 } 242 @Override append(CharSequence s, int start, int limit)243 public ReorderingBuffer append(CharSequence s, int start, int limit) { 244 if(start!=limit) { 245 str.append(s, start, limit); 246 lastCC=0; 247 reorderStart=str.length(); 248 } 249 return this; 250 } 251 /** 252 * Flushes from the intermediate StringBuilder to the Appendable, 253 * if they are different objects. 254 * Used after recomposition. 255 * Must be called at the end when writing to a non-StringBuilder Appendable. 256 */ flush()257 public void flush() { 258 if(appIsStringBuilder) { 259 reorderStart=str.length(); 260 } else { 261 try { 262 app.append(str); 263 str.setLength(0); 264 reorderStart=0; 265 } catch(IOException e) { 266 throw new ICUUncheckedIOException(e); // Avoid declaring "throws IOException". 267 } 268 } 269 lastCC=0; 270 } 271 /** 272 * Flushes from the intermediate StringBuilder to the Appendable, 273 * if they are different objects. 274 * Then appends the new text to the Appendable or StringBuilder. 275 * Normally used after quick check loops find a non-empty sequence. 276 */ flushAndAppendZeroCC(CharSequence s, int start, int limit)277 public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) { 278 if(appIsStringBuilder) { 279 str.append(s, start, limit); 280 reorderStart=str.length(); 281 } else { 282 try { 283 app.append(str).append(s, start, limit); 284 str.setLength(0); 285 reorderStart=0; 286 } catch(IOException e) { 287 throw new ICUUncheckedIOException(e); // Avoid declaring "throws IOException". 288 } 289 } 290 lastCC=0; 291 return this; 292 } remove()293 public void remove() { 294 str.setLength(0); 295 lastCC=0; 296 reorderStart=0; 297 } removeSuffix(int suffixLength)298 public void removeSuffix(int suffixLength) { 299 int oldLength=str.length(); 300 str.delete(oldLength-suffixLength, oldLength); 301 lastCC=0; 302 reorderStart=str.length(); 303 } 304 305 /* 306 * TODO: Revisit whether it makes sense to track reorderStart. 307 * It is set to after the last known character with cc<=1, 308 * which stops previousCC() before it reads that character and looks up its cc. 309 * previousCC() is normally only called from insert(). 
310 * In other words, reorderStart speeds up the insertion of a combining mark 311 * into a multi-combining mark sequence where it does not belong at the end. 312 * This might not be worth the trouble. 313 * On the other hand, it's not a huge amount of trouble. 314 * 315 * We probably need it for UNORM_SIMPLE_APPEND. 316 */ 317 318 // Inserts c somewhere before the last character. 319 // Requires 0<cc<lastCC which implies reorderStart<limit. insert(int c, int cc)320 private void insert(int c, int cc) { 321 for(setIterator(), skipPrevious(); previousCC()>cc;) {} 322 // insert c at codePointLimit, after the character with prevCC<=cc 323 if(c<=0xffff) { 324 str.insert(codePointLimit, (char)c); 325 if(cc<=1) { 326 reorderStart=codePointLimit+1; 327 } 328 } else { 329 str.insert(codePointLimit, Character.toChars(c)); 330 if(cc<=1) { 331 reorderStart=codePointLimit+2; 332 } 333 } 334 } 335 336 private final Normalizer2Impl impl; 337 private final Appendable app; 338 private final StringBuilder str; 339 private final boolean appIsStringBuilder; 340 private int reorderStart; 341 private int lastCC; 342 343 // private backward iterator setIterator()344 private void setIterator() { codePointStart=str.length(); } skipPrevious()345 private void skipPrevious() { // Requires 0<codePointStart. 346 codePointLimit=codePointStart; 347 codePointStart=str.offsetByCodePoints(codePointStart, -1); 348 } previousCC()349 private int previousCC() { // Returns 0 if there is no previous character. 350 codePointLimit=codePointStart; 351 if(reorderStart>=codePointStart) { 352 return 0; 353 } 354 int c=str.codePointBefore(codePointStart); 355 codePointStart-=Character.charCount(c); 356 return impl.getCCFromYesOrMaybeCP(c); 357 } 358 359 private int codePointStart, codePointLimit; 360 } 361 362 // TODO: Propose as public API on the UTF16 class. 363 // TODO: Propose widening UTF16 methods that take char to take int. 364 // TODO: Propose widening UTF16 methods that take String to take CharSequence. 365 public static final class UTF16Plus { 366 /** 367 * Is this code point a lead surrogate (U+d800..U+dbff)? 368 * @param c code unit or code point 369 * @return true or false 370 */ isLeadSurrogate(int c)371 public static boolean isLeadSurrogate(int c) { return (c & 0xfffffc00) == 0xd800; } 372 /** 373 * Is this code point a trail surrogate (U+dc00..U+dfff)? 374 * @param c code unit or code point 375 * @return true or false 376 */ isTrailSurrogate(int c)377 public static boolean isTrailSurrogate(int c) { return (c & 0xfffffc00) == 0xdc00; } 378 /** 379 * Is this code point a surrogate (U+d800..U+dfff)? 380 * @param c code unit or code point 381 * @return true or false 382 */ isSurrogate(int c)383 public static boolean isSurrogate(int c) { return (c & 0xfffff800) == 0xd800; } 384 /** 385 * Assuming c is a surrogate code point (UTF16.isSurrogate(c)), 386 * is it a lead surrogate? 387 * @param c code unit or code point 388 * @return true or false 389 */ isSurrogateLead(int c)390 public static boolean isSurrogateLead(int c) { return (c&0x400)==0; } 391 /** 392 * Compares two CharSequence objects for binary equality. 
393 * @param s1 first sequence 394 * @param s2 second sequence 395 * @return true if s1 contains the same text as s2 396 */ equal(CharSequence s1, CharSequence s2)397 public static boolean equal(CharSequence s1, CharSequence s2) { 398 if(s1==s2) { 399 return true; 400 } 401 int length=s1.length(); 402 if(length!=s2.length()) { 403 return false; 404 } 405 for(int i=0; i<length; ++i) { 406 if(s1.charAt(i)!=s2.charAt(i)) { 407 return false; 408 } 409 } 410 return true; 411 } 412 /** 413 * Compares two CharSequence subsequences for binary equality. 414 * @param s1 first sequence 415 * @param start1 start offset in first sequence 416 * @param limit1 limit offset in first sequence 417 * @param s2 second sequence 418 * @param start2 start offset in second sequence 419 * @param limit2 limit offset in second sequence 420 * @return true if s1.subSequence(start1, limit1) contains the same text 421 * as s2.subSequence(start2, limit2) 422 */ equal(CharSequence s1, int start1, int limit1, CharSequence s2, int start2, int limit2)423 public static boolean equal(CharSequence s1, int start1, int limit1, 424 CharSequence s2, int start2, int limit2) { 425 if((limit1-start1)!=(limit2-start2)) { 426 return false; 427 } 428 if(s1==s2 && start1==start2) { 429 return true; 430 } 431 while(start1<limit1) { 432 if(s1.charAt(start1++)!=s2.charAt(start2++)) { 433 return false; 434 } 435 } 436 return true; 437 } 438 } 439 Normalizer2Impl()440 public Normalizer2Impl() {} 441 442 private static final class IsAcceptable implements ICUBinary.Authenticate { 443 @Override isDataVersionAcceptable(byte version[])444 public boolean isDataVersionAcceptable(byte version[]) { 445 return version[0]==4; 446 } 447 } 448 private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable(); 449 private static final int DATA_FORMAT = 0x4e726d32; // "Nrm2" 450 load(ByteBuffer bytes)451 public Normalizer2Impl load(ByteBuffer bytes) { 452 try { 453 dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE); 454 int indexesLength=bytes.getInt()/4; // inIndexes[IX_NORM_TRIE_OFFSET]/4 455 if(indexesLength<=IX_MIN_LCCC_CP) { 456 throw new ICUUncheckedIOException("Normalizer2 data: not enough indexes"); 457 } 458 int[] inIndexes=new int[indexesLength]; 459 inIndexes[0]=indexesLength*4; 460 for(int i=1; i<indexesLength; ++i) { 461 inIndexes[i]=bytes.getInt(); 462 } 463 464 minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP]; 465 minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP]; 466 minLcccCP=inIndexes[IX_MIN_LCCC_CP]; 467 468 minYesNo=inIndexes[IX_MIN_YES_NO]; 469 minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]; 470 minNoNo=inIndexes[IX_MIN_NO_NO]; 471 minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]; 472 minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]; 473 minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY]; 474 limitNoNo=inIndexes[IX_LIMIT_NO_NO]; 475 minMaybeYes=inIndexes[IX_MIN_MAYBE_YES]; 476 assert((minMaybeYes&7)==0); // 8-aligned for noNoDelta bit fields 477 centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1; 478 479 // Read the normTrie. 
480 int offset=inIndexes[IX_NORM_TRIE_OFFSET]; 481 int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET]; 482 int triePosition = bytes.position(); 483 normTrie = CodePointTrie.Fast16.fromBinary(bytes); 484 int trieLength = bytes.position() - triePosition; 485 if(trieLength>(nextOffset-offset)) { 486 throw new ICUUncheckedIOException("Normalizer2 data: not enough bytes for normTrie"); 487 } 488 ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength); // skip padding after trie bytes 489 490 // Read the composition and mapping data. 491 offset=nextOffset; 492 nextOffset=inIndexes[IX_SMALL_FCD_OFFSET]; 493 int numChars=(nextOffset-offset)/2; 494 if(numChars!=0) { 495 maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0); 496 extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT); 497 } 498 499 // smallFCD: new in formatVersion 2 500 offset=nextOffset; 501 smallFCD=new byte[0x100]; 502 bytes.get(smallFCD); 503 504 return this; 505 } catch(IOException e) { 506 throw new ICUUncheckedIOException(e); 507 } 508 } load(String name)509 public Normalizer2Impl load(String name) { 510 return load(ICUBinary.getRequiredData(name)); 511 } 512 addLcccChars(UnicodeSet set)513 public void addLcccChars(UnicodeSet set) { 514 int start = 0; 515 CodePointMap.Range range = new CodePointMap.Range(); 516 while (normTrie.getRange(start, CodePointMap.RangeOption.FIXED_LEAD_SURROGATES, INERT, 517 null, range)) { 518 int end = range.getEnd(); 519 int norm16 = range.getValue(); 520 if (norm16 > MIN_NORMAL_MAYBE_YES && norm16 != JAMO_VT) { 521 set.add(start, end); 522 } else if (minNoNoCompNoMaybeCC <= norm16 && norm16 < limitNoNo) { 523 int fcd16 = getFCD16(start); 524 if (fcd16 > 0xff) { set.add(start, end); } 525 } 526 start = end + 1; 527 } 528 } 529 addPropertyStarts(UnicodeSet set)530 public void addPropertyStarts(UnicodeSet set) { 531 // Add the start code point of each same-value range of the trie. 532 int start = 0; 533 CodePointMap.Range range = new CodePointMap.Range(); 534 while (normTrie.getRange(start, CodePointMap.RangeOption.FIXED_LEAD_SURROGATES, INERT, 535 null, range)) { 536 int end = range.getEnd(); 537 int value = range.getValue(); 538 set.add(start); 539 if (start != end && isAlgorithmicNoNo(value) && 540 (value & DELTA_TCCC_MASK) > DELTA_TCCC_1) { 541 // Range of code points with same-norm16-value algorithmic decompositions. 542 // They might have different non-zero FCD16 values. 543 int prevFCD16 = getFCD16(start); 544 while (++start <= end) { 545 int fcd16 = getFCD16(start); 546 if (fcd16 != prevFCD16) { 547 set.add(start); 548 prevFCD16 = fcd16; 549 } 550 } 551 } 552 start = end + 1; 553 } 554 555 /* add Hangul LV syllables and LV+1 because of skippables */ 556 for(int c=Hangul.HANGUL_BASE; c<Hangul.HANGUL_LIMIT; c+=Hangul.JAMO_T_COUNT) { 557 set.add(c); 558 set.add(c+1); 559 } 560 set.add(Hangul.HANGUL_LIMIT); /* add Hangul+1 to continue with other properties */ 561 } 562 addCanonIterPropertyStarts(UnicodeSet set)563 public void addCanonIterPropertyStarts(UnicodeSet set) { 564 // Add the start code point of each same-value range of the canonical iterator data trie. 565 ensureCanonIterData(); 566 // Currently only used for the SEGMENT_STARTER property. 
567 int start = 0; 568 CodePointMap.Range range = new CodePointMap.Range(); 569 while (canonIterData.getRange(start, segmentStarterMapper, range)) { 570 set.add(start); 571 start = range.getEnd() + 1; 572 } 573 } 574 private static final CodePointMap.ValueFilter segmentStarterMapper = 575 new CodePointMap.ValueFilter() { 576 @Override 577 public int apply(int value) { 578 return value & CANON_NOT_SEGMENT_STARTER; 579 } 580 }; 581 582 // low-level properties ------------------------------------------------ *** 583 584 // Note: Normalizer2Impl.java r30983 (2011-nov-27) 585 // still had getFCDTrie() which built and cached an FCD trie. 586 // That provided faster access to FCD data than getFCD16FromNormData() 587 // but required synchronization and consumed some 10kB of heap memory 588 // in any process that uses FCD (e.g., via collation). 589 // minDecompNoCP etc. and smallFCD[] are intended to help with any loss of performance, 590 // at least for ASCII & CJK. 591 592 /** 593 * Builds the canonical-iterator data for this instance. 594 * This is required before any of {@link #isCanonSegmentStarter(int)} or 595 * {@link #getCanonStartSet(int, UnicodeSet)} are called, 596 * or else they crash. 597 * @return this 598 */ ensureCanonIterData()599 public synchronized Normalizer2Impl ensureCanonIterData() { 600 if(canonIterData==null) { 601 MutableCodePointTrie mutableTrie = new MutableCodePointTrie(0, 0); 602 canonStartSets=new ArrayList<UnicodeSet>(); 603 int start = 0; 604 CodePointMap.Range range = new CodePointMap.Range(); 605 while (normTrie.getRange(start, CodePointMap.RangeOption.FIXED_LEAD_SURROGATES, INERT, 606 null, range)) { 607 final int end = range.getEnd(); 608 final int norm16 = range.getValue(); 609 if(isInert(norm16) || (minYesNo<=norm16 && norm16<minNoNo)) { 610 // Inert, or 2-way mapping (including Hangul syllable). 611 // We do not write a canonStartSet for any yesNo character. 612 // Composites from 2-way mappings are added at runtime from the 613 // starter's compositions list, and the other characters in 614 // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are 615 // "maybe" characters. 616 start = end + 1; 617 continue; 618 } 619 for (int c = start; c <= end; ++c) { 620 final int oldValue = mutableTrie.get(c); 621 int newValue=oldValue; 622 if(isMaybeOrNonZeroCC(norm16)) { 623 // not a segment starter if it occurs in a decomposition or has cc!=0 624 newValue|=CANON_NOT_SEGMENT_STARTER; 625 if(norm16<MIN_NORMAL_MAYBE_YES) { 626 newValue|=CANON_HAS_COMPOSITIONS; 627 } 628 } else if(norm16<minYesNo) { 629 newValue|=CANON_HAS_COMPOSITIONS; 630 } else { 631 // c has a one-way decomposition 632 int c2=c; 633 // Do not modify the whole-range norm16 value. 634 int norm16_2=norm16; 635 if (isDecompNoAlgorithmic(norm16_2)) { 636 // Maps to an isCompYesAndZeroCC. 637 c2 = mapAlgorithmic(c2, norm16_2); 638 norm16_2 = getRawNorm16(c2); 639 // No compatibility mappings for the CanonicalIterator. 640 assert(!(isHangulLV(norm16_2) || isHangulLVT(norm16_2))); 641 } 642 if (norm16_2 > minYesNo) { 643 // c decomposes, get everything from the variable-length extra data 644 int mapping=norm16_2>>OFFSET_SHIFT; 645 int firstUnit=extraData.charAt(mapping); 646 int length=firstUnit&MAPPING_LENGTH_MASK; 647 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 648 if(c==c2 && (extraData.charAt(mapping-1)&0xff)!=0) { 649 newValue|=CANON_NOT_SEGMENT_STARTER; // original c has cc!=0 650 } 651 } 652 // Skip empty mappings (no characters in the decomposition). 
653 if(length!=0) { 654 ++mapping; // skip over the firstUnit 655 // add c to first code point's start set 656 int limit=mapping+length; 657 c2=extraData.codePointAt(mapping); 658 addToStartSet(mutableTrie, c, c2); 659 // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a 660 // one-way mapping. A 2-way mapping is possible here after 661 // intermediate algorithmic mapping. 662 if(norm16_2>=minNoNo) { 663 while((mapping+=Character.charCount(c2))<limit) { 664 c2=extraData.codePointAt(mapping); 665 int c2Value = mutableTrie.get(c2); 666 if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) { 667 mutableTrie.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER); 668 } 669 } 670 } 671 } 672 } else { 673 // c decomposed to c2 algorithmically; c has cc==0 674 addToStartSet(mutableTrie, c, c2); 675 } 676 } 677 if(newValue!=oldValue) { 678 mutableTrie.set(c, newValue); 679 } 680 } 681 start = end + 1; 682 } 683 canonIterData = mutableTrie.buildImmutable( 684 CodePointTrie.Type.SMALL, CodePointTrie.ValueWidth.BITS_32); 685 } 686 return this; 687 } 688 689 // The trie stores values for lead surrogate code *units*. 690 // Surrogate code *points* are inert. getNorm16(int c)691 public int getNorm16(int c) { 692 return UTF16Plus.isLeadSurrogate(c) ? INERT : normTrie.get(c); 693 } getRawNorm16(int c)694 public int getRawNorm16(int c) { return normTrie.get(c); } 695 getCompQuickCheck(int norm16)696 public int getCompQuickCheck(int norm16) { 697 if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) { 698 return 1; // yes 699 } else if(minMaybeYes<=norm16) { 700 return 2; // maybe 701 } else { 702 return 0; // no 703 } 704 } isAlgorithmicNoNo(int norm16)705 public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; } isCompNo(int norm16)706 public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; } isDecompYes(int norm16)707 public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; } 708 getCC(int norm16)709 public int getCC(int norm16) { 710 if(norm16>=MIN_NORMAL_MAYBE_YES) { 711 return getCCFromNormalYesOrMaybe(norm16); 712 } 713 if(norm16<minNoNo || limitNoNo<=norm16) { 714 return 0; 715 } 716 return getCCFromNoNo(norm16); 717 } getCCFromNormalYesOrMaybe(int norm16)718 public static int getCCFromNormalYesOrMaybe(int norm16) { 719 return (norm16 >> OFFSET_SHIFT) & 0xff; 720 } getCCFromYesOrMaybe(int norm16)721 public static int getCCFromYesOrMaybe(int norm16) { 722 return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0; 723 } getCCFromYesOrMaybeCP(int c)724 public int getCCFromYesOrMaybeCP(int c) { 725 if (c < minCompNoMaybeCP) { return 0; } 726 return getCCFromYesOrMaybe(getNorm16(c)); 727 } 728 729 /** 730 * Returns the FCD data for code point c. 731 * @param c A Unicode code point. 732 * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 733 */ getFCD16(int c)734 public int getFCD16(int c) { 735 if(c<minDecompNoCP) { 736 return 0; 737 } else if(c<=0xffff) { 738 if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; } 739 } 740 return getFCD16FromNormData(c); 741 } 742 /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */ singleLeadMightHaveNonZeroFCD16(int lead)743 public boolean singleLeadMightHaveNonZeroFCD16(int lead) { 744 // 0<=lead<=0xffff 745 byte bits=smallFCD[lead>>8]; 746 if(bits==0) { return false; } 747 return ((bits>>((lead>>5)&7))&1)!=0; 748 } 749 750 /** Gets the FCD value from the regular normalization data. 
*/ getFCD16FromNormData(int c)751 public int getFCD16FromNormData(int c) { 752 int norm16=getNorm16(c); 753 if (norm16 >= limitNoNo) { 754 if(norm16>=MIN_NORMAL_MAYBE_YES) { 755 // combining mark 756 norm16=getCCFromNormalYesOrMaybe(norm16); 757 return norm16|(norm16<<8); 758 } else if(norm16>=minMaybeYes) { 759 return 0; 760 } else { // isDecompNoAlgorithmic(norm16) 761 int deltaTrailCC = norm16 & DELTA_TCCC_MASK; 762 if (deltaTrailCC <= DELTA_TCCC_1) { 763 return deltaTrailCC >> OFFSET_SHIFT; 764 } 765 // Maps to an isCompYesAndZeroCC. 766 c=mapAlgorithmic(c, norm16); 767 norm16 = getRawNorm16(c); 768 } 769 } 770 if(norm16<=minYesNo || isHangulLVT(norm16)) { 771 // no decomposition or Hangul syllable, all zeros 772 return 0; 773 } 774 // c decomposes, get everything from the variable-length extra data 775 int mapping=norm16>>OFFSET_SHIFT; 776 int firstUnit=extraData.charAt(mapping); 777 int fcd16=firstUnit>>8; // tccc 778 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 779 fcd16|=extraData.charAt(mapping-1)&0xff00; // lccc 780 } 781 return fcd16; 782 } 783 784 /** 785 * Gets the decomposition for one code point. 786 * @param c code point 787 * @return c's decomposition, if it has one; returns null if it does not have a decomposition 788 */ getDecomposition(int c)789 public String getDecomposition(int c) { 790 int norm16; 791 if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) { 792 // c does not decompose 793 return null; 794 } 795 int decomp = -1; 796 if(isDecompNoAlgorithmic(norm16)) { 797 // Maps to an isCompYesAndZeroCC. 798 decomp=c=mapAlgorithmic(c, norm16); 799 // The mapping might decompose further. 800 norm16 = getRawNorm16(c); 801 } 802 if (norm16 < minYesNo) { 803 if(decomp<0) { 804 return null; 805 } else { 806 return UTF16.valueOf(decomp); 807 } 808 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 809 // Hangul syllable: decompose algorithmically 810 StringBuilder buffer=new StringBuilder(); 811 Hangul.decompose(c, buffer); 812 return buffer.toString(); 813 } 814 // c decomposes, get everything from the variable-length extra data 815 int mapping=norm16>>OFFSET_SHIFT; 816 int length=extraData.charAt(mapping++)&MAPPING_LENGTH_MASK; 817 return extraData.substring(mapping, mapping+length); 818 } 819 820 /** 821 * Gets the raw decomposition for one code point. 822 * @param c code point 823 * @return c's raw decomposition, if it has one; returns null if it does not have a decomposition 824 */ getRawDecomposition(int c)825 public String getRawDecomposition(int c) { 826 int norm16; 827 if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) { 828 // c does not decompose 829 return null; 830 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 831 // Hangul syllable: decompose algorithmically 832 StringBuilder buffer=new StringBuilder(); 833 Hangul.getRawDecomposition(c, buffer); 834 return buffer.toString(); 835 } else if(isDecompNoAlgorithmic(norm16)) { 836 return UTF16.valueOf(mapAlgorithmic(c, norm16)); 837 } 838 // c decomposes, get everything from the variable-length extra data 839 int mapping=norm16>>OFFSET_SHIFT; 840 int firstUnit=extraData.charAt(mapping); 841 int mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping 842 if((firstUnit&MAPPING_HAS_RAW_MAPPING)!=0) { 843 // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word. 
844 // Bit 7=MAPPING_HAS_CCC_LCCC_WORD 845 int rawMapping=mapping-((firstUnit>>7)&1)-1; 846 char rm0=extraData.charAt(rawMapping); 847 if(rm0<=MAPPING_LENGTH_MASK) { 848 return extraData.substring(rawMapping-rm0, rawMapping); 849 } else { 850 // Copy the normal mapping and replace its first two code units with rm0. 851 StringBuilder buffer=new StringBuilder(mLength-1).append(rm0); 852 mapping+=1+2; // skip over the firstUnit and the first two mapping code units 853 return buffer.append(extraData, mapping, mapping+mLength-2).toString(); 854 } 855 } else { 856 mapping+=1; // skip over the firstUnit 857 return extraData.substring(mapping, mapping+mLength); 858 } 859 } 860 861 /** 862 * Returns true if code point c starts a canonical-iterator string segment. 863 * <b>{@link #ensureCanonIterData()} must have been called before this method, 864 * or else this method will crash.</b> 865 * @param c A Unicode code point. 866 * @return true if c starts a canonical-iterator string segment. 867 */ isCanonSegmentStarter(int c)868 public boolean isCanonSegmentStarter(int c) { 869 return canonIterData.get(c)>=0; 870 } 871 /** 872 * Returns true if there are characters whose decomposition starts with c. 873 * If so, then the set is cleared and then filled with those characters. 874 * <b>{@link #ensureCanonIterData()} must have been called before this method, 875 * or else this method will crash.</b> 876 * @param c A Unicode code point. 877 * @param set A UnicodeSet to receive the characters whose decompositions 878 * start with c, if there are any. 879 * @return true if there are characters whose decomposition starts with c. 880 */ getCanonStartSet(int c, UnicodeSet set)881 public boolean getCanonStartSet(int c, UnicodeSet set) { 882 int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER; 883 if(canonValue==0) { 884 return false; 885 } 886 set.clear(); 887 int value=canonValue&CANON_VALUE_MASK; 888 if((canonValue&CANON_HAS_SET)!=0) { 889 set.addAll(canonStartSets.get(value)); 890 } else if(value!=0) { 891 set.add(value); 892 } 893 if((canonValue&CANON_HAS_COMPOSITIONS)!=0) { 894 int norm16 = getRawNorm16(c); 895 if(norm16==JAMO_L) { 896 int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT; 897 set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1); 898 } else { 899 addComposites(getCompositionsList(norm16), set); 900 } 901 } 902 return true; 903 } 904 905 // Fixed norm16 values. 906 public static final int MIN_YES_YES_WITH_CC=0xfe02; 907 public static final int JAMO_VT=0xfe00; 908 public static final int MIN_NORMAL_MAYBE_YES=0xfc00; 909 public static final int JAMO_L=2; // offset=1 hasCompBoundaryAfter=FALSE 910 public static final int INERT=1; // offset=0 hasCompBoundaryAfter=TRUE 911 912 // norm16 bit 0 is comp-boundary-after. 913 public static final int HAS_COMP_BOUNDARY_AFTER=1; 914 public static final int OFFSET_SHIFT=1; 915 916 // For algorithmic one-way mappings, norm16 bits 2..1 indicate the 917 // tccc (0, 1, >1) for quick FCC boundary-after tests. 918 public static final int DELTA_TCCC_0=0; 919 public static final int DELTA_TCCC_1=2; 920 public static final int DELTA_TCCC_GT_1=4; 921 public static final int DELTA_TCCC_MASK=6; 922 public static final int DELTA_SHIFT=3; 923 924 public static final int MAX_DELTA=0x40; 925 926 // Byte offsets from the start of the data, after the generic header. 
927 public static final int IX_NORM_TRIE_OFFSET=0; 928 public static final int IX_EXTRA_DATA_OFFSET=1; 929 public static final int IX_SMALL_FCD_OFFSET=2; 930 public static final int IX_RESERVED3_OFFSET=3; 931 public static final int IX_TOTAL_SIZE=7; 932 933 // Code point thresholds for quick check codes. 934 public static final int IX_MIN_DECOMP_NO_CP=8; 935 public static final int IX_MIN_COMP_NO_MAYBE_CP=9; 936 937 // Norm16 value thresholds for quick check combinations and types of extra data. 938 939 /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */ 940 public static final int IX_MIN_YES_NO=10; 941 /** Mappings are comp-normalized. */ 942 public static final int IX_MIN_NO_NO=11; 943 public static final int IX_LIMIT_NO_NO=12; 944 public static final int IX_MIN_MAYBE_YES=13; 945 946 /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */ 947 public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14; 948 /** Mappings are not comp-normalized but have a comp boundary before. */ 949 public static final int IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE=15; 950 /** Mappings do not have a comp boundary before. */ 951 public static final int IX_MIN_NO_NO_COMP_NO_MAYBE_CC=16; 952 /** Mappings to the empty string. */ 953 public static final int IX_MIN_NO_NO_EMPTY=17; 954 955 public static final int IX_MIN_LCCC_CP=18; 956 public static final int IX_COUNT=20; 957 958 public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80; 959 public static final int MAPPING_HAS_RAW_MAPPING=0x40; 960 // unused bit 0x20; 961 public static final int MAPPING_LENGTH_MASK=0x1f; 962 963 public static final int COMP_1_LAST_TUPLE=0x8000; 964 public static final int COMP_1_TRIPLE=1; 965 public static final int COMP_1_TRAIL_LIMIT=0x3400; 966 public static final int COMP_1_TRAIL_MASK=0x7ffe; 967 public static final int COMP_1_TRAIL_SHIFT=9; // 10-1 for the "triple" bit 968 public static final int COMP_2_TRAIL_SHIFT=6; 969 public static final int COMP_2_TRAIL_MASK=0xffc0; 970 971 // higher-level functionality ------------------------------------------ *** 972 973 // NFD without an NFD Normalizer2 instance. decompose(CharSequence s, StringBuilder dest)974 public Appendable decompose(CharSequence s, StringBuilder dest) { 975 decompose(s, 0, s.length(), dest, s.length()); 976 return dest; 977 } 978 /** 979 * Decomposes s[src, limit[ and writes the result to dest. 980 * limit can be NULL if src is NUL-terminated. 981 * destLengthEstimate is the initial dest buffer capacity and can be -1. 
982 */ decompose(CharSequence s, int src, int limit, StringBuilder dest, int destLengthEstimate)983 public void decompose(CharSequence s, int src, int limit, StringBuilder dest, 984 int destLengthEstimate) { 985 if(destLengthEstimate<0) { 986 destLengthEstimate=limit-src; 987 } 988 dest.setLength(0); 989 ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate); 990 decompose(s, src, limit, buffer); 991 } 992 993 // Dual functionality: 994 // buffer!=NULL: normalize 995 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes decompose(CharSequence s, int src, int limit, ReorderingBuffer buffer)996 public int decompose(CharSequence s, int src, int limit, 997 ReorderingBuffer buffer) { 998 int minNoCP=minDecompNoCP; 999 1000 int prevSrc; 1001 int c=0; 1002 int norm16=0; 1003 1004 // only for quick check 1005 int prevBoundary=src; 1006 int prevCC=0; 1007 1008 for(;;) { 1009 // count code units below the minimum or with irrelevant data for the quick check 1010 for(prevSrc=src; src!=limit;) { 1011 if( (c=s.charAt(src))<minNoCP || 1012 isMostDecompYesAndZeroCC(norm16=normTrie.bmpGet(c)) 1013 ) { 1014 ++src; 1015 } else if (!UTF16Plus.isLeadSurrogate(c)) { 1016 break; 1017 } else { 1018 char c2; 1019 if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) { 1020 c = Character.toCodePoint((char)c, c2); 1021 norm16 = normTrie.suppGet(c); 1022 if (isMostDecompYesAndZeroCC(norm16)) { 1023 src += 2; 1024 } else { 1025 break; 1026 } 1027 } else { 1028 ++src; // unpaired lead surrogate: inert 1029 } 1030 } 1031 } 1032 // copy these code units all at once 1033 if(src!=prevSrc) { 1034 if(buffer!=null) { 1035 buffer.flushAndAppendZeroCC(s, prevSrc, src); 1036 } else { 1037 prevCC=0; 1038 prevBoundary=src; 1039 } 1040 } 1041 if(src==limit) { 1042 break; 1043 } 1044 1045 // Check one above-minimum, relevant code point. 1046 src+=Character.charCount(c); 1047 if(buffer!=null) { 1048 decompose(c, norm16, buffer); 1049 } else { 1050 if(isDecompYes(norm16)) { 1051 int cc=getCCFromYesOrMaybe(norm16); 1052 if(prevCC<=cc || cc==0) { 1053 prevCC=cc; 1054 if(cc<=1) { 1055 prevBoundary=src; 1056 } 1057 continue; 1058 } 1059 } 1060 return prevBoundary; // "no" or cc out of order 1061 } 1062 } 1063 return src; 1064 } decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer)1065 public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) { 1066 int limit=s.length(); 1067 if(limit==0) { 1068 return; 1069 } 1070 if(doDecompose) { 1071 decompose(s, 0, limit, buffer); 1072 return; 1073 } 1074 // Just merge the strings at the boundary. 1075 int c=Character.codePointAt(s, 0); 1076 int src=0; 1077 int firstCC, prevCC, cc; 1078 firstCC=prevCC=cc=getCC(getNorm16(c)); 1079 while(cc!=0) { 1080 prevCC=cc; 1081 src+=Character.charCount(c); 1082 if(src>=limit) { 1083 break; 1084 } 1085 c=Character.codePointAt(s, src); 1086 cc=getCC(getNorm16(c)); 1087 }; 1088 buffer.append(s, 0, src, false, firstCC, prevCC); 1089 buffer.append(s, src, limit); 1090 } 1091 1092 // Very similar to composeQuickCheck(): Make the same changes in both places if relevant. 
1093 // doCompose: normalize 1094 // !doCompose: isNormalized (buffer must be empty and initialized) compose(CharSequence s, int src, int limit, boolean onlyContiguous, boolean doCompose, ReorderingBuffer buffer)1095 public boolean compose(CharSequence s, int src, int limit, 1096 boolean onlyContiguous, 1097 boolean doCompose, 1098 ReorderingBuffer buffer) { 1099 int prevBoundary=src; 1100 int minNoMaybeCP=minCompNoMaybeCP; 1101 1102 for (;;) { 1103 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, 1104 // or with (compYes && ccc==0) properties. 1105 int prevSrc; 1106 int c = 0; 1107 int norm16 = 0; 1108 for (;;) { 1109 if (src == limit) { 1110 if (prevBoundary != limit && doCompose) { 1111 buffer.append(s, prevBoundary, limit); 1112 } 1113 return true; 1114 } 1115 if( (c=s.charAt(src))<minNoMaybeCP || 1116 isCompYesAndZeroCC(norm16=normTrie.bmpGet(c)) 1117 ) { 1118 ++src; 1119 } else { 1120 prevSrc = src++; 1121 if (!UTF16Plus.isLeadSurrogate(c)) { 1122 break; 1123 } else { 1124 char c2; 1125 if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) { 1126 ++src; 1127 c = Character.toCodePoint((char)c, c2); 1128 norm16 = normTrie.suppGet(c); 1129 if (!isCompYesAndZeroCC(norm16)) { 1130 break; 1131 } 1132 } 1133 } 1134 } 1135 } 1136 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 1137 // The current character is either a "noNo" (has a mapping) 1138 // or a "maybeYes" (combines backward) 1139 // or a "yesYes" with ccc!=0. 1140 // It is not a Hangul syllable or Jamo L because those have "yes" properties. 1141 1142 // Medium-fast path: Handle cases that do not require full decomposition and recomposition. 1143 if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes 1144 if (!doCompose) { 1145 return false; 1146 } 1147 // Fast path for mapping a character that is immediately surrounded by boundaries. 1148 // In this case, we need not decompose around the current character. 1149 if (isDecompNoAlgorithmic(norm16)) { 1150 // Maps to a single isCompYesAndZeroCC character 1151 // which also implies hasCompBoundaryBefore. 1152 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || 1153 hasCompBoundaryBefore(s, src, limit)) { 1154 if (prevBoundary != prevSrc) { 1155 buffer.append(s, prevBoundary, prevSrc); 1156 } 1157 buffer.append(mapAlgorithmic(c, norm16), 0); 1158 prevBoundary = src; 1159 continue; 1160 } 1161 } else if (norm16 < minNoNoCompBoundaryBefore) { 1162 // The mapping is comp-normalized which also implies hasCompBoundaryBefore. 1163 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous) || 1164 hasCompBoundaryBefore(s, src, limit)) { 1165 if (prevBoundary != prevSrc) { 1166 buffer.append(s, prevBoundary, prevSrc); 1167 } 1168 int mapping = norm16 >> OFFSET_SHIFT; 1169 int length = extraData.charAt(mapping++) & MAPPING_LENGTH_MASK; 1170 buffer.append(extraData, mapping, mapping + length); 1171 prevBoundary = src; 1172 continue; 1173 } 1174 } else if (norm16 >= minNoNoEmpty) { 1175 // The current character maps to nothing. 1176 // Simply omit it from the output if there is a boundary before _or_ after it. 1177 // The character itself implies no boundaries. 
1178 if (hasCompBoundaryBefore(s, src, limit) || 1179 hasCompBoundaryAfter(s, prevBoundary, prevSrc, onlyContiguous)) { 1180 if (prevBoundary != prevSrc) { 1181 buffer.append(s, prevBoundary, prevSrc); 1182 } 1183 prevBoundary = src; 1184 continue; 1185 } 1186 } 1187 // Other "noNo" type, or need to examine more text around this character: 1188 // Fall through to the slow path. 1189 } else if (isJamoVT(norm16) && prevBoundary != prevSrc) { 1190 char prev=s.charAt(prevSrc-1); 1191 if(c<Hangul.JAMO_T_BASE) { 1192 // The current character is a Jamo Vowel, 1193 // compose with previous Jamo L and following Jamo T. 1194 char l = (char)(prev-Hangul.JAMO_L_BASE); 1195 if(l<Hangul.JAMO_L_COUNT) { 1196 if (!doCompose) { 1197 return false; 1198 } 1199 int t; 1200 if (src != limit && 1201 0 < (t = (s.charAt(src) - Hangul.JAMO_T_BASE)) && 1202 t < Hangul.JAMO_T_COUNT) { 1203 // The next character is a Jamo T. 1204 ++src; 1205 } else if (hasCompBoundaryBefore(s, src, limit)) { 1206 // No Jamo T follows, not even via decomposition. 1207 t = 0; 1208 } else { 1209 t = -1; 1210 } 1211 if (t >= 0) { 1212 int syllable = Hangul.HANGUL_BASE + 1213 (l*Hangul.JAMO_V_COUNT + (c-Hangul.JAMO_V_BASE)) * 1214 Hangul.JAMO_T_COUNT + t; 1215 --prevSrc; // Replace the Jamo L as well. 1216 if (prevBoundary != prevSrc) { 1217 buffer.append(s, prevBoundary, prevSrc); 1218 } 1219 buffer.append((char)syllable); 1220 prevBoundary = src; 1221 continue; 1222 } 1223 // If we see L+V+x where x!=T then we drop to the slow path, 1224 // decompose and recompose. 1225 // This is to deal with NFKC finding normal L and V but a 1226 // compatibility variant of a T. 1227 // We need to either fully compose that combination here 1228 // (which would complicate the code and may not work with strange custom data) 1229 // or use the slow path. 1230 } 1231 } else if (Hangul.isHangulLV(prev)) { 1232 // The current character is a Jamo Trailing consonant, 1233 // compose with previous Hangul LV that does not contain a Jamo T. 1234 if (!doCompose) { 1235 return false; 1236 } 1237 int syllable = prev + c - Hangul.JAMO_T_BASE; 1238 --prevSrc; // Replace the Hangul LV as well. 1239 if (prevBoundary != prevSrc) { 1240 buffer.append(s, prevBoundary, prevSrc); 1241 } 1242 buffer.append((char)syllable); 1243 prevBoundary = src; 1244 continue; 1245 } 1246 // No matching context, or may need to decompose surrounding text first: 1247 // Fall through to the slow path. 1248 } else if (norm16 > JAMO_VT) { // norm16 >= MIN_YES_YES_WITH_CC 1249 // One or more combining marks that do not combine-back: 1250 // Check for canonical order, copy unchanged if ok and 1251 // if followed by a character with a boundary-before. 1252 int cc = getCCFromNormalYesOrMaybe(norm16); // cc!=0 1253 if (onlyContiguous /* FCC */ && getPreviousTrailCC(s, prevBoundary, prevSrc) > cc) { 1254 // Fails FCD test, need to decompose and contiguously recompose. 1255 if (!doCompose) { 1256 return false; 1257 } 1258 } else { 1259 // If !onlyContiguous (not FCC), then we ignore the tccc of 1260 // the previous character which passed the quick check "yes && ccc==0" test. 
1261 int n16; 1262 for (;;) { 1263 if (src == limit) { 1264 if (doCompose) { 1265 buffer.append(s, prevBoundary, limit); 1266 } 1267 return true; 1268 } 1269 int prevCC = cc; 1270 c = Character.codePointAt(s, src); 1271 n16 = normTrie.get(c); 1272 if (n16 >= MIN_YES_YES_WITH_CC) { 1273 cc = getCCFromNormalYesOrMaybe(n16); 1274 if (prevCC > cc) { 1275 if (!doCompose) { 1276 return false; 1277 } 1278 break; 1279 } 1280 } else { 1281 break; 1282 } 1283 src += Character.charCount(c); 1284 } 1285 // p is after the last in-order combining mark. 1286 // If there is a boundary here, then we continue with no change. 1287 if (norm16HasCompBoundaryBefore(n16)) { 1288 if (isCompYesAndZeroCC(n16)) { 1289 src += Character.charCount(c); 1290 } 1291 continue; 1292 } 1293 // Use the slow path. There is no boundary in [prevSrc, src[. 1294 } 1295 } 1296 1297 // Slow path: Find the nearest boundaries around the current character, 1298 // decompose and recompose. 1299 if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) { 1300 c = Character.codePointBefore(s, prevSrc); 1301 norm16 = normTrie.get(c); 1302 if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 1303 prevSrc -= Character.charCount(c); 1304 } 1305 } 1306 if (doCompose && prevBoundary != prevSrc) { 1307 buffer.append(s, prevBoundary, prevSrc); 1308 } 1309 int recomposeStartIndex=buffer.length(); 1310 // We know there is not a boundary here. 1311 decomposeShort(s, prevSrc, src, false /* !stopAtCompBoundary */, onlyContiguous, 1312 buffer); 1313 // Decompose until the next boundary. 1314 src = decomposeShort(s, src, limit, true /* stopAtCompBoundary */, onlyContiguous, 1315 buffer); 1316 recompose(buffer, recomposeStartIndex, onlyContiguous); 1317 if(!doCompose) { 1318 if(!buffer.equals(s, prevSrc, src)) { 1319 return false; 1320 } 1321 buffer.remove(); 1322 } 1323 prevBoundary=src; 1324 } 1325 } 1326 1327 /** 1328 * Very similar to compose(): Make the same changes in both places if relevant. 1329 * doSpan: spanQuickCheckYes (ignore bit 0 of the return value) 1330 * !doSpan: quickCheck 1331 * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and 1332 * bit 0: set if "maybe"; otherwise, if the span length<s.length() 1333 * then the quick check result is "no" 1334 */ composeQuickCheck(CharSequence s, int src, int limit, boolean onlyContiguous, boolean doSpan)1335 public int composeQuickCheck(CharSequence s, int src, int limit, 1336 boolean onlyContiguous, boolean doSpan) { 1337 int qcResult=0; 1338 int prevBoundary=src; 1339 int minNoMaybeCP=minCompNoMaybeCP; 1340 1341 for(;;) { 1342 // Fast path: Scan over a sequence of characters below the minimum "no or maybe" code point, 1343 // or with (compYes && ccc==0) properties. 1344 int prevSrc; 1345 int c = 0; 1346 int norm16 = 0; 1347 for (;;) { 1348 if(src==limit) { 1349 return (src<<1)|qcResult; // "yes" or "maybe" 1350 } 1351 if( (c=s.charAt(src))<minNoMaybeCP || 1352 isCompYesAndZeroCC(norm16=normTrie.bmpGet(c)) 1353 ) { 1354 ++src; 1355 } else { 1356 prevSrc = src++; 1357 if (!UTF16Plus.isLeadSurrogate(c)) { 1358 break; 1359 } else { 1360 char c2; 1361 if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) { 1362 ++src; 1363 c = Character.toCodePoint((char)c, c2); 1364 norm16 = normTrie.suppGet(c); 1365 if (!isCompYesAndZeroCC(norm16)) { 1366 break; 1367 } 1368 } 1369 } 1370 } 1371 } 1372 // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. 
1373 // The current character is either a "noNo" (has a mapping) 1374 // or a "maybeYes" (combines backward) 1375 // or a "yesYes" with ccc!=0. 1376 // It is not a Hangul syllable or Jamo L because those have "yes" properties. 1377 1378 int prevNorm16 = INERT; 1379 if (prevBoundary != prevSrc) { 1380 prevBoundary = prevSrc; 1381 if (!norm16HasCompBoundaryBefore(norm16)) { 1382 c = Character.codePointBefore(s, prevSrc); 1383 int n16 = getNorm16(c); 1384 if (!norm16HasCompBoundaryAfter(n16, onlyContiguous)) { 1385 prevBoundary -= Character.charCount(c); 1386 prevNorm16 = n16; 1387 } 1388 } 1389 } 1390 1391 if(isMaybeOrNonZeroCC(norm16)) { 1392 int cc=getCCFromYesOrMaybe(norm16); 1393 if (onlyContiguous /* FCC */ && cc != 0 && 1394 getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) { 1395 // The [prevBoundary..prevSrc[ character 1396 // passed the quick check "yes && ccc==0" test 1397 // but is out of canonical order with the current combining mark. 1398 } else { 1399 // If !onlyContiguous (not FCC), then we ignore the tccc of 1400 // the previous character which passed the quick check "yes && ccc==0" test. 1401 for (;;) { 1402 if (norm16 < MIN_YES_YES_WITH_CC) { 1403 if (!doSpan) { 1404 qcResult = 1; 1405 } else { 1406 return prevBoundary << 1; // spanYes does not care to know it's "maybe" 1407 } 1408 } 1409 if (src == limit) { 1410 return (src<<1) | qcResult; // "yes" or "maybe" 1411 } 1412 int prevCC = cc; 1413 c = Character.codePointAt(s, src); 1414 norm16 = getNorm16(c); 1415 if (isMaybeOrNonZeroCC(norm16)) { 1416 cc = getCCFromYesOrMaybe(norm16); 1417 if (!(prevCC <= cc || cc == 0)) { 1418 break; 1419 } 1420 } else { 1421 break; 1422 } 1423 src += Character.charCount(c); 1424 } 1425 // src is after the last in-order combining mark. 1426 if (isCompYesAndZeroCC(norm16)) { 1427 prevBoundary = src; 1428 src += Character.charCount(c); 1429 continue; 1430 } 1431 } 1432 } 1433 return prevBoundary<<1; // "no" 1434 } 1435 } composeAndAppend(CharSequence s, boolean doCompose, boolean onlyContiguous, ReorderingBuffer buffer)1436 public void composeAndAppend(CharSequence s, 1437 boolean doCompose, 1438 boolean onlyContiguous, 1439 ReorderingBuffer buffer) { 1440 int src=0, limit=s.length(); 1441 if(!buffer.isEmpty()) { 1442 int firstStarterInSrc=findNextCompBoundary(s, 0, limit, onlyContiguous); 1443 if(0!=firstStarterInSrc) { 1444 int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(), 1445 buffer.length(), onlyContiguous); 1446 StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+ 1447 firstStarterInSrc+16); 1448 middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length()); 1449 buffer.removeSuffix(buffer.length()-lastStarterInDest); 1450 middle.append(s, 0, firstStarterInSrc); 1451 compose(middle, 0, middle.length(), onlyContiguous, true, buffer); 1452 src=firstStarterInSrc; 1453 } 1454 } 1455 if(doCompose) { 1456 compose(s, src, limit, onlyContiguous, true, buffer); 1457 } else { 1458 buffer.append(s, src, limit); 1459 } 1460 } 1461 // Dual functionality: 1462 // buffer!=NULL: normalize 1463 // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer)1464 public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) { 1465 // Note: In this function we use buffer->appendZeroCC() because we track 1466 // the lead and trail combining classes here, rather than leaving it to 1467 // the ReorderingBuffer. 
1468 // The exception is the call to decomposeShort() which uses the buffer 1469 // in the normal way. 1470 1471 // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1. 1472 // Similar to the prevBoundary in the compose() implementation. 1473 int prevBoundary=src; 1474 int prevSrc; 1475 int c=0; 1476 int prevFCD16=0; 1477 int fcd16=0; 1478 1479 for(;;) { 1480 // count code units with lccc==0 1481 for(prevSrc=src; src!=limit;) { 1482 if((c=s.charAt(src))<minLcccCP) { 1483 prevFCD16=~c; 1484 ++src; 1485 } else if(!singleLeadMightHaveNonZeroFCD16(c)) { 1486 prevFCD16=0; 1487 ++src; 1488 } else { 1489 if (UTF16Plus.isLeadSurrogate(c)) { 1490 char c2; 1491 if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) { 1492 c = Character.toCodePoint((char)c, c2); 1493 } 1494 } 1495 if((fcd16=getFCD16FromNormData(c))<=0xff) { 1496 prevFCD16=fcd16; 1497 src+=Character.charCount(c); 1498 } else { 1499 break; 1500 } 1501 } 1502 } 1503 // copy these code units all at once 1504 if(src!=prevSrc) { 1505 if(src==limit) { 1506 if(buffer!=null) { 1507 buffer.flushAndAppendZeroCC(s, prevSrc, src); 1508 } 1509 break; 1510 } 1511 prevBoundary=src; 1512 // We know that the previous character's lccc==0. 1513 if(prevFCD16<0) { 1514 // Fetching the fcd16 value was deferred for this below-minLcccCP code point. 1515 int prev=~prevFCD16; 1516 if(prev<minDecompNoCP) { 1517 prevFCD16=0; 1518 } else { 1519 prevFCD16=getFCD16FromNormData(prev); 1520 if(prevFCD16>1) { 1521 --prevBoundary; 1522 } 1523 } 1524 } else { 1525 int p=src-1; 1526 if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p && 1527 Character.isHighSurrogate(s.charAt(p-1)) 1528 ) { 1529 --p; 1530 // Need to fetch the previous character's FCD value because 1531 // prevFCD16 was just for the trail surrogate code point. 1532 prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1))); 1533 // Still known to have lccc==0 because its lead surrogate unit had lccc==0. 1534 } 1535 if(prevFCD16>1) { 1536 prevBoundary=p; 1537 } 1538 } 1539 if(buffer!=null) { 1540 // The last lccc==0 character is excluded from the 1541 // flush-and-append call in case it needs to be modified. 1542 buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary); 1543 buffer.append(s, prevBoundary, src); 1544 } 1545 // The start of the current character (c). 1546 prevSrc=src; 1547 } else if(src==limit) { 1548 break; 1549 } 1550 1551 src+=Character.charCount(c); 1552 // The current character (c) at [prevSrc..src[ has a non-zero lead combining class. 1553 // Check for proper order, and decompose locally if necessary. 1554 if((prevFCD16&0xff)<=(fcd16>>8)) { 1555 // proper order: prev tccc <= current lccc 1556 if((fcd16&0xff)<=1) { 1557 prevBoundary=src; 1558 } 1559 if(buffer!=null) { 1560 buffer.appendZeroCC(c); 1561 } 1562 prevFCD16=fcd16; 1563 continue; 1564 } else if(buffer==null) { 1565 return prevBoundary; // quick check "no" 1566 } else { 1567 /* 1568 * Back out the part of the source that we copied or appended 1569 * already but is now going to be decomposed. 1570 * prevSrc is set to after what was copied/appended. 1571 */ 1572 buffer.removeSuffix(prevSrc-prevBoundary); 1573 /* 1574 * Find the part of the source that needs to be decomposed, 1575 * up to the next safe boundary. 1576 */ 1577 src=findNextFCDBoundary(s, src, limit); 1578 /* 1579 * The source text does not fulfill the conditions for FCD. 1580 * Decompose and reorder a limited piece of the text. 
1581 */ 1582 decomposeShort(s, prevBoundary, src, false, false, buffer); 1583 prevBoundary=src; 1584 prevFCD16=0; 1585 } 1586 } 1587 return src; 1588 } makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer)1589 public void makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer) { 1590 int src=0, limit=s.length(); 1591 if(!buffer.isEmpty()) { 1592 int firstBoundaryInSrc=findNextFCDBoundary(s, 0, limit); 1593 if(0!=firstBoundaryInSrc) { 1594 int lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStringBuilder(), 1595 buffer.length()); 1596 StringBuilder middle=new StringBuilder((buffer.length()-lastBoundaryInDest)+ 1597 firstBoundaryInSrc+16); 1598 middle.append(buffer.getStringBuilder(), lastBoundaryInDest, buffer.length()); 1599 buffer.removeSuffix(buffer.length()-lastBoundaryInDest); 1600 middle.append(s, 0, firstBoundaryInSrc); 1601 makeFCD(middle, 0, middle.length(), buffer); 1602 src=firstBoundaryInSrc; 1603 } 1604 } 1605 if(doMakeFCD) { 1606 makeFCD(s, src, limit, buffer); 1607 } else { 1608 buffer.append(s, src, limit); 1609 } 1610 } 1611 hasDecompBoundaryBefore(int c)1612 public boolean hasDecompBoundaryBefore(int c) { 1613 return c < minLcccCP || (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) || 1614 norm16HasDecompBoundaryBefore(getNorm16(c)); 1615 } norm16HasDecompBoundaryBefore(int norm16)1616 public boolean norm16HasDecompBoundaryBefore(int norm16) { 1617 if (norm16 < minNoNoCompNoMaybeCC) { 1618 return true; 1619 } 1620 if (norm16 >= limitNoNo) { 1621 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; 1622 } 1623 // c decomposes, get everything from the variable-length extra data 1624 int mapping=norm16>>OFFSET_SHIFT; 1625 int firstUnit=extraData.charAt(mapping); 1626 // true if leadCC==0 (hasFCDBoundaryBefore()) 1627 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; 1628 } hasDecompBoundaryAfter(int c)1629 public boolean hasDecompBoundaryAfter(int c) { 1630 if (c < minDecompNoCP) { 1631 return true; 1632 } 1633 if (c <= 0xffff && !singleLeadMightHaveNonZeroFCD16(c)) { 1634 return true; 1635 } 1636 return norm16HasDecompBoundaryAfter(getNorm16(c)); 1637 } norm16HasDecompBoundaryAfter(int norm16)1638 public boolean norm16HasDecompBoundaryAfter(int norm16) { 1639 if(norm16 <= minYesNo || isHangulLVT(norm16)) { 1640 return true; 1641 } 1642 if (norm16 >= limitNoNo) { 1643 if (isMaybeOrNonZeroCC(norm16)) { 1644 return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; 1645 } 1646 // Maps to an isCompYesAndZeroCC. 
1647 return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1; 1648 } 1649 // c decomposes, get everything from the variable-length extra data 1650 int mapping=norm16>>OFFSET_SHIFT; 1651 int firstUnit=extraData.charAt(mapping); 1652 // decomp after-boundary: same as hasFCDBoundaryAfter(), 1653 // fcd16<=1 || trailCC==0 1654 if(firstUnit>0x1ff) { 1655 return false; // trailCC>1 1656 } 1657 if(firstUnit<=0xff) { 1658 return true; // trailCC==0 1659 } 1660 // if(trailCC==1) test leadCC==0, same as checking for before-boundary 1661 // true if leadCC==0 (hasFCDBoundaryBefore()) 1662 return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(mapping-1)&0xff00)==0; 1663 } isDecompInert(int c)1664 public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); } 1665 hasCompBoundaryBefore(int c)1666 public boolean hasCompBoundaryBefore(int c) { 1667 return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c)); 1668 } hasCompBoundaryAfter(int c, boolean onlyContiguous)1669 public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous) { 1670 return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous); 1671 } isCompInert(int c, boolean onlyContiguous)1672 public boolean isCompInert(int c, boolean onlyContiguous) { 1673 int norm16=getNorm16(c); 1674 return isCompYesAndZeroCC(norm16) && 1675 (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 && 1676 (!onlyContiguous || isInert(norm16) || extraData.charAt(norm16>>OFFSET_SHIFT) <= 0x1ff); 1677 } 1678 hasFCDBoundaryBefore(int c)1679 public boolean hasFCDBoundaryBefore(int c) { return hasDecompBoundaryBefore(c); } hasFCDBoundaryAfter(int c)1680 public boolean hasFCDBoundaryAfter(int c) { return hasDecompBoundaryAfter(c); } isFCDInert(int c)1681 public boolean isFCDInert(int c) { return getFCD16(c)<=1; } 1682 isMaybe(int norm16)1683 private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; } isMaybeOrNonZeroCC(int norm16)1684 private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; } isInert(int norm16)1685 private static boolean isInert(int norm16) { return norm16==INERT; } isJamoL(int norm16)1686 private static boolean isJamoL(int norm16) { return norm16==JAMO_L; } isJamoVT(int norm16)1687 private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; } hangulLVT()1688 private int hangulLVT() { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; } isHangulLV(int norm16)1689 private boolean isHangulLV(int norm16) { return norm16==minYesNo; } isHangulLVT(int norm16)1690 private boolean isHangulLVT(int norm16) { 1691 return norm16==hangulLVT(); 1692 } isCompYesAndZeroCC(int norm16)1693 private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; } 1694 // UBool isCompYes(uint16_t norm16) const { 1695 // return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo; 1696 // } 1697 // UBool isCompYesOrMaybe(uint16_t norm16) const { 1698 // return norm16<minNoNo || minMaybeYes<=norm16; 1699 // } 1700 // private boolean hasZeroCCFromDecompYes(int norm16) { 1701 // return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 1702 // } isDecompYesAndZeroCC(int norm16)1703 private boolean isDecompYesAndZeroCC(int norm16) { 1704 return norm16<minYesNo || 1705 norm16==JAMO_VT || 1706 (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES); 1707 } 1708 /** 1709 * A little faster and simpler than isDecompYesAndZeroCC() but does not include 1710 * the MaybeYes which combine-forward and have ccc=0. 1711 * (Standard Unicode 10 normalization does not have such characters.) 
1712 */ isMostDecompYesAndZeroCC(int norm16)1713 private boolean isMostDecompYesAndZeroCC(int norm16) { 1714 return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT; 1715 } isDecompNoAlgorithmic(int norm16)1716 private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; } 1717 1718 // For use with isCompYes(). 1719 // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. 1720 // static uint8_t getCCFromYes(uint16_t norm16) { 1721 // return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0; 1722 // } getCCFromNoNo(int norm16)1723 private int getCCFromNoNo(int norm16) { 1724 int mapping=norm16>>OFFSET_SHIFT; 1725 if((extraData.charAt(mapping)&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1726 return extraData.charAt(mapping-1)&0xff; 1727 } else { 1728 return 0; 1729 } 1730 } getTrailCCFromCompYesAndZeroCC(int norm16)1731 int getTrailCCFromCompYesAndZeroCC(int norm16) { 1732 if(norm16<=minYesNo) { 1733 return 0; // yesYes and Hangul LV have ccc=tccc=0 1734 } else { 1735 // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here. 1736 return extraData.charAt(norm16>>OFFSET_SHIFT)>>8; // tccc from yesNo 1737 } 1738 } 1739 1740 // Requires algorithmic-NoNo. mapAlgorithmic(int c, int norm16)1741 private int mapAlgorithmic(int c, int norm16) { 1742 return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta; 1743 } 1744 1745 // Requires minYesNo<norm16<limitNoNo. 1746 // private int getMapping(int norm16) { return extraData+(norm16>>OFFSET_SHIFT); } 1747 1748 /** 1749 * @return index into maybeYesCompositions, or -1 1750 */ getCompositionsListForDecompYes(int norm16)1751 private int getCompositionsListForDecompYes(int norm16) { 1752 if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) { 1753 return -1; 1754 } else { 1755 if((norm16-=minMaybeYes)<0) { 1756 // norm16<minMaybeYes: index into extraData which is a substring at 1757 // maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes] 1758 // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16 1759 norm16+=MIN_NORMAL_MAYBE_YES; // for yesYes; if Jamo L: harmless empty list 1760 } 1761 return norm16>>OFFSET_SHIFT; 1762 } 1763 } 1764 /** 1765 * @return index into maybeYesCompositions 1766 */ getCompositionsListForComposite(int norm16)1767 private int getCompositionsListForComposite(int norm16) { 1768 // A composite has both mapping & compositions list. 1769 int list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT; 1770 int firstUnit=maybeYesCompositions.charAt(list); 1771 return list+ // mapping in maybeYesCompositions 1772 1+ // +1 to skip the first unit with the mapping length 1773 (firstUnit&MAPPING_LENGTH_MASK); // + mapping length 1774 } getCompositionsListForMaybe(int norm16)1775 private int getCompositionsListForMaybe(int norm16) { 1776 // minMaybeYes<=norm16<MIN_NORMAL_MAYBE_YES 1777 return (norm16-minMaybeYes)>>OFFSET_SHIFT; 1778 } 1779 /** 1780 * @param c code point must have compositions 1781 * @return index into maybeYesCompositions 1782 */ getCompositionsList(int norm16)1783 private int getCompositionsList(int norm16) { 1784 return isDecompYes(norm16) ? 1785 getCompositionsListForDecompYes(norm16) : 1786 getCompositionsListForComposite(norm16); 1787 } 1788 1789 // Decompose a short piece of text which is likely to contain characters that 1790 // fail the quick check loop and/or where the quick check loop's overhead 1791 // is unlikely to be amortized. 1792 // Called by the compose() and makeFCD() implementations. 1793 // Public in Java for collation implementation code. 
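    // For example, the makeFCD() loop above finishes a segment that failed the FCD test with
    //     src=findNextFCDBoundary(s, src, limit);
    //     decomposeShort(s, prevBoundary, src, false, false, buffer);
    // that is, it decomposes from the last known boundary up to the next one and lets the
    // ReorderingBuffer restore canonical order.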
decomposeShort( CharSequence s, int src, int limit, boolean stopAtCompBoundary, boolean onlyContiguous, ReorderingBuffer buffer)1794 private int decomposeShort( 1795 CharSequence s, int src, int limit, 1796 boolean stopAtCompBoundary, boolean onlyContiguous, 1797 ReorderingBuffer buffer) { 1798 while(src<limit) { 1799 int c=Character.codePointAt(s, src); 1800 if (stopAtCompBoundary && c < minCompNoMaybeCP) { 1801 return src; 1802 } 1803 int norm16 = getNorm16(c); 1804 if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) { 1805 return src; 1806 } 1807 src+=Character.charCount(c); 1808 decompose(c, norm16, buffer); 1809 if (stopAtCompBoundary && norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 1810 return src; 1811 } 1812 } 1813 return src; 1814 } decompose(int c, int norm16, ReorderingBuffer buffer)1815 private void decompose(int c, int norm16, ReorderingBuffer buffer) { 1816 // get the decomposition and the lead and trail cc's 1817 if (norm16 >= limitNoNo) { 1818 if (isMaybeOrNonZeroCC(norm16)) { 1819 buffer.append(c, getCCFromYesOrMaybe(norm16)); 1820 return; 1821 } 1822 // Maps to an isCompYesAndZeroCC. 1823 c=mapAlgorithmic(c, norm16); 1824 norm16 = getRawNorm16(c); 1825 } 1826 if (norm16 < minYesNo) { 1827 // c does not decompose 1828 buffer.append(c, 0); 1829 } else if(isHangulLV(norm16) || isHangulLVT(norm16)) { 1830 // Hangul syllable: decompose algorithmically 1831 Hangul.decompose(c, buffer); 1832 } else { 1833 // c decomposes, get everything from the variable-length extra data 1834 int mapping=norm16>>OFFSET_SHIFT; 1835 int firstUnit=extraData.charAt(mapping); 1836 int length=firstUnit&MAPPING_LENGTH_MASK; 1837 int leadCC, trailCC; 1838 trailCC=firstUnit>>8; 1839 if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { 1840 leadCC=extraData.charAt(mapping-1)>>8; 1841 } else { 1842 leadCC=0; 1843 } 1844 ++mapping; // skip over the firstUnit 1845 buffer.append(extraData, mapping, mapping+length, true, leadCC, trailCC); 1846 } 1847 } 1848 1849 /** 1850 * Finds the recomposition result for 1851 * a forward-combining "lead" character, 1852 * specified with a pointer to its compositions list, 1853 * and a backward-combining "trail" character. 1854 * 1855 * <p>If the lead and trail characters combine, then this function returns 1856 * the following "compositeAndFwd" value: 1857 * <pre> 1858 * Bits 21..1 composite character 1859 * Bit 0 set if the composite is a forward-combining starter 1860 * </pre> 1861 * otherwise it returns -1. 1862 * 1863 * <p>The compositions list has (trail, compositeAndFwd) pair entries, 1864 * encoded as either pairs or triples of 16-bit units. 1865 * The last entry has the high bit of its first unit set. 1866 * 1867 * <p>The list is sorted by ascending trail characters (there are no duplicates). 1868 * A linear search is used. 1869 * 1870 * <p>See normalizer2impl.h for a more detailed description 1871 * of the compositions list format. 
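     *
     * <p>Schematic entry layout, as read by the lookup code below (values are illustrative,
     * not actual data): for a trail character below COMP_1_TRAIL_LIMIT a pair entry is
     * <pre>
     *   unit0: (trail<<1) [| COMP_1_LAST_TUPLE if this is the last entry]
     *   unit1: compositeAndFwd
     * </pre>
     * and a triple entry additionally sets COMP_1_TRIPLE in unit0 and splits
     * compositeAndFwd across unit1 (high 16 bits) and unit2 (low 16 bits).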
1872 */ combine(String compositions, int list, int trail)1873 private static int combine(String compositions, int list, int trail) { 1874 int key1, firstUnit; 1875 if(trail<COMP_1_TRAIL_LIMIT) { 1876 // trail character is 0..33FF 1877 // result entry may have 2 or 3 units 1878 key1=(trail<<1); 1879 while(key1>(firstUnit=compositions.charAt(list))) { 1880 list+=2+(firstUnit&COMP_1_TRIPLE); 1881 } 1882 if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1883 if((firstUnit&COMP_1_TRIPLE)!=0) { 1884 return (compositions.charAt(list+1)<<16)|compositions.charAt(list+2); 1885 } else { 1886 return compositions.charAt(list+1); 1887 } 1888 } 1889 } else { 1890 // trail character is 3400..10FFFF 1891 // result entry has 3 units 1892 key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE); 1893 int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff; 1894 int secondUnit; 1895 for(;;) { 1896 if(key1>(firstUnit=compositions.charAt(list))) { 1897 list+=2+(firstUnit&COMP_1_TRIPLE); 1898 } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { 1899 if(key2>(secondUnit=compositions.charAt(list+1))) { 1900 if((firstUnit&COMP_1_LAST_TUPLE)!=0) { 1901 break; 1902 } else { 1903 list+=3; 1904 } 1905 } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { 1906 return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2); 1907 } else { 1908 break; 1909 } 1910 } else { 1911 break; 1912 } 1913 } 1914 } 1915 return -1; 1916 } 1917 /** 1918 * @param list some character's compositions list 1919 * @param set recursively receives the composites from these compositions 1920 */ addComposites(int list, UnicodeSet set)1921 private void addComposites(int list, UnicodeSet set) { 1922 int firstUnit, compositeAndFwd; 1923 do { 1924 firstUnit=maybeYesCompositions.charAt(list); 1925 if((firstUnit&COMP_1_TRIPLE)==0) { 1926 compositeAndFwd=maybeYesCompositions.charAt(list+1); 1927 list+=2; 1928 } else { 1929 compositeAndFwd=((maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)| 1930 maybeYesCompositions.charAt(list+2); 1931 list+=3; 1932 } 1933 int composite=compositeAndFwd>>1; 1934 if((compositeAndFwd&1)!=0) { 1935 addComposites(getCompositionsListForComposite(getRawNorm16(composite)), set); 1936 } 1937 set.add(composite); 1938 } while((firstUnit&COMP_1_LAST_TUPLE)==0); 1939 } 1940 /* 1941 * Recomposes the buffer text starting at recomposeStartIndex 1942 * (which is in NFD - decomposed and canonically ordered), 1943 * and truncates the buffer contents. 1944 * 1945 * Note that recomposition never lengthens the text: 1946 * Any character consists of either one or two code units; 1947 * a composition may contain at most one more code unit than the original starter, 1948 * while the combining mark that is removed has at least one code unit. 1949 */ recompose(ReorderingBuffer buffer, int recomposeStartIndex, boolean onlyContiguous)1950 private void recompose(ReorderingBuffer buffer, int recomposeStartIndex, 1951 boolean onlyContiguous) { 1952 StringBuilder sb=buffer.getStringBuilder(); 1953 int p=recomposeStartIndex; 1954 if(p==sb.length()) { 1955 return; 1956 } 1957 1958 int starter, pRemove; 1959 int compositionsList; 1960 int c, compositeAndFwd; 1961 int norm16; 1962 int cc, prevCC; 1963 boolean starterIsSupplementary; 1964 1965 // Some of the following variables are not used until we have a forward-combining starter 1966 // and are only initialized now to avoid compiler warnings. 
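        // Worked example of what the loop below produces (following the Hangul arithmetic
        // and combine() lookups in the code): NFD <U+1100, U+1161, U+11A8> becomes the
        // single syllable U+AC01, and <U+0041, U+0300> becomes U+00C0.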
1967        compositionsList=-1; // used as indicator for whether we have a forward-combining starter
1968        starter=-1;
1969        starterIsSupplementary=false;
1970        prevCC=0;
1971
1972        for(;;) {
1973            c=sb.codePointAt(p);
1974            p+=Character.charCount(c);
1975            norm16=getNorm16(c);
1976            cc=getCCFromYesOrMaybe(norm16);
1977            if( // this character combines backward and
1978                isMaybe(norm16) &&
1979                // we have seen a starter that combines forward and
1980                compositionsList>=0 &&
1981                // the backward-combining character is not blocked
1982                (prevCC<cc || prevCC==0)
1983            ) {
1984                if(isJamoVT(norm16)) {
1985                    // c is a Jamo V/T, see if we can compose it with the previous character.
1986                    if(c<Hangul.JAMO_T_BASE) {
1987                        // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
1988                        char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE);
1989                        if(prev<Hangul.JAMO_L_COUNT) {
1990                            pRemove=p-1;
1991                            char syllable=(char)
1992                                (Hangul.HANGUL_BASE+
1993                                 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
1994                                 Hangul.JAMO_T_COUNT);
1995                            char t;
1996                            if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
1997                                ++p;
1998                                syllable+=t; // The next character was a Jamo T.
1999                            }
2000                            sb.setCharAt(starter, syllable);
2001                            // remove the Jamo V/T
2002                            sb.delete(pRemove, p);
2003                            p=pRemove;
2004                        }
2005                    }
2006                    /*
2007                     * No "else" for Jamo T:
2008                     * Since the input is in NFD, there are no Hangul LV syllables that
2009                     * a Jamo T could combine with.
2010                     * All Jamo Ts are combined above when handling Jamo Vs.
2011                     */
2012                    if(p==sb.length()) {
2013                        break;
2014                    }
2015                    compositionsList=-1;
2016                    continue;
2017                } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) {
2018                    // The starter and the combining mark (c) do combine.
2019                    int composite=compositeAndFwd>>1;
2020
2021                    // Remove the combining mark.
2022                    pRemove=p-Character.charCount(c); // pRemove & p: start & limit of the combining mark
2023                    sb.delete(pRemove, p);
2024                    p=pRemove;
2025                    // Replace the starter with the composite.
2026                    if(starterIsSupplementary) {
2027                        if(composite>0xffff) {
2028                            // both are supplementary
2029                            sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
2030                            sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite));
2031                        } else {
2032                            sb.setCharAt(starter, (char)composite);
2033                            sb.deleteCharAt(starter+1);
2034                            // The composite is shorter than the starter,
2035                            // move the intermediate characters forward one.
2036                            starterIsSupplementary=false;
2037                            --p;
2038                        }
2039                    } else if(composite>0xffff) {
2040                        // The composite is longer than the starter,
2041                        // move the intermediate characters back one.
2042                        starterIsSupplementary=true;
2043                        sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
2044                        sb.insert(starter+1, UTF16.getTrailSurrogate(composite));
2045                        ++p;
2046                    } else {
2047                        // both are on the BMP
2048                        sb.setCharAt(starter, (char)composite);
2049                    }
2050
2051                    // Keep prevCC because we removed the combining mark.
2052
2053                    if(p==sb.length()) {
2054                        break;
2055                    }
2056                    // Is the composite a starter that combines forward?
2057                    if((compositeAndFwd&1)!=0) {
2058                        compositionsList=
2059                            getCompositionsListForComposite(getRawNorm16(composite));
2060                    } else {
2061                        compositionsList=-1;
2062                    }
2063
2064                    // We combined; continue with looking for compositions.
2065                    continue;
2066                }
2067            }
2068
2069            // no combination this time
2070            prevCC=cc;
2071            if(p==sb.length()) {
2072                break;
2073            }
2074
2075            // If c did not combine, then check if it is a starter.
2076            if(cc==0) {
2077                // Found a new starter.
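                // Remember its index, and whether it occupies one or two chars, so that a
                // later composite can overwrite it in place.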
2078 if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) { 2079 // It may combine with something, prepare for it. 2080 if(c<=0xffff) { 2081 starterIsSupplementary=false; 2082 starter=p-1; 2083 } else { 2084 starterIsSupplementary=true; 2085 starter=p-2; 2086 } 2087 } 2088 } else if(onlyContiguous) { 2089 // FCC: no discontiguous compositions; any intervening character blocks. 2090 compositionsList=-1; 2091 } 2092 } 2093 buffer.flush(); 2094 } 2095 composePair(int a, int b)2096 public int composePair(int a, int b) { 2097 int norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16 2098 int list; 2099 if(isInert(norm16)) { 2100 return -1; 2101 } else if(norm16<minYesNoMappingsOnly) { 2102 // a combines forward. 2103 if(isJamoL(norm16)) { 2104 b-=Hangul.JAMO_V_BASE; 2105 if(0<=b && b<Hangul.JAMO_V_COUNT) { 2106 return 2107 (Hangul.HANGUL_BASE+ 2108 ((a-Hangul.JAMO_L_BASE)*Hangul.JAMO_V_COUNT+b)* 2109 Hangul.JAMO_T_COUNT); 2110 } else { 2111 return -1; 2112 } 2113 } else if(isHangulLV(norm16)) { 2114 b-=Hangul.JAMO_T_BASE; 2115 if(0<b && b<Hangul.JAMO_T_COUNT) { // not b==0! 2116 return a+b; 2117 } else { 2118 return -1; 2119 } 2120 } else { 2121 // 'a' has a compositions list in extraData 2122 list=((MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16)>>OFFSET_SHIFT; 2123 if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list 2124 list+= // mapping pointer 2125 1+ // +1 to skip the first unit with the mapping length 2126 (maybeYesCompositions.charAt(list)&MAPPING_LENGTH_MASK); // + mapping length 2127 } 2128 } 2129 } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) { 2130 return -1; 2131 } else { 2132 list=getCompositionsListForMaybe(norm16); // offset into maybeYesCompositions 2133 } 2134 if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b 2135 return -1; 2136 } 2137 return combine(maybeYesCompositions, list, b)>>1; 2138 } 2139 2140 /** 2141 * Does c have a composition boundary before it? 2142 * True if its decomposition begins with a character that has 2143 * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()). 2144 * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes 2145 * (isCompYesAndZeroCC()) so we need not decompose. 2146 */ hasCompBoundaryBefore(int c, int norm16)2147 private boolean hasCompBoundaryBefore(int c, int norm16) { 2148 return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16); 2149 } norm16HasCompBoundaryBefore(int norm16)2150 private boolean norm16HasCompBoundaryBefore(int norm16) { 2151 return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16); 2152 } hasCompBoundaryBefore(CharSequence s, int src, int limit)2153 private boolean hasCompBoundaryBefore(CharSequence s, int src, int limit) { 2154 return src == limit || hasCompBoundaryBefore(Character.codePointAt(s, src)); 2155 } norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous)2156 private boolean norm16HasCompBoundaryAfter(int norm16, boolean onlyContiguous) { 2157 return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 && 2158 (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16)); 2159 } hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous)2160 private boolean hasCompBoundaryAfter(CharSequence s, int start, int p, boolean onlyContiguous) { 2161 return start == p || hasCompBoundaryAfter(Character.codePointBefore(s, p), onlyContiguous); 2162 } 2163 /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? 
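     * For an algorithmic-NoNo value the DELTA_TCCC bits answer this directly; otherwise the
     * mapping's first extraData unit is compared against 0x1ff (tccc in its high byte, at most 1).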
*/ isTrailCC01ForCompBoundaryAfter(int norm16)2164 private boolean isTrailCC01ForCompBoundaryAfter(int norm16) { 2165 return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ? 2166 (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : extraData.charAt(norm16 >> OFFSET_SHIFT) <= 0x1ff); 2167 } 2168 findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous)2169 private int findPreviousCompBoundary(CharSequence s, int p, boolean onlyContiguous) { 2170 while(p>0) { 2171 int c=Character.codePointBefore(s, p); 2172 int norm16 = getNorm16(c); 2173 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 2174 break; 2175 } 2176 p-=Character.charCount(c); 2177 if(hasCompBoundaryBefore(c, norm16)) { 2178 break; 2179 } 2180 } 2181 return p; 2182 } findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous)2183 private int findNextCompBoundary(CharSequence s, int p, int limit, boolean onlyContiguous) { 2184 while(p<limit) { 2185 int c=Character.codePointAt(s, p); 2186 int norm16=normTrie.get(c); 2187 if(hasCompBoundaryBefore(c, norm16)) { 2188 break; 2189 } 2190 p+=Character.charCount(c); 2191 if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) { 2192 break; 2193 } 2194 } 2195 return p; 2196 } 2197 findPreviousFCDBoundary(CharSequence s, int p)2198 private int findPreviousFCDBoundary(CharSequence s, int p) { 2199 while(p>0) { 2200 int c=Character.codePointBefore(s, p); 2201 int norm16; 2202 if (c < minDecompNoCP || norm16HasDecompBoundaryAfter(norm16 = getNorm16(c))) { 2203 break; 2204 } 2205 p-=Character.charCount(c); 2206 if (norm16HasDecompBoundaryBefore(norm16)) { 2207 break; 2208 } 2209 } 2210 return p; 2211 } findNextFCDBoundary(CharSequence s, int p, int limit)2212 private int findNextFCDBoundary(CharSequence s, int p, int limit) { 2213 while(p<limit) { 2214 int c=Character.codePointAt(s, p); 2215 int norm16; 2216 if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16 = getNorm16(c))) { 2217 break; 2218 } 2219 p+=Character.charCount(c); 2220 if (norm16HasDecompBoundaryAfter(norm16)) { 2221 break; 2222 } 2223 } 2224 return p; 2225 } 2226 getPreviousTrailCC(CharSequence s, int start, int p)2227 private int getPreviousTrailCC(CharSequence s, int start, int p) { 2228 if (start == p) { 2229 return 0; 2230 } 2231 return getFCD16(Character.codePointBefore(s, p)); 2232 } 2233 addToStartSet(MutableCodePointTrie mutableTrie, int origin, int decompLead)2234 private void addToStartSet(MutableCodePointTrie mutableTrie, int origin, int decompLead) { 2235 int canonValue = mutableTrie.get(decompLead); 2236 if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) { 2237 // origin is the first character whose decomposition starts with 2238 // the character for which we are setting the value. 2239 mutableTrie.set(decompLead, canonValue|origin); 2240 } else { 2241 // origin is not the first character, or it is U+0000. 2242 UnicodeSet set; 2243 if((canonValue&CANON_HAS_SET)==0) { 2244 int firstOrigin=canonValue&CANON_VALUE_MASK; 2245 canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|canonStartSets.size(); 2246 mutableTrie.set(decompLead, canonValue); 2247 canonStartSets.add(set=new UnicodeSet()); 2248 if(firstOrigin!=0) { 2249 set.add(firstOrigin); 2250 } 2251 } else { 2252 set=canonStartSets.get(canonValue&CANON_VALUE_MASK); 2253 } 2254 set.add(origin); 2255 } 2256 } 2257 2258 @SuppressWarnings("unused") 2259 private VersionInfo dataVersion; 2260 2261 // BMP code point thresholds for quick check loops looking at single UTF-16 code units. 
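    // Code points below each threshold cannot have the corresponding property
    // (a decomposition, an NFC no/maybe value, or a non-zero lead combining class),
    // so the single-code-unit fast paths skip the trie lookup for them.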
2262 private int minDecompNoCP; 2263 private int minCompNoMaybeCP; 2264 private int minLcccCP; 2265 2266 // Norm16 value thresholds for quick check combinations and types of extra data. 2267 private int minYesNo; 2268 private int minYesNoMappingsOnly; 2269 private int minNoNo; 2270 private int minNoNoCompBoundaryBefore; 2271 private int minNoNoCompNoMaybeCC; 2272 private int minNoNoEmpty; 2273 private int limitNoNo; 2274 private int centerNoNoDelta; 2275 private int minMaybeYes; 2276 2277 private CodePointTrie.Fast16 normTrie; 2278 private String maybeYesCompositions; 2279 private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters 2280 private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 2281 2282 private CodePointTrie canonIterData; 2283 private ArrayList<UnicodeSet> canonStartSets; 2284 2285 // bits in canonIterData 2286 private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000; 2287 private static final int CANON_HAS_COMPOSITIONS = 0x40000000; 2288 private static final int CANON_HAS_SET = 0x200000; 2289 private static final int CANON_VALUE_MASK = 0x1fffff; 2290 } 2291
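
// A minimal usage sketch, not part of the ICU library: the low-level machinery above is
// normally reached through the public com.ibm.icu.text.Normalizer2 API, which delegates to
// a shared Normalizer2Impl instance. The class name below is made up for this illustration.
final class Normalizer2ImplUsageSketch {
    private Normalizer2ImplUsageSketch() {}

    /** Composes to NFC; for example A followed by U+0300 (combining grave) becomes U+00C0. */
    static String toNFC(CharSequence s) {
        return com.ibm.icu.text.Normalizer2.getNFCInstance().normalize(s);
    }

    /** Decomposes to NFD; for example the Hangul syllable U+AC01 becomes its L, V, T Jamo. */
    static String toNFD(CharSequence s) {
        return com.ibm.icu.text.Normalizer2.getNFDInstance().normalize(s);
    }
}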