/*
 *******************************************************************************
 * Copyright (C) 2009-2014, International Business Machines
 * Corporation and others. All Rights Reserved.
 *******************************************************************************
 */

package com.ibm.icu.impl;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Iterator;

import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ICUUncheckedIOException;
import com.ibm.icu.util.VersionInfo;

public final class Normalizer2Impl {
    public static final class Hangul {
        /* Korean Hangul and Jamo constants */
        public static final int JAMO_L_BASE=0x1100;     /* "lead" jamo */
        public static final int JAMO_L_END=0x1112;
        public static final int JAMO_V_BASE=0x1161;     /* "vowel" jamo */
        public static final int JAMO_V_END=0x1175;
        public static final int JAMO_T_BASE=0x11a7;     /* "trail" jamo */
        public static final int JAMO_T_END=0x11c2;

        public static final int HANGUL_BASE=0xac00;
        public static final int HANGUL_END=0xd7a3;

        public static final int JAMO_L_COUNT=19;
        public static final int JAMO_V_COUNT=21;
        public static final int JAMO_T_COUNT=28;

        public static final int JAMO_L_LIMIT=JAMO_L_BASE+JAMO_L_COUNT;
        public static final int JAMO_V_LIMIT=JAMO_V_BASE+JAMO_V_COUNT;

        public static final int JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT;

        public static final int HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT;
        public static final int HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT;

        public static boolean isHangul(int c) {
            return HANGUL_BASE<=c && c<HANGUL_LIMIT;
        }
        public static boolean isHangulWithoutJamoT(char c) {
            c-=HANGUL_BASE;
            return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
        }
        public static boolean isJamoL(int c) {
            return JAMO_L_BASE<=c && c<JAMO_L_LIMIT;
        }
        public static boolean isJamoV(int c) {
            return JAMO_V_BASE<=c && c<JAMO_V_LIMIT;
        }

        /**
         * Decomposes c, which must be a Hangul syllable, into buffer
         * and returns the length of the decomposition (2 or 3).
         */
        public static int decompose(int c, Appendable buffer) {
            try {
                c-=HANGUL_BASE;
                int c2=c%JAMO_T_COUNT;
                c/=JAMO_T_COUNT;
                buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
                buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
                if(c2==0) {
                    return 2;
                } else {
                    buffer.append((char)(JAMO_T_BASE+c2));
                    return 3;
                }
            } catch(IOException e) {
                // Will not occur because we do not write to I/O.
                throw new ICUUncheckedIOException(e);
            }
        }

        /**
         * Decomposes c, which must be a Hangul syllable, into buffer.
         * This is the raw, not recursive, decomposition. Its length is always 2.
         */
        public static void getRawDecomposition(int c, Appendable buffer) {
            try {
                int orig=c;
                c-=HANGUL_BASE;
                int c2=c%JAMO_T_COUNT;
                if(c2==0) {
                    c/=JAMO_T_COUNT;
                    buffer.append((char)(JAMO_L_BASE+c/JAMO_V_COUNT));
                    buffer.append((char)(JAMO_V_BASE+c%JAMO_V_COUNT));
                } else {
                    buffer.append((char)(orig-c2));  // LV syllable
                    buffer.append((char)(JAMO_T_BASE+c2));
                }
            } catch(IOException e) {
                // Will not occur because we do not write to I/O.
                throw new ICUUncheckedIOException(e);
            }
        }
    }
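
    // Illustrative sketch, not part of the original source: a worked example of the
    // algorithmic Hangul arithmetic used by Hangul.decompose() above, for U+AC01:
    //   int c=0xac01-Hangul.HANGUL_BASE;                            // 1
    //   int c2=c%Hangul.JAMO_T_COUNT;                               // 1 -> trail jamo U+11A8
    //   c/=Hangul.JAMO_T_COUNT;                                     // 0
    //   char l=(char)(Hangul.JAMO_L_BASE+c/Hangul.JAMO_V_COUNT);    // U+1100
    //   char v=(char)(Hangul.JAMO_V_BASE+c%Hangul.JAMO_V_COUNT);    // U+1161
    // so U+AC01 decomposes to <U+1100, U+1161, U+11A8>, while getRawDecomposition()
    // would emit the LV syllable U+AC00 followed by U+11A8.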

    /**
     * Writable buffer that takes care of canonical ordering.
     * Its Appendable methods behave like the C++ implementation's
     * appendZeroCC() methods.
     * <p>
     * If dest is a StringBuilder, then the buffer writes directly to it.
     * Otherwise, the buffer maintains a StringBuilder for intermediate text segments
     * until no further changes are necessary and whole segments are appended.
     * append() methods that take combining-class values always write to the StringBuilder.
     * Other append() methods flush and append to the Appendable.
     */
    public static final class ReorderingBuffer implements Appendable {
        public ReorderingBuffer(Normalizer2Impl ni, Appendable dest, int destCapacity) {
            impl=ni;
            app=dest;
            if(app instanceof StringBuilder) {
                appIsStringBuilder=true;
                str=(StringBuilder)dest;
                // In Java, the constructor subsumes public void init(int destCapacity) {
                str.ensureCapacity(destCapacity);
                reorderStart=0;
                if(str.length()==0) {
                    lastCC=0;
                } else {
                    setIterator();
                    lastCC=previousCC();
                    // Set reorderStart after the last code point with cc<=1 if there is one.
                    if(lastCC>1) {
                        while(previousCC()>1) {}
                    }
                    reorderStart=codePointLimit;
                }
            } else {
                appIsStringBuilder=false;
                str=new StringBuilder();
                reorderStart=0;
                lastCC=0;
            }
        }

        public boolean isEmpty() { return str.length()==0; }
        public int length() { return str.length(); }
        public int getLastCC() { return lastCC; }

        public StringBuilder getStringBuilder() { return str; }

        public boolean equals(CharSequence s, int start, int limit) {
            return UTF16Plus.equal(str, 0, str.length(), s, start, limit);
        }

        // For Hangul composition, replacing the Leading consonant Jamo with the syllable.
        public void setLastChar(char c) {
            str.setCharAt(str.length()-1, c);
        }

        public void append(int c, int cc) {
            if(lastCC<=cc || cc==0) {
                str.appendCodePoint(c);
                lastCC=cc;
                if(cc<=1) {
                    reorderStart=str.length();
                }
            } else {
                insert(c, cc);
            }
        }
        // s must be in NFD, otherwise change the implementation.
        public void append(CharSequence s, int start, int limit,
                           int leadCC, int trailCC) {
            if(start==limit) {
                return;
            }
            if(lastCC<=leadCC || leadCC==0) {
                if(trailCC<=1) {
                    reorderStart=str.length()+(limit-start);
                } else if(leadCC<=1) {
                    reorderStart=str.length()+1;  // Ok if not a code point boundary.
                }
                str.append(s, start, limit);
                lastCC=trailCC;
            } else {
                int c=Character.codePointAt(s, start);
                start+=Character.charCount(c);
                insert(c, leadCC);  // insert first code point
                while(start<limit) {
                    c=Character.codePointAt(s, start);
                    start+=Character.charCount(c);
                    if(start<limit) {
                        // s must be in NFD, otherwise we need to use getCC().
                        leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));
                    } else {
                        leadCC=trailCC;
                    }
                    append(c, leadCC);
                }
            }
        }
        // The following append() methods work like C++ appendZeroCC().
        // They assume that the cc or trailCC of their input is 0.
        // Most of them implement Appendable interface methods.
        // @Override when we switch to Java 6
        public ReorderingBuffer append(char c) {
            str.append(c);
            lastCC=0;
            reorderStart=str.length();
            return this;
        }
        public void appendZeroCC(int c) {
            str.appendCodePoint(c);
            lastCC=0;
            reorderStart=str.length();
        }
        // @Override when we switch to Java 6
        public ReorderingBuffer append(CharSequence s) {
            if(s.length()!=0) {
                str.append(s);
                lastCC=0;
                reorderStart=str.length();
            }
            return this;
        }
        // @Override when we switch to Java 6
        public ReorderingBuffer append(CharSequence s, int start, int limit) {
            if(start!=limit) {
                str.append(s, start, limit);
                lastCC=0;
                reorderStart=str.length();
            }
            return this;
        }
        /**
         * Flushes from the intermediate StringBuilder to the Appendable,
         * if they are different objects.
         * Used after recomposition.
         * Must be called at the end when writing to a non-StringBuilder Appendable.
         */
        public void flush() {
            if(appIsStringBuilder) {
                reorderStart=str.length();
            } else {
                try {
                    app.append(str);
                    str.setLength(0);
                    reorderStart=0;
                } catch(IOException e) {
                    throw new ICUUncheckedIOException(e);  // Avoid declaring "throws IOException".
                }
            }
            lastCC=0;
        }
        /**
         * Flushes from the intermediate StringBuilder to the Appendable,
         * if they are different objects.
         * Then appends the new text to the Appendable or StringBuilder.
         * Normally used after quick check loops find a non-empty sequence.
         */
        public ReorderingBuffer flushAndAppendZeroCC(CharSequence s, int start, int limit) {
            if(appIsStringBuilder) {
                str.append(s, start, limit);
                reorderStart=str.length();
            } else {
                try {
                    app.append(str).append(s, start, limit);
                    str.setLength(0);
                    reorderStart=0;
                } catch(IOException e) {
                    throw new ICUUncheckedIOException(e);  // Avoid declaring "throws IOException".
                }
            }
            lastCC=0;
            return this;
        }
        public void remove() {
            str.setLength(0);
            lastCC=0;
            reorderStart=0;
        }
        public void removeSuffix(int suffixLength) {
            int oldLength=str.length();
            str.delete(oldLength-suffixLength, oldLength);
            lastCC=0;
            reorderStart=str.length();
        }

        /*
         * TODO: Revisit whether it makes sense to track reorderStart.
         * It is set to after the last known character with cc<=1,
         * which stops previousCC() before it reads that character and looks up its cc.
         * previousCC() is normally only called from insert().
         * In other words, reorderStart speeds up the insertion of a combining mark
         * into a multi-combining mark sequence where it does not belong at the end.
         * This might not be worth the trouble.
         * On the other hand, it's not a huge amount of trouble.
         *
         * We probably need it for UNORM_SIMPLE_APPEND.
         */
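
        // Illustrative sketch, not part of the original source: how append(int, int) and
        // insert(int, int) keep the buffer in canonical order. Assuming a buffer that
        // writes to a StringBuilder:
        //   buffer.append('d');           // starter, cc=0
        //   buffer.append(0x0307, 230);   // COMBINING DOT ABOVE, appended (lastCC<=230)
        //   buffer.append(0x0323, 220);   // COMBINING DOT BELOW, 220<lastCC=230, so
        //                                 // insert() places it before U+0307
        // yielding <d, U+0323, U+0307>, the canonically ordered (NFD) sequence.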

        // Inserts c somewhere before the last character.
        // Requires 0<cc<lastCC which implies reorderStart<limit.
        private void insert(int c, int cc) {
            for(setIterator(), skipPrevious(); previousCC()>cc;) {}
            // insert c at codePointLimit, after the character with prevCC<=cc
            if(c<=0xffff) {
                str.insert(codePointLimit, (char)c);
                if(cc<=1) {
                    reorderStart=codePointLimit+1;
                }
            } else {
                str.insert(codePointLimit, Character.toChars(c));
                if(cc<=1) {
                    reorderStart=codePointLimit+2;
                }
            }
        }

        private final Normalizer2Impl impl;
        private final Appendable app;
        private final StringBuilder str;
        private final boolean appIsStringBuilder;
        private int reorderStart;
        private int lastCC;

        // private backward iterator
        private void setIterator() { codePointStart=str.length(); }
        private void skipPrevious() {  // Requires 0<codePointStart.
            codePointLimit=codePointStart;
            codePointStart=str.offsetByCodePoints(codePointStart, -1);
        }
        private int previousCC() {  // Returns 0 if there is no previous character.
            codePointLimit=codePointStart;
            if(reorderStart>=codePointStart) {
                return 0;
            }
            int c=str.codePointBefore(codePointStart);
            codePointStart-=Character.charCount(c);
            if(c<MIN_CCC_LCCC_CP) {
                return 0;
            }
            return getCCFromYesOrMaybe(impl.getNorm16(c));
        }

        private int codePointStart, codePointLimit;
    }

    // TODO: Propose as public API on the UTF16 class.
    // TODO: Propose widening UTF16 methods that take char to take int.
    // TODO: Propose widening UTF16 methods that take String to take CharSequence.
    public static final class UTF16Plus {
        /**
         * Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
         * is it a lead surrogate?
         * @param c code unit or code point
         * @return true or false
         */
        public static boolean isSurrogateLead(int c) { return (c&0x400)==0; }
        /**
         * Compares two CharSequence objects for binary equality.
         * @param s1 first sequence
         * @param s2 second sequence
         * @return true if s1 contains the same text as s2
         */
        public static boolean equal(CharSequence s1, CharSequence s2) {
            if(s1==s2) {
                return true;
            }
            int length=s1.length();
            if(length!=s2.length()) {
                return false;
            }
            for(int i=0; i<length; ++i) {
                if(s1.charAt(i)!=s2.charAt(i)) {
                    return false;
                }
            }
            return true;
        }
        /**
         * Compares two CharSequence subsequences for binary equality.
         * @param s1 first sequence
         * @param start1 start offset in first sequence
         * @param limit1 limit offset in first sequence
         * @param s2 second sequence
         * @param start2 start offset in second sequence
         * @param limit2 limit offset in second sequence
         * @return true if s1.subSequence(start1, limit1) contains the same text
         *         as s2.subSequence(start2, limit2)
         */
        public static boolean equal(CharSequence s1, int start1, int limit1,
                                    CharSequence s2, int start2, int limit2) {
            if((limit1-start1)!=(limit2-start2)) {
                return false;
            }
            if(s1==s2 && start1==start2) {
                return true;
            }
            while(start1<limit1) {
                if(s1.charAt(start1++)!=s2.charAt(start2++)) {
                    return false;
                }
            }
            return true;
        }
    }

    public Normalizer2Impl() {}

    private static final class IsAcceptable implements ICUBinary.Authenticate {
        // @Override when we switch to Java 6
        public boolean isDataVersionAcceptable(byte version[]) {
            return version[0]==2;
        }
    }
    private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
    private static final int DATA_FORMAT = 0x4e726d32;  // "Nrm2"

    public Normalizer2Impl load(ByteBuffer bytes) {
        try {
            dataVersion=ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, IS_ACCEPTABLE);
            int indexesLength=bytes.getInt()/4;  // inIndexes[IX_NORM_TRIE_OFFSET]/4
            if(indexesLength<=IX_MIN_MAYBE_YES) {
                throw new ICUUncheckedIOException("Normalizer2 data: not enough indexes");
            }
            int[] inIndexes=new int[indexesLength];
            inIndexes[0]=indexesLength*4;
            for(int i=1; i<indexesLength; ++i) {
                inIndexes[i]=bytes.getInt();
            }

            minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
            minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];

            minYesNo=inIndexes[IX_MIN_YES_NO];
            minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
            minNoNo=inIndexes[IX_MIN_NO_NO];
            limitNoNo=inIndexes[IX_LIMIT_NO_NO];
            minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];

            // Read the normTrie.
            int offset=inIndexes[IX_NORM_TRIE_OFFSET];
            int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
            normTrie=Trie2_16.createFromSerialized(bytes);
            int trieLength=normTrie.getSerializedLength();
            if(trieLength>(nextOffset-offset)) {
                throw new ICUUncheckedIOException("Normalizer2 data: not enough bytes for normTrie");
            }
            ICUBinary.skipBytes(bytes, (nextOffset-offset)-trieLength);  // skip padding after trie bytes

            // Read the composition and mapping data.
            offset=nextOffset;
            nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
            int numChars=(nextOffset-offset)/2;
            char[] chars;
            if(numChars!=0) {
                chars=new char[numChars];
                for(int i=0; i<numChars; ++i) {
                    chars[i]=bytes.getChar();
                }
                maybeYesCompositions=new String(chars);
                extraData=maybeYesCompositions.substring(MIN_NORMAL_MAYBE_YES-minMaybeYes);
            }

            // smallFCD: new in formatVersion 2
            offset=nextOffset;
            smallFCD=new byte[0x100];
            for(int i=0; i<0x100; ++i) {
                smallFCD[i]=bytes.get();
            }

            // Build tccc180[].
            // gennorm2 enforces lccc=0 for c<MIN_CCC_LCCC_CP=U+0300.
            tccc180=new int[0x180];
            int bits=0;
            for(int c=0; c<0x180; bits>>=1) {
                if((c&0xff)==0) {
                    bits=smallFCD[c>>8];  // one byte per 0x100 code points
                }
                if((bits&1)!=0) {
                    for(int i=0; i<0x20; ++i, ++c) {
                        tccc180[c]=getFCD16FromNormData(c)&0xff;
                    }
                } else {
                    c+=0x20;
                }
            }

            return this;
        } catch(IOException e) {
            throw new ICUUncheckedIOException(e);
        }
    }
    public Normalizer2Impl load(String name) {
        return load(ICUBinary.getRequiredData(name));
    }

    private void enumLcccRange(int start, int end, int norm16, UnicodeSet set) {
        if(isAlgorithmicNoNo(norm16)) {
            // Range of code points with same-norm16-value algorithmic decompositions.
            // They might have different non-zero FCD16 values.
            do {
                int fcd16=getFCD16(start);
                if(fcd16>0xff) { set.add(start); }
            } while(++start<=end);
        } else {
            int fcd16=getFCD16(start);
            if(fcd16>0xff) { set.add(start, end); }
        }
    }

    private void enumNorm16PropertyStartsRange(int start, int end, int value, UnicodeSet set) {
        /* add the start code point to the USet */
        set.add(start);
        if(start!=end && isAlgorithmicNoNo(value)) {
            // Range of code points with same-norm16-value algorithmic decompositions.
            // They might have different non-zero FCD16 values.
            int prevFCD16=getFCD16(start);
            while(++start<=end) {
                int fcd16=getFCD16(start);
                if(fcd16!=prevFCD16) {
                    set.add(start);
                    prevFCD16=fcd16;
                }
            }
        }
    }

    public void addLcccChars(UnicodeSet set) {
        /* add the start code point of each same-value range of each trie */
        Iterator<Trie2.Range> trieIterator=normTrie.iterator();
        Trie2.Range range;
        while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
            enumLcccRange(range.startCodePoint, range.endCodePoint, range.value, set);
        }
    }

    public void addPropertyStarts(UnicodeSet set) {
        /* add the start code point of each same-value range of each trie */
        Iterator<Trie2.Range> trieIterator=normTrie.iterator();
        Trie2.Range range;
        while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
            enumNorm16PropertyStartsRange(range.startCodePoint, range.endCodePoint, range.value, set);
        }

        /* add Hangul LV syllables and LV+1 because of skippables */
        for(int c=Hangul.HANGUL_BASE; c<Hangul.HANGUL_LIMIT; c+=Hangul.JAMO_T_COUNT) {
            set.add(c);
            set.add(c+1);
        }
        set.add(Hangul.HANGUL_LIMIT);  /* add Hangul+1 to continue with other properties */
    }

    public void addCanonIterPropertyStarts(UnicodeSet set) {
        /* add the start code point of each same-value range of the canonical iterator data trie */
        ensureCanonIterData();
        // currently only used for the SEGMENT_STARTER property
        Iterator<Trie2.Range> trieIterator=canonIterData.iterator(segmentStarterMapper);
        Trie2.Range range;
        while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
            /* add the start code point to the USet */
            set.add(range.startCodePoint);
        }
    }
    private static final Trie2.ValueMapper segmentStarterMapper=new Trie2.ValueMapper() {
        public int map(int in) {
            return in&CANON_NOT_SEGMENT_STARTER;
        }
    };

    // low-level properties ------------------------------------------------ ***
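
    // Illustrative note, not in the original source: smallFCD[] is a 256-byte bitmap over
    // the BMP. Byte lead>>8 covers a block of 0x100 code units, and bit (lead>>5)&7 within
    // that byte covers a sub-block of 0x20 code units; the bit is set if some code unit in
    // that sub-block may carry non-zero FCD data. For example, lead=0x0301 is tested as
    //   (smallFCD[0x03]>>((0x0301>>5)&7))&1   // byte 0x03, bit 0
    // which is how the tccc180[] build loop above and singleLeadMightHaveNonZeroFCD16()
    // below avoid per-character trie lookups.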

    public Trie2_16 getNormTrie() { return normTrie; }

    // Note: Normalizer2Impl.java r30983 (2011-nov-27)
    // still had getFCDTrie() which built and cached an FCD trie.
    // That provided faster access to FCD data than getFCD16FromNormData()
    // but required synchronization and consumed some 10kB of heap memory
    // in any process that uses FCD (e.g., via collation).
    // tccc180[] and smallFCD[] are intended to help with any loss of performance,
    // at least for Latin & CJK.

    /**
     * Builds the canonical-iterator data for this instance.
     * This must be called before {@link #isCanonSegmentStarter(int)} or
     * {@link #getCanonStartSet(int, UnicodeSet)} is called,
     * or else they crash.
     * @return this
     */
    public synchronized Normalizer2Impl ensureCanonIterData() {
        if(canonIterData==null) {
            Trie2Writable newData=new Trie2Writable(0, 0);
            canonStartSets=new ArrayList<UnicodeSet>();
            Iterator<Trie2.Range> trieIterator=normTrie.iterator();
            Trie2.Range range;
            while(trieIterator.hasNext() && !(range=trieIterator.next()).leadSurrogate) {
                final int norm16=range.value;
                if(norm16==0 || (minYesNo<=norm16 && norm16<minNoNo)) {
                    // Inert, or 2-way mapping (including Hangul syllable).
                    // We do not write a canonStartSet for any yesNo character.
                    // Composites from 2-way mappings are added at runtime from the
                    // starter's compositions list, and the other characters in
                    // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are
                    // "maybe" characters.
                    continue;
                }
                for(int c=range.startCodePoint; c<=range.endCodePoint; ++c) {
                    final int oldValue=newData.get(c);
                    int newValue=oldValue;
                    if(norm16>=minMaybeYes) {
                        // not a segment starter if it occurs in a decomposition or has cc!=0
                        newValue|=CANON_NOT_SEGMENT_STARTER;
                        if(norm16<MIN_NORMAL_MAYBE_YES) {
                            newValue|=CANON_HAS_COMPOSITIONS;
                        }
                    } else if(norm16<minYesNo) {
                        newValue|=CANON_HAS_COMPOSITIONS;
                    } else {
                        // c has a one-way decomposition
                        int c2=c;
                        int norm16_2=norm16;
                        while(limitNoNo<=norm16_2 && norm16_2<minMaybeYes) {
                            c2=this.mapAlgorithmic(c2, norm16_2);
                            norm16_2=getNorm16(c2);
                        }
                        if(minYesNo<=norm16_2 && norm16_2<limitNoNo) {
                            // c decomposes, get everything from the variable-length extra data
                            int firstUnit=extraData.charAt(norm16_2);
                            int length=firstUnit&MAPPING_LENGTH_MASK;
                            if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
                                if(c==c2 && (extraData.charAt(norm16_2-1)&0xff)!=0) {
                                    newValue|=CANON_NOT_SEGMENT_STARTER;  // original c has cc!=0
                                }
                            }
                            // Skip empty mappings (no characters in the decomposition).
                            if(length!=0) {
                                ++norm16_2;  // skip over the firstUnit
                                // add c to first code point's start set
                                int limit=norm16_2+length;
                                c2=extraData.codePointAt(norm16_2);
                                addToStartSet(newData, c, c2);
                                // Set CANON_NOT_SEGMENT_STARTER for each remaining code point of a
                                // one-way mapping. A 2-way mapping is possible here after
                                // intermediate algorithmic mapping.
                                if(norm16_2>=minNoNo) {
                                    while((norm16_2+=Character.charCount(c2))<limit) {
                                        c2=extraData.codePointAt(norm16_2);
                                        int c2Value=newData.get(c2);
                                        if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
                                            newData.set(c2, c2Value|CANON_NOT_SEGMENT_STARTER);
                                        }
                                    }
                                }
                            }
                        } else {
                            // c decomposed to c2 algorithmically; c has cc==0
                            addToStartSet(newData, c, c2);
                        }
                    }
                    if(newValue!=oldValue) {
                        newData.set(c, newValue);
                    }
                }
            }
            canonIterData=newData.toTrie2_32();
        }
        return this;
    }

    public int getNorm16(int c) { return normTrie.get(c); }

    public int getCompQuickCheck(int norm16) {
        if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) {
            return 1;  // yes
        } else if(minMaybeYes<=norm16) {
            return 2;  // maybe
        } else {
            return 0;  // no
        }
    }
    public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
    public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
    public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }

    public int getCC(int norm16) {
        if(norm16>=MIN_NORMAL_MAYBE_YES) {
            return norm16&0xff;
        }
        if(norm16<minNoNo || limitNoNo<=norm16) {
            return 0;
        }
        return getCCFromNoNo(norm16);
    }
    public static int getCCFromYesOrMaybe(int norm16) {
        return norm16>=MIN_NORMAL_MAYBE_YES ? norm16&0xff : 0;
    }

    /**
     * Returns the FCD data for code point c.
     * @param c A Unicode code point.
     * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
     */
    public int getFCD16(int c) {
        if(c<0) {
            return 0;
        } else if(c<0x180) {
            return tccc180[c];
        } else if(c<=0xffff) {
            if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
        }
        return getFCD16FromNormData(c);
    }
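
    // Illustrative example, not in the original source: an FCD16 value packs lccc into
    // bits 15..8 and tccc into bits 7..0. For instance, U+00E9 (e with acute) decomposes
    // to <U+0065, U+0301>, so its FCD16 would be 0x00E6 (lccc=0, tccc=230), while
    // U+0344 (which decomposes to <U+0308, U+0301>, both with ccc=230) would be 0xE6E6.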
    /** Returns the FCD data for U+0000<=c<U+0180. */
    public int getFCD16FromBelow180(int c) { return tccc180[c]; }
    /** Returns true if the single-or-lead code unit c might have non-zero FCD data. */
    public boolean singleLeadMightHaveNonZeroFCD16(int lead) {
        // 0<=lead<=0xffff
        byte bits=smallFCD[lead>>8];
        if(bits==0) { return false; }
        return ((bits>>((lead>>5)&7))&1)!=0;
    }

    /** Gets the FCD value from the regular normalization data. */
    public int getFCD16FromNormData(int c) {
        // Only loops for 1:1 algorithmic mappings.
        for(;;) {
            int norm16=getNorm16(c);
            if(norm16<=minYesNo) {
                // no decomposition or Hangul syllable, all zeros
                return 0;
            } else if(norm16>=MIN_NORMAL_MAYBE_YES) {
                // combining mark
                norm16&=0xff;
                return norm16|(norm16<<8);
            } else if(norm16>=minMaybeYes) {
                return 0;
            } else if(isDecompNoAlgorithmic(norm16)) {
                c=mapAlgorithmic(c, norm16);
            } else {
                // c decomposes, get everything from the variable-length extra data
                int firstUnit=extraData.charAt(norm16);
                if((firstUnit&MAPPING_LENGTH_MASK)==0) {
                    // A character that is deleted (maps to an empty string) must
                    // get the worst-case lccc and tccc values because arbitrary
                    // characters on both sides will become adjacent.
                    return 0x1ff;
                } else {
                    int fcd16=firstUnit>>8;  // tccc
                    if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
                        fcd16|=extraData.charAt(norm16-1)&0xff00;  // lccc
                    }
                    return fcd16;
                }
            }
        }
    }

    /**
     * Gets the decomposition for one code point.
     * @param c code point
     * @return c's decomposition, if it has one; returns null if it does not have a decomposition
     */
    public String getDecomposition(int c) {
        int decomp=-1;
        int norm16;
        for(;;) {
            if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
                // c does not decompose
            } else if(isHangul(norm16)) {
                // Hangul syllable: decompose algorithmically
                StringBuilder buffer=new StringBuilder();
                Hangul.decompose(c, buffer);
                return buffer.toString();
            } else if(isDecompNoAlgorithmic(norm16)) {
                decomp=c=mapAlgorithmic(c, norm16);
                continue;
            } else {
                // c decomposes, get everything from the variable-length extra data
                int length=extraData.charAt(norm16++)&MAPPING_LENGTH_MASK;
                return extraData.substring(norm16, norm16+length);
            }
            if(decomp<0) {
                return null;
            } else {
                return UTF16.valueOf(decomp);
            }
        }
    }

    /**
     * Gets the raw decomposition for one code point.
     * @param c code point
     * @return c's raw decomposition, if it has one; returns null if it does not have a decomposition
     */
    public String getRawDecomposition(int c) {
        // We do not loop in this method because an algorithmic mapping itself
        // becomes a final result rather than having to be decomposed recursively.
        int norm16;
        if(c<minDecompNoCP || isDecompYes(norm16=getNorm16(c))) {
            // c does not decompose
            return null;
        } else if(isHangul(norm16)) {
            // Hangul syllable: decompose algorithmically
            StringBuilder buffer=new StringBuilder();
            Hangul.getRawDecomposition(c, buffer);
            return buffer.toString();
        } else if(isDecompNoAlgorithmic(norm16)) {
            return UTF16.valueOf(mapAlgorithmic(c, norm16));
        } else {
            // c decomposes, get everything from the variable-length extra data
            int firstUnit=extraData.charAt(norm16);
            int mLength=firstUnit&MAPPING_LENGTH_MASK;  // length of normal mapping
            if((firstUnit&MAPPING_HAS_RAW_MAPPING)!=0) {
                // Read the raw mapping from before the firstUnit and before the optional ccc/lccc word.
                // Bit 7=MAPPING_HAS_CCC_LCCC_WORD
                int rawMapping=norm16-((firstUnit>>7)&1)-1;
                char rm0=extraData.charAt(rawMapping);
                if(rm0<=MAPPING_LENGTH_MASK) {
                    return extraData.substring(rawMapping-rm0, rawMapping);
                } else {
                    // Copy the normal mapping and replace its first two code units with rm0.
                    StringBuilder buffer=new StringBuilder(mLength-1).append(rm0);
                    norm16+=1+2;  // skip over the firstUnit and the first two mapping code units
                    return buffer.append(extraData, norm16, norm16+mLength-2).toString();
                }
            } else {
                norm16+=1;  // skip over the firstUnit
                return extraData.substring(norm16, norm16+mLength);
            }
        }
    }
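
    // Illustrative example, not in the original source, of the difference between the two
    // methods above: for U+01D5 (U with diaeresis and macron), getRawDecomposition() returns
    // the single-step mapping "\u00DC\u0304", while getDecomposition() returns the full
    // (recursive) decomposition "U\u0308\u0304".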

    /**
     * Returns true if code point c starts a canonical-iterator string segment.
     * <b>{@link #ensureCanonIterData()} must have been called before this method,
     * or else this method will crash.</b>
     * @param c A Unicode code point.
     * @return true if c starts a canonical-iterator string segment.
     */
    public boolean isCanonSegmentStarter(int c) {
        return canonIterData.get(c)>=0;
    }
    /**
     * Returns true if there are characters whose decomposition starts with c.
     * If so, then the set is cleared and then filled with those characters.
     * <b>{@link #ensureCanonIterData()} must have been called before this method,
     * or else this method will crash.</b>
     * @param c A Unicode code point.
     * @param set A UnicodeSet to receive the characters whose decompositions
     *        start with c, if there are any.
     * @return true if there are characters whose decomposition starts with c.
     */
    public boolean getCanonStartSet(int c, UnicodeSet set) {
        int canonValue=canonIterData.get(c)&~CANON_NOT_SEGMENT_STARTER;
        if(canonValue==0) {
            return false;
        }
        set.clear();
        int value=canonValue&CANON_VALUE_MASK;
        if((canonValue&CANON_HAS_SET)!=0) {
            set.addAll(canonStartSets.get(value));
        } else if(value!=0) {
            set.add(value);
        }
        if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
            int norm16=getNorm16(c);
            if(norm16==JAMO_L) {
                int syllable=Hangul.HANGUL_BASE+(c-Hangul.JAMO_L_BASE)*Hangul.JAMO_VT_COUNT;
                set.add(syllable, syllable+Hangul.JAMO_VT_COUNT-1);
            } else {
                addComposites(getCompositionsList(norm16), set);
            }
        }
        return true;
    }

    public static final int MIN_CCC_LCCC_CP=0x300;

    public static final int MIN_YES_YES_WITH_CC=0xff01;
    public static final int JAMO_VT=0xff00;
    public static final int MIN_NORMAL_MAYBE_YES=0xfe00;
    public static final int JAMO_L=1;
    public static final int MAX_DELTA=0x40;

    // Byte offsets from the start of the data, after the generic header.
    public static final int IX_NORM_TRIE_OFFSET=0;
    public static final int IX_EXTRA_DATA_OFFSET=1;
    public static final int IX_SMALL_FCD_OFFSET=2;
    public static final int IX_RESERVED3_OFFSET=3;
    public static final int IX_TOTAL_SIZE=7;

    // Code point thresholds for quick check codes.
    public static final int IX_MIN_DECOMP_NO_CP=8;
    public static final int IX_MIN_COMP_NO_MAYBE_CP=9;

    // Norm16 value thresholds for quick check combinations and types of extra data.
    // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
    public static final int IX_MIN_YES_NO=10;
    public static final int IX_MIN_NO_NO=11;
    public static final int IX_LIMIT_NO_NO=12;
    public static final int IX_MIN_MAYBE_YES=13;

    // Mappings only in [minYesNoMappingsOnly..minNoNo[.
    public static final int IX_MIN_YES_NO_MAPPINGS_ONLY=14;

    public static final int IX_COUNT=16;

    public static final int MAPPING_HAS_CCC_LCCC_WORD=0x80;
    public static final int MAPPING_HAS_RAW_MAPPING=0x40;
    public static final int MAPPING_NO_COMP_BOUNDARY_AFTER=0x20;
    public static final int MAPPING_LENGTH_MASK=0x1f;

    public static final int COMP_1_LAST_TUPLE=0x8000;
    public static final int COMP_1_TRIPLE=1;
    public static final int COMP_1_TRAIL_LIMIT=0x3400;
    public static final int COMP_1_TRAIL_MASK=0x7ffe;
    public static final int COMP_1_TRAIL_SHIFT=9;  // 10-1 for the "triple" bit
    public static final int COMP_2_TRAIL_SHIFT=6;
    public static final int COMP_2_TRAIL_MASK=0xffc0;
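
    // Illustrative note, not in the original source: the first unit of an extraData mapping
    // encodes the mapping length in MAPPING_LENGTH_MASK and the mapping's tccc in its high
    // byte. A hypothetical firstUnit of 0xE682 would therefore mean: length 2,
    // MAPPING_HAS_CCC_LCCC_WORD set (an lccc/ccc word precedes the firstUnit), and tccc=0xE6;
    // getFCD16FromNormData() and hasCompBoundaryAfter() read exactly these fields.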

    // higher-level functionality ------------------------------------------ ***

    // NFD without an NFD Normalizer2 instance.
    public Appendable decompose(CharSequence s, StringBuilder dest) {
        decompose(s, 0, s.length(), dest, s.length());
        return dest;
    }
    /**
     * Decomposes s[src, limit[ and writes the result to dest.
     * destLengthEstimate is the initial dest buffer capacity and can be -1.
     */
    public void decompose(CharSequence s, int src, int limit, StringBuilder dest,
                          int destLengthEstimate) {
        if(destLengthEstimate<0) {
            destLengthEstimate=limit-src;
        }
        dest.setLength(0);
        ReorderingBuffer buffer=new ReorderingBuffer(this, dest, destLengthEstimate);
        decompose(s, src, limit, buffer);
    }

    // Dual functionality:
    // buffer!=NULL: normalize
    // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
    public int decompose(CharSequence s, int src, int limit,
                         ReorderingBuffer buffer) {
        int minNoCP=minDecompNoCP;

        int prevSrc;
        int c=0;
        int norm16=0;

        // only for quick check
        int prevBoundary=src;
        int prevCC=0;

        for(;;) {
            // count code units below the minimum or with irrelevant data for the quick check
            for(prevSrc=src; src!=limit;) {
                if( (c=s.charAt(src))<minNoCP ||
                    isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
                ) {
                    ++src;
                } else if(!UTF16.isSurrogate((char)c)) {
                    break;
                } else {
                    char c2;
                    if(UTF16Plus.isSurrogateLead(c)) {
                        if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
                            c=Character.toCodePoint((char)c, c2);
                        }
                    } else /* trail surrogate */ {
                        if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
                            --src;
                            c=Character.toCodePoint(c2, (char)c);
                        }
                    }
                    if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
                        src+=Character.charCount(c);
                    } else {
                        break;
                    }
                }
            }
            // copy these code units all at once
            if(src!=prevSrc) {
                if(buffer!=null) {
                    buffer.flushAndAppendZeroCC(s, prevSrc, src);
                } else {
                    prevCC=0;
                    prevBoundary=src;
                }
            }
            if(src==limit) {
                break;
            }

            // Check one above-minimum, relevant code point.
            src+=Character.charCount(c);
            if(buffer!=null) {
                decompose(c, norm16, buffer);
            } else {
                if(isDecompYes(norm16)) {
                    int cc=getCCFromYesOrMaybe(norm16);
                    if(prevCC<=cc || cc==0) {
                        prevCC=cc;
                        if(cc<=1) {
                            prevBoundary=src;
                        }
                        continue;
                    }
                }
                return prevBoundary;  // "no" or cc out of order
            }
        }
        return src;
    }
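
    // Illustrative sketch, not in the original source, of the dual functionality above:
    //   decompose("a\u0301", 0, 2, buffer)   // buffer!=null: writes <a, U+0301> (already NFD)
    //   decompose("a\u0301", 0, 2, null)     // quick check: returns 2 == limit, i.e. "yes"
    //   decompose("\u00e9", 0, 1, null)      // quick check: returns 0, i.e. "no" at offset 0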
    public void decomposeAndAppend(CharSequence s, boolean doDecompose, ReorderingBuffer buffer) {
        int limit=s.length();
        if(limit==0) {
            return;
        }
        if(doDecompose) {
            decompose(s, 0, limit, buffer);
            return;
        }
        // Just merge the strings at the boundary.
        int c=Character.codePointAt(s, 0);
        int src=0;
        int firstCC, prevCC, cc;
        firstCC=prevCC=cc=getCC(getNorm16(c));
        while(cc!=0) {
            prevCC=cc;
            src+=Character.charCount(c);
            if(src>=limit) {
                break;
            }
            c=Character.codePointAt(s, src);
            cc=getCC(getNorm16(c));
        }
        buffer.append(s, 0, src, firstCC, prevCC);
        buffer.append(s, src, limit);
    }
    // Very similar to composeQuickCheck(): Make the same changes in both places if relevant.
    // doCompose: normalize
    // !doCompose: isNormalized (buffer must be empty and initialized)
    public boolean compose(CharSequence s, int src, int limit,
                           boolean onlyContiguous,
                           boolean doCompose,
                           ReorderingBuffer buffer) {
        int minNoMaybeCP=minCompNoMaybeCP;

        /*
         * prevBoundary points to the last character before the current one
         * that has a composition boundary before it with ccc==0 and quick check "yes".
         * Keeping track of prevBoundary saves us looking for a composition boundary
         * when we find a "no" or "maybe".
         *
         * When we back out from prevSrc back to prevBoundary,
         * then we also remove those same characters (which had been simply copied
         * or canonically-order-inserted) from the ReorderingBuffer.
         * Therefore, at all times, the [prevBoundary..prevSrc[ source units
         * must correspond 1:1 to destination units at the end of the destination buffer.
         */
        int prevBoundary=src;
        int prevSrc;
        int c=0;
        int norm16=0;

        // only for isNormalized
        int prevCC=0;

        for(;;) {
            // count code units below the minimum or with irrelevant data for the quick check
            for(prevSrc=src; src!=limit;) {
                if( (c=s.charAt(src))<minNoMaybeCP ||
                    isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
                ) {
                    ++src;
                } else if(!UTF16.isSurrogate((char)c)) {
                    break;
                } else {
                    char c2;
                    if(UTF16Plus.isSurrogateLead(c)) {
                        if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
                            c=Character.toCodePoint((char)c, c2);
                        }
                    } else /* trail surrogate */ {
                        if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
                            --src;
                            c=Character.toCodePoint(c2, (char)c);
                        }
                    }
                    if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
                        src+=Character.charCount(c);
                    } else {
                        break;
                    }
                }
            }
            // copy these code units all at once
            if(src!=prevSrc) {
                if(src==limit) {
                    if(doCompose) {
                        buffer.flushAndAppendZeroCC(s, prevSrc, src);
                    }
                    break;
                }
                // Set prevBoundary to the last character in the quick check loop.
                prevBoundary=src-1;
                if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&
                    Character.isHighSurrogate(s.charAt(prevBoundary-1))
                ) {
                    --prevBoundary;
                }
                if(doCompose) {
                    // The last "quick check yes" character is excluded from the
                    // flush-and-append call in case it needs to be modified.
                    buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
                    buffer.append(s, prevBoundary, src);
                } else {
                    prevCC=0;
                }
                // The start of the current character (c).
                prevSrc=src;
            } else if(src==limit) {
                break;
            }

            src+=Character.charCount(c);
            /*
             * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
             * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
             * or has ccc!=0.
             * Check for Jamo V/T, then for regular characters.
             * c is not a Hangul syllable or Jamo L because those have "yes" properties.
             */
            if(isJamoVT(norm16) && prevBoundary!=prevSrc) {
                char prev=s.charAt(prevSrc-1);
                boolean needToDecompose=false;
                if(c<Hangul.JAMO_T_BASE) {
                    // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
                    prev-=Hangul.JAMO_L_BASE;
                    if(prev<Hangul.JAMO_L_COUNT) {
                        if(!doCompose) {
                            return false;
                        }
                        char syllable=(char)
                            (Hangul.HANGUL_BASE+
                             (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
                             Hangul.JAMO_T_COUNT);
                        char t;
                        if(src!=limit && (t=(char)(s.charAt(src)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
                            ++src;
                            syllable+=t;  // The next character was a Jamo T.
                            prevBoundary=src;
                            buffer.setLastChar(syllable);
                            continue;
                        }
                        // If we see L+V+x where x!=T then we drop to the slow path,
                        // decompose and recompose.
                        // This is to deal with NFKC finding normal L and V but a
                        // compatibility variant of a T. We need to either fully compose that
                        // combination here (which would complicate the code and may not work
                        // with strange custom data) or use the slow path -- or else our replacing
                        // two input characters (L+V) with one output character (LV syllable)
                        // would violate the invariant that [prevBoundary..prevSrc[ has the same
                        // length as what we appended to the buffer since prevBoundary.
                        needToDecompose=true;
                    }
                } else if(Hangul.isHangulWithoutJamoT(prev)) {
                    // c is a Jamo Trailing consonant,
                    // compose with previous Hangul LV that does not contain a Jamo T.
                    if(!doCompose) {
                        return false;
                    }
                    buffer.setLastChar((char)(prev+c-Hangul.JAMO_T_BASE));
                    prevBoundary=src;
                    continue;
                }
                if(!needToDecompose) {
                    // The Jamo V/T did not compose into a Hangul syllable.
                    if(doCompose) {
                        buffer.append((char)c);
                    } else {
                        prevCC=0;
                    }
                    continue;
                }
            }
            /*
             * Source buffer pointers:
             *
             *  all done      quick check   current char  not yet
             *                "yes" but     (c)           processed
             *                may combine
             *                forward
             * [-------------[-------------[-------------[-------------[
             * |             |             |             |             |
             * orig. src     prevBoundary  prevSrc       src           limit
             *
             *
             * Destination buffer pointers inside the ReorderingBuffer:
             *
             *  all done      might take    not filled yet
             *                characters for
             *                reordering
             * [-------------[-------------[-------------[
             * |             |             |             |
             * start         reorderStart  limit         |
             *                                           +remainingCap.+
             */
            if(norm16>=MIN_YES_YES_WITH_CC) {
                int cc=norm16&0xff;  // cc!=0
                if( onlyContiguous &&  // FCC
                    (doCompose ? buffer.getLastCC() : prevCC)==0 &&
                    prevBoundary<prevSrc &&
                    // buffer.getLastCC()==0 && prevBoundary<prevSrc tell us that
                    // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
                    // passed the quick check "yes && ccc==0" test.
                    // Check whether the last character was a "yesYes" or a "yesNo".
                    // If a "yesNo", then we get its trailing ccc from its
                    // mapping and check for canonical order.
                    // All other cases are ok.
                    getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc
                ) {
                    // Fails FCD test, need to decompose and contiguously recompose.
                    if(!doCompose) {
                        return false;
                    }
                } else if(doCompose) {
                    buffer.append(c, cc);
                    continue;
                } else if(prevCC<=cc) {
                    prevCC=cc;
                    continue;
                } else {
                    return false;
                }
            } else if(!doCompose && !isMaybeOrNonZeroCC(norm16)) {
                return false;
            }

            /*
             * Find appropriate boundaries around this character,
             * decompose the source text from between the boundaries,
             * and recompose it.
             *
             * We may need to remove the last few characters from the ReorderingBuffer
             * to account for source text that was copied or appended
             * but needs to take part in the recomposition.
             */

            /*
             * Find the last composition boundary in [prevBoundary..src[.
             * It is either the decomposition of the current character (at prevSrc),
             * or prevBoundary.
             */
            if(hasCompBoundaryBefore(c, norm16)) {
                prevBoundary=prevSrc;
            } else if(doCompose) {
                buffer.removeSuffix(prevSrc-prevBoundary);
            }

            // Find the next composition boundary in [src..limit[ -
            // modifies src to point to the next starter.
            src=findNextCompBoundary(s, src, limit);

            // Decompose [prevBoundary..src[ into the buffer and then recompose that part of it.
            int recomposeStartIndex=buffer.length();
            decomposeShort(s, prevBoundary, src, buffer);
            recompose(buffer, recomposeStartIndex, onlyContiguous);
            if(!doCompose) {
                if(!buffer.equals(s, prevBoundary, src)) {
                    return false;
                }
                buffer.remove();
                prevCC=0;
            }

            // Move to the next starter. We never need to look back before this point again.
            prevBoundary=src;
        }
        return true;
    }
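
    // Illustrative note, not in the original source, for composeQuickCheck() below: the
    // return value packs the "yes" span length into bits 31..1 and a "maybe" flag into
    // bit 0. For example, a 5-char prefix that is already composed would yield 5<<1 == 10;
    // if a "maybe" character was seen (and doSpan is false), bit 0 is set, e.g. 10|1 == 11.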
    /**
     * Very similar to compose(): Make the same changes in both places if relevant.
     * doSpan: spanQuickCheckYes (ignore bit 0 of the return value)
     * !doSpan: quickCheck
     * @return bits 31..1: spanQuickCheckYes (==s.length() if "yes") and
     *         bit 0: set if "maybe"; otherwise, if the span length<s.length()
     *         then the quick check result is "no"
     */
    public int composeQuickCheck(CharSequence s, int src, int limit,
                                 boolean onlyContiguous, boolean doSpan) {
        int qcResult=0;
        int minNoMaybeCP=minCompNoMaybeCP;

        /*
         * prevBoundary points to the last character before the current one
         * that has a composition boundary before it with ccc==0 and quick check "yes".
         */
        int prevBoundary=src;
        int prevSrc;
        int c=0;
        int norm16=0;
        int prevCC=0;

        for(;;) {
            // count code units below the minimum or with irrelevant data for the quick check
            for(prevSrc=src;;) {
                if(src==limit) {
                    return (src<<1)|qcResult;  // "yes" or "maybe"
                }
                if( (c=s.charAt(src))<minNoMaybeCP ||
                    isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
                ) {
                    ++src;
                } else if(!UTF16.isSurrogate((char)c)) {
                    break;
                } else {
                    char c2;
                    if(UTF16Plus.isSurrogateLead(c)) {
                        if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
                            c=Character.toCodePoint((char)c, c2);
                        }
                    } else /* trail surrogate */ {
                        if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
                            --src;
                            c=Character.toCodePoint(c2, (char)c);
                        }
                    }
                    if(isCompYesAndZeroCC(norm16=getNorm16(c))) {
                        src+=Character.charCount(c);
                    } else {
                        break;
                    }
                }
            }
            if(src!=prevSrc) {
                // Set prevBoundary to the last character in the quick check loop.
                prevBoundary=src-1;
                if( Character.isLowSurrogate(s.charAt(prevBoundary)) && prevSrc<prevBoundary &&
                    Character.isHighSurrogate(s.charAt(prevBoundary-1))
                ) {
                    --prevBoundary;
                }
                prevCC=0;
                // The start of the current character (c).
                prevSrc=src;
            }

            src+=Character.charCount(c);
            /*
             * isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo.
             * c is either a "noNo" (has a mapping) or a "maybeYes" (combines backward)
             * or has ccc!=0.
             */
            if(isMaybeOrNonZeroCC(norm16)) {
                int cc=getCCFromYesOrMaybe(norm16);
                if( onlyContiguous &&  // FCC
                    cc!=0 &&
                    prevCC==0 &&
                    prevBoundary<prevSrc &&
                    // prevCC==0 && prevBoundary<prevSrc tell us that
                    // [prevBoundary..prevSrc[ (which is exactly one character under these conditions)
                    // passed the quick check "yes && ccc==0" test.
                    // Check whether the last character was a "yesYes" or a "yesNo".
                    // If a "yesNo", then we get its trailing ccc from its
                    // mapping and check for canonical order.
                    // All other cases are ok.
                    getTrailCCFromCompYesAndZeroCC(s, prevBoundary, prevSrc)>cc
                ) {
                    // Fails FCD test.
                } else if(prevCC<=cc || cc==0) {
                    prevCC=cc;
                    if(norm16<MIN_YES_YES_WITH_CC) {
                        if(!doSpan) {
                            qcResult=1;
                        } else {
                            return prevBoundary<<1;  // spanYes does not care to know it's "maybe"
                        }
                    }
                    continue;
                }
            }
            return prevBoundary<<1;  // "no"
        }
    }
    public void composeAndAppend(CharSequence s,
                                 boolean doCompose,
                                 boolean onlyContiguous,
                                 ReorderingBuffer buffer) {
        int src=0, limit=s.length();
        if(!buffer.isEmpty()) {
            int firstStarterInSrc=findNextCompBoundary(s, 0, limit);
            if(0!=firstStarterInSrc) {
                int lastStarterInDest=findPreviousCompBoundary(buffer.getStringBuilder(),
                                                               buffer.length());
                StringBuilder middle=new StringBuilder((buffer.length()-lastStarterInDest)+
                                                       firstStarterInSrc+16);
                middle.append(buffer.getStringBuilder(), lastStarterInDest, buffer.length());
                buffer.removeSuffix(buffer.length()-lastStarterInDest);
                middle.append(s, 0, firstStarterInSrc);
                compose(middle, 0, middle.length(), onlyContiguous, true, buffer);
                src=firstStarterInSrc;
            }
        }
        if(doCompose) {
            compose(s, src, limit, onlyContiguous, true, buffer);
        } else {
            buffer.append(s, src, limit);
        }
    }
    // Dual functionality:
    // buffer!=NULL: normalize
    // buffer==NULL: isNormalized/quickCheck/spanQuickCheckYes
    public int makeFCD(CharSequence s, int src, int limit, ReorderingBuffer buffer) {
        // Note: In this function we use buffer->appendZeroCC() because we track
        // the lead and trail combining classes here, rather than leaving it to
        // the ReorderingBuffer.
        // The exception is the call to decomposeShort() which uses the buffer
        // in the normal way.

        // Tracks the last FCD-safe boundary, before lccc=0 or after properly-ordered tccc<=1.
        // Similar to the prevBoundary in the compose() implementation.
        int prevBoundary=src;
        int prevSrc;
        int c=0;
        int prevFCD16=0;
        int fcd16=0;

        for(;;) {
            // count code units with lccc==0
            for(prevSrc=src; src!=limit;) {
                if((c=s.charAt(src))<MIN_CCC_LCCC_CP) {
                    prevFCD16=~c;
                    ++src;
                } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
                    prevFCD16=0;
                    ++src;
                } else {
                    if(UTF16.isSurrogate((char)c)) {
                        char c2;
                        if(UTF16Plus.isSurrogateLead(c)) {
                            if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
                                c=Character.toCodePoint((char)c, c2);
                            }
                        } else /* trail surrogate */ {
                            if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
                                --src;
                                c=Character.toCodePoint(c2, (char)c);
                            }
                        }
                    }
                    if((fcd16=getFCD16FromNormData(c))<=0xff) {
                        prevFCD16=fcd16;
                        src+=Character.charCount(c);
                    } else {
                        break;
                    }
                }
            }
            // copy these code units all at once
            if(src!=prevSrc) {
                if(src==limit) {
                    if(buffer!=null) {
                        buffer.flushAndAppendZeroCC(s, prevSrc, src);
                    }
                    break;
                }
                prevBoundary=src;
                // We know that the previous character's lccc==0.
                if(prevFCD16<0) {
                    // Fetching the fcd16 value was deferred for this below-U+0300 code point.
                    int prev=~prevFCD16;
                    prevFCD16= prev<0x180 ? tccc180[prev] : getFCD16FromNormData(prev);
                    if(prevFCD16>1) {
                        --prevBoundary;
                    }
                } else {
                    int p=src-1;
                    if( Character.isLowSurrogate(s.charAt(p)) && prevSrc<p &&
                        Character.isHighSurrogate(s.charAt(p-1))
                    ) {
                        --p;
                        // Need to fetch the previous character's FCD value because
                        // prevFCD16 was just for the trail surrogate code point.
                        prevFCD16=getFCD16FromNormData(Character.toCodePoint(s.charAt(p), s.charAt(p+1)));
                        // Still known to have lccc==0 because its lead surrogate unit had lccc==0.
                    }
                    if(prevFCD16>1) {
                        prevBoundary=p;
                    }
                }
                if(buffer!=null) {
                    // The last lccc==0 character is excluded from the
                    // flush-and-append call in case it needs to be modified.
                    buffer.flushAndAppendZeroCC(s, prevSrc, prevBoundary);
                    buffer.append(s, prevBoundary, src);
                }
                // The start of the current character (c).
                prevSrc=src;
            } else if(src==limit) {
                break;
            }

            src+=Character.charCount(c);
            // The current character (c) at [prevSrc..src[ has a non-zero lead combining class.
            // Check for proper order, and decompose locally if necessary.
            if((prevFCD16&0xff)<=(fcd16>>8)) {
                // proper order: prev tccc <= current lccc
                if((fcd16&0xff)<=1) {
                    prevBoundary=src;
                }
                if(buffer!=null) {
                    buffer.appendZeroCC(c);
                }
                prevFCD16=fcd16;
                continue;
            } else if(buffer==null) {
                return prevBoundary;  // quick check "no"
            } else {
                /*
                 * Back out the part of the source that we copied or appended
                 * already but is now going to be decomposed.
                 * prevSrc is set to after what was copied/appended.
                 */
                buffer.removeSuffix(prevSrc-prevBoundary);
                /*
                 * Find the part of the source that needs to be decomposed,
                 * up to the next safe boundary.
                 */
                src=findNextFCDBoundary(s, src, limit);
                /*
                 * The source text does not fulfill the conditions for FCD.
                 * Decompose and reorder a limited piece of the text.
                 */
                decomposeShort(s, prevBoundary, src, buffer);
                prevBoundary=src;
                prevFCD16=0;
            }
        }
        return src;
    }
    public void makeFCDAndAppend(CharSequence s, boolean doMakeFCD, ReorderingBuffer buffer) {
        int src=0, limit=s.length();
        if(!buffer.isEmpty()) {
            int firstBoundaryInSrc=findNextFCDBoundary(s, 0, limit);
            if(0!=firstBoundaryInSrc) {
                int lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStringBuilder(),
                                                               buffer.length());
                StringBuilder middle=new StringBuilder((buffer.length()-lastBoundaryInDest)+
                                                       firstBoundaryInSrc+16);
                middle.append(buffer.getStringBuilder(), lastBoundaryInDest, buffer.length());
                buffer.removeSuffix(buffer.length()-lastBoundaryInDest);
                middle.append(s, 0, firstBoundaryInSrc);
                makeFCD(middle, 0, middle.length(), buffer);
                src=firstBoundaryInSrc;
            }
        }
        if(doMakeFCD) {
            makeFCD(s, src, limit, buffer);
        } else {
            buffer.append(s, src, limit);
        }
    }
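
    // Illustrative example, not in the original source: a string fails the FCD condition
    // when a character's trailing ccc exceeds the next character's leading ccc. For
    // "\u00E9\u0323" (e-acute followed by combining dot below), tccc(U+00E9)=230 is
    // greater than lccc(U+0323)=220, so makeFCD() decomposes and reorders that span
    // to "e\u0323\u0301".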

    // Note: hasDecompBoundary() could be implemented as aliases to
    // hasFCDBoundaryBefore() and hasFCDBoundaryAfter()
    // at the cost of building the FCD trie for a decomposition normalizer.
    public boolean hasDecompBoundary(int c, boolean before) {
        for(;;) {
            if(c<minDecompNoCP) {
                return true;
            }
            int norm16=getNorm16(c);
            if(isHangul(norm16) || isDecompYesAndZeroCC(norm16)) {
                return true;
            } else if(norm16>MIN_NORMAL_MAYBE_YES) {
                return false;  // ccc!=0
            } else if(isDecompNoAlgorithmic(norm16)) {
                c=mapAlgorithmic(c, norm16);
            } else {
                // c decomposes, get everything from the variable-length extra data
                int firstUnit=extraData.charAt(norm16);
                if((firstUnit&MAPPING_LENGTH_MASK)==0) {
                    return false;
                }
                if(!before) {
                    // decomp after-boundary: same as hasFCDBoundaryAfter(),
                    // fcd16<=1 || trailCC==0
                    if(firstUnit>0x1ff) {
                        return false;  // trailCC>1
                    }
                    if(firstUnit<=0xff) {
                        return true;  // trailCC==0
                    }
                    // if(trailCC==1) test leadCC==0, same as checking for before-boundary
                }
                // true if leadCC==0 (hasFCDBoundaryBefore())
                return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (extraData.charAt(norm16-1)&0xff00)==0;
            }
        }
    }
    public boolean isDecompInert(int c) { return isDecompYesAndZeroCC(getNorm16(c)); }

    public boolean hasCompBoundaryBefore(int c) {
        return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c));
    }
    public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous, boolean testInert) {
        for(;;) {
            int norm16=getNorm16(c);
            if(isInert(norm16)) {
                return true;
            } else if(norm16<=minYesNo) {
                // Hangul: norm16==minYesNo
                // Hangul LVT has a boundary after it.
                // Hangul LV and non-inert yesYes characters combine forward.
                return isHangul(norm16) && !Hangul.isHangulWithoutJamoT((char)c);
            } else if(norm16>= (testInert ? minNoNo : minMaybeYes)) {
                return false;
            } else if(isDecompNoAlgorithmic(norm16)) {
                c=mapAlgorithmic(c, norm16);
            } else {
                // c decomposes, get everything from the variable-length extra data.
                // If testInert, then c must be a yesNo character which has lccc=0,
                // otherwise it could be a noNo.
    public boolean hasCompBoundaryAfter(int c, boolean onlyContiguous, boolean testInert) {
        for(;;) {
            int norm16=getNorm16(c);
            if(isInert(norm16)) {
                return true;
            } else if(norm16<=minYesNo) {
                // Hangul: norm16==minYesNo
                // Hangul LVT has a boundary after it.
                // Hangul LV and non-inert yesYes characters combine forward.
                return isHangul(norm16) && !Hangul.isHangulWithoutJamoT((char)c);
            } else if(norm16>=(testInert ? minNoNo : minMaybeYes)) {
                return false;
            } else if(isDecompNoAlgorithmic(norm16)) {
                c=mapAlgorithmic(c, norm16);
            } else {
                // c decomposes, get everything from the variable-length extra data.
                // If testInert, then c must be a yesNo character which has lccc=0,
                // otherwise it could be a noNo.
                int firstUnit=extraData.charAt(norm16);
                // true if
                //      not MAPPING_NO_COMP_BOUNDARY_AFTER
                //      (which is set if
                //          c is not deleted, and
                //          it and its decomposition do not combine forward, and it has a starter)
                //  and if FCC then trailCC<=1
                return
                    (firstUnit&MAPPING_NO_COMP_BOUNDARY_AFTER)==0 &&
                    (!onlyContiguous || firstUnit<=0x1ff);
            }
        }
    }

    public boolean hasFCDBoundaryBefore(int c) { return c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff; }
    public boolean hasFCDBoundaryAfter(int c) {
        int fcd16=getFCD16(c);
        return fcd16<=1 || (fcd16&0xff)==0;
    }
    public boolean isFCDInert(int c) { return getFCD16(c)<=1; }

    private boolean isMaybe(int norm16) { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
    private boolean isMaybeOrNonZeroCC(int norm16) { return norm16>=minMaybeYes; }
    private static boolean isInert(int norm16) { return norm16==0; }
    private static boolean isJamoL(int norm16) { return norm16==1; }
    private static boolean isJamoVT(int norm16) { return norm16==JAMO_VT; }
    private boolean isHangul(int norm16) { return norm16==minYesNo; }
    private boolean isCompYesAndZeroCC(int norm16) { return norm16<minNoNo; }
    // UBool isCompYes(uint16_t norm16) const {
    //     return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
    // }
    // UBool isCompYesOrMaybe(uint16_t norm16) const {
    //     return norm16<minNoNo || minMaybeYes<=norm16;
    // }
    // private boolean hasZeroCCFromDecompYes(int norm16) {
    //     return norm16<=MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
    // }
    private boolean isDecompYesAndZeroCC(int norm16) {
        return norm16<minYesNo ||
               norm16==JAMO_VT ||
               (minMaybeYes<=norm16 && norm16<=MIN_NORMAL_MAYBE_YES);
    }
    /**
     * A little faster and simpler than isDecompYesAndZeroCC() but does not include
     * the MaybeYes which combine-forward and have ccc=0.
     * (Standard Unicode 5.2 normalization does not have such characters.)
     */
    private boolean isMostDecompYesAndZeroCC(int norm16) {
        return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
    }
    private boolean isDecompNoAlgorithmic(int norm16) { return norm16>=limitNoNo; }
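    // Illustration: an FCD16 value packs the lead combining class of a character's canonical
    // decomposition into the high byte and the trail combining class into the low byte, which
    // is what the FCD predicates above test. For example (values as in standard Unicode data,
    // shown only as an aid to reading the code):
    //     'A'       -> fcd16 = 0x0000  (lccc=0, tccc=0: FCD-inert)
    //     U+00C4 Ä  -> fcd16 = 0x00E6  (lccc=0, tccc=230: NFD is 'A' + U+0308)
    //     U+0308    -> fcd16 = 0xE6E6  (lccc=tccc=230: a combining mark)
    // So hasFCDBoundaryBefore() checks lccc==0 (fcd16<=0xff), and hasFCDBoundaryAfter()
    // checks fcd16<=1 or tccc==0.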
    // For use with isCompYes().
    // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
    // static uint8_t getCCFromYes(uint16_t norm16) {
    //     return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0;
    // }
    private int getCCFromNoNo(int norm16) {
        if((extraData.charAt(norm16)&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
            return extraData.charAt(norm16-1)&0xff;
        } else {
            return 0;
        }
    }
    // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
    int getTrailCCFromCompYesAndZeroCC(CharSequence s, int cpStart, int cpLimit) {
        int c;
        if(cpStart==(cpLimit-1)) {
            c=s.charAt(cpStart);
        } else {
            c=Character.codePointAt(s, cpStart);
        }
        int prevNorm16=getNorm16(c);
        if(prevNorm16<=minYesNo) {
            return 0;  // yesYes and Hangul LV/LVT have ccc=tccc=0
        } else {
            return extraData.charAt(prevNorm16)>>8;  // tccc from yesNo
        }
    }

    // Requires algorithmic-NoNo.
    private int mapAlgorithmic(int c, int norm16) {
        return c+norm16-(minMaybeYes-MAX_DELTA-1);
    }

    // Requires minYesNo<norm16<limitNoNo.
    // private int getMapping(int norm16) { return /*extraData+*/norm16; }

    /**
     * @return index into maybeYesCompositions, or -1
     */
    private int getCompositionsListForDecompYes(int norm16) {
        if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) {
            return -1;
        } else {
            if((norm16-=minMaybeYes)<0) {
                // norm16<minMaybeYes: index into extraData which is a substring at
                //     maybeYesCompositions[MIN_NORMAL_MAYBE_YES-minMaybeYes]
                // same as (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16
                norm16+=MIN_NORMAL_MAYBE_YES;  // for yesYes; if Jamo L: harmless empty list
            }
            return norm16;
        }
    }
    /**
     * @return index into maybeYesCompositions
     */
    private int getCompositionsListForComposite(int norm16) {
        // composite has both mapping & compositions list
        int firstUnit=extraData.charAt(norm16);
        return (MIN_NORMAL_MAYBE_YES-minMaybeYes)+norm16+  // mapping in maybeYesCompositions
            1+  // +1 to skip the first unit with the mapping length
            (firstUnit&MAPPING_LENGTH_MASK);  // + mapping length
    }
    /**
     * @param norm16 norm16 value of a code point which must have compositions
     * @return index into maybeYesCompositions
     */
    private int getCompositionsList(int norm16) {
        return isDecompYes(norm16) ?
                getCompositionsListForDecompYes(norm16) :
                getCompositionsListForComposite(norm16);
    }

    // Decompose a short piece of text which is likely to contain characters that
    // fail the quick check loop and/or where the quick check loop's overhead
    // is unlikely to be amortized.
    // Called by the compose() and makeFCD() implementations.
    // Public in Java for collation implementation code.
    public void decomposeShort(CharSequence s, int src, int limit,
                               ReorderingBuffer buffer) {
        while(src<limit) {
            int c=Character.codePointAt(s, src);
            src+=Character.charCount(c);
            decompose(c, getNorm16(c), buffer);
        }
    }
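    // Illustration: the effect of decompose()/decomposeShort() is observable through the public
    // Normalizer2 API. A minimal sketch of the expected results:
    //
    //     import com.ibm.icu.text.Normalizer2;
    //
    //     Normalizer2 nfd=Normalizer2.getNFDInstance();
    //     nfd.getDecomposition(0x00C4);   // "A\u0308"             (one-step mapping from extraData)
    //     nfd.getDecomposition(0xAC01);   // "\u1100\u1161\u11A8"  (Hangul, computed algorithmically)
    //     nfd.normalize("\u00C4\u0323");  // "A\u0323\u0308"       (decomposed and canonically reordered)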
    private void decompose(int c, int norm16,
                           ReorderingBuffer buffer) {
        // Only loops for 1:1 algorithmic mappings.
        for(;;) {
            // get the decomposition and the lead and trail cc's
            if(isDecompYes(norm16)) {
                // c does not decompose
                buffer.append(c, getCCFromYesOrMaybe(norm16));
            } else if(isHangul(norm16)) {
                // Hangul syllable: decompose algorithmically
                Hangul.decompose(c, buffer);
            } else if(isDecompNoAlgorithmic(norm16)) {
                c=mapAlgorithmic(c, norm16);
                norm16=getNorm16(c);
                continue;
            } else {
                // c decomposes, get everything from the variable-length extra data
                int firstUnit=extraData.charAt(norm16);
                int length=firstUnit&MAPPING_LENGTH_MASK;
                int leadCC, trailCC;
                trailCC=firstUnit>>8;
                if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) {
                    leadCC=extraData.charAt(norm16-1)>>8;
                } else {
                    leadCC=0;
                }
                ++norm16;  // skip over the firstUnit
                buffer.append(extraData, norm16, norm16+length, leadCC, trailCC);
            }
            return;
        }
    }
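    // Summary note: as read by decompose(), getCCFromNoNo() and hasDecompBoundary() above,
    // an extraData mapping entry has the shape
    //
    //     [ optional ccc/lccc word ] [ firstUnit ] [ mapping code units ... ]
    //
    // where firstUnit carries the mapping's trail combining class in its high byte, the mapping
    // length in the MAPPING_LENGTH_MASK bits, and flag bits such as MAPPING_HAS_CCC_LCCC_WORD
    // and MAPPING_NO_COMP_BOUNDARY_AFTER in between; the optional preceding word carries the
    // character's own ccc in its low byte and the mapping's lead combining class in its high byte.
    // norm16 points at firstUnit, so that optional word is at norm16-1.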
    /**
     * Finds the recomposition result for
     * a forward-combining "lead" character,
     * specified with a pointer to its compositions list,
     * and a backward-combining "trail" character.
     *
     * <p>If the lead and trail characters combine, then this function returns
     * the following "compositeAndFwd" value:
     * <pre>
     * Bits 21..1  composite character
     * Bit      0  set if the composite is a forward-combining starter
     * </pre>
     * otherwise it returns -1.
     *
     * <p>The compositions list has (trail, compositeAndFwd) pair entries,
     * encoded as either pairs or triples of 16-bit units.
     * The last entry has the high bit of its first unit set.
     *
     * <p>The list is sorted by ascending trail characters (there are no duplicates).
     * A linear search is used.
     *
     * <p>See normalizer2impl.h for a more detailed description
     * of the compositions list format.
     */
    private static int combine(String compositions, int list, int trail) {
        int key1, firstUnit;
        if(trail<COMP_1_TRAIL_LIMIT) {
            // trail character is 0..33FF
            // result entry may have 2 or 3 units
            key1=(trail<<1);
            while(key1>(firstUnit=compositions.charAt(list))) {
                list+=2+(firstUnit&COMP_1_TRIPLE);
            }
            if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
                if((firstUnit&COMP_1_TRIPLE)!=0) {
                    return ((int)compositions.charAt(list+1)<<16)|compositions.charAt(list+2);
                } else {
                    return compositions.charAt(list+1);
                }
            }
        } else {
            // trail character is 3400..10FFFF
            // result entry has 3 units
            key1=COMP_1_TRAIL_LIMIT+(((trail>>COMP_1_TRAIL_SHIFT))&~COMP_1_TRIPLE);
            int key2=(trail<<COMP_2_TRAIL_SHIFT)&0xffff;
            int secondUnit;
            for(;;) {
                if(key1>(firstUnit=compositions.charAt(list))) {
                    list+=2+(firstUnit&COMP_1_TRIPLE);
                } else if(key1==(firstUnit&COMP_1_TRAIL_MASK)) {
                    if(key2>(secondUnit=compositions.charAt(list+1))) {
                        if((firstUnit&COMP_1_LAST_TUPLE)!=0) {
                            break;
                        } else {
                            list+=3;
                        }
                    } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) {
                        return ((secondUnit&~COMP_2_TRAIL_MASK)<<16)|compositions.charAt(list+2);
                    } else {
                        break;
                    }
                } else {
                    break;
                }
            }
        }
        return -1;
    }
    /**
     * @param list some character's compositions list
     * @param set recursively receives the composites from these compositions
     */
    private void addComposites(int list, UnicodeSet set) {
        int firstUnit, compositeAndFwd;
        do {
            firstUnit=maybeYesCompositions.charAt(list);
            if((firstUnit&COMP_1_TRIPLE)==0) {
                compositeAndFwd=maybeYesCompositions.charAt(list+1);
                list+=2;
            } else {
                compositeAndFwd=(((int)maybeYesCompositions.charAt(list+1)&~COMP_2_TRAIL_MASK)<<16)|
                                maybeYesCompositions.charAt(list+2);
                list+=3;
            }
            int composite=compositeAndFwd>>1;
            if((compositeAndFwd&1)!=0) {
                addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
            }
            set.add(composite);
        } while((firstUnit&COMP_1_LAST_TUPLE)==0);
    }
    /*
     * Recomposes the buffer text starting at recomposeStartIndex
     * (which is in NFD - decomposed and canonically ordered),
     * and truncates the buffer contents.
     *
     * Note that recomposition never lengthens the text:
     * Any character consists of either one or two code units;
     * a composition may contain at most one more code unit than the original starter,
     * while the combining mark that is removed has at least one code unit.
     */
    private void recompose(ReorderingBuffer buffer, int recomposeStartIndex,
                           boolean onlyContiguous) {
        StringBuilder sb=buffer.getStringBuilder();
        int p=recomposeStartIndex;
        if(p==sb.length()) {
            return;
        }

        int starter, pRemove;
        int compositionsList;
        int c, compositeAndFwd;
        int norm16;
        int cc, prevCC;
        boolean starterIsSupplementary;

        // Some of the following variables are not used until we have a forward-combining starter
        // and are only initialized now to avoid compiler warnings.
        compositionsList=-1;  // used as indicator for whether we have a forward-combining starter
        starter=-1;
        starterIsSupplementary=false;
        prevCC=0;

        for(;;) {
            c=sb.codePointAt(p);
            p+=Character.charCount(c);
            norm16=getNorm16(c);
            cc=getCCFromYesOrMaybe(norm16);
            if( // this character combines backward and
                isMaybe(norm16) &&
                // we have seen a starter that combines forward and
                compositionsList>=0 &&
                // the backward-combining character is not blocked
                (prevCC<cc || prevCC==0)
            ) {
                if(isJamoVT(norm16)) {
                    // c is a Jamo V/T, see if we can compose it with the previous character.
                    if(c<Hangul.JAMO_T_BASE) {
                        // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T.
                        char prev=(char)(sb.charAt(starter)-Hangul.JAMO_L_BASE);
                        if(prev<Hangul.JAMO_L_COUNT) {
                            pRemove=p-1;
                            char syllable=(char)
                                (Hangul.HANGUL_BASE+
                                 (prev*Hangul.JAMO_V_COUNT+(c-Hangul.JAMO_V_BASE))*
                                 Hangul.JAMO_T_COUNT);
                            char t;
                            if(p!=sb.length() && (t=(char)(sb.charAt(p)-Hangul.JAMO_T_BASE))<Hangul.JAMO_T_COUNT) {
                                ++p;
                                syllable+=t;  // The next character was a Jamo T.
                            }
                            sb.setCharAt(starter, syllable);
                            // remove the Jamo V/T
                            sb.delete(pRemove, p);
                            p=pRemove;
                        }
                    }
                    /*
                     * No "else" for Jamo T:
                     * Since the input is in NFD, there are no Hangul LV syllables that
                     * a Jamo T could combine with.
                     * All Jamo Ts are combined above when handling Jamo Vs.
                     */
                    if(p==sb.length()) {
                        break;
                    }
                    compositionsList=-1;
                    continue;
                } else if((compositeAndFwd=combine(maybeYesCompositions, compositionsList, c))>=0) {
                    // The starter and the combining mark (c) do combine.
                    int composite=compositeAndFwd>>1;

                    // Remove the combining mark.
                    pRemove=p-Character.charCount(c);  // pRemove & p: start & limit of the combining mark
                    sb.delete(pRemove, p);
                    p=pRemove;
                    // Replace the starter with the composite.
                    if(starterIsSupplementary) {
                        if(composite>0xffff) {
                            // both are supplementary
                            sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
                            sb.setCharAt(starter+1, UTF16.getTrailSurrogate(composite));
                        } else {
                            sb.setCharAt(starter, (char)composite);
                            sb.deleteCharAt(starter+1);
                            // The composite is shorter than the starter,
                            // move the intermediate characters forward one.
                            starterIsSupplementary=false;
                            --p;
                        }
                    } else if(composite>0xffff) {
                        // The composite is longer than the starter,
                        // move the intermediate characters back one.
                        starterIsSupplementary=true;
                        sb.setCharAt(starter, UTF16.getLeadSurrogate(composite));
                        sb.insert(starter+1, UTF16.getTrailSurrogate(composite));
                        ++p;
                    } else {
                        // both are on the BMP
                        sb.setCharAt(starter, (char)composite);
                    }

                    // Keep prevCC because we removed the combining mark.

                    if(p==sb.length()) {
                        break;
                    }
                    // Is the composite a starter that combines forward?
                    if((compositeAndFwd&1)!=0) {
                        compositionsList=
                            getCompositionsListForComposite(getNorm16(composite));
                    } else {
                        compositionsList=-1;
                    }

                    // We combined; continue with looking for compositions.
                    continue;
                }
            }

            // no combination this time
            prevCC=cc;
            if(p==sb.length()) {
                break;
            }

            // If c did not combine, then check if it is a starter.
            if(cc==0) {
                // Found a new starter.
                if((compositionsList=getCompositionsListForDecompYes(norm16))>=0) {
                    // It may combine with something, prepare for it.
                    if(c<=0xffff) {
                        starterIsSupplementary=false;
                        starter=p-1;
                    } else {
                        starterIsSupplementary=true;
                        starter=p-2;
                    }
                }
            } else if(onlyContiguous) {
                // FCC: no discontiguous compositions; any intervening character blocks.
                compositionsList=-1;
            }
        }
        buffer.flush();
    }

    public int composePair(int a, int b) {
        int norm16=getNorm16(a);  // maps an out-of-range 'a' to inert norm16=0
        int list;
        if(isInert(norm16)) {
            return -1;
        } else if(norm16<minYesNoMappingsOnly) {
            if(isJamoL(norm16)) {
                b-=Hangul.JAMO_V_BASE;
                if(0<=b && b<Hangul.JAMO_V_COUNT) {
                    return
                        (Hangul.HANGUL_BASE+
                         ((a-Hangul.JAMO_L_BASE)*Hangul.JAMO_V_COUNT+b)*
                         Hangul.JAMO_T_COUNT);
                } else {
                    return -1;
                }
            } else if(isHangul(norm16)) {
                b-=Hangul.JAMO_T_BASE;
                if(Hangul.isHangulWithoutJamoT((char)a) && 0<b && b<Hangul.JAMO_T_COUNT) {  // not b==0!
                    return a+b;
                } else {
                    return -1;
                }
            } else {
                // 'a' has a compositions list in extraData
                list=norm16;
                if(norm16>minYesNo) {  // composite 'a' has both mapping & compositions list
                    list+=  // mapping pointer
                        1+  // +1 to skip the first unit with the mapping length
                        (extraData.charAt(list)&MAPPING_LENGTH_MASK);  // + mapping length
                }
                // Turn the offset-into-extraData into an offset-into-maybeYesCompositions.
                list+=MIN_NORMAL_MAYBE_YES-minMaybeYes;
            }
        } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) {
            return -1;
        } else {
            list=norm16-minMaybeYes;  // offset into maybeYesCompositions
        }
        if(b<0 || 0x10ffff<b) {  // combine(list, b) requires a valid code point b
            return -1;
        }
        return combine(maybeYesCompositions, list, b)>>1;
    }

    /**
     * Does c have a composition boundary before it?
     * True if its decomposition begins with a character that has
     * ccc=0 && NFC_QC=Yes (isCompYesAndZeroCC()).
     * As a shortcut, this is true if c itself has ccc=0 && NFC_QC=Yes
     * (isCompYesAndZeroCC()) so we need not decompose.
     */
    private boolean hasCompBoundaryBefore(int c, int norm16) {
        for(;;) {
            if(isCompYesAndZeroCC(norm16)) {
                return true;
            } else if(isMaybeOrNonZeroCC(norm16)) {
                return false;
            } else if(isDecompNoAlgorithmic(norm16)) {
                c=mapAlgorithmic(c, norm16);
                norm16=getNorm16(c);
            } else {
                // c decomposes, get everything from the variable-length extra data
                int firstUnit=extraData.charAt(norm16);
                if((firstUnit&MAPPING_LENGTH_MASK)==0) {
                    return false;
                }
                if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0 && (extraData.charAt(norm16-1)&0xff00)!=0) {
                    return false;  // non-zero leadCC
                }
                return isCompYesAndZeroCC(getNorm16(Character.codePointAt(extraData, norm16+1)));
            }
        }
    }
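    // Illustration: composePair() above is exposed through the public Normalizer2 API.
    // A minimal sketch of the expected results:
    //
    //     import com.ibm.icu.text.Normalizer2;
    //
    //     Normalizer2 nfc=Normalizer2.getNFCInstance();
    //     nfc.composePair('A', 0x0300);     // 0x00C0  LATIN CAPITAL LETTER A WITH GRAVE
    //     nfc.composePair(0x1100, 0x1161);  // 0xAC00  Hangul LV syllable (Jamo L + V, algorithmic)
    //     nfc.composePair(0xAC00, 0x11A8);  // 0xAC01  Hangul LVT syllable (LV + Jamo T)
    //     nfc.composePair('A', 'B');        // negative: the pair does not compose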
    private int findPreviousCompBoundary(CharSequence s, int p) {
        while(p>0) {
            int c=Character.codePointBefore(s, p);
            p-=Character.charCount(c);
            if(hasCompBoundaryBefore(c)) {
                break;
            }
            // We could also test hasCompBoundaryAfter() and return iter.codePointLimit,
            // but that's probably not worth the extra cost.
        }
        return p;
    }
    private int findNextCompBoundary(CharSequence s, int p, int limit) {
        while(p<limit) {
            int c=Character.codePointAt(s, p);
            int norm16=normTrie.get(c);
            if(hasCompBoundaryBefore(c, norm16)) {
                break;
            }
            p+=Character.charCount(c);
        }
        return p;
    }

    private int findPreviousFCDBoundary(CharSequence s, int p) {
        while(p>0) {
            int c=Character.codePointBefore(s, p);
            p-=Character.charCount(c);
            if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) {
                break;
            }
        }
        return p;
    }
    private int findNextFCDBoundary(CharSequence s, int p, int limit) {
        while(p<limit) {
            int c=Character.codePointAt(s, p);
            if(c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff) {
                break;
            }
            p+=Character.charCount(c);
        }
        return p;
    }

    private void addToStartSet(Trie2Writable newData, int origin, int decompLead) {
        int canonValue=newData.get(decompLead);
        if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
            // origin is the first character whose decomposition starts with
            // the character for which we are setting the value.
            newData.set(decompLead, canonValue|origin);
        } else {
            // origin is not the first character, or it is U+0000.
            UnicodeSet set;
            if((canonValue&CANON_HAS_SET)==0) {
                int firstOrigin=canonValue&CANON_VALUE_MASK;
                canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|canonStartSets.size();
                newData.set(decompLead, canonValue);
                canonStartSets.add(set=new UnicodeSet());
                if(firstOrigin!=0) {
                    set.add(firstOrigin);
                }
            } else {
                set=canonStartSets.get(canonValue&CANON_VALUE_MASK);
            }
            set.add(origin);
        }
    }

    @SuppressWarnings("unused")
    private VersionInfo dataVersion;

    // Code point thresholds for quick check codes.
    private int minDecompNoCP;
    private int minCompNoMaybeCP;

    // Norm16 value thresholds for quick check combinations and types of extra data.
    private int minYesNo;
    private int minYesNoMappingsOnly;
    private int minNoNo;
    private int limitNoNo;
    private int minMaybeYes;

    private Trie2_16 normTrie;
    private String maybeYesCompositions;
    private String extraData;  // mappings and/or compositions for yesYes, yesNo & noNo characters
    private byte[] smallFCD;  // [0x100] one bit per 32 BMP code points, set if any FCD!=0
    private int[] tccc180;  // [0x180] tccc values for U+0000..U+017F

    private Trie2_32 canonIterData;
    private ArrayList<UnicodeSet> canonStartSets;

    // bits in canonIterData
    private static final int CANON_NOT_SEGMENT_STARTER = 0x80000000;
    private static final int CANON_HAS_COMPOSITIONS = 0x40000000;
    private static final int CANON_HAS_SET = 0x200000;
    private static final int CANON_VALUE_MASK = 0x1fffff;
}
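// Illustration: the canonStartSets built by addToStartSet() above feed canonical-closure
// queries. From application code this machinery is typically reached through
// com.ibm.icu.text.CanonicalIterator, e.g.:
//
//     import com.ibm.icu.text.CanonicalIterator;
//
//     CanonicalIterator it=new CanonicalIterator("\u00C5");  // Å
//     for(String equiv=it.next(); equiv!=null; equiv=it.next()) {
//         // iterates canonically equivalent strings such as "\u00C5", "A\u030A", "\u212B"
//     }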