1 /* 2 ******************************************************************************* 3 * Copyright (C) 2000-2014, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 package com.ibm.icu.text; 8 import java.nio.CharBuffer; 9 import java.text.CharacterIterator; 10 11 import com.ibm.icu.impl.Norm2AllModes; 12 import com.ibm.icu.impl.Normalizer2Impl; 13 import com.ibm.icu.impl.UCaseProps; 14 import com.ibm.icu.lang.UCharacter; 15 import com.ibm.icu.util.ICUCloneNotSupportedException; 16 17 /** 18 * Unicode Normalization 19 * 20 * <h2>Unicode normalization API</h2> 21 * 22 * <code>normalize</code> transforms Unicode text into an equivalent composed or 23 * decomposed form, allowing for easier sorting and searching of text. 24 * <code>normalize</code> supports the standard normalization forms described in 25 * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode"> 26 * Unicode Standard Annex #15 — Unicode Normalization Forms</a>. 27 * 28 * Characters with accents or other adornments can be encoded in 29 * several different ways in Unicode. For example, take the character A-acute. 30 * In Unicode, this can be encoded as a single character (the 31 * "composed" form): 32 * 33 * <pre> 34 * 00C1 LATIN CAPITAL LETTER A WITH ACUTE 35 * </pre> 36 * 37 * or as two separate characters (the "decomposed" form): 38 * 39 * <pre> 40 * 0041 LATIN CAPITAL LETTER A 41 * 0301 COMBINING ACUTE ACCENT 42 * </pre> 43 * 44 * To a user of your program, however, both of these sequences should be 45 * treated as the same "user-level" character "A with acute accent". When you 46 * are searching or comparing text, you must ensure that these two sequences are 47 * treated equivalently. In addition, you must handle characters with more than 48 * one accent. 
Sometimes the order of a character's combining accents is 49 * significant, while in other cases accent sequences in different orders are 50 * really equivalent. 51 * 52 * Similarly, the string "ffi" can be encoded as three separate letters: 53 * 54 * <pre> 55 * 0066 LATIN SMALL LETTER F 56 * 0066 LATIN SMALL LETTER F 57 * 0069 LATIN SMALL LETTER I 58 * </pre> 59 * 60 * or as the single character 61 * 62 * <pre> 63 * FB03 LATIN SMALL LIGATURE FFI 64 * </pre> 65 * 66 * The ffi ligature is not a distinct semantic character, and strictly speaking 67 * it shouldn't be in Unicode at all, but it was included for compatibility 68 * with existing character sets that already provided it. The Unicode standard 69 * identifies such characters by giving them "compatibility" decompositions 70 * into the corresponding semantic characters. When sorting and searching, you 71 * will often want to use these mappings. 72 * 73 * <code>normalize</code> helps solve these problems by transforming text into 74 * the canonical composed and decomposed forms as shown in the first example 75 * above. In addition, you can have it perform compatibility decompositions so 76 * that you can treat compatibility characters the same as their equivalents. 77 * Finally, <code>normalize</code> rearranges accents into the proper canonical 78 * order, so that you do not have to worry about accent rearrangement on your 79 * own. 80 * 81 * Form FCD, "Fast C or D", is also designed for collation. 82 * It allows to work on strings that are not necessarily normalized 83 * with an algorithm (like in collation) that works under "canonical closure", 84 * i.e., it treats precomposed characters and their decomposed equivalents the 85 * same. 86 * 87 * It is not a normalization form because it does not provide for uniqueness of 88 * representation. Multiple strings may be canonically equivalent (their NFDs 89 * are identical) and may all conform to FCD without being identical themselves. 
90 * 91 * The form is defined such that the "raw decomposition", the recursive 92 * canonical decomposition of each character, results in a string that is 93 * canonically ordered. This means that precomposed characters are allowed for 94 * as long as their decompositions do not need canonical reordering. 95 * 96 * Its advantage for a process like collation is that all NFD and most NFC texts 97 * - and many unnormalized texts - already conform to FCD and do not need to be 98 * normalized (NFD) for such a process. The FCD quick check will return YES for 99 * most strings in practice. 100 * 101 * normalize(FCD) may be implemented with NFD. 102 * 103 * For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications): 104 * http://www.unicode.org/notes/tn5/#FCD 105 * 106 * ICU collation performs either NFD or FCD normalization automatically if 107 * normalization is turned on for the collator object. Beyond collation and 108 * string search, normalized strings may be useful for string equivalence 109 * comparisons, transliteration/transcription, unique representations, etc. 110 * 111 * The W3C generally recommends to exchange texts in NFC. 112 * Note also that most legacy character encodings use only precomposed forms and 113 * often do not encode any combining marks by themselves. For conversion to such 114 * character encodings the Unicode text needs to be normalized to NFC. 115 * For more usage examples, see the Unicode Standard Annex. 116 * 117 * Note: The Normalizer class also provides API for iterative normalization. 118 * While the setIndex() and getIndex() refer to indices in the 119 * underlying Unicode input text, the next() and previous() methods 120 * iterate through characters in the normalized output. 121 * This means that there is not necessarily a one-to-one correspondence 122 * between characters returned by next() and previous() and the indices 123 * passed to and returned from setIndex() and getIndex(). 
124 * It is for this reason that Normalizer does not implement the CharacterIterator interface. 125 * 126 * @stable ICU 2.8 127 */ 128 public final class Normalizer implements Cloneable { 129 // The input text and our position in it 130 private UCharacterIterator text; 131 private Normalizer2 norm2; 132 private Mode mode; 133 private int options; 134 135 // The normalization buffer is the result of normalization 136 // of the source in [currentIndex..nextIndex[ . 137 private int currentIndex; 138 private int nextIndex; 139 140 // A buffer for holding intermediate results 141 private StringBuilder buffer; 142 private int bufferPos; 143 144 // Helper classes to defer loading of normalization data. 145 private static final class ModeImpl { ModeImpl(Normalizer2 n2)146 private ModeImpl(Normalizer2 n2) { 147 normalizer2 = n2; 148 } 149 private final Normalizer2 normalizer2; 150 } 151 private static final class NFDModeImpl { 152 private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance()); 153 } 154 private static final class NFKDModeImpl { 155 private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance()); 156 } 157 private static final class NFCModeImpl { 158 private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance()); 159 } 160 private static final class NFKCModeImpl { 161 private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance()); 162 } 163 private static final class FCDModeImpl { 164 private static final ModeImpl INSTANCE = new ModeImpl(Norm2AllModes.getFCDNormalizer2()); 165 } 166 167 private static final class Unicode32 { 168 private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze(); 169 } 170 private static final class NFD32ModeImpl { 171 private static final ModeImpl INSTANCE = 172 new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(), 173 Unicode32.INSTANCE)); 174 } 175 private static final class NFKD32ModeImpl { 176 private static 
final ModeImpl INSTANCE = 177 new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(), 178 Unicode32.INSTANCE)); 179 } 180 private static final class NFC32ModeImpl { 181 private static final ModeImpl INSTANCE = 182 new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(), 183 Unicode32.INSTANCE)); 184 } 185 private static final class NFKC32ModeImpl { 186 private static final ModeImpl INSTANCE = 187 new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(), 188 Unicode32.INSTANCE)); 189 } 190 private static final class FCD32ModeImpl { 191 private static final ModeImpl INSTANCE = 192 new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getFCDNormalizer2(), 193 Unicode32.INSTANCE)); 194 } 195 196 /** 197 * Options bit set value to select Unicode 3.2 normalization 198 * (except NormalizationCorrections). 199 * At most one Unicode version can be selected at a time. 200 * @stable ICU 2.6 201 */ 202 public static final int UNICODE_3_2=0x20; 203 204 /** 205 * Constant indicating that the end of the iteration has been reached. 206 * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}. 207 * @stable ICU 2.8 208 */ 209 public static final int DONE = UCharacterIterator.DONE; 210 211 /** 212 * Constants for normalization modes. 213 * <p> 214 * The Mode class is not intended for public subclassing. 215 * Only the Mode constants provided by the Normalizer class should be used, 216 * and any fields or methods should not be called or overridden by users. 217 * @stable ICU 2.8 218 */ 219 public static abstract class Mode { 220 /** 221 * Sole constructor 222 * @internal 223 * @deprecated This API is ICU internal only. 224 */ 225 @Deprecated Mode()226 protected Mode() { 227 } 228 229 /** 230 * @internal 231 * @deprecated This API is ICU internal only. 
232 */ 233 @Deprecated getNormalizer2(int options)234 protected abstract Normalizer2 getNormalizer2(int options); 235 } 236 237 private static final class NONEMode extends Mode { getNormalizer2(int options)238 protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; } 239 } 240 private static final class NFDMode extends Mode { getNormalizer2(int options)241 protected Normalizer2 getNormalizer2(int options) { 242 return (options&UNICODE_3_2) != 0 ? 243 NFD32ModeImpl.INSTANCE.normalizer2 : NFDModeImpl.INSTANCE.normalizer2; 244 } 245 } 246 private static final class NFKDMode extends Mode { getNormalizer2(int options)247 protected Normalizer2 getNormalizer2(int options) { 248 return (options&UNICODE_3_2) != 0 ? 249 NFKD32ModeImpl.INSTANCE.normalizer2 : NFKDModeImpl.INSTANCE.normalizer2; 250 } 251 } 252 private static final class NFCMode extends Mode { getNormalizer2(int options)253 protected Normalizer2 getNormalizer2(int options) { 254 return (options&UNICODE_3_2) != 0 ? 255 NFC32ModeImpl.INSTANCE.normalizer2 : NFCModeImpl.INSTANCE.normalizer2; 256 } 257 } 258 private static final class NFKCMode extends Mode { getNormalizer2(int options)259 protected Normalizer2 getNormalizer2(int options) { 260 return (options&UNICODE_3_2) != 0 ? 261 NFKC32ModeImpl.INSTANCE.normalizer2 : NFKCModeImpl.INSTANCE.normalizer2; 262 } 263 } 264 private static final class FCDMode extends Mode { getNormalizer2(int options)265 protected Normalizer2 getNormalizer2(int options) { 266 return (options&UNICODE_3_2) != 0 ? 267 FCD32ModeImpl.INSTANCE.normalizer2 : FCDModeImpl.INSTANCE.normalizer2; 268 } 269 } 270 271 /** 272 * No decomposition/composition. 273 * @stable ICU 2.8 274 */ 275 public static final Mode NONE = new NONEMode(); 276 277 /** 278 * Canonical decomposition. 279 * @stable ICU 2.8 280 */ 281 public static final Mode NFD = new NFDMode(); 282 283 /** 284 * Compatibility decomposition. 
* @stable ICU 2.8
     */
    public static final Mode NFKD = new NFKDMode();

    /**
     * Canonical decomposition followed by canonical composition.
     * @stable ICU 2.8
     */
    public static final Mode NFC = new NFCMode();

    /**
     * Default normalization. This is an alias for {@link #NFC}.
     * @stable ICU 2.8
     */
    public static final Mode DEFAULT = NFC;

    /**
     * Compatibility decomposition followed by canonical composition.
     * @stable ICU 2.8
     */
    public static final Mode NFKC =new NFKCMode();

    /**
     * "Fast C or D" form.
     * @stable ICU 2.8
     */
    public static final Mode FCD = new FCDMode();

    /**
     * Null operation for use with the {@link com.ibm.icu.text.Normalizer constructors}
     * and the static {@link #normalize normalize} method.  This value tells
     * the <tt>Normalizer</tt> to do nothing but return unprocessed characters
     * from the underlying String or CharacterIterator.  If you have code which
     * requires raw text at some times and normalized text at others, you can
     * use <tt>NO_OP</tt> for the cases where you want raw text, rather
     * than having a separate code path that bypasses <tt>Normalizer</tt>
     * altogether.
     * <p>
     * This is an alias for {@link #NONE}.
     * @see #setMode
     * @deprecated ICU 2.8. Use Normalizer.NONE
     * @see #NONE
     */
    @Deprecated
    public static final Mode NO_OP = NONE;

    /**
     * Canonical decomposition followed by canonical composition.  Used with the
     * {@link com.ibm.icu.text.Normalizer constructors} and the static
     * {@link #normalize normalize} method to determine the operation to be
     * performed.
     * <p>
     * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
     * off, this operation produces output that is in
     * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Canonical
     * Form</a>
     * <b>C</b>.
     * <p>
     * @see #setMode
     * @deprecated ICU 2.8.
Use Normalizer.NFC
     * @see #NFC
     */
    @Deprecated
    public static final Mode COMPOSE = NFC;

    /**
     * Compatibility decomposition followed by canonical composition.
     * Used with the {@link com.ibm.icu.text.Normalizer constructors} and the static
     * {@link #normalize normalize} method to determine the operation to be
     * performed.
     * <p>
     * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
     * off, this operation produces output that is in
     * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Canonical
     * Form</a>
     * <b>KC</b>.
     * <p>
     * @see #setMode
     * @deprecated ICU 2.8. Use Normalizer.NFKC
     * @see #NFKC
     */
    @Deprecated
    public static final Mode COMPOSE_COMPAT = NFKC;

    /**
     * Canonical decomposition.  This value is passed to the
     * {@link com.ibm.icu.text.Normalizer constructors} and the static
     * {@link #normalize normalize}
     * method to determine the operation to be performed.
     * <p>
     * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
     * off, this operation produces output that is in
     * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Canonical
     * Form</a>
     * <b>D</b>.
     * <p>
     * @see #setMode
     * @deprecated ICU 2.8. Use Normalizer.NFD
     * @see #NFD
     */
    @Deprecated
    public static final Mode DECOMP = NFD;

    /**
     * Compatibility decomposition.  This value is passed to the
     * {@link com.ibm.icu.text.Normalizer constructors} and the static
     * {@link #normalize normalize}
     * method to determine the operation to be performed.
     * <p>
     * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
     * off, this operation produces output that is in
     * <a href="http://www.unicode.org/unicode/reports/tr15/">Unicode Canonical
     * Form</a>
     * <b>KD</b>.
     * <p>
     * @see #setMode
     * @deprecated ICU 2.8.
Use Normalizer.NFKD
     * @see #NFKD
     */
    @Deprecated
    public static final Mode DECOMP_COMPAT = NFKD;

    /**
     * Option to disable Hangul/Jamo composition and decomposition.
     * This option applies to Korean text,
     * which can be represented either in the Jamo alphabet or in Hangul
     * characters, which are really just two or three Jamo combined
     * into one visual glyph.  Since Jamo takes up more storage space than
     * Hangul, applications that process only Hangul text may wish to turn
     * this option on when decomposing text.
     * <p>
     * The Unicode standard treats Hangul to Jamo conversion as a
     * canonical decomposition, so this option must be turned <b>off</b> if you
     * wish to transform strings into one of the standard
     * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
     * Unicode Normalization Forms</a>.
     * <p>
     * @see #setOption
     * @deprecated ICU 2.8. This option is no longer supported.
     */
    @Deprecated
    public static final int IGNORE_HANGUL = 0x0001;

    /**
     * Result values for quickCheck().
     * For details see Unicode Technical Report 15.
     * <p>
     * Instances can only be created inside Normalizer (private constructor);
     * the three results are the {@link #NO}, {@link #YES} and {@link #MAYBE}
     * constants.
     * @stable ICU 2.8
     */
    public static final class QuickCheckResult{
        // The numeric value is currently not stored; the field is kept
        // commented out, so the constructor argument is ignored.
        //private int resultValue;
        private QuickCheckResult(int value) {
            //resultValue=value;
        }
    }
    /**
     * Indicates that string is not in the normalized format
     * @stable ICU 2.8
     */
    public static final QuickCheckResult NO = new QuickCheckResult(0);

    /**
     * Indicates that string is in the normalized format
     * @stable ICU 2.8
     */
    public static final QuickCheckResult YES = new QuickCheckResult(1);

    /**
     * Indicates it cannot be determined if string is in the normalized
     * format without further thorough checks.
* @stable ICU 2.8
     */
    public static final QuickCheckResult MAYBE = new QuickCheckResult(2);

    /**
     * Option bit for compare:
     * Case sensitively compare the strings
     * <p>
     * The value is taken from {@link UCharacter#FOLD_CASE_DEFAULT}.
     * @stable ICU 2.8
     */
    public static final int FOLD_CASE_DEFAULT =  UCharacter.FOLD_CASE_DEFAULT;

    /**
     * Option bit for compare:
     * Both input strings are assumed to fulfill FCD conditions.
     * @stable ICU 2.8
     */
    public static final int INPUT_IS_FCD    =      0x20000;

    /**
     * Option bit for compare:
     * Perform case-insensitive comparison.
     * @stable ICU 2.8
     */
    public static final int COMPARE_IGNORE_CASE  =     0x10000;

    /**
     * Option bit for compare:
     * Compare strings in code point order instead of code unit order.
     * @stable ICU 2.8
     */
    public static final int COMPARE_CODE_POINT_ORDER = 0x8000;

    /**
     * Option value for case folding:
     * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I
     * and dotless i appropriately for Turkic languages (tr, az).
     * <p>
     * The value is taken from {@link UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I}.
     * @see UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I
     * @stable ICU 2.8
     */
    public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I;

    /**
     * Lowest-order bit number of compare() options bits corresponding to
     * normalization options bits.
     *
     * The options parameter for compare() uses most bits for
     * itself and for various comparison and folding flags.
     * The most significant bits, however, are shifted down and passed on
     * to the normalization implementation.
     * (That is, from compare(..., options, ...),
     * options>>COMPARE_NORM_OPTIONS_SHIFT will be passed on to the
     * internal normalization functions.)
505 * 506 * @see #compare 507 * @stable ICU 2.6 508 */ 509 public static final int COMPARE_NORM_OPTIONS_SHIFT = 20; 510 511 //------------------------------------------------------------------------- 512 // Iterator constructors 513 //------------------------------------------------------------------------- 514 515 /** 516 * Creates a new <tt>Normalizer</tt> object for iterating over the 517 * normalized form of a given string. 518 * <p> 519 * The <tt>options</tt> parameter specifies which optional 520 * <tt>Normalizer</tt> features are to be enabled for this object. 521 * <p> 522 * @param str The string to be normalized. The normalization 523 * will start at the beginning of the string. 524 * 525 * @param mode The normalization mode. 526 * 527 * @param opt Any optional features to be enabled. 528 * Currently the only available option is {@link #UNICODE_3_2}. 529 * If you want the default behavior corresponding to one of the 530 * standard Unicode Normalization Forms, use 0 for this argument. 531 * @stable ICU 2.6 532 */ Normalizer(String str, Mode mode, int opt)533 public Normalizer(String str, Mode mode, int opt) { 534 this.text = UCharacterIterator.getInstance(str); 535 this.mode = mode; 536 this.options=opt; 537 norm2 = mode.getNormalizer2(opt); 538 buffer = new StringBuilder(); 539 } 540 541 /** 542 * Creates a new <tt>Normalizer</tt> object for iterating over the 543 * normalized form of the given text. 544 * <p> 545 * @param iter The input text to be normalized. The normalization 546 * will start at the beginning of the string. 547 * 548 * @param mode The normalization mode. 549 * 550 * @param opt Any optional features to be enabled. 551 * Currently the only available option is {@link #UNICODE_3_2}. 552 * If you want the default behavior corresponding to one of the 553 * standard Unicode Normalization Forms, use 0 for this argument. 
554 * @stable ICU 2.6 555 */ Normalizer(CharacterIterator iter, Mode mode, int opt)556 public Normalizer(CharacterIterator iter, Mode mode, int opt) { 557 this.text = UCharacterIterator.getInstance((CharacterIterator)iter.clone()); 558 this.mode = mode; 559 this.options = opt; 560 norm2 = mode.getNormalizer2(opt); 561 buffer = new StringBuilder(); 562 } 563 564 /** 565 * Creates a new <tt>Normalizer</tt> object for iterating over the 566 * normalized form of the given text. 567 * <p> 568 * @param iter The input text to be normalized. The normalization 569 * will start at the beginning of the string. 570 * 571 * @param mode The normalization mode. 572 * @param options The normalization options, ORed together (0 for no options). 573 * @stable ICU 2.6 574 */ Normalizer(UCharacterIterator iter, Mode mode, int options)575 public Normalizer(UCharacterIterator iter, Mode mode, int options) { 576 try { 577 this.text = (UCharacterIterator)iter.clone(); 578 this.mode = mode; 579 this.options = options; 580 norm2 = mode.getNormalizer2(options); 581 buffer = new StringBuilder(); 582 } catch (CloneNotSupportedException e) { 583 throw new ICUCloneNotSupportedException(e); 584 } 585 } 586 587 /** 588 * Clones this <tt>Normalizer</tt> object. All properties of this 589 * object are duplicated in the new object, including the cloning of any 590 * {@link CharacterIterator} that was passed in to the constructor 591 * or to {@link #setText(CharacterIterator) setText}. 592 * However, the text storage underlying 593 * the <tt>CharacterIterator</tt> is not duplicated unless the 594 * iterator's <tt>clone</tt> method does so. 
595 * @stable ICU 2.8 596 */ clone()597 public Object clone() { 598 try { 599 Normalizer copy = (Normalizer) super.clone(); 600 copy.text = (UCharacterIterator) text.clone(); 601 copy.mode = mode; 602 copy.options = options; 603 copy.norm2 = norm2; 604 copy.buffer = new StringBuilder(buffer); 605 copy.bufferPos = bufferPos; 606 copy.currentIndex = currentIndex; 607 copy.nextIndex = nextIndex; 608 return copy; 609 } 610 catch (CloneNotSupportedException e) { 611 throw new ICUCloneNotSupportedException(e); 612 } 613 } 614 615 //-------------------------------------------------------------------------- 616 // Static Utility methods 617 //-------------------------------------------------------------------------- 618 getComposeNormalizer2(boolean compat, int options)619 private static final Normalizer2 getComposeNormalizer2(boolean compat, int options) { 620 return (compat ? NFKC : NFC).getNormalizer2(options); 621 } getDecomposeNormalizer2(boolean compat, int options)622 private static final Normalizer2 getDecomposeNormalizer2(boolean compat, int options) { 623 return (compat ? NFKD : NFD).getNormalizer2(options); 624 } 625 626 /** 627 * Compose a string. 628 * The string will be composed to according to the specified mode. 629 * @param str The string to compose. 630 * @param compat If true the string will be composed according to 631 * NFKC rules and if false will be composed according to 632 * NFC rules. 633 * @return String The composed string 634 * @stable ICU 2.8 635 */ compose(String str, boolean compat)636 public static String compose(String str, boolean compat) { 637 return compose(str,compat,0); 638 } 639 640 /** 641 * Compose a string. 642 * The string will be composed to according to the specified mode. 643 * @param str The string to compose. 644 * @param compat If true the string will be composed according to 645 * NFKC rules and if false will be composed according to 646 * NFC rules. 
647 * @param options The only recognized option is UNICODE_3_2 648 * @return String The composed string 649 * @stable ICU 2.6 650 */ compose(String str, boolean compat, int options)651 public static String compose(String str, boolean compat, int options) { 652 return getComposeNormalizer2(compat, options).normalize(str); 653 } 654 655 /** 656 * Compose a string. 657 * The string will be composed to according to the specified mode. 658 * @param source The char array to compose. 659 * @param target A char buffer to receive the normalized text. 660 * @param compat If true the char array will be composed according to 661 * NFKC rules and if false will be composed according to 662 * NFC rules. 663 * @param options The normalization options, ORed together (0 for no options). 664 * @return int The total buffer size needed;if greater than length of 665 * result, the output was truncated. 666 * @exception IndexOutOfBoundsException if target.length is less than the 667 * required length 668 * @stable ICU 2.6 669 */ compose(char[] source,char[] target, boolean compat, int options)670 public static int compose(char[] source,char[] target, boolean compat, int options) { 671 return compose(source, 0, source.length, target, 0, target.length, compat, options); 672 } 673 674 /** 675 * Compose a string. 676 * The string will be composed to according to the specified mode. 677 * @param src The char array to compose. 678 * @param srcStart Start index of the source 679 * @param srcLimit Limit index of the source 680 * @param dest The char buffer to fill in 681 * @param destStart Start index of the destination buffer 682 * @param destLimit End index of the destination buffer 683 * @param compat If true the char array will be composed according to 684 * NFKC rules and if false will be composed according to 685 * NFC rules. 686 * @param options The normalization options, ORed together (0 for no options). 
687 * @return int The total buffer size needed;if greater than length of 688 * result, the output was truncated. 689 * @exception IndexOutOfBoundsException if target.length is less than the 690 * required length 691 * @stable ICU 2.6 692 */ compose(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, boolean compat, int options)693 public static int compose(char[] src,int srcStart, int srcLimit, 694 char[] dest,int destStart, int destLimit, 695 boolean compat, int options) { 696 CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart); 697 CharsAppendable app = new CharsAppendable(dest, destStart, destLimit); 698 getComposeNormalizer2(compat, options).normalize(srcBuffer, app); 699 return app.length(); 700 } 701 702 /** 703 * Decompose a string. 704 * The string will be decomposed to according to the specified mode. 705 * @param str The string to decompose. 706 * @param compat If true the string will be decomposed according to NFKD 707 * rules and if false will be decomposed according to NFD 708 * rules. 709 * @return String The decomposed string 710 * @stable ICU 2.8 711 */ decompose(String str, boolean compat)712 public static String decompose(String str, boolean compat) { 713 return decompose(str,compat,0); 714 } 715 716 /** 717 * Decompose a string. 718 * The string will be decomposed to according to the specified mode. 719 * @param str The string to decompose. 720 * @param compat If true the string will be decomposed according to NFKD 721 * rules and if false will be decomposed according to NFD 722 * rules. 723 * @param options The normalization options, ORed together (0 for no options). 724 * @return String The decomposed string 725 * @stable ICU 2.6 726 */ decompose(String str, boolean compat, int options)727 public static String decompose(String str, boolean compat, int options) { 728 return getDecomposeNormalizer2(compat, options).normalize(str); 729 } 730 731 /** 732 * Decompose a string. 
733 * The string will be decomposed to according to the specified mode. 734 * @param source The char array to decompose. 735 * @param target A char buffer to receive the normalized text. 736 * @param compat If true the char array will be decomposed according to NFKD 737 * rules and if false will be decomposed according to 738 * NFD rules. 739 * @return int The total buffer size needed;if greater than length of 740 * result,the output was truncated. 741 * @param options The normalization options, ORed together (0 for no options). 742 * @exception IndexOutOfBoundsException if the target capacity is less than 743 * the required length 744 * @stable ICU 2.6 745 */ decompose(char[] source,char[] target, boolean compat, int options)746 public static int decompose(char[] source,char[] target, boolean compat, int options) { 747 return decompose(source, 0, source.length, target, 0, target.length, compat, options); 748 } 749 750 /** 751 * Decompose a string. 752 * The string will be decomposed to according to the specified mode. 753 * @param src The char array to compose. 754 * @param srcStart Start index of the source 755 * @param srcLimit Limit index of the source 756 * @param dest The char buffer to fill in 757 * @param destStart Start index of the destination buffer 758 * @param destLimit End index of the destination buffer 759 * @param compat If true the char array will be decomposed according to NFKD 760 * rules and if false will be decomposed according to 761 * NFD rules. 762 * @param options The normalization options, ORed together (0 for no options). 763 * @return int The total buffer size needed;if greater than length of 764 * result,the output was truncated. 
765 * @exception IndexOutOfBoundsException if the target capacity is less than 766 * the required length 767 * @stable ICU 2.6 768 */ decompose(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, boolean compat, int options)769 public static int decompose(char[] src,int srcStart, int srcLimit, 770 char[] dest,int destStart, int destLimit, 771 boolean compat, int options) { 772 CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart); 773 CharsAppendable app = new CharsAppendable(dest, destStart, destLimit); 774 getDecomposeNormalizer2(compat, options).normalize(srcBuffer, app); 775 return app.length(); 776 } 777 778 /** 779 * Normalizes a <tt>String</tt> using the given normalization operation. 780 * <p> 781 * The <tt>options</tt> parameter specifies which optional 782 * <tt>Normalizer</tt> features are to be enabled for this operation. 783 * Currently the only available option is {@link #UNICODE_3_2}. 784 * If you want the default behavior corresponding to one of the standard 785 * Unicode Normalization Forms, use 0 for this argument. 786 * <p> 787 * @param str the input string to be normalized. 788 * @param mode the normalization mode 789 * @param options the optional features to be enabled. 790 * @return String the normalized string 791 * @stable ICU 2.6 792 */ normalize(String str, Mode mode, int options)793 public static String normalize(String str, Mode mode, int options) { 794 return mode.getNormalizer2(options).normalize(str); 795 } 796 797 /** 798 * Normalize a string. 799 * The string will be normalized according to the specified normalization 800 * mode and options. 801 * @param src The string to normalize. 
802 * @param mode The normalization mode; one of Normalizer.NONE, 803 * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC, 804 * Normalizer.NFKD, Normalizer.DEFAULT 805 * @return the normalized string 806 * @stable ICU 2.8 807 * 808 */ normalize(String src,Mode mode)809 public static String normalize(String src,Mode mode) { 810 return normalize(src, mode, 0); 811 } 812 /** 813 * Normalize a string. 814 * The string will be normalized according to the specified normalization 815 * mode and options. 816 * @param source The char array to normalize. 817 * @param target A char buffer to receive the normalized text. 818 * @param mode The normalization mode; one of Normalizer.NONE, 819 * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC, 820 * Normalizer.NFKD, Normalizer.DEFAULT 821 * @param options The normalization options, ORed together (0 for no options). 822 * @return int The total buffer size needed;if greater than length of 823 * result, the output was truncated. 824 * @exception IndexOutOfBoundsException if the target capacity is less 825 * than the required length 826 * @stable ICU 2.6 827 */ normalize(char[] source,char[] target, Mode mode, int options)828 public static int normalize(char[] source,char[] target, Mode mode, int options) { 829 return normalize(source,0,source.length,target,0,target.length,mode, options); 830 } 831 832 /** 833 * Normalize a string. 834 * The string will be normalized according to the specified normalization 835 * mode and options. 836 * @param src The char array to compose. 
837 * @param srcStart Start index of the source 838 * @param srcLimit Limit index of the source 839 * @param dest The char buffer to fill in 840 * @param destStart Start index of the destination buffer 841 * @param destLimit End index of the destination buffer 842 * @param mode The normalization mode; one of Normalizer.NONE, 843 * Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC, 844 * Normalizer.NFKD, Normalizer.DEFAULT 845 * @param options The normalization options, ORed together (0 for no options). 846 * @return int The total buffer size needed;if greater than length of 847 * result, the output was truncated. 848 * @exception IndexOutOfBoundsException if the target capacity is 849 * less than the required length 850 * @stable ICU 2.6 851 */ normalize(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, Mode mode, int options)852 public static int normalize(char[] src,int srcStart, int srcLimit, 853 char[] dest,int destStart, int destLimit, 854 Mode mode, int options) { 855 CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart); 856 CharsAppendable app = new CharsAppendable(dest, destStart, destLimit); 857 mode.getNormalizer2(options).normalize(srcBuffer, app); 858 return app.length(); 859 } 860 861 /** 862 * Normalize a codepoint according to the given mode 863 * @param char32 The input string to be normalized. 
864 * @param mode The normalization mode 865 * @param options Options for use with exclusion set and tailored Normalization 866 * The only option that is currently recognized is UNICODE_3_2 867 * @return String The normalized string 868 * @stable ICU 2.6 869 * @see #UNICODE_3_2 870 */ normalize(int char32, Mode mode, int options)871 public static String normalize(int char32, Mode mode, int options) { 872 if(mode == NFD && options == 0) { 873 String decomposition = Normalizer2.getNFCInstance().getDecomposition(char32); 874 if(decomposition == null) { 875 decomposition = UTF16.valueOf(char32); 876 } 877 return decomposition; 878 } 879 return normalize(UTF16.valueOf(char32), mode, options); 880 } 881 882 /** 883 * Convenience method to normalize a codepoint according to the given mode 884 * @param char32 The input string to be normalized. 885 * @param mode The normalization mode 886 * @return String The normalized string 887 * @stable ICU 2.6 888 */ normalize(int char32, Mode mode)889 public static String normalize(int char32, Mode mode) { 890 return normalize(char32, mode, 0); 891 } 892 893 /** 894 * Convenience method. 895 * 896 * @param source string for determining if it is in a normalized format 897 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD, 898 * Normalizer.NFKC,Normalizer.NFKD) 899 * @return Return code to specify if the text is normalized or not 900 * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE) 901 * @stable ICU 2.8 902 */ quickCheck(String source, Mode mode)903 public static QuickCheckResult quickCheck(String source, Mode mode) { 904 return quickCheck(source, mode, 0); 905 } 906 907 /** 908 * Performing quick check on a string, to quickly determine if the string is 909 * in a particular normalization format. 910 * Three types of result can be returned Normalizer.YES, Normalizer.NO or 911 * Normalizer.MAYBE. 
Result Normalizer.YES indicates that the argument 912 * string is in the desired normalized format, Normalizer.NO determines that 913 * argument string is not in the desired normalized format. A 914 * Normalizer.MAYBE result indicates that a more thorough check is required, 915 * the user may have to put the string in its normalized form and compare 916 * the results. 917 * 918 * @param source string for determining if it is in a normalized format 919 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD, 920 * Normalizer.NFKC,Normalizer.NFKD) 921 * @param options Options for use with exclusion set and tailored Normalization 922 * The only option that is currently recognized is UNICODE_3_2 923 * @return Return code to specify if the text is normalized or not 924 * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE) 925 * @stable ICU 2.6 926 */ quickCheck(String source, Mode mode, int options)927 public static QuickCheckResult quickCheck(String source, Mode mode, int options) { 928 return mode.getNormalizer2(options).quickCheck(source); 929 } 930 931 /** 932 * Convenience method. 933 * 934 * @param source Array of characters for determining if it is in a 935 * normalized format 936 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD, 937 * Normalizer.NFKC,Normalizer.NFKD) 938 * @param options Options for use with exclusion set and tailored Normalization 939 * The only option that is currently recognized is UNICODE_3_2 940 * @return Return code to specify if the text is normalized or not 941 * (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE) 942 * @stable ICU 2.6 943 */ quickCheck(char[] source, Mode mode, int options)944 public static QuickCheckResult quickCheck(char[] source, Mode mode, int options) { 945 return quickCheck(source, 0, source.length, mode, options); 946 } 947 948 /** 949 * Performing quick check on a string, to quickly determine if the string is 950 * in a particular normalization format. 
951 * Three types of result can be returned Normalizer.YES, Normalizer.NO or 952 * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument 953 * string is in the desired normalized format, Normalizer.NO determines that 954 * argument string is not in the desired normalized format. A 955 * Normalizer.MAYBE result indicates that a more thorough check is required, 956 * the user may have to put the string in its normalized form and compare 957 * the results. 958 * 959 * @param source string for determining if it is in a normalized format 960 * @param start the start index of the source 961 * @param limit the limit index of the source it is equal to the length 962 * @param mode normalization format (Normalizer.NFC,Normalizer.NFD, 963 * Normalizer.NFKC,Normalizer.NFKD) 964 * @param options Options for use with exclusion set and tailored Normalization 965 * The only option that is currently recognized is UNICODE_3_2 966 * @return Return code to specify if the text is normalized or not 967 * (Normalizer.YES, Normalizer.NO or 968 * Normalizer.MAYBE) 969 * @stable ICU 2.6 970 */ 971 quickCheck(char[] source,int start, int limit, Mode mode,int options)972 public static QuickCheckResult quickCheck(char[] source,int start, 973 int limit, Mode mode,int options) { 974 CharBuffer srcBuffer = CharBuffer.wrap(source, start, limit - start); 975 return mode.getNormalizer2(options).quickCheck(srcBuffer); 976 } 977 978 /** 979 * Test if a string is in a given normalization form. 980 * This is semantically equivalent to source.equals(normalize(source, mode)). 981 * 982 * Unlike quickCheck(), this function returns a definitive result, 983 * never a "maybe". 984 * For NFD, NFKD, and FCD, both functions work exactly the same. 985 * For NFC and NFKC where quickCheck may return "maybe", this function will 986 * perform further tests to arrive at a true/false result. 
987 * @param src The input array of characters to be checked to see if 988 * it is normalized 989 * @param start The strart index in the source 990 * @param limit The limit index in the source 991 * @param mode the normalization mode 992 * @param options Options for use with exclusion set and tailored Normalization 993 * The only option that is currently recognized is UNICODE_3_2 994 * @return Boolean value indicating whether the source string is in the 995 * "mode" normalization form 996 * @stable ICU 2.6 997 */ isNormalized(char[] src,int start, int limit, Mode mode, int options)998 public static boolean isNormalized(char[] src,int start, 999 int limit, Mode mode, 1000 int options) { 1001 CharBuffer srcBuffer = CharBuffer.wrap(src, start, limit - start); 1002 return mode.getNormalizer2(options).isNormalized(srcBuffer); 1003 } 1004 1005 /** 1006 * Test if a string is in a given normalization form. 1007 * This is semantically equivalent to source.equals(normalize(source, mode)). 1008 * 1009 * Unlike quickCheck(), this function returns a definitive result, 1010 * never a "maybe". 1011 * For NFD, NFKD, and FCD, both functions work exactly the same. 1012 * For NFC and NFKC where quickCheck may return "maybe", this function will 1013 * perform further tests to arrive at a true/false result. 
1014 * @param str the input string to be checked to see if it is 1015 * normalized 1016 * @param mode the normalization mode 1017 * @param options Options for use with exclusion set and tailored Normalization 1018 * The only option that is currently recognized is UNICODE_3_2 1019 * @see #isNormalized 1020 * @stable ICU 2.6 1021 */ isNormalized(String str, Mode mode, int options)1022 public static boolean isNormalized(String str, Mode mode, int options) { 1023 return mode.getNormalizer2(options).isNormalized(str); 1024 } 1025 1026 /** 1027 * Convenience Method 1028 * @param char32 the input code point to be checked to see if it is 1029 * normalized 1030 * @param mode the normalization mode 1031 * @param options Options for use with exclusion set and tailored Normalization 1032 * The only option that is currently recognized is UNICODE_3_2 1033 * 1034 * @see #isNormalized 1035 * @stable ICU 2.6 1036 */ isNormalized(int char32, Mode mode,int options)1037 public static boolean isNormalized(int char32, Mode mode,int options) { 1038 return isNormalized(UTF16.valueOf(char32), mode, options); 1039 } 1040 1041 /** 1042 * Compare two strings for canonical equivalence. 1043 * Further options include case-insensitive comparison and 1044 * code point order (as opposed to code unit order). 1045 * 1046 * Canonical equivalence between two strings is defined as their normalized 1047 * forms (NFD or NFC) being identical. 1048 * This function compares strings incrementally instead of normalizing 1049 * (and optionally case-folding) both strings entirely, 1050 * improving performance significantly. 1051 * 1052 * Bulk normalization is only necessary if the strings do not fulfill the 1053 * FCD conditions. Only in this case, and only if the strings are relatively 1054 * long, is memory allocated temporarily. 1055 * For FCD strings and short non-FCD strings there is no memory allocation. 
1056 * 1057 * Semantically, this is equivalent to 1058 * strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2))) 1059 * where code point order and foldCase are all optional. 1060 * 1061 * @param s1 First source character array. 1062 * @param s1Start start index of source 1063 * @param s1Limit limit of the source 1064 * 1065 * @param s2 Second source character array. 1066 * @param s2Start start index of the source 1067 * @param s2Limit limit of the source 1068 * 1069 * @param options A bit set of options: 1070 * - FOLD_CASE_DEFAULT or 0 is used for default options: 1071 * Case-sensitive comparison in code unit order, and the input strings 1072 * are quick-checked for FCD. 1073 * 1074 * - INPUT_IS_FCD 1075 * Set if the caller knows that both s1 and s2 fulfill the FCD 1076 * conditions.If not set, the function will quickCheck for FCD 1077 * and normalize if necessary. 1078 * 1079 * - COMPARE_CODE_POINT_ORDER 1080 * Set to choose code point order instead of code unit order 1081 * 1082 * - COMPARE_IGNORE_CASE 1083 * Set to compare strings case-insensitively using case folding, 1084 * instead of case-sensitively. 1085 * If set, then the following case folding options are used. 1086 * 1087 * 1088 * @return <0 or 0 or >0 as usual for string comparisons 1089 * 1090 * @see #normalize 1091 * @see #FCD 1092 * @stable ICU 2.8 1093 */ compare(char[] s1, int s1Start, int s1Limit, char[] s2, int s2Start, int s2Limit, int options)1094 public static int compare(char[] s1, int s1Start, int s1Limit, 1095 char[] s2, int s2Start, int s2Limit, 1096 int options) { 1097 if( s1==null || s1Start<0 || s1Limit<0 || 1098 s2==null || s2Start<0 || s2Limit<0 || 1099 s1Limit<s1Start || s2Limit<s2Start 1100 ) { 1101 throw new IllegalArgumentException(); 1102 } 1103 return internalCompare(CharBuffer.wrap(s1, s1Start, s1Limit-s1Start), 1104 CharBuffer.wrap(s2, s2Start, s2Limit-s2Start), 1105 options); 1106 } 1107 1108 /** 1109 * Compare two strings for canonical equivalence. 
1110 * Further options include case-insensitive comparison and 1111 * code point order (as opposed to code unit order). 1112 * 1113 * Canonical equivalence between two strings is defined as their normalized 1114 * forms (NFD or NFC) being identical. 1115 * This function compares strings incrementally instead of normalizing 1116 * (and optionally case-folding) both strings entirely, 1117 * improving performance significantly. 1118 * 1119 * Bulk normalization is only necessary if the strings do not fulfill the 1120 * FCD conditions. Only in this case, and only if the strings are relatively 1121 * long, is memory allocated temporarily. 1122 * For FCD strings and short non-FCD strings there is no memory allocation. 1123 * 1124 * Semantically, this is equivalent to 1125 * strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2))) 1126 * where code point order and foldCase are all optional. 1127 * 1128 * @param s1 First source string. 1129 * @param s2 Second source string. 1130 * 1131 * @param options A bit set of options: 1132 * - FOLD_CASE_DEFAULT or 0 is used for default options: 1133 * Case-sensitive comparison in code unit order, and the input strings 1134 * are quick-checked for FCD. 1135 * 1136 * - INPUT_IS_FCD 1137 * Set if the caller knows that both s1 and s2 fulfill the FCD 1138 * conditions. If not set, the function will quickCheck for FCD 1139 * and normalize if necessary. 1140 * 1141 * - COMPARE_CODE_POINT_ORDER 1142 * Set to choose code point order instead of code unit order 1143 * 1144 * - COMPARE_IGNORE_CASE 1145 * Set to compare strings case-insensitively using case folding, 1146 * instead of case-sensitively. 1147 * If set, then the following case folding options are used. 
1148 * 1149 * @return <0 or 0 or >0 as usual for string comparisons 1150 * 1151 * @see #normalize 1152 * @see #FCD 1153 * @stable ICU 2.8 1154 */ compare(String s1, String s2, int options)1155 public static int compare(String s1, String s2, int options) { 1156 return internalCompare(s1, s2, options); 1157 } 1158 1159 /** 1160 * Compare two strings for canonical equivalence. 1161 * Further options include case-insensitive comparison and 1162 * code point order (as opposed to code unit order). 1163 * Convenience method. 1164 * 1165 * @param s1 First source string. 1166 * @param s2 Second source string. 1167 * 1168 * @param options A bit set of options: 1169 * - FOLD_CASE_DEFAULT or 0 is used for default options: 1170 * Case-sensitive comparison in code unit order, and the input strings 1171 * are quick-checked for FCD. 1172 * 1173 * - INPUT_IS_FCD 1174 * Set if the caller knows that both s1 and s2 fulfill the FCD 1175 * conditions. If not set, the function will quickCheck for FCD 1176 * and normalize if necessary. 1177 * 1178 * - COMPARE_CODE_POINT_ORDER 1179 * Set to choose code point order instead of code unit order 1180 * 1181 * - COMPARE_IGNORE_CASE 1182 * Set to compare strings case-insensitively using case folding, 1183 * instead of case-sensitively. 1184 * If set, then the following case folding options are used. 1185 * 1186 * @return <0 or 0 or >0 as usual for string comparisons 1187 * 1188 * @see #normalize 1189 * @see #FCD 1190 * @stable ICU 2.8 1191 */ compare(char[] s1, char[] s2, int options)1192 public static int compare(char[] s1, char[] s2, int options) { 1193 return internalCompare(CharBuffer.wrap(s1), CharBuffer.wrap(s2), options); 1194 } 1195 1196 /** 1197 * Convenience method that can have faster implementation 1198 * by not allocating buffers. 
1199 * @param char32a the first code point to be checked against the 1200 * @param char32b the second code point 1201 * @param options A bit set of options 1202 * @stable ICU 2.8 1203 */ compare(int char32a, int char32b, int options)1204 public static int compare(int char32a, int char32b, int options) { 1205 return internalCompare(UTF16.valueOf(char32a), UTF16.valueOf(char32b), options|INPUT_IS_FCD); 1206 } 1207 1208 /** 1209 * Convenience method that can have faster implementation 1210 * by not allocating buffers. 1211 * @param char32a the first code point to be checked against 1212 * @param str2 the second string 1213 * @param options A bit set of options 1214 * @stable ICU 2.8 1215 */ compare(int char32a, String str2, int options)1216 public static int compare(int char32a, String str2, int options) { 1217 return internalCompare(UTF16.valueOf(char32a), str2, options); 1218 } 1219 1220 /* Concatenation of normalized strings --------------------------------- */ 1221 /** 1222 * Concatenate normalized strings, making sure that the result is normalized 1223 * as well. 1224 * 1225 * If both the left and the right strings are in 1226 * the normalization form according to "mode", 1227 * then the result will be 1228 * 1229 * <code> 1230 * dest=normalize(left+right, mode) 1231 * </code> 1232 * 1233 * With the input strings already being normalized, 1234 * this function will use next() and previous() 1235 * to find the adjacent end pieces of the input strings. 1236 * Only the concatenation of these end pieces will be normalized and 1237 * then concatenated with the remaining parts of the input strings. 1238 * 1239 * It is allowed to have dest==left to avoid copying the entire left string. 1240 * 1241 * @param left Left source array, may be same as dest. 1242 * @param leftStart start in the left array. 1243 * @param leftLimit limit in the left array (==length) 1244 * @param right Right source array. 1245 * @param rightStart start in the right array. 
1246 * @param rightLimit limit in the right array (==length) 1247 * @param dest The output buffer; can be null if destStart==destLimit==0 1248 * for pure preflighting. 1249 * @param destStart start in the destination array 1250 * @param destLimit limit in the destination array (==length) 1251 * @param mode The normalization mode. 1252 * @param options The normalization options, ORed together (0 for no options). 1253 * @return Length of output (number of chars) when successful or 1254 * IndexOutOfBoundsException 1255 * @exception IndexOutOfBoundsException whose message has the string 1256 * representation of destination capacity required. 1257 * @see #normalize 1258 * @see #next 1259 * @see #previous 1260 * @exception IndexOutOfBoundsException if target capacity is less than the 1261 * required length 1262 * @stable ICU 2.8 1263 */ concatenate(char[] left, int leftStart, int leftLimit, char[] right, int rightStart, int rightLimit, char[] dest, int destStart, int destLimit, Normalizer.Mode mode, int options)1264 public static int concatenate(char[] left, int leftStart, int leftLimit, 1265 char[] right, int rightStart, int rightLimit, 1266 char[] dest, int destStart, int destLimit, 1267 Normalizer.Mode mode, int options) { 1268 if(dest == null) { 1269 throw new IllegalArgumentException(); 1270 } 1271 1272 /* check for overlapping right and destination */ 1273 if (right == dest && rightStart < destLimit && destStart < rightLimit) { 1274 throw new IllegalArgumentException("overlapping right and dst ranges"); 1275 } 1276 1277 /* allow left==dest */ 1278 StringBuilder destBuilder=new StringBuilder(leftLimit-leftStart+rightLimit-rightStart+16); 1279 destBuilder.append(left, leftStart, leftLimit-leftStart); 1280 CharBuffer rightBuffer=CharBuffer.wrap(right, rightStart, rightLimit-rightStart); 1281 mode.getNormalizer2(options).append(destBuilder, rightBuffer); 1282 int destLength=destBuilder.length(); 1283 if(destLength<=(destLimit-destStart)) { 1284 destBuilder.getChars(0, 
destLength, dest, destStart); 1285 return destLength; 1286 } else { 1287 throw new IndexOutOfBoundsException(Integer.toString(destLength)); 1288 } 1289 } 1290 1291 /** 1292 * Concatenate normalized strings, making sure that the result is normalized 1293 * as well. 1294 * 1295 * If both the left and the right strings are in 1296 * the normalization form according to "mode", 1297 * then the result will be 1298 * 1299 * <code> 1300 * dest=normalize(left+right, mode) 1301 * </code> 1302 * 1303 * For details see concatenate 1304 * 1305 * @param left Left source string. 1306 * @param right Right source string. 1307 * @param mode The normalization mode. 1308 * @param options The normalization options, ORed together (0 for no options). 1309 * @return result 1310 * 1311 * @see #concatenate 1312 * @see #normalize 1313 * @see #next 1314 * @see #previous 1315 * @see #concatenate 1316 * @stable ICU 2.8 1317 */ concatenate(char[] left, char[] right,Mode mode, int options)1318 public static String concatenate(char[] left, char[] right,Mode mode, int options) { 1319 StringBuilder dest=new StringBuilder(left.length+right.length+16).append(left); 1320 return mode.getNormalizer2(options).append(dest, CharBuffer.wrap(right)).toString(); 1321 } 1322 1323 /** 1324 * Concatenate normalized strings, making sure that the result is normalized 1325 * as well. 1326 * 1327 * If both the left and the right strings are in 1328 * the normalization form according to "mode", 1329 * then the result will be 1330 * 1331 * <code> 1332 * dest=normalize(left+right, mode) 1333 * </code> 1334 * 1335 * With the input strings already being normalized, 1336 * this function will use next() and previous() 1337 * to find the adjacent end pieces of the input strings. 1338 * Only the concatenation of these end pieces will be normalized and 1339 * then concatenated with the remaining parts of the input strings. 1340 * 1341 * @param left Left source string. 1342 * @param right Right source string. 
1343 * @param mode The normalization mode. 1344 * @param options The normalization options, ORed together (0 for no options). 1345 * @return result 1346 * 1347 * @see #concatenate 1348 * @see #normalize 1349 * @see #next 1350 * @see #previous 1351 * @see #concatenate 1352 * @stable ICU 2.8 1353 */ concatenate(String left, String right, Mode mode, int options)1354 public static String concatenate(String left, String right, Mode mode, int options) { 1355 StringBuilder dest=new StringBuilder(left.length()+right.length()+16).append(left); 1356 return mode.getNormalizer2(options).append(dest, right).toString(); 1357 } 1358 1359 /** 1360 * Gets the FC_NFKC closure value. 1361 * @param c The code point whose closure value is to be retrieved 1362 * @param dest The char array to receive the closure value 1363 * @return the length of the closure value; 0 if there is none 1364 * @stable ICU 3.8 1365 */ getFC_NFKC_Closure(int c,char[] dest)1366 public static int getFC_NFKC_Closure(int c,char[] dest) { 1367 String closure=getFC_NFKC_Closure(c); 1368 int length=closure.length(); 1369 if(length!=0 && dest!=null && length<=dest.length) { 1370 closure.getChars(0, length, dest, 0); 1371 } 1372 return length; 1373 } 1374 /** 1375 * Gets the FC_NFKC closure value. 1376 * @param c The code point whose closure value is to be retrieved 1377 * @return String representation of the closure value; "" if there is none 1378 * @stable ICU 3.8 1379 */ getFC_NFKC_Closure(int c)1380 public static String getFC_NFKC_Closure(int c) { 1381 // Compute the FC_NFKC_Closure on the fly: 1382 // We have the API for complete coverage of Unicode properties, although 1383 // this value by itself is not useful via API. 1384 // (What could be useful is a custom normalization table that combines 1385 // case folding and NFKC.) 1386 // For the derivation, see Unicode's DerivedNormalizationProps.txt. 
1387 Normalizer2 nfkc=NFKCModeImpl.INSTANCE.normalizer2; 1388 UCaseProps csp=UCaseProps.INSTANCE; 1389 // first: b = NFKC(Fold(a)) 1390 StringBuilder folded=new StringBuilder(); 1391 int folded1Length=csp.toFullFolding(c, folded, 0); 1392 if(folded1Length<0) { 1393 Normalizer2Impl nfkcImpl=((Norm2AllModes.Normalizer2WithImpl)nfkc).impl; 1394 if(nfkcImpl.getCompQuickCheck(nfkcImpl.getNorm16(c))!=0) { 1395 return ""; // c does not change at all under CaseFolding+NFKC 1396 } 1397 folded.appendCodePoint(c); 1398 } else { 1399 if(folded1Length>UCaseProps.MAX_STRING_LENGTH) { 1400 folded.appendCodePoint(folded1Length); 1401 } 1402 } 1403 String kc1=nfkc.normalize(folded); 1404 // second: c = NFKC(Fold(b)) 1405 String kc2=nfkc.normalize(UCharacter.foldCase(kc1, 0)); 1406 // if (c != b) add the mapping from a to c 1407 if(kc1.equals(kc2)) { 1408 return ""; 1409 } else { 1410 return kc2; 1411 } 1412 } 1413 1414 //------------------------------------------------------------------------- 1415 // Iteration API 1416 //------------------------------------------------------------------------- 1417 1418 /** 1419 * Return the current character in the normalized text. 1420 * @return The codepoint as an int 1421 * @stable ICU 2.8 1422 */ current()1423 public int current() { 1424 if(bufferPos<buffer.length() || nextNormalize()) { 1425 return buffer.codePointAt(bufferPos); 1426 } else { 1427 return DONE; 1428 } 1429 } 1430 1431 /** 1432 * Return the next character in the normalized text and advance 1433 * the iteration position by one. If the end 1434 * of the text has already been reached, {@link #DONE} is returned. 
1435 * @return The codepoint as an int 1436 * @stable ICU 2.8 1437 */ next()1438 public int next() { 1439 if(bufferPos<buffer.length() || nextNormalize()) { 1440 int c=buffer.codePointAt(bufferPos); 1441 bufferPos+=Character.charCount(c); 1442 return c; 1443 } else { 1444 return DONE; 1445 } 1446 } 1447 1448 1449 /** 1450 * Return the previous character in the normalized text and decrement 1451 * the iteration position by one. If the beginning 1452 * of the text has already been reached, {@link #DONE} is returned. 1453 * @return The codepoint as an int 1454 * @stable ICU 2.8 1455 */ previous()1456 public int previous() { 1457 if(bufferPos>0 || previousNormalize()) { 1458 int c=buffer.codePointBefore(bufferPos); 1459 bufferPos-=Character.charCount(c); 1460 return c; 1461 } else { 1462 return DONE; 1463 } 1464 } 1465 1466 /** 1467 * Reset the index to the beginning of the text. 1468 * This is equivalent to setIndexOnly(startIndex)). 1469 * @stable ICU 2.8 1470 */ reset()1471 public void reset() { 1472 text.setToStart(); 1473 currentIndex=nextIndex=0; 1474 clearBuffer(); 1475 } 1476 1477 /** 1478 * Set the iteration position in the input text that is being normalized, 1479 * without any immediate normalization. 1480 * After setIndexOnly(), getIndex() will return the same index that is 1481 * specified here. 1482 * 1483 * @param index the desired index in the input text. 1484 * @stable ICU 2.8 1485 */ setIndexOnly(int index)1486 public void setIndexOnly(int index) { 1487 text.setIndex(index); // validates index 1488 currentIndex=nextIndex=index; 1489 clearBuffer(); 1490 } 1491 1492 /** 1493 * Set the iteration position in the input text that is being normalized 1494 * and return the first normalized character at that position. 1495 * <p> 1496 * <b>Note:</b> This method sets the position in the <em>input</em> text, 1497 * while {@link #next} and {@link #previous} iterate through characters 1498 * in the normalized <em>output</em>. 
This means that there is not 1499 * necessarily a one-to-one correspondence between characters returned 1500 * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and 1501 * returned from <tt>setIndex</tt> and {@link #getIndex}. 1502 * <p> 1503 * @param index the desired index in the input text. 1504 * 1505 * @return the first normalized character that is the result of iterating 1506 * forward starting at the given index. 1507 * 1508 * @throws IllegalArgumentException if the given index is less than 1509 * {@link #getBeginIndex} or greater than {@link #getEndIndex}. 1510 * @deprecated ICU 3.2 1511 * @obsolete ICU 3.2 1512 */ 1513 @Deprecated 1514 ///CLOVER:OFF setIndex(int index)1515 public int setIndex(int index) { 1516 setIndexOnly(index); 1517 return current(); 1518 } 1519 ///CLOVER:ON 1520 /** 1521 * Retrieve the index of the start of the input text. This is the begin 1522 * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the 1523 * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating 1524 * @deprecated ICU 2.2. Use startIndex() instead. 1525 * @return The codepoint as an int 1526 * @see #startIndex 1527 */ 1528 @Deprecated getBeginIndex()1529 public int getBeginIndex() { 1530 return 0; 1531 } 1532 1533 /** 1534 * Retrieve the index of the end of the input text. This is the end index 1535 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> 1536 * over which this <tt>Normalizer</tt> is iterating 1537 * @deprecated ICU 2.2. Use endIndex() instead. 1538 * @return The codepoint as an int 1539 * @see #endIndex 1540 */ 1541 @Deprecated getEndIndex()1542 public int getEndIndex() { 1543 return endIndex(); 1544 } 1545 /** 1546 * Return the first character in the normalized text. This resets 1547 * the <tt>Normalizer's</tt> position to the beginning of the text. 
1548 * @return The codepoint as an int 1549 * @stable ICU 2.8 1550 */ first()1551 public int first() { 1552 reset(); 1553 return next(); 1554 } 1555 1556 /** 1557 * Return the last character in the normalized text. This resets 1558 * the <tt>Normalizer's</tt> position to be just before the 1559 * the input text corresponding to that normalized character. 1560 * @return The codepoint as an int 1561 * @stable ICU 2.8 1562 */ last()1563 public int last() { 1564 text.setToLimit(); 1565 currentIndex=nextIndex=text.getIndex(); 1566 clearBuffer(); 1567 return previous(); 1568 } 1569 1570 /** 1571 * Retrieve the current iteration position in the input text that is 1572 * being normalized. This method is useful in applications such as 1573 * searching, where you need to be able to determine the position in 1574 * the input text that corresponds to a given normalized output character. 1575 * <p> 1576 * <b>Note:</b> This method sets the position in the <em>input</em>, while 1577 * {@link #next} and {@link #previous} iterate through characters in the 1578 * <em>output</em>. This means that there is not necessarily a one-to-one 1579 * correspondence between characters returned by <tt>next</tt> and 1580 * <tt>previous</tt> and the indices passed to and returned from 1581 * <tt>setIndex</tt> and {@link #getIndex}. 1582 * @return The current iteration position 1583 * @stable ICU 2.8 1584 */ getIndex()1585 public int getIndex() { 1586 if(bufferPos<buffer.length()) { 1587 return currentIndex; 1588 } else { 1589 return nextIndex; 1590 } 1591 } 1592 1593 /** 1594 * Retrieve the index of the start of the input text. This is the begin 1595 * index of the <tt>CharacterIterator</tt> or the start (i.e. 
0) of the 1596 * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating 1597 * @return The current iteration position 1598 * @stable ICU 2.8 1599 */ startIndex()1600 public int startIndex() { 1601 return 0; 1602 } 1603 1604 /** 1605 * Retrieve the index of the end of the input text. This is the end index 1606 * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt> 1607 * over which this <tt>Normalizer</tt> is iterating 1608 * @return The current iteration position 1609 * @stable ICU 2.8 1610 */ endIndex()1611 public int endIndex() { 1612 return text.getLength(); 1613 } 1614 1615 //------------------------------------------------------------------------- 1616 // Iterator attributes 1617 //------------------------------------------------------------------------- 1618 /** 1619 * Set the normalization mode for this object. 1620 * <p> 1621 * <b>Note:</b>If the normalization mode is changed while iterating 1622 * over a string, calls to {@link #next} and {@link #previous} may 1623 * return previously buffers characters in the old normalization mode 1624 * until the iteration is able to re-sync at the next base character. 1625 * It is safest to call {@link #setText setText()}, {@link #first}, 1626 * {@link #last}, etc. after calling <tt>setMode</tt>. 1627 * <p> 1628 * @param newMode the new mode for this <tt>Normalizer</tt>. 1629 * The supported modes are: 1630 * <ul> 1631 * <li>{@link #NFC} - Unicode canonical decompositiion 1632 * followed by canonical composition. 1633 * <li>{@link #NFKC} - Unicode compatibility decompositiion 1634 * follwed by canonical composition. 1635 * <li>{@link #NFD} - Unicode canonical decomposition 1636 * <li>{@link #NFKD} - Unicode compatibility decomposition. 1637 * <li>{@link #NONE} - Do nothing but return characters 1638 * from the underlying input text. 
1639 * </ul> 1640 * 1641 * @see #getMode 1642 * @stable ICU 2.8 1643 */ setMode(Mode newMode)1644 public void setMode(Mode newMode) { 1645 mode = newMode; 1646 norm2 = mode.getNormalizer2(options); 1647 } 1648 /** 1649 * Return the basic operation performed by this <tt>Normalizer</tt> 1650 * 1651 * @see #setMode 1652 * @stable ICU 2.8 1653 */ getMode()1654 public Mode getMode() { 1655 return mode; 1656 } 1657 /** 1658 * Set options that affect this <tt>Normalizer</tt>'s operation. 1659 * Options do not change the basic composition or decomposition operation 1660 * that is being performed , but they control whether 1661 * certain optional portions of the operation are done. 1662 * Currently the only available option is: 1663 * <p> 1664 * <ul> 1665 * <li>{@link #UNICODE_3_2} - Use Normalization conforming to Unicode version 3.2. 1666 * </ul> 1667 * <p> 1668 * @param option the option whose value is to be set. 1669 * @param value the new setting for the option. Use <tt>true</tt> to 1670 * turn the option on and <tt>false</tt> to turn it off. 1671 * 1672 * @see #getOption 1673 * @stable ICU 2.6 1674 */ setOption(int option,boolean value)1675 public void setOption(int option,boolean value) { 1676 if (value) { 1677 options |= option; 1678 } else { 1679 options &= (~option); 1680 } 1681 norm2 = mode.getNormalizer2(options); 1682 } 1683 1684 /** 1685 * Determine whether an option is turned on or off. 1686 * <p> 1687 * @see #setOption 1688 * @stable ICU 2.6 1689 */ getOption(int option)1690 public int getOption(int option) { 1691 if((options & option)!=0) { 1692 return 1 ; 1693 } else { 1694 return 0; 1695 } 1696 } 1697 1698 /** 1699 * Gets the underlying text storage 1700 * @param fillIn the char buffer to fill the UTF-16 units. 1701 * The length of the buffer should be equal to the length of the 1702 * underlying text storage 1703 * @throws IndexOutOfBoundsException If the index passed for the array is invalid. 
1704 * @see #getLength 1705 * @stable ICU 2.8 1706 */ getText(char[] fillIn)1707 public int getText(char[] fillIn) { 1708 return text.getText(fillIn); 1709 } 1710 1711 /** 1712 * Gets the length of underlying text storage 1713 * @return the length 1714 * @stable ICU 2.8 1715 */ getLength()1716 public int getLength() { 1717 return text.getLength(); 1718 } 1719 1720 /** 1721 * Returns the text under iteration as a string 1722 * @return a copy of the text under iteration. 1723 * @stable ICU 2.8 1724 */ getText()1725 public String getText() { 1726 return text.getText(); 1727 } 1728 1729 /** 1730 * Set the input text over which this <tt>Normalizer</tt> will iterate. 1731 * The iteration position is set to the beginning of the input text. 1732 * @param newText The new string to be normalized. 1733 * @stable ICU 2.8 1734 */ setText(StringBuffer newText)1735 public void setText(StringBuffer newText) { 1736 UCharacterIterator newIter = UCharacterIterator.getInstance(newText); 1737 if (newIter == null) { 1738 throw new IllegalStateException("Could not create a new UCharacterIterator"); 1739 } 1740 text = newIter; 1741 reset(); 1742 } 1743 1744 /** 1745 * Set the input text over which this <tt>Normalizer</tt> will iterate. 1746 * The iteration position is set to the beginning of the input text. 1747 * @param newText The new string to be normalized. 1748 * @stable ICU 2.8 1749 */ setText(char[] newText)1750 public void setText(char[] newText) { 1751 UCharacterIterator newIter = UCharacterIterator.getInstance(newText); 1752 if (newIter == null) { 1753 throw new IllegalStateException("Could not create a new UCharacterIterator"); 1754 } 1755 text = newIter; 1756 reset(); 1757 } 1758 1759 /** 1760 * Set the input text over which this <tt>Normalizer</tt> will iterate. 1761 * The iteration position is set to the beginning of the input text. 1762 * @param newText The new string to be normalized. 
1763 * @stable ICU 2.8 1764 */ setText(String newText)1765 public void setText(String newText) { 1766 UCharacterIterator newIter = UCharacterIterator.getInstance(newText); 1767 if (newIter == null) { 1768 throw new IllegalStateException("Could not create a new UCharacterIterator"); 1769 } 1770 text = newIter; 1771 reset(); 1772 } 1773 1774 /** 1775 * Set the input text over which this <tt>Normalizer</tt> will iterate. 1776 * The iteration position is set to the beginning of the input text. 1777 * @param newText The new string to be normalized. 1778 * @stable ICU 2.8 1779 */ setText(CharacterIterator newText)1780 public void setText(CharacterIterator newText) { 1781 UCharacterIterator newIter = UCharacterIterator.getInstance(newText); 1782 if (newIter == null) { 1783 throw new IllegalStateException("Could not create a new UCharacterIterator"); 1784 } 1785 text = newIter; 1786 reset(); 1787 } 1788 1789 /** 1790 * Set the input text over which this <tt>Normalizer</tt> will iterate. 1791 * The iteration position is set to the beginning of the string. 1792 * @param newText The new string to be normalized. 1793 * @stable ICU 2.8 1794 */ setText(UCharacterIterator newText)1795 public void setText(UCharacterIterator newText) { 1796 try{ 1797 UCharacterIterator newIter = (UCharacterIterator)newText.clone(); 1798 if (newIter == null) { 1799 throw new IllegalStateException("Could not create a new UCharacterIterator"); 1800 } 1801 text = newIter; 1802 reset(); 1803 }catch(CloneNotSupportedException e) { 1804 throw new ICUCloneNotSupportedException("Could not clone the UCharacterIterator", e); 1805 } 1806 } 1807 clearBuffer()1808 private void clearBuffer() { 1809 buffer.setLength(0); 1810 bufferPos=0; 1811 } 1812 nextNormalize()1813 private boolean nextNormalize() { 1814 clearBuffer(); 1815 currentIndex=nextIndex; 1816 text.setIndex(nextIndex); 1817 // Skip at least one character so we make progress. 
1818 int c=text.nextCodePoint(); 1819 if(c<0) { 1820 return false; 1821 } 1822 StringBuilder segment=new StringBuilder().appendCodePoint(c); 1823 while((c=text.nextCodePoint())>=0) { 1824 if(norm2.hasBoundaryBefore(c)) { 1825 text.moveCodePointIndex(-1); 1826 break; 1827 } 1828 segment.appendCodePoint(c); 1829 } 1830 nextIndex=text.getIndex(); 1831 norm2.normalize(segment, buffer); 1832 return buffer.length()!=0; 1833 } 1834 previousNormalize()1835 private boolean previousNormalize() { 1836 clearBuffer(); 1837 nextIndex=currentIndex; 1838 text.setIndex(currentIndex); 1839 StringBuilder segment=new StringBuilder(); 1840 int c; 1841 while((c=text.previousCodePoint())>=0) { 1842 if(c<=0xffff) { 1843 segment.insert(0, (char)c); 1844 } else { 1845 segment.insert(0, Character.toChars(c)); 1846 } 1847 if(norm2.hasBoundaryBefore(c)) { 1848 break; 1849 } 1850 } 1851 currentIndex=text.getIndex(); 1852 norm2.normalize(segment, buffer); 1853 bufferPos=buffer.length(); 1854 return buffer.length()!=0; 1855 } 1856 1857 /* compare canonically equivalent ------------------------------------------- */ 1858 1859 // TODO: Broaden the public compare(String, String, options) API like this. Ticket #7407 internalCompare(CharSequence s1, CharSequence s2, int options)1860 private static int internalCompare(CharSequence s1, CharSequence s2, int options) { 1861 int normOptions=options>>>COMPARE_NORM_OPTIONS_SHIFT; 1862 options|= COMPARE_EQUIV; 1863 1864 /* 1865 * UAX #21 Case Mappings, as fixed for Unicode version 4 1866 * (see Jitterbug 2021), defines a canonical caseless match as 1867 * 1868 * A string X is a canonical caseless match 1869 * for a string Y if and only if 1870 * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y))) 1871 * 1872 * For better performance, we check for FCD (or let the caller tell us that 1873 * both strings are in FCD) for the inner normalization. 
1874 * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that 1875 * case-folding preserves the FCD-ness of a string. 1876 * The outer normalization is then only performed by NormalizerImpl.cmpEquivFold() 1877 * when there is a difference. 1878 * 1879 * Exception: When using the Turkic case-folding option, we do perform 1880 * full NFD first. This is because in the Turkic case precomposed characters 1881 * with 0049 capital I or 0069 small i fold differently whether they 1882 * are first decomposed or not, so an FCD check - a check only for 1883 * canonical order - is not sufficient. 1884 */ 1885 if((options&INPUT_IS_FCD)==0 || (options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) { 1886 Normalizer2 n2; 1887 if((options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) { 1888 n2=NFD.getNormalizer2(normOptions); 1889 } else { 1890 n2=FCD.getNormalizer2(normOptions); 1891 } 1892 1893 // check if s1 and/or s2 fulfill the FCD conditions 1894 int spanQCYes1=n2.spanQuickCheckYes(s1); 1895 int spanQCYes2=n2.spanQuickCheckYes(s2); 1896 1897 /* 1898 * ICU 2.4 had a further optimization: 1899 * If both strings were not in FCD, then they were both NFD'ed, 1900 * and the COMPARE_EQUIV option was turned off. 1901 * It is not entirely clear that this is valid with the current 1902 * definition of the canonical caseless match. 1903 * Therefore, ICU 2.6 removes that optimization. 1904 */ 1905 1906 if(spanQCYes1<s1.length()) { 1907 StringBuilder fcd1=new StringBuilder(s1.length()+16).append(s1, 0, spanQCYes1); 1908 s1=n2.normalizeSecondAndAppend(fcd1, s1.subSequence(spanQCYes1, s1.length())); 1909 } 1910 if(spanQCYes2<s2.length()) { 1911 StringBuilder fcd2=new StringBuilder(s2.length()+16).append(s2, 0, spanQCYes2); 1912 s2=n2.normalizeSecondAndAppend(fcd2, s2.subSequence(spanQCYes2, s2.length())); 1913 } 1914 } 1915 1916 return cmpEquivFold(s1, s2, options); 1917 } 1918 1919 /* 1920 * Compare two strings for canonical equivalence. 
1921 * Further options include case-insensitive comparison and 1922 * code point order (as opposed to code unit order). 1923 * 1924 * In this function, canonical equivalence is optional as well. 1925 * If canonical equivalence is tested, then both strings must fulfill 1926 * the FCD check. 1927 * 1928 * Semantically, this is equivalent to 1929 * strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2))) 1930 * where code point order, NFD and foldCase are all optional. 1931 * 1932 * String comparisons almost always yield results before processing both strings 1933 * completely. 1934 * They are generally more efficient working incrementally instead of 1935 * performing the sub-processing (strlen, normalization, case-folding) 1936 * on the entire strings first. 1937 * 1938 * It is also unnecessary to not normalize identical characters. 1939 * 1940 * This function works in principle as follows: 1941 * 1942 * loop { 1943 * get one code unit c1 from s1 (-1 if end of source) 1944 * get one code unit c2 from s2 (-1 if end of source) 1945 * 1946 * if(either string finished) { 1947 * return result; 1948 * } 1949 * if(c1==c2) { 1950 * continue; 1951 * } 1952 * 1953 * // c1!=c2 1954 * try to decompose/case-fold c1/c2, and continue if one does; 1955 * 1956 * // still c1!=c2 and neither decomposes/case-folds, return result 1957 * return c1-c2; 1958 * } 1959 * 1960 * When a character decomposes, then the pointer for that source changes to 1961 * the decomposition, pushing the previous pointer onto a stack. 1962 * When the end of the decomposition is reached, then the code unit reader 1963 * pops the previous source from the stack. 1964 * (Same for case-folding.) 1965 * 1966 * This is complicated further by operating on variable-width UTF-16. 1967 * The top part of the loop works on code units, while lookups for decomposition 1968 * and case-folding need code points. 1969 * Code points are assembled after the equality/end-of-source part. 
1970 * The source pointer is only advanced beyond all code units when the code point 1971 * actually decomposes/case-folds. 1972 * 1973 * If we were on a trail surrogate unit when assembling a code point, 1974 * and the code point decomposes/case-folds, then the decomposition/folding 1975 * result must be compared with the part of the other string that corresponds to 1976 * this string's lead surrogate. 1977 * Since we only assemble a code point when hitting a trail unit when the 1978 * preceding lead units were identical, we back up the other string by one unit 1979 * in such a case. 1980 * 1981 * The optional code point order comparison at the end works with 1982 * the same fix-up as the other code point order comparison functions. 1983 * See ustring.c and the comment near the end of this function. 1984 * 1985 * Assumption: A decomposition or case-folding result string never contains 1986 * a single surrogate. This is a safe assumption in the Unicode Standard. 1987 * Therefore, we do not need to check for surrogate pairs across 1988 * decomposition/case-folding boundaries. 1989 * 1990 * Further assumptions (see verifications tstnorm.cpp): 1991 * The API function checks for FCD first, while the core function 1992 * first case-folds and then decomposes. This requires that case-folding does not 1993 * un-FCD any strings. 1994 * 1995 * The API function may also NFD the input and turn off decomposition. 1996 * This requires that case-folding does not un-NFD strings either. 1997 * 1998 * TODO If any of the above two assumptions is violated, 1999 * then this entire code must be re-thought. 2000 * If this happens, then a simple solution is to case-fold both strings up front 2001 * and to turn off UNORM_INPUT_IS_FCD. 2002 * We already do this when not both strings are in FCD because makeFCD 2003 * would be a partial NFD before the case folding, which does not work. 
2004 * Note that all of this is only a problem when case-folding _and_ 2005 * canonical equivalence come together. 2006 * (Comments in unorm_compare() are more up to date than this TODO.) 2007 */ 2008 2009 /* stack element for previous-level source/decomposition pointers */ 2010 private static final class CmpEquivLevel { 2011 CharSequence cs; 2012 int s; 2013 }; createCmpEquivLevelStack()2014 private static final CmpEquivLevel[] createCmpEquivLevelStack() { 2015 return new CmpEquivLevel[] { 2016 new CmpEquivLevel(), new CmpEquivLevel() 2017 }; 2018 } 2019 2020 /** 2021 * Internal option for unorm_cmpEquivFold() for decomposing. 2022 * If not set, just do strcasecmp(). 2023 */ 2024 private static final int COMPARE_EQUIV=0x80000; 2025 2026 /* internal function; package visibility for use by UTF16.StringComparator */ cmpEquivFold(CharSequence cs1, CharSequence cs2, int options)2027 /*package*/ static int cmpEquivFold(CharSequence cs1, CharSequence cs2, int options) { 2028 Normalizer2Impl nfcImpl; 2029 UCaseProps csp; 2030 2031 /* current-level start/limit - s1/s2 as current */ 2032 int s1, s2, limit1, limit2; 2033 2034 /* decomposition and case folding variables */ 2035 int length; 2036 2037 /* stacks of previous-level start/current/limit */ 2038 CmpEquivLevel[] stack1=null, stack2=null; 2039 2040 /* buffers for algorithmic decompositions */ 2041 String decomp1, decomp2; 2042 2043 /* case folding buffers, only use current-level start/limit */ 2044 StringBuilder fold1, fold2; 2045 2046 /* track which is the current level per string */ 2047 int level1, level2; 2048 2049 /* current code units, and code points for lookups */ 2050 int c1, c2, cp1, cp2; 2051 2052 /* no argument error checking because this itself is not an API */ 2053 2054 /* 2055 * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set 2056 * otherwise this function must behave exactly as uprv_strCompare() 2057 * not checking for that here makes testing this function easier 
2058 */ 2059 2060 /* normalization/properties data loaded? */ 2061 if((options&COMPARE_EQUIV)!=0) { 2062 nfcImpl=Norm2AllModes.getNFCInstance().impl; 2063 } else { 2064 nfcImpl=null; 2065 } 2066 if((options&COMPARE_IGNORE_CASE)!=0) { 2067 csp=UCaseProps.INSTANCE; 2068 fold1=new StringBuilder(); 2069 fold2=new StringBuilder(); 2070 } else { 2071 csp=null; 2072 fold1=fold2=null; 2073 } 2074 2075 /* initialize */ 2076 s1=0; 2077 limit1=cs1.length(); 2078 s2=0; 2079 limit2=cs2.length(); 2080 2081 level1=level2=0; 2082 c1=c2=-1; 2083 2084 /* comparison loop */ 2085 for(;;) { 2086 /* 2087 * here a code unit value of -1 means "get another code unit" 2088 * below it will mean "this source is finished" 2089 */ 2090 2091 if(c1<0) { 2092 /* get next code unit from string 1, post-increment */ 2093 for(;;) { 2094 if(s1==limit1) { 2095 if(level1==0) { 2096 c1=-1; 2097 break; 2098 } 2099 } else { 2100 c1=cs1.charAt(s1++); 2101 break; 2102 } 2103 2104 /* reached end of level buffer, pop one level */ 2105 do { 2106 --level1; 2107 cs1=stack1[level1].cs; 2108 } while(cs1==null); 2109 s1=stack1[level1].s; 2110 limit1=cs1.length(); 2111 } 2112 } 2113 2114 if(c2<0) { 2115 /* get next code unit from string 2, post-increment */ 2116 for(;;) { 2117 if(s2==limit2) { 2118 if(level2==0) { 2119 c2=-1; 2120 break; 2121 } 2122 } else { 2123 c2=cs2.charAt(s2++); 2124 break; 2125 } 2126 2127 /* reached end of level buffer, pop one level */ 2128 do { 2129 --level2; 2130 cs2=stack2[level2].cs; 2131 } while(cs2==null); 2132 s2=stack2[level2].s; 2133 limit2=cs2.length(); 2134 } 2135 } 2136 2137 /* 2138 * compare c1 and c2 2139 * either variable c1, c2 is -1 only if the corresponding string is finished 2140 */ 2141 if(c1==c2) { 2142 if(c1<0) { 2143 return 0; /* c1==c2==-1 indicating end of strings */ 2144 } 2145 c1=c2=-1; /* make us fetch new code units */ 2146 continue; 2147 } else if(c1<0) { 2148 return -1; /* string 1 ends before string 2 */ 2149 } else if(c2<0) { 2150 return 1; /* string 2 ends 
before string 1 */ 2151 } 2152 /* c1!=c2 && c1>=0 && c2>=0 */ 2153 2154 /* get complete code points for c1, c2 for lookups if either is a surrogate */ 2155 cp1=c1; 2156 if(UTF16.isSurrogate((char)c1)) { 2157 char c; 2158 2159 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) { 2160 if(s1!=limit1 && Character.isLowSurrogate(c=cs1.charAt(s1))) { 2161 /* advance ++s1; only below if cp1 decomposes/case-folds */ 2162 cp1=Character.toCodePoint((char)c1, c); 2163 } 2164 } else /* isTrail(c1) */ { 2165 if(0<=(s1-2) && Character.isHighSurrogate(c=cs1.charAt(s1-2))) { 2166 cp1=Character.toCodePoint(c, (char)c1); 2167 } 2168 } 2169 } 2170 2171 cp2=c2; 2172 if(UTF16.isSurrogate((char)c2)) { 2173 char c; 2174 2175 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) { 2176 if(s2!=limit2 && Character.isLowSurrogate(c=cs2.charAt(s2))) { 2177 /* advance ++s2; only below if cp2 decomposes/case-folds */ 2178 cp2=Character.toCodePoint((char)c2, c); 2179 } 2180 } else /* isTrail(c2) */ { 2181 if(0<=(s2-2) && Character.isHighSurrogate(c=cs2.charAt(s2-2))) { 2182 cp2=Character.toCodePoint(c, (char)c2); 2183 } 2184 } 2185 } 2186 2187 /* 2188 * go down one level for each string 2189 * continue with the main loop as soon as there is a real change 2190 */ 2191 2192 if( level1==0 && (options&COMPARE_IGNORE_CASE)!=0 && 2193 (length=csp.toFullFolding(cp1, fold1, options))>=0 2194 ) { 2195 /* cp1 case-folds to the code point "length" or to p[length] */ 2196 if(UTF16.isSurrogate((char)c1)) { 2197 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) { 2198 /* advance beyond source surrogate pair if it case-folds */ 2199 ++s1; 2200 } else /* isTrail(c1) */ { 2201 /* 2202 * we got a supplementary code point when hitting its trail surrogate, 2203 * therefore the lead surrogate must have been the same as in the other string; 2204 * compare this decomposition with the lead surrogate in the other string 2205 * remember that this simulates bulk text replacement: 2206 * the decomposition would replace the 
entire code point 2207 */ 2208 --s2; 2209 c2=cs2.charAt(s2-1); 2210 } 2211 } 2212 2213 /* push current level pointers */ 2214 if(stack1==null) { 2215 stack1=createCmpEquivLevelStack(); 2216 } 2217 stack1[0].cs=cs1; 2218 stack1[0].s=s1; 2219 ++level1; 2220 2221 /* copy the folding result to fold1[] */ 2222 /* Java: the buffer was probably not empty, remove the old contents */ 2223 if(length<=UCaseProps.MAX_STRING_LENGTH) { 2224 fold1.delete(0, fold1.length()-length); 2225 } else { 2226 fold1.setLength(0); 2227 fold1.appendCodePoint(length); 2228 } 2229 2230 /* set next level pointers to case folding */ 2231 cs1=fold1; 2232 s1=0; 2233 limit1=fold1.length(); 2234 2235 /* get ready to read from decomposition, continue with loop */ 2236 c1=-1; 2237 continue; 2238 } 2239 2240 if( level2==0 && (options&COMPARE_IGNORE_CASE)!=0 && 2241 (length=csp.toFullFolding(cp2, fold2, options))>=0 2242 ) { 2243 /* cp2 case-folds to the code point "length" or to p[length] */ 2244 if(UTF16.isSurrogate((char)c2)) { 2245 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) { 2246 /* advance beyond source surrogate pair if it case-folds */ 2247 ++s2; 2248 } else /* isTrail(c2) */ { 2249 /* 2250 * we got a supplementary code point when hitting its trail surrogate, 2251 * therefore the lead surrogate must have been the same as in the other string; 2252 * compare this decomposition with the lead surrogate in the other string 2253 * remember that this simulates bulk text replacement: 2254 * the decomposition would replace the entire code point 2255 */ 2256 --s1; 2257 c1=cs1.charAt(s1-1); 2258 } 2259 } 2260 2261 /* push current level pointers */ 2262 if(stack2==null) { 2263 stack2=createCmpEquivLevelStack(); 2264 } 2265 stack2[0].cs=cs2; 2266 stack2[0].s=s2; 2267 ++level2; 2268 2269 /* copy the folding result to fold2[] */ 2270 /* Java: the buffer was probably not empty, remove the old contents */ 2271 if(length<=UCaseProps.MAX_STRING_LENGTH) { 2272 fold2.delete(0, fold2.length()-length); 2273 } 
else { 2274 fold2.setLength(0); 2275 fold2.appendCodePoint(length); 2276 } 2277 2278 /* set next level pointers to case folding */ 2279 cs2=fold2; 2280 s2=0; 2281 limit2=fold2.length(); 2282 2283 /* get ready to read from decomposition, continue with loop */ 2284 c2=-1; 2285 continue; 2286 } 2287 2288 if( level1<2 && (options&COMPARE_EQUIV)!=0 && 2289 (decomp1=nfcImpl.getDecomposition(cp1))!=null 2290 ) { 2291 /* cp1 decomposes into p[length] */ 2292 if(UTF16.isSurrogate((char)c1)) { 2293 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) { 2294 /* advance beyond source surrogate pair if it decomposes */ 2295 ++s1; 2296 } else /* isTrail(c1) */ { 2297 /* 2298 * we got a supplementary code point when hitting its trail surrogate, 2299 * therefore the lead surrogate must have been the same as in the other string; 2300 * compare this decomposition with the lead surrogate in the other string 2301 * remember that this simulates bulk text replacement: 2302 * the decomposition would replace the entire code point 2303 */ 2304 --s2; 2305 c2=cs2.charAt(s2-1); 2306 } 2307 } 2308 2309 /* push current level pointers */ 2310 if(stack1==null) { 2311 stack1=createCmpEquivLevelStack(); 2312 } 2313 stack1[level1].cs=cs1; 2314 stack1[level1].s=s1; 2315 ++level1; 2316 2317 /* set empty intermediate level if skipped */ 2318 if(level1<2) { 2319 stack1[level1++].cs=null; 2320 } 2321 2322 /* set next level pointers to decomposition */ 2323 cs1=decomp1; 2324 s1=0; 2325 limit1=decomp1.length(); 2326 2327 /* get ready to read from decomposition, continue with loop */ 2328 c1=-1; 2329 continue; 2330 } 2331 2332 if( level2<2 && (options&COMPARE_EQUIV)!=0 && 2333 (decomp2=nfcImpl.getDecomposition(cp2))!=null 2334 ) { 2335 /* cp2 decomposes into p[length] */ 2336 if(UTF16.isSurrogate((char)c2)) { 2337 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) { 2338 /* advance beyond source surrogate pair if it decomposes */ 2339 ++s2; 2340 } else /* isTrail(c2) */ { 2341 /* 2342 * we got a supplementary 
code point when hitting its trail surrogate, 2343 * therefore the lead surrogate must have been the same as in the other string; 2344 * compare this decomposition with the lead surrogate in the other string 2345 * remember that this simulates bulk text replacement: 2346 * the decomposition would replace the entire code point 2347 */ 2348 --s1; 2349 c1=cs1.charAt(s1-1); 2350 } 2351 } 2352 2353 /* push current level pointers */ 2354 if(stack2==null) { 2355 stack2=createCmpEquivLevelStack(); 2356 } 2357 stack2[level2].cs=cs2; 2358 stack2[level2].s=s2; 2359 ++level2; 2360 2361 /* set empty intermediate level if skipped */ 2362 if(level2<2) { 2363 stack2[level2++].cs=null; 2364 } 2365 2366 /* set next level pointers to decomposition */ 2367 cs2=decomp2; 2368 s2=0; 2369 limit2=decomp2.length(); 2370 2371 /* get ready to read from decomposition, continue with loop */ 2372 c2=-1; 2373 continue; 2374 } 2375 2376 /* 2377 * no decomposition/case folding, max level for both sides: 2378 * return difference result 2379 * 2380 * code point order comparison must not just return cp1-cp2 2381 * because when single surrogates are present then the surrogate pairs 2382 * that formed cp1 and cp2 may be from different string indexes 2383 * 2384 * example: { d800 d800 dc01 } vs. 
{ d800 dc00 }, compare at second code units 2385 * c1=d800 cp1=10001 c2=dc00 cp2=10000 2386 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 } 2387 * 2388 * therefore, use same fix-up as in ustring.c/uprv_strCompare() 2389 * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++ 2390 * so we have slightly different pointer/start/limit comparisons here 2391 */ 2392 2393 if(c1>=0xd800 && c2>=0xd800 && (options&COMPARE_CODE_POINT_ORDER)!=0) { 2394 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ 2395 if( 2396 (c1<=0xdbff && s1!=limit1 && Character.isLowSurrogate(cs1.charAt(s1))) || 2397 (Character.isLowSurrogate((char)c1) && 0!=(s1-1) && Character.isHighSurrogate(cs1.charAt(s1-2))) 2398 ) { 2399 /* part of a surrogate pair, leave >=d800 */ 2400 } else { 2401 /* BMP code point - may be surrogate code point - make <d800 */ 2402 c1-=0x2800; 2403 } 2404 2405 if( 2406 (c2<=0xdbff && s2!=limit2 && Character.isLowSurrogate(cs2.charAt(s2))) || 2407 (Character.isLowSurrogate((char)c2) && 0!=(s2-1) && Character.isHighSurrogate(cs2.charAt(s2-2))) 2408 ) { 2409 /* part of a surrogate pair, leave >=d800 */ 2410 } else { 2411 /* BMP code point - may be surrogate code point - make <d800 */ 2412 c2-=0x2800; 2413 } 2414 } 2415 2416 return c1-c2; 2417 } 2418 } 2419 2420 /** 2421 * An Appendable that writes into a char array with a capacity that may be 2422 * less than array.length. 2423 * (By contrast, CharBuffer will write beyond destLimit all the way up to array.length.) 2424 * <p> 2425 * An overflow is only reported at the end, for the old Normalizer API functions that write 2426 * to char arrays. 
*/
    private static final class CharsAppendable implements Appendable {
        /* destination array; only indexes below limit are ever written */
        private final char[] chars;
        /* first index written to, and the index one past the usable capacity */
        private final int start, limit;
        /* next write position; keeps advancing past limit so an overflow can be measured */
        private int offset;

        /* Wraps dest so that writing starts at destStart and stops before destLimit. */
        public CharsAppendable(char[] dest, int destStart, int destLimit) {
            chars = dest;
            start = destStart;
            offset = destStart;
            limit = destLimit;
        }

        /*
         * Returns the number of chars appended so far.
         * Throws IndexOutOfBoundsException, with that number as the message,
         * if more was appended than fits before the limit.
         */
        public int length() {
            final int appended = offset - start;
            if (offset > limit) {
                throw new IndexOutOfBoundsException(Integer.toString(appended));
            }
            return appended;
        }

        /* Stores c if there is room; the position advances either way. */
        public Appendable append(char c) {
            if (offset < limit) {
                chars[offset] = c;
            }
            offset++;
            return this;
        }

        /* Appends all of s. */
        public Appendable append(CharSequence s) {
            return append(s, 0, s.length());
        }

        /*
         * Appends s[sStart..sLimit). The range is copied only if it fits completely
         * into the remaining capacity; otherwise nothing is copied and only the
         * position advances by the length of the range.
         */
        public Appendable append(CharSequence s, int sStart, int sLimit) {
            final int count = sLimit - sStart;
            if (count > (limit - offset)) {
                offset += count;
                return this;
            }
            // TODO: Is there a better way to copy the characters?
            for (int i = sStart; i < sLimit; ++i) {
                chars[offset++] = s.charAt(i);
            }
            return this;
        }
    }
}