1 /*
2  *******************************************************************************
3  * Copyright (C) 2000-2014, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  *******************************************************************************
6  */
7 package com.ibm.icu.text;
8 import java.nio.CharBuffer;
9 import java.text.CharacterIterator;
10 
11 import com.ibm.icu.impl.Norm2AllModes;
12 import com.ibm.icu.impl.Normalizer2Impl;
13 import com.ibm.icu.impl.UCaseProps;
14 import com.ibm.icu.lang.UCharacter;
15 import com.ibm.icu.util.ICUCloneNotSupportedException;
16 
17 /**
18  * Unicode Normalization
19  *
20  * <h2>Unicode normalization API</h2>
21  *
22  * <code>normalize</code> transforms Unicode text into an equivalent composed or
23  * decomposed form, allowing for easier sorting and searching of text.
24  * <code>normalize</code> supports the standard normalization forms described in
25  * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
26  * Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
27  *
28  * Characters with accents or other adornments can be encoded in
29  * several different ways in Unicode.  For example, take the character A-acute.
30  * In Unicode, this can be encoded as a single character (the
31  * "composed" form):
32  *
33  * <pre>
34  *      00C1    LATIN CAPITAL LETTER A WITH ACUTE
35  * </pre>
36  *
37  * or as two separate characters (the "decomposed" form):
38  *
39  * <pre>
40  *      0041    LATIN CAPITAL LETTER A
41  *      0301    COMBINING ACUTE ACCENT
42  * </pre>
43  *
44  * To a user of your program, however, both of these sequences should be
45  * treated as the same "user-level" character "A with acute accent".  When you
46  * are searching or comparing text, you must ensure that these two sequences are
47  * treated equivalently.  In addition, you must handle characters with more than
48  * one accent.  Sometimes the order of a character's combining accents is
49  * significant, while in other cases accent sequences in different orders are
50  * really equivalent.
51  *
52  * Similarly, the string "ffi" can be encoded as three separate letters:
53  *
54  * <pre>
55  *      0066    LATIN SMALL LETTER F
56  *      0066    LATIN SMALL LETTER F
57  *      0069    LATIN SMALL LETTER I
58  * </pre>
59  *
60  * or as the single character
61  *
62  * <pre>
63  *      FB03    LATIN SMALL LIGATURE FFI
64  * </pre>
65  *
66  * The ffi ligature is not a distinct semantic character, and strictly speaking
67  * it shouldn't be in Unicode at all, but it was included for compatibility
68  * with existing character sets that already provided it.  The Unicode standard
69  * identifies such characters by giving them "compatibility" decompositions
70  * into the corresponding semantic characters.  When sorting and searching, you
71  * will often want to use these mappings.
72  *
73  * <code>normalize</code> helps solve these problems by transforming text into
74  * the canonical composed and decomposed forms as shown in the first example
75  * above. In addition, you can have it perform compatibility decompositions so
76  * that you can treat compatibility characters the same as their equivalents.
77  * Finally, <code>normalize</code> rearranges accents into the proper canonical
78  * order, so that you do not have to worry about accent rearrangement on your
79  * own.
80  *
81  * Form FCD, "Fast C or D", is also designed for collation.
82  * It allows to work on strings that are not necessarily normalized
83  * with an algorithm (like in collation) that works under "canonical closure",
84  * i.e., it treats precomposed characters and their decomposed equivalents the
85  * same.
86  *
87  * It is not a normalization form because it does not provide for uniqueness of
88  * representation. Multiple strings may be canonically equivalent (their NFDs
89  * are identical) and may all conform to FCD without being identical themselves.
90  *
91  * The form is defined such that the "raw decomposition", the recursive
92  * canonical decomposition of each character, results in a string that is
93  * canonically ordered. This means that precomposed characters are allowed for
94  * as long as their decompositions do not need canonical reordering.
95  *
96  * Its advantage for a process like collation is that all NFD and most NFC texts
97  * - and many unnormalized texts - already conform to FCD and do not need to be
98  * normalized (NFD) for such a process. The FCD quick check will return YES for
99  * most strings in practice.
100  *
101  * normalize(FCD) may be implemented with NFD.
102  *
103  * For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications):
104  * http://www.unicode.org/notes/tn5/#FCD
105  *
106  * ICU collation performs either NFD or FCD normalization automatically if
107  * normalization is turned on for the collator object. Beyond collation and
108  * string search, normalized strings may be useful for string equivalence
109  * comparisons, transliteration/transcription, unique representations, etc.
110  *
111  * The W3C generally recommends to exchange texts in NFC.
112  * Note also that most legacy character encodings use only precomposed forms and
113  * often do not encode any combining marks by themselves. For conversion to such
114  * character encodings the Unicode text needs to be normalized to NFC.
115  * For more usage examples, see the Unicode Standard Annex.
116  *
117  * Note: The Normalizer class also provides API for iterative normalization.
118  * While the setIndex() and getIndex() refer to indices in the
119  * underlying Unicode input text, the next() and previous() methods
120  * iterate through characters in the normalized output.
121  * This means that there is not necessarily a one-to-one correspondence
122  * between characters returned by next() and previous() and the indices
123  * passed to and returned from setIndex() and getIndex().
124  * It is for this reason that Normalizer does not implement the CharacterIterator interface.
125  *
126  * @stable ICU 2.8
127  */
128 public final class Normalizer implements Cloneable {
129     // The input text and our position in it
130     private UCharacterIterator  text;
131     private Normalizer2         norm2;
132     private Mode                mode;
133     private int                 options;
134 
135     // The normalization buffer is the result of normalization
136     // of the source in [currentIndex..nextIndex[ .
137     private int                 currentIndex;
138     private int                 nextIndex;
139 
140     // A buffer for holding intermediate results
141     private StringBuilder       buffer;
142     private int                 bufferPos;
143 
144     // Helper classes to defer loading of normalization data.
145     private static final class ModeImpl {
ModeImpl(Normalizer2 n2)146         private ModeImpl(Normalizer2 n2) {
147             normalizer2 = n2;
148         }
149         private final Normalizer2 normalizer2;
150     }
151     private static final class NFDModeImpl {
152         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance());
153     }
154     private static final class NFKDModeImpl {
155         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance());
156     }
157     private static final class NFCModeImpl {
158         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance());
159     }
160     private static final class NFKCModeImpl {
161         private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance());
162     }
163     private static final class FCDModeImpl {
164         private static final ModeImpl INSTANCE = new ModeImpl(Norm2AllModes.getFCDNormalizer2());
165     }
166 
167     private static final class Unicode32 {
168         private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze();
169     }
170     private static final class NFD32ModeImpl {
171         private static final ModeImpl INSTANCE =
172             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(),
173                                                  Unicode32.INSTANCE));
174     }
175     private static final class NFKD32ModeImpl {
176         private static final ModeImpl INSTANCE =
177             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(),
178                                                  Unicode32.INSTANCE));
179     }
180     private static final class NFC32ModeImpl {
181         private static final ModeImpl INSTANCE =
182             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(),
183                                                  Unicode32.INSTANCE));
184     }
185     private static final class NFKC32ModeImpl {
186         private static final ModeImpl INSTANCE =
187             new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(),
188                                                  Unicode32.INSTANCE));
189     }
190     private static final class FCD32ModeImpl {
191         private static final ModeImpl INSTANCE =
192             new ModeImpl(new FilteredNormalizer2(Norm2AllModes.getFCDNormalizer2(),
193                                                  Unicode32.INSTANCE));
194     }
195 
196     /**
197      * Options bit set value to select Unicode 3.2 normalization
198      * (except NormalizationCorrections).
199      * At most one Unicode version can be selected at a time.
200      * @stable ICU 2.6
201      */
202     public static final int UNICODE_3_2=0x20;
203 
204     /**
205      * Constant indicating that the end of the iteration has been reached.
206      * This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
207      * @stable ICU 2.8
208      */
209     public static final int DONE = UCharacterIterator.DONE;
210 
211     /**
212      * Constants for normalization modes.
213      * <p>
214      * The Mode class is not intended for public subclassing.
215      * Only the Mode constants provided by the Normalizer class should be used,
216      * and any fields or methods should not be called or overridden by users.
217      * @stable ICU 2.8
218      */
219     public static abstract class Mode {
220         /**
221          * Sole constructor
222          * @internal
223          * @deprecated This API is ICU internal only.
224          */
225         @Deprecated
Mode()226         protected Mode() {
227         }
228 
229         /**
230          * @internal
231          * @deprecated This API is ICU internal only.
232          */
233         @Deprecated
getNormalizer2(int options)234         protected abstract Normalizer2 getNormalizer2(int options);
235     }
236 
237     private static final class NONEMode extends Mode {
getNormalizer2(int options)238         protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; }
239     }
240     private static final class NFDMode extends Mode {
getNormalizer2(int options)241         protected Normalizer2 getNormalizer2(int options) {
242             return (options&UNICODE_3_2) != 0 ?
243                     NFD32ModeImpl.INSTANCE.normalizer2 : NFDModeImpl.INSTANCE.normalizer2;
244         }
245     }
246     private static final class NFKDMode extends Mode {
getNormalizer2(int options)247         protected Normalizer2 getNormalizer2(int options) {
248             return (options&UNICODE_3_2) != 0 ?
249                     NFKD32ModeImpl.INSTANCE.normalizer2 : NFKDModeImpl.INSTANCE.normalizer2;
250         }
251     }
252     private static final class NFCMode extends Mode {
getNormalizer2(int options)253         protected Normalizer2 getNormalizer2(int options) {
254             return (options&UNICODE_3_2) != 0 ?
255                     NFC32ModeImpl.INSTANCE.normalizer2 : NFCModeImpl.INSTANCE.normalizer2;
256         }
257     }
258     private static final class NFKCMode extends Mode {
getNormalizer2(int options)259         protected Normalizer2 getNormalizer2(int options) {
260             return (options&UNICODE_3_2) != 0 ?
261                     NFKC32ModeImpl.INSTANCE.normalizer2 : NFKCModeImpl.INSTANCE.normalizer2;
262         }
263     }
264     private static final class FCDMode extends Mode {
getNormalizer2(int options)265         protected Normalizer2 getNormalizer2(int options) {
266             return (options&UNICODE_3_2) != 0 ?
267                     FCD32ModeImpl.INSTANCE.normalizer2 : FCDModeImpl.INSTANCE.normalizer2;
268         }
269     }
270 
271     /**
272      * No decomposition/composition.
273      * @stable ICU 2.8
274      */
275     public static final Mode NONE = new NONEMode();
276 
277     /**
278      * Canonical decomposition.
279      * @stable ICU 2.8
280      */
281     public static final Mode NFD = new NFDMode();
282 
283     /**
284      * Compatibility decomposition.
285      * @stable ICU 2.8
286      */
287     public static final Mode NFKD = new NFKDMode();
288 
289     /**
290      * Canonical decomposition followed by canonical composition.
291      * @stable ICU 2.8
292      */
293     public static final Mode NFC = new NFCMode();
294 
295     /**
296      * Default normalization.
297      * @stable ICU 2.8
298      */
299     public static final Mode DEFAULT = NFC;
300 
301     /**
302      * Compatibility decomposition followed by canonical composition.
303      * @stable ICU 2.8
304      */
305     public static final Mode NFKC =new NFKCMode();
306 
307     /**
308      * "Fast C or D" form.
309      * @stable ICU 2.8
310      */
311     public static final Mode FCD = new FCDMode();
312 
313     /**
314      * Null operation for use with the {@link com.ibm.icu.text.Normalizer constructors}
315      * and the static {@link #normalize normalize} method.  This value tells
316      * the <tt>Normalizer</tt> to do nothing but return unprocessed characters
317      * from the underlying String or CharacterIterator.  If you have code which
318      * requires raw text at some times and normalized text at others, you can
319      * use <tt>NO_OP</tt> for the cases where you want raw text, rather
320      * than having a separate code path that bypasses <tt>Normalizer</tt>
321      * altogether.
322      * <p>
323      * @see #setMode
324      * @deprecated ICU 2.8. Use Normalizer.NONE
325      * @see #NONE
326      */
327     @Deprecated
328     public static final Mode NO_OP = NONE;
329 
330     /**
331      * Canonical decomposition followed by canonical composition.  Used with the
332      * {@link com.ibm.icu.text.Normalizer constructors} and the static
333      * {@link #normalize normalize} method to determine the operation to be
334      * performed.
335      * <p>
336      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
337      * off, this operation produces output that is in
338      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
339      * Form</a>
340      * <b>C</b>.
341      * <p>
342      * @see #setMode
343      * @deprecated ICU 2.8. Use Normalizer.NFC
344      * @see #NFC
345      */
346     @Deprecated
347     public static final Mode COMPOSE = NFC;
348 
349     /**
350      * Compatibility decomposition followed by canonical composition.
351      * Used with the {@link com.ibm.icu.text.Normalizer constructors} and the static
352      * {@link #normalize normalize} method to determine the operation to be
353      * performed.
354      * <p>
355      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
356      * off, this operation produces output that is in
357      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
358      * Form</a>
359      * <b>KC</b>.
360      * <p>
361      * @see #setMode
362      * @deprecated ICU 2.8. Use Normalizer.NFKC
363      * @see #NFKC
364      */
365     @Deprecated
366     public static final Mode COMPOSE_COMPAT = NFKC;
367 
368     /**
369      * Canonical decomposition.  This value is passed to the
370      * {@link com.ibm.icu.text.Normalizer constructors} and the static
371      * {@link #normalize normalize}
372      * method to determine the operation to be performed.
373      * <p>
374      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
375      * off, this operation produces output that is in
376      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
377      * Form</a>
378      * <b>D</b>.
379      * <p>
380      * @see #setMode
381      * @deprecated ICU 2.8. Use Normalizer.NFD
382      * @see #NFD
383      */
384     @Deprecated
385     public static final Mode DECOMP = NFD;
386 
387     /**
388      * Compatibility decomposition.  This value is passed to the
389      * {@link com.ibm.icu.text.Normalizer constructors} and the static
390      * {@link #normalize normalize}
391      * method to determine the operation to be performed.
392      * <p>
393      * If all optional features (<i>e.g.</i> {@link #IGNORE_HANGUL}) are turned
394      * off, this operation produces output that is in
395      * <a href=http://www.unicode.org/unicode/reports/tr15/>Unicode Canonical
396      * Form</a>
397      * <b>KD</b>.
398      * <p>
399      * @see #setMode
400      * @deprecated ICU 2.8. Use Normalizer.NFKD
401      * @see #NFKD
402      */
403     @Deprecated
404     public static final Mode DECOMP_COMPAT = NFKD;
405 
406     /**
407      * Option to disable Hangul/Jamo composition and decomposition.
408      * This option applies to Korean text,
409      * which can be represented either in the Jamo alphabet or in Hangul
410      * characters, which are really just two or three Jamo combined
411      * into one visual glyph.  Since Jamo takes up more storage space than
412      * Hangul, applications that process only Hangul text may wish to turn
413      * this option on when decomposing text.
414      * <p>
415      * The Unicode standard treats Hangul to Jamo conversion as a
416      * canonical decomposition, so this option must be turned <b>off</b> if you
417      * wish to transform strings into one of the standard
418      * <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
419      * Unicode Normalization Forms</a>.
420      * <p>
421      * @see #setOption
422      * @deprecated ICU 2.8. This option is no longer supported.
423      */
424     @Deprecated
425     public static final int IGNORE_HANGUL = 0x0001;
426 
427     /**
428      * Result values for quickCheck().
429      * For details see Unicode Technical Report 15.
430      * @stable ICU 2.8
431      */
432     public static final class QuickCheckResult{
433         //private int resultValue;
QuickCheckResult(int value)434         private QuickCheckResult(int value) {
435             //resultValue=value;
436         }
437     }
438     /**
439      * Indicates that string is not in the normalized format
440      * @stable ICU 2.8
441      */
442     public static final QuickCheckResult NO = new QuickCheckResult(0);
443 
444     /**
445      * Indicates that string is in the normalized format
446      * @stable ICU 2.8
447      */
448     public static final QuickCheckResult YES = new QuickCheckResult(1);
449 
450     /**
451      * Indicates it cannot be determined if string is in the normalized
452      * format without further thorough checks.
453      * @stable ICU 2.8
454      */
455     public static final QuickCheckResult MAYBE = new QuickCheckResult(2);
456 
457     /**
458      * Option bit for compare:
459      * Case sensitively compare the strings
460      * @stable ICU 2.8
461      */
462     public static final int FOLD_CASE_DEFAULT =  UCharacter.FOLD_CASE_DEFAULT;
463 
464     /**
465      * Option bit for compare:
466      * Both input strings are assumed to fulfill FCD conditions.
467      * @stable ICU 2.8
468      */
469     public static final int INPUT_IS_FCD    =      0x20000;
470 
471     /**
472      * Option bit for compare:
473      * Perform case-insensitive comparison.
474      * @stable ICU 2.8
475      */
476     public static final int COMPARE_IGNORE_CASE  =     0x10000;
477 
478     /**
479      * Option bit for compare:
480      * Compare strings in code point order instead of code unit order.
481      * @stable ICU 2.8
482      */
483     public static final int COMPARE_CODE_POINT_ORDER = 0x8000;
484 
485     /**
486      * Option value for case folding:
487      * Use the modified set of mappings provided in CaseFolding.txt to handle dotted I
488      * and dotless i appropriately for Turkic languages (tr, az).
489      * @see UCharacter#FOLD_CASE_EXCLUDE_SPECIAL_I
490      * @stable ICU 2.8
491      */
492     public static final int FOLD_CASE_EXCLUDE_SPECIAL_I = UCharacter.FOLD_CASE_EXCLUDE_SPECIAL_I;
493 
494     /**
495      * Lowest-order bit number of compare() options bits corresponding to
496      * normalization options bits.
497      *
498      * The options parameter for compare() uses most bits for
499      * itself and for various comparison and folding flags.
500      * The most significant bits, however, are shifted down and passed on
501      * to the normalization implementation.
502      * (That is, from compare(..., options, ...),
503      * options>>COMPARE_NORM_OPTIONS_SHIFT will be passed on to the
504      * internal normalization functions.)
505      *
506      * @see #compare
507      * @stable ICU 2.6
508      */
509     public static final int COMPARE_NORM_OPTIONS_SHIFT  = 20;
510 
511     //-------------------------------------------------------------------------
512     // Iterator constructors
513     //-------------------------------------------------------------------------
514 
/**
 * Creates a new {@code Normalizer} object for iterating over the
 * normalized form of a given string.
 * <p>
 * The {@code opt} parameter specifies which optional
 * {@code Normalizer} features are to be enabled for this object.
 *
 * @param str  The string to be normalized.  The normalization
 *             will start at the beginning of the string.
 * @param mode The normalization mode.
 * @param opt  Any optional features to be enabled.
 *             Currently the only available option is {@link #UNICODE_3_2}.
 *             If you want the default behavior corresponding to one of the
 *             standard Unicode Normalization Forms, use 0 for this argument.
 * @stable ICU 2.6
 */
public Normalizer(String str, Mode mode, int opt) {
    text = UCharacterIterator.getInstance(str);
    norm2 = mode.getNormalizer2(opt);
    this.mode = mode;
    options = opt;
    buffer = new StringBuilder();
}
540 
/**
 * Creates a new {@code Normalizer} object for iterating over the
 * normalized form of the given text. The iterator passed in is cloned,
 * and the clone is what this object reads from.
 *
 * @param iter The input text to be normalized.  The normalization
 *             will start at the beginning of the string.
 * @param mode The normalization mode.
 * @param opt  Any optional features to be enabled.
 *             Currently the only available option is {@link #UNICODE_3_2}.
 *             If you want the default behavior corresponding to one of the
 *             standard Unicode Normalization Forms, use 0 for this argument.
 * @stable ICU 2.6
 */
public Normalizer(CharacterIterator iter, Mode mode, int opt) {
    CharacterIterator privateCopy = (CharacterIterator) iter.clone();
    text = UCharacterIterator.getInstance(privateCopy);
    norm2 = mode.getNormalizer2(opt);
    this.mode = mode;
    options = opt;
    buffer = new StringBuilder();
}
563 
/**
 * Creates a new {@code Normalizer} object for iterating over the
 * normalized form of the given text. The iterator passed in is cloned,
 * and the clone is what this object reads from.
 *
 * @param iter    The input text to be normalized.  The normalization
 *                will start at the beginning of the string.
 * @param mode    The normalization mode.
 * @param options The normalization options, ORed together (0 for no options).
 * @stable ICU 2.6
 */
public Normalizer(UCharacterIterator iter, Mode mode, int options) {
    // Only the clone call can raise the checked exception, so only it is guarded.
    try {
        text = (UCharacterIterator) iter.clone();
    } catch (CloneNotSupportedException e) {
        throw new ICUCloneNotSupportedException(e);
    }
    this.mode = mode;
    this.options = options;
    norm2 = mode.getNormalizer2(options);
    buffer = new StringBuilder();
}
586 
587     /**
588      * Clones this <tt>Normalizer</tt> object.  All properties of this
589      * object are duplicated in the new object, including the cloning of any
590      * {@link CharacterIterator} that was passed in to the constructor
591      * or to {@link #setText(CharacterIterator) setText}.
592      * However, the text storage underlying
593      * the <tt>CharacterIterator</tt> is not duplicated unless the
594      * iterator's <tt>clone</tt> method does so.
595      * @stable ICU 2.8
596      */
clone()597     public Object clone() {
598         try {
599             Normalizer copy = (Normalizer) super.clone();
600             copy.text = (UCharacterIterator) text.clone();
601             copy.mode = mode;
602             copy.options = options;
603             copy.norm2 = norm2;
604             copy.buffer = new StringBuilder(buffer);
605             copy.bufferPos = bufferPos;
606             copy.currentIndex = currentIndex;
607             copy.nextIndex = nextIndex;
608             return copy;
609         }
610         catch (CloneNotSupportedException e) {
611             throw new ICUCloneNotSupportedException(e);
612         }
613     }
614 
615     //--------------------------------------------------------------------------
616     // Static Utility methods
617     //--------------------------------------------------------------------------
618 
getComposeNormalizer2(boolean compat, int options)619     private static final Normalizer2 getComposeNormalizer2(boolean compat, int options) {
620         return (compat ? NFKC : NFC).getNormalizer2(options);
621     }
getDecomposeNormalizer2(boolean compat, int options)622     private static final Normalizer2 getDecomposeNormalizer2(boolean compat, int options) {
623         return (compat ? NFKD : NFD).getNormalizer2(options);
624     }
625 
626     /**
627      * Compose a string.
628      * The string will be composed to according to the specified mode.
629      * @param str        The string to compose.
630      * @param compat     If true the string will be composed according to
631      *                    NFKC rules and if false will be composed according to
632      *                    NFC rules.
633      * @return String    The composed string
634      * @stable ICU 2.8
635      */
compose(String str, boolean compat)636     public static String compose(String str, boolean compat) {
637         return compose(str,compat,0);
638     }
639 
640     /**
641      * Compose a string.
642      * The string will be composed to according to the specified mode.
643      * @param str        The string to compose.
644      * @param compat     If true the string will be composed according to
645      *                    NFKC rules and if false will be composed according to
646      *                    NFC rules.
647      * @param options    The only recognized option is UNICODE_3_2
648      * @return String    The composed string
649      * @stable ICU 2.6
650      */
compose(String str, boolean compat, int options)651     public static String compose(String str, boolean compat, int options) {
652         return getComposeNormalizer2(compat, options).normalize(str);
653     }
654 
655     /**
656      * Compose a string.
657      * The string will be composed to according to the specified mode.
658      * @param source The char array to compose.
659      * @param target A char buffer to receive the normalized text.
660      * @param compat If true the char array will be composed according to
661      *                NFKC rules and if false will be composed according to
662      *                NFC rules.
663      * @param options The normalization options, ORed together (0 for no options).
664      * @return int   The total buffer size needed;if greater than length of
665      *                result, the output was truncated.
666      * @exception IndexOutOfBoundsException if target.length is less than the
667      *             required length
668      * @stable ICU 2.6
669      */
compose(char[] source,char[] target, boolean compat, int options)670     public static int compose(char[] source,char[] target, boolean compat, int options) {
671         return compose(source, 0, source.length, target, 0, target.length, compat, options);
672     }
673 
674     /**
675      * Compose a string.
676      * The string will be composed to according to the specified mode.
677      * @param src       The char array to compose.
678      * @param srcStart  Start index of the source
679      * @param srcLimit  Limit index of the source
680      * @param dest      The char buffer to fill in
681      * @param destStart Start index of the destination buffer
682      * @param destLimit End index of the destination buffer
683      * @param compat If true the char array will be composed according to
684      *                NFKC rules and if false will be composed according to
685      *                NFC rules.
686      * @param options The normalization options, ORed together (0 for no options).
687      * @return int   The total buffer size needed;if greater than length of
688      *                result, the output was truncated.
689      * @exception IndexOutOfBoundsException if target.length is less than the
690      *             required length
691      * @stable ICU 2.6
692      */
compose(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, boolean compat, int options)693     public static int compose(char[] src,int srcStart, int srcLimit,
694                               char[] dest,int destStart, int destLimit,
695                               boolean compat, int options) {
696         CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
697         CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
698         getComposeNormalizer2(compat, options).normalize(srcBuffer, app);
699         return app.length();
700     }
701 
702     /**
703      * Decompose a string.
704      * The string will be decomposed to according to the specified mode.
705      * @param str       The string to decompose.
706      * @param compat    If true the string will be decomposed according to NFKD
707      *                   rules and if false will be decomposed according to NFD
708      *                   rules.
709      * @return String   The decomposed string
710      * @stable ICU 2.8
711      */
decompose(String str, boolean compat)712     public static String decompose(String str, boolean compat) {
713         return decompose(str,compat,0);
714     }
715 
716     /**
717      * Decompose a string.
718      * The string will be decomposed to according to the specified mode.
719      * @param str     The string to decompose.
720      * @param compat  If true the string will be decomposed according to NFKD
721      *                 rules and if false will be decomposed according to NFD
722      *                 rules.
723      * @param options The normalization options, ORed together (0 for no options).
724      * @return String The decomposed string
725      * @stable ICU 2.6
726      */
decompose(String str, boolean compat, int options)727     public static String decompose(String str, boolean compat, int options) {
728         return getDecomposeNormalizer2(compat, options).normalize(str);
729     }
730 
731     /**
732      * Decompose a string.
     * The string will be decomposed according to the specified mode.
734      * @param source The char array to decompose.
735      * @param target A char buffer to receive the normalized text.
736      * @param compat If true the char array will be decomposed according to NFKD
737      *                rules and if false will be decomposed according to
738      *                NFD rules.
739      * @return int   The total buffer size needed;if greater than length of
740      *                result,the output was truncated.
741      * @param options The normalization options, ORed together (0 for no options).
742      * @exception IndexOutOfBoundsException if the target capacity is less than
743      *             the required length
744      * @stable ICU 2.6
745      */
decompose(char[] source,char[] target, boolean compat, int options)746     public static int decompose(char[] source,char[] target, boolean compat, int options) {
747         return decompose(source, 0, source.length, target, 0, target.length, compat, options);
748     }
749 
750     /**
751      * Decompose a string.
     * The string will be decomposed according to the specified mode.
     * @param src       The char array to decompose.
754      * @param srcStart  Start index of the source
755      * @param srcLimit  Limit index of the source
756      * @param dest      The char buffer to fill in
757      * @param destStart Start index of the destination buffer
758      * @param destLimit End index of the destination buffer
759      * @param compat If true the char array will be decomposed according to NFKD
760      *                rules and if false will be decomposed according to
761      *                NFD rules.
762      * @param options The normalization options, ORed together (0 for no options).
763      * @return int   The total buffer size needed;if greater than length of
764      *                result,the output was truncated.
765      * @exception IndexOutOfBoundsException if the target capacity is less than
766      *             the required length
767      * @stable ICU 2.6
768      */
decompose(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, boolean compat, int options)769     public static int decompose(char[] src,int srcStart, int srcLimit,
770                                 char[] dest,int destStart, int destLimit,
771                                 boolean compat, int options) {
772         CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
773         CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
774         getDecomposeNormalizer2(compat, options).normalize(srcBuffer, app);
775         return app.length();
776     }
777 
778     /**
779      * Normalizes a <tt>String</tt> using the given normalization operation.
780      * <p>
781      * The <tt>options</tt> parameter specifies which optional
782      * <tt>Normalizer</tt> features are to be enabled for this operation.
783      * Currently the only available option is {@link #UNICODE_3_2}.
784      * If you want the default behavior corresponding to one of the standard
785      * Unicode Normalization Forms, use 0 for this argument.
786      * <p>
787      * @param str       the input string to be normalized.
788      * @param mode      the normalization mode
789      * @param options   the optional features to be enabled.
790      * @return String   the normalized string
791      * @stable ICU 2.6
792      */
normalize(String str, Mode mode, int options)793     public static String normalize(String str, Mode mode, int options) {
794         return mode.getNormalizer2(options).normalize(str);
795     }
796 
797     /**
798      * Normalize a string.
799      * The string will be normalized according to the specified normalization
800      * mode and options.
801      * @param src        The string to normalize.
802      * @param mode       The normalization mode; one of Normalizer.NONE,
803      *                    Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
804      *                    Normalizer.NFKD, Normalizer.DEFAULT
805      * @return the normalized string
806      * @stable ICU 2.8
807      *
808      */
normalize(String src,Mode mode)809     public static String normalize(String src,Mode mode) {
810         return normalize(src, mode, 0);
811     }
812     /**
813      * Normalize a string.
814      * The string will be normalized according to the specified normalization
815      * mode and options.
816      * @param source The char array to normalize.
817      * @param target A char buffer to receive the normalized text.
818      * @param mode   The normalization mode; one of Normalizer.NONE,
819      *                Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
820      *                Normalizer.NFKD, Normalizer.DEFAULT
821      * @param options The normalization options, ORed together (0 for no options).
822      * @return int   The total buffer size needed;if greater than length of
823      *                result, the output was truncated.
824      * @exception    IndexOutOfBoundsException if the target capacity is less
825      *                than the required length
826      * @stable ICU 2.6
827      */
normalize(char[] source,char[] target, Mode mode, int options)828     public static int normalize(char[] source,char[] target, Mode  mode, int options) {
829         return normalize(source,0,source.length,target,0,target.length,mode, options);
830     }
831 
832     /**
833      * Normalize a string.
834      * The string will be normalized according to the specified normalization
835      * mode and options.
     * @param src       The char array to normalize.
837      * @param srcStart  Start index of the source
838      * @param srcLimit  Limit index of the source
839      * @param dest      The char buffer to fill in
840      * @param destStart Start index of the destination buffer
841      * @param destLimit End index of the destination buffer
842      * @param mode      The normalization mode; one of Normalizer.NONE,
843      *                   Normalizer.NFD, Normalizer.NFC, Normalizer.NFKC,
844      *                   Normalizer.NFKD, Normalizer.DEFAULT
845      * @param options The normalization options, ORed together (0 for no options).
846      * @return int      The total buffer size needed;if greater than length of
847      *                   result, the output was truncated.
848      * @exception       IndexOutOfBoundsException if the target capacity is
849      *                   less than the required length
850      * @stable ICU 2.6
851      */
normalize(char[] src,int srcStart, int srcLimit, char[] dest,int destStart, int destLimit, Mode mode, int options)852     public static int normalize(char[] src,int srcStart, int srcLimit,
853                                 char[] dest,int destStart, int destLimit,
854                                 Mode  mode, int options) {
855         CharBuffer srcBuffer = CharBuffer.wrap(src, srcStart, srcLimit - srcStart);
856         CharsAppendable app = new CharsAppendable(dest, destStart, destLimit);
857         mode.getNormalizer2(options).normalize(srcBuffer, app);
858         return app.length();
859     }
860 
861     /**
862      * Normalize a codepoint according to the given mode
     * @param char32    The input code point to be normalized.
864      * @param mode      The normalization mode
865      * @param options   Options for use with exclusion set and tailored Normalization
866      *                                   The only option that is currently recognized is UNICODE_3_2
867      * @return String   The normalized string
868      * @stable ICU 2.6
869      * @see #UNICODE_3_2
870      */
normalize(int char32, Mode mode, int options)871     public static String normalize(int char32, Mode mode, int options) {
872         if(mode == NFD && options == 0) {
873             String decomposition = Normalizer2.getNFCInstance().getDecomposition(char32);
874             if(decomposition == null) {
875                 decomposition = UTF16.valueOf(char32);
876             }
877             return decomposition;
878         }
879         return normalize(UTF16.valueOf(char32), mode, options);
880     }
881 
882     /**
883      * Convenience method to normalize a codepoint according to the given mode
     * @param char32    The input code point to be normalized.
885      * @param mode      The normalization mode
886      * @return String   The normalized string
887      * @stable ICU 2.6
888      */
normalize(int char32, Mode mode)889     public static String normalize(int char32, Mode mode) {
890         return normalize(char32, mode, 0);
891     }
892 
893     /**
894      * Convenience method.
895      *
896      * @param source   string for determining if it is in a normalized format
897      * @param mode     normalization format (Normalizer.NFC,Normalizer.NFD,
898      *                  Normalizer.NFKC,Normalizer.NFKD)
899      * @return         Return code to specify if the text is normalized or not
900      *                     (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
901      * @stable ICU 2.8
902      */
quickCheck(String source, Mode mode)903     public static QuickCheckResult quickCheck(String source, Mode mode) {
904         return quickCheck(source, mode, 0);
905     }
906 
907     /**
908      * Performing quick check on a string, to quickly determine if the string is
909      * in a particular normalization format.
910      * Three types of result can be returned Normalizer.YES, Normalizer.NO or
911      * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
912      * string is in the desired normalized format, Normalizer.NO determines that
913      * argument string is not in the desired normalized format. A
914      * Normalizer.MAYBE result indicates that a more thorough check is required,
915      * the user may have to put the string in its normalized form and compare
916      * the results.
917      *
918      * @param source   string for determining if it is in a normalized format
919      * @param mode     normalization format (Normalizer.NFC,Normalizer.NFD,
920      *                  Normalizer.NFKC,Normalizer.NFKD)
921      * @param options   Options for use with exclusion set and tailored Normalization
922      *                                   The only option that is currently recognized is UNICODE_3_2
923      * @return         Return code to specify if the text is normalized or not
924      *                     (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
925      * @stable ICU 2.6
926      */
quickCheck(String source, Mode mode, int options)927     public static QuickCheckResult quickCheck(String source, Mode mode, int options) {
928         return mode.getNormalizer2(options).quickCheck(source);
929     }
930 
931     /**
932      * Convenience method.
933      *
934      * @param source Array of characters for determining if it is in a
935      *                normalized format
936      * @param mode   normalization format (Normalizer.NFC,Normalizer.NFD,
937      *                Normalizer.NFKC,Normalizer.NFKD)
938      * @param options   Options for use with exclusion set and tailored Normalization
939      *                                   The only option that is currently recognized is UNICODE_3_2
940      * @return       Return code to specify if the text is normalized or not
941      *                (Normalizer.YES, Normalizer.NO or Normalizer.MAYBE)
942      * @stable ICU 2.6
943      */
quickCheck(char[] source, Mode mode, int options)944     public static QuickCheckResult quickCheck(char[] source, Mode mode, int options) {
945         return quickCheck(source, 0, source.length, mode, options);
946     }
947 
948     /**
949      * Performing quick check on a string, to quickly determine if the string is
950      * in a particular normalization format.
951      * Three types of result can be returned Normalizer.YES, Normalizer.NO or
952      * Normalizer.MAYBE. Result Normalizer.YES indicates that the argument
953      * string is in the desired normalized format, Normalizer.NO determines that
954      * argument string is not in the desired normalized format. A
955      * Normalizer.MAYBE result indicates that a more thorough check is required,
956      * the user may have to put the string in its normalized form and compare
957      * the results.
958      *
959      * @param source    string for determining if it is in a normalized format
960      * @param start     the start index of the source
     * @param limit     the limit (exclusive end) index of the source; equal to
     *                   the array length when the whole array is checked
962      * @param mode      normalization format (Normalizer.NFC,Normalizer.NFD,
963      *                   Normalizer.NFKC,Normalizer.NFKD)
964      * @param options   Options for use with exclusion set and tailored Normalization
965      *                                   The only option that is currently recognized is UNICODE_3_2
966      * @return          Return code to specify if the text is normalized or not
967      *                   (Normalizer.YES, Normalizer.NO or
968      *                   Normalizer.MAYBE)
969      * @stable ICU 2.6
970      */
971 
quickCheck(char[] source,int start, int limit, Mode mode,int options)972     public static QuickCheckResult quickCheck(char[] source,int start,
973                                               int limit, Mode mode,int options) {
974         CharBuffer srcBuffer = CharBuffer.wrap(source, start, limit - start);
975         return mode.getNormalizer2(options).quickCheck(srcBuffer);
976     }
977 
978     /**
979      * Test if a string is in a given normalization form.
980      * This is semantically equivalent to source.equals(normalize(source, mode)).
981      *
982      * Unlike quickCheck(), this function returns a definitive result,
983      * never a "maybe".
984      * For NFD, NFKD, and FCD, both functions work exactly the same.
985      * For NFC and NFKC where quickCheck may return "maybe", this function will
986      * perform further tests to arrive at a true/false result.
987      * @param src       The input array of characters to be checked to see if
988      *                   it is normalized
     * @param start     The start index in the source
990      * @param limit     The limit index in the source
991      * @param mode      the normalization mode
992      * @param options   Options for use with exclusion set and tailored Normalization
993      *                                   The only option that is currently recognized is UNICODE_3_2
994      * @return Boolean value indicating whether the source string is in the
995      *         "mode" normalization form
996      * @stable ICU 2.6
997      */
isNormalized(char[] src,int start, int limit, Mode mode, int options)998     public static boolean isNormalized(char[] src,int start,
999                                        int limit, Mode mode,
1000                                        int options) {
1001         CharBuffer srcBuffer = CharBuffer.wrap(src, start, limit - start);
1002         return mode.getNormalizer2(options).isNormalized(srcBuffer);
1003     }
1004 
1005     /**
1006      * Test if a string is in a given normalization form.
1007      * This is semantically equivalent to source.equals(normalize(source, mode)).
1008      *
1009      * Unlike quickCheck(), this function returns a definitive result,
1010      * never a "maybe".
1011      * For NFD, NFKD, and FCD, both functions work exactly the same.
1012      * For NFC and NFKC where quickCheck may return "maybe", this function will
1013      * perform further tests to arrive at a true/false result.
1014      * @param str       the input string to be checked to see if it is
1015      *                   normalized
1016      * @param mode      the normalization mode
1017      * @param options   Options for use with exclusion set and tailored Normalization
1018      *                  The only option that is currently recognized is UNICODE_3_2
1019      * @see #isNormalized
1020      * @stable ICU 2.6
1021      */
isNormalized(String str, Mode mode, int options)1022     public static boolean isNormalized(String str, Mode mode, int options) {
1023         return mode.getNormalizer2(options).isNormalized(str);
1024     }
1025 
1026     /**
1027      * Convenience Method
1028      * @param char32    the input code point to be checked to see if it is
1029      *                   normalized
1030      * @param mode      the normalization mode
1031      * @param options   Options for use with exclusion set and tailored Normalization
1032      *                  The only option that is currently recognized is UNICODE_3_2
1033      *
1034      * @see #isNormalized
1035      * @stable ICU 2.6
1036      */
isNormalized(int char32, Mode mode,int options)1037     public static boolean isNormalized(int char32, Mode mode,int options) {
1038         return isNormalized(UTF16.valueOf(char32), mode, options);
1039     }
1040 
1041     /**
1042      * Compare two strings for canonical equivalence.
1043      * Further options include case-insensitive comparison and
1044      * code point order (as opposed to code unit order).
1045      *
1046      * Canonical equivalence between two strings is defined as their normalized
1047      * forms (NFD or NFC) being identical.
1048      * This function compares strings incrementally instead of normalizing
1049      * (and optionally case-folding) both strings entirely,
1050      * improving performance significantly.
1051      *
1052      * Bulk normalization is only necessary if the strings do not fulfill the
1053      * FCD conditions. Only in this case, and only if the strings are relatively
1054      * long, is memory allocated temporarily.
1055      * For FCD strings and short non-FCD strings there is no memory allocation.
1056      *
1057      * Semantically, this is equivalent to
1058      *   strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
1059      * where code point order and foldCase are all optional.
1060      *
1061      * @param s1        First source character array.
1062      * @param s1Start   start index of source
1063      * @param s1Limit   limit of the source
1064      *
1065      * @param s2        Second source character array.
1066      * @param s2Start   start index of the source
1067      * @param s2Limit   limit of the source
1068      *
1069      * @param options A bit set of options:
1070      *   - FOLD_CASE_DEFAULT or 0 is used for default options:
1071      *     Case-sensitive comparison in code unit order, and the input strings
1072      *     are quick-checked for FCD.
1073      *
1074      *   - INPUT_IS_FCD
1075      *     Set if the caller knows that both s1 and s2 fulfill the FCD
     *     conditions. If not set, the function will quickCheck for FCD
1077      *     and normalize if necessary.
1078      *
1079      *   - COMPARE_CODE_POINT_ORDER
1080      *     Set to choose code point order instead of code unit order
1081      *
1082      *   - COMPARE_IGNORE_CASE
1083      *     Set to compare strings case-insensitively using case folding,
1084      *     instead of case-sensitively.
1085      *     If set, then the following case folding options are used.
1086      *
1087      *
1088      * @return <0 or 0 or >0 as usual for string comparisons
1089      *
1090      * @see #normalize
1091      * @see #FCD
1092      * @stable ICU 2.8
1093      */
compare(char[] s1, int s1Start, int s1Limit, char[] s2, int s2Start, int s2Limit, int options)1094     public static int compare(char[] s1, int s1Start, int s1Limit,
1095                               char[] s2, int s2Start, int s2Limit,
1096                               int options) {
1097         if( s1==null || s1Start<0 || s1Limit<0 ||
1098             s2==null || s2Start<0 || s2Limit<0 ||
1099             s1Limit<s1Start || s2Limit<s2Start
1100         ) {
1101             throw new IllegalArgumentException();
1102         }
1103         return internalCompare(CharBuffer.wrap(s1, s1Start, s1Limit-s1Start),
1104                                CharBuffer.wrap(s2, s2Start, s2Limit-s2Start),
1105                                options);
1106     }
1107 
1108     /**
1109      * Compare two strings for canonical equivalence.
1110      * Further options include case-insensitive comparison and
1111      * code point order (as opposed to code unit order).
1112      *
1113      * Canonical equivalence between two strings is defined as their normalized
1114      * forms (NFD or NFC) being identical.
1115      * This function compares strings incrementally instead of normalizing
1116      * (and optionally case-folding) both strings entirely,
1117      * improving performance significantly.
1118      *
1119      * Bulk normalization is only necessary if the strings do not fulfill the
1120      * FCD conditions. Only in this case, and only if the strings are relatively
1121      * long, is memory allocated temporarily.
1122      * For FCD strings and short non-FCD strings there is no memory allocation.
1123      *
1124      * Semantically, this is equivalent to
1125      *   strcmp[CodePointOrder](foldCase(NFD(s1)), foldCase(NFD(s2)))
1126      * where code point order and foldCase are all optional.
1127      *
1128      * @param s1 First source string.
1129      * @param s2 Second source string.
1130      *
1131      * @param options A bit set of options:
1132      *   - FOLD_CASE_DEFAULT or 0 is used for default options:
1133      *     Case-sensitive comparison in code unit order, and the input strings
1134      *     are quick-checked for FCD.
1135      *
1136      *   - INPUT_IS_FCD
1137      *     Set if the caller knows that both s1 and s2 fulfill the FCD
1138      *     conditions. If not set, the function will quickCheck for FCD
1139      *     and normalize if necessary.
1140      *
1141      *   - COMPARE_CODE_POINT_ORDER
1142      *     Set to choose code point order instead of code unit order
1143      *
1144      *   - COMPARE_IGNORE_CASE
1145      *     Set to compare strings case-insensitively using case folding,
1146      *     instead of case-sensitively.
1147      *     If set, then the following case folding options are used.
1148      *
1149      * @return <0 or 0 or >0 as usual for string comparisons
1150      *
1151      * @see #normalize
1152      * @see #FCD
1153      * @stable ICU 2.8
1154      */
compare(String s1, String s2, int options)1155     public static int compare(String s1, String s2, int options) {
1156         return internalCompare(s1, s2, options);
1157     }
1158 
1159     /**
1160      * Compare two strings for canonical equivalence.
1161      * Further options include case-insensitive comparison and
1162      * code point order (as opposed to code unit order).
1163      * Convenience method.
1164      *
1165      * @param s1 First source string.
1166      * @param s2 Second source string.
1167      *
1168      * @param options A bit set of options:
1169      *   - FOLD_CASE_DEFAULT or 0 is used for default options:
1170      *     Case-sensitive comparison in code unit order, and the input strings
1171      *     are quick-checked for FCD.
1172      *
1173      *   - INPUT_IS_FCD
1174      *     Set if the caller knows that both s1 and s2 fulfill the FCD
1175      *     conditions. If not set, the function will quickCheck for FCD
1176      *     and normalize if necessary.
1177      *
1178      *   - COMPARE_CODE_POINT_ORDER
1179      *     Set to choose code point order instead of code unit order
1180      *
1181      *   - COMPARE_IGNORE_CASE
1182      *     Set to compare strings case-insensitively using case folding,
1183      *     instead of case-sensitively.
1184      *     If set, then the following case folding options are used.
1185      *
1186      * @return <0 or 0 or >0 as usual for string comparisons
1187      *
1188      * @see #normalize
1189      * @see #FCD
1190      * @stable ICU 2.8
1191      */
compare(char[] s1, char[] s2, int options)1192     public static int compare(char[] s1, char[] s2, int options) {
1193         return internalCompare(CharBuffer.wrap(s1), CharBuffer.wrap(s2), options);
1194     }
1195 
1196     /**
1197      * Convenience method that can have faster implementation
1198      * by not allocating buffers.
     * @param char32a    the first code point to be compared
1200      * @param char32b    the second code point
1201      * @param options    A bit set of options
1202      * @stable ICU 2.8
1203      */
compare(int char32a, int char32b, int options)1204     public static int compare(int char32a, int char32b, int options) {
1205         return internalCompare(UTF16.valueOf(char32a), UTF16.valueOf(char32b), options|INPUT_IS_FCD);
1206     }
1207 
1208     /**
1209      * Convenience method that can have faster implementation
1210      * by not allocating buffers.
1211      * @param char32a   the first code point to be checked against
1212      * @param str2      the second string
1213      * @param options   A bit set of options
1214      * @stable ICU 2.8
1215      */
compare(int char32a, String str2, int options)1216     public static int compare(int char32a, String str2, int options) {
1217         return internalCompare(UTF16.valueOf(char32a), str2, options);
1218     }
1219 
1220     /* Concatenation of normalized strings --------------------------------- */
1221     /**
1222      * Concatenate normalized strings, making sure that the result is normalized
1223      * as well.
1224      *
1225      * If both the left and the right strings are in
1226      * the normalization form according to "mode",
1227      * then the result will be
1228      *
1229      * <code>
1230      *     dest=normalize(left+right, mode)
1231      * </code>
1232      *
1233      * With the input strings already being normalized,
1234      * this function will use next() and previous()
1235      * to find the adjacent end pieces of the input strings.
1236      * Only the concatenation of these end pieces will be normalized and
1237      * then concatenated with the remaining parts of the input strings.
1238      *
1239      * It is allowed to have dest==left to avoid copying the entire left string.
1240      *
1241      * @param left Left source array, may be same as dest.
1242      * @param leftStart start in the left array.
1243      * @param leftLimit limit in the left array (==length)
1244      * @param right Right source array.
1245      * @param rightStart start in the right array.
1246      * @param rightLimit limit in the right array (==length)
     * @param dest The output buffer; must not be null
     *              (an IllegalArgumentException is thrown otherwise).
1249      * @param destStart start in the destination array
1250      * @param destLimit limit in the destination array (==length)
1251      * @param mode The normalization mode.
1252      * @param options The normalization options, ORed together (0 for no options).
1253      * @return Length of output (number of chars) when successful or
1254      *          IndexOutOfBoundsException
1255      * @exception IndexOutOfBoundsException whose message has the string
1256      *             representation of destination capacity required.
1257      * @see #normalize
1258      * @see #next
1259      * @see #previous
1260      * @exception IndexOutOfBoundsException if target capacity is less than the
1261      *             required length
1262      * @stable ICU 2.8
1263      */
concatenate(char[] left, int leftStart, int leftLimit, char[] right, int rightStart, int rightLimit, char[] dest, int destStart, int destLimit, Normalizer.Mode mode, int options)1264     public static int concatenate(char[] left,  int leftStart,  int leftLimit,
1265                                   char[] right, int rightStart, int rightLimit,
1266                                   char[] dest,  int destStart,  int destLimit,
1267                                   Normalizer.Mode mode, int options) {
1268         if(dest == null) {
1269             throw new IllegalArgumentException();
1270         }
1271 
1272         /* check for overlapping right and destination */
1273         if (right == dest && rightStart < destLimit && destStart < rightLimit) {
1274             throw new IllegalArgumentException("overlapping right and dst ranges");
1275         }
1276 
1277         /* allow left==dest */
1278         StringBuilder destBuilder=new StringBuilder(leftLimit-leftStart+rightLimit-rightStart+16);
1279         destBuilder.append(left, leftStart, leftLimit-leftStart);
1280         CharBuffer rightBuffer=CharBuffer.wrap(right, rightStart, rightLimit-rightStart);
1281         mode.getNormalizer2(options).append(destBuilder, rightBuffer);
1282         int destLength=destBuilder.length();
1283         if(destLength<=(destLimit-destStart)) {
1284             destBuilder.getChars(0, destLength, dest, destStart);
1285             return destLength;
1286         } else {
1287             throw new IndexOutOfBoundsException(Integer.toString(destLength));
1288         }
1289     }
1290 
1291     /**
1292      * Concatenate normalized strings, making sure that the result is normalized
1293      * as well.
1294      *
1295      * If both the left and the right strings are in
1296      * the normalization form according to "mode",
1297      * then the result will be
1298      *
1299      * <code>
1300      *     dest=normalize(left+right, mode)
1301      * </code>
1302      *
1303      * For details see concatenate
1304      *
1305      * @param left Left source string.
1306      * @param right Right source string.
1307      * @param mode The normalization mode.
1308      * @param options The normalization options, ORed together (0 for no options).
1309      * @return result
1310      *
1311      * @see #concatenate
1312      * @see #normalize
1313      * @see #next
1314      * @see #previous
1315      * @see #concatenate
1316      * @stable ICU 2.8
1317      */
concatenate(char[] left, char[] right,Mode mode, int options)1318     public static String concatenate(char[] left, char[] right,Mode mode, int options) {
1319         StringBuilder dest=new StringBuilder(left.length+right.length+16).append(left);
1320         return mode.getNormalizer2(options).append(dest, CharBuffer.wrap(right)).toString();
1321     }
1322 
1323     /**
1324      * Concatenate normalized strings, making sure that the result is normalized
1325      * as well.
1326      *
1327      * If both the left and the right strings are in
1328      * the normalization form according to "mode",
1329      * then the result will be
1330      *
1331      * <code>
1332      *     dest=normalize(left+right, mode)
1333      * </code>
1334      *
1335      * With the input strings already being normalized,
1336      * this function will use next() and previous()
1337      * to find the adjacent end pieces of the input strings.
1338      * Only the concatenation of these end pieces will be normalized and
1339      * then concatenated with the remaining parts of the input strings.
1340      *
1341      * @param left Left source string.
1342      * @param right Right source string.
1343      * @param mode The normalization mode.
1344      * @param options The normalization options, ORed together (0 for no options).
1345      * @return result
1346      *
1347      * @see #concatenate
1348      * @see #normalize
1349      * @see #next
1350      * @see #previous
1351      * @see #concatenate
1352      * @stable ICU 2.8
1353      */
concatenate(String left, String right, Mode mode, int options)1354     public static String concatenate(String left, String right, Mode mode, int options) {
1355         StringBuilder dest=new StringBuilder(left.length()+right.length()+16).append(left);
1356         return mode.getNormalizer2(options).append(dest, right).toString();
1357     }
1358 
1359     /**
1360      * Gets the FC_NFKC closure value.
1361      * @param c The code point whose closure value is to be retrieved
1362      * @param dest The char array to receive the closure value
1363      * @return the length of the closure value; 0 if there is none
1364      * @stable ICU 3.8
1365      */
getFC_NFKC_Closure(int c,char[] dest)1366     public static int getFC_NFKC_Closure(int c,char[] dest) {
1367         String closure=getFC_NFKC_Closure(c);
1368         int length=closure.length();
1369         if(length!=0 && dest!=null && length<=dest.length) {
1370             closure.getChars(0, length, dest, 0);
1371         }
1372         return length;
1373     }
1374     /**
1375      * Gets the FC_NFKC closure value.
1376      * @param c The code point whose closure value is to be retrieved
1377      * @return String representation of the closure value; "" if there is none
1378      * @stable ICU 3.8
1379      */
getFC_NFKC_Closure(int c)1380     public static String getFC_NFKC_Closure(int c) {
1381         // Compute the FC_NFKC_Closure on the fly:
1382         // We have the API for complete coverage of Unicode properties, although
1383         // this value by itself is not useful via API.
1384         // (What could be useful is a custom normalization table that combines
1385         // case folding and NFKC.)
1386         // For the derivation, see Unicode's DerivedNormalizationProps.txt.
1387         Normalizer2 nfkc=NFKCModeImpl.INSTANCE.normalizer2;
1388         UCaseProps csp=UCaseProps.INSTANCE;
1389         // first: b = NFKC(Fold(a))
1390         StringBuilder folded=new StringBuilder();
1391         int folded1Length=csp.toFullFolding(c, folded, 0);
1392         if(folded1Length<0) {
1393             Normalizer2Impl nfkcImpl=((Norm2AllModes.Normalizer2WithImpl)nfkc).impl;
1394             if(nfkcImpl.getCompQuickCheck(nfkcImpl.getNorm16(c))!=0) {
1395                 return "";  // c does not change at all under CaseFolding+NFKC
1396             }
1397             folded.appendCodePoint(c);
1398         } else {
1399             if(folded1Length>UCaseProps.MAX_STRING_LENGTH) {
1400                 folded.appendCodePoint(folded1Length);
1401             }
1402         }
1403         String kc1=nfkc.normalize(folded);
1404         // second: c = NFKC(Fold(b))
1405         String kc2=nfkc.normalize(UCharacter.foldCase(kc1, 0));
1406         // if (c != b) add the mapping from a to c
1407         if(kc1.equals(kc2)) {
1408             return "";
1409         } else {
1410             return kc2;
1411         }
1412     }
1413 
1414     //-------------------------------------------------------------------------
1415     // Iteration API
1416     //-------------------------------------------------------------------------
1417 
1418     /**
1419      * Return the current character in the normalized text.
1420      * @return The codepoint as an int
1421      * @stable ICU 2.8
1422      */
current()1423     public int current() {
1424         if(bufferPos<buffer.length() || nextNormalize()) {
1425             return buffer.codePointAt(bufferPos);
1426         } else {
1427             return DONE;
1428         }
1429     }
1430 
1431     /**
1432      * Return the next character in the normalized text and advance
1433      * the iteration position by one.  If the end
1434      * of the text has already been reached, {@link #DONE} is returned.
1435      * @return The codepoint as an int
1436      * @stable ICU 2.8
1437      */
next()1438     public int next() {
1439         if(bufferPos<buffer.length() ||  nextNormalize()) {
1440             int c=buffer.codePointAt(bufferPos);
1441             bufferPos+=Character.charCount(c);
1442             return c;
1443         } else {
1444             return DONE;
1445         }
1446     }
1447 
1448 
1449     /**
1450      * Return the previous character in the normalized text and decrement
1451      * the iteration position by one.  If the beginning
1452      * of the text has already been reached, {@link #DONE} is returned.
1453      * @return The codepoint as an int
1454      * @stable ICU 2.8
1455      */
previous()1456     public int previous() {
1457         if(bufferPos>0 || previousNormalize()) {
1458             int c=buffer.codePointBefore(bufferPos);
1459             bufferPos-=Character.charCount(c);
1460             return c;
1461         } else {
1462             return DONE;
1463         }
1464     }
1465 
1466     /**
1467      * Reset the index to the beginning of the text.
     * This is equivalent to setIndexOnly(startIndex()).
1469      * @stable ICU 2.8
1470      */
reset()1471     public void reset() {
1472         text.setToStart();
1473         currentIndex=nextIndex=0;
1474         clearBuffer();
1475     }
1476 
1477     /**
1478      * Set the iteration position in the input text that is being normalized,
1479      * without any immediate normalization.
1480      * After setIndexOnly(), getIndex() will return the same index that is
1481      * specified here.
1482      *
1483      * @param index the desired index in the input text.
1484      * @stable ICU 2.8
1485      */
setIndexOnly(int index)1486     public void setIndexOnly(int index) {
1487         text.setIndex(index);  // validates index
1488         currentIndex=nextIndex=index;
1489         clearBuffer();
1490     }
1491 
1492     /**
1493      * Set the iteration position in the input text that is being normalized
1494      * and return the first normalized character at that position.
1495      * <p>
1496      * <b>Note:</b> This method sets the position in the <em>input</em> text,
1497      * while {@link #next} and {@link #previous} iterate through characters
1498      * in the normalized <em>output</em>.  This means that there is not
1499      * necessarily a one-to-one correspondence between characters returned
1500      * by <tt>next</tt> and <tt>previous</tt> and the indices passed to and
1501      * returned from <tt>setIndex</tt> and {@link #getIndex}.
1502      * <p>
1503      * @param index the desired index in the input text.
1504      *
1505      * @return   the first normalized character that is the result of iterating
1506      *            forward starting at the given index.
1507      *
1508      * @throws IllegalArgumentException if the given index is less than
1509      *          {@link #getBeginIndex} or greater than {@link #getEndIndex}.
1510      * @deprecated ICU 3.2
1511      * @obsolete ICU 3.2
1512      */
1513     @Deprecated
1514      ///CLOVER:OFF
setIndex(int index)1515      public int setIndex(int index) {
1516          setIndexOnly(index);
1517          return current();
1518      }
1519      ///CLOVER:ON
1520     /**
1521      * Retrieve the index of the start of the input text. This is the begin
1522      * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
1523      * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
1524      * @deprecated ICU 2.2. Use startIndex() instead.
     * @return The start index of the input text, which is always 0
1526      * @see #startIndex
1527      */
1528     @Deprecated
getBeginIndex()1529     public int getBeginIndex() {
1530         return 0;
1531     }
1532 
1533     /**
1534      * Retrieve the index of the end of the input text.  This is the end index
1535      * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
1536      * over which this <tt>Normalizer</tt> is iterating
1537      * @deprecated ICU 2.2. Use endIndex() instead.
     * @return The end index of the input text
1539      * @see #endIndex
1540      */
1541     @Deprecated
getEndIndex()1542     public int getEndIndex() {
1543         return endIndex();
1544     }
1545     /**
1546      * Return the first character in the normalized text.  This resets
1547      * the <tt>Normalizer's</tt> position to the beginning of the text.
1548      * @return The codepoint as an int
1549      * @stable ICU 2.8
1550      */
first()1551     public int first() {
1552         reset();
1553         return next();
1554     }
1555 
1556     /**
1557      * Return the last character in the normalized text.  This resets
     * the <tt>Normalizer's</tt> position to be just before
     * the input text corresponding to that normalized character.
1560      * @return The codepoint as an int
1561      * @stable ICU 2.8
1562      */
last()1563     public int last() {
1564         text.setToLimit();
1565         currentIndex=nextIndex=text.getIndex();
1566         clearBuffer();
1567         return previous();
1568     }
1569 
1570     /**
1571      * Retrieve the current iteration position in the input text that is
1572      * being normalized.  This method is useful in applications such as
1573      * searching, where you need to be able to determine the position in
1574      * the input text that corresponds to a given normalized output character.
1575      * <p>
     * <b>Note:</b> This method returns the position in the <em>input</em>, while
1577      * {@link #next} and {@link #previous} iterate through characters in the
1578      * <em>output</em>.  This means that there is not necessarily a one-to-one
1579      * correspondence between characters returned by <tt>next</tt> and
1580      * <tt>previous</tt> and the indices passed to and returned from
1581      * <tt>setIndex</tt> and {@link #getIndex}.
1582      * @return The current iteration position
1583      * @stable ICU 2.8
1584      */
getIndex()1585     public int getIndex() {
1586         if(bufferPos<buffer.length()) {
1587             return currentIndex;
1588         } else {
1589             return nextIndex;
1590         }
1591     }
1592 
1593     /**
1594      * Retrieve the index of the start of the input text. This is the begin
1595      * index of the <tt>CharacterIterator</tt> or the start (i.e. 0) of the
1596      * <tt>String</tt> over which this <tt>Normalizer</tt> is iterating
     * @return The start index of the input text, which is always 0
1598      * @stable ICU 2.8
1599      */
startIndex()1600     public int startIndex() {
1601         return 0;
1602     }
1603 
1604     /**
1605      * Retrieve the index of the end of the input text.  This is the end index
1606      * of the <tt>CharacterIterator</tt> or the length of the <tt>String</tt>
1607      * over which this <tt>Normalizer</tt> is iterating
     * @return The end index of the input text, i.e. its length
1609      * @stable ICU 2.8
1610      */
endIndex()1611     public int endIndex() {
1612         return text.getLength();
1613     }
1614 
1615     //-------------------------------------------------------------------------
1616     // Iterator attributes
1617     //-------------------------------------------------------------------------
1618     /**
1619      * Set the normalization mode for this object.
1620      * <p>
1621      * <b>Note:</b>If the normalization mode is changed while iterating
1622      * over a string, calls to {@link #next} and {@link #previous} may
     * return previously buffered characters in the old normalization mode
1624      * until the iteration is able to re-sync at the next base character.
1625      * It is safest to call {@link #setText setText()}, {@link #first},
1626      * {@link #last}, etc. after calling <tt>setMode</tt>.
1627      * <p>
1628      * @param newMode the new mode for this <tt>Normalizer</tt>.
1629      * The supported modes are:
1630      * <ul>
     *  <li>{@link #NFC}    - Unicode canonical decomposition
     *                        followed by canonical composition.
     *  <li>{@link #NFKC}   - Unicode compatibility decomposition
     *                        followed by canonical composition.
1635      *  <li>{@link #NFD}    - Unicode canonical decomposition
1636      *  <li>{@link #NFKD}   - Unicode compatibility decomposition.
1637      *  <li>{@link #NONE}   - Do nothing but return characters
1638      *                        from the underlying input text.
1639      * </ul>
1640      *
1641      * @see #getMode
1642      * @stable ICU 2.8
1643      */
setMode(Mode newMode)1644     public void setMode(Mode newMode) {
1645         mode = newMode;
1646         norm2 = mode.getNormalizer2(options);
1647     }
1648     /**
1649      * Return the basic operation performed by this <tt>Normalizer</tt>
1650      *
1651      * @see #setMode
1652      * @stable ICU 2.8
1653      */
getMode()1654     public Mode getMode() {
1655         return mode;
1656     }
1657     /**
1658      * Set options that affect this <tt>Normalizer</tt>'s operation.
1659      * Options do not change the basic composition or decomposition operation
     * that is being performed, but they control whether
1661      * certain optional portions of the operation are done.
1662      * Currently the only available option is:
1663      * <p>
1664      * <ul>
1665      *   <li>{@link #UNICODE_3_2} - Use Normalization conforming to Unicode version 3.2.
1666      * </ul>
1667      * <p>
1668      * @param   option  the option whose value is to be set.
1669      * @param   value   the new setting for the option.  Use <tt>true</tt> to
1670      *                  turn the option on and <tt>false</tt> to turn it off.
1671      *
1672      * @see #getOption
1673      * @stable ICU 2.6
1674      */
setOption(int option,boolean value)1675     public void setOption(int option,boolean value) {
1676         if (value) {
1677             options |= option;
1678         } else {
1679             options &= (~option);
1680         }
1681         norm2 = mode.getNormalizer2(options);
1682     }
1683 
1684     /**
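     * Determine whether an option is turned on or off.
     * <p>
     * @param option the option (bit mask) to test.
     * @return 1 if any bit of <tt>option</tt> is set in this object's options, 0 otherwise.
     * @see #setOption
     * @stable ICU 2.6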
1689      */
getOption(int option)1690     public int getOption(int option) {
1691         if((options & option)!=0) {
1692             return 1 ;
1693         } else {
1694             return 0;
1695         }
1696     }
1697 
1698     /**
1699      * Gets the underlying text storage
1700      * @param fillIn the char buffer to fill the UTF-16 units.
1701      *         The length of the buffer should be equal to the length of the
1702      *         underlying text storage
1703      * @throws IndexOutOfBoundsException If the index passed for the array is invalid.
1704      * @see   #getLength
1705      * @stable ICU 2.8
1706      */
getText(char[] fillIn)1707     public int getText(char[] fillIn) {
1708         return text.getText(fillIn);
1709     }
1710 
1711     /**
1712      * Gets the length of underlying text storage
1713      * @return the length
1714      * @stable ICU 2.8
1715      */
getLength()1716     public int getLength() {
1717         return text.getLength();
1718     }
1719 
1720     /**
1721      * Returns the text under iteration as a string
1722      * @return a copy of the text under iteration.
1723      * @stable ICU 2.8
1724      */
getText()1725     public String getText() {
1726         return text.getText();
1727     }
1728 
1729     /**
1730      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1731      * The iteration position is set to the beginning of the input text.
1732      * @param newText   The new string to be normalized.
1733      * @stable ICU 2.8
1734      */
setText(StringBuffer newText)1735     public void setText(StringBuffer newText) {
1736         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1737         if (newIter == null) {
1738             throw new IllegalStateException("Could not create a new UCharacterIterator");
1739         }
1740         text = newIter;
1741         reset();
1742     }
1743 
1744     /**
1745      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1746      * The iteration position is set to the beginning of the input text.
1747      * @param newText   The new string to be normalized.
1748      * @stable ICU 2.8
1749      */
setText(char[] newText)1750     public void setText(char[] newText) {
1751         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1752         if (newIter == null) {
1753             throw new IllegalStateException("Could not create a new UCharacterIterator");
1754         }
1755         text = newIter;
1756         reset();
1757     }
1758 
1759     /**
1760      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1761      * The iteration position is set to the beginning of the input text.
1762      * @param newText   The new string to be normalized.
1763      * @stable ICU 2.8
1764      */
setText(String newText)1765     public void setText(String newText) {
1766         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1767         if (newIter == null) {
1768             throw new IllegalStateException("Could not create a new UCharacterIterator");
1769         }
1770         text = newIter;
1771         reset();
1772     }
1773 
1774     /**
1775      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1776      * The iteration position is set to the beginning of the input text.
1777      * @param newText   The new string to be normalized.
1778      * @stable ICU 2.8
1779      */
setText(CharacterIterator newText)1780     public void setText(CharacterIterator newText) {
1781         UCharacterIterator newIter = UCharacterIterator.getInstance(newText);
1782         if (newIter == null) {
1783             throw new IllegalStateException("Could not create a new UCharacterIterator");
1784         }
1785         text = newIter;
1786         reset();
1787     }
1788 
1789     /**
1790      * Set the input text over which this <tt>Normalizer</tt> will iterate.
1791      * The iteration position is set to the beginning of the string.
1792      * @param newText   The new string to be normalized.
1793      * @stable ICU 2.8
1794      */
setText(UCharacterIterator newText)1795     public void setText(UCharacterIterator newText) {
1796         try{
1797             UCharacterIterator newIter = (UCharacterIterator)newText.clone();
1798             if (newIter == null) {
1799                 throw new IllegalStateException("Could not create a new UCharacterIterator");
1800             }
1801             text = newIter;
1802             reset();
1803         }catch(CloneNotSupportedException e) {
1804             throw new ICUCloneNotSupportedException("Could not clone the UCharacterIterator", e);
1805         }
1806     }
1807 
clearBuffer()1808     private void clearBuffer() {
1809         buffer.setLength(0);
1810         bufferPos=0;
1811     }
1812 
nextNormalize()1813     private boolean nextNormalize() {
1814         clearBuffer();
1815         currentIndex=nextIndex;
1816         text.setIndex(nextIndex);
1817         // Skip at least one character so we make progress.
1818         int c=text.nextCodePoint();
1819         if(c<0) {
1820             return false;
1821         }
1822         StringBuilder segment=new StringBuilder().appendCodePoint(c);
1823         while((c=text.nextCodePoint())>=0) {
1824             if(norm2.hasBoundaryBefore(c)) {
1825                 text.moveCodePointIndex(-1);
1826                 break;
1827             }
1828             segment.appendCodePoint(c);
1829         }
1830         nextIndex=text.getIndex();
1831         norm2.normalize(segment, buffer);
1832         return buffer.length()!=0;
1833     }
1834 
previousNormalize()1835     private boolean previousNormalize() {
1836         clearBuffer();
1837         nextIndex=currentIndex;
1838         text.setIndex(currentIndex);
1839         StringBuilder segment=new StringBuilder();
1840         int c;
1841         while((c=text.previousCodePoint())>=0) {
1842             if(c<=0xffff) {
1843                 segment.insert(0, (char)c);
1844             } else {
1845                 segment.insert(0, Character.toChars(c));
1846             }
1847             if(norm2.hasBoundaryBefore(c)) {
1848                 break;
1849             }
1850         }
1851         currentIndex=text.getIndex();
1852         norm2.normalize(segment, buffer);
1853         bufferPos=buffer.length();
1854         return buffer.length()!=0;
1855     }
1856 
1857     /* compare canonically equivalent ------------------------------------------- */
1858 
1859     // TODO: Broaden the public compare(String, String, options) API like this. Ticket #7407
internalCompare(CharSequence s1, CharSequence s2, int options)1860     private static int internalCompare(CharSequence s1, CharSequence s2, int options) {
1861         int normOptions=options>>>COMPARE_NORM_OPTIONS_SHIFT;
1862         options|= COMPARE_EQUIV;
1863 
1864         /*
1865          * UAX #21 Case Mappings, as fixed for Unicode version 4
1866          * (see Jitterbug 2021), defines a canonical caseless match as
1867          *
1868          * A string X is a canonical caseless match
1869          * for a string Y if and only if
1870          * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
1871          *
1872          * For better performance, we check for FCD (or let the caller tell us that
1873          * both strings are in FCD) for the inner normalization.
1874          * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that
1875          * case-folding preserves the FCD-ness of a string.
1876          * The outer normalization is then only performed by NormalizerImpl.cmpEquivFold()
1877          * when there is a difference.
1878          *
1879          * Exception: When using the Turkic case-folding option, we do perform
1880          * full NFD first. This is because in the Turkic case precomposed characters
1881          * with 0049 capital I or 0069 small i fold differently whether they
1882          * are first decomposed or not, so an FCD check - a check only for
1883          * canonical order - is not sufficient.
1884          */
1885         if((options&INPUT_IS_FCD)==0 || (options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) {
1886             Normalizer2 n2;
1887             if((options&FOLD_CASE_EXCLUDE_SPECIAL_I)!=0) {
1888                 n2=NFD.getNormalizer2(normOptions);
1889             } else {
1890                 n2=FCD.getNormalizer2(normOptions);
1891             }
1892 
1893             // check if s1 and/or s2 fulfill the FCD conditions
1894             int spanQCYes1=n2.spanQuickCheckYes(s1);
1895             int spanQCYes2=n2.spanQuickCheckYes(s2);
1896 
1897             /*
1898              * ICU 2.4 had a further optimization:
1899              * If both strings were not in FCD, then they were both NFD'ed,
1900              * and the COMPARE_EQUIV option was turned off.
1901              * It is not entirely clear that this is valid with the current
1902              * definition of the canonical caseless match.
1903              * Therefore, ICU 2.6 removes that optimization.
1904              */
1905 
1906             if(spanQCYes1<s1.length()) {
1907                 StringBuilder fcd1=new StringBuilder(s1.length()+16).append(s1, 0, spanQCYes1);
1908                 s1=n2.normalizeSecondAndAppend(fcd1, s1.subSequence(spanQCYes1, s1.length()));
1909             }
1910             if(spanQCYes2<s2.length()) {
1911                 StringBuilder fcd2=new StringBuilder(s2.length()+16).append(s2, 0, spanQCYes2);
1912                 s2=n2.normalizeSecondAndAppend(fcd2, s2.subSequence(spanQCYes2, s2.length()));
1913             }
1914         }
1915 
1916         return cmpEquivFold(s1, s2, options);
1917     }
1918 
1919     /*
1920      * Compare two strings for canonical equivalence.
1921      * Further options include case-insensitive comparison and
1922      * code point order (as opposed to code unit order).
1923      *
1924      * In this function, canonical equivalence is optional as well.
1925      * If canonical equivalence is tested, then both strings must fulfill
1926      * the FCD check.
1927      *
1928      * Semantically, this is equivalent to
1929      *   strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2)))
1930      * where code point order, NFD and foldCase are all optional.
1931      *
1932      * String comparisons almost always yield results before processing both strings
1933      * completely.
1934      * They are generally more efficient working incrementally instead of
1935      * performing the sub-processing (strlen, normalization, case-folding)
1936      * on the entire strings first.
1937      *
     * It is also unnecessary to normalize identical characters.
1939      *
1940      * This function works in principle as follows:
1941      *
1942      * loop {
1943      *   get one code unit c1 from s1 (-1 if end of source)
1944      *   get one code unit c2 from s2 (-1 if end of source)
1945      *
1946      *   if(either string finished) {
1947      *     return result;
1948      *   }
1949      *   if(c1==c2) {
1950      *     continue;
1951      *   }
1952      *
1953      *   // c1!=c2
1954      *   try to decompose/case-fold c1/c2, and continue if one does;
1955      *
1956      *   // still c1!=c2 and neither decomposes/case-folds, return result
1957      *   return c1-c2;
1958      * }
1959      *
1960      * When a character decomposes, then the pointer for that source changes to
1961      * the decomposition, pushing the previous pointer onto a stack.
1962      * When the end of the decomposition is reached, then the code unit reader
1963      * pops the previous source from the stack.
1964      * (Same for case-folding.)
1965      *
1966      * This is complicated further by operating on variable-width UTF-16.
1967      * The top part of the loop works on code units, while lookups for decomposition
1968      * and case-folding need code points.
1969      * Code points are assembled after the equality/end-of-source part.
1970      * The source pointer is only advanced beyond all code units when the code point
1971      * actually decomposes/case-folds.
1972      *
1973      * If we were on a trail surrogate unit when assembling a code point,
1974      * and the code point decomposes/case-folds, then the decomposition/folding
1975      * result must be compared with the part of the other string that corresponds to
1976      * this string's lead surrogate.
1977      * Since we only assemble a code point when hitting a trail unit when the
1978      * preceding lead units were identical, we back up the other string by one unit
1979      * in such a case.
1980      *
1981      * The optional code point order comparison at the end works with
1982      * the same fix-up as the other code point order comparison functions.
1983      * See ustring.c and the comment near the end of this function.
1984      *
1985      * Assumption: A decomposition or case-folding result string never contains
1986      * a single surrogate. This is a safe assumption in the Unicode Standard.
1987      * Therefore, we do not need to check for surrogate pairs across
1988      * decomposition/case-folding boundaries.
1989      *
1990      * Further assumptions (see verifications tstnorm.cpp):
1991      * The API function checks for FCD first, while the core function
1992      * first case-folds and then decomposes. This requires that case-folding does not
1993      * un-FCD any strings.
1994      *
1995      * The API function may also NFD the input and turn off decomposition.
1996      * This requires that case-folding does not un-NFD strings either.
1997      *
1998      * TODO If any of the above two assumptions is violated,
1999      * then this entire code must be re-thought.
2000      * If this happens, then a simple solution is to case-fold both strings up front
2001      * and to turn off UNORM_INPUT_IS_FCD.
2002      * We already do this when not both strings are in FCD because makeFCD
2003      * would be a partial NFD before the case folding, which does not work.
2004      * Note that all of this is only a problem when case-folding _and_
2005      * canonical equivalence come together.
2006      * (Comments in unorm_compare() are more up to date than this TODO.)
2007      */
2008 
2009     /* stack element for previous-level source/decomposition pointers */
2010     private static final class CmpEquivLevel {
2011         CharSequence cs;
2012         int s;
2013     };
createCmpEquivLevelStack()2014     private static final CmpEquivLevel[] createCmpEquivLevelStack() {
2015         return new CmpEquivLevel[] {
2016             new CmpEquivLevel(), new CmpEquivLevel()
2017         };
2018     }
2019 
2020     /**
2021      * Internal option for unorm_cmpEquivFold() for decomposing.
2022      * If not set, just do strcasecmp().
2023      */
2024     private static final int COMPARE_EQUIV=0x80000;
2025 
2026     /* internal function; package visibility for use by UTF16.StringComparator */
cmpEquivFold(CharSequence cs1, CharSequence cs2, int options)2027     /*package*/ static int cmpEquivFold(CharSequence cs1, CharSequence cs2, int options) {
2028         Normalizer2Impl nfcImpl;
2029         UCaseProps csp;
2030 
2031         /* current-level start/limit - s1/s2 as current */
2032         int s1, s2, limit1, limit2;
2033 
2034         /* decomposition and case folding variables */
2035         int length;
2036 
2037         /* stacks of previous-level start/current/limit */
2038         CmpEquivLevel[] stack1=null, stack2=null;
2039 
2040         /* buffers for algorithmic decompositions */
2041         String decomp1, decomp2;
2042 
2043         /* case folding buffers, only use current-level start/limit */
2044         StringBuilder fold1, fold2;
2045 
2046         /* track which is the current level per string */
2047         int level1, level2;
2048 
2049         /* current code units, and code points for lookups */
2050         int c1, c2, cp1, cp2;
2051 
2052         /* no argument error checking because this itself is not an API */
2053 
2054         /*
2055          * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set
2056          * otherwise this function must behave exactly as uprv_strCompare()
2057          * not checking for that here makes testing this function easier
2058          */
2059 
2060         /* normalization/properties data loaded? */
2061         if((options&COMPARE_EQUIV)!=0) {
2062             nfcImpl=Norm2AllModes.getNFCInstance().impl;
2063         } else {
2064             nfcImpl=null;
2065         }
2066         if((options&COMPARE_IGNORE_CASE)!=0) {
2067             csp=UCaseProps.INSTANCE;
2068             fold1=new StringBuilder();
2069             fold2=new StringBuilder();
2070         } else {
2071             csp=null;
2072             fold1=fold2=null;
2073         }
2074 
2075         /* initialize */
2076         s1=0;
2077         limit1=cs1.length();
2078         s2=0;
2079         limit2=cs2.length();
2080 
2081         level1=level2=0;
2082         c1=c2=-1;
2083 
2084         /* comparison loop */
2085         for(;;) {
2086             /*
2087              * here a code unit value of -1 means "get another code unit"
2088              * below it will mean "this source is finished"
2089              */
2090 
2091             if(c1<0) {
2092                 /* get next code unit from string 1, post-increment */
2093                 for(;;) {
2094                     if(s1==limit1) {
2095                         if(level1==0) {
2096                             c1=-1;
2097                             break;
2098                         }
2099                     } else {
2100                         c1=cs1.charAt(s1++);
2101                         break;
2102                     }
2103 
2104                     /* reached end of level buffer, pop one level */
2105                     do {
2106                         --level1;
2107                         cs1=stack1[level1].cs;
2108                     } while(cs1==null);
2109                     s1=stack1[level1].s;
2110                     limit1=cs1.length();
2111                 }
2112             }
2113 
2114             if(c2<0) {
2115                 /* get next code unit from string 2, post-increment */
2116                 for(;;) {
2117                     if(s2==limit2) {
2118                         if(level2==0) {
2119                             c2=-1;
2120                             break;
2121                         }
2122                     } else {
2123                         c2=cs2.charAt(s2++);
2124                         break;
2125                     }
2126 
2127                     /* reached end of level buffer, pop one level */
2128                     do {
2129                         --level2;
2130                         cs2=stack2[level2].cs;
2131                     } while(cs2==null);
2132                     s2=stack2[level2].s;
2133                     limit2=cs2.length();
2134                 }
2135             }
2136 
2137             /*
2138              * compare c1 and c2
2139              * either variable c1, c2 is -1 only if the corresponding string is finished
2140              */
2141             if(c1==c2) {
2142                 if(c1<0) {
2143                     return 0;   /* c1==c2==-1 indicating end of strings */
2144                 }
2145                 c1=c2=-1;       /* make us fetch new code units */
2146                 continue;
2147             } else if(c1<0) {
2148                 return -1;      /* string 1 ends before string 2 */
2149             } else if(c2<0) {
2150                 return 1;       /* string 2 ends before string 1 */
2151             }
2152             /* c1!=c2 && c1>=0 && c2>=0 */
2153 
2154             /* get complete code points for c1, c2 for lookups if either is a surrogate */
2155             cp1=c1;
2156             if(UTF16.isSurrogate((char)c1)) {
2157                 char c;
2158 
2159                 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
2160                     if(s1!=limit1 && Character.isLowSurrogate(c=cs1.charAt(s1))) {
2161                         /* advance ++s1; only below if cp1 decomposes/case-folds */
2162                         cp1=Character.toCodePoint((char)c1, c);
2163                     }
2164                 } else /* isTrail(c1) */ {
2165                     if(0<=(s1-2) && Character.isHighSurrogate(c=cs1.charAt(s1-2))) {
2166                         cp1=Character.toCodePoint(c, (char)c1);
2167                     }
2168                 }
2169             }
2170 
2171             cp2=c2;
2172             if(UTF16.isSurrogate((char)c2)) {
2173                 char c;
2174 
2175                 if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
2176                     if(s2!=limit2 && Character.isLowSurrogate(c=cs2.charAt(s2))) {
2177                         /* advance ++s2; only below if cp2 decomposes/case-folds */
2178                         cp2=Character.toCodePoint((char)c2, c);
2179                     }
2180                 } else /* isTrail(c2) */ {
2181                     if(0<=(s2-2) && Character.isHighSurrogate(c=cs2.charAt(s2-2))) {
2182                         cp2=Character.toCodePoint(c, (char)c2);
2183                     }
2184                 }
2185             }
2186 
2187             /*
2188              * go down one level for each string
2189              * continue with the main loop as soon as there is a real change
2190              */
2191 
2192             if( level1==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
2193                 (length=csp.toFullFolding(cp1, fold1, options))>=0
2194             ) {
2195                 /* cp1 case-folds to the code point "length" or to p[length] */
2196                 if(UTF16.isSurrogate((char)c1)) {
2197                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
2198                         /* advance beyond source surrogate pair if it case-folds */
2199                         ++s1;
2200                     } else /* isTrail(c1) */ {
2201                         /*
2202                          * we got a supplementary code point when hitting its trail surrogate,
2203                          * therefore the lead surrogate must have been the same as in the other string;
2204                          * compare this decomposition with the lead surrogate in the other string
2205                          * remember that this simulates bulk text replacement:
2206                          * the decomposition would replace the entire code point
2207                          */
2208                         --s2;
2209                         c2=cs2.charAt(s2-1);
2210                     }
2211                 }
2212 
2213                 /* push current level pointers */
2214                 if(stack1==null) {
2215                     stack1=createCmpEquivLevelStack();
2216                 }
2217                 stack1[0].cs=cs1;
2218                 stack1[0].s=s1;
2219                 ++level1;
2220 
2221                 /* copy the folding result to fold1[] */
2222                 /* Java: the buffer was probably not empty, remove the old contents */
2223                 if(length<=UCaseProps.MAX_STRING_LENGTH) {
2224                     fold1.delete(0, fold1.length()-length);
2225                 } else {
2226                     fold1.setLength(0);
2227                     fold1.appendCodePoint(length);
2228                 }
2229 
2230                 /* set next level pointers to case folding */
2231                 cs1=fold1;
2232                 s1=0;
2233                 limit1=fold1.length();
2234 
2235                 /* get ready to read from decomposition, continue with loop */
2236                 c1=-1;
2237                 continue;
2238             }
2239 
2240             if( level2==0 && (options&COMPARE_IGNORE_CASE)!=0 &&
2241                 (length=csp.toFullFolding(cp2, fold2, options))>=0
2242             ) {
2243                 /* cp2 case-folds to the code point "length" or to p[length] */
2244                 if(UTF16.isSurrogate((char)c2)) {
2245                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
2246                         /* advance beyond source surrogate pair if it case-folds */
2247                         ++s2;
2248                     } else /* isTrail(c2) */ {
2249                         /*
2250                          * we got a supplementary code point when hitting its trail surrogate,
2251                          * therefore the lead surrogate must have been the same as in the other string;
2252                          * compare this decomposition with the lead surrogate in the other string
2253                          * remember that this simulates bulk text replacement:
2254                          * the decomposition would replace the entire code point
2255                          */
2256                         --s1;
2257                         c1=cs1.charAt(s1-1);
2258                     }
2259                 }
2260 
2261                 /* push current level pointers */
2262                 if(stack2==null) {
2263                     stack2=createCmpEquivLevelStack();
2264                 }
2265                 stack2[0].cs=cs2;
2266                 stack2[0].s=s2;
2267                 ++level2;
2268 
2269                 /* copy the folding result to fold2[] */
2270                 /* Java: the buffer was probably not empty, remove the old contents */
2271                 if(length<=UCaseProps.MAX_STRING_LENGTH) {
2272                     fold2.delete(0, fold2.length()-length);
2273                 } else {
2274                     fold2.setLength(0);
2275                     fold2.appendCodePoint(length);
2276                 }
2277 
2278                 /* set next level pointers to case folding */
2279                 cs2=fold2;
2280                 s2=0;
2281                 limit2=fold2.length();
2282 
2283                 /* get ready to read from decomposition, continue with loop */
2284                 c2=-1;
2285                 continue;
2286             }
2287 
2288             if( level1<2 && (options&COMPARE_EQUIV)!=0 &&
2289                 (decomp1=nfcImpl.getDecomposition(cp1))!=null
2290             ) {
2291                 /* cp1 decomposes into p[length] */
2292                 if(UTF16.isSurrogate((char)c1)) {
2293                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c1)) {
2294                         /* advance beyond source surrogate pair if it decomposes */
2295                         ++s1;
2296                     } else /* isTrail(c1) */ {
2297                         /*
2298                          * we got a supplementary code point when hitting its trail surrogate,
2299                          * therefore the lead surrogate must have been the same as in the other string;
2300                          * compare this decomposition with the lead surrogate in the other string
2301                          * remember that this simulates bulk text replacement:
2302                          * the decomposition would replace the entire code point
2303                          */
2304                         --s2;
2305                         c2=cs2.charAt(s2-1);
2306                     }
2307                 }
2308 
2309                 /* push current level pointers */
2310                 if(stack1==null) {
2311                     stack1=createCmpEquivLevelStack();
2312                 }
2313                 stack1[level1].cs=cs1;
2314                 stack1[level1].s=s1;
2315                 ++level1;
2316 
2317                 /* set empty intermediate level if skipped */
2318                 if(level1<2) {
2319                     stack1[level1++].cs=null;
2320                 }
2321 
2322                 /* set next level pointers to decomposition */
2323                 cs1=decomp1;
2324                 s1=0;
2325                 limit1=decomp1.length();
2326 
2327                 /* get ready to read from decomposition, continue with loop */
2328                 c1=-1;
2329                 continue;
2330             }
2331 
2332             if( level2<2 && (options&COMPARE_EQUIV)!=0 &&
2333                 (decomp2=nfcImpl.getDecomposition(cp2))!=null
2334             ) {
2335                 /* cp2 decomposes into p[length] */
2336                 if(UTF16.isSurrogate((char)c2)) {
2337                     if(Normalizer2Impl.UTF16Plus.isSurrogateLead(c2)) {
2338                         /* advance beyond source surrogate pair if it decomposes */
2339                         ++s2;
2340                     } else /* isTrail(c2) */ {
2341                         /*
2342                          * we got a supplementary code point when hitting its trail surrogate,
2343                          * therefore the lead surrogate must have been the same as in the other string;
2344                          * compare this decomposition with the lead surrogate in the other string
2345                          * remember that this simulates bulk text replacement:
2346                          * the decomposition would replace the entire code point
2347                          */
2348                         --s1;
2349                         c1=cs1.charAt(s1-1);
2350                     }
2351                 }
2352 
2353                 /* push current level pointers */
2354                 if(stack2==null) {
2355                     stack2=createCmpEquivLevelStack();
2356                 }
2357                 stack2[level2].cs=cs2;
2358                 stack2[level2].s=s2;
2359                 ++level2;
2360 
2361                 /* set empty intermediate level if skipped */
2362                 if(level2<2) {
2363                     stack2[level2++].cs=null;
2364                 }
2365 
2366                 /* set next level pointers to decomposition */
2367                 cs2=decomp2;
2368                 s2=0;
2369                 limit2=decomp2.length();
2370 
2371                 /* get ready to read from decomposition, continue with loop */
2372                 c2=-1;
2373                 continue;
2374             }
2375 
2376             /*
2377              * no decomposition/case folding, max level for both sides:
2378              * return difference result
2379              *
2380              * code point order comparison must not just return cp1-cp2
2381              * because when single surrogates are present then the surrogate pairs
2382              * that formed cp1 and cp2 may be from different string indexes
2383              *
2384              * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units
2385              * c1=d800 cp1=10001 c2=dc00 cp2=10000
2386              * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 }
2387              *
2388              * therefore, use same fix-up as in ustring.c/uprv_strCompare()
2389              * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++
2390              * so we have slightly different pointer/start/limit comparisons here
2391              */
2392 
2393             if(c1>=0xd800 && c2>=0xd800 && (options&COMPARE_CODE_POINT_ORDER)!=0) {
2394                 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
2395                 if(
2396                     (c1<=0xdbff && s1!=limit1 && Character.isLowSurrogate(cs1.charAt(s1))) ||
2397                     (Character.isLowSurrogate((char)c1) && 0!=(s1-1) && Character.isHighSurrogate(cs1.charAt(s1-2)))
2398                 ) {
2399                     /* part of a surrogate pair, leave >=d800 */
2400                 } else {
2401                     /* BMP code point - may be surrogate code point - make <d800 */
2402                     c1-=0x2800;
2403                 }
2404 
2405                 if(
2406                     (c2<=0xdbff && s2!=limit2 && Character.isLowSurrogate(cs2.charAt(s2))) ||
2407                     (Character.isLowSurrogate((char)c2) && 0!=(s2-1) && Character.isHighSurrogate(cs2.charAt(s2-2)))
2408                 ) {
2409                     /* part of a surrogate pair, leave >=d800 */
2410                 } else {
2411                     /* BMP code point - may be surrogate code point - make <d800 */
2412                     c2-=0x2800;
2413                 }
2414             }
2415 
2416             return c1-c2;
2417         }
2418     }
2419 
2420     /**
2421      * An Appendable that writes into a char array with a capacity that may be
2422      * less than array.length.
2423      * (By contrast, CharBuffer will write beyond destLimit all the way up to array.length.)
2424      * <p>
2425      * An overflow is only reported at the end, for the old Normalizer API functions that write
2426      * to char arrays.
2427      */
2428     private static final class CharsAppendable implements Appendable {
CharsAppendable(char[] dest, int destStart, int destLimit)2429         public CharsAppendable(char[] dest, int destStart, int destLimit) {
2430             chars=dest;
2431             start=offset=destStart;
2432             limit=destLimit;
2433         }
length()2434         public int length() {
2435             int len=offset-start;
2436             if(offset<=limit) {
2437                 return len;
2438             } else {
2439                 throw new IndexOutOfBoundsException(Integer.toString(len));
2440             }
2441         }
append(char c)2442         public Appendable append(char c) {
2443             if(offset<limit) {
2444                 chars[offset]=c;
2445             }
2446             ++offset;
2447             return this;
2448         }
append(CharSequence s)2449         public Appendable append(CharSequence s) {
2450             return append(s, 0, s.length());
2451         }
append(CharSequence s, int sStart, int sLimit)2452         public Appendable append(CharSequence s, int sStart, int sLimit) {
2453             int len=sLimit-sStart;
2454             if(len<=(limit-offset)) {
2455                 while(sStart<sLimit) {  // TODO: Is there a better way to copy the characters?
2456                     chars[offset++]=s.charAt(sStart++);
2457                 }
2458             } else {
2459                 offset+=len;
2460             }
2461             return this;
2462         }
2463 
2464         private final char[] chars;
2465         private final int start, limit;
2466         private int offset;
2467     }
2468 }
2469