1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  *******************************************************************************
5  *   Copyright (C) 2009-2016, International Business Machines
6  *   Corporation and others.  All Rights Reserved.
7  *******************************************************************************
8  */
9 
10 package com.ibm.icu.text;
11 
12 import java.io.IOException;
13 import java.io.InputStream;
14 import java.nio.ByteBuffer;
15 
16 import com.ibm.icu.impl.ICUBinary;
17 import com.ibm.icu.impl.Norm2AllModes;
18 import com.ibm.icu.util.ICUUncheckedIOException;
19 
20 /**
21  * Unicode normalization functionality for standard Unicode normalization or
22  * for using custom mapping tables.
23  * All instances of this class are unmodifiable/immutable.
24  * The Normalizer2 class is not intended for public subclassing.
25  * <p>
26  * The primary functions are to produce a normalized string and to detect whether
27  * a string is already normalized.
28  * The most commonly used normalization forms are those defined in
29  * http://www.unicode.org/unicode/reports/tr15/
30  * However, this API supports additional normalization forms for specialized purposes.
31  * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
32  * and can be used in implementations of UTS #46.
33  * <p>
34  * Not only are the standard compose and decompose modes supplied,
35  * but additional modes are provided as documented in the Mode enum.
36  * <p>
37  * Some of the functions in this class identify normalization boundaries.
38  * At a normalization boundary, the portions of the string
39  * before it and starting from it do not interact and can be handled independently.
40  * <p>
41  * The spanQuickCheckYes() stops at a normalization boundary.
42  * When the goal is a normalized string, then the text before the boundary
43  * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
44  * <p>
45  * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
46  * a character is guaranteed to be at a normalization boundary,
47  * regardless of context.
48  * This is used for moving from one normalization boundary to the next
49  * or preceding boundary, and for performing iterative normalization.
50  * <p>
51  * Iterative normalization is useful when only a small portion of a
52  * longer string needs to be processed.
53  * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
54  * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
55  * (to process only the substring for which sort key bytes are computed).
56  * <p>
57  * The set of normalization boundaries returned by these functions may not be
58  * complete: There may be more boundaries that could be returned.
59  * Different functions may return different boundaries.
60  * @stable ICU 4.4
61  * @author Markus W. Scherer
62  */
63 public abstract class Normalizer2 {
64     /**
65      * Constants for normalization modes.
66      * For details about standard Unicode normalization forms
67      * and about the algorithms which are also used with custom mapping tables
68      * see http://www.unicode.org/unicode/reports/tr15/
69      * @stable ICU 4.4
70      */
71     public enum Mode {
72         /**
73          * Decomposition followed by composition.
74          * Same as standard NFC when using an "nfc" instance.
75          * Same as standard NFKC when using an "nfkc" instance.
76          * For details about standard Unicode normalization forms
77          * see http://www.unicode.org/unicode/reports/tr15/
78          * @stable ICU 4.4
79          */
80         COMPOSE,
81         /**
82          * Map, and reorder canonically.
83          * Same as standard NFD when using an "nfc" instance.
84          * Same as standard NFKD when using an "nfkc" instance.
85          * For details about standard Unicode normalization forms
86          * see http://www.unicode.org/unicode/reports/tr15/
87          * @stable ICU 4.4
88          */
89         DECOMPOSE,
90         /**
91          * "Fast C or D" form.
92          * If a string is in this form, then further decomposition <i>without reordering</i>
93          * would yield the same form as DECOMPOSE.
94          * Text in "Fast C or D" form can be processed efficiently with data tables
95          * that are "canonically closed", that is, that provide equivalent data for
96          * equivalent text, without having to be fully normalized.<br>
97          * Not a standard Unicode normalization form.<br>
98          * Not a unique form: Different FCD strings can be canonically equivalent.<br>
99          * For details see http://www.unicode.org/notes/tn5/#FCD
100          * @stable ICU 4.4
101          */
102         FCD,
103         /**
104          * Compose only contiguously.
105          * Also known as "FCC" or "Fast C Contiguous".
106          * The result will often but not always be in NFC.
107          * The result will conform to FCD which is useful for processing.<br>
108          * Not a standard Unicode normalization form.<br>
109          * For details see http://www.unicode.org/notes/tn5/#FCC
110          * @stable ICU 4.4
111          */
112         COMPOSE_CONTIGUOUS
113     };
114 
115     /**
116      * Returns a Normalizer2 instance for Unicode NFC normalization.
117      * Same as getInstance(null, "nfc", Mode.COMPOSE).
118      * Returns an unmodifiable singleton instance.
119      * @return the requested Normalizer2, if successful
120      * @stable ICU 49
121      */
getNFCInstance()122     public static Normalizer2 getNFCInstance() {
123         return Norm2AllModes.getNFCInstance().comp;
124     }
125 
126     /**
127      * Returns a Normalizer2 instance for Unicode NFD normalization.
128      * Same as getInstance(null, "nfc", Mode.DECOMPOSE).
129      * Returns an unmodifiable singleton instance.
130      * @return the requested Normalizer2, if successful
131      * @stable ICU 49
132      */
getNFDInstance()133     public static Normalizer2 getNFDInstance() {
134         return Norm2AllModes.getNFCInstance().decomp;
135     }
136 
137     /**
138      * Returns a Normalizer2 instance for Unicode NFKC normalization.
139      * Same as getInstance(null, "nfkc", Mode.COMPOSE).
140      * Returns an unmodifiable singleton instance.
141      * @return the requested Normalizer2, if successful
142      * @stable ICU 49
143      */
getNFKCInstance()144     public static Normalizer2 getNFKCInstance() {
145         return Norm2AllModes.getNFKCInstance().comp;
146     }
147 
148     /**
149      * Returns a Normalizer2 instance for Unicode NFKD normalization.
150      * Same as getInstance(null, "nfkc", Mode.DECOMPOSE).
151      * Returns an unmodifiable singleton instance.
152      * @return the requested Normalizer2, if successful
153      * @stable ICU 49
154      */
getNFKDInstance()155     public static Normalizer2 getNFKDInstance() {
156         return Norm2AllModes.getNFKCInstance().decomp;
157     }
158 
159     /**
160      * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
161      * Same as getInstance(null, "nfkc_cf", Mode.COMPOSE).
162      * Returns an unmodifiable singleton instance.
163      * @return the requested Normalizer2, if successful
164      * @stable ICU 49
165      */
getNFKCCasefoldInstance()166     public static Normalizer2 getNFKCCasefoldInstance() {
167         return Norm2AllModes.getNFKC_CFInstance().comp;
168     }
169 
170     /**
171      * Returns a Normalizer2 instance which uses the specified data file
172      * (an ICU data file if data=null, or else custom binary data)
173      * and which composes or decomposes text according to the specified mode.
174      * Returns an unmodifiable singleton instance.
175      * <ul>
176      * <li>Use data=null for data files that are part of ICU's own data.
177      * <li>Use name="nfc" and COMPOSE/DECOMPOSE for Unicode standard NFC/NFD.
178      * <li>Use name="nfkc" and COMPOSE/DECOMPOSE for Unicode standard NFKC/NFKD.
179      * <li>Use name="nfkc_cf" and COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
180      * </ul>
181      * If data!=null, then the binary data is read once and cached using the provided
182      * name as the key.
183      * If you know or expect the data to be cached already, you can use data!=null
184      * for non-ICU data as well.
185      * <p>Any {@link java.io.IOException} is wrapped into a {@link com.ibm.icu.util.ICUUncheckedIOException}.
186      * @param data the binary, big-endian normalization (.nrm file) data, or null for ICU data
187      * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
188      * @param mode normalization mode (compose or decompose etc.)
189      * @return the requested Normalizer2, if successful
190      * @stable ICU 4.4
191      */
getInstance(InputStream data, String name, Mode mode)192     public static Normalizer2 getInstance(InputStream data, String name, Mode mode) {
193         // TODO: If callers really use this API, then we should add an overload that takes a ByteBuffer.
194         ByteBuffer bytes = null;
195         if (data != null) {
196             try {
197                 bytes = ICUBinary.getByteBufferFromInputStreamAndCloseStream(data);
198             } catch (IOException e) {
199                 throw new ICUUncheckedIOException(e);
200             }
201         }
202         Norm2AllModes all2Modes=Norm2AllModes.getInstance(bytes, name);
203         switch(mode) {
204         case COMPOSE: return all2Modes.comp;
205         case DECOMPOSE: return all2Modes.decomp;
206         case FCD: return all2Modes.fcd;
207         case COMPOSE_CONTIGUOUS: return all2Modes.fcc;
208         default: return null;  // will not occur
209         }
210     }
211 
212     /**
213      * Returns the normalized form of the source string.
214      * @param src source string
215      * @return normalized src
216      * @stable ICU 4.4
217      */
normalize(CharSequence src)218     public String normalize(CharSequence src) {
219         if(src instanceof String) {
220             // Fastpath: Do not construct a new String if the src is a String
221             // and is already normalized.
222             int spanLength=spanQuickCheckYes(src);
223             if(spanLength==src.length()) {
224                 return (String)src;
225             }
226             if (spanLength != 0) {
227                 StringBuilder sb=new StringBuilder(src.length()).append(src, 0, spanLength);
228                 return normalizeSecondAndAppend(sb, src.subSequence(spanLength, src.length())).toString();
229             }
230         }
231         return normalize(src, new StringBuilder(src.length())).toString();
232     }
233 
234     /**
235      * Writes the normalized form of the source string to the destination string
236      * (replacing its contents) and returns the destination string.
237      * The source and destination strings must be different objects.
238      * @param src source string
239      * @param dest destination string; its contents is replaced with normalized src
240      * @return dest
241      * @stable ICU 4.4
242      */
normalize(CharSequence src, StringBuilder dest)243     public abstract StringBuilder normalize(CharSequence src, StringBuilder dest);
244 
245     /**
246      * Writes the normalized form of the source string to the destination Appendable
247      * and returns the destination Appendable.
248      * The source and destination strings must be different objects.
249      *
250      * <p>Any {@link java.io.IOException} is wrapped into a {@link com.ibm.icu.util.ICUUncheckedIOException}.
251      *
252      * @param src source string
253      * @param dest destination Appendable; gets normalized src appended
254      * @return dest
255      * @stable ICU 4.6
256      */
normalize(CharSequence src, Appendable dest)257     public abstract Appendable normalize(CharSequence src, Appendable dest);
258 
259     /**
260      * Appends the normalized form of the second string to the first string
261      * (merging them at the boundary) and returns the first string.
262      * The result is normalized if the first string was normalized.
263      * The first and second strings must be different objects.
264      * @param first string, should be normalized
265      * @param second string, will be normalized
266      * @return first
267      * @stable ICU 4.4
268      */
normalizeSecondAndAppend( StringBuilder first, CharSequence second)269     public abstract StringBuilder normalizeSecondAndAppend(
270             StringBuilder first, CharSequence second);
271 
272     /**
273      * Appends the second string to the first string
274      * (merging them at the boundary) and returns the first string.
275      * The result is normalized if both the strings were normalized.
276      * The first and second strings must be different objects.
277      * @param first string, should be normalized
278      * @param second string, should be normalized
279      * @return first
280      * @stable ICU 4.4
281      */
append(StringBuilder first, CharSequence second)282     public abstract StringBuilder append(StringBuilder first, CharSequence second);
283 
284     /**
285      * Gets the decomposition mapping of c.
286      * Roughly equivalent to normalizing the String form of c
287      * on a DECOMPOSE Normalizer2 instance, but much faster, and except that this function
288      * returns null if c does not have a decomposition mapping in this instance's data.
289      * This function is independent of the mode of the Normalizer2.
290      * @param c code point
291      * @return c's decomposition mapping, if any; otherwise null
292      * @stable ICU 4.6
293      */
getDecomposition(int c)294     public abstract String getDecomposition(int c);
295 
296     /**
297      * Gets the raw decomposition mapping of c.
298      *
299      * <p>This is similar to the getDecomposition() method but returns the
300      * raw decomposition mapping as specified in UnicodeData.txt or
301      * (for custom data) in the mapping files processed by the gennorm2 tool.
302      * By contrast, getDecomposition() returns the processed,
303      * recursively-decomposed version of this mapping.
304      *
305      * <p>When used on a standard NFKC Normalizer2 instance,
306      * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
307      *
308      * <p>When used on a standard NFC Normalizer2 instance,
309      * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
310      * in this case, the result contains either one or two code points (=1..4 Java chars).
311      *
312      * <p>This function is independent of the mode of the Normalizer2.
313      * The default implementation returns null.
314      * @param c code point
315      * @return c's raw decomposition mapping, if any; otherwise null
316      * @stable ICU 49
317      */
getRawDecomposition(int c)318     public String getRawDecomposition(int c) { return null; }
319 
320     /**
321      * Performs pairwise composition of a &amp; b and returns the composite if there is one.
322      *
323      * <p>Returns a composite code point c only if c has a two-way mapping to a+b.
324      * In standard Unicode normalization, this means that
325      * c has a canonical decomposition to a+b
326      * and c does not have the Full_Composition_Exclusion property.
327      *
328      * <p>This function is independent of the mode of the Normalizer2.
329      * The default implementation returns a negative value.
330      * @param a A (normalization starter) code point.
331      * @param b Another code point.
332      * @return The non-negative composite code point if there is one; otherwise a negative value.
333      * @stable ICU 49
334      */
composePair(int a, int b)335     public int composePair(int a, int b) { return -1; }
336 
337     /**
338      * Gets the combining class of c.
339      * The default implementation returns 0
340      * but all standard implementations return the Unicode Canonical_Combining_Class value.
341      * @param c code point
342      * @return c's combining class
343      * @stable ICU 49
344      */
getCombiningClass(int c)345     public int getCombiningClass(int c) { return 0; }
346 
347     /**
348      * Tests if the string is normalized.
349      * Internally, in cases where the quickCheck() method would return "maybe"
350      * (which is only possible for the two COMPOSE modes) this method
351      * resolves to "yes" or "no" to provide a definitive result,
352      * at the cost of doing more work in those cases.
353      * @param s input string
354      * @return true if s is normalized
355      * @stable ICU 4.4
356      */
isNormalized(CharSequence s)357     public abstract boolean isNormalized(CharSequence s);
358 
359     /**
360      * Tests if the string is normalized.
361      * For the two COMPOSE modes, the result could be "maybe" in cases that
362      * would take a little more work to resolve definitively.
363      * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
364      * combination of quick check + normalization, to avoid
365      * re-checking the "yes" prefix.
366      * @param s input string
367      * @return the quick check result
368      * @stable ICU 4.4
369      */
quickCheck(CharSequence s)370     public abstract Normalizer.QuickCheckResult quickCheck(CharSequence s);
371 
372     /**
373      * Returns the end of the normalized substring of the input string.
374      * In other words, with <code>end=spanQuickCheckYes(s);</code>
375      * the substring <code>s.subSequence(0, end)</code>
376      * will pass the quick check with a "yes" result.
377      * <p>
378      * The returned end index is usually one or more characters before the
379      * "no" or "maybe" character: The end index is at a normalization boundary.
380      * (See the class documentation for more about normalization boundaries.)
381      * <p>
382      * When the goal is a normalized string and most input strings are expected
383      * to be normalized already, then call this method,
384      * and if it returns a prefix shorter than the input string,
385      * copy that prefix and use normalizeSecondAndAppend() for the remainder.
386      * @param s input string
387      * @return "yes" span end index
388      * @stable ICU 4.4
389      */
spanQuickCheckYes(CharSequence s)390     public abstract int spanQuickCheckYes(CharSequence s);
391 
392     /**
393      * Tests if the character always has a normalization boundary before it,
394      * regardless of context.
395      * If true, then the character does not normalization-interact with
396      * preceding characters.
397      * In other words, a string containing this character can be normalized
398      * by processing portions before this character and starting from this
399      * character independently.
400      * This is used for iterative normalization. See the class documentation for details.
401      * @param c character to test
402      * @return true if c has a normalization boundary before it
403      * @stable ICU 4.4
404      */
hasBoundaryBefore(int c)405     public abstract boolean hasBoundaryBefore(int c);
406 
407     /**
408      * Tests if the character always has a normalization boundary after it,
409      * regardless of context.
410      * If true, then the character does not normalization-interact with
411      * following characters.
412      * In other words, a string containing this character can be normalized
413      * by processing portions up to this character and after this
414      * character independently.
415      * This is used for iterative normalization. See the class documentation for details.
416      * <p>
417      * Note that this operation may be significantly slower than hasBoundaryBefore().
418      * @param c character to test
419      * @return true if c has a normalization boundary after it
420      * @stable ICU 4.4
421      */
hasBoundaryAfter(int c)422     public abstract boolean hasBoundaryAfter(int c);
423 
424     /**
425      * Tests if the character is normalization-inert.
426      * If true, then the character does not change, nor normalization-interact with
427      * preceding or following characters.
428      * In other words, a string containing this character can be normalized
429      * by processing portions before this character and after this
430      * character independently.
431      * This is used for iterative normalization. See the class documentation for details.
432      * <p>
433      * Note that this operation may be significantly slower than hasBoundaryBefore().
434      * @param c character to test
435      * @return true if c is normalization-inert
436      * @stable ICU 4.4
437      */
isInert(int c)438     public abstract boolean isInert(int c);
439 
440     /**
441      * Sole constructor.  (For invocation by subclass constructors,
442      * typically implicit.)
443      * @internal
444      * @deprecated This API is ICU internal only.
445      */
446     @Deprecated
Normalizer2()447     protected Normalizer2() {
448     }
449 }
450