1 /*
2  *******************************************************************************
3  *   Copyright (C) 2009-2015, International Business Machines
4  *   Corporation and others.  All Rights Reserved.
5  *******************************************************************************
6  */
7 
8 package com.ibm.icu.text;
9 
10 import java.io.IOException;
11 import java.io.InputStream;
12 import java.nio.ByteBuffer;
13 
14 import com.ibm.icu.impl.ICUBinary;
15 import com.ibm.icu.impl.Norm2AllModes;
16 import com.ibm.icu.util.ICUUncheckedIOException;
17 
18 /**
19  * Unicode normalization functionality for standard Unicode normalization or
20  * for using custom mapping tables.
21  * All instances of this class are unmodifiable/immutable.
22  * The Normalizer2 class is not intended for public subclassing.
23  * <p>
24  * The primary functions are to produce a normalized string and to detect whether
25  * a string is already normalized.
26  * The most commonly used normalization forms are those defined in
27  * http://www.unicode.org/unicode/reports/tr15/
28  * However, this API supports additional normalization forms for specialized purposes.
29  * For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
30  * and can be used in implementations of UTS #46.
31  * <p>
32  * Not only are the standard compose and decompose modes supplied,
33  * but additional modes are provided as documented in the Mode enum.
34  * <p>
35  * Some of the functions in this class identify normalization boundaries.
36  * At a normalization boundary, the portions of the string
37  * before it and starting from it do not interact and can be handled independently.
38  * <p>
39  * The spanQuickCheckYes() stops at a normalization boundary.
40  * When the goal is a normalized string, then the text before the boundary
41  * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
42  * <p>
43  * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
44  * a character is guaranteed to be at a normalization boundary,
45  * regardless of context.
46  * This is used for moving from one normalization boundary to the next
47  * or preceding boundary, and for performing iterative normalization.
48  * <p>
49  * Iterative normalization is useful when only a small portion of a
50  * longer string needs to be processed.
51  * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
52  * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
53  * (to process only the substring for which sort key bytes are computed).
54  * <p>
55  * The set of normalization boundaries returned by these functions may not be
56  * complete: There may be more boundaries that could be returned.
57  * Different functions may return different boundaries.
58  * @stable ICU 4.4
59  * @author Markus W. Scherer
60  */
61 public abstract class Normalizer2 {
62     /**
63      * Constants for normalization modes.
64      * For details about standard Unicode normalization forms
65      * and about the algorithms which are also used with custom mapping tables
66      * see http://www.unicode.org/unicode/reports/tr15/
67      * @stable ICU 4.4
68      */
69     public enum Mode {
70         /**
71          * Decomposition followed by composition.
72          * Same as standard NFC when using an "nfc" instance.
73          * Same as standard NFKC when using an "nfkc" instance.
74          * For details about standard Unicode normalization forms
75          * see http://www.unicode.org/unicode/reports/tr15/
76          * @stable ICU 4.4
77          */
78         COMPOSE,
79         /**
80          * Map, and reorder canonically.
81          * Same as standard NFD when using an "nfc" instance.
82          * Same as standard NFKD when using an "nfkc" instance.
83          * For details about standard Unicode normalization forms
84          * see http://www.unicode.org/unicode/reports/tr15/
85          * @stable ICU 4.4
86          */
87         DECOMPOSE,
88         /**
89          * "Fast C or D" form.
90          * If a string is in this form, then further decomposition <i>without reordering</i>
91          * would yield the same form as DECOMPOSE.
92          * Text in "Fast C or D" form can be processed efficiently with data tables
93          * that are "canonically closed", that is, that provide equivalent data for
94          * equivalent text, without having to be fully normalized.<br>
95          * Not a standard Unicode normalization form.<br>
96          * Not a unique form: Different FCD strings can be canonically equivalent.<br>
97          * For details see http://www.unicode.org/notes/tn5/#FCD
98          * @stable ICU 4.4
99          */
100         FCD,
101         /**
102          * Compose only contiguously.
103          * Also known as "FCC" or "Fast C Contiguous".
104          * The result will often but not always be in NFC.
105          * The result will conform to FCD which is useful for processing.<br>
106          * Not a standard Unicode normalization form.<br>
107          * For details see http://www.unicode.org/notes/tn5/#FCC
108          * @stable ICU 4.4
109          */
110         COMPOSE_CONTIGUOUS
111     };
112 
113     /**
114      * Returns a Normalizer2 instance for Unicode NFC normalization.
115      * Same as getInstance(null, "nfc", Mode.COMPOSE).
116      * Returns an unmodifiable singleton instance.
117      * @return the requested Normalizer2, if successful
118      * @stable ICU 49
119      */
getNFCInstance()120     public static Normalizer2 getNFCInstance() {
121         return Norm2AllModes.getNFCInstance().comp;
122     }
123 
124     /**
125      * Returns a Normalizer2 instance for Unicode NFD normalization.
126      * Same as getInstance(null, "nfc", Mode.DECOMPOSE).
127      * Returns an unmodifiable singleton instance.
128      * @return the requested Normalizer2, if successful
129      * @stable ICU 49
130      */
getNFDInstance()131     public static Normalizer2 getNFDInstance() {
132         return Norm2AllModes.getNFCInstance().decomp;
133     }
134 
135     /**
136      * Returns a Normalizer2 instance for Unicode NFKC normalization.
137      * Same as getInstance(null, "nfkc", Mode.COMPOSE).
138      * Returns an unmodifiable singleton instance.
139      * @return the requested Normalizer2, if successful
140      * @stable ICU 49
141      */
getNFKCInstance()142     public static Normalizer2 getNFKCInstance() {
143         return Norm2AllModes.getNFKCInstance().comp;
144     }
145 
146     /**
147      * Returns a Normalizer2 instance for Unicode NFKD normalization.
148      * Same as getInstance(null, "nfkc", Mode.DECOMPOSE).
149      * Returns an unmodifiable singleton instance.
150      * @return the requested Normalizer2, if successful
151      * @stable ICU 49
152      */
getNFKDInstance()153     public static Normalizer2 getNFKDInstance() {
154         return Norm2AllModes.getNFKCInstance().decomp;
155     }
156 
157     /**
158      * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
159      * Same as getInstance(null, "nfkc_cf", Mode.COMPOSE).
160      * Returns an unmodifiable singleton instance.
161      * @return the requested Normalizer2, if successful
162      * @stable ICU 49
163      */
getNFKCCasefoldInstance()164     public static Normalizer2 getNFKCCasefoldInstance() {
165         return Norm2AllModes.getNFKC_CFInstance().comp;
166     }
167 
168     /**
169      * Returns a Normalizer2 instance which uses the specified data file
170      * (an ICU data file if data=null, or else custom binary data)
171      * and which composes or decomposes text according to the specified mode.
172      * Returns an unmodifiable singleton instance.
173      * <ul>
174      * <li>Use data=null for data files that are part of ICU's own data.
175      * <li>Use name="nfc" and COMPOSE/DECOMPOSE for Unicode standard NFC/NFD.
176      * <li>Use name="nfkc" and COMPOSE/DECOMPOSE for Unicode standard NFKC/NFKD.
177      * <li>Use name="nfkc_cf" and COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
178      * </ul>
179      * If data!=null, then the binary data is read once and cached using the provided
180      * name as the key.
181      * If you know or expect the data to be cached already, you can use data!=null
182      * for non-ICU data as well.
183      * <p>Any {@link java.io.IOException} is wrapped into a {@link com.ibm.icu.util.ICUUncheckedIOException}.
184      * @param data the binary, big-endian normalization (.nrm file) data, or null for ICU data
185      * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
186      * @param mode normalization mode (compose or decompose etc.)
187      * @return the requested Normalizer2, if successful
188      * @stable ICU 4.4
189      */
getInstance(InputStream data, String name, Mode mode)190     public static Normalizer2 getInstance(InputStream data, String name, Mode mode) {
191         // TODO: If callers really use this API, then we should add an overload that takes a ByteBuffer.
192         ByteBuffer bytes = null;
193         if (data != null) {
194             try {
195                 bytes = ICUBinary.getByteBufferFromInputStreamAndCloseStream(data);
196             } catch (IOException e) {
197                 throw new ICUUncheckedIOException(e);
198             }
199         }
200         Norm2AllModes all2Modes=Norm2AllModes.getInstance(bytes, name);
201         switch(mode) {
202         case COMPOSE: return all2Modes.comp;
203         case DECOMPOSE: return all2Modes.decomp;
204         case FCD: return all2Modes.fcd;
205         case COMPOSE_CONTIGUOUS: return all2Modes.fcc;
206         default: return null;  // will not occur
207         }
208     }
209 
210     /**
211      * Returns the normalized form of the source string.
212      * @param src source string
213      * @return normalized src
214      * @stable ICU 4.4
215      */
normalize(CharSequence src)216     public String normalize(CharSequence src) {
217         if(src instanceof String) {
218             // Fastpath: Do not construct a new String if the src is a String
219             // and is already normalized.
220             int spanLength=spanQuickCheckYes(src);
221             if(spanLength==src.length()) {
222                 return (String)src;
223             }
224             StringBuilder sb=new StringBuilder(src.length()).append(src, 0, spanLength);
225             return normalizeSecondAndAppend(sb, src.subSequence(spanLength, src.length())).toString();
226         }
227         return normalize(src, new StringBuilder(src.length())).toString();
228     }
229 
230     /**
231      * Writes the normalized form of the source string to the destination string
232      * (replacing its contents) and returns the destination string.
233      * The source and destination strings must be different objects.
234      * @param src source string
235      * @param dest destination string; its contents is replaced with normalized src
236      * @return dest
237      * @stable ICU 4.4
238      */
normalize(CharSequence src, StringBuilder dest)239     public abstract StringBuilder normalize(CharSequence src, StringBuilder dest);
240 
241     /**
242      * Writes the normalized form of the source string to the destination Appendable
243      * and returns the destination Appendable.
244      * The source and destination strings must be different objects.
245      *
246      * <p>Any {@link java.io.IOException} is wrapped into a {@link com.ibm.icu.util.ICUUncheckedIOException}.
247      *
248      * @param src source string
249      * @param dest destination Appendable; gets normalized src appended
250      * @return dest
251      * @stable ICU 4.6
252      */
normalize(CharSequence src, Appendable dest)253     public abstract Appendable normalize(CharSequence src, Appendable dest);
254 
255     /**
256      * Appends the normalized form of the second string to the first string
257      * (merging them at the boundary) and returns the first string.
258      * The result is normalized if the first string was normalized.
259      * The first and second strings must be different objects.
260      * @param first string, should be normalized
261      * @param second string, will be normalized
262      * @return first
263      * @stable ICU 4.4
264      */
normalizeSecondAndAppend( StringBuilder first, CharSequence second)265     public abstract StringBuilder normalizeSecondAndAppend(
266             StringBuilder first, CharSequence second);
267 
268     /**
269      * Appends the second string to the first string
270      * (merging them at the boundary) and returns the first string.
271      * The result is normalized if both the strings were normalized.
272      * The first and second strings must be different objects.
273      * @param first string, should be normalized
274      * @param second string, should be normalized
275      * @return first
276      * @stable ICU 4.4
277      */
append(StringBuilder first, CharSequence second)278     public abstract StringBuilder append(StringBuilder first, CharSequence second);
279 
280     /**
281      * Gets the decomposition mapping of c.
282      * Roughly equivalent to normalizing the String form of c
283      * on a DECOMPOSE Normalizer2 instance, but much faster, and except that this function
284      * returns null if c does not have a decomposition mapping in this instance's data.
285      * This function is independent of the mode of the Normalizer2.
286      * @param c code point
287      * @return c's decomposition mapping, if any; otherwise null
288      * @stable ICU 4.6
289      */
getDecomposition(int c)290     public abstract String getDecomposition(int c);
291 
292     /**
293      * Gets the raw decomposition mapping of c.
294      *
295      * <p>This is similar to the getDecomposition() method but returns the
296      * raw decomposition mapping as specified in UnicodeData.txt or
297      * (for custom data) in the mapping files processed by the gennorm2 tool.
298      * By contrast, getDecomposition() returns the processed,
299      * recursively-decomposed version of this mapping.
300      *
301      * <p>When used on a standard NFKC Normalizer2 instance,
302      * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
303      *
304      * <p>When used on a standard NFC Normalizer2 instance,
305      * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
306      * in this case, the result contains either one or two code points (=1..4 Java chars).
307      *
308      * <p>This function is independent of the mode of the Normalizer2.
309      * The default implementation returns null.
310      * @param c code point
311      * @return c's raw decomposition mapping, if any; otherwise null
312      * @stable ICU 49
313      */
getRawDecomposition(int c)314     public String getRawDecomposition(int c) { return null; }
315 
316     /**
317      * Performs pairwise composition of a & b and returns the composite if there is one.
318      *
319      * <p>Returns a composite code point c only if c has a two-way mapping to a+b.
320      * In standard Unicode normalization, this means that
321      * c has a canonical decomposition to a+b
322      * and c does not have the Full_Composition_Exclusion property.
323      *
324      * <p>This function is independent of the mode of the Normalizer2.
325      * The default implementation returns a negative value.
326      * @param a A (normalization starter) code point.
327      * @param b Another code point.
328      * @return The non-negative composite code point if there is one; otherwise a negative value.
329      * @stable ICU 49
330      */
composePair(int a, int b)331     public int composePair(int a, int b) { return -1; }
332 
333     /**
334      * Gets the combining class of c.
335      * The default implementation returns 0
336      * but all standard implementations return the Unicode Canonical_Combining_Class value.
337      * @param c code point
338      * @return c's combining class
339      * @stable ICU 49
340      */
getCombiningClass(int c)341     public int getCombiningClass(int c) { return 0; }
342 
343     /**
344      * Tests if the string is normalized.
345      * Internally, in cases where the quickCheck() method would return "maybe"
346      * (which is only possible for the two COMPOSE modes) this method
347      * resolves to "yes" or "no" to provide a definitive result,
348      * at the cost of doing more work in those cases.
349      * @param s input string
350      * @return true if s is normalized
351      * @stable ICU 4.4
352      */
isNormalized(CharSequence s)353     public abstract boolean isNormalized(CharSequence s);
354 
355     /**
356      * Tests if the string is normalized.
357      * For the two COMPOSE modes, the result could be "maybe" in cases that
358      * would take a little more work to resolve definitively.
359      * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
360      * combination of quick check + normalization, to avoid
361      * re-checking the "yes" prefix.
362      * @param s input string
363      * @return the quick check result
364      * @stable ICU 4.4
365      */
quickCheck(CharSequence s)366     public abstract Normalizer.QuickCheckResult quickCheck(CharSequence s);
367 
368     /**
369      * Returns the end of the normalized substring of the input string.
370      * In other words, with <code>end=spanQuickCheckYes(s);</code>
371      * the substring <code>s.subSequence(0, end)</code>
372      * will pass the quick check with a "yes" result.
373      * <p>
374      * The returned end index is usually one or more characters before the
375      * "no" or "maybe" character: The end index is at a normalization boundary.
376      * (See the class documentation for more about normalization boundaries.)
377      * <p>
378      * When the goal is a normalized string and most input strings are expected
379      * to be normalized already, then call this method,
380      * and if it returns a prefix shorter than the input string,
381      * copy that prefix and use normalizeSecondAndAppend() for the remainder.
382      * @param s input string
383      * @return "yes" span end index
384      * @stable ICU 4.4
385      */
spanQuickCheckYes(CharSequence s)386     public abstract int spanQuickCheckYes(CharSequence s);
387 
388     /**
389      * Tests if the character always has a normalization boundary before it,
390      * regardless of context.
391      * If true, then the character does not normalization-interact with
392      * preceding characters.
393      * In other words, a string containing this character can be normalized
394      * by processing portions before this character and starting from this
395      * character independently.
396      * This is used for iterative normalization. See the class documentation for details.
397      * @param c character to test
398      * @return true if c has a normalization boundary before it
399      * @stable ICU 4.4
400      */
hasBoundaryBefore(int c)401     public abstract boolean hasBoundaryBefore(int c);
402 
403     /**
404      * Tests if the character always has a normalization boundary after it,
405      * regardless of context.
406      * If true, then the character does not normalization-interact with
407      * following characters.
408      * In other words, a string containing this character can be normalized
409      * by processing portions up to this character and after this
410      * character independently.
411      * This is used for iterative normalization. See the class documentation for details.
412      * <p>
413      * Note that this operation may be significantly slower than hasBoundaryBefore().
414      * @param c character to test
415      * @return true if c has a normalization boundary after it
416      * @stable ICU 4.4
417      */
hasBoundaryAfter(int c)418     public abstract boolean hasBoundaryAfter(int c);
419 
420     /**
421      * Tests if the character is normalization-inert.
422      * If true, then the character does not change, nor normalization-interact with
423      * preceding or following characters.
424      * In other words, a string containing this character can be normalized
425      * by processing portions before this character and after this
426      * character independently.
427      * This is used for iterative normalization. See the class documentation for details.
428      * <p>
429      * Note that this operation may be significantly slower than hasBoundaryBefore().
430      * @param c character to test
431      * @return true if c is normalization-inert
432      * @stable ICU 4.4
433      */
isInert(int c)434     public abstract boolean isInert(int c);
435 
436     /**
437      * Sole constructor.  (For invocation by subclass constructors,
438      * typically implicit.)
439      * @internal
440      * @deprecated This API is ICU internal only.
441      */
442     @Deprecated
Normalizer2()443     protected Normalizer2() {
444     }
445 }
446