1 /*
2  *******************************************************************************
3  * Copyright (C) 1996-2012, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  *******************************************************************************
6  */
7 
8 package com.ibm.icu.dev.test.normalizer;
9 
10 import java.text.StringCharacterIterator;
11 import java.util.Random;
12 
13 import com.ibm.icu.dev.test.TestFmwk;
14 import com.ibm.icu.impl.Norm2AllModes;
15 import com.ibm.icu.impl.Normalizer2Impl;
16 import com.ibm.icu.impl.USerializedSet;
17 import com.ibm.icu.impl.Utility;
18 import com.ibm.icu.lang.UCharacter;
19 import com.ibm.icu.lang.UCharacterCategory;
20 import com.ibm.icu.lang.UProperty;
21 import com.ibm.icu.text.FilteredNormalizer2;
22 import com.ibm.icu.text.Normalizer;
23 import com.ibm.icu.text.Normalizer2;
24 import com.ibm.icu.text.UCharacterIterator;
25 import com.ibm.icu.text.UTF16;
26 import com.ibm.icu.text.UnicodeSet;
27 import com.ibm.icu.text.UnicodeSetIterator;
28 
29 
30 public class BasicTest extends TestFmwk {
main(String[] args)31     public static void main(String[] args) throws Exception {
32         new BasicTest().run(args);
33     }
34 
35     String[][] canonTests = {
36         // Input                Decomposed              Composed
37         { "cat",                "cat",                  "cat"               },
38         { "\u00e0ardvark",      "a\u0300ardvark",       "\u00e0ardvark",    },
39 
40         { "\u1e0a",             "D\u0307",              "\u1e0a"            }, // D-dot_above
41         { "D\u0307",            "D\u0307",              "\u1e0a"            }, // D dot_above
42 
43         { "\u1e0c\u0307",       "D\u0323\u0307",        "\u1e0c\u0307"      }, // D-dot_below dot_above
44         { "\u1e0a\u0323",       "D\u0323\u0307",        "\u1e0c\u0307"      }, // D-dot_above dot_below
45         { "D\u0307\u0323",      "D\u0323\u0307",        "\u1e0c\u0307"      }, // D dot_below dot_above
46 
47         { "\u1e10\u0307\u0323", "D\u0327\u0323\u0307",  "\u1e10\u0323\u0307"}, // D dot_below cedilla dot_above
48         { "D\u0307\u0328\u0323","D\u0328\u0323\u0307",  "\u1e0c\u0328\u0307"}, // D dot_above ogonek dot_below
49 
50         { "\u1E14",             "E\u0304\u0300",        "\u1E14"            }, // E-macron-grave
51         { "\u0112\u0300",       "E\u0304\u0300",        "\u1E14"            }, // E-macron + grave
52         { "\u00c8\u0304",       "E\u0300\u0304",        "\u00c8\u0304"      }, // E-grave + macron
53 
54         { "\u212b",             "A\u030a",              "\u00c5"            }, // angstrom_sign
55         { "\u00c5",             "A\u030a",              "\u00c5"            }, // A-ring
56 
57         { "\u00c4ffin",         "A\u0308ffin",          "\u00c4ffin"        },
58         { "\u00c4\uFB03n",      "A\u0308\uFB03n",       "\u00c4\uFB03n"     },
59 
60         { "\u00fdffin",         "y\u0301ffin",          "\u00fdffin"        }, //updated with 3.0
61         { "\u00fd\uFB03n",      "y\u0301\uFB03n",       "\u00fd\uFB03n"     }, //updated with 3.0
62 
63         { "Henry IV",           "Henry IV",             "Henry IV"          },
64         { "Henry \u2163",       "Henry \u2163",         "Henry \u2163"      },
65 
66         { "\u30AC",             "\u30AB\u3099",         "\u30AC"            }, // ga (Katakana)
67         { "\u30AB\u3099",       "\u30AB\u3099",         "\u30AC"            }, // ka + ten
68         { "\uFF76\uFF9E",       "\uFF76\uFF9E",         "\uFF76\uFF9E"      }, // hw_ka + hw_ten
69         { "\u30AB\uFF9E",       "\u30AB\uFF9E",         "\u30AB\uFF9E"      }, // ka + hw_ten
70         { "\uFF76\u3099",       "\uFF76\u3099",         "\uFF76\u3099"      }, // hw_ka + ten
71 
72         { "A\u0300\u0316", "A\u0316\u0300", "\u00C0\u0316" },
73         {"\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e","\\U0001D157\\U0001D165\\U0001D157\\U0001D165\\U0001D157\\U0001D165", "\\U0001D157\\U0001D165\\U0001D157\\U0001D165\\U0001D157\\U0001D165"},
74     };
75 
76     String[][] compatTests = {
77             // Input                Decomposed              Composed
78         { "cat",                 "cat",                     "cat"           },
79         { "\uFB4f",             "\u05D0\u05DC",         "\u05D0\u05DC",     }, // Alef-Lamed vs. Alef, Lamed
80 
81         { "\u00C4ffin",         "A\u0308ffin",          "\u00C4ffin"        },
82         { "\u00C4\uFB03n",      "A\u0308ffin",          "\u00C4ffin"        }, // ffi ligature -> f + f + i
83 
84         { "\u00fdffin",         "y\u0301ffin",          "\u00fdffin"        },        //updated for 3.0
85         { "\u00fd\uFB03n",      "y\u0301ffin",          "\u00fdffin"        }, // ffi ligature -> f + f + i
86 
87         { "Henry IV",           "Henry IV",             "Henry IV"          },
88         { "Henry \u2163",       "Henry IV",             "Henry IV"          },
89 
90         { "\u30AC",             "\u30AB\u3099",         "\u30AC"            }, // ga (Katakana)
91         { "\u30AB\u3099",       "\u30AB\u3099",         "\u30AC"            }, // ka + ten
92 
93         { "\uFF76\u3099",       "\u30AB\u3099",         "\u30AC"            }, // hw_ka + ten
94 
95         /* These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later*/
96         { "\uFF76\uFF9E",       "\u30AB\u3099",         "\u30AC"            }, // hw_ka + hw_ten
97         { "\u30AB\uFF9E",       "\u30AB\u3099",         "\u30AC"            }, // ka + hw_ten
98 
99     };
100 
101     // With Canonical decomposition, Hangul syllables should get decomposed
102     // into Jamo, but Jamo characters should not be decomposed into
103     // conjoining Jamo
104     String[][] hangulCanon = {
105         // Input                Decomposed              Composed
106         { "\ud4db",             "\u1111\u1171\u11b6",   "\ud4db"        },
107         { "\u1111\u1171\u11b6", "\u1111\u1171\u11b6",   "\ud4db"        },
108     };
109 
110     // With compatibility decomposition turned on,
111     // it should go all the way down to conjoining Jamo characters.
112     // THIS IS NO LONGER TRUE IN UNICODE v2.1.8, SO THIS TEST IS OBSOLETE
113     String[][] hangulCompat = {
114         // Input        Decomposed                          Composed
115         // { "\ud4db",     "\u1111\u116e\u1175\u11af\u11c2",   "\ud478\u1175\u11af\u11c2"  },
116     };
117 
TestHangulCompose()118     public void TestHangulCompose()
119                 throws Exception{
120         // Make sure that the static composition methods work
121         logln("Canonical composition...");
122         staticTest(Normalizer.NFC, hangulCanon,  2);
123         logln("Compatibility composition...");
124         staticTest(Normalizer.NFKC, hangulCompat, 2);
125         // Now try iterative composition....
126         logln("Iterative composition...");
127         Normalizer norm = new Normalizer("", Normalizer.NFC,0);
128         iterateTest(norm, hangulCanon, 2);
129 
130         norm.setMode(Normalizer.NFKD);
131         iterateTest(norm, hangulCompat, 2);
132 
133         // And finally, make sure you can do it in reverse too
134         logln("Reverse iteration...");
135         norm.setMode(Normalizer.NFC);
136         backAndForth(norm, hangulCanon);
137      }
138 
TestHangulDecomp()139     public void TestHangulDecomp() throws Exception{
140         // Make sure that the static decomposition methods work
141         logln("Canonical decomposition...");
142         staticTest(Normalizer.NFD, hangulCanon,  1);
143         logln("Compatibility decomposition...");
144         staticTest(Normalizer.NFKD, hangulCompat, 1);
145 
146          // Now the iterative decomposition methods...
147         logln("Iterative decomposition...");
148         Normalizer norm = new Normalizer("", Normalizer.NFD,0);
149         iterateTest(norm, hangulCanon, 1);
150 
151         norm.setMode(Normalizer.NFKD);
152         iterateTest(norm, hangulCompat, 1);
153 
154         // And finally, make sure you can do it in reverse too
155         logln("Reverse iteration...");
156         norm.setMode(Normalizer.NFD);
157         backAndForth(norm, hangulCanon);
158     }
TestNone()159     public void TestNone() throws Exception{
160         Normalizer norm = new Normalizer("", Normalizer.NONE,0);
161         iterateTest(norm, canonTests, 0);
162         staticTest(Normalizer.NONE, canonTests, 0);
163     }
TestDecomp()164     public void TestDecomp() throws Exception{
165         Normalizer norm = new Normalizer("", Normalizer.NFD,0);
166         iterateTest(norm, canonTests, 1);
167         staticTest(Normalizer.NFD, canonTests, 1);
168         decomposeTest(Normalizer.NFD, canonTests, 1);
169     }
170 
TestCompatDecomp()171     public void TestCompatDecomp() throws Exception{
172         Normalizer norm = new Normalizer("", Normalizer.NFKD,0);
173         iterateTest(norm, compatTests, 1);
174         staticTest(Normalizer.NFKD,compatTests, 1);
175         decomposeTest(Normalizer.NFKD,compatTests, 1);
176     }
177 
TestCanonCompose()178     public void TestCanonCompose() throws Exception{
179         Normalizer norm = new Normalizer("", Normalizer.NFC,0);
180         iterateTest(norm, canonTests, 2);
181         staticTest(Normalizer.NFC, canonTests, 2);
182         composeTest(Normalizer.NFC, canonTests, 2);
183     }
184 
TestCompatCompose()185     public void TestCompatCompose() throws Exception{
186         Normalizer norm = new Normalizer("", Normalizer.NFKC,0);
187         iterateTest(norm, compatTests, 2);
188         staticTest(Normalizer.NFKC,compatTests, 2);
189         composeTest(Normalizer.NFKC,compatTests, 2);
190     }
191 
TestExplodingBase()192     public void TestExplodingBase() throws Exception{
193         // \u017f - Latin small letter long s
194         // \u0307 - combining dot above
195         // \u1e61 - Latin small letter s with dot above
196         // \u1e9b - Latin small letter long s with dot above
197         String[][] canon = {
198             // Input                Decomposed              Composed
199             { "Tschu\u017f",        "Tschu\u017f",          "Tschu\u017f"    },
200             { "Tschu\u1e9b",        "Tschu\u017f\u0307",    "Tschu\u1e9b"    },
201         };
202         String[][] compat = {
203             // Input                Decomposed              Composed
204             { "\u017f",        "s",              "s"           },
205             { "\u1e9b",        "s\u0307",        "\u1e61"      },
206         };
207 
208         staticTest(Normalizer.NFD, canon,  1);
209         staticTest(Normalizer.NFC, canon,  2);
210 
211         staticTest(Normalizer.NFKD, compat, 1);
212         staticTest(Normalizer.NFKC, compat, 2);
213 
214     }
215 
    /**
     * The Tibetan vowel sign AA, 0f71, was messed up prior to
     * Unicode version 2.1.9.
     * This test was meant to stay disabled until 2.1.9 or 3.0 was released;
     * it is enabled now.
     */
TestTibetan()221     public void TestTibetan() throws Exception{
222         String[][] decomp = {
223             { "\u0f77", "\u0f77", "\u0fb2\u0f71\u0f80" }
224         };
225         String[][] compose = {
226             { "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80" }
227         };
228 
229         staticTest(Normalizer.NFD, decomp, 1);
230         staticTest(Normalizer.NFKD,decomp, 2);
231         staticTest(Normalizer.NFC, compose, 1);
232         staticTest(Normalizer.NFKC,compose, 2);
233     }
234 
235     /**
236      * Make sure characters in the CompositionExclusion.txt list do not get
237      * composed to.
238      */
TestCompositionExclusion()239     public void TestCompositionExclusion()
240                 throws Exception{
241         // This list is generated from CompositionExclusion.txt.
242         // Update whenever the normalizer tables are updated.  Note
243         // that we test all characters listed, even those that can be
244         // derived from the Unicode DB and are therefore commented
245         // out.
246         String EXCLUDED =
247             "\u0340\u0341\u0343\u0344\u0374\u037E\u0387\u0958" +
248             "\u0959\u095A\u095B\u095C\u095D\u095E\u095F\u09DC" +
249             "\u09DD\u09DF\u0A33\u0A36\u0A59\u0A5A\u0A5B\u0A5E" +
250             "\u0B5C\u0B5D\u0F43\u0F4D\u0F52\u0F57\u0F5C\u0F69" +
251             "\u0F73\u0F75\u0F76\u0F78\u0F81\u0F93\u0F9D\u0FA2" +
252             "\u0FA7\u0FAC\u0FB9\u1F71\u1F73\u1F75\u1F77\u1F79" +
253             "\u1F7B\u1F7D\u1FBB\u1FBE\u1FC9\u1FCB\u1FD3\u1FDB" +
254             "\u1FE3\u1FEB\u1FEE\u1FEF\u1FF9\u1FFB\u1FFD\u2000" +
255             "\u2001\u2126\u212A\u212B\u2329\u232A\uF900\uFA10" +
256             "\uFA12\uFA15\uFA20\uFA22\uFA25\uFA26\uFA2A\uFB1F" +
257             "\uFB2A\uFB2B\uFB2C\uFB2D\uFB2E\uFB2F\uFB30\uFB31" +
258             "\uFB32\uFB33\uFB34\uFB35\uFB36\uFB38\uFB39\uFB3A" +
259             "\uFB3B\uFB3C\uFB3E\uFB40\uFB41\uFB43\uFB44\uFB46" +
260             "\uFB47\uFB48\uFB49\uFB4A\uFB4B\uFB4C\uFB4D\uFB4E";
261         for (int i=0; i<EXCLUDED.length(); ++i) {
262             String a = String.valueOf(EXCLUDED.charAt(i));
263             String b = Normalizer.normalize(a, Normalizer.NFKD);
264             String c = Normalizer.normalize(b, Normalizer.NFC);
265             if (c.equals(a)) {
266                 errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " +
267                       hex(b) + " x COMPOSE => " +
268                       hex(c));
269             } else if (isVerbose()) {
270                 logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " +
271                       hex(b) + " x COMPOSE => " +
272                       hex(c));
273             }
274         }
275         // The following method works too, but it is somewhat
276         // incestuous.  It uses UInfo, which is the same database that
277         // NormalizerBuilder uses, so if something is wrong with
278         // UInfo, the following test won't show it.  All it will show
279         // is that NormalizerBuilder has been run with whatever the
280         // current UInfo is.
281         //
282         // We comment this out in favor of the test above, which
283         // provides independent verification (but also requires
284         // independent updating).
285 //      logln("---");
286 //      UInfo uinfo = new UInfo();
287 //      for (int i=0; i<=0xFFFF; ++i) {
288 //          if (!uinfo.isExcludedComposition((char)i) ||
289 //              (!uinfo.hasCanonicalDecomposition((char)i) &&
290 //               !uinfo.hasCompatibilityDecomposition((char)i))) continue;
291 //          String a = String.valueOf((char)i);
292 //          String b = Normalizer.normalize(a,Normalizer.DECOMP_COMPAT,0);
293 //          String c = Normalizer.normalize(b,Normalizer.COMPOSE,0);
294 //          if (c.equals(a)) {
295 //              errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " +
296 //                    hex(b) + " x COMPOSE => " +
297 //                    hex(c));
298 //          } else if (isVerbose()) {
299 //              logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " +
300 //                    hex(b) + " x COMPOSE => " +
301 //                    hex(c));
302 //          }
303 //      }
304     }
305 
306     /**
307      * Test for a problem that showed up just before ICU 1.6 release
308      * having to do with combining characters with an index of zero.
309      * Such characters do not participate in any canonical
310      * decompositions.  However, having an index of zero means that
311      * they all share one typeMask[] entry, that is, they all have to
312      * map to the same canonical class, which is not the case, in
313      * reality.
314      */
TestZeroIndex()315     public void TestZeroIndex()
316                 throws Exception{
317         String[] DATA = {
318             // Expect col1 x COMPOSE_COMPAT => col2
319             // Expect col2 x DECOMP => col3
320             "A\u0316\u0300", "\u00C0\u0316", "A\u0316\u0300",
321             "A\u0300\u0316", "\u00C0\u0316", "A\u0316\u0300",
322             "A\u0327\u0300", "\u00C0\u0327", "A\u0327\u0300",
323             "c\u0321\u0327", "c\u0321\u0327", "c\u0321\u0327",
324             "c\u0327\u0321", "\u00E7\u0321", "c\u0327\u0321",
325         };
326 
327         for (int i=0; i<DATA.length; i+=3) {
328             String a = DATA[i];
329             String b = Normalizer.normalize(a, Normalizer.NFKC);
330             String exp = DATA[i+1];
331             if (b.equals(exp)) {
332                 logln("Ok: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b));
333             } else {
334                 errln("FAIL: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b) +
335                       ", expect " + hex(exp));
336             }
337             a = Normalizer.normalize(b, Normalizer.NFD);
338             exp = DATA[i+2];
339             if (a.equals(exp)) {
340                 logln("Ok: " + hex(b) + " x DECOMP => " + hex(a));
341             } else {
342                 errln("FAIL: " + hex(b) + " x DECOMP => " + hex(a) +
343                       ", expect " + hex(exp));
344             }
345         }
346     }
347 
348     /**
349      * Test for a problem found by Verisign.  Problem is that
350      * characters at the start of a string are not put in canonical
351      * order correctly by compose() if there is no starter.
352      */
TestVerisign()353     public void TestVerisign()
354                 throws Exception{
355         String[] inputs = {
356             "\u05b8\u05b9\u05b1\u0591\u05c3\u05b0\u05ac\u059f",
357             "\u0592\u05b7\u05bc\u05a5\u05b0\u05c0\u05c4\u05ad"
358         };
359         String[] outputs = {
360             "\u05b1\u05b8\u05b9\u0591\u05c3\u05b0\u05ac\u059f",
361             "\u05b0\u05b7\u05bc\u05a5\u0592\u05c0\u05ad\u05c4"
362         };
363 
364         for (int i = 0; i < inputs.length; ++i) {
365             String input = inputs[i];
366             String output = outputs[i];
367             String result = Normalizer.decompose(input, false);
368             if (!result.equals(output)) {
369                 errln("FAIL input: " + hex(input));
370                 errln(" decompose: " + hex(result));
371                 errln("  expected: " + hex(output));
372             }
373             result = Normalizer.compose(input, false);
374             if (!result.equals(output)) {
375                 errln("FAIL input: " + hex(input));
376                 errln("   compose: " + hex(result));
377                 errln("  expected: " + hex(output));
378             }
379         }
380 
381     }
TestQuickCheckResultNO()382     public void  TestQuickCheckResultNO()
383                  throws Exception{
384         final char CPNFD[] = {0x00C5, 0x0407, 0x1E00, 0x1F57, 0x220C,
385                                 0x30AE, 0xAC00, 0xD7A3, 0xFB36, 0xFB4E};
386         final char CPNFC[] = {0x0340, 0x0F93, 0x1F77, 0x1FBB, 0x1FEB,
387                                 0x2000, 0x232A, 0xF900, 0xFA1E, 0xFB4E};
388         final char CPNFKD[] = {0x00A0, 0x02E4, 0x1FDB, 0x24EA, 0x32FE,
389                                 0xAC00, 0xFB4E, 0xFA10, 0xFF3F, 0xFA2D};
390         final char CPNFKC[] = {0x00A0, 0x017F, 0x2000, 0x24EA, 0x32FE,
391                                 0x33FE, 0xFB4E, 0xFA10, 0xFF3F, 0xFA2D};
392 
393 
394         final int SIZE = 10;
395 
396         int count = 0;
397         for (; count < SIZE; count ++)
398         {
399             if (Normalizer.quickCheck(String.valueOf(CPNFD[count]),
400                     Normalizer.NFD,0) != Normalizer.NO)
401             {
402                 errln("ERROR in NFD quick check at U+" +
403                        Integer.toHexString(CPNFD[count]));
404                 return;
405             }
406             if (Normalizer.quickCheck(String.valueOf(CPNFC[count]),
407                         Normalizer.NFC,0) !=Normalizer.NO)
408             {
409                 errln("ERROR in NFC quick check at U+"+
410                        Integer.toHexString(CPNFC[count]));
411                 return;
412             }
413             if (Normalizer.quickCheck(String.valueOf(CPNFKD[count]),
414                                 Normalizer.NFKD,0) != Normalizer.NO)
415             {
416                 errln("ERROR in NFKD quick check at U+"+
417                        Integer.toHexString(CPNFKD[count]));
418                 return;
419             }
420             if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),
421                                          Normalizer.NFKC,0) !=Normalizer.NO)
422             {
423                 errln("ERROR in NFKC quick check at U+"+
424                        Integer.toHexString(CPNFKC[count]));
425                 return;
426             }
427             // for improving coverage
428             if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),
429                                          Normalizer.NFKC) !=Normalizer.NO)
430             {
431                 errln("ERROR in NFKC quick check at U+"+
432                        Integer.toHexString(CPNFKC[count]));
433                 return;
434             }
435         }
436     }
437 
438 
TestQuickCheckResultYES()439     public void TestQuickCheckResultYES()
440                 throws Exception{
441         final char CPNFD[] = {0x00C6, 0x017F, 0x0F74, 0x1000, 0x1E9A,
442                                 0x2261, 0x3075, 0x4000, 0x5000, 0xF000};
443         final char CPNFC[] = {0x0400, 0x0540, 0x0901, 0x1000, 0x1500,
444                                 0x1E9A, 0x3000, 0x4000, 0x5000, 0xF000};
445         final char CPNFKD[] = {0x00AB, 0x02A0, 0x1000, 0x1027, 0x2FFB,
446                                 0x3FFF, 0x4FFF, 0xA000, 0xF000, 0xFA27};
447         final char CPNFKC[] = {0x00B0, 0x0100, 0x0200, 0x0A02, 0x1000,
448                                 0x2010, 0x3030, 0x4000, 0xA000, 0xFA0E};
449 
450         final int SIZE = 10;
451         int count = 0;
452 
453         char cp = 0;
454         while (cp < 0xA0)
455         {
456             if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFD,0)
457                                             != Normalizer.YES)
458             {
459                 errln("ERROR in NFD quick check at U+"+
460                                                       Integer.toHexString(cp));
461                 return;
462             }
463             if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFC,0)
464                                              != Normalizer.YES)
465             {
466                 errln("ERROR in NFC quick check at U+"+
467                                                       Integer.toHexString(cp));
468                 return;
469             }
470             if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKD,0)
471                                              != Normalizer.YES)
472             {
473                 errln("ERROR in NFKD quick check at U+" +
474                                                       Integer.toHexString(cp));
475                 return;
476             }
477             if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKC,0)
478                                              != Normalizer.YES)
479             {
480                 errln("ERROR in NFKC quick check at U+"+
481                                                        Integer.toHexString(cp));
482                 return;
483             }
484             // improve the coverage
485             if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKC)
486                                              != Normalizer.YES)
487             {
488                 errln("ERROR in NFKC quick check at U+"+
489                                                        Integer.toHexString(cp));
490                 return;
491             }
492             cp++;
493         }
494 
495         for (; count < SIZE; count ++)
496         {
497             if (Normalizer.quickCheck(String.valueOf(CPNFD[count]),
498                                          Normalizer.NFD,0)!=Normalizer.YES)
499             {
500                 errln("ERROR in NFD quick check at U+"+
501                                              Integer.toHexString(CPNFD[count]));
502                 return;
503             }
504             if (Normalizer.quickCheck(String.valueOf(CPNFC[count]),
505                                          Normalizer.NFC,0)!=Normalizer.YES)
506             {
507                 errln("ERROR in NFC quick check at U+"+
508                                              Integer.toHexString(CPNFC[count]));
509                 return;
510             }
511             if (Normalizer.quickCheck(String.valueOf(CPNFKD[count]),
512                                          Normalizer.NFKD,0)!=Normalizer.YES)
513             {
514                 errln("ERROR in NFKD quick check at U+"+
515                                     Integer.toHexString(CPNFKD[count]));
516                 return;
517             }
518             if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),
519                                          Normalizer.NFKC,0)!=Normalizer.YES)
520             {
521                 errln("ERROR in NFKC quick check at U+"+
522                         Integer.toHexString(CPNFKC[count]));
523                 return;
524             }
525             // improve the coverage
526             if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),
527                                          Normalizer.NFKC)!=Normalizer.YES)
528             {
529                 errln("ERROR in NFKC quick check at U+"+
530                         Integer.toHexString(CPNFKC[count]));
531                 return;
532             }
533         }
534     }
TestBengali()535     public void TestBengali() throws Exception{
536         String input = "\u09bc\u09be\u09cd\u09be";
537         String output=Normalizer.normalize(input,Normalizer.NFC);
538         if(!input.equals(output)){
539              errln("ERROR in NFC of string");
540         }
541     }
TestQuickCheckResultMAYBE()542     public void TestQuickCheckResultMAYBE()
543                 throws Exception{
544 
545         final char[] CPNFC = {0x0306, 0x0654, 0x0BBE, 0x102E, 0x1161,
546                                 0x116A, 0x1173, 0x1175, 0x3099, 0x309A};
547         final char[] CPNFKC = {0x0300, 0x0654, 0x0655, 0x09D7, 0x0B3E,
548                                 0x0DCF, 0xDDF, 0x102E, 0x11A8, 0x3099};
549 
550 
551         final int SIZE = 10;
552 
553         int count = 0;
554 
555         /* NFD and NFKD does not have any MAYBE codepoints */
556         for (; count < SIZE; count ++)
557         {
558             if (Normalizer.quickCheck(String.valueOf(CPNFC[count]),
559                                         Normalizer.NFC,0)!=Normalizer.MAYBE)
560             {
561                 errln("ERROR in NFC quick check at U+"+
562                                             Integer.toHexString(CPNFC[count]));
563                 return;
564             }
565             if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]),
566                                        Normalizer.NFKC,0)!=Normalizer.MAYBE)
567             {
568                 errln("ERROR in NFKC quick check at U+"+
569                                             Integer.toHexString(CPNFKC[count]));
570                 return;
571             }
572             if (Normalizer.quickCheck(new char[]{CPNFC[count]},
573                                         Normalizer.NFC,0)!=Normalizer.MAYBE)
574             {
575                 errln("ERROR in NFC quick check at U+"+
576                                             Integer.toHexString(CPNFC[count]));
577                 return;
578             }
579             if (Normalizer.quickCheck(new char[]{CPNFKC[count]},
580                                        Normalizer.NFKC,0)!=Normalizer.MAYBE)
581             {
582                 errln("ERROR in NFKC quick check at U+"+
583                                             Integer.toHexString(CPNFKC[count]));
584                 return;
585             }
586             if (Normalizer.quickCheck(new char[]{CPNFKC[count]},
587                                        Normalizer.NONE,0)!=Normalizer.YES)
588             {
589                 errln("ERROR in NONE quick check at U+"+
590                                             Integer.toHexString(CPNFKC[count]));
591                 return;
592             }
593         }
594     }
595 
TestQuickCheckStringResult()596     public void TestQuickCheckStringResult()
597                 throws Exception{
598         int count;
599         String d;
600         String c;
601 
602         for (count = 0; count < canonTests.length; count ++)
603         {
604             d = canonTests[count][1];
605             c = canonTests[count][2];
606             if (Normalizer.quickCheck(d,Normalizer.NFD,0)
607                                             != Normalizer.YES)
608             {
609                 errln("ERROR in NFD quick check for string at count " + count);
610                 return;
611             }
612 
613             if (Normalizer.quickCheck(c, Normalizer.NFC,0)
614                                             == Normalizer.NO)
615             {
616                 errln("ERROR in NFC quick check for string at count " + count);
617                 return;
618             }
619         }
620 
621         for (count = 0; count < compatTests.length; count ++)
622         {
623             d = compatTests[count][1];
624             c = compatTests[count][2];
625             if (Normalizer.quickCheck(d, Normalizer.NFKD,0)
626                                             != Normalizer.YES)
627             {
628                 errln("ERROR in NFKD quick check for string at count " + count);
629                 return;
630             }
631 
632             if (Normalizer.quickCheck(c,  Normalizer.NFKC,0)
633                                             != Normalizer.YES)
634             {
635                 errln("ERROR in NFKC quick check for string at count " + count);
636                 return;
637             }
638         }
639     }
640 
qcToInt(Normalizer.QuickCheckResult qc)641     static final int qcToInt(Normalizer.QuickCheckResult qc) {
642         if(qc==Normalizer.NO) {
643             return 0;
644         } else if(qc==Normalizer.YES) {
645             return 1;
646         } else /* Normalizer.MAYBE */ {
647             return 2;
648         }
649     }
650 
TestQuickCheckPerCP()651     public void TestQuickCheckPerCP() {
652         int c, lead, trail;
653         String s, nfd;
654         int lccc1, lccc2, tccc1, tccc2;
655         int qc1, qc2;
656 
657         if(
658             UCharacter.getIntPropertyMaxValue(UProperty.NFD_QUICK_CHECK)!=1 || // YES
659             UCharacter.getIntPropertyMaxValue(UProperty.NFKD_QUICK_CHECK)!=1 ||
660             UCharacter.getIntPropertyMaxValue(UProperty.NFC_QUICK_CHECK)!=2 || // MAYBE
661             UCharacter.getIntPropertyMaxValue(UProperty.NFKC_QUICK_CHECK)!=2 ||
662             UCharacter.getIntPropertyMaxValue(UProperty.LEAD_CANONICAL_COMBINING_CLASS)!=UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS) ||
663             UCharacter.getIntPropertyMaxValue(UProperty.TRAIL_CANONICAL_COMBINING_CLASS)!=UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS)
664         ) {
665             errln("wrong result from one of the u_getIntPropertyMaxValue(UCHAR_NF*_QUICK_CHECK) or UCHAR_*_CANONICAL_COMBINING_CLASS");
666         }
667 
668         /*
669          * compare the quick check property values for some code points
670          * to the quick check results for checking same-code point strings
671          */
672         c=0;
673         while(c<0x110000) {
674             s=UTF16.valueOf(c);
675 
676             qc1=UCharacter.getIntPropertyValue(c, UProperty.NFC_QUICK_CHECK);
677             qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFC));
678             if(qc1!=qc2) {
679                 errln("getIntPropertyValue(NFC)="+qc1+" != "+qc2+"=quickCheck(NFC) for U+"+Integer.toHexString(c));
680             }
681 
682             qc1=UCharacter.getIntPropertyValue(c, UProperty.NFD_QUICK_CHECK);
683             qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFD));
684             if(qc1!=qc2) {
685                 errln("getIntPropertyValue(NFD)="+qc1+" != "+qc2+"=quickCheck(NFD) for U+"+Integer.toHexString(c));
686             }
687 
688             qc1=UCharacter.getIntPropertyValue(c, UProperty.NFKC_QUICK_CHECK);
689             qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFKC));
690             if(qc1!=qc2) {
691                 errln("getIntPropertyValue(NFKC)="+qc1+" != "+qc2+"=quickCheck(NFKC) for U+"+Integer.toHexString(c));
692             }
693 
694             qc1=UCharacter.getIntPropertyValue(c, UProperty.NFKD_QUICK_CHECK);
695             qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFKD));
696             if(qc1!=qc2) {
697                 errln("getIntPropertyValue(NFKD)="+qc1+" != "+qc2+"=quickCheck(NFKD) for U+"+Integer.toHexString(c));
698             }
699 
700             nfd=Normalizer.normalize(s, Normalizer.NFD);
701             lead=UTF16.charAt(nfd, 0);
702             trail=UTF16.charAt(nfd, nfd.length()-1);
703 
704             lccc1=UCharacter.getIntPropertyValue(c, UProperty.LEAD_CANONICAL_COMBINING_CLASS);
705             lccc2=UCharacter.getCombiningClass(lead);
706             tccc1=UCharacter.getIntPropertyValue(c, UProperty.TRAIL_CANONICAL_COMBINING_CLASS);
707             tccc2=UCharacter.getCombiningClass(trail);
708 
709             if(lccc1!=lccc2) {
710                 errln("getIntPropertyValue(lccc)="+lccc1+" != "+lccc2+"=getCombiningClass(lead) for U+"+Integer.toHexString(c));
711             }
712             if(tccc1!=tccc2) {
713                 errln("getIntPropertyValue(tccc)="+tccc1+" != "+tccc2+"=getCombiningClass(trail) for U+"+Integer.toHexString(c));
714             }
715 
716             /* skip some code points */
717             c=(20*c)/19+1;
718         }
719     }
720 
    //------------------------------------------------------------------------
    // Internal utilities
    //
727 
728 /*    private void backAndForth(Normalizer iter, String input)
729     {
730         iter.setText(input);
731 
732         // Run through the iterator forwards and stick it into a StringBuffer
733         StringBuffer forward =  new StringBuffer();
734         for (int ch = iter.first(); ch != Normalizer.DONE; ch = iter.next()) {
735             forward.append(ch);
736         }
737 
738         // Now do it backwards
739         StringBuffer reverse = new StringBuffer();
740         for (int ch = iter.last(); ch != Normalizer.DONE; ch = iter.previous()) {
741             reverse.insert(0, ch);
742         }
743 
744         if (!forward.toString().equals(reverse.toString())) {
745             errln("FAIL: Forward/reverse mismatch for input " + hex(input)
746                   + ", forward: " + hex(forward) + ", backward: "+hex(reverse));
747         } else if (isVerbose()) {
748             logln("Ok: Forward/reverse for input " + hex(input)
749                   + ", forward: " + hex(forward) + ", backward: "+hex(reverse));
750         }
751     }*/
752 
backAndForth(Normalizer iter, String[][] tests)753     private void backAndForth(Normalizer iter, String[][] tests)
754     {
755         for (int i = 0; i < tests.length; i++)
756         {
757             iter.setText(tests[i][0]);
758 
759             // Run through the iterator forwards and stick it into a
760             // StringBuffer
761             StringBuffer forward =  new StringBuffer();
762             for (int ch = iter.first(); ch != Normalizer.DONE; ch = iter.next()) {
763                 forward.append(ch);
764             }
765 
766             // Now do it backwards
767             StringBuffer reverse = new StringBuffer();
768             for (int ch = iter.last(); ch != Normalizer.DONE; ch = iter.previous()) {
769                 reverse.insert(0, ch);
770             }
771 
772             if (!forward.toString().equals(reverse.toString())) {
773                 errln("FAIL: Forward/reverse mismatch for input "
774                     + hex(tests[i][0]) + ", forward: " + hex(forward)
775                     + ", backward: " + hex(reverse));
776             } else if (isVerbose()) {
777                 logln("Ok: Forward/reverse for input " + hex(tests[i][0])
778                       + ", forward: " + hex(forward) + ", backward: "
779                       + hex(reverse));
780             }
781         }
782     }
783 
staticTest(Normalizer.Mode mode, String[][] tests, int outCol)784     private void staticTest (Normalizer.Mode mode,
785                              String[][] tests, int outCol) throws Exception{
786         for (int i = 0; i < tests.length; i++)
787         {
788             String input = Utility.unescape(tests[i][0]);
789             String expect = Utility.unescape(tests[i][outCol]);
790 
791             logln("Normalizing '" + input + "' (" + hex(input) + ")" );
792 
793             String output = Normalizer.normalize(input, mode);
794 
795             if (!output.equals(expect)) {
796                 errln("FAIL: case " + i
797                     + " expected '" + expect + "' (" + hex(expect) + ")"
798                     + " but got '" + output + "' (" + hex(output) + ")" );
799             }
800         }
801         char[] output = new char[1];
802         for (int i = 0; i < tests.length; i++)
803         {
804             char[] input = Utility.unescape(tests[i][0]).toCharArray();
805             String expect =Utility.unescape( tests[i][outCol]);
806 
807             logln("Normalizing '" + new String(input) + "' (" +
808                         hex(new String(input)) + ")" );
809             int reqLength=0;
810             while(true){
811                 try{
812                     reqLength=Normalizer.normalize(input,output, mode,0);
813                     if(reqLength<=output.length    ){
814                         break;
815                     }
816                 }catch(IndexOutOfBoundsException e){
817                     output= new char[Integer.parseInt(e.getMessage())];
818                     continue;
819                 }
820             }
821             if (!expect.equals(new String(output,0,reqLength))) {
822                 errln("FAIL: case " + i
823                     + " expected '" + expect + "' (" + hex(expect) + ")"
824                     + " but got '" + new String(output)
825                     + "' ("  + hex(new String(output)) + ")" );
826             }
827         }
828     }
decomposeTest(Normalizer.Mode mode, String[][] tests, int outCol)829     private void decomposeTest(Normalizer.Mode mode,
830                              String[][] tests, int outCol) throws Exception{
831         for (int i = 0; i < tests.length; i++)
832         {
833             String input = Utility.unescape(tests[i][0]);
834             String expect = Utility.unescape(tests[i][outCol]);
835 
836             logln("Normalizing '" + input + "' (" + hex(input) + ")" );
837 
838             String output = Normalizer.decompose(input, mode==Normalizer.NFKD);
839 
840             if (!output.equals(expect)) {
841                 errln("FAIL: case " + i
842                     + " expected '" + expect + "' (" + hex(expect) + ")"
843                     + " but got '" + output + "' (" + hex(output) + ")" );
844             }
845         }
846         char[] output = new char[1];
847         for (int i = 0; i < tests.length; i++)
848         {
849             char[] input = Utility.unescape(tests[i][0]).toCharArray();
850             String expect = Utility.unescape(tests[i][outCol]);
851 
852             logln("Normalizing '" + new String(input) + "' (" +
853                         hex(new String(input)) + ")" );
854             int reqLength=0;
855             while(true){
856                 try{
857                     reqLength=Normalizer.decompose(input,output, mode==Normalizer.NFKD,0);
858                     if(reqLength<=output.length ){
859                         break;
860                     }
861                 }catch(IndexOutOfBoundsException e){
862                     output= new char[Integer.parseInt(e.getMessage())];
863                     continue;
864                 }
865             }
866             if (!expect.equals(new String(output,0,reqLength))) {
867                 errln("FAIL: case " + i
868                     + " expected '" + expect + "' (" + hex(expect) + ")"
869                     + " but got '" + new String(output)
870                     + "' ("  + hex(new String(output)) + ")" );
871             }
872         }
873         output = new char[1];
874         for (int i = 0; i < tests.length; i++)
875         {
876            char[] input = Utility.unescape(tests[i][0]).toCharArray();
877            String expect = Utility.unescape(tests[i][outCol]);
878 
879            logln("Normalizing '" + new String(input) + "' (" +
880                        hex(new String(input)) + ")" );
881            int reqLength=0;
882            while(true){
883                try{
884                    reqLength=Normalizer.decompose(input,0,input.length,output,0,output.length, mode==Normalizer.NFKD,0);
885                    if(reqLength<=output.length ){
886                        break;
887                    }
888                }catch(IndexOutOfBoundsException e){
889                    output= new char[Integer.parseInt(e.getMessage())];
890                    continue;
891                }
892            }
893            if (!expect.equals(new String(output,0,reqLength))) {
894                errln("FAIL: case " + i
895                    + " expected '" + expect + "' (" + hex(expect) + ")"
896                    + " but got '" + new String(output)
897                    + "' ("  + hex(new String(output)) + ")" );
898            }
899            char[] output2 = new char[reqLength * 2];
900            System.arraycopy(output, 0, output2, 0, reqLength);
901            int retLength = Normalizer.decompose(input,0,input.length, output2, reqLength, output2.length, mode==Normalizer.NFKC,0);
902            if(retLength != reqLength){
903                logln("FAIL: Normalizer.compose did not return the expected length. Expected: " +reqLength + " Got: " + retLength);
904            }
905         }
906     }
907 
composeTest(Normalizer.Mode mode, String[][] tests, int outCol)908     private void composeTest(Normalizer.Mode mode,
909                              String[][] tests, int outCol) throws Exception{
910         for (int i = 0; i < tests.length; i++)
911         {
912             String input = Utility.unescape(tests[i][0]);
913             String expect = Utility.unescape(tests[i][outCol]);
914 
915             logln("Normalizing '" + input + "' (" + hex(input) + ")" );
916 
917             String output = Normalizer.compose(input, mode==Normalizer.NFKC);
918 
919             if (!output.equals(expect)) {
920                 errln("FAIL: case " + i
921                     + " expected '" + expect + "' (" + hex(expect) + ")"
922                     + " but got '" + output + "' (" + hex(output) + ")" );
923             }
924         }
925         char[] output = new char[1];
926         for (int i = 0; i < tests.length; i++)
927         {
928             char[] input = Utility.unescape(tests[i][0]).toCharArray();
929             String expect = Utility.unescape(tests[i][outCol]);
930 
931             logln("Normalizing '" + new String(input) + "' (" +
932                         hex(new String(input)) + ")" );
933             int reqLength=0;
934             while(true){
935                 try{
936                     reqLength=Normalizer.compose(input,output, mode==Normalizer.NFKC,0);
937                     if(reqLength<=output.length ){
938                         break;
939                     }
940                 }catch(IndexOutOfBoundsException e){
941                     output= new char[Integer.parseInt(e.getMessage())];
942                     continue;
943                 }
944             }
945             if (!expect.equals(new String(output,0,reqLength))) {
946                 errln("FAIL: case " + i
947                     + " expected '" + expect + "' (" + hex(expect) + ")"
948                     + " but got '" + new String(output)
949                     + "' ("  + hex(new String(output)) + ")" );
950             }
951         }
952         output = new char[1];
953         for (int i = 0; i < tests.length; i++)
954         {
955             char[] input = Utility.unescape(tests[i][0]).toCharArray();
956             String expect = Utility.unescape(tests[i][outCol]);
957 
958             logln("Normalizing '" + new String(input) + "' (" +
959                         hex(new String(input)) + ")" );
960             int reqLength=0;
961             while(true){
962                 try{
963                     reqLength=Normalizer.compose(input,0,input.length, output, 0, output.length, mode==Normalizer.NFKC,0);
964                     if(reqLength<=output.length ){
965                         break;
966                     }
967                 }catch(IndexOutOfBoundsException e){
968                     output= new char[Integer.parseInt(e.getMessage())];
969                     continue;
970                 }
971             }
972             if (!expect.equals(new String(output,0,reqLength))) {
973                 errln("FAIL: case " + i
974                     + " expected '" + expect + "' (" + hex(expect) + ")"
975                     + " but got '" + new String(output)
976                     + "' ("  + hex(new String(output)) + ")" );
977             }
978 
979             char[] output2 = new char[reqLength * 2];
980             System.arraycopy(output, 0, output2, 0, reqLength);
981             int retLength = Normalizer.compose(input,0,input.length, output2, reqLength, output2.length, mode==Normalizer.NFKC,0);
982             if(retLength != reqLength){
983                 logln("FAIL: Normalizer.compose did not return the expected length. Expected: " +reqLength + " Got: " + retLength);
984             }
985         }
986     }
iterateTest(Normalizer iter, String[][] tests, int outCol)987     private void iterateTest(Normalizer iter, String[][] tests, int outCol){
988         for (int i = 0; i < tests.length; i++)
989         {
990             String input = Utility.unescape(tests[i][0]);
991             String expect = Utility.unescape(tests[i][outCol]);
992 
993             logln("Normalizing '" + input + "' (" + hex(input) + ")" );
994 
995             iter.setText(input);
996             assertEqual(expect, iter, "case " + i + " ");
997         }
998     }
999 
assertEqual(String expected, Normalizer iter, String msg)1000     private void assertEqual(String expected, Normalizer iter, String msg)
1001     {
1002         int index = 0;
1003         int ch;
1004         UCharacterIterator cIter =  UCharacterIterator.getInstance(expected);
1005 
1006         while ((ch=iter.next())!= Normalizer.DONE){
1007             if (index >= expected.length()) {
1008                 errln("FAIL: " + msg + "Unexpected character '" + (char)ch
1009                         + "' (" + hex(ch) + ")"
1010                         + " at index " + index);
1011                 break;
1012             }
1013             int want = UTF16.charAt(expected,index);
1014             if (ch != want) {
1015                 errln("FAIL: " + msg + "got '" + (char)ch
1016                         + "' (" + hex(ch) + ")"
1017                         + " but expected '" + want + "' (" + hex(want)+ ")"
1018                         + " at index " + index);
1019             }
1020             index+=  UTF16.getCharCount(ch);
1021         }
1022         if (index < expected.length()) {
1023             errln("FAIL: " + msg + "Only got " + index + " chars, expected "
1024             + expected.length());
1025         }
1026 
1027         cIter.setToLimit();
1028         while((ch=iter.previous())!=Normalizer.DONE){
1029             int want = cIter.previousCodePoint();
1030             if (ch != want ) {
1031                 errln("FAIL: " + msg + "got '" + (char)ch
1032                         + "' (" + hex(ch) + ")"
1033                         + " but expected '" + want + "' (" + hex(want) + ")"
1034                         + " at index " + index);
1035             }
1036         }
1037     }
1038     //--------------------------------------------------------------------------
1039 
1040     // NOTE: These tests are used for quick debugging so are not ported
1041     // to ICU4C tsnorm.cpp in intltest
1042     //
1043 
TestDebugStatic()1044     public void TestDebugStatic(){
1045         String in = Utility.unescape("\\U0001D157\\U0001D165");
1046         if(!Normalizer.isNormalized(in,Normalizer.NFC,0)){
1047             errln("isNormalized failed");
1048         }
1049 
1050         String input  =  "\uAD8B\uAD8B\uAD8B\uAD8B"+
1051             "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1052             "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1053             "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1054             "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1055             "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1056             "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+
1057             "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+
1058             "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+
1059             "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+
1060             "\uAD8B\uAD8B\uAD8B\uAD8B"+
1061             "d\u031B\u0307\u0323";
1062         String expect = "\u1100\u116F\u11AA\u1100\u116F\u11AA\u1100\u116F"+
1063                         "\u11AA\u1100\u116F\u11AA\uD834\uDD57\uD834\uDD65"+
1064                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1065                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1066                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1067                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1068                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1069                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1070                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1071                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1072                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1073                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1074                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1075                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1076                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1077                         "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+
1078                         "\uD834\uDD57\uD834\uDD65aaaaaaaaaaaaaaaaaazzzzzz"+
1079                         "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+
1080                         "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+
1081                         "bbbbbbbbbbbbbbbbbbbbbbbbccccccccccccccccccccccccccccc"+
1082                         "cccccccccccccccccccccccccccccccccccccccccccccccc"+
1083                         "ddddddddddddddddddddddddddddddddddddddddddddddddddddd"+
1084                         "dddddddddddddddddddddddd"+
1085                         "\u1100\u116F\u11AA\u1100\u116F\u11AA\u1100\u116F"+
1086                         "\u11AA\u1100\u116F\u11AA\u0064\u031B\u0323\u0307";
1087             String output = Normalizer.normalize(Utility.unescape(input),
1088                             Normalizer.NFD);
1089             if(!expect.equals(output)){
1090                 errln("FAIL expected: "+hex(expect) + " got: "+hex(output));
1091             }
1092 
1093 
1094 
1095     }
TestDebugIter()1096     public void TestDebugIter(){
1097         String src = Utility.unescape("\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e");
1098         String expected = Utility.unescape("\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e");
1099         Normalizer iter = new Normalizer(new StringCharacterIterator(Utility.unescape(src)),
1100                                                 Normalizer.NONE,0);
1101         int index = 0;
1102         int ch;
1103         UCharacterIterator cIter =  UCharacterIterator.getInstance(expected);
1104 
1105         while ((ch=iter.next())!= Normalizer.DONE){
1106             if (index >= expected.length()) {
1107                 errln("FAIL: " +  "Unexpected character '" + (char)ch
1108                         + "' (" + hex(ch) + ")"
1109                         + " at index " + index);
1110                 break;
1111             }
1112             int want = UTF16.charAt(expected,index);
1113             if (ch != want) {
1114                 errln("FAIL: " +  "got '" + (char)ch
1115                         + "' (" + hex(ch) + ")"
1116                         + " but expected '" + want + "' (" + hex(want)+ ")"
1117                         + " at index " + index);
1118             }
1119             index+=  UTF16.getCharCount(ch);
1120         }
1121         if (index < expected.length()) {
1122             errln("FAIL: " +  "Only got " + index + " chars, expected "
1123             + expected.length());
1124         }
1125 
1126         cIter.setToLimit();
1127         while((ch=iter.previous())!=Normalizer.DONE){
1128             int want = cIter.previousCodePoint();
1129             if (ch != want ) {
1130                 errln("FAIL: " + "got '" + (char)ch
1131                         + "' (" + hex(ch) + ")"
1132                         + " but expected '" + want + "' (" + hex(want) + ")"
1133                         + " at index " + index);
1134             }
1135         }
1136     }
TestDebugIterOld()1137     public void TestDebugIterOld(){
1138         String input = "\\U0001D15E";
1139         String expected = "\uD834\uDD57\uD834\uDD65";
1140         String expectedReverse = "\uD834\uDD65\uD834\uDD57";
1141         int index = 0;
1142         int ch;
1143         Normalizer iter = new Normalizer(new StringCharacterIterator(Utility.unescape(input)),
1144                                                 Normalizer.NFKC,0);
1145         StringBuffer got = new StringBuffer();
1146         for (ch = iter.first();ch!=Normalizer.DONE;ch=iter.next())
1147         {
1148             if (index >= expected.length()) {
1149                 errln("FAIL: " +  "Unexpected character '" + (char)ch +
1150                        "' (" + hex(ch) + ")" + " at index " + index);
1151                 break;
1152             }
1153             got.append(UCharacter.toString(ch));
1154             index++;
1155         }
1156         if (!expected.equals(got.toString())) {
1157                 errln("FAIL: " +  "got '" +got+ "' (" + hex(got) + ")"
1158                         + " but expected '" + expected + "' ("
1159                         + hex(expected) + ")");
1160         }
1161         if (got.length() < expected.length()) {
1162             errln("FAIL: " +  "Only got " + index + " chars, expected "
1163                            + expected.length());
1164         }
1165 
1166         logln("Reverse Iteration\n");
1167         iter.setIndexOnly(iter.endIndex());
1168         got.setLength(0);
1169         for(ch=iter.previous();ch!=Normalizer.DONE;ch=iter.previous()){
1170             if (index >= expected.length()) {
1171                 errln("FAIL: " +  "Unexpected character '" + (char)ch
1172                                + "' (" + hex(ch) + ")" + " at index " + index);
1173                 break;
1174             }
1175             got.append(UCharacter.toString(ch));
1176         }
1177         if (!expectedReverse.equals(got.toString())) {
1178                 errln("FAIL: " +  "got '" +got+ "' (" + hex(got) + ")"
1179                                + " but expected '" + expected
1180                                + "' (" + hex(expected) + ")");
1181         }
1182         if (got.length() < expected.length()) {
1183             errln("FAIL: " +  "Only got " + index + " chars, expected "
1184                       + expected.length());
1185         }
1186 
1187     }
1188     //--------------------------------------------------------------------------
1189     // helper class for TestPreviousNext()
1190     // simple UTF-32 character iterator
1191     class UCharIterator {
1192 
UCharIterator(int[] src, int len, int index)1193        public UCharIterator(int[] src, int len, int index){
1194 
1195             s=src;
1196             length=len;
1197             i=index;
1198        }
1199 
current()1200         public int current() {
1201             if(i<length) {
1202                 return s[i];
1203             } else {
1204                 return -1;
1205             }
1206         }
1207 
next()1208         public int next() {
1209             if(i<length) {
1210                 return s[i++];
1211             } else {
1212                 return -1;
1213             }
1214         }
1215 
previous()1216         public int previous() {
1217             if(i>0) {
1218                 return s[--i];
1219             } else {
1220                 return -1;
1221             }
1222         }
1223 
getIndex()1224         public int getIndex() {
1225             return i;
1226         }
1227 
1228         private int[] s;
1229         private int length, i;
1230     }
TestPreviousNext()1231     public void TestPreviousNext() {
1232         // src and expect strings
1233         char src[]={
1234             UTF16.getLeadSurrogate(0x2f999), UTF16.getTrailSurrogate(0x2f999),
1235             UTF16.getLeadSurrogate(0x1d15f), UTF16.getTrailSurrogate(0x1d15f),
1236             0xc4,
1237             0x1ed0
1238         };
1239         int expect[]={
1240             0x831d,
1241             0x1d158, 0x1d165,
1242             0x41, 0x308,
1243             0x4f, 0x302, 0x301
1244         };
1245 
1246         // expected src indexes corresponding to expect indexes
1247         int expectIndex[]={
1248             0,
1249             2, 2,
1250             4, 4,
1251             5, 5, 5,
1252             6 // behind last character
1253         };
1254 
1255         // initial indexes into the src and expect strings
1256 
1257         final int SRC_MIDDLE=4;
1258         final int EXPECT_MIDDLE=3;
1259 
1260 
1261         // movement vector
1262         // - for previous(), 0 for current(), + for next()
1263         // not const so that we can terminate it below for the error message
1264         String moves="0+0+0--0-0-+++0--+++++++0--------";
1265 
1266         // iterators
1267         Normalizer iter = new Normalizer(new String(src),
1268                                                 Normalizer.NFD,0);
1269         UCharIterator iter32 = new UCharIterator(expect, expect.length,
1270                                                      EXPECT_MIDDLE);
1271 
1272         int c1, c2;
1273         char m;
1274 
1275         // initially set the indexes into the middle of the strings
1276         iter.setIndexOnly(SRC_MIDDLE);
1277 
1278         // move around and compare the iteration code points with
1279         // the expected ones
1280         int movesIndex =0;
1281         while(movesIndex<moves.length()) {
1282             m=moves.charAt(movesIndex++);
1283             if(m=='-') {
1284                 c1=iter.previous();
1285                 c2=iter32.previous();
1286             } else if(m=='0') {
1287                 c1=iter.current();
1288                 c2=iter32.current();
1289             } else /* m=='+' */ {
1290                 c1=iter.next();
1291                 c2=iter32.next();
1292             }
1293 
1294             // compare results
1295             if(c1!=c2) {
1296                 // copy the moves until the current (m) move, and terminate
1297                 String history = moves.substring(0,movesIndex);
1298                 errln("error: mismatch in Normalizer iteration at "+history+": "
1299                       +"got c1= " + hex(c1) +" != expected c2= "+ hex(c2));
1300                 break;
1301             }
1302 
1303             // compare indexes
1304             if(iter.getIndex()!=expectIndex[iter32.getIndex()]) {
1305                 // copy the moves until the current (m) move, and terminate
1306                 String history = moves.substring(0,movesIndex);
1307                 errln("error: index mismatch in Normalizer iteration at "
1308                       +history+ " : "+ "Normalizer index " +iter.getIndex()
1309                       +" expected "+ expectIndex[iter32.getIndex()]);
1310                 break;
1311             }
1312         }
1313     }
1314     // Only in ICU4j
TestPreviousNextJCI()1315     public void TestPreviousNextJCI() {
1316         // src and expect strings
1317         char src[]={
1318             UTF16.getLeadSurrogate(0x2f999), UTF16.getTrailSurrogate(0x2f999),
1319             UTF16.getLeadSurrogate(0x1d15f), UTF16.getTrailSurrogate(0x1d15f),
1320             0xc4,
1321             0x1ed0
1322         };
1323         int expect[]={
1324             0x831d,
1325             0x1d158, 0x1d165,
1326             0x41, 0x308,
1327             0x4f, 0x302, 0x301
1328         };
1329 
1330         // expected src indexes corresponding to expect indexes
1331         int expectIndex[]={
1332             0,
1333             2, 2,
1334             4, 4,
1335             5, 5, 5,
1336             6 // behind last character
1337         };
1338 
1339         // initial indexes into the src and expect strings
1340 
1341         final int SRC_MIDDLE=4;
1342         final int EXPECT_MIDDLE=3;
1343 
1344 
1345         // movement vector
1346         // - for previous(), 0 for current(), + for next()
1347         // not const so that we can terminate it below for the error message
1348         String moves="0+0+0--0-0-+++0--+++++++0--------";
1349 
1350         // iterators
1351         StringCharacterIterator text = new StringCharacterIterator(new String(src));
1352         Normalizer iter = new Normalizer(text,Normalizer.NFD,0);
1353         UCharIterator iter32 = new UCharIterator(expect, expect.length,
1354                                                      EXPECT_MIDDLE);
1355 
1356         int c1, c2;
1357         char m;
1358 
1359         // initially set the indexes into the middle of the strings
1360         iter.setIndexOnly(SRC_MIDDLE);
1361 
1362         // move around and compare the iteration code points with
1363         // the expected ones
1364         int movesIndex =0;
1365         while(movesIndex<moves.length()) {
1366             m=moves.charAt(movesIndex++);
1367             if(m=='-') {
1368                 c1=iter.previous();
1369                 c2=iter32.previous();
1370             } else if(m=='0') {
1371                 c1=iter.current();
1372                 c2=iter32.current();
1373             } else /* m=='+' */ {
1374                 c1=iter.next();
1375                 c2=iter32.next();
1376             }
1377 
1378             // compare results
1379             if(c1!=c2) {
1380                 // copy the moves until the current (m) move, and terminate
1381                 String history = moves.substring(0,movesIndex);
1382                 errln("error: mismatch in Normalizer iteration at "+history+": "
1383                       +"got c1= " + hex(c1) +" != expected c2= "+ hex(c2));
1384                 break;
1385             }
1386 
1387             // compare indexes
1388             if(iter.getIndex()!=expectIndex[iter32.getIndex()]) {
1389                 // copy the moves until the current (m) move, and terminate
1390                 String history = moves.substring(0,movesIndex);
1391                 errln("error: index mismatch in Normalizer iteration at "
1392                       +history+ " : "+ "Normalizer index " +iter.getIndex()
1393                       +" expected "+ expectIndex[iter32.getIndex()]);
1394                 break;
1395             }
1396         }
1397     }
1398 
1399     // test APIs that are not otherwise used - improve test coverage
TestNormalizerAPI()1400     public void TestNormalizerAPI() throws Exception {
1401         try{
1402             // instantiate a Normalizer from a CharacterIterator
1403             String s=Utility.unescape("a\u0308\uac00\\U0002f800");
1404             // make s a bit longer and more interesting
1405             UCharacterIterator iter = UCharacterIterator.getInstance(s+s);
1406             Normalizer norm = new Normalizer(iter, Normalizer.NFC,0);
1407             if(norm.next()!=0xe4) {
1408                 errln("error in Normalizer(CharacterIterator).next()");
1409             }
1410 
1411             // test clone(), ==, and hashCode()
1412             Normalizer clone=(Normalizer)norm.clone();
1413             if(clone.equals(norm)) {
1414                 errln("error in Normalizer(Normalizer(CharacterIterator)).clone()!=norm");
1415             }
1416 
1417 
1418             if(clone.getLength()!= norm.getLength()){
1419                errln("error in Normalizer.getBeginIndex()");
1420             }
1421             // clone must have the same hashCode()
1422             //if(clone.hashCode()!=norm.hashCode()) {
1423             //    errln("error in Normalizer(Normalizer(CharacterIterator)).clone().hashCode()!=copy.hashCode()");
1424             //}
1425             if(clone.next()!=0xac00) {
1426                 errln("error in Normalizer(Normalizer(CharacterIterator)).next()");
1427             }
1428             int ch = clone.next();
1429             if(ch!=0x4e3d) {
1430                 errln("error in Normalizer(Normalizer(CharacterIterator)).clone().next()");
1431             }
1432             // position changed, must change hashCode()
1433             if(clone.hashCode()==norm.hashCode()) {
1434                 errln("error in Normalizer(Normalizer(CharacterIterator)).clone().next().hashCode()==copy.hashCode()");
1435             }
1436 
1437             // test compose() and decompose()
1438             StringBuffer tel;
1439             String nfkc, nfkd;
1440             tel=new StringBuffer("\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121");
1441             tel.insert(1,(char)0x0301);
1442 
1443             nfkc=Normalizer.compose(tel.toString(), true);
1444             nfkd=Normalizer.decompose(tel.toString(), true);
1445             if(
1446                 !nfkc.equals(Utility.unescape("TE\u0139TELTELTELTELTELTELTELTELTEL"))||
1447                 !nfkd.equals(Utility.unescape("TEL\u0301TELTELTELTELTELTELTELTELTEL"))
1448             ) {
1449                 errln("error in Normalizer::(de)compose(): wrong result(s)");
1450             }
1451 
1452             // test setIndex()
1453 //            ch=norm.setIndex(3);
1454 //            if(ch!=0x4e3d) {
1455 //                errln("error in Normalizer(CharacterIterator).setIndex(3)");
1456 //            }
1457 
1458             // test setText(CharacterIterator) and getText()
1459             String out, out2;
1460             clone.setText(iter);
1461 
1462             out = clone.getText();
1463             out2 = iter.getText();
1464             if( !out.equals(out2) ||
1465                 clone.startIndex()!=0||
1466                 clone.endIndex()!=iter.getLength()
1467             ) {
1468                 errln("error in Normalizer::setText() or Normalizer::getText()");
1469             }
1470 
1471             char[] fillIn1 = new char[clone.getLength()];
1472             char[] fillIn2 = new char[iter.getLength()];
1473             int len = clone.getText(fillIn1);
1474             iter.getText(fillIn2,0);
1475             if(!Utility.arrayRegionMatches(fillIn1,0,fillIn2,0,len)){
1476                 errln("error in Normalizer.getText(). Normalizer: "+
1477                                 Utility.hex(new String(fillIn1))+
1478                                 " Iter: " + Utility.hex(new String(fillIn2)));
1479             }
1480 
1481             clone.setText(fillIn1);
1482             len = clone.getText(fillIn2);
1483             if(!Utility.arrayRegionMatches(fillIn1,0,fillIn2,0,len)){
1484                 errln("error in Normalizer.setText() or Normalizer.getText()"+
1485                                 Utility.hex(new String(fillIn1))+
1486                                 " Iter: " + Utility.hex(new String(fillIn2)));
1487             }
1488 
1489             // test setText(UChar *), getUMode() and setMode()
1490             clone.setText(s);
1491             clone.setIndexOnly(1);
1492             clone.setMode(Normalizer.NFD);
1493             if(clone.getMode()!=Normalizer.NFD) {
1494                 errln("error in Normalizer::setMode() or Normalizer::getMode()");
1495             }
1496             if(clone.next()!=0x308 || clone.next()!=0x1100) {
1497                 errln("error in Normalizer::setText() or Normalizer::setMode()");
1498             }
1499 
1500             // test last()/previous() with an internal buffer overflow
1501             StringBuffer buf = new StringBuffer("aaaaaaaaaa");
1502             buf.setCharAt(10-1,'\u0308');
1503             clone.setText(buf);
1504             if(clone.last()!=0x308) {
1505                 errln("error in Normalizer(10*U+0308).last()");
1506             }
1507 
1508             // test UNORM_NONE
1509             norm.setMode(Normalizer.NONE);
1510             if(norm.first()!=0x61 || norm.next()!=0x308 || norm.last()!=0x2f800) {
1511                 errln("error in Normalizer(UNORM_NONE).first()/next()/last()");
1512             }
1513             out=Normalizer.normalize(s, Normalizer.NONE);
1514             if(!out.equals(s)) {
1515                 errln("error in Normalizer::normalize(UNORM_NONE)");
1516             }
1517             ch = 0x1D15E;
1518             String exp = "\\U0001D157\\U0001D165";
1519             String ns = Normalizer.normalize(ch,Normalizer.NFC);
1520             if(!ns.equals(Utility.unescape(exp))){
1521                 errln("error in Normalizer.normalize(int,Mode)");
1522             }
1523             ns = Normalizer.normalize(ch,Normalizer.NFC,0);
1524             if(!ns.equals(Utility.unescape(exp))){
1525                 errln("error in Normalizer.normalize(int,Mode,int)");
1526             }
1527 
1528 
1529         }catch(Exception e){
1530             throw e;
1531         }
1532     }
1533 
TestConcatenate()1534     public void TestConcatenate() {
1535 
1536         Object[][]cases=new Object[][]{
1537             /* mode, left, right, result */
1538             {
1539                 Normalizer.NFC,
1540                 "re",
1541                 "\u0301sum\u00e9",
1542                 "r\u00e9sum\u00e9"
1543             },
1544             {
1545                 Normalizer.NFC,
1546                 "a\u1100",
1547                 "\u1161bcdefghijk",
1548                 "a\uac00bcdefghijk"
1549             },
1550             /* ### TODO: add more interesting cases */
1551             {
1552                 Normalizer.NFD,
1553                 "\u03B1\u0345",
1554                 "\u0C4D\uD804\uDCBA\uD834\uDD69",  // 0C4D 110BA 1D169
1555                 "\u03B1\uD834\uDD69\uD804\uDCBA\u0C4D\u0345"  // 03B1 1D169 110BA 0C4D 0345
1556             }
1557         };
1558 
1559         String left, right, expect, result;
1560         Normalizer.Mode mode;
1561         int i;
1562 
1563         /* test concatenation */
1564         for(i=0; i<cases.length; ++i) {
1565             mode = (Normalizer.Mode)cases[i][0];
1566 
1567             left=(String)cases[i][1];
1568             right=(String)cases[i][2];
1569             expect=(String)cases[i][3];
1570             {
1571                 result=Normalizer.concatenate(left, right, mode,0);
1572                 if(!result.equals(expect)) {
1573                     errln("error in Normalizer.concatenate(), cases[] failed"
1574                           +", result==expect: expected: "
1575                           + hex(expect)+" =========> got: " + hex(result));
1576                 }
1577             }
1578             {
1579                 result=Normalizer.concatenate(left.toCharArray(), right.toCharArray(), mode,0);
1580                 if(!result.equals(expect)) {
1581                     errln("error in Normalizer.concatenate(), cases[] failed"
1582                           +", result==expect: expected: "
1583                           + hex(expect)+" =========> got: " + hex(result));
1584                 }
1585             }
1586         }
1587     }
1588     private final int RAND_MAX = 0x7fff;
1589 
TestCheckFCD()1590     public void TestCheckFCD()
1591     {
1592       char[] FAST = {0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
1593                      0x0008, 0x0009, 0x000A};
1594 
1595       char[] FALSE = {0x0001, 0x0002, 0x02EA, 0x03EB, 0x0300, 0x0301,
1596                       0x02B9, 0x0314, 0x0315, 0x0316};
1597 
1598       char[] TRUE = {0x0030, 0x0040, 0x0440, 0x056D, 0x064F, 0x06E7,
1599                      0x0050, 0x0730, 0x09EE, 0x1E10};
1600 
1601       char[][] datastr= { {0x0061, 0x030A, 0x1E05, 0x0302, 0},
1602                           {0x0061, 0x030A, 0x00E2, 0x0323, 0},
1603                           {0x0061, 0x0323, 0x00E2, 0x0323, 0},
1604                           {0x0061, 0x0323, 0x1E05, 0x0302, 0}
1605                         };
1606       Normalizer.QuickCheckResult result[] = {Normalizer.YES, Normalizer.NO, Normalizer.NO, Normalizer.YES};
1607 
1608       char[] datachar= {        0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
1609                                 0x6a,
1610                                 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
1611                                 0xea,
1612                                 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306,
1613                                 0x0307, 0x0308, 0x0309, 0x030a,
1614                                 0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326,
1615                                 0x0327, 0x0328, 0x0329, 0x032a,
1616                                 0x1e00, 0x1e01, 0x1e02, 0x1e03, 0x1e04, 0x1e05, 0x1e06,
1617                                 0x1e07, 0x1e08, 0x1e09, 0x1e0a
1618                        };
1619 
1620       int count = 0;
1621 
1622       if (Normalizer.quickCheck(FAST,0,FAST.length, Normalizer.FCD,0) != Normalizer.YES)
1623         errln("Normalizer.quickCheck(FCD) failed: expected value for fast Normalizer.quickCheck is Normalizer.YES\n");
1624       if (Normalizer.quickCheck(FALSE,0, FALSE.length,Normalizer.FCD,0) != Normalizer.NO)
1625         errln("Normalizer.quickCheck(FCD) failed: expected value for error Normalizer.quickCheck is Normalizer.NO\n");
1626       if (Normalizer.quickCheck(TRUE,0,TRUE.length,Normalizer.FCD,0) != Normalizer.YES)
1627         errln("Normalizer.quickCheck(FCD) failed: expected value for correct Normalizer.quickCheck is Normalizer.YES\n");
1628 
1629 
1630       while (count < 4)
1631       {
1632         Normalizer.QuickCheckResult fcdresult = Normalizer.quickCheck(datastr[count],0,datastr[count].length, Normalizer.FCD,0);
1633         if (result[count] != fcdresult) {
1634             errln("Normalizer.quickCheck(FCD) failed: Data set "+ count
1635                     + " expected value "+ result[count]);
1636         }
1637         count ++;
1638       }
1639 
1640       /* random checks of long strings */
1641       //srand((unsigned)time( NULL ));
1642       Random rand = createRandom(); // use test framework's random
1643 
1644       for (count = 0; count < 50; count ++)
1645       {
1646         int size = 0;
1647         Normalizer.QuickCheckResult testresult = Normalizer.YES;
1648         char[] data= new char[20];
1649         char[] norm= new char[100];
1650         char[] nfd = new char[100];
1651         int normStart = 0;
1652         int nfdsize = 0;
1653         while (size != 19) {
1654           data[size] = datachar[rand.nextInt(RAND_MAX)*50/RAND_MAX];
1655           logln("0x"+data[size]);
1656           normStart += Normalizer.normalize(data,size,size+1,
1657                                               norm,normStart,100,
1658                                               Normalizer.NFD,0);
1659           size ++;
1660         }
1661         logln("\n");
1662 
1663         nfdsize = Normalizer.normalize(data,0,size, nfd,0,nfd.length,Normalizer.NFD,0);
1664         //    nfdsize = unorm_normalize(data, size, UNORM_NFD, UCOL_IGNORE_HANGUL,
1665         //                      nfd, 100, &status);
1666         if (nfdsize != normStart || Utility.arrayRegionMatches(nfd,0, norm,0,nfdsize) ==false) {
1667           testresult = Normalizer.NO;
1668         }
1669         if (testresult == Normalizer.YES) {
1670           logln("result Normalizer.YES\n");
1671         }
1672         else {
1673           logln("result Normalizer.NO\n");
1674         }
1675 
1676         if (Normalizer.quickCheck(data,0,data.length, Normalizer.FCD,0) != testresult) {
1677           errln("Normalizer.quickCheck(FCD) failed: expected "+ testresult +" for random data: "+hex(new String(data)) );
1678         }
1679       }
1680     }
1681 
1682 
1683     // reference implementation of Normalizer::compare
ref_norm_compare(String s1, String s2, int options)1684     private int ref_norm_compare(String s1, String s2, int options) {
1685         String t1, t2,r1,r2;
1686 
1687         int normOptions=(int)(options>>Normalizer.COMPARE_NORM_OPTIONS_SHIFT);
1688 
1689         if((options&Normalizer.COMPARE_IGNORE_CASE)!=0) {
1690             // NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y)))
1691             r1 = Normalizer.decompose(s1,false,normOptions);
1692             r2 = Normalizer.decompose(s2,false,normOptions);
1693             r1 = UCharacter.foldCase(r1,options);
1694             r2 = UCharacter.foldCase(r2,options);
1695         }else{
1696             r1 = s1;
1697             r2 = s2;
1698         }
1699 
1700         t1 = Normalizer.decompose(r1, false, normOptions);
1701         t2 = Normalizer.decompose(r2, false, normOptions);
1702 
1703         if((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0) {
1704             UTF16.StringComparator comp
1705                     = new UTF16.StringComparator(true, false,
1706                                      UTF16.StringComparator.FOLD_CASE_DEFAULT);
1707             return comp.compare(t1,t2);
1708         } else {
1709             return t1.compareTo(t2);
1710         }
1711 
1712     }
1713 
1714     // test wrapper for Normalizer::compare, sets UNORM_INPUT_IS_FCD appropriately
norm_compare(String s1, String s2, int options)1715     private int norm_compare(String s1, String s2, int options) {
1716         int normOptions=(int)(options>>Normalizer.COMPARE_NORM_OPTIONS_SHIFT);
1717 
1718         if( Normalizer.YES==Normalizer.quickCheck(s1,Normalizer.FCD,normOptions) &&
1719             Normalizer.YES==Normalizer.quickCheck(s2,Normalizer.FCD,normOptions)) {
1720             options|=Normalizer.INPUT_IS_FCD;
1721         }
1722 
1723         return Normalizer.compare(s1, s2, options);
1724     }
1725 
1726     // reference implementation of UnicodeString::caseCompare
ref_case_compare(String s1, String s2, int options)1727     private int ref_case_compare(String s1, String s2, int options) {
1728         String t1, t2;
1729 
1730         t1=s1;
1731         t2=s2;
1732 
1733         t1 = UCharacter.foldCase(t1,((options&Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I)==0));
1734         t2 = UCharacter.foldCase(t2,((options&Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I)==0));
1735 
1736         if((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0) {
1737             UTF16.StringComparator comp
1738                     = new UTF16.StringComparator(true, false,
1739                                     UTF16.StringComparator.FOLD_CASE_DEFAULT);
1740             return comp.compare(t1,t2);
1741         } else {
1742             return t1.compareTo(t2);
1743         }
1744 
1745     }
1746 
1747     // reduce an integer to -1/0/1
sign(int value)1748     private static int sign(int value) {
1749         if(value==0) {
1750             return 0;
1751         } else {
1752             return (value>>31)|1;
1753         }
1754     }
signString(int value)1755     private static String signString(int value) {
1756         if(value<0) {
1757             return "<0";
1758         } else if(value==0) {
1759             return "=0";
1760         } else /* value>0 */ {
1761             return ">0";
1762         }
1763     }
1764     // test Normalizer::compare and unorm_compare (thinly wrapped by the former)
1765     // by comparing it with its semantic equivalent
1766     // since we trust the pieces, this is sufficient
1767 
1768     // test each string with itself and each other
1769     // each time with all options
1770     private  String strings[]=new String[]{
1771                 // some cases from NormalizationTest.txt
1772                 // 0..3
1773                 "D\u031B\u0307\u0323",
1774                 "\u1E0C\u031B\u0307",
1775                 "D\u031B\u0323\u0307",
1776                 "d\u031B\u0323\u0307",
1777 
1778                 // 4..6
1779                 "\u00E4",
1780                 "a\u0308",
1781                 "A\u0308",
1782 
1783                 // Angstrom sign = A ring
1784                 // 7..10
1785                 "\u212B",
1786                 "\u00C5",
1787                 "A\u030A",
1788                 "a\u030A",
1789 
1790                 // 11.14
1791                 "a\u059A\u0316\u302A\u032Fb",
1792                 "a\u302A\u0316\u032F\u059Ab",
1793                 "a\u302A\u0316\u032F\u059Ab",
1794                 "A\u059A\u0316\u302A\u032Fb",
1795 
1796                 // from ICU case folding tests
1797                 // 15..20
1798                 "A\u00df\u00b5\ufb03\\U0001040c\u0131",
1799                 "ass\u03bcffi\\U00010434i",
1800                 "\u0061\u0042\u0131\u03a3\u00df\ufb03\ud93f\udfff",
1801                 "\u0041\u0062\u0069\u03c3\u0073\u0053\u0046\u0066\u0049\ud93f\udfff",
1802                 "\u0041\u0062\u0131\u03c3\u0053\u0073\u0066\u0046\u0069\ud93f\udfff",
1803                 "\u0041\u0062\u0069\u03c3\u0073\u0053\u0046\u0066\u0049\ud93f\udffd",
1804 
1805                 //     U+d800 U+10001   see implementation comment in unorm_cmpEquivFold
1806                 // vs. U+10000          at bottom - code point order
1807                 // 21..22
1808                 "\ud800\ud800\udc01",
1809                 "\ud800\udc00",
1810 
1811                 // other code point order tests from ustrtest.cpp
1812                 // 23..31
1813                 "\u20ac\ud801",
1814                 "\u20ac\ud800\udc00",
1815                 "\ud800",
1816                 "\ud800\uff61",
1817                 "\udfff",
1818                 "\uff61\udfff",
1819                 "\uff61\ud800\udc02",
1820                 "\ud800\udc02",
1821                 "\ud84d\udc56",
1822 
1823                 // long strings, see cnormtst.c/TestNormCoverage()
1824                 // equivalent if case-insensitive
1825                 // 32..33
1826                 "\uAD8B\uAD8B\uAD8B\uAD8B"+
1827                 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1828                 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1829                 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1830                 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1831                 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1832                 "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+
1833                 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+
1834                 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+
1835                 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+
1836                 "\uAD8B\uAD8B\uAD8B\uAD8B"+
1837                 "d\u031B\u0307\u0323",
1838 
1839                 "\u1100\u116f\u11aa\uAD8B\uAD8B\u1100\u116f\u11aa"+
1840                 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1841                 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1842                 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1843                 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1844                 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+
1845                 "aaaaaaaaaaAAAAAAAAZZZZZZZZZZZZZZZZzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+
1846                 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+
1847                 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+
1848                 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+
1849                 "\u1100\u116f\u11aa\uAD8B\uAD8B\u1100\u116f\u11aa"+
1850                 "\u1E0C\u031B\u0307",
1851 
1852                 // some strings that may make a difference whether the compare function
1853                 // case-folds or decomposes first
1854                 // 34..41
1855                 "\u0360\u0345\u0334",
1856                 "\u0360\u03b9\u0334",
1857 
1858                 "\u0360\u1f80\u0334",
1859                 "\u0360\u03b1\u0313\u03b9\u0334",
1860 
1861                 "\u0360\u1ffc\u0334",
1862                 "\u0360\u03c9\u03b9\u0334",
1863 
1864                 "a\u0360\u0345\u0360\u0345b",
1865                 "a\u0345\u0360\u0345\u0360b",
1866 
1867                 // interesting cases for canonical caseless match with turkic i handling
1868                 // 42..43
1869                 "\u00cc",
1870                 "\u0069\u0300",
1871 
1872                 // strings with post-Unicode 3.2 normalization or normalization corrections
1873                 // 44..45
1874                 "\u00e4\u193b\\U0002f868",
1875                 "\u0061\u193b\u0308\u36fc",
1876 
1877 
1878     };
1879 
1880     // all combinations of options
1881     // UNORM_INPUT_IS_FCD is set automatically if both input strings fulfill FCD conditions
1882     final class Temp {
1883         int options;
1884         String name;
Temp(int opt,String str)1885         public Temp(int opt,String str){
1886             options =opt;
1887             name = str;
1888         }
1889 
1890     }
1891     // set UNORM_UNICODE_3_2 in one additional combination
1892 
1893     private Temp[] opt = new Temp[]{
1894                     new Temp(0,"default"),
1895                     new Temp(Normalizer.COMPARE_CODE_POINT_ORDER, "code point order" ),
1896                     new Temp(Normalizer.COMPARE_IGNORE_CASE, "ignore case" ),
1897                     new Temp(Normalizer.COMPARE_CODE_POINT_ORDER|Normalizer.COMPARE_IGNORE_CASE, "code point order & ignore case" ),
1898                     new Temp(Normalizer.COMPARE_IGNORE_CASE|Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I, "ignore case & special i"),
1899                     new Temp(Normalizer.COMPARE_CODE_POINT_ORDER|Normalizer.COMPARE_IGNORE_CASE|Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I, "code point order & ignore case & special i"),
1900                     new Temp(Normalizer.UNICODE_3_2 << Normalizer.COMPARE_NORM_OPTIONS_SHIFT, "Unicode 3.2")
1901             };
1902 
1903 
TestCompareDebug()1904     public void TestCompareDebug(){
1905 
1906         String[] s = new String[100]; // at least as many items as in strings[] !
1907 
1908 
1909         int i, j, k, count=strings.length;
1910         int result, refResult;
1911 
1912         // create the UnicodeStrings
1913         for(i=0; i<count; ++i) {
1914             s[i]=Utility.unescape(strings[i]);
1915         }
1916         UTF16.StringComparator comp = new UTF16.StringComparator(true, false,
1917                                      UTF16.StringComparator.FOLD_CASE_DEFAULT);
1918         // test them each with each other
1919 
1920         i = 42;
1921         j = 43;
1922         k = 2;
1923         // test Normalizer::compare
1924         result=norm_compare(s[i], s[j], opt[k].options);
1925         refResult=ref_norm_compare(s[i], s[j], opt[k].options);
1926         if(sign(result)!=sign(refResult)) {
1927             errln("Normalizer::compare( " + i +", "+j + ", " +k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);
1928         }
1929 
1930         // test UnicodeString::caseCompare - same internal implementation function
1931          if(0!=(opt[k].options&Normalizer.COMPARE_IGNORE_CASE)) {
1932         //    result=s[i]. (s[j], opt[k].options);
1933             if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0)
1934             {
1935                 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT);
1936             }
1937             else {
1938                 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I);
1939             }
1940 
1941             result=comp.compare(s[i],s[j]);
1942             refResult=ref_case_compare(s[i], s[j], opt[k].options);
1943             if(sign(result)!=sign(refResult)) {
1944                       errln("Normalizer::compare( " + i +", "+j + ", "+k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);
1945                             }
1946         }
1947         String value1 = "\u00dater\u00fd";
1948         String value2 = "\u00fater\u00fd";
1949         if(Normalizer.compare(value1,value2,0)!=0){
1950             if(Normalizer.compare(value1,value2,Normalizer.COMPARE_IGNORE_CASE)==0){
1951 
1952             }
1953         }
1954     }
1955 
TestCompare()1956     public void TestCompare() {
1957 
1958         String[] s = new String[100]; // at least as many items as in strings[] !
1959 
1960         int i, j, k, count=strings.length;
1961         int result, refResult;
1962 
1963         // create the UnicodeStrings
1964         for(i=0; i<count; ++i) {
1965             s[i]=Utility.unescape(strings[i]);
1966         }
1967         UTF16.StringComparator comp = new UTF16.StringComparator();
1968         // test them each with each other
1969         for(i=0; i<count; ++i) {
1970             for(j=i; j<count; ++j) {
1971                 for(k=0; k<opt.length; ++k) {
1972                     // test Normalizer::compare
1973                     result=norm_compare(s[i], s[j], opt[k].options);
1974                     refResult=ref_norm_compare(s[i], s[j], opt[k].options);
1975                     if(sign(result)!=sign(refResult)) {
1976                         errln("Normalizer::compare( " + i +", "+j + ", " +k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);
1977                     }
1978 
1979                     // test UnicodeString::caseCompare - same internal implementation function
1980                      if(0!=(opt[k].options&Normalizer.COMPARE_IGNORE_CASE)) {
1981                         //    result=s[i]. (s[j], opt[k].options);
1982                         if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0)
1983                         {
1984                             comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT);
1985                         }
1986                         else {
1987                             comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I);
1988                         }
1989 
1990                         comp.setCodePointCompare((opt[k].options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0);
1991                         // result=comp.caseCompare(s[i],s[j], opt[k].options);
1992                         result=comp.compare(s[i],s[j]);
1993                         refResult=ref_case_compare(s[i], s[j], opt[k].options);
1994                         if(sign(result)!=sign(refResult)) {
1995                                   errln("Normalizer::compare( " + i +", "+j + ", "+k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult);
1996                                          }
1997                     }
1998                 }
1999             }
2000         }
2001 
2002         // test cases with i and I to make sure Turkic works
2003         char[] iI= new char[]{ 0x49, 0x69, 0x130, 0x131 };
2004         UnicodeSet set = new UnicodeSet(), iSet = new UnicodeSet();
2005         Normalizer2Impl nfcImpl = Norm2AllModes.getNFCInstance().impl;
2006         nfcImpl.ensureCanonIterData();
2007 
2008         String s1, s2;
2009 
2010         // collect all sets into one for contiguous output
2011         for(i=0; i<iI.length; ++i) {
2012             if(nfcImpl.getCanonStartSet(iI[i], iSet)) {
2013                 set.addAll(iSet);
2014             }
2015         }
2016 
2017         // test all of these precomposed characters
2018         Normalizer2 nfcNorm2 = Normalizer2.getNFCInstance();
2019         UnicodeSetIterator it = new UnicodeSetIterator(set);
2020         int c;
2021         while(it.next() && (c=it.codepoint)!=UnicodeSetIterator.IS_STRING) {
2022             s1 = UTF16.valueOf(c);
2023             s2 = nfcNorm2.getDecomposition(c);
2024             for(k=0; k<opt.length; ++k) {
2025                 // test Normalizer::compare
2026 
2027                 result= norm_compare(s1, s2, opt[k].options);
2028                 refResult=ref_norm_compare(s1, s2, opt[k].options);
2029                 if(sign(result)!=sign(refResult)) {
2030                     errln("Normalizer.compare(U+"+hex(c)+" with its NFD, "+opt[k].name+")"
2031                           + signString(result)+" should be "+signString(refResult));
2032                 }
2033 
2034                 // test UnicodeString::caseCompare - same internal implementation function
2035                 if((opt[k].options & Normalizer.COMPARE_IGNORE_CASE)>0) {
2036                      if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0)
2037                     {
2038                         comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT);
2039                     }
2040                     else {
2041                         comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I);
2042                     }
2043 
2044                     comp.setCodePointCompare((opt[k].options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0);
2045 
2046                     result=comp.compare(s1,s2);
2047                     refResult=ref_case_compare(s1, s2, opt[k].options);
2048                     if(sign(result)!=sign(refResult)) {
2049                         errln("UTF16.compare(U+"+hex(c)+" with its NFD, "
2050                               +opt[k].name+")"+signString(result) +" should be "+signString(refResult));
2051                     }
2052                 }
2053             }
2054         }
2055 
2056         // test getDecomposition() for some characters that do not decompose
2057         if( nfcNorm2.getDecomposition(0x20)!=null ||
2058             nfcNorm2.getDecomposition(0x4e00)!=null ||
2059             nfcNorm2.getDecomposition(0x20002)!=null
2060         ) {
2061             errln("NFC.getDecomposition() returns TRUE for characters which do not have decompositions");
2062         }
2063 
2064         // test getRawDecomposition() for some characters that do not decompose
2065         if( nfcNorm2.getRawDecomposition(0x20)!=null ||
2066             nfcNorm2.getRawDecomposition(0x4e00)!=null ||
2067             nfcNorm2.getRawDecomposition(0x20002)!=null
2068         ) {
2069             errln("getRawDecomposition() returns TRUE for characters which do not have decompositions");
2070         }
2071 
2072         // test composePair() for some pairs of characters that do not compose
2073         if( nfcNorm2.composePair(0x20, 0x301)>=0 ||
2074             nfcNorm2.composePair(0x61, 0x305)>=0 ||
2075             nfcNorm2.composePair(0x1100, 0x1160)>=0 ||
2076             nfcNorm2.composePair(0xac00, 0x11a7)>=0
2077         ) {
2078             errln("NFC.composePair() incorrectly composes some pairs of characters");
2079         }
2080 
2081         // test FilteredNormalizer2.getDecomposition()
2082         UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff]");
2083         FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter);
2084         if(fn2.getDecomposition(0xe4)!=null || !"A\u0304".equals(fn2.getDecomposition(0x100))) {
2085             errln("FilteredNormalizer2(NFC, ^A0-FF).getDecomposition() failed");
2086         }
2087 
2088         // test FilteredNormalizer2.getRawDecomposition()
2089         if(fn2.getRawDecomposition(0xe4)!=null || !"A\u0304".equals(fn2.getRawDecomposition(0x100))) {
2090             errln("FilteredNormalizer2(NFC, ^A0-FF).getRawDecomposition() failed");
2091         }
2092 
2093         // test FilteredNormalizer2::composePair()
2094         if( 0x100!=fn2.composePair(0x41, 0x304) ||
2095             fn2.composePair(0xc7, 0x301)>=0 // unfiltered result: U+1E08
2096         ) {
2097             errln("FilteredNormalizer2(NFC, ^A0-FF).composePair() failed");
2098         }
2099     }
2100 
2101     // verify that case-folding does not un-FCD strings
countFoldFCDExceptions(int foldingOptions)2102     int countFoldFCDExceptions(int foldingOptions) {
2103         String s, d;
2104         int c;
2105         int count;
2106         int/*unsigned*/ cc, trailCC, foldCC, foldTrailCC;
2107         Normalizer.QuickCheckResult qcResult;
2108         int category;
2109         boolean isNFD;
2110 
2111 
2112         logln("Test if case folding may un-FCD a string (folding options 0x)"+hex(foldingOptions));
2113 
2114         count=0;
2115         for(c=0; c<=0x10ffff; ++c) {
2116             category=UCharacter.getType(c);
2117             if(category==UCharacterCategory.UNASSIGNED) {
2118                 continue; // skip unassigned code points
2119             }
2120             if(c==0xac00) {
2121                 c=0xd7a3; // skip Hangul - no case folding there
2122                 continue;
2123             }
2124             // skip Han blocks - no case folding there either
2125             if(c==0x3400) {
2126                 c=0x4db5;
2127                 continue;
2128             }
2129             if(c==0x4e00) {
2130                 c=0x9fa5;
2131                 continue;
2132             }
2133             if(c==0x20000) {
2134                 c=0x2a6d6;
2135                 continue;
2136             }
2137 
2138             s= UTF16.valueOf(c);
2139 
2140             // get leading and trailing cc for c
2141             d= Normalizer.decompose(s,false);
2142             isNFD= s==d;
2143             cc=UCharacter.getCombiningClass(UTF16.charAt(d,0));
2144             trailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1));
2145 
2146             // get leading and trailing cc for the case-folding of c
2147             UCharacter.foldCase(s,(foldingOptions==0));
2148             d = Normalizer.decompose(s, false);
2149             foldCC=UCharacter.getCombiningClass(UTF16.charAt(d,0));
2150             foldTrailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1));
2151 
2152             qcResult=Normalizer.quickCheck(s, Normalizer.FCD,0);
2153 
2154 
2155             // bad:
2156             // - character maps to empty string: adjacent characters may then need reordering
2157             // - folding has different leading/trailing cc's, and they don't become just 0
2158             // - folding itself is not FCD
2159             if( qcResult!=Normalizer.YES ||
2160                 s.length()==0 ||
2161                 (cc!=foldCC && foldCC!=0) || (trailCC!=foldTrailCC && foldTrailCC!=0)
2162             ) {
2163                 ++count;
2164                 errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")");
2165                 //errln("  cc %02x trailCC %02x    foldCC(U+%04lx) %02x foldTrailCC(U+%04lx) %02x   quickCheck(folded)=%d", cc, trailCC, UTF16.charAt(d,0), foldCC, UTF16.charAt(d,d.length()-1), foldTrailCC, qcResult);
2166                 continue;
2167             }
2168 
2169             // also bad:
2170             // if a code point is in NFD but its case folding is not, then
2171             // unorm_compare will also fail
2172             if(isNFD && Normalizer.YES!=Normalizer.quickCheck(s, Normalizer.NFD,0)) {
2173                 ++count;
2174                 errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")");
2175             }
2176         }
2177 
2178         logln("There are "+hex(count)+" code points for which case-folding may un-FCD a string (folding options"+foldingOptions+"x)" );
2179         return count;
2180     }
2181 
TestFindFoldFCDExceptions()2182     public void TestFindFoldFCDExceptions() {
2183         int count;
2184 
2185         count=countFoldFCDExceptions(0);
2186         count+=countFoldFCDExceptions(Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I);
2187         if(count>0) {
2188             //*
2189             //* If case-folding un-FCDs any strings, then unorm_compare() must be
2190             //* re-implemented.
2191             //* It currently assumes that one can check for FCD then case-fold
2192             //* and then still have FCD strings for raw decomposition without reordering.
2193             //*
2194             errln("error: There are "+count+" code points for which case-folding"+
2195                   " may un-FCD a string for all folding options.\n See comment"+
2196                   " in BasicNormalizerTest::FindFoldFCDExceptions()!");
2197         }
2198     }
2199 
TestCombiningMarks()2200     public void TestCombiningMarks(){
2201         String src = "\u0f71\u0f72\u0f73\u0f74\u0f75";
2202         String expected = "\u0F71\u0F71\u0F71\u0F72\u0F72\u0F74\u0F74";
2203         String result = Normalizer.decompose(src,false);
2204         if(!expected.equals(result)){
2205             errln("Reordering of combining marks failed. Expected: "+Utility.hex(expected)+" Got: "+ Utility.hex(result));
2206         }
2207     }
2208 
2209     /*
2210      * Re-enable this test when UTC fixes UAX 21
2211     public void TestUAX21Failure(){
2212         final String[][] cases = new String[][]{
2213                 {"\u0061\u0345\u0360\u0345\u0062", "\u0061\u0360\u0345\u0345\u0062"},
2214                 {"\u0061\u0345\u0345\u0360\u0062", "\u0061\u0360\u0345\u0345\u0062"},
2215                 {"\u0061\u0345\u0360\u0362\u0360\u0062", "\u0061\u0362\u0360\u0360\u0345\u0062"},
2216                 {"\u0061\u0360\u0345\u0360\u0362\u0062", "\u0061\u0362\u0360\u0360\u0345\u0062"},
2217                 {"\u0061\u0345\u0360\u0362\u0361\u0062", "\u0061\u0362\u0360\u0361\u0345\u0062"},
2218                 {"\u0061\u0361\u0345\u0360\u0362\u0062", "\u0061\u0362\u0361\u0360\u0345\u0062"},
2219         };
2220         for(int i = 0; i< cases.length; i++){
2221             String s1 =cases[0][0];
2222             String s2 = cases[0][1];
2223             if( (Normalizer.compare(s1,s2,Normalizer.FOLD_CASE_DEFAULT ==0)//case sensitive compare
2224                 &&
2225                 (Normalizer.compare(s1,s2,Normalizer.COMPARE_IGNORE_CASE)!=0)){
2226                 errln("Normalizer.compare() failed for s1: "
2227                         + Utility.hex(s1) +" s2: " + Utility.hex(s2));
2228             }
2229         }
2230     }
2231     */
TestFCNFKCClosure()2232     public void TestFCNFKCClosure() {
2233         final class TestStruct{
2234             int c;
2235             String s;
2236             TestStruct(int cp, String src){
2237                 c=cp;
2238                 s=src;
2239             }
2240         }
2241 
2242         TestStruct[] tests= new TestStruct[]{
2243             new TestStruct( 0x00C4, "" ),
2244             new TestStruct( 0x00E4, "" ),
2245             new TestStruct( 0x037A, "\u0020\u03B9" ),
2246             new TestStruct( 0x03D2, "\u03C5" ),
2247             new TestStruct( 0x20A8, "\u0072\u0073" ) ,
2248             new TestStruct( 0x210B, "\u0068" ),
2249             new TestStruct( 0x210C, "\u0068" ),
2250             new TestStruct( 0x2121, "\u0074\u0065\u006C" ),
2251             new TestStruct( 0x2122, "\u0074\u006D" ),
2252             new TestStruct( 0x2128, "\u007A" ),
2253             new TestStruct( 0x1D5DB,"\u0068" ),
2254             new TestStruct( 0x1D5ED,"\u007A" ),
2255             new TestStruct( 0x0061, "" )
2256         };
2257 
2258 
2259         for(int i = 0; i < tests.length; ++ i) {
2260             String result=Normalizer.getFC_NFKC_Closure(tests[i].c);
2261             if(!result.equals(new String(tests[i].s))) {
2262                 errln("getFC_NFKC_Closure(U+"+Integer.toHexString(tests[i].c)+") is wrong");
2263             }
2264         }
2265 
2266         /* error handling */
2267 
2268         int length=Normalizer.getFC_NFKC_Closure(0x5c, null);
2269         if(length!=0){
2270             errln("getFC_NFKC_Closure did not perform error handling correctly");
2271         }
2272     }
TestBugJ2324()2273     public void TestBugJ2324(){
2274        /* String[] input = new String[]{
2275                             //"\u30FD\u3099",
2276                             "\u30FA\u309A",
2277                             "\u30FB\u309A",
2278                             "\u30FC\u309A",
2279                             "\u30FE\u309A",
2280                             "\u30FD\u309A",
2281 
2282         };*/
2283         String troublesome = "\u309A";
2284         for(int i=0x3000; i<0x3100;i++){
2285             String input = ((char)i)+troublesome;
2286             try{
2287               /*  String result =*/ Normalizer.compose(input,false);
2288             }catch(IndexOutOfBoundsException e){
2289                 errln("compose() failed for input: " + Utility.hex(input) + " Exception: " + e.toString());
2290             }
2291         }
2292 
2293     }
2294 
    // Array indexes for per-normalization-form data. D, C, KD and KC address the
    // NFD, NFC, NFKD and NFKC entries of the UnicodeSet arrays used by
    // initSkippables() and TestSkippable().
    // NOTE(review): FCD and NONE are not referenced in this part of the file;
    // presumably they are used elsewhere - confirm before removing.
    static final int D = 0, C = 1, KD= 2, KC = 3, FCD=4, NONE=5;
2296 
initSkippables(UnicodeSet[] skipSets)2297     private static UnicodeSet[] initSkippables(UnicodeSet[] skipSets) {
2298         skipSets[D].applyPattern("[[:NFD_QC=Yes:]&[:ccc=0:]]", false);
2299         skipSets[C].applyPattern("[[:NFC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]", false);
2300         skipSets[KD].applyPattern("[[:NFKD_QC=Yes:]&[:ccc=0:]]", false);
2301         skipSets[KC].applyPattern("[[:NFKC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]", false);
2302 
2303         // Remove from the NFC and NFKC sets all those characters that change
2304         // when a back-combining character is added.
2305         // First, get all of the back-combining characters and their combining classes.
2306         UnicodeSet combineBack=new UnicodeSet("[:NFC_QC=Maybe:]");
2307         int numCombineBack=combineBack.size();
2308         int[] combineBackCharsAndCc=new int[numCombineBack*2];
2309         UnicodeSetIterator iter=new UnicodeSetIterator(combineBack);
2310         for(int i=0; i<numCombineBack; ++i) {
2311             iter.next();
2312             int c=iter.codepoint;
2313             combineBackCharsAndCc[2*i]=c;
2314             combineBackCharsAndCc[2*i+1]=UCharacter.getCombiningClass(c);
2315         }
2316 
2317         // We need not look at control codes, Han characters nor Hangul LVT syllables because they
2318         // do not combine forward. LV syllables are already removed.
2319         UnicodeSet notInteresting=new UnicodeSet("[[:C:][:Unified_Ideograph:][:HST=LVT:]]");
2320         UnicodeSet unsure=((UnicodeSet)(skipSets[C].clone())).removeAll(notInteresting);
2321         // System.out.format("unsure.size()=%d\n", unsure.size());
2322 
2323         // For each character about which we are unsure, see if it changes when we add
2324         // one of the back-combining characters.
2325         Normalizer2 norm2=Normalizer2.getNFCInstance();
2326         StringBuilder s=new StringBuilder();
2327         iter.reset(unsure);
2328         while(iter.next()) {
2329             int c=iter.codepoint;
2330             s.delete(0, 0x7fffffff).appendCodePoint(c);
2331             int cLength=s.length();
2332             int tccc=UCharacter.getIntPropertyValue(c, UProperty.TRAIL_CANONICAL_COMBINING_CLASS);
2333             for(int i=0; i<numCombineBack; ++i) {
2334                 // If c's decomposition ends with a character with non-zero combining class, then
2335                 // c can only change if it combines with a character with a non-zero combining class.
2336                 int cc2=combineBackCharsAndCc[2*i+1];
2337                 if(tccc==0 || cc2!=0) {
2338                     int c2=combineBackCharsAndCc[2*i];
2339                     s.appendCodePoint(c2);
2340                     if(!norm2.isNormalized(s)) {
2341                         // System.out.format("remove U+%04x (tccc=%d) + U+%04x (cc=%d)\n", c, tccc, c2, cc2);
2342                         skipSets[C].remove(c);
2343                         skipSets[KC].remove(c);
2344                         break;
2345                     }
2346                     s.delete(cLength, 0x7fffffff);
2347                 }
2348             }
2349         }
2350         return skipSets;
2351     }
2352 
TestSkippable()2353     public void TestSkippable() {
2354         UnicodeSet[] skipSets = new UnicodeSet[] {
2355             new UnicodeSet(), //NFD
2356             new UnicodeSet(), //NFC
2357             new UnicodeSet(), //NFKD
2358             new UnicodeSet()  //NFKC
2359         };
2360         UnicodeSet[] expectSets = new UnicodeSet[] {
2361             new UnicodeSet(),
2362             new UnicodeSet(),
2363             new UnicodeSet(),
2364             new UnicodeSet()
2365         };
2366         StringBuilder s, pattern;
2367 
2368         // build NF*Skippable sets from runtime data
2369         skipSets[D].applyPattern("[:NFD_Inert:]");
2370         skipSets[C].applyPattern("[:NFC_Inert:]");
2371         skipSets[KD].applyPattern("[:NFKD_Inert:]");
2372         skipSets[KC].applyPattern("[:NFKC_Inert:]");
2373 
2374         expectSets = initSkippables(expectSets);
2375         if(expectSets[D].contains(0x0350)){
2376             errln("expectSets[D] contains 0x0350");
2377         }
2378         for(int i=0; i<expectSets.length; ++i) {
2379             if(!skipSets[i].equals(expectSets[i])) {
2380                 errln("error: TestSkippable skipSets["+i+"]!=expectedSets["+i+"]\n");
2381                 // Note: This used to depend on hardcoded UnicodeSet patterns generated by
2382                 // Mark's unicodetools.com.ibm.text.UCD.NFSkippable, by
2383                 // running com.ibm.text.UCD.Main with the option NFSkippable.
2384                 // Since ICU 4.6/Unicode 6, we are generating the
2385                 // expectSets ourselves in initSkippables().
2386 
2387                 s=new StringBuilder();
2388 
2389                 s.append("\n\nskip=       ");
2390                 s.append(skipSets[i].toPattern(true));
2391                 s.append("\n\n");
2392 
2393                 s.append("skip-expect=");
2394                 pattern = new StringBuilder(((UnicodeSet)skipSets[i].clone()).removeAll(expectSets[i]).toPattern(true));
2395                 s.append(pattern);
2396 
2397                 pattern.delete(0,pattern.length());
2398                 s.append("\n\nexpect-skip=");
2399                 pattern = new StringBuilder(((UnicodeSet)expectSets[i].clone()).removeAll(skipSets[i]).toPattern(true));
2400                 s.append(pattern);
2401                 s.append("\n\n");
2402 
2403                 pattern.delete(0,pattern.length());
2404                 s.append("\n\nintersection(expect,skip)=");
2405                 UnicodeSet intersection  = ((UnicodeSet) expectSets[i].clone()).retainAll(skipSets[i]);
2406                 pattern = new StringBuilder(intersection.toPattern(true));
2407                 s.append(pattern);
2408                 s.append("\n\n");
2409 
2410                 errln(s.toString());
2411             }
2412         }
2413     }
2414 
TestBugJ2068()2415     public void TestBugJ2068(){
2416         String sample = "The quick brown fox jumped over the lazy dog";
2417         UCharacterIterator text = UCharacterIterator.getInstance(sample);
2418         Normalizer norm = new Normalizer(text,Normalizer.NFC,0);
2419         text.setIndex(4);
2420         if(text.current() == norm.current()){
2421             errln("Normalizer is not cloning the UCharacterIterator");
2422         }
2423      }
TestGetCombiningClass()2424      public void TestGetCombiningClass(){
2425         for(int i=0;i<0x10FFFF;i++){
2426             int cc = UCharacter.getCombiningClass(i);
2427             if(0xD800<= i && i<=0xDFFF && cc >0 ){
2428                 cc = UCharacter.getCombiningClass(i);
2429                 errln("CC: "+ cc + " for codepoint: " +Utility.hex(i,8));
2430             }
2431         }
2432     }
2433 
TestSerializedSet()2434     public void TestSerializedSet(){
2435         USerializedSet sset=new USerializedSet();
2436         UnicodeSet set = new UnicodeSet();
2437         int start, end;
2438 
2439         char[] serialized = {
2440             0x8007,  // length
2441             3,  // bmpLength
2442             0xc0, 0xfe, 0xfffc,
2443             1, 9, 0x10, 0xfffc
2444         };
2445         sset.getSet(serialized, 0);
2446 
2447         // collect all sets into one for contiguous output
2448         int[] startEnd = new int[2];
2449         int count=sset.countRanges();
2450         for(int j=0; j<count; ++j) {
2451             sset.getRange(j, startEnd);
2452             set.add(startEnd[0], startEnd[1]);
2453         }
2454 
2455         // test all of these characters
2456         UnicodeSetIterator it = new UnicodeSetIterator(set);
2457         while(it.nextRange() && it.codepoint!=UnicodeSetIterator.IS_STRING) {
2458             start=it.codepoint;
2459             end=it.codepointEnd;
2460             while(start<=end) {
2461                 if(!sset.contains(start)){
2462                     errln("USerializedSet.contains failed for "+Utility.hex(start,8));
2463                 }
2464                 ++start;
2465             }
2466         }
2467     }
2468 
TestReturnFailure()2469     public void TestReturnFailure(){
2470         char[] term = {'r','\u00e9','s','u','m','\u00e9' };
2471         char[] decomposed_term = new char[10 + term.length + 2];
2472         int rc = Normalizer.decompose(term,0,term.length, decomposed_term,0,decomposed_term.length,true, 0);
2473         int rc1 = Normalizer.decompose(term,0,term.length, decomposed_term,10,decomposed_term.length,true, 0);
2474         if(rc!=rc1){
2475             errln("Normalizer decompose did not return correct length");
2476         }
2477     }
2478 
2479     private final static class TestCompositionCase {
2480         public Normalizer.Mode mode;
2481         public int options;
2482         public String input, expect;
TestCompositionCase(Normalizer.Mode mode, int options, String input, String expect)2483         TestCompositionCase(Normalizer.Mode mode, int options, String input, String expect) {
2484             this.mode=mode;
2485             this.options=options;
2486             this.input=input;
2487             this.expect=expect;
2488         }
2489     }
2490 
TestComposition()2491     public void TestComposition() {
2492         final TestCompositionCase cases[]=new TestCompositionCase[]{
2493             /*
2494              * special cases for UAX #15 bug
2495              * see Unicode Corrigendum #5: Normalization Idempotency
2496              * at http://unicode.org/versions/corrigendum5.html
2497              * (was Public Review Issue #29)
2498              */
2499             new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327",      "\u1100\u0300\u1161\u0327"),
2500             new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327\u11a8","\u1100\u0300\u1161\u0327\u11a8"),
2501             new TestCompositionCase(Normalizer.NFC, 0, "\uac00\u0300\u0327\u11a8",      "\uac00\u0327\u0300\u11a8"),
2502             new TestCompositionCase(Normalizer.NFC, 0, "\u0b47\u0300\u0b3e",            "\u0b47\u0300\u0b3e"),
2503 
2504             /* TODO: add test cases for UNORM_FCC here (j2151) */
2505         };
2506 
2507         String output;
2508         int i;
2509 
2510         for(i=0; i<cases.length; ++i) {
2511             output=Normalizer.normalize(cases[i].input, cases[i].mode, cases[i].options);
2512             if(!output.equals(cases[i].expect)) {
2513                 errln("unexpected result for case "+i);
2514             }
2515         }
2516     }
2517 
TestGetDecomposition()2518     public void TestGetDecomposition() {
2519         Normalizer2 n2=Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE_CONTIGUOUS);
2520         String decomp=n2.getDecomposition(0x20);
2521         assertEquals("fcc.getDecomposition(space) failed", null, decomp);
2522         decomp=n2.getDecomposition(0xe4);
2523         assertEquals("fcc.getDecomposition(a-umlaut) failed", "a\u0308", decomp);
2524         decomp=n2.getDecomposition(0xac01);
2525         assertEquals("fcc.getDecomposition(Hangul syllable U+AC01) failed", "\u1100\u1161\u11a8", decomp);
2526     }
2527 
TestGetRawDecomposition()2528     public void TestGetRawDecomposition() {
2529         Normalizer2 n2=Normalizer2.getNFKCInstance();
2530         /*
2531          * Raw decompositions from NFKC data are the Unicode Decomposition_Mapping values,
2532          * without recursive decomposition.
2533          */
2534 
2535         String decomp=n2.getRawDecomposition(0x20);
2536         assertEquals("nfkc.getRawDecomposition(space) failed", null, decomp);
2537         decomp=n2.getRawDecomposition(0xe4);
2538         assertEquals("nfkc.getRawDecomposition(a-umlaut) failed", "a\u0308", decomp);
2539         /* U+1E08 LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE */
2540         decomp=n2.getRawDecomposition(0x1e08);
2541         assertEquals("nfkc.getRawDecomposition(c-cedilla-acute) failed", "\u00c7\u0301", decomp);
2542         /* U+212B ANGSTROM SIGN */
2543         decomp=n2.getRawDecomposition(0x212b);
2544         assertEquals("nfkc.getRawDecomposition(angstrom sign) failed", "\u00c5", decomp);
2545         decomp=n2.getRawDecomposition(0xac00);
2546         assertEquals("nfkc.getRawDecomposition(Hangul syllable U+AC00) failed", "\u1100\u1161", decomp);
2547         /* A Hangul LVT syllable has a raw decomposition of an LV syllable + T. */
2548         decomp=n2.getRawDecomposition(0xac01);
2549         assertEquals("nfkc.getRawDecomposition(Hangul syllable U+AC01) failed", "\uac00\u11a8", decomp);
2550     }
2551 
TestCustomComp()2552     public void TestCustomComp() {
2553         String [][] pairs={
2554             { "\\uD801\\uE000\\uDFFE", "" },
2555             { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" },
2556             { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" },
2557             { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE002\\U000110B9\\u0327\\u0345" },
2558             { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" },
2559             { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" },
2560             { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" },
2561             { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" }
2562         };
2563         Normalizer2 customNorm2;
2564         customNorm2=
2565             Normalizer2.getInstance(
2566                 BasicTest.class.getResourceAsStream("/com/ibm/icu/dev/data/testdata/testnorm.nrm"),
2567                 "testnorm",
2568                 Normalizer2.Mode.COMPOSE);
2569         for(int i=0; i<pairs.length; ++i) {
2570             String[] pair=pairs[i];
2571             String input=Utility.unescape(pair[0]);
2572             String expected=Utility.unescape(pair[1]);
2573             String result=customNorm2.normalize(input);
2574             if(!result.equals(expected)) {
2575                 errln("custom compose Normalizer2 did not normalize input "+i+" as expected");
2576             }
2577         }
2578     }
2579 
TestCustomFCC()2580     public void TestCustomFCC() {
2581         String[][] pairs={
2582             { "\\uD801\\uE000\\uDFFE", "" },
2583             { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" },
2584             { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" },
2585             // The following expected result is different from CustomComp
2586             // because of only-contiguous composition.
2587             { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE001\\U000110B9\\u0327\\u0308\\u0345" },
2588             { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" },
2589             { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" },
2590             { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" },
2591             { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" }
2592         };
2593         Normalizer2 customNorm2;
2594         customNorm2=
2595             Normalizer2.getInstance(
2596                 BasicTest.class.getResourceAsStream("/com/ibm/icu/dev/data/testdata/testnorm.nrm"),
2597                 "testnorm",
2598                 Normalizer2.Mode.COMPOSE_CONTIGUOUS);
2599         for(int i=0; i<pairs.length; ++i) {
2600             String[] pair=pairs[i];
2601             String input=Utility.unescape(pair[0]);
2602             String expected=Utility.unescape(pair[1]);
2603             String result=customNorm2.normalize(input);
2604             if(!result.equals(expected)) {
2605                 errln("custom FCC Normalizer2 did not normalize input "+i+" as expected");
2606             }
2607         }
2608     }
2609 
TestCanonIterData()2610     public void TestCanonIterData() {
2611         // For now, just a regression test.
2612         Normalizer2Impl impl=Norm2AllModes.getNFCInstance().impl.ensureCanonIterData();
2613         // U+0FB5 TIBETAN SUBJOINED LETTER SSA is the trailing character
2614         // in some decomposition mappings where there is a composition exclusion.
2615         // In fact, U+0FB5 is normalization-inert (NFC_QC=Yes, NFD_QC=Yes, ccc=0)
2616         // but it is not a segment starter because it occurs in a decomposition mapping.
2617         if(impl.isCanonSegmentStarter(0xfb5)) {
2618             errln("isCanonSegmentStarter(U+0fb5)=true is wrong");
2619         }
2620         // For [:Segment_Starter:] to work right, not just the property function has to work right,
2621         // UnicodeSet also needs a correct range starts set.
2622         UnicodeSet segStarters=new UnicodeSet("[:Segment_Starter:]").freeze();
2623         if(segStarters.contains(0xfb5)) {
2624             errln("[:Segment_Starter:].contains(U+0fb5)=true is wrong");
2625         }
2626         // Try characters up to Kana and miscellaneous CJK but below Han (for expediency).
2627         for(int c=0; c<=0x33ff; ++c) {
2628             boolean isStarter=impl.isCanonSegmentStarter(c);
2629             boolean isContained=segStarters.contains(c);
2630             if(isStarter!=isContained) {
2631                 errln(String.format(
2632                         "discrepancy: isCanonSegmentStarter(U+%04x)=%5b != " +
2633                         "[:Segment_Starter:].contains(same)",
2634                         c, isStarter));
2635             }
2636         }
2637     }
2638 
TestFilteredNormalizer2()2639     public void TestFilteredNormalizer2() {
2640         Normalizer2 nfcNorm2=Normalizer2.getNFCInstance();
2641         UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff\u0310-\u031f]");
2642         FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter);
2643         int c;
2644         for(c=0; c<=0x3ff; ++c) {
2645             int expectedCC= filter.contains(c) ? nfcNorm2.getCombiningClass(c) : 0;
2646             int cc=fn2.getCombiningClass(c);
2647             assertEquals(
2648                     "FilteredNormalizer2(NFC, ^A0-FF,310-31F).getCombiningClass(U+"+hex(c)+
2649                     ")==filtered NFC.getCC()",
2650                     expectedCC, cc);
2651         }
2652     }
2653 
TestFilteredAppend()2654     public void TestFilteredAppend() {
2655         Normalizer2 nfcNorm2=Normalizer2.getNFCInstance();
2656         UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff\u0310-\u031f]");
2657         FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter);
2658 
2659         // Append two strings that each contain a character outside the filter set.
2660         StringBuilder sb = new StringBuilder("a\u0313a");
2661         String second = "\u0301\u0313";
2662         assertEquals("append()", "a\u0313á\u0313", fn2.append(sb, second).toString());
2663 
2664         // Same, and also normalize the second string.
2665         sb.replace(0, 0x7fffffff, "a\u0313a");
2666         assertEquals(
2667             "normalizeSecondAndAppend()",
2668             "a\u0313á\u0313", fn2.normalizeSecondAndAppend(sb, second).toString());
2669 
2670         // Normalizer2.normalize(String) uses spanQuickCheckYes() and normalizeSecondAndAppend().
2671         assertEquals("normalize()", "a\u0313á\u0313", fn2.normalize("a\u0313a\u0301\u0313"));
2672     }
2673 
TestGetEasyToUseInstance()2674     public void TestGetEasyToUseInstance() {
2675         // Test input string:
2676         // U+00A0 -> <noBreak> 0020
2677         // U+00C7 0301 = 1E08 = 0043 0327 0301
2678         String in="\u00A0\u00C7\u0301";
2679         Normalizer2 n2=Normalizer2.getNFCInstance();
2680         String out=n2.normalize(in);
2681         assertEquals(
2682                 "getNFCInstance() did not return an NFC instance " +
2683                 "(normalizes to " + prettify(out) + ')',
2684                 "\u00A0\u1E08", out);
2685 
2686         n2=Normalizer2.getNFDInstance();
2687         out=n2.normalize(in);
2688         assertEquals(
2689                 "getNFDInstance() did not return an NFD instance " +
2690                 "(normalizes to " + prettify(out) + ')',
2691                 "\u00A0C\u0327\u0301", out);
2692 
2693         n2=Normalizer2.getNFKCInstance();
2694         out=n2.normalize(in);
2695         assertEquals(
2696                 "getNFKCInstance() did not return an NFKC instance " +
2697                 "(normalizes to " + prettify(out) + ')',
2698                 " \u1E08", out);
2699 
2700         n2=Normalizer2.getNFKDInstance();
2701         out=n2.normalize(in);
2702         assertEquals(
2703                 "getNFKDInstance() did not return an NFKD instance " +
2704                 "(normalizes to " + prettify(out) + ')',
2705                 " C\u0327\u0301", out);
2706 
2707         n2=Normalizer2.getNFKCCasefoldInstance();
2708         out=n2.normalize(in);
2709         assertEquals(
2710                 "getNFKCCasefoldInstance() did not return an NFKC_Casefold instance " +
2711                 "(normalizes to " + prettify(out) + ')',
2712                 " \u1E09", out);
2713     }
2714 }
2715