1 /* 2 ******************************************************************************* 3 * Copyright (C) 1996-2012, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 8 package com.ibm.icu.dev.test.normalizer; 9 10 import java.text.StringCharacterIterator; 11 import java.util.Random; 12 13 import com.ibm.icu.dev.test.TestFmwk; 14 import com.ibm.icu.impl.Norm2AllModes; 15 import com.ibm.icu.impl.Normalizer2Impl; 16 import com.ibm.icu.impl.USerializedSet; 17 import com.ibm.icu.impl.Utility; 18 import com.ibm.icu.lang.UCharacter; 19 import com.ibm.icu.lang.UCharacterCategory; 20 import com.ibm.icu.lang.UProperty; 21 import com.ibm.icu.text.FilteredNormalizer2; 22 import com.ibm.icu.text.Normalizer; 23 import com.ibm.icu.text.Normalizer2; 24 import com.ibm.icu.text.UCharacterIterator; 25 import com.ibm.icu.text.UTF16; 26 import com.ibm.icu.text.UnicodeSet; 27 import com.ibm.icu.text.UnicodeSetIterator; 28 29 30 public class BasicTest extends TestFmwk { main(String[] args)31 public static void main(String[] args) throws Exception { 32 new BasicTest().run(args); 33 } 34 35 String[][] canonTests = { 36 // Input Decomposed Composed 37 { "cat", "cat", "cat" }, 38 { "\u00e0ardvark", "a\u0300ardvark", "\u00e0ardvark", }, 39 40 { "\u1e0a", "D\u0307", "\u1e0a" }, // D-dot_above 41 { "D\u0307", "D\u0307", "\u1e0a" }, // D dot_above 42 43 { "\u1e0c\u0307", "D\u0323\u0307", "\u1e0c\u0307" }, // D-dot_below dot_above 44 { "\u1e0a\u0323", "D\u0323\u0307", "\u1e0c\u0307" }, // D-dot_above dot_below 45 { "D\u0307\u0323", "D\u0323\u0307", "\u1e0c\u0307" }, // D dot_below dot_above 46 47 { "\u1e10\u0307\u0323", "D\u0327\u0323\u0307", "\u1e10\u0323\u0307"}, // D dot_below cedilla dot_above 48 { "D\u0307\u0328\u0323","D\u0328\u0323\u0307", "\u1e0c\u0328\u0307"}, // D dot_above ogonek dot_below 49 50 { "\u1E14", "E\u0304\u0300", "\u1E14" }, // E-macron-grave 51 { "\u0112\u0300", "E\u0304\u0300", "\u1E14" }, // E-macron + grave 52 { "\u00c8\u0304", "E\u0300\u0304", "\u00c8\u0304" }, // E-grave + macron 53 54 { "\u212b", "A\u030a", "\u00c5" }, // angstrom_sign 55 { "\u00c5", "A\u030a", "\u00c5" }, // A-ring 56 57 { "\u00c4ffin", "A\u0308ffin", "\u00c4ffin" }, 58 { "\u00c4\uFB03n", "A\u0308\uFB03n", "\u00c4\uFB03n" }, 59 60 { "\u00fdffin", "y\u0301ffin", "\u00fdffin" }, //updated with 3.0 61 { "\u00fd\uFB03n", "y\u0301\uFB03n", "\u00fd\uFB03n" }, //updated with 3.0 62 63 { "Henry IV", "Henry IV", "Henry IV" }, 64 { "Henry \u2163", "Henry \u2163", "Henry \u2163" }, 65 66 { "\u30AC", "\u30AB\u3099", "\u30AC" }, // ga (Katakana) 67 { "\u30AB\u3099", "\u30AB\u3099", "\u30AC" }, // ka + ten 68 { "\uFF76\uFF9E", "\uFF76\uFF9E", "\uFF76\uFF9E" }, // hw_ka + hw_ten 69 { "\u30AB\uFF9E", "\u30AB\uFF9E", "\u30AB\uFF9E" }, // ka + hw_ten 70 { "\uFF76\u3099", "\uFF76\u3099", "\uFF76\u3099" }, // hw_ka + ten 71 72 { "A\u0300\u0316", "A\u0316\u0300", "\u00C0\u0316" }, 73 {"\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e","\\U0001D157\\U0001D165\\U0001D157\\U0001D165\\U0001D157\\U0001D165", "\\U0001D157\\U0001D165\\U0001D157\\U0001D165\\U0001D157\\U0001D165"}, 74 }; 75 76 String[][] compatTests = { 77 // Input Decomposed Composed 78 { "cat", "cat", "cat" }, 79 { "\uFB4f", "\u05D0\u05DC", "\u05D0\u05DC", }, // Alef-Lamed vs. Alef, Lamed 80 81 { "\u00C4ffin", "A\u0308ffin", "\u00C4ffin" }, 82 { "\u00C4\uFB03n", "A\u0308ffin", "\u00C4ffin" }, // ffi ligature -> f + f + i 83 84 { "\u00fdffin", "y\u0301ffin", "\u00fdffin" }, //updated for 3.0 85 { "\u00fd\uFB03n", "y\u0301ffin", "\u00fdffin" }, // ffi ligature -> f + f + i 86 87 { "Henry IV", "Henry IV", "Henry IV" }, 88 { "Henry \u2163", "Henry IV", "Henry IV" }, 89 90 { "\u30AC", "\u30AB\u3099", "\u30AC" }, // ga (Katakana) 91 { "\u30AB\u3099", "\u30AB\u3099", "\u30AC" }, // ka + ten 92 93 { "\uFF76\u3099", "\u30AB\u3099", "\u30AC" }, // hw_ka + ten 94 95 /* These two are broken in Unicode 2.1.2 but fixed in 2.1.5 and later*/ 96 { "\uFF76\uFF9E", "\u30AB\u3099", "\u30AC" }, // hw_ka + hw_ten 97 { "\u30AB\uFF9E", "\u30AB\u3099", "\u30AC" }, // ka + hw_ten 98 99 }; 100 101 // With Canonical decomposition, Hangul syllables should get decomposed 102 // into Jamo, but Jamo characters should not be decomposed into 103 // conjoining Jamo 104 String[][] hangulCanon = { 105 // Input Decomposed Composed 106 { "\ud4db", "\u1111\u1171\u11b6", "\ud4db" }, 107 { "\u1111\u1171\u11b6", "\u1111\u1171\u11b6", "\ud4db" }, 108 }; 109 110 // With compatibility decomposition turned on, 111 // it should go all the way down to conjoining Jamo characters. 112 // THIS IS NO LONGER TRUE IN UNICODE v2.1.8, SO THIS TEST IS OBSOLETE 113 String[][] hangulCompat = { 114 // Input Decomposed Composed 115 // { "\ud4db", "\u1111\u116e\u1175\u11af\u11c2", "\ud478\u1175\u11af\u11c2" }, 116 }; 117 TestHangulCompose()118 public void TestHangulCompose() 119 throws Exception{ 120 // Make sure that the static composition methods work 121 logln("Canonical composition..."); 122 staticTest(Normalizer.NFC, hangulCanon, 2); 123 logln("Compatibility composition..."); 124 staticTest(Normalizer.NFKC, hangulCompat, 2); 125 // Now try iterative composition.... 126 logln("Iterative composition..."); 127 Normalizer norm = new Normalizer("", Normalizer.NFC,0); 128 iterateTest(norm, hangulCanon, 2); 129 130 norm.setMode(Normalizer.NFKD); 131 iterateTest(norm, hangulCompat, 2); 132 133 // And finally, make sure you can do it in reverse too 134 logln("Reverse iteration..."); 135 norm.setMode(Normalizer.NFC); 136 backAndForth(norm, hangulCanon); 137 } 138 TestHangulDecomp()139 public void TestHangulDecomp() throws Exception{ 140 // Make sure that the static decomposition methods work 141 logln("Canonical decomposition..."); 142 staticTest(Normalizer.NFD, hangulCanon, 1); 143 logln("Compatibility decomposition..."); 144 staticTest(Normalizer.NFKD, hangulCompat, 1); 145 146 // Now the iterative decomposition methods... 147 logln("Iterative decomposition..."); 148 Normalizer norm = new Normalizer("", Normalizer.NFD,0); 149 iterateTest(norm, hangulCanon, 1); 150 151 norm.setMode(Normalizer.NFKD); 152 iterateTest(norm, hangulCompat, 1); 153 154 // And finally, make sure you can do it in reverse too 155 logln("Reverse iteration..."); 156 norm.setMode(Normalizer.NFD); 157 backAndForth(norm, hangulCanon); 158 } TestNone()159 public void TestNone() throws Exception{ 160 Normalizer norm = new Normalizer("", Normalizer.NONE,0); 161 iterateTest(norm, canonTests, 0); 162 staticTest(Normalizer.NONE, canonTests, 0); 163 } TestDecomp()164 public void TestDecomp() throws Exception{ 165 Normalizer norm = new Normalizer("", Normalizer.NFD,0); 166 iterateTest(norm, canonTests, 1); 167 staticTest(Normalizer.NFD, canonTests, 1); 168 decomposeTest(Normalizer.NFD, canonTests, 1); 169 } 170 TestCompatDecomp()171 public void TestCompatDecomp() throws Exception{ 172 Normalizer norm = new Normalizer("", Normalizer.NFKD,0); 173 iterateTest(norm, compatTests, 1); 174 staticTest(Normalizer.NFKD,compatTests, 1); 175 decomposeTest(Normalizer.NFKD,compatTests, 1); 176 } 177 TestCanonCompose()178 public void TestCanonCompose() throws Exception{ 179 Normalizer norm = new Normalizer("", Normalizer.NFC,0); 180 iterateTest(norm, canonTests, 2); 181 staticTest(Normalizer.NFC, canonTests, 2); 182 composeTest(Normalizer.NFC, canonTests, 2); 183 } 184 TestCompatCompose()185 public void TestCompatCompose() throws Exception{ 186 Normalizer norm = new Normalizer("", Normalizer.NFKC,0); 187 iterateTest(norm, compatTests, 2); 188 staticTest(Normalizer.NFKC,compatTests, 2); 189 composeTest(Normalizer.NFKC,compatTests, 2); 190 } 191 TestExplodingBase()192 public void TestExplodingBase() throws Exception{ 193 // \u017f - Latin small letter long s 194 // \u0307 - combining dot above 195 // \u1e61 - Latin small letter s with dot above 196 // \u1e9b - Latin small letter long s with dot above 197 String[][] canon = { 198 // Input Decomposed Composed 199 { "Tschu\u017f", "Tschu\u017f", "Tschu\u017f" }, 200 { "Tschu\u1e9b", "Tschu\u017f\u0307", "Tschu\u1e9b" }, 201 }; 202 String[][] compat = { 203 // Input Decomposed Composed 204 { "\u017f", "s", "s" }, 205 { "\u1e9b", "s\u0307", "\u1e61" }, 206 }; 207 208 staticTest(Normalizer.NFD, canon, 1); 209 staticTest(Normalizer.NFC, canon, 2); 210 211 staticTest(Normalizer.NFKD, compat, 1); 212 staticTest(Normalizer.NFKC, compat, 2); 213 214 } 215 216 /** 217 * The Tibetan vowel sign AA, 0f71, was messed up prior to 218 * Unicode version 2.1.9. 219 * Once 2.1.9 or 3.0 is released, uncomment this test. 220 */ TestTibetan()221 public void TestTibetan() throws Exception{ 222 String[][] decomp = { 223 { "\u0f77", "\u0f77", "\u0fb2\u0f71\u0f80" } 224 }; 225 String[][] compose = { 226 { "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80", "\u0fb2\u0f71\u0f80" } 227 }; 228 229 staticTest(Normalizer.NFD, decomp, 1); 230 staticTest(Normalizer.NFKD,decomp, 2); 231 staticTest(Normalizer.NFC, compose, 1); 232 staticTest(Normalizer.NFKC,compose, 2); 233 } 234 235 /** 236 * Make sure characters in the CompositionExclusion.txt list do not get 237 * composed to. 238 */ TestCompositionExclusion()239 public void TestCompositionExclusion() 240 throws Exception{ 241 // This list is generated from CompositionExclusion.txt. 242 // Update whenever the normalizer tables are updated. Note 243 // that we test all characters listed, even those that can be 244 // derived from the Unicode DB and are therefore commented 245 // out. 246 String EXCLUDED = 247 "\u0340\u0341\u0343\u0344\u0374\u037E\u0387\u0958" + 248 "\u0959\u095A\u095B\u095C\u095D\u095E\u095F\u09DC" + 249 "\u09DD\u09DF\u0A33\u0A36\u0A59\u0A5A\u0A5B\u0A5E" + 250 "\u0B5C\u0B5D\u0F43\u0F4D\u0F52\u0F57\u0F5C\u0F69" + 251 "\u0F73\u0F75\u0F76\u0F78\u0F81\u0F93\u0F9D\u0FA2" + 252 "\u0FA7\u0FAC\u0FB9\u1F71\u1F73\u1F75\u1F77\u1F79" + 253 "\u1F7B\u1F7D\u1FBB\u1FBE\u1FC9\u1FCB\u1FD3\u1FDB" + 254 "\u1FE3\u1FEB\u1FEE\u1FEF\u1FF9\u1FFB\u1FFD\u2000" + 255 "\u2001\u2126\u212A\u212B\u2329\u232A\uF900\uFA10" + 256 "\uFA12\uFA15\uFA20\uFA22\uFA25\uFA26\uFA2A\uFB1F" + 257 "\uFB2A\uFB2B\uFB2C\uFB2D\uFB2E\uFB2F\uFB30\uFB31" + 258 "\uFB32\uFB33\uFB34\uFB35\uFB36\uFB38\uFB39\uFB3A" + 259 "\uFB3B\uFB3C\uFB3E\uFB40\uFB41\uFB43\uFB44\uFB46" + 260 "\uFB47\uFB48\uFB49\uFB4A\uFB4B\uFB4C\uFB4D\uFB4E"; 261 for (int i=0; i<EXCLUDED.length(); ++i) { 262 String a = String.valueOf(EXCLUDED.charAt(i)); 263 String b = Normalizer.normalize(a, Normalizer.NFKD); 264 String c = Normalizer.normalize(b, Normalizer.NFC); 265 if (c.equals(a)) { 266 errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " + 267 hex(b) + " x COMPOSE => " + 268 hex(c)); 269 } else if (isVerbose()) { 270 logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " + 271 hex(b) + " x COMPOSE => " + 272 hex(c)); 273 } 274 } 275 // The following method works too, but it is somewhat 276 // incestuous. It uses UInfo, which is the same database that 277 // NormalizerBuilder uses, so if something is wrong with 278 // UInfo, the following test won't show it. All it will show 279 // is that NormalizerBuilder has been run with whatever the 280 // current UInfo is. 281 // 282 // We comment this out in favor of the test above, which 283 // provides independent verification (but also requires 284 // independent updating). 285 // logln("---"); 286 // UInfo uinfo = new UInfo(); 287 // for (int i=0; i<=0xFFFF; ++i) { 288 // if (!uinfo.isExcludedComposition((char)i) || 289 // (!uinfo.hasCanonicalDecomposition((char)i) && 290 // !uinfo.hasCompatibilityDecomposition((char)i))) continue; 291 // String a = String.valueOf((char)i); 292 // String b = Normalizer.normalize(a,Normalizer.DECOMP_COMPAT,0); 293 // String c = Normalizer.normalize(b,Normalizer.COMPOSE,0); 294 // if (c.equals(a)) { 295 // errln("FAIL: " + hex(a) + " x DECOMP_COMPAT => " + 296 // hex(b) + " x COMPOSE => " + 297 // hex(c)); 298 // } else if (isVerbose()) { 299 // logln("Ok: " + hex(a) + " x DECOMP_COMPAT => " + 300 // hex(b) + " x COMPOSE => " + 301 // hex(c)); 302 // } 303 // } 304 } 305 306 /** 307 * Test for a problem that showed up just before ICU 1.6 release 308 * having to do with combining characters with an index of zero. 309 * Such characters do not participate in any canonical 310 * decompositions. However, having an index of zero means that 311 * they all share one typeMask[] entry, that is, they all have to 312 * map to the same canonical class, which is not the case, in 313 * reality. 314 */ TestZeroIndex()315 public void TestZeroIndex() 316 throws Exception{ 317 String[] DATA = { 318 // Expect col1 x COMPOSE_COMPAT => col2 319 // Expect col2 x DECOMP => col3 320 "A\u0316\u0300", "\u00C0\u0316", "A\u0316\u0300", 321 "A\u0300\u0316", "\u00C0\u0316", "A\u0316\u0300", 322 "A\u0327\u0300", "\u00C0\u0327", "A\u0327\u0300", 323 "c\u0321\u0327", "c\u0321\u0327", "c\u0321\u0327", 324 "c\u0327\u0321", "\u00E7\u0321", "c\u0327\u0321", 325 }; 326 327 for (int i=0; i<DATA.length; i+=3) { 328 String a = DATA[i]; 329 String b = Normalizer.normalize(a, Normalizer.NFKC); 330 String exp = DATA[i+1]; 331 if (b.equals(exp)) { 332 logln("Ok: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b)); 333 } else { 334 errln("FAIL: " + hex(a) + " x COMPOSE_COMPAT => " + hex(b) + 335 ", expect " + hex(exp)); 336 } 337 a = Normalizer.normalize(b, Normalizer.NFD); 338 exp = DATA[i+2]; 339 if (a.equals(exp)) { 340 logln("Ok: " + hex(b) + " x DECOMP => " + hex(a)); 341 } else { 342 errln("FAIL: " + hex(b) + " x DECOMP => " + hex(a) + 343 ", expect " + hex(exp)); 344 } 345 } 346 } 347 348 /** 349 * Test for a problem found by Verisign. Problem is that 350 * characters at the start of a string are not put in canonical 351 * order correctly by compose() if there is no starter. 352 */ TestVerisign()353 public void TestVerisign() 354 throws Exception{ 355 String[] inputs = { 356 "\u05b8\u05b9\u05b1\u0591\u05c3\u05b0\u05ac\u059f", 357 "\u0592\u05b7\u05bc\u05a5\u05b0\u05c0\u05c4\u05ad" 358 }; 359 String[] outputs = { 360 "\u05b1\u05b8\u05b9\u0591\u05c3\u05b0\u05ac\u059f", 361 "\u05b0\u05b7\u05bc\u05a5\u0592\u05c0\u05ad\u05c4" 362 }; 363 364 for (int i = 0; i < inputs.length; ++i) { 365 String input = inputs[i]; 366 String output = outputs[i]; 367 String result = Normalizer.decompose(input, false); 368 if (!result.equals(output)) { 369 errln("FAIL input: " + hex(input)); 370 errln(" decompose: " + hex(result)); 371 errln(" expected: " + hex(output)); 372 } 373 result = Normalizer.compose(input, false); 374 if (!result.equals(output)) { 375 errln("FAIL input: " + hex(input)); 376 errln(" compose: " + hex(result)); 377 errln(" expected: " + hex(output)); 378 } 379 } 380 381 } TestQuickCheckResultNO()382 public void TestQuickCheckResultNO() 383 throws Exception{ 384 final char CPNFD[] = {0x00C5, 0x0407, 0x1E00, 0x1F57, 0x220C, 385 0x30AE, 0xAC00, 0xD7A3, 0xFB36, 0xFB4E}; 386 final char CPNFC[] = {0x0340, 0x0F93, 0x1F77, 0x1FBB, 0x1FEB, 387 0x2000, 0x232A, 0xF900, 0xFA1E, 0xFB4E}; 388 final char CPNFKD[] = {0x00A0, 0x02E4, 0x1FDB, 0x24EA, 0x32FE, 389 0xAC00, 0xFB4E, 0xFA10, 0xFF3F, 0xFA2D}; 390 final char CPNFKC[] = {0x00A0, 0x017F, 0x2000, 0x24EA, 0x32FE, 391 0x33FE, 0xFB4E, 0xFA10, 0xFF3F, 0xFA2D}; 392 393 394 final int SIZE = 10; 395 396 int count = 0; 397 for (; count < SIZE; count ++) 398 { 399 if (Normalizer.quickCheck(String.valueOf(CPNFD[count]), 400 Normalizer.NFD,0) != Normalizer.NO) 401 { 402 errln("ERROR in NFD quick check at U+" + 403 Integer.toHexString(CPNFD[count])); 404 return; 405 } 406 if (Normalizer.quickCheck(String.valueOf(CPNFC[count]), 407 Normalizer.NFC,0) !=Normalizer.NO) 408 { 409 errln("ERROR in NFC quick check at U+"+ 410 Integer.toHexString(CPNFC[count])); 411 return; 412 } 413 if (Normalizer.quickCheck(String.valueOf(CPNFKD[count]), 414 Normalizer.NFKD,0) != Normalizer.NO) 415 { 416 errln("ERROR in NFKD quick check at U+"+ 417 Integer.toHexString(CPNFKD[count])); 418 return; 419 } 420 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), 421 Normalizer.NFKC,0) !=Normalizer.NO) 422 { 423 errln("ERROR in NFKC quick check at U+"+ 424 Integer.toHexString(CPNFKC[count])); 425 return; 426 } 427 // for improving coverage 428 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), 429 Normalizer.NFKC) !=Normalizer.NO) 430 { 431 errln("ERROR in NFKC quick check at U+"+ 432 Integer.toHexString(CPNFKC[count])); 433 return; 434 } 435 } 436 } 437 438 TestQuickCheckResultYES()439 public void TestQuickCheckResultYES() 440 throws Exception{ 441 final char CPNFD[] = {0x00C6, 0x017F, 0x0F74, 0x1000, 0x1E9A, 442 0x2261, 0x3075, 0x4000, 0x5000, 0xF000}; 443 final char CPNFC[] = {0x0400, 0x0540, 0x0901, 0x1000, 0x1500, 444 0x1E9A, 0x3000, 0x4000, 0x5000, 0xF000}; 445 final char CPNFKD[] = {0x00AB, 0x02A0, 0x1000, 0x1027, 0x2FFB, 446 0x3FFF, 0x4FFF, 0xA000, 0xF000, 0xFA27}; 447 final char CPNFKC[] = {0x00B0, 0x0100, 0x0200, 0x0A02, 0x1000, 448 0x2010, 0x3030, 0x4000, 0xA000, 0xFA0E}; 449 450 final int SIZE = 10; 451 int count = 0; 452 453 char cp = 0; 454 while (cp < 0xA0) 455 { 456 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFD,0) 457 != Normalizer.YES) 458 { 459 errln("ERROR in NFD quick check at U+"+ 460 Integer.toHexString(cp)); 461 return; 462 } 463 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFC,0) 464 != Normalizer.YES) 465 { 466 errln("ERROR in NFC quick check at U+"+ 467 Integer.toHexString(cp)); 468 return; 469 } 470 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKD,0) 471 != Normalizer.YES) 472 { 473 errln("ERROR in NFKD quick check at U+" + 474 Integer.toHexString(cp)); 475 return; 476 } 477 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKC,0) 478 != Normalizer.YES) 479 { 480 errln("ERROR in NFKC quick check at U+"+ 481 Integer.toHexString(cp)); 482 return; 483 } 484 // improve the coverage 485 if (Normalizer.quickCheck(String.valueOf(cp), Normalizer.NFKC) 486 != Normalizer.YES) 487 { 488 errln("ERROR in NFKC quick check at U+"+ 489 Integer.toHexString(cp)); 490 return; 491 } 492 cp++; 493 } 494 495 for (; count < SIZE; count ++) 496 { 497 if (Normalizer.quickCheck(String.valueOf(CPNFD[count]), 498 Normalizer.NFD,0)!=Normalizer.YES) 499 { 500 errln("ERROR in NFD quick check at U+"+ 501 Integer.toHexString(CPNFD[count])); 502 return; 503 } 504 if (Normalizer.quickCheck(String.valueOf(CPNFC[count]), 505 Normalizer.NFC,0)!=Normalizer.YES) 506 { 507 errln("ERROR in NFC quick check at U+"+ 508 Integer.toHexString(CPNFC[count])); 509 return; 510 } 511 if (Normalizer.quickCheck(String.valueOf(CPNFKD[count]), 512 Normalizer.NFKD,0)!=Normalizer.YES) 513 { 514 errln("ERROR in NFKD quick check at U+"+ 515 Integer.toHexString(CPNFKD[count])); 516 return; 517 } 518 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), 519 Normalizer.NFKC,0)!=Normalizer.YES) 520 { 521 errln("ERROR in NFKC quick check at U+"+ 522 Integer.toHexString(CPNFKC[count])); 523 return; 524 } 525 // improve the coverage 526 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), 527 Normalizer.NFKC)!=Normalizer.YES) 528 { 529 errln("ERROR in NFKC quick check at U+"+ 530 Integer.toHexString(CPNFKC[count])); 531 return; 532 } 533 } 534 } TestBengali()535 public void TestBengali() throws Exception{ 536 String input = "\u09bc\u09be\u09cd\u09be"; 537 String output=Normalizer.normalize(input,Normalizer.NFC); 538 if(!input.equals(output)){ 539 errln("ERROR in NFC of string"); 540 } 541 } TestQuickCheckResultMAYBE()542 public void TestQuickCheckResultMAYBE() 543 throws Exception{ 544 545 final char[] CPNFC = {0x0306, 0x0654, 0x0BBE, 0x102E, 0x1161, 546 0x116A, 0x1173, 0x1175, 0x3099, 0x309A}; 547 final char[] CPNFKC = {0x0300, 0x0654, 0x0655, 0x09D7, 0x0B3E, 548 0x0DCF, 0xDDF, 0x102E, 0x11A8, 0x3099}; 549 550 551 final int SIZE = 10; 552 553 int count = 0; 554 555 /* NFD and NFKD does not have any MAYBE codepoints */ 556 for (; count < SIZE; count ++) 557 { 558 if (Normalizer.quickCheck(String.valueOf(CPNFC[count]), 559 Normalizer.NFC,0)!=Normalizer.MAYBE) 560 { 561 errln("ERROR in NFC quick check at U+"+ 562 Integer.toHexString(CPNFC[count])); 563 return; 564 } 565 if (Normalizer.quickCheck(String.valueOf(CPNFKC[count]), 566 Normalizer.NFKC,0)!=Normalizer.MAYBE) 567 { 568 errln("ERROR in NFKC quick check at U+"+ 569 Integer.toHexString(CPNFKC[count])); 570 return; 571 } 572 if (Normalizer.quickCheck(new char[]{CPNFC[count]}, 573 Normalizer.NFC,0)!=Normalizer.MAYBE) 574 { 575 errln("ERROR in NFC quick check at U+"+ 576 Integer.toHexString(CPNFC[count])); 577 return; 578 } 579 if (Normalizer.quickCheck(new char[]{CPNFKC[count]}, 580 Normalizer.NFKC,0)!=Normalizer.MAYBE) 581 { 582 errln("ERROR in NFKC quick check at U+"+ 583 Integer.toHexString(CPNFKC[count])); 584 return; 585 } 586 if (Normalizer.quickCheck(new char[]{CPNFKC[count]}, 587 Normalizer.NONE,0)!=Normalizer.YES) 588 { 589 errln("ERROR in NONE quick check at U+"+ 590 Integer.toHexString(CPNFKC[count])); 591 return; 592 } 593 } 594 } 595 TestQuickCheckStringResult()596 public void TestQuickCheckStringResult() 597 throws Exception{ 598 int count; 599 String d; 600 String c; 601 602 for (count = 0; count < canonTests.length; count ++) 603 { 604 d = canonTests[count][1]; 605 c = canonTests[count][2]; 606 if (Normalizer.quickCheck(d,Normalizer.NFD,0) 607 != Normalizer.YES) 608 { 609 errln("ERROR in NFD quick check for string at count " + count); 610 return; 611 } 612 613 if (Normalizer.quickCheck(c, Normalizer.NFC,0) 614 == Normalizer.NO) 615 { 616 errln("ERROR in NFC quick check for string at count " + count); 617 return; 618 } 619 } 620 621 for (count = 0; count < compatTests.length; count ++) 622 { 623 d = compatTests[count][1]; 624 c = compatTests[count][2]; 625 if (Normalizer.quickCheck(d, Normalizer.NFKD,0) 626 != Normalizer.YES) 627 { 628 errln("ERROR in NFKD quick check for string at count " + count); 629 return; 630 } 631 632 if (Normalizer.quickCheck(c, Normalizer.NFKC,0) 633 != Normalizer.YES) 634 { 635 errln("ERROR in NFKC quick check for string at count " + count); 636 return; 637 } 638 } 639 } 640 qcToInt(Normalizer.QuickCheckResult qc)641 static final int qcToInt(Normalizer.QuickCheckResult qc) { 642 if(qc==Normalizer.NO) { 643 return 0; 644 } else if(qc==Normalizer.YES) { 645 return 1; 646 } else /* Normalizer.MAYBE */ { 647 return 2; 648 } 649 } 650 TestQuickCheckPerCP()651 public void TestQuickCheckPerCP() { 652 int c, lead, trail; 653 String s, nfd; 654 int lccc1, lccc2, tccc1, tccc2; 655 int qc1, qc2; 656 657 if( 658 UCharacter.getIntPropertyMaxValue(UProperty.NFD_QUICK_CHECK)!=1 || // YES 659 UCharacter.getIntPropertyMaxValue(UProperty.NFKD_QUICK_CHECK)!=1 || 660 UCharacter.getIntPropertyMaxValue(UProperty.NFC_QUICK_CHECK)!=2 || // MAYBE 661 UCharacter.getIntPropertyMaxValue(UProperty.NFKC_QUICK_CHECK)!=2 || 662 UCharacter.getIntPropertyMaxValue(UProperty.LEAD_CANONICAL_COMBINING_CLASS)!=UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS) || 663 UCharacter.getIntPropertyMaxValue(UProperty.TRAIL_CANONICAL_COMBINING_CLASS)!=UCharacter.getIntPropertyMaxValue(UProperty.CANONICAL_COMBINING_CLASS) 664 ) { 665 errln("wrong result from one of the u_getIntPropertyMaxValue(UCHAR_NF*_QUICK_CHECK) or UCHAR_*_CANONICAL_COMBINING_CLASS"); 666 } 667 668 /* 669 * compare the quick check property values for some code points 670 * to the quick check results for checking same-code point strings 671 */ 672 c=0; 673 while(c<0x110000) { 674 s=UTF16.valueOf(c); 675 676 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFC_QUICK_CHECK); 677 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFC)); 678 if(qc1!=qc2) { 679 errln("getIntPropertyValue(NFC)="+qc1+" != "+qc2+"=quickCheck(NFC) for U+"+Integer.toHexString(c)); 680 } 681 682 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFD_QUICK_CHECK); 683 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFD)); 684 if(qc1!=qc2) { 685 errln("getIntPropertyValue(NFD)="+qc1+" != "+qc2+"=quickCheck(NFD) for U+"+Integer.toHexString(c)); 686 } 687 688 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFKC_QUICK_CHECK); 689 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFKC)); 690 if(qc1!=qc2) { 691 errln("getIntPropertyValue(NFKC)="+qc1+" != "+qc2+"=quickCheck(NFKC) for U+"+Integer.toHexString(c)); 692 } 693 694 qc1=UCharacter.getIntPropertyValue(c, UProperty.NFKD_QUICK_CHECK); 695 qc2=qcToInt(Normalizer.quickCheck(s, Normalizer.NFKD)); 696 if(qc1!=qc2) { 697 errln("getIntPropertyValue(NFKD)="+qc1+" != "+qc2+"=quickCheck(NFKD) for U+"+Integer.toHexString(c)); 698 } 699 700 nfd=Normalizer.normalize(s, Normalizer.NFD); 701 lead=UTF16.charAt(nfd, 0); 702 trail=UTF16.charAt(nfd, nfd.length()-1); 703 704 lccc1=UCharacter.getIntPropertyValue(c, UProperty.LEAD_CANONICAL_COMBINING_CLASS); 705 lccc2=UCharacter.getCombiningClass(lead); 706 tccc1=UCharacter.getIntPropertyValue(c, UProperty.TRAIL_CANONICAL_COMBINING_CLASS); 707 tccc2=UCharacter.getCombiningClass(trail); 708 709 if(lccc1!=lccc2) { 710 errln("getIntPropertyValue(lccc)="+lccc1+" != "+lccc2+"=getCombiningClass(lead) for U+"+Integer.toHexString(c)); 711 } 712 if(tccc1!=tccc2) { 713 errln("getIntPropertyValue(tccc)="+tccc1+" != "+tccc2+"=getCombiningClass(trail) for U+"+Integer.toHexString(c)); 714 } 715 716 /* skip some code points */ 717 c=(20*c)/19+1; 718 } 719 } 720 721 //------------------------------------------------------------------------ 722 // Internal utilities 723 // 724 //------------------------------------------------------------------------ 725 // Internal utilities 726 // 727 728 /* private void backAndForth(Normalizer iter, String input) 729 { 730 iter.setText(input); 731 732 // Run through the iterator forwards and stick it into a StringBuffer 733 StringBuffer forward = new StringBuffer(); 734 for (int ch = iter.first(); ch != Normalizer.DONE; ch = iter.next()) { 735 forward.append(ch); 736 } 737 738 // Now do it backwards 739 StringBuffer reverse = new StringBuffer(); 740 for (int ch = iter.last(); ch != Normalizer.DONE; ch = iter.previous()) { 741 reverse.insert(0, ch); 742 } 743 744 if (!forward.toString().equals(reverse.toString())) { 745 errln("FAIL: Forward/reverse mismatch for input " + hex(input) 746 + ", forward: " + hex(forward) + ", backward: "+hex(reverse)); 747 } else if (isVerbose()) { 748 logln("Ok: Forward/reverse for input " + hex(input) 749 + ", forward: " + hex(forward) + ", backward: "+hex(reverse)); 750 } 751 }*/ 752 backAndForth(Normalizer iter, String[][] tests)753 private void backAndForth(Normalizer iter, String[][] tests) 754 { 755 for (int i = 0; i < tests.length; i++) 756 { 757 iter.setText(tests[i][0]); 758 759 // Run through the iterator forwards and stick it into a 760 // StringBuffer 761 StringBuffer forward = new StringBuffer(); 762 for (int ch = iter.first(); ch != Normalizer.DONE; ch = iter.next()) { 763 forward.append(ch); 764 } 765 766 // Now do it backwards 767 StringBuffer reverse = new StringBuffer(); 768 for (int ch = iter.last(); ch != Normalizer.DONE; ch = iter.previous()) { 769 reverse.insert(0, ch); 770 } 771 772 if (!forward.toString().equals(reverse.toString())) { 773 errln("FAIL: Forward/reverse mismatch for input " 774 + hex(tests[i][0]) + ", forward: " + hex(forward) 775 + ", backward: " + hex(reverse)); 776 } else if (isVerbose()) { 777 logln("Ok: Forward/reverse for input " + hex(tests[i][0]) 778 + ", forward: " + hex(forward) + ", backward: " 779 + hex(reverse)); 780 } 781 } 782 } 783 staticTest(Normalizer.Mode mode, String[][] tests, int outCol)784 private void staticTest (Normalizer.Mode mode, 785 String[][] tests, int outCol) throws Exception{ 786 for (int i = 0; i < tests.length; i++) 787 { 788 String input = Utility.unescape(tests[i][0]); 789 String expect = Utility.unescape(tests[i][outCol]); 790 791 logln("Normalizing '" + input + "' (" + hex(input) + ")" ); 792 793 String output = Normalizer.normalize(input, mode); 794 795 if (!output.equals(expect)) { 796 errln("FAIL: case " + i 797 + " expected '" + expect + "' (" + hex(expect) + ")" 798 + " but got '" + output + "' (" + hex(output) + ")" ); 799 } 800 } 801 char[] output = new char[1]; 802 for (int i = 0; i < tests.length; i++) 803 { 804 char[] input = Utility.unescape(tests[i][0]).toCharArray(); 805 String expect =Utility.unescape( tests[i][outCol]); 806 807 logln("Normalizing '" + new String(input) + "' (" + 808 hex(new String(input)) + ")" ); 809 int reqLength=0; 810 while(true){ 811 try{ 812 reqLength=Normalizer.normalize(input,output, mode,0); 813 if(reqLength<=output.length ){ 814 break; 815 } 816 }catch(IndexOutOfBoundsException e){ 817 output= new char[Integer.parseInt(e.getMessage())]; 818 continue; 819 } 820 } 821 if (!expect.equals(new String(output,0,reqLength))) { 822 errln("FAIL: case " + i 823 + " expected '" + expect + "' (" + hex(expect) + ")" 824 + " but got '" + new String(output) 825 + "' (" + hex(new String(output)) + ")" ); 826 } 827 } 828 } decomposeTest(Normalizer.Mode mode, String[][] tests, int outCol)829 private void decomposeTest(Normalizer.Mode mode, 830 String[][] tests, int outCol) throws Exception{ 831 for (int i = 0; i < tests.length; i++) 832 { 833 String input = Utility.unescape(tests[i][0]); 834 String expect = Utility.unescape(tests[i][outCol]); 835 836 logln("Normalizing '" + input + "' (" + hex(input) + ")" ); 837 838 String output = Normalizer.decompose(input, mode==Normalizer.NFKD); 839 840 if (!output.equals(expect)) { 841 errln("FAIL: case " + i 842 + " expected '" + expect + "' (" + hex(expect) + ")" 843 + " but got '" + output + "' (" + hex(output) + ")" ); 844 } 845 } 846 char[] output = new char[1]; 847 for (int i = 0; i < tests.length; i++) 848 { 849 char[] input = Utility.unescape(tests[i][0]).toCharArray(); 850 String expect = Utility.unescape(tests[i][outCol]); 851 852 logln("Normalizing '" + new String(input) + "' (" + 853 hex(new String(input)) + ")" ); 854 int reqLength=0; 855 while(true){ 856 try{ 857 reqLength=Normalizer.decompose(input,output, mode==Normalizer.NFKD,0); 858 if(reqLength<=output.length ){ 859 break; 860 } 861 }catch(IndexOutOfBoundsException e){ 862 output= new char[Integer.parseInt(e.getMessage())]; 863 continue; 864 } 865 } 866 if (!expect.equals(new String(output,0,reqLength))) { 867 errln("FAIL: case " + i 868 + " expected '" + expect + "' (" + hex(expect) + ")" 869 + " but got '" + new String(output) 870 + "' (" + hex(new String(output)) + ")" ); 871 } 872 } 873 output = new char[1]; 874 for (int i = 0; i < tests.length; i++) 875 { 876 char[] input = Utility.unescape(tests[i][0]).toCharArray(); 877 String expect = Utility.unescape(tests[i][outCol]); 878 879 logln("Normalizing '" + new String(input) + "' (" + 880 hex(new String(input)) + ")" ); 881 int reqLength=0; 882 while(true){ 883 try{ 884 reqLength=Normalizer.decompose(input,0,input.length,output,0,output.length, mode==Normalizer.NFKD,0); 885 if(reqLength<=output.length ){ 886 break; 887 } 888 }catch(IndexOutOfBoundsException e){ 889 output= new char[Integer.parseInt(e.getMessage())]; 890 continue; 891 } 892 } 893 if (!expect.equals(new String(output,0,reqLength))) { 894 errln("FAIL: case " + i 895 + " expected '" + expect + "' (" + hex(expect) + ")" 896 + " but got '" + new String(output) 897 + "' (" + hex(new String(output)) + ")" ); 898 } 899 char[] output2 = new char[reqLength * 2]; 900 System.arraycopy(output, 0, output2, 0, reqLength); 901 int retLength = Normalizer.decompose(input,0,input.length, output2, reqLength, output2.length, mode==Normalizer.NFKC,0); 902 if(retLength != reqLength){ 903 logln("FAIL: Normalizer.compose did not return the expected length. Expected: " +reqLength + " Got: " + retLength); 904 } 905 } 906 } 907 composeTest(Normalizer.Mode mode, String[][] tests, int outCol)908 private void composeTest(Normalizer.Mode mode, 909 String[][] tests, int outCol) throws Exception{ 910 for (int i = 0; i < tests.length; i++) 911 { 912 String input = Utility.unescape(tests[i][0]); 913 String expect = Utility.unescape(tests[i][outCol]); 914 915 logln("Normalizing '" + input + "' (" + hex(input) + ")" ); 916 917 String output = Normalizer.compose(input, mode==Normalizer.NFKC); 918 919 if (!output.equals(expect)) { 920 errln("FAIL: case " + i 921 + " expected '" + expect + "' (" + hex(expect) + ")" 922 + " but got '" + output + "' (" + hex(output) + ")" ); 923 } 924 } 925 char[] output = new char[1]; 926 for (int i = 0; i < tests.length; i++) 927 { 928 char[] input = Utility.unescape(tests[i][0]).toCharArray(); 929 String expect = Utility.unescape(tests[i][outCol]); 930 931 logln("Normalizing '" + new String(input) + "' (" + 932 hex(new String(input)) + ")" ); 933 int reqLength=0; 934 while(true){ 935 try{ 936 reqLength=Normalizer.compose(input,output, mode==Normalizer.NFKC,0); 937 if(reqLength<=output.length ){ 938 break; 939 } 940 }catch(IndexOutOfBoundsException e){ 941 output= new char[Integer.parseInt(e.getMessage())]; 942 continue; 943 } 944 } 945 if (!expect.equals(new String(output,0,reqLength))) { 946 errln("FAIL: case " + i 947 + " expected '" + expect + "' (" + hex(expect) + ")" 948 + " but got '" + new String(output) 949 + "' (" + hex(new String(output)) + ")" ); 950 } 951 } 952 output = new char[1]; 953 for (int i = 0; i < tests.length; i++) 954 { 955 char[] input = Utility.unescape(tests[i][0]).toCharArray(); 956 String expect = Utility.unescape(tests[i][outCol]); 957 958 logln("Normalizing '" + new String(input) + "' (" + 959 hex(new String(input)) + ")" ); 960 int reqLength=0; 961 while(true){ 962 try{ 963 reqLength=Normalizer.compose(input,0,input.length, output, 0, output.length, mode==Normalizer.NFKC,0); 964 if(reqLength<=output.length ){ 965 break; 966 } 967 }catch(IndexOutOfBoundsException e){ 968 output= new char[Integer.parseInt(e.getMessage())]; 969 continue; 970 } 971 } 972 if (!expect.equals(new String(output,0,reqLength))) { 973 errln("FAIL: case " + i 974 + " expected '" + expect + "' (" + hex(expect) + ")" 975 + " but got '" + new String(output) 976 + "' (" + hex(new String(output)) + ")" ); 977 } 978 979 char[] output2 = new char[reqLength * 2]; 980 System.arraycopy(output, 0, output2, 0, reqLength); 981 int retLength = Normalizer.compose(input,0,input.length, output2, reqLength, output2.length, mode==Normalizer.NFKC,0); 982 if(retLength != reqLength){ 983 logln("FAIL: Normalizer.compose did not return the expected length. Expected: " +reqLength + " Got: " + retLength); 984 } 985 } 986 } iterateTest(Normalizer iter, String[][] tests, int outCol)987 private void iterateTest(Normalizer iter, String[][] tests, int outCol){ 988 for (int i = 0; i < tests.length; i++) 989 { 990 String input = Utility.unescape(tests[i][0]); 991 String expect = Utility.unescape(tests[i][outCol]); 992 993 logln("Normalizing '" + input + "' (" + hex(input) + ")" ); 994 995 iter.setText(input); 996 assertEqual(expect, iter, "case " + i + " "); 997 } 998 } 999 assertEqual(String expected, Normalizer iter, String msg)1000 private void assertEqual(String expected, Normalizer iter, String msg) 1001 { 1002 int index = 0; 1003 int ch; 1004 UCharacterIterator cIter = UCharacterIterator.getInstance(expected); 1005 1006 while ((ch=iter.next())!= Normalizer.DONE){ 1007 if (index >= expected.length()) { 1008 errln("FAIL: " + msg + "Unexpected character '" + (char)ch 1009 + "' (" + hex(ch) + ")" 1010 + " at index " + index); 1011 break; 1012 } 1013 int want = UTF16.charAt(expected,index); 1014 if (ch != want) { 1015 errln("FAIL: " + msg + "got '" + (char)ch 1016 + "' (" + hex(ch) + ")" 1017 + " but expected '" + want + "' (" + hex(want)+ ")" 1018 + " at index " + index); 1019 } 1020 index+= UTF16.getCharCount(ch); 1021 } 1022 if (index < expected.length()) { 1023 errln("FAIL: " + msg + "Only got " + index + " chars, expected " 1024 + expected.length()); 1025 } 1026 1027 cIter.setToLimit(); 1028 while((ch=iter.previous())!=Normalizer.DONE){ 1029 int want = cIter.previousCodePoint(); 1030 if (ch != want ) { 1031 errln("FAIL: " + msg + "got '" + (char)ch 1032 + "' (" + hex(ch) + ")" 1033 + " but expected '" + want + "' (" + hex(want) + ")" 1034 + " at index " + index); 1035 } 1036 } 1037 } 1038 //-------------------------------------------------------------------------- 1039 1040 // NOTE: These tests are used for quick debugging so are not ported 1041 // to ICU4C tsnorm.cpp in intltest 1042 // 1043 TestDebugStatic()1044 public void TestDebugStatic(){ 1045 String in = Utility.unescape("\\U0001D157\\U0001D165"); 1046 if(!Normalizer.isNormalized(in,Normalizer.NFC,0)){ 1047 errln("isNormalized failed"); 1048 } 1049 1050 String input = "\uAD8B\uAD8B\uAD8B\uAD8B"+ 1051 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1052 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1053 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1054 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1055 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1056 "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+ 1057 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+ 1058 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+ 1059 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+ 1060 "\uAD8B\uAD8B\uAD8B\uAD8B"+ 1061 "d\u031B\u0307\u0323"; 1062 String expect = "\u1100\u116F\u11AA\u1100\u116F\u11AA\u1100\u116F"+ 1063 "\u11AA\u1100\u116F\u11AA\uD834\uDD57\uD834\uDD65"+ 1064 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1065 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1066 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1067 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1068 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1069 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1070 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1071 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1072 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1073 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1074 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1075 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1076 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1077 "\uD834\uDD57\uD834\uDD65\uD834\uDD57\uD834\uDD65"+ 1078 "\uD834\uDD57\uD834\uDD65aaaaaaaaaaaaaaaaaazzzzzz"+ 1079 "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+ 1080 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+ 1081 "bbbbbbbbbbbbbbbbbbbbbbbbccccccccccccccccccccccccccccc"+ 1082 "cccccccccccccccccccccccccccccccccccccccccccccccc"+ 1083 "ddddddddddddddddddddddddddddddddddddddddddddddddddddd"+ 1084 "dddddddddddddddddddddddd"+ 1085 "\u1100\u116F\u11AA\u1100\u116F\u11AA\u1100\u116F"+ 1086 "\u11AA\u1100\u116F\u11AA\u0064\u031B\u0323\u0307"; 1087 String output = Normalizer.normalize(Utility.unescape(input), 1088 Normalizer.NFD); 1089 if(!expect.equals(output)){ 1090 errln("FAIL expected: "+hex(expect) + " got: "+hex(output)); 1091 } 1092 1093 1094 1095 } TestDebugIter()1096 public void TestDebugIter(){ 1097 String src = Utility.unescape("\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e"); 1098 String expected = Utility.unescape("\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e"); 1099 Normalizer iter = new Normalizer(new StringCharacterIterator(Utility.unescape(src)), 1100 Normalizer.NONE,0); 1101 int index = 0; 1102 int ch; 1103 UCharacterIterator cIter = UCharacterIterator.getInstance(expected); 1104 1105 while ((ch=iter.next())!= Normalizer.DONE){ 1106 if (index >= expected.length()) { 1107 errln("FAIL: " + "Unexpected character '" + (char)ch 1108 + "' (" + hex(ch) + ")" 1109 + " at index " + index); 1110 break; 1111 } 1112 int want = UTF16.charAt(expected,index); 1113 if (ch != want) { 1114 errln("FAIL: " + "got '" + (char)ch 1115 + "' (" + hex(ch) + ")" 1116 + " but expected '" + want + "' (" + hex(want)+ ")" 1117 + " at index " + index); 1118 } 1119 index+= UTF16.getCharCount(ch); 1120 } 1121 if (index < expected.length()) { 1122 errln("FAIL: " + "Only got " + index + " chars, expected " 1123 + expected.length()); 1124 } 1125 1126 cIter.setToLimit(); 1127 while((ch=iter.previous())!=Normalizer.DONE){ 1128 int want = cIter.previousCodePoint(); 1129 if (ch != want ) { 1130 errln("FAIL: " + "got '" + (char)ch 1131 + "' (" + hex(ch) + ")" 1132 + " but expected '" + want + "' (" + hex(want) + ")" 1133 + " at index " + index); 1134 } 1135 } 1136 } TestDebugIterOld()1137 public void TestDebugIterOld(){ 1138 String input = "\\U0001D15E"; 1139 String expected = "\uD834\uDD57\uD834\uDD65"; 1140 String expectedReverse = "\uD834\uDD65\uD834\uDD57"; 1141 int index = 0; 1142 int ch; 1143 Normalizer iter = new Normalizer(new StringCharacterIterator(Utility.unescape(input)), 1144 Normalizer.NFKC,0); 1145 StringBuffer got = new StringBuffer(); 1146 for (ch = iter.first();ch!=Normalizer.DONE;ch=iter.next()) 1147 { 1148 if (index >= expected.length()) { 1149 errln("FAIL: " + "Unexpected character '" + (char)ch + 1150 "' (" + hex(ch) + ")" + " at index " + index); 1151 break; 1152 } 1153 got.append(UCharacter.toString(ch)); 1154 index++; 1155 } 1156 if (!expected.equals(got.toString())) { 1157 errln("FAIL: " + "got '" +got+ "' (" + hex(got) + ")" 1158 + " but expected '" + expected + "' (" 1159 + hex(expected) + ")"); 1160 } 1161 if (got.length() < expected.length()) { 1162 errln("FAIL: " + "Only got " + index + " chars, expected " 1163 + expected.length()); 1164 } 1165 1166 logln("Reverse Iteration\n"); 1167 iter.setIndexOnly(iter.endIndex()); 1168 got.setLength(0); 1169 for(ch=iter.previous();ch!=Normalizer.DONE;ch=iter.previous()){ 1170 if (index >= expected.length()) { 1171 errln("FAIL: " + "Unexpected character '" + (char)ch 1172 + "' (" + hex(ch) + ")" + " at index " + index); 1173 break; 1174 } 1175 got.append(UCharacter.toString(ch)); 1176 } 1177 if (!expectedReverse.equals(got.toString())) { 1178 errln("FAIL: " + "got '" +got+ "' (" + hex(got) + ")" 1179 + " but expected '" + expected 1180 + "' (" + hex(expected) + ")"); 1181 } 1182 if (got.length() < expected.length()) { 1183 errln("FAIL: " + "Only got " + index + " chars, expected " 1184 + expected.length()); 1185 } 1186 1187 } 1188 //-------------------------------------------------------------------------- 1189 // helper class for TestPreviousNext() 1190 // simple UTF-32 character iterator 1191 class UCharIterator { 1192 UCharIterator(int[] src, int len, int index)1193 public UCharIterator(int[] src, int len, int index){ 1194 1195 s=src; 1196 length=len; 1197 i=index; 1198 } 1199 current()1200 public int current() { 1201 if(i<length) { 1202 return s[i]; 1203 } else { 1204 return -1; 1205 } 1206 } 1207 next()1208 public int next() { 1209 if(i<length) { 1210 return s[i++]; 1211 } else { 1212 return -1; 1213 } 1214 } 1215 previous()1216 public int previous() { 1217 if(i>0) { 1218 return s[--i]; 1219 } else { 1220 return -1; 1221 } 1222 } 1223 getIndex()1224 public int getIndex() { 1225 return i; 1226 } 1227 1228 private int[] s; 1229 private int length, i; 1230 } TestPreviousNext()1231 public void TestPreviousNext() { 1232 // src and expect strings 1233 char src[]={ 1234 UTF16.getLeadSurrogate(0x2f999), UTF16.getTrailSurrogate(0x2f999), 1235 UTF16.getLeadSurrogate(0x1d15f), UTF16.getTrailSurrogate(0x1d15f), 1236 0xc4, 1237 0x1ed0 1238 }; 1239 int expect[]={ 1240 0x831d, 1241 0x1d158, 0x1d165, 1242 0x41, 0x308, 1243 0x4f, 0x302, 0x301 1244 }; 1245 1246 // expected src indexes corresponding to expect indexes 1247 int expectIndex[]={ 1248 0, 1249 2, 2, 1250 4, 4, 1251 5, 5, 5, 1252 6 // behind last character 1253 }; 1254 1255 // initial indexes into the src and expect strings 1256 1257 final int SRC_MIDDLE=4; 1258 final int EXPECT_MIDDLE=3; 1259 1260 1261 // movement vector 1262 // - for previous(), 0 for current(), + for next() 1263 // not const so that we can terminate it below for the error message 1264 String moves="0+0+0--0-0-+++0--+++++++0--------"; 1265 1266 // iterators 1267 Normalizer iter = new Normalizer(new String(src), 1268 Normalizer.NFD,0); 1269 UCharIterator iter32 = new UCharIterator(expect, expect.length, 1270 EXPECT_MIDDLE); 1271 1272 int c1, c2; 1273 char m; 1274 1275 // initially set the indexes into the middle of the strings 1276 iter.setIndexOnly(SRC_MIDDLE); 1277 1278 // move around and compare the iteration code points with 1279 // the expected ones 1280 int movesIndex =0; 1281 while(movesIndex<moves.length()) { 1282 m=moves.charAt(movesIndex++); 1283 if(m=='-') { 1284 c1=iter.previous(); 1285 c2=iter32.previous(); 1286 } else if(m=='0') { 1287 c1=iter.current(); 1288 c2=iter32.current(); 1289 } else /* m=='+' */ { 1290 c1=iter.next(); 1291 c2=iter32.next(); 1292 } 1293 1294 // compare results 1295 if(c1!=c2) { 1296 // copy the moves until the current (m) move, and terminate 1297 String history = moves.substring(0,movesIndex); 1298 errln("error: mismatch in Normalizer iteration at "+history+": " 1299 +"got c1= " + hex(c1) +" != expected c2= "+ hex(c2)); 1300 break; 1301 } 1302 1303 // compare indexes 1304 if(iter.getIndex()!=expectIndex[iter32.getIndex()]) { 1305 // copy the moves until the current (m) move, and terminate 1306 String history = moves.substring(0,movesIndex); 1307 errln("error: index mismatch in Normalizer iteration at " 1308 +history+ " : "+ "Normalizer index " +iter.getIndex() 1309 +" expected "+ expectIndex[iter32.getIndex()]); 1310 break; 1311 } 1312 } 1313 } 1314 // Only in ICU4j TestPreviousNextJCI()1315 public void TestPreviousNextJCI() { 1316 // src and expect strings 1317 char src[]={ 1318 UTF16.getLeadSurrogate(0x2f999), UTF16.getTrailSurrogate(0x2f999), 1319 UTF16.getLeadSurrogate(0x1d15f), UTF16.getTrailSurrogate(0x1d15f), 1320 0xc4, 1321 0x1ed0 1322 }; 1323 int expect[]={ 1324 0x831d, 1325 0x1d158, 0x1d165, 1326 0x41, 0x308, 1327 0x4f, 0x302, 0x301 1328 }; 1329 1330 // expected src indexes corresponding to expect indexes 1331 int expectIndex[]={ 1332 0, 1333 2, 2, 1334 4, 4, 1335 5, 5, 5, 1336 6 // behind last character 1337 }; 1338 1339 // initial indexes into the src and expect strings 1340 1341 final int SRC_MIDDLE=4; 1342 final int EXPECT_MIDDLE=3; 1343 1344 1345 // movement vector 1346 // - for previous(), 0 for current(), + for next() 1347 // not const so that we can terminate it below for the error message 1348 String moves="0+0+0--0-0-+++0--+++++++0--------"; 1349 1350 // iterators 1351 StringCharacterIterator text = new StringCharacterIterator(new String(src)); 1352 Normalizer iter = new Normalizer(text,Normalizer.NFD,0); 1353 UCharIterator iter32 = new UCharIterator(expect, expect.length, 1354 EXPECT_MIDDLE); 1355 1356 int c1, c2; 1357 char m; 1358 1359 // initially set the indexes into the middle of the strings 1360 iter.setIndexOnly(SRC_MIDDLE); 1361 1362 // move around and compare the iteration code points with 1363 // the expected ones 1364 int movesIndex =0; 1365 while(movesIndex<moves.length()) { 1366 m=moves.charAt(movesIndex++); 1367 if(m=='-') { 1368 c1=iter.previous(); 1369 c2=iter32.previous(); 1370 } else if(m=='0') { 1371 c1=iter.current(); 1372 c2=iter32.current(); 1373 } else /* m=='+' */ { 1374 c1=iter.next(); 1375 c2=iter32.next(); 1376 } 1377 1378 // compare results 1379 if(c1!=c2) { 1380 // copy the moves until the current (m) move, and terminate 1381 String history = moves.substring(0,movesIndex); 1382 errln("error: mismatch in Normalizer iteration at "+history+": " 1383 +"got c1= " + hex(c1) +" != expected c2= "+ hex(c2)); 1384 break; 1385 } 1386 1387 // compare indexes 1388 if(iter.getIndex()!=expectIndex[iter32.getIndex()]) { 1389 // copy the moves until the current (m) move, and terminate 1390 String history = moves.substring(0,movesIndex); 1391 errln("error: index mismatch in Normalizer iteration at " 1392 +history+ " : "+ "Normalizer index " +iter.getIndex() 1393 +" expected "+ expectIndex[iter32.getIndex()]); 1394 break; 1395 } 1396 } 1397 } 1398 1399 // test APIs that are not otherwise used - improve test coverage TestNormalizerAPI()1400 public void TestNormalizerAPI() throws Exception { 1401 try{ 1402 // instantiate a Normalizer from a CharacterIterator 1403 String s=Utility.unescape("a\u0308\uac00\\U0002f800"); 1404 // make s a bit longer and more interesting 1405 UCharacterIterator iter = UCharacterIterator.getInstance(s+s); 1406 Normalizer norm = new Normalizer(iter, Normalizer.NFC,0); 1407 if(norm.next()!=0xe4) { 1408 errln("error in Normalizer(CharacterIterator).next()"); 1409 } 1410 1411 // test clone(), ==, and hashCode() 1412 Normalizer clone=(Normalizer)norm.clone(); 1413 if(clone.equals(norm)) { 1414 errln("error in Normalizer(Normalizer(CharacterIterator)).clone()!=norm"); 1415 } 1416 1417 1418 if(clone.getLength()!= norm.getLength()){ 1419 errln("error in Normalizer.getBeginIndex()"); 1420 } 1421 // clone must have the same hashCode() 1422 //if(clone.hashCode()!=norm.hashCode()) { 1423 // errln("error in Normalizer(Normalizer(CharacterIterator)).clone().hashCode()!=copy.hashCode()"); 1424 //} 1425 if(clone.next()!=0xac00) { 1426 errln("error in Normalizer(Normalizer(CharacterIterator)).next()"); 1427 } 1428 int ch = clone.next(); 1429 if(ch!=0x4e3d) { 1430 errln("error in Normalizer(Normalizer(CharacterIterator)).clone().next()"); 1431 } 1432 // position changed, must change hashCode() 1433 if(clone.hashCode()==norm.hashCode()) { 1434 errln("error in Normalizer(Normalizer(CharacterIterator)).clone().next().hashCode()==copy.hashCode()"); 1435 } 1436 1437 // test compose() and decompose() 1438 StringBuffer tel; 1439 String nfkc, nfkd; 1440 tel=new StringBuffer("\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121\u2121"); 1441 tel.insert(1,(char)0x0301); 1442 1443 nfkc=Normalizer.compose(tel.toString(), true); 1444 nfkd=Normalizer.decompose(tel.toString(), true); 1445 if( 1446 !nfkc.equals(Utility.unescape("TE\u0139TELTELTELTELTELTELTELTELTEL"))|| 1447 !nfkd.equals(Utility.unescape("TEL\u0301TELTELTELTELTELTELTELTELTEL")) 1448 ) { 1449 errln("error in Normalizer::(de)compose(): wrong result(s)"); 1450 } 1451 1452 // test setIndex() 1453 // ch=norm.setIndex(3); 1454 // if(ch!=0x4e3d) { 1455 // errln("error in Normalizer(CharacterIterator).setIndex(3)"); 1456 // } 1457 1458 // test setText(CharacterIterator) and getText() 1459 String out, out2; 1460 clone.setText(iter); 1461 1462 out = clone.getText(); 1463 out2 = iter.getText(); 1464 if( !out.equals(out2) || 1465 clone.startIndex()!=0|| 1466 clone.endIndex()!=iter.getLength() 1467 ) { 1468 errln("error in Normalizer::setText() or Normalizer::getText()"); 1469 } 1470 1471 char[] fillIn1 = new char[clone.getLength()]; 1472 char[] fillIn2 = new char[iter.getLength()]; 1473 int len = clone.getText(fillIn1); 1474 iter.getText(fillIn2,0); 1475 if(!Utility.arrayRegionMatches(fillIn1,0,fillIn2,0,len)){ 1476 errln("error in Normalizer.getText(). Normalizer: "+ 1477 Utility.hex(new String(fillIn1))+ 1478 " Iter: " + Utility.hex(new String(fillIn2))); 1479 } 1480 1481 clone.setText(fillIn1); 1482 len = clone.getText(fillIn2); 1483 if(!Utility.arrayRegionMatches(fillIn1,0,fillIn2,0,len)){ 1484 errln("error in Normalizer.setText() or Normalizer.getText()"+ 1485 Utility.hex(new String(fillIn1))+ 1486 " Iter: " + Utility.hex(new String(fillIn2))); 1487 } 1488 1489 // test setText(UChar *), getUMode() and setMode() 1490 clone.setText(s); 1491 clone.setIndexOnly(1); 1492 clone.setMode(Normalizer.NFD); 1493 if(clone.getMode()!=Normalizer.NFD) { 1494 errln("error in Normalizer::setMode() or Normalizer::getMode()"); 1495 } 1496 if(clone.next()!=0x308 || clone.next()!=0x1100) { 1497 errln("error in Normalizer::setText() or Normalizer::setMode()"); 1498 } 1499 1500 // test last()/previous() with an internal buffer overflow 1501 StringBuffer buf = new StringBuffer("aaaaaaaaaa"); 1502 buf.setCharAt(10-1,'\u0308'); 1503 clone.setText(buf); 1504 if(clone.last()!=0x308) { 1505 errln("error in Normalizer(10*U+0308).last()"); 1506 } 1507 1508 // test UNORM_NONE 1509 norm.setMode(Normalizer.NONE); 1510 if(norm.first()!=0x61 || norm.next()!=0x308 || norm.last()!=0x2f800) { 1511 errln("error in Normalizer(UNORM_NONE).first()/next()/last()"); 1512 } 1513 out=Normalizer.normalize(s, Normalizer.NONE); 1514 if(!out.equals(s)) { 1515 errln("error in Normalizer::normalize(UNORM_NONE)"); 1516 } 1517 ch = 0x1D15E; 1518 String exp = "\\U0001D157\\U0001D165"; 1519 String ns = Normalizer.normalize(ch,Normalizer.NFC); 1520 if(!ns.equals(Utility.unescape(exp))){ 1521 errln("error in Normalizer.normalize(int,Mode)"); 1522 } 1523 ns = Normalizer.normalize(ch,Normalizer.NFC,0); 1524 if(!ns.equals(Utility.unescape(exp))){ 1525 errln("error in Normalizer.normalize(int,Mode,int)"); 1526 } 1527 1528 1529 }catch(Exception e){ 1530 throw e; 1531 } 1532 } 1533 TestConcatenate()1534 public void TestConcatenate() { 1535 1536 Object[][]cases=new Object[][]{ 1537 /* mode, left, right, result */ 1538 { 1539 Normalizer.NFC, 1540 "re", 1541 "\u0301sum\u00e9", 1542 "r\u00e9sum\u00e9" 1543 }, 1544 { 1545 Normalizer.NFC, 1546 "a\u1100", 1547 "\u1161bcdefghijk", 1548 "a\uac00bcdefghijk" 1549 }, 1550 /* ### TODO: add more interesting cases */ 1551 { 1552 Normalizer.NFD, 1553 "\u03B1\u0345", 1554 "\u0C4D\uD804\uDCBA\uD834\uDD69", // 0C4D 110BA 1D169 1555 "\u03B1\uD834\uDD69\uD804\uDCBA\u0C4D\u0345" // 03B1 1D169 110BA 0C4D 0345 1556 } 1557 }; 1558 1559 String left, right, expect, result; 1560 Normalizer.Mode mode; 1561 int i; 1562 1563 /* test concatenation */ 1564 for(i=0; i<cases.length; ++i) { 1565 mode = (Normalizer.Mode)cases[i][0]; 1566 1567 left=(String)cases[i][1]; 1568 right=(String)cases[i][2]; 1569 expect=(String)cases[i][3]; 1570 { 1571 result=Normalizer.concatenate(left, right, mode,0); 1572 if(!result.equals(expect)) { 1573 errln("error in Normalizer.concatenate(), cases[] failed" 1574 +", result==expect: expected: " 1575 + hex(expect)+" =========> got: " + hex(result)); 1576 } 1577 } 1578 { 1579 result=Normalizer.concatenate(left.toCharArray(), right.toCharArray(), mode,0); 1580 if(!result.equals(expect)) { 1581 errln("error in Normalizer.concatenate(), cases[] failed" 1582 +", result==expect: expected: " 1583 + hex(expect)+" =========> got: " + hex(result)); 1584 } 1585 } 1586 } 1587 } 1588 private final int RAND_MAX = 0x7fff; 1589 TestCheckFCD()1590 public void TestCheckFCD() 1591 { 1592 char[] FAST = {0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 1593 0x0008, 0x0009, 0x000A}; 1594 1595 char[] FALSE = {0x0001, 0x0002, 0x02EA, 0x03EB, 0x0300, 0x0301, 1596 0x02B9, 0x0314, 0x0315, 0x0316}; 1597 1598 char[] TRUE = {0x0030, 0x0040, 0x0440, 0x056D, 0x064F, 0x06E7, 1599 0x0050, 0x0730, 0x09EE, 0x1E10}; 1600 1601 char[][] datastr= { {0x0061, 0x030A, 0x1E05, 0x0302, 0}, 1602 {0x0061, 0x030A, 0x00E2, 0x0323, 0}, 1603 {0x0061, 0x0323, 0x00E2, 0x0323, 0}, 1604 {0x0061, 0x0323, 0x1E05, 0x0302, 0} 1605 }; 1606 Normalizer.QuickCheckResult result[] = {Normalizer.YES, Normalizer.NO, Normalizer.NO, Normalizer.YES}; 1607 1608 char[] datachar= { 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 1609 0x6a, 1610 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 1611 0xea, 1612 0x0300, 0x0301, 0x0302, 0x0303, 0x0304, 0x0305, 0x0306, 1613 0x0307, 0x0308, 0x0309, 0x030a, 1614 0x0320, 0x0321, 0x0322, 0x0323, 0x0324, 0x0325, 0x0326, 1615 0x0327, 0x0328, 0x0329, 0x032a, 1616 0x1e00, 0x1e01, 0x1e02, 0x1e03, 0x1e04, 0x1e05, 0x1e06, 1617 0x1e07, 0x1e08, 0x1e09, 0x1e0a 1618 }; 1619 1620 int count = 0; 1621 1622 if (Normalizer.quickCheck(FAST,0,FAST.length, Normalizer.FCD,0) != Normalizer.YES) 1623 errln("Normalizer.quickCheck(FCD) failed: expected value for fast Normalizer.quickCheck is Normalizer.YES\n"); 1624 if (Normalizer.quickCheck(FALSE,0, FALSE.length,Normalizer.FCD,0) != Normalizer.NO) 1625 errln("Normalizer.quickCheck(FCD) failed: expected value for error Normalizer.quickCheck is Normalizer.NO\n"); 1626 if (Normalizer.quickCheck(TRUE,0,TRUE.length,Normalizer.FCD,0) != Normalizer.YES) 1627 errln("Normalizer.quickCheck(FCD) failed: expected value for correct Normalizer.quickCheck is Normalizer.YES\n"); 1628 1629 1630 while (count < 4) 1631 { 1632 Normalizer.QuickCheckResult fcdresult = Normalizer.quickCheck(datastr[count],0,datastr[count].length, Normalizer.FCD,0); 1633 if (result[count] != fcdresult) { 1634 errln("Normalizer.quickCheck(FCD) failed: Data set "+ count 1635 + " expected value "+ result[count]); 1636 } 1637 count ++; 1638 } 1639 1640 /* random checks of long strings */ 1641 //srand((unsigned)time( NULL )); 1642 Random rand = createRandom(); // use test framework's random 1643 1644 for (count = 0; count < 50; count ++) 1645 { 1646 int size = 0; 1647 Normalizer.QuickCheckResult testresult = Normalizer.YES; 1648 char[] data= new char[20]; 1649 char[] norm= new char[100]; 1650 char[] nfd = new char[100]; 1651 int normStart = 0; 1652 int nfdsize = 0; 1653 while (size != 19) { 1654 data[size] = datachar[rand.nextInt(RAND_MAX)*50/RAND_MAX]; 1655 logln("0x"+data[size]); 1656 normStart += Normalizer.normalize(data,size,size+1, 1657 norm,normStart,100, 1658 Normalizer.NFD,0); 1659 size ++; 1660 } 1661 logln("\n"); 1662 1663 nfdsize = Normalizer.normalize(data,0,size, nfd,0,nfd.length,Normalizer.NFD,0); 1664 // nfdsize = unorm_normalize(data, size, UNORM_NFD, UCOL_IGNORE_HANGUL, 1665 // nfd, 100, &status); 1666 if (nfdsize != normStart || Utility.arrayRegionMatches(nfd,0, norm,0,nfdsize) ==false) { 1667 testresult = Normalizer.NO; 1668 } 1669 if (testresult == Normalizer.YES) { 1670 logln("result Normalizer.YES\n"); 1671 } 1672 else { 1673 logln("result Normalizer.NO\n"); 1674 } 1675 1676 if (Normalizer.quickCheck(data,0,data.length, Normalizer.FCD,0) != testresult) { 1677 errln("Normalizer.quickCheck(FCD) failed: expected "+ testresult +" for random data: "+hex(new String(data)) ); 1678 } 1679 } 1680 } 1681 1682 1683 // reference implementation of Normalizer::compare ref_norm_compare(String s1, String s2, int options)1684 private int ref_norm_compare(String s1, String s2, int options) { 1685 String t1, t2,r1,r2; 1686 1687 int normOptions=(int)(options>>Normalizer.COMPARE_NORM_OPTIONS_SHIFT); 1688 1689 if((options&Normalizer.COMPARE_IGNORE_CASE)!=0) { 1690 // NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y))) 1691 r1 = Normalizer.decompose(s1,false,normOptions); 1692 r2 = Normalizer.decompose(s2,false,normOptions); 1693 r1 = UCharacter.foldCase(r1,options); 1694 r2 = UCharacter.foldCase(r2,options); 1695 }else{ 1696 r1 = s1; 1697 r2 = s2; 1698 } 1699 1700 t1 = Normalizer.decompose(r1, false, normOptions); 1701 t2 = Normalizer.decompose(r2, false, normOptions); 1702 1703 if((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0) { 1704 UTF16.StringComparator comp 1705 = new UTF16.StringComparator(true, false, 1706 UTF16.StringComparator.FOLD_CASE_DEFAULT); 1707 return comp.compare(t1,t2); 1708 } else { 1709 return t1.compareTo(t2); 1710 } 1711 1712 } 1713 1714 // test wrapper for Normalizer::compare, sets UNORM_INPUT_IS_FCD appropriately norm_compare(String s1, String s2, int options)1715 private int norm_compare(String s1, String s2, int options) { 1716 int normOptions=(int)(options>>Normalizer.COMPARE_NORM_OPTIONS_SHIFT); 1717 1718 if( Normalizer.YES==Normalizer.quickCheck(s1,Normalizer.FCD,normOptions) && 1719 Normalizer.YES==Normalizer.quickCheck(s2,Normalizer.FCD,normOptions)) { 1720 options|=Normalizer.INPUT_IS_FCD; 1721 } 1722 1723 return Normalizer.compare(s1, s2, options); 1724 } 1725 1726 // reference implementation of UnicodeString::caseCompare ref_case_compare(String s1, String s2, int options)1727 private int ref_case_compare(String s1, String s2, int options) { 1728 String t1, t2; 1729 1730 t1=s1; 1731 t2=s2; 1732 1733 t1 = UCharacter.foldCase(t1,((options&Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I)==0)); 1734 t2 = UCharacter.foldCase(t2,((options&Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I)==0)); 1735 1736 if((options&Normalizer.COMPARE_CODE_POINT_ORDER)!=0) { 1737 UTF16.StringComparator comp 1738 = new UTF16.StringComparator(true, false, 1739 UTF16.StringComparator.FOLD_CASE_DEFAULT); 1740 return comp.compare(t1,t2); 1741 } else { 1742 return t1.compareTo(t2); 1743 } 1744 1745 } 1746 1747 // reduce an integer to -1/0/1 sign(int value)1748 private static int sign(int value) { 1749 if(value==0) { 1750 return 0; 1751 } else { 1752 return (value>>31)|1; 1753 } 1754 } signString(int value)1755 private static String signString(int value) { 1756 if(value<0) { 1757 return "<0"; 1758 } else if(value==0) { 1759 return "=0"; 1760 } else /* value>0 */ { 1761 return ">0"; 1762 } 1763 } 1764 // test Normalizer::compare and unorm_compare (thinly wrapped by the former) 1765 // by comparing it with its semantic equivalent 1766 // since we trust the pieces, this is sufficient 1767 1768 // test each string with itself and each other 1769 // each time with all options 1770 private String strings[]=new String[]{ 1771 // some cases from NormalizationTest.txt 1772 // 0..3 1773 "D\u031B\u0307\u0323", 1774 "\u1E0C\u031B\u0307", 1775 "D\u031B\u0323\u0307", 1776 "d\u031B\u0323\u0307", 1777 1778 // 4..6 1779 "\u00E4", 1780 "a\u0308", 1781 "A\u0308", 1782 1783 // Angstrom sign = A ring 1784 // 7..10 1785 "\u212B", 1786 "\u00C5", 1787 "A\u030A", 1788 "a\u030A", 1789 1790 // 11.14 1791 "a\u059A\u0316\u302A\u032Fb", 1792 "a\u302A\u0316\u032F\u059Ab", 1793 "a\u302A\u0316\u032F\u059Ab", 1794 "A\u059A\u0316\u302A\u032Fb", 1795 1796 // from ICU case folding tests 1797 // 15..20 1798 "A\u00df\u00b5\ufb03\\U0001040c\u0131", 1799 "ass\u03bcffi\\U00010434i", 1800 "\u0061\u0042\u0131\u03a3\u00df\ufb03\ud93f\udfff", 1801 "\u0041\u0062\u0069\u03c3\u0073\u0053\u0046\u0066\u0049\ud93f\udfff", 1802 "\u0041\u0062\u0131\u03c3\u0053\u0073\u0066\u0046\u0069\ud93f\udfff", 1803 "\u0041\u0062\u0069\u03c3\u0073\u0053\u0046\u0066\u0049\ud93f\udffd", 1804 1805 // U+d800 U+10001 see implementation comment in unorm_cmpEquivFold 1806 // vs. U+10000 at bottom - code point order 1807 // 21..22 1808 "\ud800\ud800\udc01", 1809 "\ud800\udc00", 1810 1811 // other code point order tests from ustrtest.cpp 1812 // 23..31 1813 "\u20ac\ud801", 1814 "\u20ac\ud800\udc00", 1815 "\ud800", 1816 "\ud800\uff61", 1817 "\udfff", 1818 "\uff61\udfff", 1819 "\uff61\ud800\udc02", 1820 "\ud800\udc02", 1821 "\ud84d\udc56", 1822 1823 // long strings, see cnormtst.c/TestNormCoverage() 1824 // equivalent if case-insensitive 1825 // 32..33 1826 "\uAD8B\uAD8B\uAD8B\uAD8B"+ 1827 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1828 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1829 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1830 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1831 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1832 "aaaaaaaaaaaaaaaaaazzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+ 1833 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+ 1834 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+ 1835 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+ 1836 "\uAD8B\uAD8B\uAD8B\uAD8B"+ 1837 "d\u031B\u0307\u0323", 1838 1839 "\u1100\u116f\u11aa\uAD8B\uAD8B\u1100\u116f\u11aa"+ 1840 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1841 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1842 "\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1843 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1844 "\\U0001d15e\\U0001d157\\U0001d165\\U0001d15e\\U0001d15e\\U0001d15e\\U0001d15e"+ 1845 "aaaaaaaaaaAAAAAAAAZZZZZZZZZZZZZZZZzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz"+ 1846 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"+ 1847 "ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc"+ 1848 "ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd"+ 1849 "\u1100\u116f\u11aa\uAD8B\uAD8B\u1100\u116f\u11aa"+ 1850 "\u1E0C\u031B\u0307", 1851 1852 // some strings that may make a difference whether the compare function 1853 // case-folds or decomposes first 1854 // 34..41 1855 "\u0360\u0345\u0334", 1856 "\u0360\u03b9\u0334", 1857 1858 "\u0360\u1f80\u0334", 1859 "\u0360\u03b1\u0313\u03b9\u0334", 1860 1861 "\u0360\u1ffc\u0334", 1862 "\u0360\u03c9\u03b9\u0334", 1863 1864 "a\u0360\u0345\u0360\u0345b", 1865 "a\u0345\u0360\u0345\u0360b", 1866 1867 // interesting cases for canonical caseless match with turkic i handling 1868 // 42..43 1869 "\u00cc", 1870 "\u0069\u0300", 1871 1872 // strings with post-Unicode 3.2 normalization or normalization corrections 1873 // 44..45 1874 "\u00e4\u193b\\U0002f868", 1875 "\u0061\u193b\u0308\u36fc", 1876 1877 1878 }; 1879 1880 // all combinations of options 1881 // UNORM_INPUT_IS_FCD is set automatically if both input strings fulfill FCD conditions 1882 final class Temp { 1883 int options; 1884 String name; Temp(int opt,String str)1885 public Temp(int opt,String str){ 1886 options =opt; 1887 name = str; 1888 } 1889 1890 } 1891 // set UNORM_UNICODE_3_2 in one additional combination 1892 1893 private Temp[] opt = new Temp[]{ 1894 new Temp(0,"default"), 1895 new Temp(Normalizer.COMPARE_CODE_POINT_ORDER, "code point order" ), 1896 new Temp(Normalizer.COMPARE_IGNORE_CASE, "ignore case" ), 1897 new Temp(Normalizer.COMPARE_CODE_POINT_ORDER|Normalizer.COMPARE_IGNORE_CASE, "code point order & ignore case" ), 1898 new Temp(Normalizer.COMPARE_IGNORE_CASE|Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I, "ignore case & special i"), 1899 new Temp(Normalizer.COMPARE_CODE_POINT_ORDER|Normalizer.COMPARE_IGNORE_CASE|Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I, "code point order & ignore case & special i"), 1900 new Temp(Normalizer.UNICODE_3_2 << Normalizer.COMPARE_NORM_OPTIONS_SHIFT, "Unicode 3.2") 1901 }; 1902 1903 TestCompareDebug()1904 public void TestCompareDebug(){ 1905 1906 String[] s = new String[100]; // at least as many items as in strings[] ! 1907 1908 1909 int i, j, k, count=strings.length; 1910 int result, refResult; 1911 1912 // create the UnicodeStrings 1913 for(i=0; i<count; ++i) { 1914 s[i]=Utility.unescape(strings[i]); 1915 } 1916 UTF16.StringComparator comp = new UTF16.StringComparator(true, false, 1917 UTF16.StringComparator.FOLD_CASE_DEFAULT); 1918 // test them each with each other 1919 1920 i = 42; 1921 j = 43; 1922 k = 2; 1923 // test Normalizer::compare 1924 result=norm_compare(s[i], s[j], opt[k].options); 1925 refResult=ref_norm_compare(s[i], s[j], opt[k].options); 1926 if(sign(result)!=sign(refResult)) { 1927 errln("Normalizer::compare( " + i +", "+j + ", " +k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult); 1928 } 1929 1930 // test UnicodeString::caseCompare - same internal implementation function 1931 if(0!=(opt[k].options&Normalizer.COMPARE_IGNORE_CASE)) { 1932 // result=s[i]. (s[j], opt[k].options); 1933 if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0) 1934 { 1935 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT); 1936 } 1937 else { 1938 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I); 1939 } 1940 1941 result=comp.compare(s[i],s[j]); 1942 refResult=ref_case_compare(s[i], s[j], opt[k].options); 1943 if(sign(result)!=sign(refResult)) { 1944 errln("Normalizer::compare( " + i +", "+j + ", "+k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult); 1945 } 1946 } 1947 String value1 = "\u00dater\u00fd"; 1948 String value2 = "\u00fater\u00fd"; 1949 if(Normalizer.compare(value1,value2,0)!=0){ 1950 if(Normalizer.compare(value1,value2,Normalizer.COMPARE_IGNORE_CASE)==0){ 1951 1952 } 1953 } 1954 } 1955 TestCompare()1956 public void TestCompare() { 1957 1958 String[] s = new String[100]; // at least as many items as in strings[] ! 1959 1960 int i, j, k, count=strings.length; 1961 int result, refResult; 1962 1963 // create the UnicodeStrings 1964 for(i=0; i<count; ++i) { 1965 s[i]=Utility.unescape(strings[i]); 1966 } 1967 UTF16.StringComparator comp = new UTF16.StringComparator(); 1968 // test them each with each other 1969 for(i=0; i<count; ++i) { 1970 for(j=i; j<count; ++j) { 1971 for(k=0; k<opt.length; ++k) { 1972 // test Normalizer::compare 1973 result=norm_compare(s[i], s[j], opt[k].options); 1974 refResult=ref_norm_compare(s[i], s[j], opt[k].options); 1975 if(sign(result)!=sign(refResult)) { 1976 errln("Normalizer::compare( " + i +", "+j + ", " +k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult); 1977 } 1978 1979 // test UnicodeString::caseCompare - same internal implementation function 1980 if(0!=(opt[k].options&Normalizer.COMPARE_IGNORE_CASE)) { 1981 // result=s[i]. (s[j], opt[k].options); 1982 if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0) 1983 { 1984 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT); 1985 } 1986 else { 1987 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I); 1988 } 1989 1990 comp.setCodePointCompare((opt[k].options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0); 1991 // result=comp.caseCompare(s[i],s[j], opt[k].options); 1992 result=comp.compare(s[i],s[j]); 1993 refResult=ref_case_compare(s[i], s[j], opt[k].options); 1994 if(sign(result)!=sign(refResult)) { 1995 errln("Normalizer::compare( " + i +", "+j + ", "+k+"( " +opt[k].name+"))=" + result +" should be same sign as " + refResult); 1996 } 1997 } 1998 } 1999 } 2000 } 2001 2002 // test cases with i and I to make sure Turkic works 2003 char[] iI= new char[]{ 0x49, 0x69, 0x130, 0x131 }; 2004 UnicodeSet set = new UnicodeSet(), iSet = new UnicodeSet(); 2005 Normalizer2Impl nfcImpl = Norm2AllModes.getNFCInstance().impl; 2006 nfcImpl.ensureCanonIterData(); 2007 2008 String s1, s2; 2009 2010 // collect all sets into one for contiguous output 2011 for(i=0; i<iI.length; ++i) { 2012 if(nfcImpl.getCanonStartSet(iI[i], iSet)) { 2013 set.addAll(iSet); 2014 } 2015 } 2016 2017 // test all of these precomposed characters 2018 Normalizer2 nfcNorm2 = Normalizer2.getNFCInstance(); 2019 UnicodeSetIterator it = new UnicodeSetIterator(set); 2020 int c; 2021 while(it.next() && (c=it.codepoint)!=UnicodeSetIterator.IS_STRING) { 2022 s1 = UTF16.valueOf(c); 2023 s2 = nfcNorm2.getDecomposition(c); 2024 for(k=0; k<opt.length; ++k) { 2025 // test Normalizer::compare 2026 2027 result= norm_compare(s1, s2, opt[k].options); 2028 refResult=ref_norm_compare(s1, s2, opt[k].options); 2029 if(sign(result)!=sign(refResult)) { 2030 errln("Normalizer.compare(U+"+hex(c)+" with its NFD, "+opt[k].name+")" 2031 + signString(result)+" should be "+signString(refResult)); 2032 } 2033 2034 // test UnicodeString::caseCompare - same internal implementation function 2035 if((opt[k].options & Normalizer.COMPARE_IGNORE_CASE)>0) { 2036 if ((opt[k].options & Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I) == 0) 2037 { 2038 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_DEFAULT); 2039 } 2040 else { 2041 comp.setIgnoreCase(true, UTF16.StringComparator.FOLD_CASE_EXCLUDE_SPECIAL_I); 2042 } 2043 2044 comp.setCodePointCompare((opt[k].options & Normalizer.COMPARE_CODE_POINT_ORDER) != 0); 2045 2046 result=comp.compare(s1,s2); 2047 refResult=ref_case_compare(s1, s2, opt[k].options); 2048 if(sign(result)!=sign(refResult)) { 2049 errln("UTF16.compare(U+"+hex(c)+" with its NFD, " 2050 +opt[k].name+")"+signString(result) +" should be "+signString(refResult)); 2051 } 2052 } 2053 } 2054 } 2055 2056 // test getDecomposition() for some characters that do not decompose 2057 if( nfcNorm2.getDecomposition(0x20)!=null || 2058 nfcNorm2.getDecomposition(0x4e00)!=null || 2059 nfcNorm2.getDecomposition(0x20002)!=null 2060 ) { 2061 errln("NFC.getDecomposition() returns TRUE for characters which do not have decompositions"); 2062 } 2063 2064 // test getRawDecomposition() for some characters that do not decompose 2065 if( nfcNorm2.getRawDecomposition(0x20)!=null || 2066 nfcNorm2.getRawDecomposition(0x4e00)!=null || 2067 nfcNorm2.getRawDecomposition(0x20002)!=null 2068 ) { 2069 errln("getRawDecomposition() returns TRUE for characters which do not have decompositions"); 2070 } 2071 2072 // test composePair() for some pairs of characters that do not compose 2073 if( nfcNorm2.composePair(0x20, 0x301)>=0 || 2074 nfcNorm2.composePair(0x61, 0x305)>=0 || 2075 nfcNorm2.composePair(0x1100, 0x1160)>=0 || 2076 nfcNorm2.composePair(0xac00, 0x11a7)>=0 2077 ) { 2078 errln("NFC.composePair() incorrectly composes some pairs of characters"); 2079 } 2080 2081 // test FilteredNormalizer2.getDecomposition() 2082 UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff]"); 2083 FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter); 2084 if(fn2.getDecomposition(0xe4)!=null || !"A\u0304".equals(fn2.getDecomposition(0x100))) { 2085 errln("FilteredNormalizer2(NFC, ^A0-FF).getDecomposition() failed"); 2086 } 2087 2088 // test FilteredNormalizer2.getRawDecomposition() 2089 if(fn2.getRawDecomposition(0xe4)!=null || !"A\u0304".equals(fn2.getRawDecomposition(0x100))) { 2090 errln("FilteredNormalizer2(NFC, ^A0-FF).getRawDecomposition() failed"); 2091 } 2092 2093 // test FilteredNormalizer2::composePair() 2094 if( 0x100!=fn2.composePair(0x41, 0x304) || 2095 fn2.composePair(0xc7, 0x301)>=0 // unfiltered result: U+1E08 2096 ) { 2097 errln("FilteredNormalizer2(NFC, ^A0-FF).composePair() failed"); 2098 } 2099 } 2100 2101 // verify that case-folding does not un-FCD strings countFoldFCDExceptions(int foldingOptions)2102 int countFoldFCDExceptions(int foldingOptions) { 2103 String s, d; 2104 int c; 2105 int count; 2106 int/*unsigned*/ cc, trailCC, foldCC, foldTrailCC; 2107 Normalizer.QuickCheckResult qcResult; 2108 int category; 2109 boolean isNFD; 2110 2111 2112 logln("Test if case folding may un-FCD a string (folding options 0x)"+hex(foldingOptions)); 2113 2114 count=0; 2115 for(c=0; c<=0x10ffff; ++c) { 2116 category=UCharacter.getType(c); 2117 if(category==UCharacterCategory.UNASSIGNED) { 2118 continue; // skip unassigned code points 2119 } 2120 if(c==0xac00) { 2121 c=0xd7a3; // skip Hangul - no case folding there 2122 continue; 2123 } 2124 // skip Han blocks - no case folding there either 2125 if(c==0x3400) { 2126 c=0x4db5; 2127 continue; 2128 } 2129 if(c==0x4e00) { 2130 c=0x9fa5; 2131 continue; 2132 } 2133 if(c==0x20000) { 2134 c=0x2a6d6; 2135 continue; 2136 } 2137 2138 s= UTF16.valueOf(c); 2139 2140 // get leading and trailing cc for c 2141 d= Normalizer.decompose(s,false); 2142 isNFD= s==d; 2143 cc=UCharacter.getCombiningClass(UTF16.charAt(d,0)); 2144 trailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1)); 2145 2146 // get leading and trailing cc for the case-folding of c 2147 UCharacter.foldCase(s,(foldingOptions==0)); 2148 d = Normalizer.decompose(s, false); 2149 foldCC=UCharacter.getCombiningClass(UTF16.charAt(d,0)); 2150 foldTrailCC=UCharacter.getCombiningClass(UTF16.charAt(d,d.length()-1)); 2151 2152 qcResult=Normalizer.quickCheck(s, Normalizer.FCD,0); 2153 2154 2155 // bad: 2156 // - character maps to empty string: adjacent characters may then need reordering 2157 // - folding has different leading/trailing cc's, and they don't become just 0 2158 // - folding itself is not FCD 2159 if( qcResult!=Normalizer.YES || 2160 s.length()==0 || 2161 (cc!=foldCC && foldCC!=0) || (trailCC!=foldTrailCC && foldTrailCC!=0) 2162 ) { 2163 ++count; 2164 errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")"); 2165 //errln(" cc %02x trailCC %02x foldCC(U+%04lx) %02x foldTrailCC(U+%04lx) %02x quickCheck(folded)=%d", cc, trailCC, UTF16.charAt(d,0), foldCC, UTF16.charAt(d,d.length()-1), foldTrailCC, qcResult); 2166 continue; 2167 } 2168 2169 // also bad: 2170 // if a code point is in NFD but its case folding is not, then 2171 // unorm_compare will also fail 2172 if(isNFD && Normalizer.YES!=Normalizer.quickCheck(s, Normalizer.NFD,0)) { 2173 ++count; 2174 errln("U+"+hex(c)+": case-folding may un-FCD a string (folding options 0x"+hex(foldingOptions)+")"); 2175 } 2176 } 2177 2178 logln("There are "+hex(count)+" code points for which case-folding may un-FCD a string (folding options"+foldingOptions+"x)" ); 2179 return count; 2180 } 2181 TestFindFoldFCDExceptions()2182 public void TestFindFoldFCDExceptions() { 2183 int count; 2184 2185 count=countFoldFCDExceptions(0); 2186 count+=countFoldFCDExceptions(Normalizer.FOLD_CASE_EXCLUDE_SPECIAL_I); 2187 if(count>0) { 2188 //* 2189 //* If case-folding un-FCDs any strings, then unorm_compare() must be 2190 //* re-implemented. 2191 //* It currently assumes that one can check for FCD then case-fold 2192 //* and then still have FCD strings for raw decomposition without reordering. 2193 //* 2194 errln("error: There are "+count+" code points for which case-folding"+ 2195 " may un-FCD a string for all folding options.\n See comment"+ 2196 " in BasicNormalizerTest::FindFoldFCDExceptions()!"); 2197 } 2198 } 2199 TestCombiningMarks()2200 public void TestCombiningMarks(){ 2201 String src = "\u0f71\u0f72\u0f73\u0f74\u0f75"; 2202 String expected = "\u0F71\u0F71\u0F71\u0F72\u0F72\u0F74\u0F74"; 2203 String result = Normalizer.decompose(src,false); 2204 if(!expected.equals(result)){ 2205 errln("Reordering of combining marks failed. Expected: "+Utility.hex(expected)+" Got: "+ Utility.hex(result)); 2206 } 2207 } 2208 2209 /* 2210 * Re-enable this test when UTC fixes UAX 21 2211 public void TestUAX21Failure(){ 2212 final String[][] cases = new String[][]{ 2213 {"\u0061\u0345\u0360\u0345\u0062", "\u0061\u0360\u0345\u0345\u0062"}, 2214 {"\u0061\u0345\u0345\u0360\u0062", "\u0061\u0360\u0345\u0345\u0062"}, 2215 {"\u0061\u0345\u0360\u0362\u0360\u0062", "\u0061\u0362\u0360\u0360\u0345\u0062"}, 2216 {"\u0061\u0360\u0345\u0360\u0362\u0062", "\u0061\u0362\u0360\u0360\u0345\u0062"}, 2217 {"\u0061\u0345\u0360\u0362\u0361\u0062", "\u0061\u0362\u0360\u0361\u0345\u0062"}, 2218 {"\u0061\u0361\u0345\u0360\u0362\u0062", "\u0061\u0362\u0361\u0360\u0345\u0062"}, 2219 }; 2220 for(int i = 0; i< cases.length; i++){ 2221 String s1 =cases[0][0]; 2222 String s2 = cases[0][1]; 2223 if( (Normalizer.compare(s1,s2,Normalizer.FOLD_CASE_DEFAULT ==0)//case sensitive compare 2224 && 2225 (Normalizer.compare(s1,s2,Normalizer.COMPARE_IGNORE_CASE)!=0)){ 2226 errln("Normalizer.compare() failed for s1: " 2227 + Utility.hex(s1) +" s2: " + Utility.hex(s2)); 2228 } 2229 } 2230 } 2231 */ TestFCNFKCClosure()2232 public void TestFCNFKCClosure() { 2233 final class TestStruct{ 2234 int c; 2235 String s; 2236 TestStruct(int cp, String src){ 2237 c=cp; 2238 s=src; 2239 } 2240 } 2241 2242 TestStruct[] tests= new TestStruct[]{ 2243 new TestStruct( 0x00C4, "" ), 2244 new TestStruct( 0x00E4, "" ), 2245 new TestStruct( 0x037A, "\u0020\u03B9" ), 2246 new TestStruct( 0x03D2, "\u03C5" ), 2247 new TestStruct( 0x20A8, "\u0072\u0073" ) , 2248 new TestStruct( 0x210B, "\u0068" ), 2249 new TestStruct( 0x210C, "\u0068" ), 2250 new TestStruct( 0x2121, "\u0074\u0065\u006C" ), 2251 new TestStruct( 0x2122, "\u0074\u006D" ), 2252 new TestStruct( 0x2128, "\u007A" ), 2253 new TestStruct( 0x1D5DB,"\u0068" ), 2254 new TestStruct( 0x1D5ED,"\u007A" ), 2255 new TestStruct( 0x0061, "" ) 2256 }; 2257 2258 2259 for(int i = 0; i < tests.length; ++ i) { 2260 String result=Normalizer.getFC_NFKC_Closure(tests[i].c); 2261 if(!result.equals(new String(tests[i].s))) { 2262 errln("getFC_NFKC_Closure(U+"+Integer.toHexString(tests[i].c)+") is wrong"); 2263 } 2264 } 2265 2266 /* error handling */ 2267 2268 int length=Normalizer.getFC_NFKC_Closure(0x5c, null); 2269 if(length!=0){ 2270 errln("getFC_NFKC_Closure did not perform error handling correctly"); 2271 } 2272 } TestBugJ2324()2273 public void TestBugJ2324(){ 2274 /* String[] input = new String[]{ 2275 //"\u30FD\u3099", 2276 "\u30FA\u309A", 2277 "\u30FB\u309A", 2278 "\u30FC\u309A", 2279 "\u30FE\u309A", 2280 "\u30FD\u309A", 2281 2282 };*/ 2283 String troublesome = "\u309A"; 2284 for(int i=0x3000; i<0x3100;i++){ 2285 String input = ((char)i)+troublesome; 2286 try{ 2287 /* String result =*/ Normalizer.compose(input,false); 2288 }catch(IndexOutOfBoundsException e){ 2289 errln("compose() failed for input: " + Utility.hex(input) + " Exception: " + e.toString()); 2290 } 2291 } 2292 2293 } 2294 2295 static final int D = 0, C = 1, KD= 2, KC = 3, FCD=4, NONE=5; 2296 initSkippables(UnicodeSet[] skipSets)2297 private static UnicodeSet[] initSkippables(UnicodeSet[] skipSets) { 2298 skipSets[D].applyPattern("[[:NFD_QC=Yes:]&[:ccc=0:]]", false); 2299 skipSets[C].applyPattern("[[:NFC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]", false); 2300 skipSets[KD].applyPattern("[[:NFKD_QC=Yes:]&[:ccc=0:]]", false); 2301 skipSets[KC].applyPattern("[[:NFKC_QC=Yes:]&[:ccc=0:]-[:HST=LV:]]", false); 2302 2303 // Remove from the NFC and NFKC sets all those characters that change 2304 // when a back-combining character is added. 2305 // First, get all of the back-combining characters and their combining classes. 2306 UnicodeSet combineBack=new UnicodeSet("[:NFC_QC=Maybe:]"); 2307 int numCombineBack=combineBack.size(); 2308 int[] combineBackCharsAndCc=new int[numCombineBack*2]; 2309 UnicodeSetIterator iter=new UnicodeSetIterator(combineBack); 2310 for(int i=0; i<numCombineBack; ++i) { 2311 iter.next(); 2312 int c=iter.codepoint; 2313 combineBackCharsAndCc[2*i]=c; 2314 combineBackCharsAndCc[2*i+1]=UCharacter.getCombiningClass(c); 2315 } 2316 2317 // We need not look at control codes, Han characters nor Hangul LVT syllables because they 2318 // do not combine forward. LV syllables are already removed. 2319 UnicodeSet notInteresting=new UnicodeSet("[[:C:][:Unified_Ideograph:][:HST=LVT:]]"); 2320 UnicodeSet unsure=((UnicodeSet)(skipSets[C].clone())).removeAll(notInteresting); 2321 // System.out.format("unsure.size()=%d\n", unsure.size()); 2322 2323 // For each character about which we are unsure, see if it changes when we add 2324 // one of the back-combining characters. 2325 Normalizer2 norm2=Normalizer2.getNFCInstance(); 2326 StringBuilder s=new StringBuilder(); 2327 iter.reset(unsure); 2328 while(iter.next()) { 2329 int c=iter.codepoint; 2330 s.delete(0, 0x7fffffff).appendCodePoint(c); 2331 int cLength=s.length(); 2332 int tccc=UCharacter.getIntPropertyValue(c, UProperty.TRAIL_CANONICAL_COMBINING_CLASS); 2333 for(int i=0; i<numCombineBack; ++i) { 2334 // If c's decomposition ends with a character with non-zero combining class, then 2335 // c can only change if it combines with a character with a non-zero combining class. 2336 int cc2=combineBackCharsAndCc[2*i+1]; 2337 if(tccc==0 || cc2!=0) { 2338 int c2=combineBackCharsAndCc[2*i]; 2339 s.appendCodePoint(c2); 2340 if(!norm2.isNormalized(s)) { 2341 // System.out.format("remove U+%04x (tccc=%d) + U+%04x (cc=%d)\n", c, tccc, c2, cc2); 2342 skipSets[C].remove(c); 2343 skipSets[KC].remove(c); 2344 break; 2345 } 2346 s.delete(cLength, 0x7fffffff); 2347 } 2348 } 2349 } 2350 return skipSets; 2351 } 2352 TestSkippable()2353 public void TestSkippable() { 2354 UnicodeSet[] skipSets = new UnicodeSet[] { 2355 new UnicodeSet(), //NFD 2356 new UnicodeSet(), //NFC 2357 new UnicodeSet(), //NFKD 2358 new UnicodeSet() //NFKC 2359 }; 2360 UnicodeSet[] expectSets = new UnicodeSet[] { 2361 new UnicodeSet(), 2362 new UnicodeSet(), 2363 new UnicodeSet(), 2364 new UnicodeSet() 2365 }; 2366 StringBuilder s, pattern; 2367 2368 // build NF*Skippable sets from runtime data 2369 skipSets[D].applyPattern("[:NFD_Inert:]"); 2370 skipSets[C].applyPattern("[:NFC_Inert:]"); 2371 skipSets[KD].applyPattern("[:NFKD_Inert:]"); 2372 skipSets[KC].applyPattern("[:NFKC_Inert:]"); 2373 2374 expectSets = initSkippables(expectSets); 2375 if(expectSets[D].contains(0x0350)){ 2376 errln("expectSets[D] contains 0x0350"); 2377 } 2378 for(int i=0; i<expectSets.length; ++i) { 2379 if(!skipSets[i].equals(expectSets[i])) { 2380 errln("error: TestSkippable skipSets["+i+"]!=expectedSets["+i+"]\n"); 2381 // Note: This used to depend on hardcoded UnicodeSet patterns generated by 2382 // Mark's unicodetools.com.ibm.text.UCD.NFSkippable, by 2383 // running com.ibm.text.UCD.Main with the option NFSkippable. 2384 // Since ICU 4.6/Unicode 6, we are generating the 2385 // expectSets ourselves in initSkippables(). 2386 2387 s=new StringBuilder(); 2388 2389 s.append("\n\nskip= "); 2390 s.append(skipSets[i].toPattern(true)); 2391 s.append("\n\n"); 2392 2393 s.append("skip-expect="); 2394 pattern = new StringBuilder(((UnicodeSet)skipSets[i].clone()).removeAll(expectSets[i]).toPattern(true)); 2395 s.append(pattern); 2396 2397 pattern.delete(0,pattern.length()); 2398 s.append("\n\nexpect-skip="); 2399 pattern = new StringBuilder(((UnicodeSet)expectSets[i].clone()).removeAll(skipSets[i]).toPattern(true)); 2400 s.append(pattern); 2401 s.append("\n\n"); 2402 2403 pattern.delete(0,pattern.length()); 2404 s.append("\n\nintersection(expect,skip)="); 2405 UnicodeSet intersection = ((UnicodeSet) expectSets[i].clone()).retainAll(skipSets[i]); 2406 pattern = new StringBuilder(intersection.toPattern(true)); 2407 s.append(pattern); 2408 s.append("\n\n"); 2409 2410 errln(s.toString()); 2411 } 2412 } 2413 } 2414 TestBugJ2068()2415 public void TestBugJ2068(){ 2416 String sample = "The quick brown fox jumped over the lazy dog"; 2417 UCharacterIterator text = UCharacterIterator.getInstance(sample); 2418 Normalizer norm = new Normalizer(text,Normalizer.NFC,0); 2419 text.setIndex(4); 2420 if(text.current() == norm.current()){ 2421 errln("Normalizer is not cloning the UCharacterIterator"); 2422 } 2423 } TestGetCombiningClass()2424 public void TestGetCombiningClass(){ 2425 for(int i=0;i<0x10FFFF;i++){ 2426 int cc = UCharacter.getCombiningClass(i); 2427 if(0xD800<= i && i<=0xDFFF && cc >0 ){ 2428 cc = UCharacter.getCombiningClass(i); 2429 errln("CC: "+ cc + " for codepoint: " +Utility.hex(i,8)); 2430 } 2431 } 2432 } 2433 TestSerializedSet()2434 public void TestSerializedSet(){ 2435 USerializedSet sset=new USerializedSet(); 2436 UnicodeSet set = new UnicodeSet(); 2437 int start, end; 2438 2439 char[] serialized = { 2440 0x8007, // length 2441 3, // bmpLength 2442 0xc0, 0xfe, 0xfffc, 2443 1, 9, 0x10, 0xfffc 2444 }; 2445 sset.getSet(serialized, 0); 2446 2447 // collect all sets into one for contiguous output 2448 int[] startEnd = new int[2]; 2449 int count=sset.countRanges(); 2450 for(int j=0; j<count; ++j) { 2451 sset.getRange(j, startEnd); 2452 set.add(startEnd[0], startEnd[1]); 2453 } 2454 2455 // test all of these characters 2456 UnicodeSetIterator it = new UnicodeSetIterator(set); 2457 while(it.nextRange() && it.codepoint!=UnicodeSetIterator.IS_STRING) { 2458 start=it.codepoint; 2459 end=it.codepointEnd; 2460 while(start<=end) { 2461 if(!sset.contains(start)){ 2462 errln("USerializedSet.contains failed for "+Utility.hex(start,8)); 2463 } 2464 ++start; 2465 } 2466 } 2467 } 2468 TestReturnFailure()2469 public void TestReturnFailure(){ 2470 char[] term = {'r','\u00e9','s','u','m','\u00e9' }; 2471 char[] decomposed_term = new char[10 + term.length + 2]; 2472 int rc = Normalizer.decompose(term,0,term.length, decomposed_term,0,decomposed_term.length,true, 0); 2473 int rc1 = Normalizer.decompose(term,0,term.length, decomposed_term,10,decomposed_term.length,true, 0); 2474 if(rc!=rc1){ 2475 errln("Normalizer decompose did not return correct length"); 2476 } 2477 } 2478 2479 private final static class TestCompositionCase { 2480 public Normalizer.Mode mode; 2481 public int options; 2482 public String input, expect; TestCompositionCase(Normalizer.Mode mode, int options, String input, String expect)2483 TestCompositionCase(Normalizer.Mode mode, int options, String input, String expect) { 2484 this.mode=mode; 2485 this.options=options; 2486 this.input=input; 2487 this.expect=expect; 2488 } 2489 } 2490 TestComposition()2491 public void TestComposition() { 2492 final TestCompositionCase cases[]=new TestCompositionCase[]{ 2493 /* 2494 * special cases for UAX #15 bug 2495 * see Unicode Corrigendum #5: Normalization Idempotency 2496 * at http://unicode.org/versions/corrigendum5.html 2497 * (was Public Review Issue #29) 2498 */ 2499 new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327", "\u1100\u0300\u1161\u0327"), 2500 new TestCompositionCase(Normalizer.NFC, 0, "\u1100\u0300\u1161\u0327\u11a8","\u1100\u0300\u1161\u0327\u11a8"), 2501 new TestCompositionCase(Normalizer.NFC, 0, "\uac00\u0300\u0327\u11a8", "\uac00\u0327\u0300\u11a8"), 2502 new TestCompositionCase(Normalizer.NFC, 0, "\u0b47\u0300\u0b3e", "\u0b47\u0300\u0b3e"), 2503 2504 /* TODO: add test cases for UNORM_FCC here (j2151) */ 2505 }; 2506 2507 String output; 2508 int i; 2509 2510 for(i=0; i<cases.length; ++i) { 2511 output=Normalizer.normalize(cases[i].input, cases[i].mode, cases[i].options); 2512 if(!output.equals(cases[i].expect)) { 2513 errln("unexpected result for case "+i); 2514 } 2515 } 2516 } 2517 TestGetDecomposition()2518 public void TestGetDecomposition() { 2519 Normalizer2 n2=Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE_CONTIGUOUS); 2520 String decomp=n2.getDecomposition(0x20); 2521 assertEquals("fcc.getDecomposition(space) failed", null, decomp); 2522 decomp=n2.getDecomposition(0xe4); 2523 assertEquals("fcc.getDecomposition(a-umlaut) failed", "a\u0308", decomp); 2524 decomp=n2.getDecomposition(0xac01); 2525 assertEquals("fcc.getDecomposition(Hangul syllable U+AC01) failed", "\u1100\u1161\u11a8", decomp); 2526 } 2527 TestGetRawDecomposition()2528 public void TestGetRawDecomposition() { 2529 Normalizer2 n2=Normalizer2.getNFKCInstance(); 2530 /* 2531 * Raw decompositions from NFKC data are the Unicode Decomposition_Mapping values, 2532 * without recursive decomposition. 2533 */ 2534 2535 String decomp=n2.getRawDecomposition(0x20); 2536 assertEquals("nfkc.getRawDecomposition(space) failed", null, decomp); 2537 decomp=n2.getRawDecomposition(0xe4); 2538 assertEquals("nfkc.getRawDecomposition(a-umlaut) failed", "a\u0308", decomp); 2539 /* U+1E08 LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE */ 2540 decomp=n2.getRawDecomposition(0x1e08); 2541 assertEquals("nfkc.getRawDecomposition(c-cedilla-acute) failed", "\u00c7\u0301", decomp); 2542 /* U+212B ANGSTROM SIGN */ 2543 decomp=n2.getRawDecomposition(0x212b); 2544 assertEquals("nfkc.getRawDecomposition(angstrom sign) failed", "\u00c5", decomp); 2545 decomp=n2.getRawDecomposition(0xac00); 2546 assertEquals("nfkc.getRawDecomposition(Hangul syllable U+AC00) failed", "\u1100\u1161", decomp); 2547 /* A Hangul LVT syllable has a raw decomposition of an LV syllable + T. */ 2548 decomp=n2.getRawDecomposition(0xac01); 2549 assertEquals("nfkc.getRawDecomposition(Hangul syllable U+AC01) failed", "\uac00\u11a8", decomp); 2550 } 2551 TestCustomComp()2552 public void TestCustomComp() { 2553 String [][] pairs={ 2554 { "\\uD801\\uE000\\uDFFE", "" }, 2555 { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" }, 2556 { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" }, 2557 { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE002\\U000110B9\\u0327\\u0345" }, 2558 { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" }, 2559 { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" }, 2560 { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" }, 2561 { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" } 2562 }; 2563 Normalizer2 customNorm2; 2564 customNorm2= 2565 Normalizer2.getInstance( 2566 BasicTest.class.getResourceAsStream("/com/ibm/icu/dev/data/testdata/testnorm.nrm"), 2567 "testnorm", 2568 Normalizer2.Mode.COMPOSE); 2569 for(int i=0; i<pairs.length; ++i) { 2570 String[] pair=pairs[i]; 2571 String input=Utility.unescape(pair[0]); 2572 String expected=Utility.unescape(pair[1]); 2573 String result=customNorm2.normalize(input); 2574 if(!result.equals(expected)) { 2575 errln("custom compose Normalizer2 did not normalize input "+i+" as expected"); 2576 } 2577 } 2578 } 2579 TestCustomFCC()2580 public void TestCustomFCC() { 2581 String[][] pairs={ 2582 { "\\uD801\\uE000\\uDFFE", "" }, 2583 { "\\uD800\\uD801\\uE000\\uDFFE\\uDFFF", "\\uD7FF\\uFFFF" }, 2584 { "\\uD800\\uD801\\uDFFE\\uDFFF", "\\uD7FF\\U000107FE\\uFFFF" }, 2585 // The following expected result is different from CustomComp 2586 // because of only-contiguous composition. 2587 { "\\uE001\\U000110B9\\u0345\\u0308\\u0327", "\\uE001\\U000110B9\\u0327\\u0308\\u0345" }, 2588 { "\\uE010\\U000F0011\\uE012", "\\uE011\\uE012" }, 2589 { "\\uE010\\U000F0011\\U000F0011\\uE012", "\\uE011\\U000F0010" }, 2590 { "\\uE111\\u1161\\uE112\\u1162", "\\uAE4C\\u1102\\u0062\\u1162" }, 2591 { "\\uFFF3\\uFFF7\\U00010036\\U00010077", "\\U00010037\\U00010037\\uFFF6\\U00010037" } 2592 }; 2593 Normalizer2 customNorm2; 2594 customNorm2= 2595 Normalizer2.getInstance( 2596 BasicTest.class.getResourceAsStream("/com/ibm/icu/dev/data/testdata/testnorm.nrm"), 2597 "testnorm", 2598 Normalizer2.Mode.COMPOSE_CONTIGUOUS); 2599 for(int i=0; i<pairs.length; ++i) { 2600 String[] pair=pairs[i]; 2601 String input=Utility.unescape(pair[0]); 2602 String expected=Utility.unescape(pair[1]); 2603 String result=customNorm2.normalize(input); 2604 if(!result.equals(expected)) { 2605 errln("custom FCC Normalizer2 did not normalize input "+i+" as expected"); 2606 } 2607 } 2608 } 2609 TestCanonIterData()2610 public void TestCanonIterData() { 2611 // For now, just a regression test. 2612 Normalizer2Impl impl=Norm2AllModes.getNFCInstance().impl.ensureCanonIterData(); 2613 // U+0FB5 TIBETAN SUBJOINED LETTER SSA is the trailing character 2614 // in some decomposition mappings where there is a composition exclusion. 2615 // In fact, U+0FB5 is normalization-inert (NFC_QC=Yes, NFD_QC=Yes, ccc=0) 2616 // but it is not a segment starter because it occurs in a decomposition mapping. 2617 if(impl.isCanonSegmentStarter(0xfb5)) { 2618 errln("isCanonSegmentStarter(U+0fb5)=true is wrong"); 2619 } 2620 // For [:Segment_Starter:] to work right, not just the property function has to work right, 2621 // UnicodeSet also needs a correct range starts set. 2622 UnicodeSet segStarters=new UnicodeSet("[:Segment_Starter:]").freeze(); 2623 if(segStarters.contains(0xfb5)) { 2624 errln("[:Segment_Starter:].contains(U+0fb5)=true is wrong"); 2625 } 2626 // Try characters up to Kana and miscellaneous CJK but below Han (for expediency). 2627 for(int c=0; c<=0x33ff; ++c) { 2628 boolean isStarter=impl.isCanonSegmentStarter(c); 2629 boolean isContained=segStarters.contains(c); 2630 if(isStarter!=isContained) { 2631 errln(String.format( 2632 "discrepancy: isCanonSegmentStarter(U+%04x)=%5b != " + 2633 "[:Segment_Starter:].contains(same)", 2634 c, isStarter)); 2635 } 2636 } 2637 } 2638 TestFilteredNormalizer2()2639 public void TestFilteredNormalizer2() { 2640 Normalizer2 nfcNorm2=Normalizer2.getNFCInstance(); 2641 UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff\u0310-\u031f]"); 2642 FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter); 2643 int c; 2644 for(c=0; c<=0x3ff; ++c) { 2645 int expectedCC= filter.contains(c) ? nfcNorm2.getCombiningClass(c) : 0; 2646 int cc=fn2.getCombiningClass(c); 2647 assertEquals( 2648 "FilteredNormalizer2(NFC, ^A0-FF,310-31F).getCombiningClass(U+"+hex(c)+ 2649 ")==filtered NFC.getCC()", 2650 expectedCC, cc); 2651 } 2652 } 2653 TestFilteredAppend()2654 public void TestFilteredAppend() { 2655 Normalizer2 nfcNorm2=Normalizer2.getNFCInstance(); 2656 UnicodeSet filter=new UnicodeSet("[^\u00a0-\u00ff\u0310-\u031f]"); 2657 FilteredNormalizer2 fn2=new FilteredNormalizer2(nfcNorm2, filter); 2658 2659 // Append two strings that each contain a character outside the filter set. 2660 StringBuilder sb = new StringBuilder("a\u0313a"); 2661 String second = "\u0301\u0313"; 2662 assertEquals("append()", "a\u0313á\u0313", fn2.append(sb, second).toString()); 2663 2664 // Same, and also normalize the second string. 2665 sb.replace(0, 0x7fffffff, "a\u0313a"); 2666 assertEquals( 2667 "normalizeSecondAndAppend()", 2668 "a\u0313á\u0313", fn2.normalizeSecondAndAppend(sb, second).toString()); 2669 2670 // Normalizer2.normalize(String) uses spanQuickCheckYes() and normalizeSecondAndAppend(). 2671 assertEquals("normalize()", "a\u0313á\u0313", fn2.normalize("a\u0313a\u0301\u0313")); 2672 } 2673 TestGetEasyToUseInstance()2674 public void TestGetEasyToUseInstance() { 2675 // Test input string: 2676 // U+00A0 -> <noBreak> 0020 2677 // U+00C7 0301 = 1E08 = 0043 0327 0301 2678 String in="\u00A0\u00C7\u0301"; 2679 Normalizer2 n2=Normalizer2.getNFCInstance(); 2680 String out=n2.normalize(in); 2681 assertEquals( 2682 "getNFCInstance() did not return an NFC instance " + 2683 "(normalizes to " + prettify(out) + ')', 2684 "\u00A0\u1E08", out); 2685 2686 n2=Normalizer2.getNFDInstance(); 2687 out=n2.normalize(in); 2688 assertEquals( 2689 "getNFDInstance() did not return an NFD instance " + 2690 "(normalizes to " + prettify(out) + ')', 2691 "\u00A0C\u0327\u0301", out); 2692 2693 n2=Normalizer2.getNFKCInstance(); 2694 out=n2.normalize(in); 2695 assertEquals( 2696 "getNFKCInstance() did not return an NFKC instance " + 2697 "(normalizes to " + prettify(out) + ')', 2698 " \u1E08", out); 2699 2700 n2=Normalizer2.getNFKDInstance(); 2701 out=n2.normalize(in); 2702 assertEquals( 2703 "getNFKDInstance() did not return an NFKD instance " + 2704 "(normalizes to " + prettify(out) + ')', 2705 " C\u0327\u0301", out); 2706 2707 n2=Normalizer2.getNFKCCasefoldInstance(); 2708 out=n2.normalize(in); 2709 assertEquals( 2710 "getNFKCCasefoldInstance() did not return an NFKC_Casefold instance " + 2711 "(normalizes to " + prettify(out) + ')', 2712 " \u1E09", out); 2713 } 2714 } 2715