1 /* 2 ******************************************************************************* 3 * Copyright (C) 2008-2014, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 package com.ibm.icu.dev.test.collator; 8 import java.util.ArrayList; 9 import java.util.Arrays; 10 import java.util.Collection; 11 import java.util.Iterator; 12 import java.util.LinkedHashSet; 13 import java.util.List; 14 import java.util.Locale; 15 import java.util.Set; 16 import java.util.TreeSet; 17 18 import com.ibm.icu.dev.test.TestFmwk; 19 import com.ibm.icu.dev.util.CollectionUtilities; 20 import com.ibm.icu.impl.ICUDebug; 21 import com.ibm.icu.impl.Row; 22 import com.ibm.icu.impl.Row.R4; 23 import com.ibm.icu.lang.UCharacter; 24 import com.ibm.icu.lang.UProperty; 25 import com.ibm.icu.lang.UScript; 26 import com.ibm.icu.text.AlphabeticIndex; 27 import com.ibm.icu.text.AlphabeticIndex.Bucket; 28 import com.ibm.icu.text.AlphabeticIndex.Bucket.LabelType; 29 import com.ibm.icu.text.AlphabeticIndex.ImmutableIndex; 30 import com.ibm.icu.text.AlphabeticIndex.Record; 31 import com.ibm.icu.text.Collator; 32 import com.ibm.icu.text.Normalizer2; 33 import com.ibm.icu.text.RawCollationKey; 34 import com.ibm.icu.text.RuleBasedCollator; 35 import com.ibm.icu.text.UTF16; 36 import com.ibm.icu.text.UnicodeSet; 37 import com.ibm.icu.util.ULocale; 38 39 /** 40 * @author Mark Davis 41 */ 42 public class AlphabeticIndexTest extends TestFmwk { 43 /** 44 * 45 */ 46 private static final String ARROW = "\u2192"; 47 private static final boolean DEBUG = ICUDebug.enabled("alphabeticindex"); 48 49 public static Set<String> KEY_LOCALES = new LinkedHashSet(Arrays.asList( 50 "en", "es", "de", "fr", "ja", "it", "tr", "pt", "zh", "nl", 51 "pl", "ar", "ru", "zh_Hant", "ko", "th", "sv", "fi", "da", 52 "he", "nb", "el", "hr", "bg", "sk", "lt", "vi", "lv", "sr", 53 "pt_PT", "ro", "hu", "cs", "id", "sl", "fil", "fa", "uk", 54 "ca", "hi", "et", "eu", "is", "sw", "ms", "bn", "am", "ta", 55 "te", "mr", "ur", "ml", "kn", "gu", "or")); 56 private String[][] localeAndIndexCharactersLists = new String[][] { 57 /* Arabic*/ {"ar", "\u0627:\u0628:\u062A:\u062B:\u062C:\u062D:\u062E:\u062F:\u0630:\u0631:\u0632:\u0633:\u0634:\u0635:\u0636:\u0637:\u0638:\u0639:\u063A:\u0641:\u0642:\u0643:\u0644:\u0645:\u0646:\u0647:\u0648:\u064A"}, 58 /* Bulgarian*/ {"bg", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0415:\u0416:\u0417:\u0418:\u0419:\u041A:\u041B:\u041C:\u041D:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u0428:\u0429:\u042E:\u042F"}, 59 /* Catalan*/ {"ca", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 60 /* Czech*/ {"cs", "A:B:C:\u010C:D:E:F:G:H:CH:I:J:K:L:M:N:O:P:Q:R:\u0158:S:\u0160:T:U:V:W:X:Y:Z:\u017D"}, 61 /* Danish*/ {"da", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C6:\u00D8:\u00C5"}, 62 /* German*/ {"de", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 63 /* Greek*/ {"el", "\u0391:\u0392:\u0393:\u0394:\u0395:\u0396:\u0397:\u0398:\u0399:\u039A:\u039B:\u039C:\u039D:\u039E:\u039F:\u03A0:\u03A1:\u03A3:\u03A4:\u03A5:\u03A6:\u03A7:\u03A8:\u03A9"}, 64 /* English*/ {"en", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 65 /* Spanish*/ {"es", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:\u00D1:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 66 /* Estonian*/ {"et", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:\u0160:Z:\u017D:T:U:V:\u00D5:\u00C4:\u00D6:\u00DC:X:Y"}, 67 /* Basque*/ {"eu", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 68 /* Finnish*/ {"fi", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C5:\u00C4:\u00D6"}, 69 /* Filipino*/ {"fil", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 70 /* French*/ {"fr", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 71 /* Hebrew*/ {"he", "\u05D0:\u05D1:\u05D2:\u05D3:\u05D4:\u05D5:\u05D6:\u05D7:\u05D8:\u05D9:\u05DB:\u05DC:\u05DE:\u05E0:\u05E1:\u05E2:\u05E4:\u05E6:\u05E7:\u05E8:\u05E9:\u05EA"}, 72 /* Icelandic*/ {"is", "A:\u00C1:B:C:D:\u00D0:E:\u00C9:F:G:H:I:\u00CD:J:K:L:M:N:O:\u00D3:P:Q:R:S:T:U:\u00DA:V:W:X:Y:\u00DD:Z:\u00DE:\u00C6:\u00D6"}, 73 /* Italian*/ {"it", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 74 /* Japanese*/ {"ja", "\u3042:\u304B:\u3055:\u305F:\u306A:\u306F:\u307E:\u3084:\u3089:\u308F"}, 75 /* Korean*/ {"ko", "\u3131:\u3134:\u3137:\u3139:\u3141:\u3142:\u3145:\u3147:\u3148:\u314A:\u314B:\u314C:\u314D:\u314E"}, 76 /* Lithuanian*/ {"lt", "A:B:C:\u010C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:\u0160:T:U:V:Z:\u017D"}, 77 /* Latvian*/ {"lv", "A:B:C:\u010C:D:E:F:G:\u0122:H:I:J:K:\u0136:L:\u013B:M:N:\u0145:O:P:Q:R:S:\u0160:T:U:V:W:X:Z:\u017D"}, 78 /* Norwegian Bokm\u00E5l*/ {"nb", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C6:\u00D8:\u00C5"}, 79 /* Dutch*/ {"nl", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 80 /* Polish*/ {"pl", "A:\u0104:B:C:\u0106:D:E:\u0118:F:G:H:I:J:K:L:\u0141:M:N:\u0143:O:\u00D3:P:Q:R:S:\u015A:T:U:V:W:X:Y:Z:\u0179:\u017B"}, 81 /* Portuguese*/ {"pt", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 82 /* Romanian*/ {"ro", "A:\u0102:\u00C2:B:C:D:E:F:G:H:I:\u00CE:J:K:L:M:N:O:P:Q:R:S:\u0218:T:\u021A:U:V:W:X:Y:Z"}, 83 /* Russian*/ {"ru", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0415:\u0416:\u0417:\u0418:\u0419:\u041A:\u041B:\u041C:\u041D:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u0428:\u0429:\u042B:\u042D:\u042E:\u042F"}, 84 /* Slovak*/ {"sk", "A:\u00C4:B:C:\u010C:D:E:F:G:H:CH:I:J:K:L:M:N:O:\u00D4:P:Q:R:S:\u0160:T:U:V:W:X:Y:Z:\u017D"}, 85 /* Slovenian*/ {"sl", "A:B:C:\u010C:\u0106:D:\u0110:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:\u0160:T:U:V:W:X:Y:Z:\u017D"}, 86 /* Serbian*/ {"sr", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0402:\u0415:\u0416:\u0417:\u0418:\u0408:\u041A:\u041B:\u0409:\u041C:\u041D:\u040A:\u041E:\u041F:\u0420:\u0421:\u0422:\u040B:\u0423:\u0424:\u0425:\u0426:\u0427:\u040F:\u0428"}, 87 /* Swedish*/ {"sv", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C5:\u00C4:\u00D6"}, 88 /* Turkish*/ {"tr", "A:B:C:\u00C7:D:E:F:G:H:I:\u0130:J:K:L:M:N:O:\u00D6:P:Q:R:S:\u015E:T:U:\u00DC:V:W:X:Y:Z"}, 89 /* Ukrainian*/ {"uk", "\u0410:\u0411:\u0412:\u0413:\u0490:\u0414:\u0415:\u0404:\u0416:\u0417:\u0418:\u0406:\u0407:\u0419:\u041A:\u041B:\u041C:\u041D:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u0428:\u0429:\u042E:\u042F"}, 90 /* Vietnamese*/ {"vi", "A:\u0102:\u00C2:B:C:D:\u0110:E:\u00CA:F:G:H:I:J:K:L:M:N:O:\u00D4:\u01A0:P:Q:R:S:T:U:\u01AF:V:W:X:Y:Z"}, 91 /* Chinese*/ {"zh", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 92 /* Chinese (Traditional Han)*/ {"zh_Hant", "1\u5283:2\u5283:3\u5283:4\u5283:5\u5283:6\u5283:7\u5283:8\u5283:9\u5283:10\u5283:11\u5283:12\u5283:13\u5283:14\u5283:15\u5283:16\u5283:17\u5283:18\u5283:19\u5283:20\u5283:21\u5283:22\u5283:23\u5283:24\u5283:25\u5283:26\u5283:27\u5283:28\u5283:29\u5283:30\u5283:31\u5283:32\u5283:33\u5283:35\u5283:36\u5283:39\u5283:48\u5283"}, 93 94 // Comment these out to make the test run faster. Later, make these run under extended 95 96 // /* Afrikaans*/ {"af", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 97 // /* Akan*/ {"ak", "A:B:C:D:E:\u0190:F:G:H:I:J:K:L:M:N:O:\u0186:P:Q:R:S:T:U:V:W:X:Y:Z"}, 98 // /* Asu*/ {"asa", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 99 // /* Azerbaijani*/ {"az", "A:B:C:\u00C7:D:E:\u018F:F:G:\u011E:H:X:I:\u0130:J:K:Q:L:M:N:O:\u00D6:P:R:S:\u015E:T:U:\u00DC:V:W:Y:Z"}, 100 // /* Belarusian*/ {"be", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0415:\u0416:\u0417:\u0406:\u0419:\u041A:\u041B:\u041C:\u041D:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u0428:\u042B:\u042D:\u042E:\u042F"}, 101 // /* Bemba*/ {"bem", "A:B:C:E:F:G:I:J:K:L:M:N:O:P:S:T:U:W:Y"}, 102 // /* Bena*/ {"bez", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:Y:Z"}, 103 // /* Bambara*/ {"bm", "A:B:C:D:E:\u0190:F:G:H:I:J:K:L:M:N:\u019D:\u014A:O:\u0186:P:R:S:T:U:W:Y:Z"}, 104 // /* Tibetan*/ {"bo", "\u0F40:\u0F41:\u0F42:\u0F44:\u0F45:\u0F46:\u0F47:\u0F49:\u0F4F:\u0F50:\u0F51:\u0F53:\u0F54:\u0F55:\u0F56:\u0F58:\u0F59:\u0F5A:\u0F5B:\u0F5D:\u0F5E:\u0F5F:\u0F60:\u0F61:\u0F62:\u0F63:\u0F64:\u0F66:\u0F67:\u0F68"}, 105 // /* Chiga*/ {"cgg", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 106 // /* Cherokee*/ {"chr", "\u13A0:\u13A6:\u13AD:\u13B3:\u13B9:\u13BE:\u13C6:\u13CC:\u13D3:\u13DC:\u13E3:\u13E9:\u13EF"}, 107 // /* Welsh*/ {"cy", "A:B:C:CH:D:E:F:FF:G:H:I:J:L:LL:M:N:O:P:PH:R:RH:S:T:TH:U:W:Y"}, 108 // /* Taita*/ {"dav", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 109 // /* Embu*/ {"ebu", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 110 // /* Ewe*/ {"ee", "A:B:C:D:\u0189:E:\u0190:F:\u0191:G:\u0194:H:I:J:K:L:M:N:\u014A:O:\u0186:P:Q:R:S:T:U:V:\u01B2:W:X:Y:Z"}, 111 // /* Esperanto*/ {"eo", "A:B:C:\u0108:D:E:F:G:\u011C:H:\u0124:I:J:\u0134:K:L:M:N:O:P:R:S:\u015C:T:U:\u016C:V:Z"}, 112 // /* Fulah*/ {"ff", "A:B:\u0181:C:D:\u018A:E:F:G:H:I:J:K:L:M:N:\u014A:O:P:R:S:T:U:W:Y:\u01B3"}, 113 // /* Faroese*/ {"fo", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C6:\u00D8"}, 114 // /* Gusii*/ {"guz", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 115 // /* Hausa*/ {"ha", "A:B:\u0181:C:D:\u018A:E:F:G:H:I:J:K:\u0198:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 116 // /* Igbo*/ {"ig", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 117 // /* Machame*/ {"jmc", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 118 // /* Kabyle*/ {"kab", "A:B:C:D:E:\u0190:F:G:\u0194:H:I:J:K:L:M:N:P:Q:R:S:T:U:W:X:Y:Z"}, 119 // /* Kamba*/ {"kam", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 120 // /* Makonde*/ {"kde", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 121 // /* Kabuverdianu*/ {"kea", "A:B:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:X:Z"}, 122 // /* Koyra Chiini*/ {"khq", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:\u019D:\u014A:O:P:Q:R:S:T:U:W:X:Y:Z"}, 123 // /* Kikuyu*/ {"ki", "A:B:C:D:E:G:H:I:J:K:M:N:O:R:T:U:W:Y"}, 124 // /* Kalenjin*/ {"kln", "A:B:C:D:E:G:H:I:J:K:L:M:N:O:P:R:S:T:U:W:Y"}, 125 // /* Langi*/ {"lag", "A:B:C:D:E:F:G:H:I:\u0197:J:K:L:M:N:O:P:Q:R:S:T:U:\u0244:V:W:X:Y:Z"}, 126 // /* Ganda*/ {"lg", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 127 // /* Luo*/ {"luo", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y"}, 128 // /* Luyia*/ {"luy", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 129 // /* Masai*/ {"mas", "A:B:C:D:E:\u0190:G:H:I:\u0197:J:K:L:M:N:\u014A:O:\u0186:P:R:S:T:U:\u0244:W:Y"}, 130 // /* Meru*/ {"mer", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 131 // /* Morisyen*/ {"mfe", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:X:Y:Z"}, 132 // /* Malagasy*/ {"mg", "A:B:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:V:Y:Z"}, 133 // This should be the correct data. Commented till it is fixed in CLDR collation data. 134 // {"mk", "\u0410:\u0411:\u0412:\u0413:\u0403:\u0414:\u0415:\u0416:\u0417:\u0405:\u0418:\u0408:\u041A:\u040C:\u041B:\u0409:\u041C:\u041D:\u040A:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u040F:\u0428"}, 135 // /* Macedonian*/ {"mk", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0403:\u0415:\u0416:\u0417:\u0405:\u0418:\u0408:\u041A:\u041B:\u0409:\u041C:\u041D:\u040A:\u041E:\u041F:\u0420:\u0421:\u0422:\u040C:\u0423:\u0424:\u0425:\u0426:\u0427:\u040F:\u0428"}, 136 // This should be the correct data. Commented till it is fixed in CLDR collation data. 137 // {"mt", "A:B:C:\u010A:D:E:F:\u0120:G:G\u0126:H:\u0126:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:\u017B:Z"}, 138 // /* Maltese*/ {"mt", "A:B:\u010A:C:D:E:F:\u0120:G:G\u0126:H:\u0126:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:\u017B:Z"}, 139 // /* Nama*/ {"naq", "A:B:C:D:E:F:G:H:I:K:M:N:O:P:Q:R:S:T:U:W:X:Y:Z"}, 140 // /* North Ndebele*/ {"nd", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:S:T:U:V:W:X:Y:Z"}, 141 // /* Norwegian Nynorsk*/ {"nn", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C6:\u00D8:\u00C5"}, 142 // /* Nyankole*/ {"nyn", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 143 // /* Oromo*/ {"om", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 144 // /* Romansh*/ {"rm", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 145 // /* Rombo*/ {"rof", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 146 // /* Kinyarwanda*/ {"rw", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 147 // /* Rwa*/ {"rwk", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 148 // /* Samburu*/ {"saq", "A:B:C:D:E:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y"}, 149 // /* Sena*/ {"seh", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 150 // /* Koyraboro Senni*/ {"ses", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:\u019D:\u014A:O:P:Q:R:S:T:U:W:X:Y:Z"}, 151 // /* Sango*/ {"sg", "A:B:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 152 // /* Tachelhit*/ {"shi", "A:B:C:D:E:\u0190:F:G:\u0194:H:I:J:K:L:M:N:Q:R:S:T:U:W:X:Y:Z"}, 153 // /* Tachelhit (Tifinagh)*/ {"shi_Tfng", "\u2D30:\u2D31:\u2D33:\u2D37:\u2D39:\u2D3B:\u2D3C:\u2D3D:\u2D40:\u2D43:\u2D44:\u2D45:\u2D47:\u2D49:\u2D4A:\u2D4D:\u2D4E:\u2D4F:\u2D53:\u2D54:\u2D55:\u2D56:\u2D59:\u2D5A:\u2D5B:\u2D5C:\u2D5F:\u2D61:\u2D62:\u2D63:\u2D65"}, 154 // /* Shona*/ {"sn", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 155 // /* Teso*/ {"teo", "A:B:C:D:E:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:X:Y"}, 156 // /* Tonga*/ {"to", "A:B:C:D:E:F:G:H:\u02BB:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 157 // /* Central Morocco Tamazight*/ {"tzm", "A:B:C:D:E:\u0190:F:G:\u0194:H:I:J:K:L:M:N:Q:R:S:T:U:W:X:Y:Z"}, 158 // /* Uzbek (Latin)*/ {"uz_Latn", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u02BF"}, 159 // /* Vunjo*/ {"vun", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"}, 160 // /* Soga*/ {"xog", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 161 // /* Yoruba*/ {"yo", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"}, 162 163 }; main(String[] args)164 public static void main(String[] args) throws Exception{ 165 new AlphabeticIndexTest().run(args); 166 } 167 168 // public void TestAAKeyword() { 169 // ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle.getBundleInstance( 170 // ICUResourceBundle.ICU_COLLATION_BASE_NAME, "zh"); 171 // showBundle(rb, 0); 172 // String[] keywords = Collator.getKeywords(); 173 // System.out.println(Arrays.asList(keywords)); 174 // String locale = "zh"; 175 // ULocale ulocale = new ULocale(locale); 176 // for (String keyword : keywords) { 177 // List<String> values = Arrays.asList(Collator.getKeywordValuesForLocale(keyword, ulocale, false)); 178 // List<String> allValues = Arrays.asList(Collator.getKeywordValues(keyword)); 179 // for (String value : allValues) { 180 // System.out.println(keyword + "=" + value); 181 // checkKeyword(locale, value, values.contains(value)); 182 // } 183 // } 184 // } 185 // 186 // private void checkKeyword(String locale, String collationValue, boolean shouldExist) { 187 // final ULocale base = new ULocale(locale); 188 // final ULocale desired = new ULocale(locale + "@collation=" + collationValue); 189 // Collator foo = Collator.getInstance(desired); 190 // ULocale actual = foo.getLocale(ULocale.ACTUAL_LOCALE); 191 // if (shouldExist) { 192 // assertEquals("actual should match desired", desired, actual); 193 // } else { 194 // assertEquals("actual should match base", base, actual); 195 // } 196 // int comp = foo.compare("a", "ā"); 197 // assertEquals("should fall back to default for zh", -1, comp); 198 // } 199 // 200 // /** 201 // * @param rb 202 // * @param i 203 // */ 204 // private static void showBundle(UResourceBundle rb, int i) { 205 // for (String key : rb.keySet()) { 206 // System.out.print("\n" + Utility.repeat(" ", i) + key); 207 // UResourceBundle rb2 = rb.get(key); 208 // showBundle(rb2, i+1); 209 // } 210 // } 211 212 TestA()213 public void TestA() { 214 String[][] tests = {{"zh_Hant", "渡辺", "12劃"}, 215 {"zh", "渡辺", "D"} 216 /*, "zh@collation=unihan", "ja@collation=unihan", "ko@collation=unihan"*/ 217 }; 218 for (String[] test : tests) { 219 AlphabeticIndex<Integer> alphabeticIndex = new AlphabeticIndex<Integer>(new ULocale(test[0])); 220 final String probe = test[1]; 221 final String expectedLabel = test[2]; 222 alphabeticIndex.addRecord(probe, 1); 223 List labels = alphabeticIndex.getBucketLabels(); 224 logln(labels.toString()); 225 Bucket<Integer> bucket = find(alphabeticIndex, probe); 226 assertEquals("locale " + test[0] + " name=" + probe + " in bucket", 227 expectedLabel, bucket.getLabel()); 228 } 229 } 230 find(AlphabeticIndex<Integer> alphabeticIndex, final String probe)231 private Bucket<Integer> find(AlphabeticIndex<Integer> alphabeticIndex, final String probe) { 232 for (Bucket<Integer> bucket : alphabeticIndex) { 233 for (Record<Integer> record : bucket) { 234 if (record.getName().equals(probe)) { 235 return bucket; 236 } 237 } 238 } 239 return null; 240 } 241 TestFirstCharacters()242 public void TestFirstCharacters() { 243 244 AlphabeticIndex alphabeticIndex = new AlphabeticIndex(Locale.ENGLISH); 245 RuleBasedCollator collator = alphabeticIndex.getCollator(); 246 collator.setStrength(Collator.IDENTICAL); 247 Collection<String> firsts = alphabeticIndex.getFirstCharactersInScripts(); 248 // Verify that each script is represented exactly once. 249 UnicodeSet missingScripts = new UnicodeSet("[^[:sc=inherited:][:sc=unknown:][:sc=common:][:Script=Braille:]]"); 250 String last = ""; 251 for (String index : firsts) { 252 if (collator.compare(last,index) >= 0) { 253 errln("Characters not in order: " + last + " !< " + index); 254 } 255 int script = getFirstRealScript(index); 256 if (script == UScript.UNKNOWN) { continue; } 257 UnicodeSet s = new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, script); 258 if (missingScripts.containsNone(s)) { 259 errln("2nd character in script: " + index + "\t" + new UnicodeSet(missingScripts).retainAll(s).toPattern(false)); 260 } 261 missingScripts.removeAll(s); 262 } 263 if (missingScripts.size() != 0) { 264 String missingScriptNames = ""; 265 UnicodeSet missingChars = new UnicodeSet(missingScripts); 266 for(;;) { 267 int c = missingChars.charAt(0); 268 if (c < 0) { 269 break; 270 } 271 int script = UScript.getScript(c); 272 missingScriptNames += " " + 273 UCharacter.getPropertyValueName( 274 UProperty.SCRIPT, script, UProperty.NameChoice.SHORT); 275 missingChars.removeAll(new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, script)); 276 } 277 errln("Missing character from:" + missingScriptNames + " -- " + missingScripts); 278 } 279 } 280 getFirstRealScript(CharSequence s)281 private static final int getFirstRealScript(CharSequence s) { 282 for (int i = 0; i < s.length();) { 283 int c = Character.codePointAt(s, i); 284 int script = UScript.getScript(c); 285 if (script != UScript.UNKNOWN && script != UScript.INHERITED && script != UScript.COMMON) { 286 return script; 287 } 288 i += Character.charCount(c); 289 } 290 return UScript.UNKNOWN; 291 } 292 TestBuckets()293 public void TestBuckets() { 294 ULocale additionalLocale = ULocale.ENGLISH; 295 296 for (String[] pair : localeAndIndexCharactersLists) { 297 checkBuckets(pair[0], SimpleTests, additionalLocale, "E", "edgar", "Effron", "Effron"); 298 } 299 } 300 TestEmpty()301 public void TestEmpty() { 302 // just verify that it doesn't blow up. 303 Set<ULocale> locales = new LinkedHashSet<ULocale>(); 304 locales.add(ULocale.ROOT); 305 locales.addAll(Arrays.asList(ULocale.getAvailableLocales())); 306 for (ULocale locale : locales) { 307 try { 308 AlphabeticIndex<String> alphabeticIndex = new AlphabeticIndex(locale); 309 alphabeticIndex.addRecord("hi", "HI"); 310 for (Bucket<String> bucket : alphabeticIndex) { 311 @SuppressWarnings("unused") 312 LabelType labelType = bucket.getLabelType(); 313 } 314 } catch (Exception e) { 315 errln("Exception when creating AlphabeticIndex for:\t" + locale.toLanguageTag()); 316 errln(e.toString()); 317 } 318 } 319 } 320 TestInflow()321 public void TestInflow() { 322 Object[][] tests = { 323 {0, ULocale.ENGLISH}, 324 {0, ULocale.ENGLISH, new ULocale("el")}, 325 {1, ULocale.ENGLISH, new ULocale("ru")}, 326 {0, ULocale.ENGLISH, new ULocale("el"), new UnicodeSet("[\u2C80]"), new ULocale("ru")}, 327 {0, ULocale.ENGLISH}, 328 {2, ULocale.ENGLISH, new ULocale("ru"), ULocale.JAPANESE}, 329 }; 330 for (Object[] test : tests) { 331 int expected = (Integer) test[0]; 332 AlphabeticIndex<Double> alphabeticIndex = new AlphabeticIndex((ULocale)test[1]); 333 for (int i = 2; i < test.length; ++i) { 334 if (test[i] instanceof ULocale) { 335 alphabeticIndex.addLabels((ULocale)test[i]); 336 } else { 337 alphabeticIndex.addLabels((UnicodeSet)test[i]); 338 } 339 } 340 Counter<AlphabeticIndex.Bucket.LabelType> counter = new Counter(); 341 for (Bucket<Double> bucket : alphabeticIndex) { 342 LabelType labelType = bucket.getLabelType(); 343 counter.add(labelType, 1); 344 } 345 String printList = Arrays.asList(test).toString(); 346 assertEquals(LabelType.UNDERFLOW + "\t" + printList, 1, counter.get(LabelType.UNDERFLOW)); 347 assertEquals(LabelType.INFLOW + "\t" + printList, expected, counter.get(LabelType.INFLOW)); 348 if (expected != counter.get(LabelType.INFLOW)) { 349 // for debugging 350 AlphabeticIndex<Double> indexCharacters2 = new AlphabeticIndex((ULocale)test[1]); 351 for (int i = 2; i < test.length; ++i) { 352 if (test[i] instanceof ULocale) { 353 indexCharacters2.addLabels((ULocale)test[i]); 354 } else { 355 indexCharacters2.addLabels((UnicodeSet)test[i]); 356 } 357 } 358 List<Bucket<Double>> buckets = CollectionUtilities.addAll(alphabeticIndex.iterator(), new ArrayList<Bucket<Double>>()); 359 logln(buckets.toString()); 360 } 361 assertEquals(LabelType.OVERFLOW + "\t" + printList, 1, counter.get(LabelType.OVERFLOW)); 362 } 363 } 364 checkBuckets(String localeString, String[] test, ULocale additionalLocale, String testBucket, String... items)365 private void checkBuckets(String localeString, String[] test, ULocale additionalLocale, String testBucket, String... items) { 366 StringBuilder UI = new StringBuilder(); 367 ULocale desiredLocale = new ULocale(localeString); 368 369 // Create a simple index where the values for the strings are Integers, and add the strings 370 AlphabeticIndex<Integer> index = new AlphabeticIndex<Integer>(desiredLocale).addLabels(additionalLocale); 371 int counter = 0; 372 Counter<String> itemCount = new Counter(); 373 for (String item : test) { 374 index.addRecord(item, counter++); 375 itemCount.add(item, 1); 376 } 377 378 List<String> labels = index.getBucketLabels(); 379 ImmutableIndex<Integer> immIndex = index.buildImmutableIndex(); 380 381 logln(desiredLocale + "\t" + desiredLocale.getDisplayName(ULocale.ENGLISH) + " - " + desiredLocale.getDisplayName(desiredLocale) + "\t" 382 + index.getCollator().getLocale(ULocale.ACTUAL_LOCALE)); 383 UI.setLength(0); 384 UI.append(desiredLocale + "\t"); 385 boolean showAll = true; 386 387 // Show index at top. We could skip or gray out empty buckets 388 for (AlphabeticIndex.Bucket<Integer> bucket : index) { 389 if (showAll || bucket.size() != 0) { 390 showLabelAtTop(UI, bucket.getLabel()); 391 } 392 } 393 logln(UI.toString()); 394 395 // Show the buckets with their contents, skipping empty buckets 396 int bucketIndex = 0; 397 for (Bucket<Integer> bucket : index) { 398 assertEquals("bucket label vs. iterator", 399 labels.get(bucketIndex), bucket.getLabel()); 400 assertEquals("bucket label vs. immutable", 401 labels.get(bucketIndex), immIndex.getBucket(bucketIndex).getLabel()); 402 assertEquals("bucket label type vs. immutable", 403 bucket.getLabelType(), immIndex.getBucket(bucketIndex).getLabelType()); 404 for (Record<Integer> r : bucket) { 405 CharSequence name = r.getName(); 406 assertEquals("getBucketIndex(" + name + ")", 407 bucketIndex, index.getBucketIndex(name)); 408 assertEquals("immutable getBucketIndex(" + name + ")", 409 bucketIndex, immIndex.getBucketIndex(name)); 410 } 411 if (bucket.getLabel().equals(testBucket)) { 412 Counter<String> keys = getKeys(bucket); 413 for (String item : items) { 414 long globalCount = itemCount.get(item); 415 long localeCount = keys.get(item); 416 if (globalCount != localeCount) { 417 errln("Error: in " + "'" + testBucket + "', '" + item + "' should have count " 418 + globalCount + " but has count " + localeCount); 419 } 420 421 } 422 } 423 424 if (bucket.size() != 0) { 425 showLabelInList(UI, bucket.getLabel()); 426 for (AlphabeticIndex.Record<Integer> item : bucket) { 427 showIndexedItem(UI, item.getName(), item.getData()); 428 } 429 logln(UI.toString()); 430 } 431 ++bucketIndex; 432 } 433 assertEquals("getBucketCount()", bucketIndex, index.getBucketCount()); 434 assertEquals("immutable getBucketCount()", bucketIndex, immIndex.getBucketCount()); 435 436 assertNull("immutable getBucket(-1)", immIndex.getBucket(-1)); 437 assertNull("immutable getBucket(count)", immIndex.getBucket(bucketIndex)); 438 439 for (Bucket<Integer> bucket : immIndex) { 440 assertEquals("immutable bucket size", 0, bucket.size()); 441 assertFalse("immutable bucket iterator.hasNext()", bucket.iterator().hasNext()); 442 } 443 } 444 showIndex(AlphabeticIndex<T> index, boolean showEmpty)445 public <T> void showIndex(AlphabeticIndex<T> index, boolean showEmpty) { 446 logln("Actual"); 447 StringBuilder UI = new StringBuilder(); 448 for (Bucket<T> bucket : index) { 449 if (showEmpty || bucket.size() != 0) { 450 showLabelInList(UI, bucket.getLabel()); 451 for (Record<T> item : bucket) { 452 showIndexedItem(UI, item.getName(), item.getData()); 453 } 454 logln(UI.toString()); 455 } 456 } 457 } 458 459 /** 460 * @param myBucketLabels 461 * @param myBucketContents 462 * @param b 463 */ showIndex(List<String> myBucketLabels, ArrayList<Set<R4<RawCollationKey, String, Integer, Double>>> myBucketContents, boolean showEmpty)464 private void showIndex(List<String> myBucketLabels, ArrayList<Set<R4<RawCollationKey, String, Integer, Double>>> myBucketContents, boolean showEmpty) { 465 logln("Alternative"); 466 StringBuilder UI = new StringBuilder(); 467 468 for (int i = 0; i < myBucketLabels.size(); ++i) { 469 Set<R4<RawCollationKey, String, Integer, Double>> bucket = myBucketContents.get(i); 470 if (!showEmpty && bucket.size() == 0) { 471 continue; 472 } 473 UI.setLength(0); 474 UI.append("*").append(myBucketLabels.get(i)); 475 for (R4<RawCollationKey, String, Integer, Double> item : bucket) { 476 UI.append("\t ").append(item.get1().toString()).append(ARROW).append(item.get3().toString()); 477 } 478 logln(UI.toString()); 479 } 480 } 481 showLabelAtTop(StringBuilder buffer, String label)482 private void showLabelAtTop(StringBuilder buffer, String label) { 483 buffer.append(label + " "); 484 } 485 showIndexedItem(StringBuilder buffer, CharSequence key, T value)486 private <T> void showIndexedItem(StringBuilder buffer, CharSequence key, T value) { 487 buffer.append("\t " + key + ARROW + value); 488 } 489 showLabelInList(StringBuilder buffer, String label)490 private void showLabelInList(StringBuilder buffer, String label) { 491 buffer.setLength(0); 492 buffer.append(label); 493 } 494 getKeys(AlphabeticIndex.Bucket<Integer> entry)495 private Counter<String> getKeys(AlphabeticIndex.Bucket<Integer> entry) { 496 Counter<String> keys = new Counter<String>(); 497 for (AlphabeticIndex.Record x : entry) { 498 String key = x.getName().toString(); 499 keys.add(key, 1); 500 } 501 return keys; 502 } 503 TestIndexCharactersList()504 public void TestIndexCharactersList() { 505 for (String[] localeAndIndexCharacters : localeAndIndexCharactersLists) { 506 ULocale locale = new ULocale(localeAndIndexCharacters[0]); 507 String expectedIndexCharacters = "\u2026:" + localeAndIndexCharacters[1] + ":\u2026"; 508 Collection<String> alphabeticIndex = new AlphabeticIndex(locale).getBucketLabels(); 509 510 // Join the elements of the list to a string with delimiter ":" 511 StringBuilder sb = new StringBuilder(); 512 Iterator<String> iter = alphabeticIndex.iterator(); 513 while (iter.hasNext()) { 514 sb.append(iter.next()); 515 if (!iter.hasNext()) { 516 break; 517 } 518 sb.append(":"); 519 } 520 String actualIndexCharacters = sb.toString(); 521 if (!expectedIndexCharacters.equals(actualIndexCharacters)) { 522 errln("Test failed for locale " + localeAndIndexCharacters[0] + 523 "\n Expected = |" + expectedIndexCharacters + "|\n actual = |" + actualIndexCharacters + "|"); 524 } 525 } 526 } 527 TestBasics()528 public void TestBasics() { 529 ULocale[] list = ULocale.getAvailableLocales(); 530 // get keywords combinations 531 // don't bother with multiple combinations at this point 532 List keywords = new ArrayList(); 533 keywords.add(""); 534 535 String[] collationValues = Collator.getKeywordValues("collation"); 536 for (int j = 0; j < collationValues.length; ++j) { 537 keywords.add("@collation=" + collationValues[j]); 538 } 539 540 for (int i = 0; i < list.length; ++i) { 541 for (Iterator it = keywords.iterator(); it.hasNext();) { 542 String collationValue = (String) it.next(); 543 String localeString = list[i].toString(); 544 if (!KEY_LOCALES.contains(localeString)) continue; // TODO change in exhaustive 545 ULocale locale = new ULocale(localeString + collationValue); 546 if (collationValue.length() > 0 && !Collator.getFunctionalEquivalent("collation", locale).equals(locale)) { 547 //logln("Skipping " + locale); 548 continue; 549 } 550 551 if (locale.getCountry().length() != 0) { 552 continue; 553 } 554 boolean isUnihan = collationValue.contains("unihan"); 555 AlphabeticIndex alphabeticIndex = new AlphabeticIndex(locale); 556 if (isUnihan) { 557 // Unihan tailorings have a label per radical, and there are at least 214, 558 // if not more when simplified radicals are distinguished. 559 alphabeticIndex.setMaxLabelCount(500); 560 } 561 final Collection mainChars = alphabeticIndex.getBucketLabels(); 562 String mainCharString = mainChars.toString(); 563 if (mainCharString.length() > 500) { 564 mainCharString = mainCharString.substring(0,500) + "..."; 565 } 566 logln(mainChars.size() + "\t" + locale + "\t" + locale.getDisplayName(ULocale.ENGLISH)); 567 logln("Index:\t" + mainCharString); 568 if (!isUnihan && mainChars.size() > 100) { 569 errln("Index character set too large: " + 570 locale + " [" + mainChars.size() + "]:\n " + mainChars); 571 } 572 } 573 } 574 } 575 TestClientSupport()576 public void TestClientSupport() { 577 for (String localeString : new String[] {"zh"}) { // KEY_LOCALES, new String[] {"zh"} 578 ULocale ulocale = new ULocale(localeString); 579 AlphabeticIndex<Double> alphabeticIndex = new AlphabeticIndex<Double>(ulocale).addLabels(ULocale.ENGLISH); 580 RuleBasedCollator collator = alphabeticIndex.getCollator(); 581 String [][] tests; 582 583 if (!localeString.equals("zh") ) { 584 tests = new String[][] {SimpleTests}; 585 } else { 586 tests = new String[][] {SimpleTests, hackPinyin, simplifiedNames}; 587 } 588 589 for (String [] shortTest : tests) { 590 double testValue = 100; 591 alphabeticIndex.clearRecords(); 592 for (String name : shortTest) { 593 alphabeticIndex.addRecord(name, testValue++); 594 } 595 596 if (DEBUG) showIndex(alphabeticIndex, false); 597 598 // make my own copy 599 testValue = 100; 600 List<String> myBucketLabels = alphabeticIndex.getBucketLabels(); 601 ArrayList<Set<R4<RawCollationKey, String, Integer, Double>>> myBucketContents = new ArrayList<Set<R4<RawCollationKey, String, Integer, Double>>>(myBucketLabels.size()); 602 for (int i = 0; i < myBucketLabels.size(); ++i) { 603 myBucketContents.add(new TreeSet<R4<RawCollationKey, String, Integer, Double>>()); 604 } 605 for (String name : shortTest) { 606 int bucketIndex = alphabeticIndex.getBucketIndex(name); 607 if (bucketIndex > myBucketContents.size()) { 608 alphabeticIndex.getBucketIndex(name); // call again for debugging 609 } 610 Set<R4<RawCollationKey, String, Integer, Double>> myBucket = myBucketContents.get(bucketIndex); 611 RawCollationKey rawCollationKey = collator.getRawCollationKey(name, null); 612 R4<RawCollationKey, String, Integer, Double> row = Row.of(rawCollationKey, name, name.length(), testValue++); 613 myBucket.add(row); 614 } 615 if (DEBUG) showIndex(myBucketLabels, myBucketContents, false); 616 617 // now compare 618 int index = 0; 619 boolean gotError = false; 620 for (AlphabeticIndex.Bucket<Double> bucket : alphabeticIndex) { 621 String bucketLabel = bucket.getLabel(); 622 String myLabel = myBucketLabels.get(index); 623 if (!bucketLabel.equals(myLabel)) { 624 gotError |= !assertEquals(ulocale + "\tBucket Labels (" + index + ")", bucketLabel, myLabel); 625 } 626 Set<R4<RawCollationKey, String, Integer, Double>> myBucket = myBucketContents.get(index); 627 Iterator<R4<RawCollationKey, String, Integer, Double>> myBucketIterator = myBucket.iterator(); 628 int recordIndex = 0; 629 for (Record<Double> record : bucket) { 630 String myName = null; 631 if (myBucketIterator.hasNext()) { 632 R4<RawCollationKey, String, Integer, Double> myRecord = myBucketIterator.next(); 633 myName = (String) myRecord.get1(); 634 } 635 if (!record.getName().equals(myName)) { 636 gotError |= !assertEquals(ulocale + "\t" + bucketLabel + "\t" + "Record Names (" + index + "." + recordIndex++ + ")", record.getName(), myName); 637 } 638 } 639 while (myBucketIterator.hasNext()) { 640 R4<RawCollationKey, String, Integer, Double> myRecord = myBucketIterator.next(); 641 String myName = (String) myRecord.get1(); 642 gotError |= !assertEquals(ulocale + "\t" + bucketLabel + "\t" + "Record Names (" + index + "." + recordIndex++ + ")", null, myName); 643 } 644 index++; 645 } 646 if (gotError) { 647 showIndex(myBucketLabels, myBucketContents, false); 648 showIndex(alphabeticIndex, false); 649 } 650 } 651 } 652 } 653 TestFirstScriptCharacters()654 public void TestFirstScriptCharacters() { 655 Collection<String> firstCharacters = 656 new AlphabeticIndex(ULocale.ENGLISH).getFirstCharactersInScripts(); 657 Collection<String> expectedFirstCharacters = firstStringsInScript((RuleBasedCollator) Collator.getInstance(ULocale.ROOT)); 658 Collection<String> diff = new TreeSet<String>(firstCharacters); 659 diff.removeAll(expectedFirstCharacters); 660 assertTrue("First Characters contains unexpected ones: " + diff, diff.isEmpty()); 661 diff.clear(); 662 diff.addAll(expectedFirstCharacters); 663 diff.removeAll(firstCharacters); 664 assertTrue("First Characters missing expected ones: " + diff, diff.isEmpty()); 665 } 666 667 private static final UnicodeSet TO_TRY = new UnicodeSet("[[:^nfcqc=no:]-[:sc=Common:]-[:sc=Inherited:]-[:sc=Unknown:]]").freeze(); 668 669 /** 670 * Returns a collection of all the "First" characters of scripts, according to the collation. 671 */ firstStringsInScript(RuleBasedCollator ruleBasedCollator)672 private static Collection<String> firstStringsInScript(RuleBasedCollator ruleBasedCollator) { 673 String[] results = new String[UScript.CODE_LIMIT]; 674 for (String current : TO_TRY) { 675 if (ruleBasedCollator.compare(current, "a") < 0) { // we only want "real" script characters, not symbols. 676 continue; 677 } 678 int script = UScript.getScript(current.codePointAt(0)); 679 if (results[script] == null) { 680 results[script] = current; 681 } else if (ruleBasedCollator.compare(current, results[script]) < 0) { 682 results[script] = current; 683 } 684 } 685 686 try { 687 UnicodeSet extras = new UnicodeSet(); 688 UnicodeSet expansions = new UnicodeSet(); 689 ruleBasedCollator.getContractionsAndExpansions(extras, expansions, true); 690 extras.addAll(expansions).removeAll(TO_TRY); 691 if (extras.size() != 0) { 692 Normalizer2 normalizer = Normalizer2.getNFKCInstance(); 693 for (String current : extras) { 694 if (!normalizer.isNormalized(current) || ruleBasedCollator.compare(current, "9") <= 0) { 695 continue; 696 } 697 int script = getFirstRealScript(current); 698 if (script == UScript.UNKNOWN && !isUnassignedBoundary(current)) { continue; } 699 if (results[script] == null) { 700 results[script] = current; 701 } else if (ruleBasedCollator.compare(current, results[script]) < 0) { 702 results[script] = current; 703 } 704 } 705 } 706 } catch (Exception e) { 707 } // why have a checked exception??? 708 709 // TODO: We should not test that we get the same strings, but that we 710 // get strings that sort primary-equal to those from the implementation. 711 712 Collection<String> result = new ArrayList<String>(); 713 for (int i = 0; i < results.length; ++i) { 714 if (results[i] != null) { 715 result.add(results[i]); 716 } 717 } 718 return result; 719 } 720 isUnassignedBoundary(CharSequence s)721 private static final boolean isUnassignedBoundary(CharSequence s) { 722 // The root collator provides a script-first-primary boundary contraction 723 // for the unassigned-implicit range. 724 return s.charAt(0) == 0xfdd1 && 725 UScript.getScript(Character.codePointAt(s, 1)) == UScript.UNKNOWN; 726 } 727 TestZZZ()728 public void TestZZZ() { 729 // int x = 3; 730 // AlphabeticIndex index = new AlphabeticIndex(ULocale.ENGLISH); 731 // UnicodeSet additions = new UnicodeSet(); 732 // additions.add(0x410).add(0x415); // Cyrillic 733 // // additions.add(0x391).add(0x393); // Greek 734 // index.addLabels(additions); 735 // int lc = index.getLabels().size(); 736 // List labels = index.getLabels(); 737 // System.out.println("Label Count = " + lc + "\t" + labels); 738 // System.out.println("Bucket Count =" + index.getBucketCount()); 739 } 740 TestSimplified()741 public void TestSimplified() { 742 checkBuckets("zh", simplifiedNames, ULocale.ENGLISH, "W", "\u897f"); 743 } TestTraditional()744 public void TestTraditional() { 745 checkBuckets("zh_Hant", traditionalNames, ULocale.ENGLISH, "\u4e9f", "\u5357\u9580"); 746 } 747 748 static final String[] SimpleTests = { 749 "斎藤", 750 "\u1f2d\u03c1\u03b1", 751 "$", "\u00a3", "12", "2", 752 "Davis", "Davis", "Abbot", "\u1D05avis", "Zach", "\u1D05avis", "\u01b5", "\u0130stanbul", "Istanbul", "istanbul", "\u0131stanbul", 753 "\u00deor", "\u00c5berg", "\u00d6stlund", 754 "\u1f2d\u03c1\u03b1", "\u1f08\u03b8\u03b7\u03bd\u1fb6", 755 "\u0396\u03b5\u03cd\u03c2", "\u03a0\u03bf\u03c3\u03b5\u03b9\u03b4\u1f63\u03bd", "\u1f0d\u03b9\u03b4\u03b7\u03c2", "\u0394\u03b7\u03bc\u03ae\u03c4\u03b7\u03c1", "\u1f19\u03c3\u03c4\u03b9\u03ac", 756 //"\u1f08\u03c0\u03cc\u03bb\u03bb\u03c9\u03bd", "\u1f0c\u03c1\u03c4\u03b5\u03bc\u03b9\u03c2", "\u1f19\u03c1\u03bc\u1f23\u03c2", "\u1f0c\u03c1\u03b7\u03c2", "\u1f08\u03c6\u03c1\u03bf\u03b4\u03af\u03c4\u03b7", "\u1f2d\u03c6\u03b1\u03b9\u03c3\u03c4\u03bf\u03c2", "\u0394\u03b9\u03cc\u03bd\u03c5\u03c3\u03bf\u03c2", 757 "\u6589\u85e4", "\u4f50\u85e4", "\u9234\u6728", "\u9ad8\u6a4b", "\u7530\u4e2d", "\u6e21\u8fba", "\u4f0a\u85e4", "\u5c71\u672c", "\u4e2d\u6751", "\u5c0f\u6797", "\u658e\u85e4", "\u52a0\u85e4", 758 //"\u5409\u7530", "\u5c71\u7530", "\u4f50\u3005\u6728", "\u5c71\u53e3", "\u677e\u672c", "\u4e95\u4e0a", "\u6728\u6751", "\u6797", "\u6e05\u6c34" 759 }; 760 761 static final String[] hackPinyin = { 762 "a", "\u5416", "\u58ba", // 763 "b", "\u516b", "\u62d4", "\u8500", // 764 "c", "\u5693", "\u7938", "\u9e7e", // 765 "d", "\u5491", "\u8fcf", "\u964a", // 766 "e","\u59b8", "\u92e8", "\u834b", // 767 "f", "\u53d1", "\u9197", "\u99a5", // 768 "g", "\u7324", "\u91d3", "\u8142", // 769 "h", "\u598e", "\u927f", "\u593b", // 770 "j", "\u4e0c", "\u6785", "\u9d58", // 771 "k", "\u5494", "\u958b", "\u7a52", // 772 "l", "\u5783", "\u62c9", "\u9ba5", // 773 "m", "\u5638", "\u9ebb", "\u65c0", // 774 "n", "\u62ff", "\u80ad", "\u685b", // 775 "o", "\u5662", "\u6bee", "\u8bb4", // 776 "p", "\u5991", "\u8019", "\u8c31", // 777 "q", "\u4e03", "\u6053", "\u7f56", // 778 "r", "\u5465", "\u72aa", "\u6e03", // 779 "s", "\u4ee8", "\u9491", "\u93c1", // 780 "t", "\u4ed6", "\u9248", "\u67dd", // 781 "w", "\u5c72", "\u5558", "\u5a7a", // 782 "x", "\u5915", "\u5438", "\u6bbe", // 783 "y", "\u4e2b", "\u82bd", "\u8574", // 784 "z", "\u5e00", "\u707d", "\u5c0a" 785 }; 786 787 static final String[] simplifiedNames = { 788 "Abbot", "Morton", "Zachary", "Williams", "\u8d75", "\u94b1", "\u5b59", "\u674e", "\u5468", "\u5434", "\u90d1", "\u738b", "\u51af", "\u9648", "\u696e", "\u536b", "\u848b", "\u6c88", 789 "\u97e9", "\u6768", "\u6731", "\u79e6", "\u5c24", "\u8bb8", "\u4f55", "\u5415", "\u65bd", "\u5f20", "\u5b54", "\u66f9", "\u4e25", "\u534e", "\u91d1", "\u9b4f", "\u9676", "\u59dc", "\u621a", "\u8c22", "\u90b9", 790 "\u55bb", "\u67cf", "\u6c34", "\u7aa6", "\u7ae0", "\u4e91", "\u82cf", "\u6f58", "\u845b", "\u595a", "\u8303", "\u5f6d", "\u90ce", "\u9c81", "\u97e6", "\u660c", "\u9a6c", "\u82d7", "\u51e4", "\u82b1", "\u65b9", 791 "\u4fde", "\u4efb", "\u8881", "\u67f3", "\u9146", "\u9c8d", "\u53f2", "\u5510", "\u8d39", "\u5ec9", "\u5c91", "\u859b", "\u96f7", "\u8d3a", "\u502a", "\u6c64", "\u6ed5", "\u6bb7", "\u7f57", "\u6bd5", "\u90dd", 792 "\u90ac", "\u5b89", "\u5e38", "\u4e50", "\u4e8e", "\u65f6", "\u5085", "\u76ae", "\u535e", "\u9f50", "\u5eb7", "\u4f0d", "\u4f59", "\u5143", "\u535c", "\u987e", "\u5b5f", "\u5e73", "\u9ec4", "\u548c", "\u7a46", 793 "\u8427", "\u5c39", "\u59da", "\u90b5", "\u6e5b", "\u6c6a", "\u7941", "\u6bdb", "\u79b9", "\u72c4", "\u7c73", "\u8d1d", "\u660e", "\u81e7", "\u8ba1", "\u4f0f", "\u6210", "\u6234", "\u8c08", "\u5b8b", "\u8305", 794 "\u5e9e", "\u718a", "\u7eaa", "\u8212", "\u5c48", "\u9879", "\u795d", "\u8463", "\u6881", "\u675c", "\u962e", "\u84dd", "\u95fd", "\u5e2d", "\u5b63", "\u9ebb", "\u5f3a", "\u8d3e", "\u8def", "\u5a04", "\u5371", 795 "\u6c5f", "\u7ae5", "\u989c", "\u90ed", "\u6885", "\u76db", "\u6797", "\u5201", "\u953a", "\u5f90", "\u4e18", "\u9a86", "\u9ad8", "\u590f", "\u8521", "\u7530", "\u6a0a", "\u80e1", "\u51cc", "\u970d", "\u865e", 796 "\u4e07", "\u652f", "\u67ef", "\u661d", "\u7ba1", "\u5362", "\u83ab", "\u7ecf", "\u623f", "\u88d8", "\u7f2a", "\u5e72", "\u89e3", "\u5e94", "\u5b97", "\u4e01", "\u5ba3", "\u8d32", "\u9093", "\u90c1", "\u5355", 797 "\u676d", "\u6d2a", "\u5305", "\u8bf8", "\u5de6", "\u77f3", "\u5d14", "\u5409", "\u94ae", "\u9f9a", "\u7a0b", "\u5d47", "\u90a2", "\u6ed1", "\u88f4", "\u9646", "\u8363", "\u7fc1", "\u8340", "\u7f8a", "\u65bc", 798 "\u60e0", "\u7504", "\u9eb9", "\u5bb6", "\u5c01", "\u82ae", "\u7fbf", "\u50a8", "\u9773", "\u6c72", "\u90b4", "\u7cdc", "\u677e", "\u4e95", "\u6bb5", "\u5bcc", "\u5deb", "\u4e4c", "\u7126", "\u5df4", "\u5f13", 799 "\u7267", "\u9697", "\u5c71", "\u8c37", "\u8f66", "\u4faf", "\u5b93", "\u84ec", "\u5168", "\u90d7", "\u73ed", "\u4ef0", "\u79cb", "\u4ef2", "\u4f0a", "\u5bab", "\u5b81", "\u4ec7", "\u683e", "\u66b4", "\u7518", 800 "\u659c", "\u5389", "\u620e", "\u7956", "\u6b66", "\u7b26", "\u5218", "\u666f", "\u8a79", "\u675f", "\u9f99", "\u53f6", "\u5e78", "\u53f8", "\u97f6", "\u90dc", "\u9ece", "\u84df", "\u8584", "\u5370", "\u5bbf", 801 "\u767d", "\u6000", "\u84b2", "\u90b0", "\u4ece", "\u9102", "\u7d22", "\u54b8", "\u7c4d", "\u8d56", "\u5353", "\u853a", "\u5c60", "\u8499", "\u6c60", "\u4e54", "\u9634", "\u90c1", "\u80e5", "\u80fd", "\u82cd", 802 "\u53cc", "\u95fb", "\u8398", "\u515a", "\u7fdf", "\u8c2d", "\u8d21", "\u52b3", "\u9004", "\u59ec", "\u7533", "\u6276", "\u5835", "\u5189", "\u5bb0", "\u90e6", "\u96cd", "\u90e4", "\u74a9", "\u6851", "\u6842", 803 "\u6fee", "\u725b", "\u5bff", "\u901a", "\u8fb9", "\u6248", "\u71d5", "\u5180", "\u90cf", "\u6d66", "\u5c1a", "\u519c", "\u6e29", "\u522b", "\u5e84", "\u664f", "\u67f4", "\u77bf", "\u960e", "\u5145", "\u6155", 804 "\u8fde", "\u8339", "\u4e60", "\u5ba6", "\u827e", "\u9c7c", "\u5bb9", "\u5411", "\u53e4", "\u6613", "\u614e", "\u6208", "\u5ed6", "\u5ebe", "\u7ec8", "\u66a8", "\u5c45", "\u8861", "\u6b65", "\u90fd", "\u803f", 805 "\u6ee1", "\u5f18", "\u5321", "\u56fd", "\u6587", "\u5bc7", "\u5e7f", "\u7984", "\u9619", "\u4e1c", "\u6b27", "\u6bb3", "\u6c83", "\u5229", "\u851a", "\u8d8a", "\u5914", "\u9686", "\u5e08", "\u5de9", "\u538d", 806 "\u8042", "\u6641", "\u52fe", "\u6556", "\u878d", "\u51b7", "\u8a3e", "\u8f9b", "\u961a", "\u90a3", "\u7b80", "\u9976", "\u7a7a", "\u66fe", "\u6bcb", "\u6c99", "\u4e5c", "\u517b", "\u97a0", "\u987b", "\u4e30", 807 "\u5de2", "\u5173", "\u84af", "\u76f8", "\u67e5", "\u540e", "\u8346", "\u7ea2", "\u6e38", "\u7afa", "\u6743", "\u9011", "\u76d6", "\u76ca", "\u6853", "\u516c", "\u4e07\u4fdf", "\u53f8\u9a6c", "\u4e0a\u5b98", "\u6b27\u9633", 808 "\u590f\u4faf", "\u8bf8\u845b", "\u95fb\u4eba", "\u4e1c\u65b9", "\u8d6b\u8fde", "\u7687\u752b", "\u5c09\u8fdf", "\u516c\u7f8a", "\u6fb9\u53f0", "\u516c\u51b6", "\u5b97\u653f", "\u6fee\u9633", "\u6df3\u4e8e", "\u5355\u4e8e", "\u592a\u53d4", "\u7533\u5c60", "\u516c\u5b59", "\u4ef2\u5b59", 809 "\u8f69\u8f95", "\u4ee4\u72d0", "\u953a\u79bb", "\u5b87\u6587", "\u957f\u5b59", "\u6155\u5bb9", "\u9c9c\u4e8e", "\u95fe\u4e18", "\u53f8\u5f92", "\u53f8\u7a7a", "\u4e0c\u5b98", "\u53f8\u5bc7", "\u4ec9", "\u7763", "\u5b50\u8f66", "\u989b\u5b59", "\u7aef\u6728", "\u5deb\u9a6c", 810 "\u516c\u897f", "\u6f06\u96d5", "\u4e50\u6b63", "\u58e4\u9a77", "\u516c\u826f", "\u62d3\u62d4", "\u5939\u8c37", "\u5bb0\u7236", "\u8c37\u6881", "\u664b", "\u695a", "\u960e", "\u6cd5", "\u6c5d", "\u9122", "\u6d82", "\u94a6", "\u6bb5\u5e72", "\u767e\u91cc", 811 "\u4e1c\u90ed", "\u5357\u95e8", "\u547c\u5ef6", "\u5f52", "\u6d77", "\u7f8a\u820c", "\u5fae\u751f", "\u5cb3", "\u5e05", "\u7f11", "\u4ea2", "\u51b5", "\u540e", "\u6709", "\u7434", "\u6881\u4e18", "\u5de6\u4e18", "\u4e1c\u95e8", "\u897f\u95e8", 812 "\u5546", "\u725f", "\u4f58", "\u4f74", "\u4f2f", "\u8d4f", "\u5357\u5bab", "\u58a8", "\u54c8", "\u8c2f", "\u7b2a", "\u5e74", "\u7231", "\u9633", "\u4f5f" 813 }; 814 815 static final String[] traditionalNames = { "丁", "Abbot", "Morton", "Zachary", "Williams", "\u8d99", "\u9322", "\u5b6b", 816 "\u674e", "\u5468", "\u5433", "\u912d", "\u738b", "\u99ae", "\u9673", "\u696e", "\u885b", "\u8523", 817 "\u6c88", "\u97d3", "\u694a", "\u6731", "\u79e6", "\u5c24", "\u8a31", "\u4f55", "\u5442", "\u65bd", 818 "\u5f35", "\u5b54", "\u66f9", "\u56b4", "\u83ef", "\u91d1", "\u9b4f", "\u9676", "\u59dc", "\u621a", 819 "\u8b1d", "\u9112", "\u55bb", "\u67cf", "\u6c34", "\u7ac7", "\u7ae0", "\u96f2", "\u8607", "\u6f58", 820 "\u845b", "\u595a", "\u7bc4", "\u5f6d", "\u90ce", "\u9b6f", "\u97cb", "\u660c", "\u99ac", "\u82d7", 821 "\u9cf3", "\u82b1", "\u65b9", "\u4fde", "\u4efb", "\u8881", "\u67f3", "\u9146", "\u9b91", "\u53f2", 822 "\u5510", "\u8cbb", "\u5ec9", "\u5c91", "\u859b", "\u96f7", "\u8cc0", "\u502a", "\u6e6f", "\u6ed5", 823 "\u6bb7", "\u7f85", "\u7562", "\u90dd", "\u9114", "\u5b89", "\u5e38", "\u6a02", "\u65bc", "\u6642", 824 "\u5085", "\u76ae", "\u535e", "\u9f4a", "\u5eb7", "\u4f0d", "\u9918", "\u5143", "\u535c", "\u9867", 825 "\u5b5f", "\u5e73", "\u9ec3", "\u548c", "\u7a46", "\u856d", "\u5c39", "\u59da", "\u90b5", "\u6e5b", 826 "\u6c6a", "\u7941", "\u6bdb", "\u79b9", "\u72c4", "\u7c73", "\u8c9d", "\u660e", "\u81e7", "\u8a08", 827 "\u4f0f", "\u6210", "\u6234", "\u8ac7", "\u5b8b", "\u8305", "\u9f90", "\u718a", "\u7d00", "\u8212", 828 "\u5c48", "\u9805", "\u795d", "\u8463", "\u6881", "\u675c", "\u962e", "\u85cd", "\u95a9", "\u5e2d", 829 "\u5b63", "\u9ebb", "\u5f37", "\u8cc8", "\u8def", "\u5a41", "\u5371", "\u6c5f", "\u7ae5", "\u984f", 830 "\u90ed", "\u6885", "\u76db", "\u6797", "\u5201", "\u937e", "\u5f90", "\u4e18", "\u99f1", "\u9ad8", 831 "\u590f", "\u8521", "\u7530", "\u6a0a", "\u80e1", "\u51cc", "\u970d", "\u865e", "\u842c", "\u652f", 832 "\u67ef", "\u661d", "\u7ba1", "\u76e7", "\u83ab", "\u7d93", "\u623f", "\u88d8", "\u7e46", "\u5e79", 833 "\u89e3", "\u61c9", "\u5b97", "\u4e01", "\u5ba3", "\u8cc1", "\u9127", "\u9b31", "\u55ae", "\u676d", 834 "\u6d2a", "\u5305", "\u8af8", "\u5de6", "\u77f3", "\u5d14", "\u5409", "\u9215", "\u9f94", "\u7a0b", 835 "\u5d47", "\u90a2", "\u6ed1", "\u88f4", "\u9678", "\u69ae", "\u7fc1", "\u8340", "\u7f8a", "\u65bc", 836 "\u60e0", "\u7504", "\u9eb4", "\u5bb6", "\u5c01", "\u82ae", "\u7fbf", "\u5132", "\u9773", "\u6c72", 837 "\u90b4", "\u7cdc", "\u677e", "\u4e95", "\u6bb5", "\u5bcc", "\u5deb", "\u70cf", "\u7126", "\u5df4", 838 "\u5f13", "\u7267", "\u9697", "\u5c71", "\u8c37", "\u8eca", "\u4faf", "\u5b93", "\u84ec", "\u5168", 839 "\u90d7", "\u73ed", "\u4ef0", "\u79cb", "\u4ef2", "\u4f0a", "\u5bae", "\u5be7", "\u4ec7", "\u6b12", 840 "\u66b4", "\u7518", "\u659c", "\u53b2", "\u620e", "\u7956", "\u6b66", "\u7b26", "\u5289", "\u666f", 841 "\u8a79", "\u675f", "\u9f8d", "\u8449", "\u5e78", "\u53f8", "\u97f6", "\u90dc", "\u9ece", "\u858a", 842 "\u8584", "\u5370", "\u5bbf", "\u767d", "\u61f7", "\u84b2", "\u90b0", "\u5f9e", "\u9102", "\u7d22", 843 "\u54b8", "\u7c4d", "\u8cf4", "\u5353", "\u85fa", "\u5c60", "\u8499", "\u6c60", "\u55ac", "\u9670", 844 "\u9b31", "\u80e5", "\u80fd", "\u84bc", "\u96d9", "\u805e", "\u8398", "\u9ee8", "\u7fdf", "\u8b5a", 845 "\u8ca2", "\u52de", "\u9004", "\u59ec", "\u7533", "\u6276", "\u5835", "\u5189", "\u5bb0", "\u9148", 846 "\u96cd", "\u90e4", "\u74a9", "\u6851", "\u6842", "\u6fee", "\u725b", "\u58fd", "\u901a", "\u908a", 847 "\u6248", "\u71d5", "\u5180", "\u90df", "\u6d66", "\u5c1a", "\u8fb2", "\u6eab", "\u5225", "\u838a", 848 "\u664f", "\u67f4", "\u77bf", "\u95bb", "\u5145", "\u6155", "\u9023", "\u8339", "\u7fd2", "\u5ba6", 849 "\u827e", "\u9b5a", "\u5bb9", "\u5411", "\u53e4", "\u6613", "\u614e", "\u6208", "\u5ed6", "\u5ebe", 850 "\u7d42", "\u66a8", "\u5c45", "\u8861", "\u6b65", "\u90fd", "\u803f", "\u6eff", "\u5f18", "\u5321", 851 "\u570b", "\u6587", "\u5bc7", "\u5ee3", "\u797f", "\u95d5", "\u6771", "\u6b50", "\u6bb3", "\u6c83", 852 "\u5229", "\u851a", "\u8d8a", "\u5914", "\u9686", "\u5e2b", "\u978f", "\u5399", "\u8076", "\u6641", 853 "\u52fe", "\u6556", "\u878d", "\u51b7", "\u8a3e", "\u8f9b", "\u95de", "\u90a3", "\u7c21", "\u9952", 854 "\u7a7a", "\u66fe", "\u6bcb", "\u6c99", "\u4e5c", "\u990a", "\u97a0", "\u9808", "\u8c50", "\u5de2", 855 "\u95dc", "\u84af", "\u76f8", "\u67e5", "\u5f8c", "\u834a", "\u7d05", "\u904a", "\u7afa", "\u6b0a", 856 "\u9011", "\u84cb", "\u76ca", "\u6853", "\u516c", "\u4e07\u4fdf", "\u53f8\u99ac", "\u4e0a\u5b98", 857 "\u6b50\u967d", "\u590f\u4faf", "\u8af8\u845b", "\u805e\u4eba", "\u6771\u65b9", "\u8d6b\u9023", 858 "\u7687\u752b", "\u5c09\u9072", "\u516c\u7f8a", "\u6fb9\u53f0", "\u516c\u51b6", "\u5b97\u653f", 859 "\u6fee\u967d", "\u6df3\u4e8e", "\u55ae\u4e8e", "\u592a\u53d4", "\u7533\u5c60", "\u516c\u5b6b", 860 "\u4ef2\u5b6b", "\u8ed2\u8f45", "\u4ee4\u72d0", "\u937e\u96e2", "\u5b87\u6587", "\u9577\u5b6b", 861 "\u6155\u5bb9", "\u9bae\u4e8e", "\u95ad\u4e18", "\u53f8\u5f92", "\u53f8\u7a7a", "\u4e0c\u5b98", 862 "\u53f8\u5bc7", "\u4ec9", "\u7763", "\u5b50\u8eca", "\u9853\u5b6b", "\u7aef\u6728", "\u5deb\u99ac", 863 "\u516c\u897f", "\u6f06\u96d5", "\u6a02\u6b63", "\u58e4\u99df", "\u516c\u826f", "\u62d3\u62d4", 864 "\u593e\u8c37", "\u5bb0\u7236", "\u7a40\u6881", "\u6649", "\u695a", "\u95bb", "\u6cd5", "\u6c5d", "\u9122", 865 "\u5857", "\u6b3d", "\u6bb5\u5e72", "\u767e\u91cc", "\u6771\u90ed", "\u5357\u9580", "\u547c\u5ef6", 866 "\u6b78", "\u6d77", "\u7f8a\u820c", "\u5fae\u751f", "\u5cb3", "\u5e25", "\u7df1", "\u4ea2", "\u6cc1", 867 "\u5f8c", "\u6709", "\u7434", "\u6881\u4e18", "\u5de6\u4e18", "\u6771\u9580", "\u897f\u9580", "\u5546", 868 "\u725f", "\u4f58", "\u4f74", "\u4f2f", "\u8cde", "\u5357\u5bae", "\u58a8", "\u54c8", "\u8b59", "\u7b2a", 869 "\u5e74", "\u611b", "\u967d", "\u4f5f", "\u3401", "\u3422", "\u3426", "\u3493", "\u34A5", "\u34A7", 870 "\u34AA", "\u3536", "\u4A3B", "\u4E00", "\u4E01", "\u4E07", "\u4E0D", "\u4E17", "\u4E23", "\u4E26", 871 "\u4E34", "\u4E82", "\u4EB8", "\u4EB9", "\u511F", "\u512D", "\u513D", "\u513E", "\u53B5", "\u56D4", 872 "\u56D6", "\u7065", "\u7069", "\u706A", "\u7E9E", "\u9750", "\u9F49", "\u9F7E", "\u9F98", "\uD840\uDC35", 873 "\uD840\uDC3D", "\uD840\uDC3E", "\uD840\uDC41", "\uD840\uDC46", "\uD840\uDC4C", "\uD840\uDC4E", 874 "\uD840\uDC53", "\uD840\uDC55", "\uD840\uDC56", "\uD840\uDC5F", "\uD840\uDC60", "\uD840\uDC7A", 875 "\uD840\uDC7B", "\uD840\uDCC8", "\uD840\uDD9E", "\uD840\uDD9F", "\uD840\uDDA0", "\uD840\uDDA1", 876 "\uD841\uDD3B", "\uD842\uDCCA", "\uD842\uDCCB", "\uD842\uDD6C", "\uD842\uDE0B", "\uD842\uDE0C", 877 "\uD842\uDED1", "\uD844\uDD9F", "\uD845\uDD19", "\uD845\uDD1A", "\uD846\uDD3B", "\uD84C\uDF5C", 878 "\uD85A\uDDC4", "\uD85A\uDDC5", "\uD85C\uDD98", "\uD85E\uDCB1", "\uD861\uDC04", "\uD864\uDDD3", 879 "\uD865\uDE63", "\uD869\uDCCA", "\uD86B\uDE9A", }; 880 881 /** 882 * Test AlphabeticIndex vs. root with script reordering. 883 */ TestHaniFirst()884 public void TestHaniFirst() { 885 RuleBasedCollator coll = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT); 886 coll.setReorderCodes(UScript.HAN); 887 AlphabeticIndex index = new AlphabeticIndex(coll); 888 assertEquals("getBucketCount()", 1, index.getBucketCount()); // ... (underflow only) 889 index.addLabels(ULocale.ENGLISH); 890 assertEquals("getBucketCount()", 28, index.getBucketCount()); // ... A-Z ... 891 int bucketIndex = index.getBucketIndex("\u897f"); 892 assertEquals("getBucketIndex(U+897F)", 0, bucketIndex); // underflow bucket 893 bucketIndex = index.getBucketIndex("i"); 894 assertEquals("getBucketIndex(i)", 9, bucketIndex); 895 bucketIndex = index.getBucketIndex("\u03B1"); 896 assertEquals("getBucketIndex(Greek alpha)", 27, bucketIndex); 897 // U+50005 is an unassigned code point which sorts at the end, independent of the Hani group. 898 bucketIndex = index.getBucketIndex(UTF16.valueOf(0x50005)); 899 assertEquals("getBucketIndex(U+50005)", 27, bucketIndex); 900 bucketIndex = index.getBucketIndex("\uFFFF"); 901 assertEquals("getBucketIndex(U+FFFF)", 27, bucketIndex); 902 } 903 904 /** 905 * Test AlphabeticIndex vs. Pinyin with script reordering. 906 */ TestPinyinFirst()907 public void TestPinyinFirst() { 908 RuleBasedCollator coll = (RuleBasedCollator) Collator.getInstance(ULocale.CHINESE); 909 coll.setReorderCodes(UScript.HAN); 910 AlphabeticIndex index = new AlphabeticIndex(coll); 911 assertEquals("getBucketCount()", 28, index.getBucketCount()); // ... A-Z ... 912 index.addLabels(ULocale.CHINESE); 913 assertEquals("getBucketCount()", 28, index.getBucketCount()); // ... A-Z ... 914 int bucketIndex = index.getBucketIndex("\u897f"); 915 assertEquals("getBucketIndex(U+897F)", 'X' - 'A' + 1, bucketIndex); 916 bucketIndex = index.getBucketIndex("i"); 917 assertEquals("getBucketIndex(i)", 9, bucketIndex); 918 bucketIndex = index.getBucketIndex("\u03B1"); 919 assertEquals("getBucketIndex(Greek alpha)", 27, bucketIndex); 920 // U+50005 is an unassigned code point which sorts at the end, independent of the Hani group. 921 bucketIndex = index.getBucketIndex(UTF16.valueOf(0x50005)); 922 assertEquals("getBucketIndex(U+50005)", 27, bucketIndex); 923 bucketIndex = index.getBucketIndex("\uFFFF"); 924 assertEquals("getBucketIndex(U+FFFF)", 27, bucketIndex); 925 } 926 927 /** 928 * Test labels with multiple primary weights. 929 */ TestSchSt()930 public void TestSchSt() { 931 AlphabeticIndex index = new AlphabeticIndex(ULocale.GERMAN); 932 index.addLabels(new UnicodeSet("[Æ{Sch*}{St*}]")); 933 // ... A Æ B-R S Sch St T-Z ... 934 ImmutableIndex immIndex = index.buildImmutableIndex(); 935 assertEquals("getBucketCount()", 31, index.getBucketCount()); 936 assertEquals("immutable getBucketCount()", 31, immIndex.getBucketCount()); 937 String[][] testCases = new String[][] { 938 // name, bucket index, bucket label 939 { "Adelbert", "1", "A" }, 940 { "Afrika", "1", "A" }, 941 { "Æsculap", "2", "Æ" }, 942 { "Aesthet", "2", "Æ" }, 943 { "Berlin", "3", "B" }, 944 { "Rilke", "19", "R" }, 945 { "Sacher", "20", "S" }, 946 { "Seiler", "20", "S" }, 947 { "Sultan", "20", "S" }, 948 { "Schiller", "21", "Sch" }, 949 { "Steiff", "22", "St" }, 950 { "Thomas", "23", "T" } 951 }; 952 List<String> labels = index.getBucketLabels(); 953 for (String[] testCase : testCases) { 954 String name = testCase[0]; 955 int bucketIndex = Integer.valueOf(testCase[1]); 956 String label = testCase[2]; 957 String msg = "getBucketIndex(" + name + ")"; 958 assertEquals(msg, bucketIndex, index.getBucketIndex(name)); 959 msg = "immutable " + msg; 960 assertEquals(msg, bucketIndex, immIndex.getBucketIndex(name)); 961 msg = "bucket label (" + name + ")"; 962 assertEquals(msg, label, labels.get(index.getBucketIndex(name))); 963 msg = "immutable " + msg; 964 assertEquals(msg, label, immIndex.getBucket(bucketIndex).getLabel()); 965 } 966 } 967 968 /** 969 * With no real labels, there should be only the underflow label. 970 */ TestNoLabels()971 public void TestNoLabels() { 972 RuleBasedCollator coll = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT); 973 AlphabeticIndex<Integer> index = new AlphabeticIndex<Integer>(coll); 974 index.addRecord("\u897f", 0); 975 index.addRecord("i", 0); 976 index.addRecord("\u03B1", 0); 977 assertEquals("getBucketCount()", 1, index.getBucketCount()); // ... 978 Bucket<Integer> bucket = index.iterator().next(); 979 assertEquals("underflow label type", LabelType.UNDERFLOW, bucket.getLabelType()); 980 assertEquals("all records in the underflow bucket", 3, bucket.size()); 981 } 982 983 /** 984 * Test with the Bopomofo-phonetic tailoring. 985 */ TestChineseZhuyin()986 public void TestChineseZhuyin() { 987 AlphabeticIndex index = new AlphabeticIndex(ULocale.forLanguageTag("zh-u-co-zhuyin")); 988 ImmutableIndex immIndex = index.buildImmutableIndex(); 989 assertEquals("getBucketCount()", 38, immIndex.getBucketCount()); // ... ㄅ ㄆ ㄇ ㄈ ㄉ -- ㄩ ... 990 assertEquals("label 1", "ㄅ", immIndex.getBucket(1).getLabel()); 991 assertEquals("label 2", "ㄆ", immIndex.getBucket(2).getLabel()); 992 assertEquals("label 3", "ㄇ", immIndex.getBucket(3).getLabel()); 993 assertEquals("label 4", "ㄈ", immIndex.getBucket(4).getLabel()); 994 assertEquals("label 5", "ㄉ", immIndex.getBucket(5).getLabel()); 995 } 996 TestJapaneseKanji()997 public void TestJapaneseKanji() { 998 AlphabeticIndex index = new AlphabeticIndex(ULocale.JAPANESE); 999 AlphabeticIndex.ImmutableIndex immIndex = index.buildImmutableIndex(); 1000 // There are no index characters for Kanji in the Japanese standard collator. 1001 // They should all go into the overflow bucket. 1002 final int[] kanji = { 0x4E9C, 0x95C7, 0x4E00, 0x58F1 }; 1003 int overflowIndex = immIndex.getBucketCount() - 1; 1004 for(int i = 0; i < kanji.length; ++i) { 1005 String msg = String.format("kanji[%d]=U+%04X in overflow bucket", i, kanji[i]); 1006 assertEquals(msg, overflowIndex, immIndex.getBucketIndex(UTF16.valueOf(kanji[i]))); 1007 } 1008 } 1009 TestFrozenCollator()1010 public void TestFrozenCollator() { 1011 // Ticket #9472 1012 RuleBasedCollator coll = (RuleBasedCollator) Collator.getInstance(new ULocale("da")); 1013 coll.setStrength(Collator.IDENTICAL); 1014 coll.freeze(); 1015 // The AlphabeticIndex constructor used to throw an exception 1016 // because it cloned the collator (which preserves frozenness) 1017 // and set the clone's strength to PRIMARY. 1018 AlphabeticIndex index = new AlphabeticIndex(coll); 1019 assertEquals("same strength as input Collator", 1020 Collator.IDENTICAL, index.getCollator().getStrength()); 1021 } 1022 TestChineseUnihan()1023 public void TestChineseUnihan() { 1024 AlphabeticIndex index = new AlphabeticIndex(new ULocale("zh-u-co-unihan")); 1025 index.setMaxLabelCount(500); // ICU 54 default is 99. 1026 AlphabeticIndex.ImmutableIndex immIndex = index.buildImmutableIndex(); 1027 int bucketCount = immIndex.getBucketCount(); 1028 if(bucketCount < 216) { 1029 // There should be at least an underflow and overflow label, 1030 // and one for each of 214 radicals, 1031 // and maybe additional labels for simplified radicals. 1032 // (ICU4C: dataerrln(), prints only a warning if the data is missing) 1033 errln("too few buckets/labels for Chinese/unihan: " + bucketCount + 1034 " (is zh/unihan data available?)"); 1035 return; 1036 } else { 1037 logln("Chinese/unihan has " + bucketCount + " buckets/labels"); 1038 } 1039 // bucketIndex = radical number, adjusted for simplified radicals in lower buckets. 1040 int bucketIndex = index.getBucketIndex("\u4e5d"); 1041 assertEquals("getBucketIndex(U+4E5D)", 5, bucketIndex); 1042 bucketIndex = index.getBucketIndex("\u7527"); 1043 assertEquals("getBucketIndex(U+7527)", 100, bucketIndex); 1044 } 1045 } 1046