1 /*
2  *******************************************************************************
3  * Copyright (C) 2008-2014, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  *******************************************************************************
6  */
7 package com.ibm.icu.dev.test.collator;
8 import java.util.ArrayList;
9 import java.util.Arrays;
10 import java.util.Collection;
11 import java.util.Iterator;
12 import java.util.LinkedHashSet;
13 import java.util.List;
14 import java.util.Locale;
15 import java.util.Set;
16 import java.util.TreeSet;
17 
18 import com.ibm.icu.dev.test.TestFmwk;
19 import com.ibm.icu.dev.util.CollectionUtilities;
20 import com.ibm.icu.impl.ICUDebug;
21 import com.ibm.icu.impl.Row;
22 import com.ibm.icu.impl.Row.R4;
23 import com.ibm.icu.lang.UCharacter;
24 import com.ibm.icu.lang.UProperty;
25 import com.ibm.icu.lang.UScript;
26 import com.ibm.icu.text.AlphabeticIndex;
27 import com.ibm.icu.text.AlphabeticIndex.Bucket;
28 import com.ibm.icu.text.AlphabeticIndex.Bucket.LabelType;
29 import com.ibm.icu.text.AlphabeticIndex.ImmutableIndex;
30 import com.ibm.icu.text.AlphabeticIndex.Record;
31 import com.ibm.icu.text.Collator;
32 import com.ibm.icu.text.Normalizer2;
33 import com.ibm.icu.text.RawCollationKey;
34 import com.ibm.icu.text.RuleBasedCollator;
35 import com.ibm.icu.text.UTF16;
36 import com.ibm.icu.text.UnicodeSet;
37 import com.ibm.icu.util.ULocale;
38 
39 /**
40  * @author Mark Davis
41  */
42 public class AlphabeticIndexTest extends TestFmwk {
    /**
     * Right-arrow character (U+2192) written between an item's key and its value
     * when index contents are rendered for logging.
     */
    private static final String ARROW = "\u2192";
    /** Debug switch, read once from {@code ICUDebug.enabled("alphabeticindex")}. */
    private static final boolean DEBUG = ICUDebug.enabled("alphabeticindex");
48 
49     public static Set<String> KEY_LOCALES = new LinkedHashSet(Arrays.asList(
50             "en", "es", "de", "fr", "ja", "it", "tr", "pt", "zh", "nl",
51             "pl", "ar", "ru", "zh_Hant", "ko", "th", "sv", "fi", "da",
52             "he", "nb", "el", "hr", "bg", "sk", "lt", "vi", "lv", "sr",
53             "pt_PT", "ro", "hu", "cs", "id", "sl", "fil", "fa", "uk",
54             "ca", "hi", "et", "eu", "is", "sw", "ms", "bn", "am", "ta",
55             "te", "mr", "ur", "ml", "kn", "gu", "or"));
56     private String[][] localeAndIndexCharactersLists = new String[][] {
57             /* Arabic*/ {"ar", "\u0627:\u0628:\u062A:\u062B:\u062C:\u062D:\u062E:\u062F:\u0630:\u0631:\u0632:\u0633:\u0634:\u0635:\u0636:\u0637:\u0638:\u0639:\u063A:\u0641:\u0642:\u0643:\u0644:\u0645:\u0646:\u0647:\u0648:\u064A"},
58             /* Bulgarian*/  {"bg", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0415:\u0416:\u0417:\u0418:\u0419:\u041A:\u041B:\u041C:\u041D:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u0428:\u0429:\u042E:\u042F"},
59             /* Catalan*/    {"ca", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
60             /* Czech*/  {"cs", "A:B:C:\u010C:D:E:F:G:H:CH:I:J:K:L:M:N:O:P:Q:R:\u0158:S:\u0160:T:U:V:W:X:Y:Z:\u017D"},
61             /* Danish*/ {"da", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C6:\u00D8:\u00C5"},
62             /* German*/ {"de", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
63             /* Greek*/  {"el", "\u0391:\u0392:\u0393:\u0394:\u0395:\u0396:\u0397:\u0398:\u0399:\u039A:\u039B:\u039C:\u039D:\u039E:\u039F:\u03A0:\u03A1:\u03A3:\u03A4:\u03A5:\u03A6:\u03A7:\u03A8:\u03A9"},
64             /* English*/    {"en", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
65             /* Spanish*/    {"es", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:\u00D1:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
66             /* Estonian*/   {"et", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:\u0160:Z:\u017D:T:U:V:\u00D5:\u00C4:\u00D6:\u00DC:X:Y"},
67             /* Basque*/ {"eu", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
68             /* Finnish*/    {"fi", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C5:\u00C4:\u00D6"},
69             /* Filipino*/   {"fil", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
70             /* French*/ {"fr", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
71             /* Hebrew*/ {"he", "\u05D0:\u05D1:\u05D2:\u05D3:\u05D4:\u05D5:\u05D6:\u05D7:\u05D8:\u05D9:\u05DB:\u05DC:\u05DE:\u05E0:\u05E1:\u05E2:\u05E4:\u05E6:\u05E7:\u05E8:\u05E9:\u05EA"},
72             /* Icelandic*/  {"is", "A:\u00C1:B:C:D:\u00D0:E:\u00C9:F:G:H:I:\u00CD:J:K:L:M:N:O:\u00D3:P:Q:R:S:T:U:\u00DA:V:W:X:Y:\u00DD:Z:\u00DE:\u00C6:\u00D6"},
73             /* Italian*/    {"it", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
74             /* Japanese*/   {"ja", "\u3042:\u304B:\u3055:\u305F:\u306A:\u306F:\u307E:\u3084:\u3089:\u308F"},
75             /* Korean*/ {"ko", "\u3131:\u3134:\u3137:\u3139:\u3141:\u3142:\u3145:\u3147:\u3148:\u314A:\u314B:\u314C:\u314D:\u314E"},
76             /* Lithuanian*/ {"lt", "A:B:C:\u010C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:\u0160:T:U:V:Z:\u017D"},
77             /* Latvian*/    {"lv", "A:B:C:\u010C:D:E:F:G:\u0122:H:I:J:K:\u0136:L:\u013B:M:N:\u0145:O:P:Q:R:S:\u0160:T:U:V:W:X:Z:\u017D"},
78             /* Norwegian Bokm\u00E5l*/  {"nb", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C6:\u00D8:\u00C5"},
79             /* Dutch*/  {"nl", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
80             /* Polish*/ {"pl", "A:\u0104:B:C:\u0106:D:E:\u0118:F:G:H:I:J:K:L:\u0141:M:N:\u0143:O:\u00D3:P:Q:R:S:\u015A:T:U:V:W:X:Y:Z:\u0179:\u017B"},
81             /* Portuguese*/ {"pt", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
82             /* Romanian*/   {"ro", "A:\u0102:\u00C2:B:C:D:E:F:G:H:I:\u00CE:J:K:L:M:N:O:P:Q:R:S:\u0218:T:\u021A:U:V:W:X:Y:Z"},
83             /* Russian*/    {"ru", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0415:\u0416:\u0417:\u0418:\u0419:\u041A:\u041B:\u041C:\u041D:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u0428:\u0429:\u042B:\u042D:\u042E:\u042F"},
84             /* Slovak*/ {"sk", "A:\u00C4:B:C:\u010C:D:E:F:G:H:CH:I:J:K:L:M:N:O:\u00D4:P:Q:R:S:\u0160:T:U:V:W:X:Y:Z:\u017D"},
85             /* Slovenian*/  {"sl", "A:B:C:\u010C:\u0106:D:\u0110:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:\u0160:T:U:V:W:X:Y:Z:\u017D"},
86             /* Serbian*/    {"sr", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0402:\u0415:\u0416:\u0417:\u0418:\u0408:\u041A:\u041B:\u0409:\u041C:\u041D:\u040A:\u041E:\u041F:\u0420:\u0421:\u0422:\u040B:\u0423:\u0424:\u0425:\u0426:\u0427:\u040F:\u0428"},
87             /* Swedish*/    {"sv", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C5:\u00C4:\u00D6"},
88             /* Turkish*/    {"tr", "A:B:C:\u00C7:D:E:F:G:H:I:\u0130:J:K:L:M:N:O:\u00D6:P:Q:R:S:\u015E:T:U:\u00DC:V:W:X:Y:Z"},
89             /* Ukrainian*/  {"uk", "\u0410:\u0411:\u0412:\u0413:\u0490:\u0414:\u0415:\u0404:\u0416:\u0417:\u0418:\u0406:\u0407:\u0419:\u041A:\u041B:\u041C:\u041D:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u0428:\u0429:\u042E:\u042F"},
90             /* Vietnamese*/ {"vi", "A:\u0102:\u00C2:B:C:D:\u0110:E:\u00CA:F:G:H:I:J:K:L:M:N:O:\u00D4:\u01A0:P:Q:R:S:T:U:\u01AF:V:W:X:Y:Z"},
91             /* Chinese*/    {"zh", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
92             /* Chinese (Traditional Han)*/  {"zh_Hant", "1\u5283:2\u5283:3\u5283:4\u5283:5\u5283:6\u5283:7\u5283:8\u5283:9\u5283:10\u5283:11\u5283:12\u5283:13\u5283:14\u5283:15\u5283:16\u5283:17\u5283:18\u5283:19\u5283:20\u5283:21\u5283:22\u5283:23\u5283:24\u5283:25\u5283:26\u5283:27\u5283:28\u5283:29\u5283:30\u5283:31\u5283:32\u5283:33\u5283:35\u5283:36\u5283:39\u5283:48\u5283"},
93 
            // The entries below are commented out to make the test run faster. Later, make them run under extended testing.
95 
96             //            /* Afrikaans*/  {"af", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
97             //            /* Akan*/   {"ak", "A:B:C:D:E:\u0190:F:G:H:I:J:K:L:M:N:O:\u0186:P:Q:R:S:T:U:V:W:X:Y:Z"},
98             //            /* Asu*/    {"asa", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"},
99             //            /* Azerbaijani*/    {"az", "A:B:C:\u00C7:D:E:\u018F:F:G:\u011E:H:X:I:\u0130:J:K:Q:L:M:N:O:\u00D6:P:R:S:\u015E:T:U:\u00DC:V:W:Y:Z"},
100             //            /* Belarusian*/ {"be", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0415:\u0416:\u0417:\u0406:\u0419:\u041A:\u041B:\u041C:\u041D:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u0428:\u042B:\u042D:\u042E:\u042F"},
101             //            /* Bemba*/  {"bem", "A:B:C:E:F:G:I:J:K:L:M:N:O:P:S:T:U:W:Y"},
102             //            /* Bena*/   {"bez", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:Y:Z"},
103             //            /* Bambara*/    {"bm", "A:B:C:D:E:\u0190:F:G:H:I:J:K:L:M:N:\u019D:\u014A:O:\u0186:P:R:S:T:U:W:Y:Z"},
104             //            /* Tibetan*/    {"bo", "\u0F40:\u0F41:\u0F42:\u0F44:\u0F45:\u0F46:\u0F47:\u0F49:\u0F4F:\u0F50:\u0F51:\u0F53:\u0F54:\u0F55:\u0F56:\u0F58:\u0F59:\u0F5A:\u0F5B:\u0F5D:\u0F5E:\u0F5F:\u0F60:\u0F61:\u0F62:\u0F63:\u0F64:\u0F66:\u0F67:\u0F68"},
105             //            /* Chiga*/  {"cgg", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
106             //            /* Cherokee*/   {"chr", "\u13A0:\u13A6:\u13AD:\u13B3:\u13B9:\u13BE:\u13C6:\u13CC:\u13D3:\u13DC:\u13E3:\u13E9:\u13EF"},
107             //            /* Welsh*/  {"cy", "A:B:C:CH:D:E:F:FF:G:H:I:J:L:LL:M:N:O:P:PH:R:RH:S:T:TH:U:W:Y"},
108             //            /* Taita*/  {"dav", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"},
109             //            /* Embu*/   {"ebu", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
110             //            /* Ewe*/    {"ee", "A:B:C:D:\u0189:E:\u0190:F:\u0191:G:\u0194:H:I:J:K:L:M:N:\u014A:O:\u0186:P:Q:R:S:T:U:V:\u01B2:W:X:Y:Z"},
111             //            /* Esperanto*/  {"eo", "A:B:C:\u0108:D:E:F:G:\u011C:H:\u0124:I:J:\u0134:K:L:M:N:O:P:R:S:\u015C:T:U:\u016C:V:Z"},
112             //            /* Fulah*/  {"ff", "A:B:\u0181:C:D:\u018A:E:F:G:H:I:J:K:L:M:N:\u014A:O:P:R:S:T:U:W:Y:\u01B3"},
113             //            /* Faroese*/    {"fo", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C6:\u00D8"},
114             //            /* Gusii*/  {"guz", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"},
115             //            /* Hausa*/  {"ha", "A:B:\u0181:C:D:\u018A:E:F:G:H:I:J:K:\u0198:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
116             //            /* Igbo*/   {"ig", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
117             //            /* Machame*/    {"jmc", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"},
118             //            /* Kabyle*/ {"kab", "A:B:C:D:E:\u0190:F:G:\u0194:H:I:J:K:L:M:N:P:Q:R:S:T:U:W:X:Y:Z"},
119             //            /* Kamba*/  {"kam", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
120             //            /* Makonde*/    {"kde", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
121             //            /* Kabuverdianu*/   {"kea", "A:B:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:X:Z"},
122             //            /* Koyra Chiini*/   {"khq", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:\u019D:\u014A:O:P:Q:R:S:T:U:W:X:Y:Z"},
123             //            /* Kikuyu*/ {"ki", "A:B:C:D:E:G:H:I:J:K:M:N:O:R:T:U:W:Y"},
124             //            /* Kalenjin*/   {"kln", "A:B:C:D:E:G:H:I:J:K:L:M:N:O:P:R:S:T:U:W:Y"},
125             //            /* Langi*/  {"lag", "A:B:C:D:E:F:G:H:I:\u0197:J:K:L:M:N:O:P:Q:R:S:T:U:\u0244:V:W:X:Y:Z"},
126             //            /* Ganda*/  {"lg", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
127             //            /* Luo*/    {"luo", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y"},
128             //            /* Luyia*/  {"luy", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
129             //            /* Masai*/  {"mas", "A:B:C:D:E:\u0190:G:H:I:\u0197:J:K:L:M:N:\u014A:O:\u0186:P:R:S:T:U:\u0244:W:Y"},
130             //            /* Meru*/   {"mer", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
131             //            /* Morisyen*/   {"mfe", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:X:Y:Z"},
132             //            /* Malagasy*/   {"mg", "A:B:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:V:Y:Z"},
            // This should be the correct data. Commented out until it is fixed in the CLDR collation data.
134             // {"mk", "\u0410:\u0411:\u0412:\u0413:\u0403:\u0414:\u0415:\u0416:\u0417:\u0405:\u0418:\u0408:\u041A:\u040C:\u041B:\u0409:\u041C:\u041D:\u040A:\u041E:\u041F:\u0420:\u0421:\u0422:\u0423:\u0424:\u0425:\u0426:\u0427:\u040F:\u0428"},
135             //            /* Macedonian*/ {"mk", "\u0410:\u0411:\u0412:\u0413:\u0414:\u0403:\u0415:\u0416:\u0417:\u0405:\u0418:\u0408:\u041A:\u041B:\u0409:\u041C:\u041D:\u040A:\u041E:\u041F:\u0420:\u0421:\u0422:\u040C:\u0423:\u0424:\u0425:\u0426:\u0427:\u040F:\u0428"},
            // This should be the correct data. Commented out until it is fixed in the CLDR collation data.
137             // {"mt", "A:B:C:\u010A:D:E:F:\u0120:G:G\u0126:H:\u0126:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:\u017B:Z"},
138             //            /* Maltese*/    {"mt", "A:B:\u010A:C:D:E:F:\u0120:G:G\u0126:H:\u0126:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:\u017B:Z"},
139             //            /* Nama*/   {"naq", "A:B:C:D:E:F:G:H:I:K:M:N:O:P:Q:R:S:T:U:W:X:Y:Z"},
140             //            /* North Ndebele*/  {"nd", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:S:T:U:V:W:X:Y:Z"},
141             //            /* Norwegian Nynorsk*/  {"nn", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u00C6:\u00D8:\u00C5"},
142             //            /* Nyankole*/   {"nyn", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
143             //            /* Oromo*/  {"om", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
144             //            /* Romansh*/    {"rm", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
145             //            /* Rombo*/  {"rof", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"},
146             //            /* Kinyarwanda*/    {"rw", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
147             //            /* Rwa*/    {"rwk", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"},
148             //            /* Samburu*/    {"saq", "A:B:C:D:E:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y"},
149             //            /* Sena*/   {"seh", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
150             //            /* Koyraboro Senni*/    {"ses", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:\u019D:\u014A:O:P:Q:R:S:T:U:W:X:Y:Z"},
151             //            /* Sango*/  {"sg", "A:B:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"},
152             //            /* Tachelhit*/  {"shi", "A:B:C:D:E:\u0190:F:G:\u0194:H:I:J:K:L:M:N:Q:R:S:T:U:W:X:Y:Z"},
153             //            /* Tachelhit (Tifinagh)*/   {"shi_Tfng", "\u2D30:\u2D31:\u2D33:\u2D37:\u2D39:\u2D3B:\u2D3C:\u2D3D:\u2D40:\u2D43:\u2D44:\u2D45:\u2D47:\u2D49:\u2D4A:\u2D4D:\u2D4E:\u2D4F:\u2D53:\u2D54:\u2D55:\u2D56:\u2D59:\u2D5A:\u2D5B:\u2D5C:\u2D5F:\u2D61:\u2D62:\u2D63:\u2D65"},
154             //            /* Shona*/  {"sn", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"},
155             //            /* Teso*/   {"teo", "A:B:C:D:E:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:X:Y"},
156             //            /* Tonga*/  {"to", "A:B:C:D:E:F:G:H:\u02BB:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
157             //            /* Central Morocco Tamazight*/  {"tzm", "A:B:C:D:E:\u0190:F:G:\u0194:H:I:J:K:L:M:N:Q:R:S:T:U:W:X:Y:Z"},
158             //            /* Uzbek (Latin)*/  {"uz_Latn", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z:\u02BF"},
159             //            /* Vunjo*/  {"vun", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:R:S:T:U:V:W:Y:Z"},
160             //            /* Soga*/   {"xog", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
161             //            /* Yoruba*/ {"yo", "A:B:C:D:E:F:G:H:I:J:K:L:M:N:O:P:Q:R:S:T:U:V:W:X:Y:Z"},
162 
163     };
main(String[] args)164     public static void main(String[] args) throws Exception{
165         new AlphabeticIndexTest().run(args);
166     }
167 
168 //    public void TestAAKeyword() {
169 //    ICUResourceBundle rb = (ICUResourceBundle) UResourceBundle.getBundleInstance(
170 //            ICUResourceBundle.ICU_COLLATION_BASE_NAME, "zh");
171 //    showBundle(rb, 0);
172 //        String[] keywords = Collator.getKeywords();
173 //        System.out.println(Arrays.asList(keywords));
174 //        String locale = "zh";
175 //        ULocale ulocale = new ULocale(locale);
176 //        for (String keyword : keywords) {
177 //            List<String> values = Arrays.asList(Collator.getKeywordValuesForLocale(keyword, ulocale, false));
178 //            List<String> allValues = Arrays.asList(Collator.getKeywordValues(keyword));
179 //            for (String value : allValues) {
180 //                System.out.println(keyword + "=" + value);
181 //                checkKeyword(locale, value, values.contains(value));
182 //            }
183 //        }
184 //    }
185 //
186 //    private void checkKeyword(String locale, String collationValue, boolean shouldExist) {
187 //        final ULocale base = new ULocale(locale);
188 //        final ULocale desired = new ULocale(locale + "@collation=" + collationValue);
189 //        Collator foo = Collator.getInstance(desired);
190 //        ULocale actual = foo.getLocale(ULocale.ACTUAL_LOCALE);
191 //        if (shouldExist) {
192 //            assertEquals("actual should match desired", desired, actual);
193 //        } else {
194 //            assertEquals("actual should match base", base, actual);
195 //        }
196 //        int comp = foo.compare("a", "ā");
197 //        assertEquals("should fall back to default for zh", -1, comp);
198 //    }
199 //
200 //    /**
201 //     * @param rb
202 //     * @param i
203 //     */
204 //    private static void showBundle(UResourceBundle rb, int i) {
205 //        for (String key : rb.keySet()) {
206 //            System.out.print("\n" + Utility.repeat("  ", i) + key);
207 //            UResourceBundle rb2 = rb.get(key);
208 //            showBundle(rb2, i+1);
209 //        }
210 //    }
211 
212 
TestA()213     public void TestA() {
214         String[][] tests = {{"zh_Hant", "渡辺", "12劃"},
215                 {"zh", "渡辺", "D"}
216                 /*, "zh@collation=unihan", "ja@collation=unihan", "ko@collation=unihan"*/
217                 };
218         for (String[] test : tests) {
219             AlphabeticIndex<Integer> alphabeticIndex = new AlphabeticIndex<Integer>(new ULocale(test[0]));
220             final String probe = test[1];
221             final String expectedLabel = test[2];
222             alphabeticIndex.addRecord(probe, 1);
223             List labels = alphabeticIndex.getBucketLabels();
224             logln(labels.toString());
225             Bucket<Integer> bucket = find(alphabeticIndex, probe);
226             assertEquals("locale " + test[0] + " name=" + probe + " in bucket",
227                     expectedLabel, bucket.getLabel());
228         }
229     }
230 
find(AlphabeticIndex<Integer> alphabeticIndex, final String probe)231     private Bucket<Integer> find(AlphabeticIndex<Integer> alphabeticIndex, final String probe) {
232         for (Bucket<Integer> bucket : alphabeticIndex) {
233             for (Record<Integer> record : bucket) {
234                 if (record.getName().equals(probe)) {
235                     return bucket;
236                 }
237             }
238         }
239         return null;
240     }
241 
TestFirstCharacters()242     public void TestFirstCharacters() {
243 
244         AlphabeticIndex alphabeticIndex = new AlphabeticIndex(Locale.ENGLISH);
245         RuleBasedCollator collator = alphabeticIndex.getCollator();
246         collator.setStrength(Collator.IDENTICAL);
247         Collection<String> firsts = alphabeticIndex.getFirstCharactersInScripts();
248         // Verify that each script is represented exactly once.
249         UnicodeSet missingScripts = new UnicodeSet("[^[:sc=inherited:][:sc=unknown:][:sc=common:][:Script=Braille:]]");
250         String last = "";
251         for (String index : firsts) {
252             if (collator.compare(last,index) >= 0) {
253                 errln("Characters not in order: " + last + " !< " + index);
254             }
255             int script = getFirstRealScript(index);
256             if (script == UScript.UNKNOWN) { continue; }
257             UnicodeSet s = new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, script);
258             if (missingScripts.containsNone(s)) {
259                 errln("2nd character in script: " + index + "\t" + new UnicodeSet(missingScripts).retainAll(s).toPattern(false));
260             }
261             missingScripts.removeAll(s);
262         }
263         if (missingScripts.size() != 0) {
264             String missingScriptNames = "";
265             UnicodeSet missingChars = new UnicodeSet(missingScripts);
266             for(;;) {
267                 int c = missingChars.charAt(0);
268                 if (c < 0) {
269                     break;
270                 }
271                 int script = UScript.getScript(c);
272                 missingScriptNames += " " +
273                         UCharacter.getPropertyValueName(
274                                 UProperty.SCRIPT, script, UProperty.NameChoice.SHORT);
275                 missingChars.removeAll(new UnicodeSet().applyIntPropertyValue(UProperty.SCRIPT, script));
276             }
277             errln("Missing character from:" + missingScriptNames + " -- " + missingScripts);
278         }
279     }
280 
getFirstRealScript(CharSequence s)281     private static final int getFirstRealScript(CharSequence s) {
282         for (int i = 0; i < s.length();) {
283             int c = Character.codePointAt(s, i);
284             int script = UScript.getScript(c);
285             if (script != UScript.UNKNOWN && script != UScript.INHERITED && script != UScript.COMMON) {
286                 return script;
287             }
288             i += Character.charCount(c);
289         }
290         return UScript.UNKNOWN;
291     }
292 
TestBuckets()293     public void TestBuckets() {
294         ULocale additionalLocale = ULocale.ENGLISH;
295 
296         for (String[] pair : localeAndIndexCharactersLists) {
297             checkBuckets(pair[0], SimpleTests, additionalLocale, "E", "edgar", "Effron", "Effron");
298         }
299     }
300 
TestEmpty()301     public void TestEmpty() {
302         // just verify that it doesn't blow up.
303         Set<ULocale> locales = new LinkedHashSet<ULocale>();
304         locales.add(ULocale.ROOT);
305         locales.addAll(Arrays.asList(ULocale.getAvailableLocales()));
306         for (ULocale locale : locales) {
307             try {
308                 AlphabeticIndex<String> alphabeticIndex = new AlphabeticIndex(locale);
309                 alphabeticIndex.addRecord("hi", "HI");
310                 for (Bucket<String> bucket : alphabeticIndex) {
311                     @SuppressWarnings("unused")
312                     LabelType labelType = bucket.getLabelType();
313                 }
314             } catch (Exception e) {
315                 errln("Exception when creating AlphabeticIndex for:\t" + locale.toLanguageTag());
316                 errln(e.toString());
317             }
318         }
319     }
320 
TestInflow()321     public void TestInflow() {
322         Object[][] tests = {
323                 {0, ULocale.ENGLISH},
324                 {0, ULocale.ENGLISH, new ULocale("el")},
325                 {1, ULocale.ENGLISH, new ULocale("ru")},
326                 {0, ULocale.ENGLISH, new ULocale("el"), new UnicodeSet("[\u2C80]"), new ULocale("ru")},
327                 {0, ULocale.ENGLISH},
328                 {2, ULocale.ENGLISH, new ULocale("ru"), ULocale.JAPANESE},
329         };
330         for (Object[] test : tests) {
331             int expected = (Integer) test[0];
332             AlphabeticIndex<Double> alphabeticIndex = new AlphabeticIndex((ULocale)test[1]);
333             for (int i = 2; i < test.length; ++i) {
334                 if (test[i] instanceof ULocale) {
335                     alphabeticIndex.addLabels((ULocale)test[i]);
336                 } else {
337                     alphabeticIndex.addLabels((UnicodeSet)test[i]);
338                 }
339             }
340             Counter<AlphabeticIndex.Bucket.LabelType> counter = new Counter();
341             for (Bucket<Double> bucket : alphabeticIndex) {
342                 LabelType labelType = bucket.getLabelType();
343                 counter.add(labelType, 1);
344             }
345             String printList = Arrays.asList(test).toString();
346             assertEquals(LabelType.UNDERFLOW + "\t" + printList, 1, counter.get(LabelType.UNDERFLOW));
347             assertEquals(LabelType.INFLOW + "\t" + printList, expected, counter.get(LabelType.INFLOW));
348             if (expected != counter.get(LabelType.INFLOW)) {
349                 // for debugging
350                 AlphabeticIndex<Double> indexCharacters2 = new AlphabeticIndex((ULocale)test[1]);
351                 for (int i = 2; i < test.length; ++i) {
352                     if (test[i] instanceof ULocale) {
353                         indexCharacters2.addLabels((ULocale)test[i]);
354                     } else {
355                         indexCharacters2.addLabels((UnicodeSet)test[i]);
356                     }
357                 }
358                 List<Bucket<Double>> buckets = CollectionUtilities.addAll(alphabeticIndex.iterator(), new ArrayList<Bucket<Double>>());
359                 logln(buckets.toString());
360             }
361             assertEquals(LabelType.OVERFLOW + "\t" + printList, 1, counter.get(LabelType.OVERFLOW));
362         }
363     }
364 
checkBuckets(String localeString, String[] test, ULocale additionalLocale, String testBucket, String... items)365     private void checkBuckets(String localeString, String[] test, ULocale additionalLocale, String testBucket, String... items) {
366         StringBuilder UI = new StringBuilder();
367         ULocale desiredLocale = new ULocale(localeString);
368 
369         // Create a simple index where the values for the strings are Integers, and add the strings
370         AlphabeticIndex<Integer> index = new AlphabeticIndex<Integer>(desiredLocale).addLabels(additionalLocale);
371         int counter = 0;
372         Counter<String> itemCount = new Counter();
373         for (String item : test) {
374             index.addRecord(item, counter++);
375             itemCount.add(item, 1);
376         }
377 
378         List<String> labels = index.getBucketLabels();
379         ImmutableIndex<Integer> immIndex = index.buildImmutableIndex();
380 
381         logln(desiredLocale + "\t" + desiredLocale.getDisplayName(ULocale.ENGLISH) + " - " + desiredLocale.getDisplayName(desiredLocale) + "\t"
382                 + index.getCollator().getLocale(ULocale.ACTUAL_LOCALE));
383         UI.setLength(0);
384         UI.append(desiredLocale + "\t");
385         boolean showAll = true;
386 
387         // Show index at top. We could skip or gray out empty buckets
388         for (AlphabeticIndex.Bucket<Integer> bucket : index) {
389             if (showAll || bucket.size() != 0) {
390                 showLabelAtTop(UI, bucket.getLabel());
391             }
392         }
393         logln(UI.toString());
394 
395         // Show the buckets with their contents, skipping empty buckets
396         int bucketIndex = 0;
397         for (Bucket<Integer> bucket : index) {
398             assertEquals("bucket label vs. iterator",
399                     labels.get(bucketIndex), bucket.getLabel());
400             assertEquals("bucket label vs. immutable",
401                     labels.get(bucketIndex), immIndex.getBucket(bucketIndex).getLabel());
402             assertEquals("bucket label type vs. immutable",
403                     bucket.getLabelType(), immIndex.getBucket(bucketIndex).getLabelType());
404             for (Record<Integer> r : bucket) {
405                 CharSequence name = r.getName();
406                 assertEquals("getBucketIndex(" + name + ")",
407                         bucketIndex, index.getBucketIndex(name));
408                 assertEquals("immutable getBucketIndex(" + name + ")",
409                         bucketIndex, immIndex.getBucketIndex(name));
410             }
411             if (bucket.getLabel().equals(testBucket)) {
412                 Counter<String> keys = getKeys(bucket);
413                 for (String item : items) {
414                     long globalCount = itemCount.get(item);
415                     long localeCount = keys.get(item);
416                     if (globalCount != localeCount) {
417                         errln("Error: in " + "'" + testBucket + "', '" + item + "' should have count "
418                                 + globalCount + " but has count " + localeCount);
419                     }
420 
421                 }
422             }
423 
424             if (bucket.size() != 0) {
425                 showLabelInList(UI, bucket.getLabel());
426                 for (AlphabeticIndex.Record<Integer> item : bucket) {
427                     showIndexedItem(UI, item.getName(), item.getData());
428                 }
429                 logln(UI.toString());
430             }
431             ++bucketIndex;
432         }
433         assertEquals("getBucketCount()", bucketIndex, index.getBucketCount());
434         assertEquals("immutable getBucketCount()", bucketIndex, immIndex.getBucketCount());
435 
436         assertNull("immutable getBucket(-1)", immIndex.getBucket(-1));
437         assertNull("immutable getBucket(count)", immIndex.getBucket(bucketIndex));
438 
439         for (Bucket<Integer> bucket : immIndex) {
440             assertEquals("immutable bucket size", 0, bucket.size());
441             assertFalse("immutable bucket iterator.hasNext()", bucket.iterator().hasNext());
442         }
443     }
444 
showIndex(AlphabeticIndex<T> index, boolean showEmpty)445     public <T> void showIndex(AlphabeticIndex<T> index, boolean showEmpty) {
446         logln("Actual");
447         StringBuilder UI = new StringBuilder();
448         for (Bucket<T> bucket : index) {
449             if (showEmpty || bucket.size() != 0) {
450                 showLabelInList(UI, bucket.getLabel());
451                 for (Record<T> item : bucket) {
452                     showIndexedItem(UI, item.getName(), item.getData());
453                 }
454                 logln(UI.toString());
455             }
456         }
457     }
458 
459     /**
460      * @param myBucketLabels
461      * @param myBucketContents
462      * @param b
463      */
showIndex(List<String> myBucketLabels, ArrayList<Set<R4<RawCollationKey, String, Integer, Double>>> myBucketContents, boolean showEmpty)464     private void showIndex(List<String> myBucketLabels, ArrayList<Set<R4<RawCollationKey, String, Integer, Double>>> myBucketContents, boolean showEmpty) {
465         logln("Alternative");
466         StringBuilder UI = new StringBuilder();
467 
468         for (int i = 0; i < myBucketLabels.size(); ++i) {
469             Set<R4<RawCollationKey, String, Integer, Double>> bucket = myBucketContents.get(i);
470             if (!showEmpty && bucket.size() == 0) {
471                 continue;
472             }
473             UI.setLength(0);
474             UI.append("*").append(myBucketLabels.get(i));
475             for (R4<RawCollationKey, String, Integer, Double> item : bucket) {
476                 UI.append("\t ").append(item.get1().toString()).append(ARROW).append(item.get3().toString());
477             }
478             logln(UI.toString());
479         }
480     }
481 
showLabelAtTop(StringBuilder buffer, String label)482     private void showLabelAtTop(StringBuilder buffer, String label) {
483         buffer.append(label + " ");
484     }
485 
showIndexedItem(StringBuilder buffer, CharSequence key, T value)486     private <T> void showIndexedItem(StringBuilder buffer, CharSequence key, T value) {
487         buffer.append("\t " + key + ARROW + value);
488     }
489 
showLabelInList(StringBuilder buffer, String label)490     private void showLabelInList(StringBuilder buffer, String label) {
491         buffer.setLength(0);
492         buffer.append(label);
493     }
494 
getKeys(AlphabeticIndex.Bucket<Integer> entry)495     private Counter<String> getKeys(AlphabeticIndex.Bucket<Integer> entry) {
496         Counter<String> keys = new Counter<String>();
497         for (AlphabeticIndex.Record x : entry) {
498             String key = x.getName().toString();
499             keys.add(key, 1);
500         }
501         return keys;
502     }
503 
TestIndexCharactersList()504     public void TestIndexCharactersList() {
505         for (String[] localeAndIndexCharacters : localeAndIndexCharactersLists) {
506             ULocale locale = new ULocale(localeAndIndexCharacters[0]);
507             String expectedIndexCharacters = "\u2026:" + localeAndIndexCharacters[1] + ":\u2026";
508             Collection<String> alphabeticIndex = new AlphabeticIndex(locale).getBucketLabels();
509 
510             // Join the elements of the list to a string with delimiter ":"
511             StringBuilder sb = new StringBuilder();
512             Iterator<String> iter = alphabeticIndex.iterator();
513             while (iter.hasNext()) {
514                 sb.append(iter.next());
515                 if (!iter.hasNext()) {
516                     break;
517                 }
518                 sb.append(":");
519             }
520             String actualIndexCharacters = sb.toString();
521             if (!expectedIndexCharacters.equals(actualIndexCharacters)) {
522                 errln("Test failed for locale " + localeAndIndexCharacters[0] +
523                         "\n  Expected = |" + expectedIndexCharacters + "|\n  actual   = |" + actualIndexCharacters + "|");
524             }
525         }
526     }
527 
TestBasics()528     public void TestBasics() {
529         ULocale[] list = ULocale.getAvailableLocales();
530         // get keywords combinations
531         // don't bother with multiple combinations at this point
532         List keywords = new ArrayList();
533         keywords.add("");
534 
535         String[] collationValues = Collator.getKeywordValues("collation");
536         for (int j = 0; j < collationValues.length; ++j) {
537             keywords.add("@collation=" + collationValues[j]);
538         }
539 
540         for (int i = 0; i < list.length; ++i) {
541             for (Iterator it = keywords.iterator(); it.hasNext();) {
542                 String collationValue = (String) it.next();
543                 String localeString = list[i].toString();
544                 if (!KEY_LOCALES.contains(localeString)) continue; // TODO change in exhaustive
545                 ULocale locale = new ULocale(localeString + collationValue);
546                 if (collationValue.length() > 0 && !Collator.getFunctionalEquivalent("collation", locale).equals(locale)) {
547                     //logln("Skipping " + locale);
548                     continue;
549                 }
550 
551                 if (locale.getCountry().length() != 0) {
552                     continue;
553                 }
554                 boolean isUnihan = collationValue.contains("unihan");
555                 AlphabeticIndex alphabeticIndex = new AlphabeticIndex(locale);
556                 if (isUnihan) {
557                     // Unihan tailorings have a label per radical, and there are at least 214,
558                     // if not more when simplified radicals are distinguished.
559                     alphabeticIndex.setMaxLabelCount(500);
560                 }
561                 final Collection mainChars = alphabeticIndex.getBucketLabels();
562                 String mainCharString = mainChars.toString();
563                 if (mainCharString.length() > 500) {
564                     mainCharString = mainCharString.substring(0,500) + "...";
565                 }
566                 logln(mainChars.size() + "\t" + locale + "\t" + locale.getDisplayName(ULocale.ENGLISH));
567                 logln("Index:\t" + mainCharString);
568                 if (!isUnihan && mainChars.size() > 100) {
569                     errln("Index character set too large: " +
570                             locale + " [" + mainChars.size() + "]:\n    " + mainChars);
571                 }
572             }
573         }
574     }
575 
TestClientSupport()576     public void TestClientSupport() {
577         for (String localeString : new String[] {"zh"}) { // KEY_LOCALES, new String[] {"zh"}
578             ULocale ulocale = new ULocale(localeString);
579             AlphabeticIndex<Double> alphabeticIndex = new AlphabeticIndex<Double>(ulocale).addLabels(ULocale.ENGLISH);
580             RuleBasedCollator collator = alphabeticIndex.getCollator();
581             String [][] tests;
582 
583             if (!localeString.equals("zh") ) {
584                 tests = new String[][] {SimpleTests};
585             } else {
586                 tests = new String[][] {SimpleTests, hackPinyin, simplifiedNames};
587             }
588 
589             for (String [] shortTest : tests) {
590                 double testValue = 100;
591                 alphabeticIndex.clearRecords();
592                 for (String name : shortTest) {
593                     alphabeticIndex.addRecord(name, testValue++);
594                 }
595 
596                 if (DEBUG) showIndex(alphabeticIndex, false);
597 
598                 // make my own copy
599                 testValue = 100;
600                 List<String> myBucketLabels = alphabeticIndex.getBucketLabels();
601                 ArrayList<Set<R4<RawCollationKey, String, Integer, Double>>> myBucketContents = new ArrayList<Set<R4<RawCollationKey, String, Integer, Double>>>(myBucketLabels.size());
602                 for (int i = 0; i < myBucketLabels.size(); ++i) {
603                     myBucketContents.add(new TreeSet<R4<RawCollationKey, String, Integer, Double>>());
604                 }
605                 for (String name : shortTest) {
606                     int bucketIndex = alphabeticIndex.getBucketIndex(name);
607                     if (bucketIndex > myBucketContents.size()) {
608                         alphabeticIndex.getBucketIndex(name); // call again for debugging
609                     }
610                     Set<R4<RawCollationKey, String, Integer, Double>> myBucket = myBucketContents.get(bucketIndex);
611                     RawCollationKey rawCollationKey = collator.getRawCollationKey(name, null);
612                     R4<RawCollationKey, String, Integer, Double> row = Row.of(rawCollationKey, name, name.length(), testValue++);
613                     myBucket.add(row);
614                 }
615                 if (DEBUG) showIndex(myBucketLabels, myBucketContents, false);
616 
617                 // now compare
618                 int index = 0;
619                 boolean gotError = false;
620                 for (AlphabeticIndex.Bucket<Double> bucket : alphabeticIndex) {
621                     String bucketLabel = bucket.getLabel();
622                     String myLabel = myBucketLabels.get(index);
623                     if (!bucketLabel.equals(myLabel)) {
624                         gotError |= !assertEquals(ulocale + "\tBucket Labels (" + index + ")", bucketLabel, myLabel);
625                     }
626                     Set<R4<RawCollationKey, String, Integer, Double>> myBucket = myBucketContents.get(index);
627                     Iterator<R4<RawCollationKey, String, Integer, Double>> myBucketIterator = myBucket.iterator();
628                     int recordIndex = 0;
629                     for (Record<Double> record : bucket) {
630                         String myName = null;
631                         if (myBucketIterator.hasNext()) {
632                             R4<RawCollationKey, String, Integer, Double> myRecord = myBucketIterator.next();
633                             myName = (String) myRecord.get1();
634                         }
635                         if (!record.getName().equals(myName)) {
636                             gotError |= !assertEquals(ulocale + "\t" + bucketLabel + "\t" + "Record Names (" + index + "." + recordIndex++ + ")", record.getName(), myName);
637                         }
638                     }
639                     while (myBucketIterator.hasNext()) {
640                         R4<RawCollationKey, String, Integer, Double> myRecord = myBucketIterator.next();
641                         String myName = (String) myRecord.get1();
642                         gotError |= !assertEquals(ulocale + "\t" + bucketLabel + "\t" + "Record Names (" + index + "." + recordIndex++ + ")", null, myName);
643                     }
644                     index++;
645                 }
646                 if (gotError) {
647                     showIndex(myBucketLabels, myBucketContents, false);
648                     showIndex(alphabeticIndex, false);
649                 }
650             }
651         }
652     }
653 
TestFirstScriptCharacters()654     public void TestFirstScriptCharacters() {
655         Collection<String> firstCharacters =
656                 new AlphabeticIndex(ULocale.ENGLISH).getFirstCharactersInScripts();
657         Collection<String> expectedFirstCharacters = firstStringsInScript((RuleBasedCollator) Collator.getInstance(ULocale.ROOT));
658         Collection<String> diff = new TreeSet<String>(firstCharacters);
659         diff.removeAll(expectedFirstCharacters);
660         assertTrue("First Characters contains unexpected ones: " + diff, diff.isEmpty());
661         diff.clear();
662         diff.addAll(expectedFirstCharacters);
663         diff.removeAll(firstCharacters);
664         assertTrue("First Characters missing expected ones: " + diff, diff.isEmpty());
665     }
666 
    /**
     * Candidate characters scanned by {@code firstStringsInScript}: all characters
     * whose NFC_Quick_Check value is not "No", minus the Common, Inherited and
     * Unknown scripts. The set is frozen.
     */
    private static final UnicodeSet TO_TRY = new UnicodeSet("[[:^nfcqc=no:]-[:sc=Common:]-[:sc=Inherited:]-[:sc=Unknown:]]").freeze();
668 
669     /**
670      * Returns a collection of all the "First" characters of scripts, according to the collation.
671      */
firstStringsInScript(RuleBasedCollator ruleBasedCollator)672     private static Collection<String> firstStringsInScript(RuleBasedCollator ruleBasedCollator) {
673         String[] results = new String[UScript.CODE_LIMIT];
674         for (String current : TO_TRY) {
675             if (ruleBasedCollator.compare(current, "a") < 0) { // we only want "real" script characters, not symbols.
676                 continue;
677             }
678             int script = UScript.getScript(current.codePointAt(0));
679             if (results[script] == null) {
680                 results[script] = current;
681             } else if (ruleBasedCollator.compare(current, results[script]) < 0) {
682                 results[script] = current;
683             }
684         }
685 
686         try {
687             UnicodeSet extras = new UnicodeSet();
688             UnicodeSet expansions = new UnicodeSet();
689             ruleBasedCollator.getContractionsAndExpansions(extras, expansions, true);
690             extras.addAll(expansions).removeAll(TO_TRY);
691             if (extras.size() != 0) {
692                 Normalizer2 normalizer = Normalizer2.getNFKCInstance();
693                 for (String current : extras) {
694                     if (!normalizer.isNormalized(current) || ruleBasedCollator.compare(current, "9") <= 0) {
695                         continue;
696                     }
697                     int script = getFirstRealScript(current);
698                     if (script == UScript.UNKNOWN && !isUnassignedBoundary(current)) { continue; }
699                     if (results[script] == null) {
700                         results[script] = current;
701                     } else if (ruleBasedCollator.compare(current, results[script]) < 0) {
702                         results[script] = current;
703                     }
704                 }
705             }
706         } catch (Exception e) {
707         } // why have a checked exception???
708 
709         // TODO: We should not test that we get the same strings, but that we
710         // get strings that sort primary-equal to those from the implementation.
711 
712         Collection<String> result = new ArrayList<String>();
713         for (int i = 0; i < results.length; ++i) {
714             if (results[i] != null) {
715                 result.add(results[i]);
716             }
717         }
718         return result;
719     }
720 
isUnassignedBoundary(CharSequence s)721     private static final boolean isUnassignedBoundary(CharSequence s) {
722         // The root collator provides a script-first-primary boundary contraction
723         // for the unassigned-implicit range.
724         return s.charAt(0) == 0xfdd1 &&
725                 UScript.getScript(Character.codePointAt(s, 1)) == UScript.UNKNOWN;
726     }
727 
    /**
     * Placeholder test without executable statements. The body holds only a
     * commented-out experiment that adds Cyrillic labels to an English index and
     * prints the resulting label and bucket counts.
     */
    public void TestZZZ() {
        //            int x = 3;
        //            AlphabeticIndex index = new AlphabeticIndex(ULocale.ENGLISH);
        //            UnicodeSet additions = new UnicodeSet();
        //            additions.add(0x410).add(0x415);  // Cyrillic
        //            // additions.add(0x391).add(0x393);     // Greek
        //            index.addLabels(additions);
        //            int lc = index.getLabels().size();
        //            List  labels = index.getLabels();
        //            System.out.println("Label Count = " + lc + "\t" + labels);
        //            System.out.println("Bucket Count =" + index.getBucketCount());
    }
740 
TestSimplified()741     public void TestSimplified() {
742         checkBuckets("zh", simplifiedNames, ULocale.ENGLISH, "W", "\u897f");
743     }
TestTraditional()744     public void TestTraditional() {
745         checkBuckets("zh_Hant", traditionalNames, ULocale.ENGLISH, "\u4e9f", "\u5357\u9580");
746     }
747 
    /**
     * Mixed-script sample names used by {@code TestClientSupport}: symbols and
     * digit strings, Latin names (including dotted/dotless I variants), Greek
     * names and Han names. Some names appear more than once.
     */
    static final String[] SimpleTests = {
        "斎藤",
        "\u1f2d\u03c1\u03b1",
        "$", "\u00a3", "12", "2",
        "Davis", "Davis", "Abbot", "\u1D05avis", "Zach", "\u1D05avis", "\u01b5", "\u0130stanbul", "Istanbul", "istanbul", "\u0131stanbul",
        "\u00deor", "\u00c5berg", "\u00d6stlund",
        "\u1f2d\u03c1\u03b1", "\u1f08\u03b8\u03b7\u03bd\u1fb6",
        "\u0396\u03b5\u03cd\u03c2", "\u03a0\u03bf\u03c3\u03b5\u03b9\u03b4\u1f63\u03bd", "\u1f0d\u03b9\u03b4\u03b7\u03c2", "\u0394\u03b7\u03bc\u03ae\u03c4\u03b7\u03c1", "\u1f19\u03c3\u03c4\u03b9\u03ac",
        //"\u1f08\u03c0\u03cc\u03bb\u03bb\u03c9\u03bd", "\u1f0c\u03c1\u03c4\u03b5\u03bc\u03b9\u03c2", "\u1f19\u03c1\u03bc\u1f23\u03c2", "\u1f0c\u03c1\u03b7\u03c2", "\u1f08\u03c6\u03c1\u03bf\u03b4\u03af\u03c4\u03b7", "\u1f2d\u03c6\u03b1\u03b9\u03c3\u03c4\u03bf\u03c2", "\u0394\u03b9\u03cc\u03bd\u03c5\u03c3\u03bf\u03c2",
        "\u6589\u85e4", "\u4f50\u85e4", "\u9234\u6728", "\u9ad8\u6a4b", "\u7530\u4e2d", "\u6e21\u8fba", "\u4f0a\u85e4", "\u5c71\u672c", "\u4e2d\u6751", "\u5c0f\u6797", "\u658e\u85e4", "\u52a0\u85e4",
        //"\u5409\u7530", "\u5c71\u7530", "\u4f50\u3005\u6728", "\u5c71\u53e3", "\u677e\u672c", "\u4e95\u4e0a", "\u6728\u6751", "\u6797", "\u6e05\u6c34"
    };
760 
    /**
     * Names for the {@code "zh"} case of {@code TestClientSupport}. Each row is a
     * lowercase Latin letter followed by three single Han characters.
     * NOTE(review): presumably the Han characters of a row have pinyin readings
     * that start with the row's letter; confirm.
     */
    static final String[] hackPinyin = {
        "a", "\u5416", "\u58ba", //
        "b", "\u516b", "\u62d4", "\u8500", //
        "c", "\u5693", "\u7938", "\u9e7e", //
        "d", "\u5491", "\u8fcf", "\u964a", //
        "e","\u59b8", "\u92e8", "\u834b", //
        "f", "\u53d1", "\u9197", "\u99a5", //
        "g", "\u7324", "\u91d3", "\u8142", //
        "h", "\u598e", "\u927f", "\u593b", //
        "j", "\u4e0c", "\u6785", "\u9d58", //
        "k", "\u5494", "\u958b", "\u7a52", //
        "l", "\u5783", "\u62c9", "\u9ba5", //
        "m", "\u5638", "\u9ebb", "\u65c0", //
        "n", "\u62ff", "\u80ad", "\u685b", //
        "o", "\u5662", "\u6bee", "\u8bb4", //
        "p", "\u5991", "\u8019", "\u8c31", //
        "q", "\u4e03", "\u6053", "\u7f56", //
        "r", "\u5465", "\u72aa", "\u6e03", //
        "s", "\u4ee8", "\u9491", "\u93c1", //
        "t", "\u4ed6", "\u9248", "\u67dd", //
        "w", "\u5c72", "\u5558", "\u5a7a", //
        "x", "\u5915", "\u5438", "\u6bbe", //
        "y", "\u4e2b", "\u82bd", "\u8574", //
        "z", "\u5e00", "\u707d", "\u5c0a"
    };
786 
    /**
     * Name list used by {@code TestSimplified} and by the {@code "zh"} case of
     * {@code TestClientSupport}: four Latin names followed by Han names of one or
     * two characters. Some names appear more than once.
     */
    static final String[] simplifiedNames = {
        "Abbot", "Morton", "Zachary", "Williams", "\u8d75", "\u94b1", "\u5b59", "\u674e", "\u5468", "\u5434", "\u90d1", "\u738b", "\u51af", "\u9648", "\u696e", "\u536b", "\u848b", "\u6c88",
        "\u97e9", "\u6768", "\u6731", "\u79e6", "\u5c24", "\u8bb8", "\u4f55", "\u5415", "\u65bd", "\u5f20", "\u5b54", "\u66f9", "\u4e25", "\u534e", "\u91d1", "\u9b4f", "\u9676", "\u59dc", "\u621a", "\u8c22", "\u90b9",
        "\u55bb", "\u67cf", "\u6c34", "\u7aa6", "\u7ae0", "\u4e91", "\u82cf", "\u6f58", "\u845b", "\u595a", "\u8303", "\u5f6d", "\u90ce", "\u9c81", "\u97e6", "\u660c", "\u9a6c", "\u82d7", "\u51e4", "\u82b1", "\u65b9",
        "\u4fde", "\u4efb", "\u8881", "\u67f3", "\u9146", "\u9c8d", "\u53f2", "\u5510", "\u8d39", "\u5ec9", "\u5c91", "\u859b", "\u96f7", "\u8d3a", "\u502a", "\u6c64", "\u6ed5", "\u6bb7", "\u7f57", "\u6bd5", "\u90dd",
        "\u90ac", "\u5b89", "\u5e38", "\u4e50", "\u4e8e", "\u65f6", "\u5085", "\u76ae", "\u535e", "\u9f50", "\u5eb7", "\u4f0d", "\u4f59", "\u5143", "\u535c", "\u987e", "\u5b5f", "\u5e73", "\u9ec4", "\u548c", "\u7a46",
        "\u8427", "\u5c39", "\u59da", "\u90b5", "\u6e5b", "\u6c6a", "\u7941", "\u6bdb", "\u79b9", "\u72c4", "\u7c73", "\u8d1d", "\u660e", "\u81e7", "\u8ba1", "\u4f0f", "\u6210", "\u6234", "\u8c08", "\u5b8b", "\u8305",
        "\u5e9e", "\u718a", "\u7eaa", "\u8212", "\u5c48", "\u9879", "\u795d", "\u8463", "\u6881", "\u675c", "\u962e", "\u84dd", "\u95fd", "\u5e2d", "\u5b63", "\u9ebb", "\u5f3a", "\u8d3e", "\u8def", "\u5a04", "\u5371",
        "\u6c5f", "\u7ae5", "\u989c", "\u90ed", "\u6885", "\u76db", "\u6797", "\u5201", "\u953a", "\u5f90", "\u4e18", "\u9a86", "\u9ad8", "\u590f", "\u8521", "\u7530", "\u6a0a", "\u80e1", "\u51cc", "\u970d", "\u865e",
        "\u4e07", "\u652f", "\u67ef", "\u661d", "\u7ba1", "\u5362", "\u83ab", "\u7ecf", "\u623f", "\u88d8", "\u7f2a", "\u5e72", "\u89e3", "\u5e94", "\u5b97", "\u4e01", "\u5ba3", "\u8d32", "\u9093", "\u90c1", "\u5355",
        "\u676d", "\u6d2a", "\u5305", "\u8bf8", "\u5de6", "\u77f3", "\u5d14", "\u5409", "\u94ae", "\u9f9a", "\u7a0b", "\u5d47", "\u90a2", "\u6ed1", "\u88f4", "\u9646", "\u8363", "\u7fc1", "\u8340", "\u7f8a", "\u65bc",
        "\u60e0", "\u7504", "\u9eb9", "\u5bb6", "\u5c01", "\u82ae", "\u7fbf", "\u50a8", "\u9773", "\u6c72", "\u90b4", "\u7cdc", "\u677e", "\u4e95", "\u6bb5", "\u5bcc", "\u5deb", "\u4e4c", "\u7126", "\u5df4", "\u5f13",
        "\u7267", "\u9697", "\u5c71", "\u8c37", "\u8f66", "\u4faf", "\u5b93", "\u84ec", "\u5168", "\u90d7", "\u73ed", "\u4ef0", "\u79cb", "\u4ef2", "\u4f0a", "\u5bab", "\u5b81", "\u4ec7", "\u683e", "\u66b4", "\u7518",
        "\u659c", "\u5389", "\u620e", "\u7956", "\u6b66", "\u7b26", "\u5218", "\u666f", "\u8a79", "\u675f", "\u9f99", "\u53f6", "\u5e78", "\u53f8", "\u97f6", "\u90dc", "\u9ece", "\u84df", "\u8584", "\u5370", "\u5bbf",
        "\u767d", "\u6000", "\u84b2", "\u90b0", "\u4ece", "\u9102", "\u7d22", "\u54b8", "\u7c4d", "\u8d56", "\u5353", "\u853a", "\u5c60", "\u8499", "\u6c60", "\u4e54", "\u9634", "\u90c1", "\u80e5", "\u80fd", "\u82cd",
        "\u53cc", "\u95fb", "\u8398", "\u515a", "\u7fdf", "\u8c2d", "\u8d21", "\u52b3", "\u9004", "\u59ec", "\u7533", "\u6276", "\u5835", "\u5189", "\u5bb0", "\u90e6", "\u96cd", "\u90e4", "\u74a9", "\u6851", "\u6842",
        "\u6fee", "\u725b", "\u5bff", "\u901a", "\u8fb9", "\u6248", "\u71d5", "\u5180", "\u90cf", "\u6d66", "\u5c1a", "\u519c", "\u6e29", "\u522b", "\u5e84", "\u664f", "\u67f4", "\u77bf", "\u960e", "\u5145", "\u6155",
        "\u8fde", "\u8339", "\u4e60", "\u5ba6", "\u827e", "\u9c7c", "\u5bb9", "\u5411", "\u53e4", "\u6613", "\u614e", "\u6208", "\u5ed6", "\u5ebe", "\u7ec8", "\u66a8", "\u5c45", "\u8861", "\u6b65", "\u90fd", "\u803f",
        "\u6ee1", "\u5f18", "\u5321", "\u56fd", "\u6587", "\u5bc7", "\u5e7f", "\u7984", "\u9619", "\u4e1c", "\u6b27", "\u6bb3", "\u6c83", "\u5229", "\u851a", "\u8d8a", "\u5914", "\u9686", "\u5e08", "\u5de9", "\u538d",
        "\u8042", "\u6641", "\u52fe", "\u6556", "\u878d", "\u51b7", "\u8a3e", "\u8f9b", "\u961a", "\u90a3", "\u7b80", "\u9976", "\u7a7a", "\u66fe", "\u6bcb", "\u6c99", "\u4e5c", "\u517b", "\u97a0", "\u987b", "\u4e30",
        "\u5de2", "\u5173", "\u84af", "\u76f8", "\u67e5", "\u540e", "\u8346", "\u7ea2", "\u6e38", "\u7afa", "\u6743", "\u9011", "\u76d6", "\u76ca", "\u6853", "\u516c", "\u4e07\u4fdf", "\u53f8\u9a6c", "\u4e0a\u5b98", "\u6b27\u9633",
        "\u590f\u4faf", "\u8bf8\u845b", "\u95fb\u4eba", "\u4e1c\u65b9", "\u8d6b\u8fde", "\u7687\u752b", "\u5c09\u8fdf", "\u516c\u7f8a", "\u6fb9\u53f0", "\u516c\u51b6", "\u5b97\u653f", "\u6fee\u9633", "\u6df3\u4e8e", "\u5355\u4e8e", "\u592a\u53d4", "\u7533\u5c60", "\u516c\u5b59", "\u4ef2\u5b59",
        "\u8f69\u8f95", "\u4ee4\u72d0", "\u953a\u79bb", "\u5b87\u6587", "\u957f\u5b59", "\u6155\u5bb9", "\u9c9c\u4e8e", "\u95fe\u4e18", "\u53f8\u5f92", "\u53f8\u7a7a", "\u4e0c\u5b98", "\u53f8\u5bc7", "\u4ec9", "\u7763", "\u5b50\u8f66", "\u989b\u5b59", "\u7aef\u6728", "\u5deb\u9a6c",
        "\u516c\u897f", "\u6f06\u96d5", "\u4e50\u6b63", "\u58e4\u9a77", "\u516c\u826f", "\u62d3\u62d4", "\u5939\u8c37", "\u5bb0\u7236", "\u8c37\u6881", "\u664b", "\u695a", "\u960e", "\u6cd5", "\u6c5d", "\u9122", "\u6d82", "\u94a6", "\u6bb5\u5e72", "\u767e\u91cc",
        "\u4e1c\u90ed", "\u5357\u95e8", "\u547c\u5ef6", "\u5f52", "\u6d77", "\u7f8a\u820c", "\u5fae\u751f", "\u5cb3", "\u5e05", "\u7f11", "\u4ea2", "\u51b5", "\u540e", "\u6709", "\u7434", "\u6881\u4e18", "\u5de6\u4e18", "\u4e1c\u95e8", "\u897f\u95e8",
        "\u5546", "\u725f", "\u4f58", "\u4f74", "\u4f2f", "\u8d4f", "\u5357\u5bab", "\u58a8", "\u54c8", "\u8c2f", "\u7b2a", "\u5e74", "\u7231", "\u9633", "\u4f5f"
    };
814 
    /**
     * Name list used by {@code TestTraditional}: one literal Han name, four Latin
     * names, then Han names of one or two characters. The trailing entries are
     * single Han characters, including supplementary-plane characters written as
     * surrogate pairs.
     */
    static final String[] traditionalNames = { "丁", "Abbot", "Morton", "Zachary", "Williams", "\u8d99", "\u9322", "\u5b6b",
            "\u674e", "\u5468", "\u5433", "\u912d", "\u738b", "\u99ae", "\u9673", "\u696e", "\u885b", "\u8523",
            "\u6c88", "\u97d3", "\u694a", "\u6731", "\u79e6", "\u5c24", "\u8a31", "\u4f55", "\u5442", "\u65bd",
            "\u5f35", "\u5b54", "\u66f9", "\u56b4", "\u83ef", "\u91d1", "\u9b4f", "\u9676", "\u59dc", "\u621a",
            "\u8b1d", "\u9112", "\u55bb", "\u67cf", "\u6c34", "\u7ac7", "\u7ae0", "\u96f2", "\u8607", "\u6f58",
            "\u845b", "\u595a", "\u7bc4", "\u5f6d", "\u90ce", "\u9b6f", "\u97cb", "\u660c", "\u99ac", "\u82d7",
            "\u9cf3", "\u82b1", "\u65b9", "\u4fde", "\u4efb", "\u8881", "\u67f3", "\u9146", "\u9b91", "\u53f2",
            "\u5510", "\u8cbb", "\u5ec9", "\u5c91", "\u859b", "\u96f7", "\u8cc0", "\u502a", "\u6e6f", "\u6ed5",
            "\u6bb7", "\u7f85", "\u7562", "\u90dd", "\u9114", "\u5b89", "\u5e38", "\u6a02", "\u65bc", "\u6642",
            "\u5085", "\u76ae", "\u535e", "\u9f4a", "\u5eb7", "\u4f0d", "\u9918", "\u5143", "\u535c", "\u9867",
            "\u5b5f", "\u5e73", "\u9ec3", "\u548c", "\u7a46", "\u856d", "\u5c39", "\u59da", "\u90b5", "\u6e5b",
            "\u6c6a", "\u7941", "\u6bdb", "\u79b9", "\u72c4", "\u7c73", "\u8c9d", "\u660e", "\u81e7", "\u8a08",
            "\u4f0f", "\u6210", "\u6234", "\u8ac7", "\u5b8b", "\u8305", "\u9f90", "\u718a", "\u7d00", "\u8212",
            "\u5c48", "\u9805", "\u795d", "\u8463", "\u6881", "\u675c", "\u962e", "\u85cd", "\u95a9", "\u5e2d",
            "\u5b63", "\u9ebb", "\u5f37", "\u8cc8", "\u8def", "\u5a41", "\u5371", "\u6c5f", "\u7ae5", "\u984f",
            "\u90ed", "\u6885", "\u76db", "\u6797", "\u5201", "\u937e", "\u5f90", "\u4e18", "\u99f1", "\u9ad8",
            "\u590f", "\u8521", "\u7530", "\u6a0a", "\u80e1", "\u51cc", "\u970d", "\u865e", "\u842c", "\u652f",
            "\u67ef", "\u661d", "\u7ba1", "\u76e7", "\u83ab", "\u7d93", "\u623f", "\u88d8", "\u7e46", "\u5e79",
            "\u89e3", "\u61c9", "\u5b97", "\u4e01", "\u5ba3", "\u8cc1", "\u9127", "\u9b31", "\u55ae", "\u676d",
            "\u6d2a", "\u5305", "\u8af8", "\u5de6", "\u77f3", "\u5d14", "\u5409", "\u9215", "\u9f94", "\u7a0b",
            "\u5d47", "\u90a2", "\u6ed1", "\u88f4", "\u9678", "\u69ae", "\u7fc1", "\u8340", "\u7f8a", "\u65bc",
            "\u60e0", "\u7504", "\u9eb4", "\u5bb6", "\u5c01", "\u82ae", "\u7fbf", "\u5132", "\u9773", "\u6c72",
            "\u90b4", "\u7cdc", "\u677e", "\u4e95", "\u6bb5", "\u5bcc", "\u5deb", "\u70cf", "\u7126", "\u5df4",
            "\u5f13", "\u7267", "\u9697", "\u5c71", "\u8c37", "\u8eca", "\u4faf", "\u5b93", "\u84ec", "\u5168",
            "\u90d7", "\u73ed", "\u4ef0", "\u79cb", "\u4ef2", "\u4f0a", "\u5bae", "\u5be7", "\u4ec7", "\u6b12",
            "\u66b4", "\u7518", "\u659c", "\u53b2", "\u620e", "\u7956", "\u6b66", "\u7b26", "\u5289", "\u666f",
            "\u8a79", "\u675f", "\u9f8d", "\u8449", "\u5e78", "\u53f8", "\u97f6", "\u90dc", "\u9ece", "\u858a",
            "\u8584", "\u5370", "\u5bbf", "\u767d", "\u61f7", "\u84b2", "\u90b0", "\u5f9e", "\u9102", "\u7d22",
            "\u54b8", "\u7c4d", "\u8cf4", "\u5353", "\u85fa", "\u5c60", "\u8499", "\u6c60", "\u55ac", "\u9670",
            "\u9b31", "\u80e5", "\u80fd", "\u84bc", "\u96d9", "\u805e", "\u8398", "\u9ee8", "\u7fdf", "\u8b5a",
            "\u8ca2", "\u52de", "\u9004", "\u59ec", "\u7533", "\u6276", "\u5835", "\u5189", "\u5bb0", "\u9148",
            "\u96cd", "\u90e4", "\u74a9", "\u6851", "\u6842", "\u6fee", "\u725b", "\u58fd", "\u901a", "\u908a",
            "\u6248", "\u71d5", "\u5180", "\u90df", "\u6d66", "\u5c1a", "\u8fb2", "\u6eab", "\u5225", "\u838a",
            "\u664f", "\u67f4", "\u77bf", "\u95bb", "\u5145", "\u6155", "\u9023", "\u8339", "\u7fd2", "\u5ba6",
            "\u827e", "\u9b5a", "\u5bb9", "\u5411", "\u53e4", "\u6613", "\u614e", "\u6208", "\u5ed6", "\u5ebe",
            "\u7d42", "\u66a8", "\u5c45", "\u8861", "\u6b65", "\u90fd", "\u803f", "\u6eff", "\u5f18", "\u5321",
            "\u570b", "\u6587", "\u5bc7", "\u5ee3", "\u797f", "\u95d5", "\u6771", "\u6b50", "\u6bb3", "\u6c83",
            "\u5229", "\u851a", "\u8d8a", "\u5914", "\u9686", "\u5e2b", "\u978f", "\u5399", "\u8076", "\u6641",
            "\u52fe", "\u6556", "\u878d", "\u51b7", "\u8a3e", "\u8f9b", "\u95de", "\u90a3", "\u7c21", "\u9952",
            "\u7a7a", "\u66fe", "\u6bcb", "\u6c99", "\u4e5c", "\u990a", "\u97a0", "\u9808", "\u8c50", "\u5de2",
            "\u95dc", "\u84af", "\u76f8", "\u67e5", "\u5f8c", "\u834a", "\u7d05", "\u904a", "\u7afa", "\u6b0a",
            "\u9011", "\u84cb", "\u76ca", "\u6853", "\u516c", "\u4e07\u4fdf", "\u53f8\u99ac", "\u4e0a\u5b98",
            "\u6b50\u967d", "\u590f\u4faf", "\u8af8\u845b", "\u805e\u4eba", "\u6771\u65b9", "\u8d6b\u9023",
            "\u7687\u752b", "\u5c09\u9072", "\u516c\u7f8a", "\u6fb9\u53f0", "\u516c\u51b6", "\u5b97\u653f",
            "\u6fee\u967d", "\u6df3\u4e8e", "\u55ae\u4e8e", "\u592a\u53d4", "\u7533\u5c60", "\u516c\u5b6b",
            "\u4ef2\u5b6b", "\u8ed2\u8f45", "\u4ee4\u72d0", "\u937e\u96e2", "\u5b87\u6587", "\u9577\u5b6b",
            "\u6155\u5bb9", "\u9bae\u4e8e", "\u95ad\u4e18", "\u53f8\u5f92", "\u53f8\u7a7a", "\u4e0c\u5b98",
            "\u53f8\u5bc7", "\u4ec9", "\u7763", "\u5b50\u8eca", "\u9853\u5b6b", "\u7aef\u6728", "\u5deb\u99ac",
            "\u516c\u897f", "\u6f06\u96d5", "\u6a02\u6b63", "\u58e4\u99df", "\u516c\u826f", "\u62d3\u62d4",
            "\u593e\u8c37", "\u5bb0\u7236", "\u7a40\u6881", "\u6649", "\u695a", "\u95bb", "\u6cd5", "\u6c5d", "\u9122",
            "\u5857", "\u6b3d", "\u6bb5\u5e72", "\u767e\u91cc", "\u6771\u90ed", "\u5357\u9580", "\u547c\u5ef6",
            "\u6b78", "\u6d77", "\u7f8a\u820c", "\u5fae\u751f", "\u5cb3", "\u5e25", "\u7df1", "\u4ea2", "\u6cc1",
            "\u5f8c", "\u6709", "\u7434", "\u6881\u4e18", "\u5de6\u4e18", "\u6771\u9580", "\u897f\u9580", "\u5546",
            "\u725f", "\u4f58", "\u4f74", "\u4f2f", "\u8cde", "\u5357\u5bae", "\u58a8", "\u54c8", "\u8b59", "\u7b2a",
            "\u5e74", "\u611b", "\u967d", "\u4f5f", "\u3401", "\u3422", "\u3426", "\u3493", "\u34A5", "\u34A7",
            "\u34AA", "\u3536", "\u4A3B", "\u4E00", "\u4E01", "\u4E07", "\u4E0D", "\u4E17", "\u4E23", "\u4E26",
            "\u4E34", "\u4E82", "\u4EB8", "\u4EB9", "\u511F", "\u512D", "\u513D", "\u513E", "\u53B5", "\u56D4",
            "\u56D6", "\u7065", "\u7069", "\u706A", "\u7E9E", "\u9750", "\u9F49", "\u9F7E", "\u9F98", "\uD840\uDC35",
            "\uD840\uDC3D", "\uD840\uDC3E", "\uD840\uDC41", "\uD840\uDC46", "\uD840\uDC4C", "\uD840\uDC4E",
            "\uD840\uDC53", "\uD840\uDC55", "\uD840\uDC56", "\uD840\uDC5F", "\uD840\uDC60", "\uD840\uDC7A",
            "\uD840\uDC7B", "\uD840\uDCC8", "\uD840\uDD9E", "\uD840\uDD9F", "\uD840\uDDA0", "\uD840\uDDA1",
            "\uD841\uDD3B", "\uD842\uDCCA", "\uD842\uDCCB", "\uD842\uDD6C", "\uD842\uDE0B", "\uD842\uDE0C",
            "\uD842\uDED1", "\uD844\uDD9F", "\uD845\uDD19", "\uD845\uDD1A", "\uD846\uDD3B", "\uD84C\uDF5C",
            "\uD85A\uDDC4", "\uD85A\uDDC5", "\uD85C\uDD98", "\uD85E\uDCB1", "\uD861\uDC04", "\uD864\uDDD3",
            "\uD865\uDE63", "\uD869\uDCCA", "\uD86B\uDE9A", };
880 
881     /**
882      * Test AlphabeticIndex vs. root with script reordering.
883      */
TestHaniFirst()884     public void TestHaniFirst() {
885         RuleBasedCollator coll = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT);
886         coll.setReorderCodes(UScript.HAN);
887         AlphabeticIndex index = new AlphabeticIndex(coll);
888         assertEquals("getBucketCount()", 1, index.getBucketCount());   // ... (underflow only)
889         index.addLabels(ULocale.ENGLISH);
890         assertEquals("getBucketCount()", 28, index.getBucketCount());  // ... A-Z ...
891         int bucketIndex = index.getBucketIndex("\u897f");
892         assertEquals("getBucketIndex(U+897F)", 0, bucketIndex);  // underflow bucket
893         bucketIndex = index.getBucketIndex("i");
894         assertEquals("getBucketIndex(i)", 9, bucketIndex);
895         bucketIndex = index.getBucketIndex("\u03B1");
896         assertEquals("getBucketIndex(Greek alpha)", 27, bucketIndex);
897         // U+50005 is an unassigned code point which sorts at the end, independent of the Hani group.
898         bucketIndex = index.getBucketIndex(UTF16.valueOf(0x50005));
899         assertEquals("getBucketIndex(U+50005)", 27, bucketIndex);
900         bucketIndex = index.getBucketIndex("\uFFFF");
901         assertEquals("getBucketIndex(U+FFFF)", 27, bucketIndex);
902     }
903 
904     /**
905      * Test AlphabeticIndex vs. Pinyin with script reordering.
906      */
TestPinyinFirst()907     public void TestPinyinFirst() {
908         RuleBasedCollator coll = (RuleBasedCollator) Collator.getInstance(ULocale.CHINESE);
909         coll.setReorderCodes(UScript.HAN);
910         AlphabeticIndex index = new AlphabeticIndex(coll);
911         assertEquals("getBucketCount()", 28, index.getBucketCount());   // ... A-Z ...
912         index.addLabels(ULocale.CHINESE);
913         assertEquals("getBucketCount()", 28, index.getBucketCount());  // ... A-Z ...
914         int bucketIndex = index.getBucketIndex("\u897f");
915         assertEquals("getBucketIndex(U+897F)", 'X' - 'A' + 1, bucketIndex);
916         bucketIndex = index.getBucketIndex("i");
917         assertEquals("getBucketIndex(i)", 9, bucketIndex);
918         bucketIndex = index.getBucketIndex("\u03B1");
919         assertEquals("getBucketIndex(Greek alpha)", 27, bucketIndex);
920         // U+50005 is an unassigned code point which sorts at the end, independent of the Hani group.
921         bucketIndex = index.getBucketIndex(UTF16.valueOf(0x50005));
922         assertEquals("getBucketIndex(U+50005)", 27, bucketIndex);
923         bucketIndex = index.getBucketIndex("\uFFFF");
924         assertEquals("getBucketIndex(U+FFFF)", 27, bucketIndex);
925     }
926 
927     /**
928      * Test labels with multiple primary weights.
929      */
TestSchSt()930     public void TestSchSt() {
931         AlphabeticIndex index = new AlphabeticIndex(ULocale.GERMAN);
932         index.addLabels(new UnicodeSet("[Æ{Sch*}{St*}]"));
933         // ... A Æ B-R S Sch St T-Z ...
934         ImmutableIndex immIndex = index.buildImmutableIndex();
935         assertEquals("getBucketCount()", 31, index.getBucketCount());
936         assertEquals("immutable getBucketCount()", 31, immIndex.getBucketCount());
937         String[][] testCases = new String[][] {
938             // name, bucket index, bucket label
939             { "Adelbert", "1", "A" },
940             { "Afrika", "1", "A" },
941             { "Æsculap", "2", "Æ" },
942             { "Aesthet", "2", "Æ" },
943             { "Berlin", "3", "B" },
944             { "Rilke", "19", "R" },
945             { "Sacher", "20", "S" },
946             { "Seiler", "20", "S" },
947             { "Sultan", "20", "S" },
948             { "Schiller", "21", "Sch" },
949             { "Steiff", "22", "St" },
950             { "Thomas", "23", "T" }
951         };
952         List<String> labels = index.getBucketLabels();
953         for (String[] testCase : testCases) {
954             String name = testCase[0];
955             int bucketIndex = Integer.valueOf(testCase[1]);
956             String label = testCase[2];
957             String msg = "getBucketIndex(" + name + ")";
958             assertEquals(msg, bucketIndex, index.getBucketIndex(name));
959             msg = "immutable " + msg;
960             assertEquals(msg, bucketIndex, immIndex.getBucketIndex(name));
961             msg = "bucket label (" + name + ")";
962             assertEquals(msg, label, labels.get(index.getBucketIndex(name)));
963             msg = "immutable " + msg;
964             assertEquals(msg, label, immIndex.getBucket(bucketIndex).getLabel());
965         }
966     }
967 
968     /**
969      * With no real labels, there should be only the underflow label.
970      */
TestNoLabels()971     public void TestNoLabels() {
972         RuleBasedCollator coll = (RuleBasedCollator) Collator.getInstance(ULocale.ROOT);
973         AlphabeticIndex<Integer> index = new AlphabeticIndex<Integer>(coll);
974         index.addRecord("\u897f", 0);
975         index.addRecord("i", 0);
976         index.addRecord("\u03B1", 0);
977         assertEquals("getBucketCount()", 1, index.getBucketCount());  // ...
978         Bucket<Integer> bucket = index.iterator().next();
979         assertEquals("underflow label type", LabelType.UNDERFLOW, bucket.getLabelType());
980         assertEquals("all records in the underflow bucket", 3, bucket.size());
981     }
982 
983     /**
984      * Test with the Bopomofo-phonetic tailoring.
985      */
TestChineseZhuyin()986     public void TestChineseZhuyin() {
987         AlphabeticIndex index = new AlphabeticIndex(ULocale.forLanguageTag("zh-u-co-zhuyin"));
988         ImmutableIndex immIndex = index.buildImmutableIndex();
989         assertEquals("getBucketCount()", 38, immIndex.getBucketCount());  // ... ㄅ ㄆ ㄇ ㄈ ㄉ -- ㄩ ...
990         assertEquals("label 1", "ㄅ", immIndex.getBucket(1).getLabel());
991         assertEquals("label 2", "ㄆ", immIndex.getBucket(2).getLabel());
992         assertEquals("label 3", "ㄇ", immIndex.getBucket(3).getLabel());
993         assertEquals("label 4", "ㄈ", immIndex.getBucket(4).getLabel());
994         assertEquals("label 5", "ㄉ", immIndex.getBucket(5).getLabel());
995     }
996 
TestJapaneseKanji()997     public void TestJapaneseKanji() {
998         AlphabeticIndex index = new AlphabeticIndex(ULocale.JAPANESE);
999         AlphabeticIndex.ImmutableIndex immIndex = index.buildImmutableIndex();
1000         // There are no index characters for Kanji in the Japanese standard collator.
1001         // They should all go into the overflow bucket.
1002         final int[] kanji = { 0x4E9C, 0x95C7, 0x4E00, 0x58F1 };
1003         int overflowIndex = immIndex.getBucketCount() - 1;
1004         for(int i = 0; i < kanji.length; ++i) {
1005             String msg = String.format("kanji[%d]=U+%04X in overflow bucket", i, kanji[i]);
1006             assertEquals(msg, overflowIndex, immIndex.getBucketIndex(UTF16.valueOf(kanji[i])));
1007         }
1008     }
1009 
TestFrozenCollator()1010     public void TestFrozenCollator() {
1011         // Ticket #9472
1012         RuleBasedCollator coll = (RuleBasedCollator) Collator.getInstance(new ULocale("da"));
1013         coll.setStrength(Collator.IDENTICAL);
1014         coll.freeze();
1015         // The AlphabeticIndex constructor used to throw an exception
1016         // because it cloned the collator (which preserves frozenness)
1017         // and set the clone's strength to PRIMARY.
1018         AlphabeticIndex index = new AlphabeticIndex(coll);
1019         assertEquals("same strength as input Collator",
1020                 Collator.IDENTICAL, index.getCollator().getStrength());
1021     }
1022 
TestChineseUnihan()1023     public void TestChineseUnihan() {
1024         AlphabeticIndex index = new AlphabeticIndex(new ULocale("zh-u-co-unihan"));
1025         index.setMaxLabelCount(500);  // ICU 54 default is 99.
1026         AlphabeticIndex.ImmutableIndex immIndex = index.buildImmutableIndex();
1027         int bucketCount = immIndex.getBucketCount();
1028         if(bucketCount < 216) {
1029             // There should be at least an underflow and overflow label,
1030             // and one for each of 214 radicals,
1031             // and maybe additional labels for simplified radicals.
1032             // (ICU4C: dataerrln(), prints only a warning if the data is missing)
1033             errln("too few buckets/labels for Chinese/unihan: " + bucketCount +
1034                     " (is zh/unihan data available?)");
1035             return;
1036         } else {
1037             logln("Chinese/unihan has " + bucketCount + " buckets/labels");
1038         }
1039         // bucketIndex = radical number, adjusted for simplified radicals in lower buckets.
1040         int bucketIndex = index.getBucketIndex("\u4e5d");
1041         assertEquals("getBucketIndex(U+4E5D)", 5, bucketIndex);
1042         bucketIndex = index.getBucketIndex("\u7527");
1043         assertEquals("getBucketIndex(U+7527)", 100, bucketIndex);
1044     }
1045 }
1046