1 /*
2  *******************************************************************************
3  * Copyright (C) 2008-2014, Google Inc, International Business Machines Corporation
4  * and others. All Rights Reserved.
5  *******************************************************************************
6  */
7 package com.ibm.icu.text;
8 
9 import java.util.ArrayList;
10 import java.util.Collections;
11 import java.util.Comparator;
12 import java.util.Iterator;
13 import java.util.List;
14 import java.util.Locale;
15 
16 import com.ibm.icu.lang.UCharacter;
17 import com.ibm.icu.text.AlphabeticIndex.Bucket;
18 import com.ibm.icu.text.AlphabeticIndex.Bucket.LabelType;
19 import com.ibm.icu.util.LocaleData;
20 import com.ibm.icu.util.ULocale;
21 
22 /**
23  * AlphabeticIndex supports the creation of a UI index appropriate for a given language.
24  * It can support either direct use, or use with a client that doesn't support localized collation.
25  * The following is an example of what an index might look like in a UI:
26  *
27  * <pre>
28  *  <b>... A B C D E F G H I J K L M N O P Q R S T U V W X Y Z  ...</b>
29  *
30  *  <b>A</b>
31  *     Addison
32  *     Albertson
33  *     Azensky
34  *  <b>B</b>
35  *     Baecker
36  *  ...
37  * </pre>
38  *
39  * The class can generate a list of labels for use as a UI "index", that is, a list of
40  * clickable characters (or character sequences) that allow the user to see a segment
41  * (bucket) of a larger "target" list. That is, each label corresponds to a bucket in
42  * the target list, where everything in the bucket is greater than or equal to the character
43  * (according to the locale's collation). Strings can be added to the index;
44  * they will be in sorted order in the right bucket.</p>
45  * <p>
46  * The class also supports having buckets for strings before the first (underflow),
47  * after the last (overflow), and between scripts (inflow). For example, if the index
48  * is constructed with labels for Russian and English, Greek characters would fall
49  * into an inflow bucket between the other two scripts.</p>
50  *
51  * <p><em>Note:</em> If you expect to have a lot of ASCII or Latin characters
52  * as well as characters from the user's language,
53  * then it is a good idea to call addLabels(ULocale.English).</p>
54  *
55  * <h2>Direct Use</h2>
56  * <p>The following shows an example of building an index directly.
57  *  The "show..." methods below are just to illustrate usage.
58  *
59  * <pre>
60  * // Create a simple index where the values for the strings are Integers, and add the strings
61  *
62  * AlphabeticIndex&lt;Integer&gt; index = new AlphabeticIndex&lt;Integer&gt;(desiredLocale).addLabels(additionalLocale);
63  * int counter = 0;
64  * for (String item : test) {
65  *     index.addRecord(item, counter++);
66  * }
67  * ...
68  * // Show index at top. We could skip or gray out empty buckets
69  *
70  * for (AlphabeticIndex.Bucket&lt;Integer&gt; bucket : index) {
71  *     if (showAll || bucket.size() != 0) {
72  *         showLabelAtTop(UI, bucket.getLabel());
73  *     }
74  * }
75  *  ...
76  * // Show the buckets with their contents, skipping empty buckets
77  *
78  * for (AlphabeticIndex.Bucket&lt;Integer&gt; bucket : index) {
79  *     if (bucket.size() != 0) {
80  *         showLabelInList(UI, bucket.getLabel());
81  *         for (AlphabeticIndex.Record&lt;Integer&gt; item : bucket) {
82  *             showIndexedItem(UI, item.getName(), item.getData());
83  *         }
84  * </pre>
85  *
86  * The caller can build different UIs using this class.
87  * For example, an index character could be omitted or grayed-out
88  * if its bucket is empty. Small buckets could also be combined based on size, such as:
89  *
90  * <pre>
91  * <b>... A-F G-N O-Z ...</b>
92  * </pre>
93  *
94  * <h2>Client Support</h2>
95  * <p>Callers can also use the {@link AlphabeticIndex.ImmutableIndex}, or the AlphabeticIndex itself,
96  * to support sorting on a client that doesn't support AlphabeticIndex functionality.
97  *
98  * <p>The ImmutableIndex is both immutable and thread-safe.
99  * The corresponding AlphabeticIndex methods are not thread-safe because
100  * they "lazily" build the index buckets.
101  * <ul>
102  * <li>ImmutableIndex.getBucket(index) provides random access to all
103  *     buckets and their labels and label types.
104  * <li>AlphabeticIndex.getBucketLabels() or the bucket iterator on either class
105  *     can be used to get a list of the labels,
106  *     such as "...", "A", "B",..., and send that list to the client.
107  * <li>When the client has a new name, it sends that name to the server.
108  * The server needs to call the following methods,
109  * and communicate the bucketIndex and collationKey back to the client.
110  *
111  * <pre>
112  * int bucketIndex = index.getBucketIndex(name);
113  * String label = immutableIndex.getBucket(bucketIndex).getLabel();  // optional
114  * RawCollationKey collationKey = collator.getRawCollationKey(name, null);
115  * </pre>
116  *
117  * <li>The client would put the name (and associated information) into its bucket for bucketIndex. The collationKey is a
118  * sequence of bytes that can be compared with a binary compare, and produce the right localized result.</li>
119  * </ul>
120  *
121  * @author Mark Davis
122  * @stable ICU 4.8
123  */
124 public final class AlphabeticIndex<V> implements Iterable<Bucket<V>> {
125     /**
126      * Prefix string for Chinese index buckets.
127      * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
128      */
129     private static final String BASE = "\uFDD0";
130 
131     private static final char CGJ = '\u034F';
132 
133     private static final Comparator<String> binaryCmp = new UTF16.StringComparator(true, false, 0);
134 
135     private final RuleBasedCollator collatorOriginal;
136     private final RuleBasedCollator collatorPrimaryOnly;
137     private RuleBasedCollator collatorExternal;
138 
139     // Comparator for records, so that the Record class can be static.
140     private final Comparator<Record<V>> recordComparator = new Comparator<Record<V>>() {
141         public int compare(Record<V> o1, Record<V> o2) {
142             return collatorOriginal.compare(o1.name, o2.name);
143         }
144     };
145 
146     private final List<String> firstCharsInScripts;
147 
148     // We accumulate these as we build up the input parameters
149     private final UnicodeSet initialLabels = new UnicodeSet();
150     private List<Record<V>> inputList;
151 
152     // Lazy evaluated: null means that we have not built yet.
153     private BucketList<V> buckets;
154 
155     private String overflowLabel = "\u2026";
156     private String underflowLabel = "\u2026";
157     private String inflowLabel = "\u2026";
158 
159     /**
160      * Immutable, thread-safe version of {@link AlphabeticIndex}.
161      * This class provides thread-safe methods for bucketing,
162      * and random access to buckets and their properties,
163      * but does not offer adding records to the index.
164      *
165      * @param <V> The Record value type is unused. It can be omitted for this class
166      * if it was omitted for the AlphabeticIndex that built it.
167      * @stable ICU 51
168      */
169     public static final class ImmutableIndex<V> implements Iterable<Bucket<V>> {
170         private final BucketList<V> buckets;
171         private final Collator collatorPrimaryOnly;
172 
ImmutableIndex(BucketList<V> bucketList, Collator collatorPrimaryOnly)173         private ImmutableIndex(BucketList<V> bucketList, Collator collatorPrimaryOnly) {
174             this.buckets = bucketList;
175             this.collatorPrimaryOnly = collatorPrimaryOnly;
176         }
177 
178         /**
179          * Returns the number of index buckets and labels, including underflow/inflow/overflow.
180          *
181          * @return the number of index buckets
182          * @stable ICU 51
183          */
getBucketCount()184         public int getBucketCount() {
185             return buckets.getBucketCount();
186         }
187 
188         /**
189          * Finds the index bucket for the given name and returns the number of that bucket.
190          * Use {@link #getBucket(int)} to get the bucket's properties.
191          *
192          * @param name the string to be sorted into an index bucket
193          * @return the bucket number for the name
194          * @stable ICU 51
195          */
getBucketIndex(CharSequence name)196         public int getBucketIndex(CharSequence name) {
197             return buckets.getBucketIndex(name, collatorPrimaryOnly);
198         }
199 
200         /**
201          * Returns the index-th bucket. Returns null if the index is out of range.
202          *
203          * @param index bucket number
204          * @return the index-th bucket
205          * @stable ICU 51
206          */
getBucket(int index)207         public Bucket<V> getBucket(int index) {
208             if (0 <= index && index < buckets.getBucketCount()) {
209                 return buckets.immutableVisibleList.get(index);
210             } else {
211                 return null;
212             }
213         }
214 
215         /**
216          * {@inheritDoc}
217          * @stable ICU 51
218          */
iterator()219         public Iterator<Bucket<V>> iterator() {
220             return buckets.iterator();
221         }
222     }
223 
224     /**
225      * Create the index object.
226      *
227      * @param locale
228      *            The locale for the index.
229      * @stable ICU 4.8
230      */
AlphabeticIndex(ULocale locale)231     public AlphabeticIndex(ULocale locale) {
232         this(locale, null);
233     }
234 
235     /**
236      * Create the index object.
237      *
238      * @param locale
239      *            The locale for the index.
240      * @stable ICU 4.8
241      */
AlphabeticIndex(Locale locale)242     public AlphabeticIndex(Locale locale) {
243         this(ULocale.forLocale(locale), null);
244     }
245 
246     /**
247      * Create an AlphabeticIndex that uses a specific collator.
248      *
249      * <p>The index will be created with no labels; the addLabels() function must be called
250      * after creation to add the desired labels to the index.
251      *
252      * <p>The index will work directly with the supplied collator. If the caller will need to
253      * continue working with the collator it should be cloned first, so that the
254      * collator provided to the AlphabeticIndex remains unchanged after creation of the index.
255      *
256      * @param collator The collator to use to order the contents of this index.
257      * @stable ICU 51
258      */
AlphabeticIndex(RuleBasedCollator collator)259     public AlphabeticIndex(RuleBasedCollator collator) {
260         this(null, collator);
261     }
262 
263     /**
264      * Internal constructor containing implementation used by public constructors.
265      */
AlphabeticIndex(ULocale locale, RuleBasedCollator collator)266     private AlphabeticIndex(ULocale locale, RuleBasedCollator collator) {
267         collatorOriginal = collator != null ? collator : (RuleBasedCollator) Collator.getInstance(locale);
268         try {
269             collatorPrimaryOnly = collatorOriginal.cloneAsThawed();
270         } catch (Exception e) {
271             // should never happen
272             throw new IllegalStateException("Collator cannot be cloned", e);
273         }
274         collatorPrimaryOnly.setStrength(Collator.PRIMARY);
275         collatorPrimaryOnly.freeze();
276 
277         firstCharsInScripts = getFirstCharactersInScripts();
278         Collections.sort(firstCharsInScripts, collatorPrimaryOnly);
279         // Guard against a degenerate collator where
280         // some script boundary strings are primary ignorable.
281         for (;;) {
282             if (firstCharsInScripts.isEmpty()) {
283                 throw new IllegalArgumentException(
284                         "AlphabeticIndex requires some non-ignorable script boundary strings");
285             }
286             if (collatorPrimaryOnly.compare(firstCharsInScripts.get(0), "") == 0) {
287                 firstCharsInScripts.remove(0);
288             } else {
289                 break;
290             }
291         }
292 
293         // Chinese index characters, which are specific to each of the several Chinese tailorings,
294         // take precedence over the single locale data exemplar set per language.
295         if (!addChineseIndexCharacters() && locale != null) {
296             addIndexExemplars(locale);
297         }
298     }
299 
300     /**
301      * Add more index characters (aside from what are in the locale)
302      * @param additions additional characters to add to the index, such as A-Z.
303      * @return this, for chaining
304      * @stable ICU 4.8
305      */
addLabels(UnicodeSet additions)306     public AlphabeticIndex<V> addLabels(UnicodeSet additions) {
307         initialLabels.addAll(additions);
308         buckets = null;
309         return this;
310     }
311 
312     /**
313      * Add more index characters (aside from what are in the locale)
314      * @param additions additional characters to add to the index, such as those in Swedish.
315      * @return this, for chaining
316      * @stable ICU 4.8
317      */
addLabels(ULocale... additions)318     public AlphabeticIndex<V> addLabels(ULocale... additions) {
319         for (ULocale addition : additions) {
320             addIndexExemplars(addition);
321         }
322         buckets = null;
323         return this;
324     }
325 
326     /**
327      * Add more index characters (aside from what are in the locale)
328      * @param additions additional characters to add to the index, such as those in Swedish.
329      * @return this, for chaining
330      * @stable ICU 4.8
331      */
addLabels(Locale... additions)332     public AlphabeticIndex<V> addLabels(Locale... additions) {
333         for (Locale addition : additions) {
334             addIndexExemplars(ULocale.forLocale(addition));
335         }
336         buckets = null;
337         return this;
338     }
339 
340     /**
341      * Set the overflow label
342      * @param overflowLabel see class description
343      * @return this, for chaining
344      * @stable ICU 4.8
345      */
setOverflowLabel(String overflowLabel)346     public AlphabeticIndex<V> setOverflowLabel(String overflowLabel) {
347         this.overflowLabel = overflowLabel;
348         buckets = null;
349         return this;
350     }
351 
352     /**
353      * Get the default label used in the IndexCharacters' locale for underflow, eg the last item in: X Y Z ...
354      *
355      * @return underflow label
356      * @stable ICU 4.8
357      */
getUnderflowLabel()358     public String getUnderflowLabel() {
359         return underflowLabel; // TODO get localized version
360     }
361 
362 
363     /**
364      * Set the underflowLabel label
365      * @param underflowLabel see class description
366      * @return this, for chaining
367      * @stable ICU 4.8
368      */
setUnderflowLabel(String underflowLabel)369     public AlphabeticIndex<V> setUnderflowLabel(String underflowLabel) {
370         this.underflowLabel = underflowLabel;
371         buckets = null;
372         return this;
373     }
374 
375     /**
376      * Get the default label used in the IndexCharacters' locale for overflow, eg the first item in: ... A B C
377      *
378      * @return overflow label
379      * @stable ICU 4.8
380      */
getOverflowLabel()381     public String getOverflowLabel() {
382         return overflowLabel; // TODO get localized version
383     }
384 
385 
386     /**
387      * Set the inflowLabel label
388      * @param inflowLabel see class description
389      * @return this, for chaining
390      * @stable ICU 4.8
391      */
setInflowLabel(String inflowLabel)392     public AlphabeticIndex<V> setInflowLabel(String inflowLabel) {
393         this.inflowLabel = inflowLabel;
394         buckets = null;
395         return this;
396     }
397 
398     /**
399      * Get the default label used for abbreviated buckets <i>between</i> other labels. For example, consider the labels
400      * for Latin and Greek are used: X Y Z ... &#x0391; &#x0392; &#x0393;.
401      *
402      * @return inflow label
403      * @stable ICU 4.8
404      */
getInflowLabel()405     public String getInflowLabel() {
406         return inflowLabel; // TODO get localized version
407     }
408 
409 
410     /**
411      * Get the limit on the number of labels in the index. The number of buckets can be slightly larger: see getBucketCount().
412      *
413      * @return maxLabelCount maximum number of labels.
414      * @stable ICU 4.8
415      */
getMaxLabelCount()416     public int getMaxLabelCount() {
417         return maxLabelCount;
418     }
419 
420     /**
421      * Set a limit on the number of labels in the index. The number of buckets can be slightly larger: see
422      * getBucketCount().
423      *
424      * @param maxLabelCount Set the maximum number of labels. Currently, if the number is exceeded, then every
425      *         nth item is removed to bring the count down. A more sophisticated mechanism may be available in the
426      *         future.
427      * @return this, for chaining
428      * @stable ICU 4.8
429      */
setMaxLabelCount(int maxLabelCount)430     public AlphabeticIndex<V> setMaxLabelCount(int maxLabelCount) {
431         this.maxLabelCount = maxLabelCount;
432         buckets = null;
433         return this;
434     }
435 
436     /**
437      * Determine the best labels to use. This is based on the exemplars, but we also process to make sure that they are unique,
438      * and sort differently, and that the overall list is small enough.
439      */
initLabels()440     private List<String> initLabels() {
441         Normalizer2 nfkdNormalizer = Normalizer2.getNFKDInstance();
442         List<String> indexCharacters = new ArrayList<String>();
443 
444         String firstScriptBoundary = firstCharsInScripts.get(0);
445         String overflowBoundary = firstCharsInScripts.get(firstCharsInScripts.size() - 1);
446 
447         // We make a sorted array of elements.
448         // Some of the input may be redundant.
449         // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
450         // We filter out those cases.
451         for (String item : initialLabels) {
452             boolean checkDistinct;
453             if (!UTF16.hasMoreCodePointsThan(item, 1)) {
454                 checkDistinct = false;
455             } else if(item.charAt(item.length() - 1) == '*' &&
456                     item.charAt(item.length() - 2) != '*') {
457                 // Use a label if it is marked with one trailing star,
458                 // even if the label string sorts the same when all contractions are suppressed.
459                 item = item.substring(0, item.length() - 1);
460                 checkDistinct = false;
461             } else {
462                 checkDistinct = true;
463             }
464             if (collatorPrimaryOnly.compare(item, firstScriptBoundary) < 0) {
465                 // Ignore a primary-ignorable or non-alphabetic index character.
466             } else if (collatorPrimaryOnly.compare(item, overflowBoundary) >= 0) {
467                 // Ignore an index character that will land in the overflow bucket.
468             } else if (checkDistinct && collatorPrimaryOnly.compare(item, separated(item)) == 0) {
469                 // Ignore a multi-code point index character that does not sort distinctly
470                 // from the sequence of its separate characters.
471             } else {
472                 int insertionPoint = Collections.binarySearch(indexCharacters, item, collatorPrimaryOnly);
473                 if (insertionPoint < 0) {
474                     indexCharacters.add(~insertionPoint, item);
475                 } else {
476                     String itemAlreadyIn = indexCharacters.get(insertionPoint);
477                     if (isOneLabelBetterThanOther(nfkdNormalizer, item, itemAlreadyIn)) {
478                         indexCharacters.set(insertionPoint, item);
479                     }
480                 }
481             }
482         }
483 
484         // if the result is still too large, cut down to maxLabelCount elements, by removing every nth element
485 
486         final int size = indexCharacters.size() - 1;
487         if (size > maxLabelCount) {
488             int count = 0;
489             int old = -1;
490             for (Iterator<String> it = indexCharacters.iterator(); it.hasNext();) {
491                 ++count;
492                 it.next();
493                 final int bump = count * maxLabelCount / size;
494                 if (bump == old) {
495                     it.remove();
496                 } else {
497                     old = bump;
498                 }
499             }
500         }
501 
502         return indexCharacters;
503     }
504 
fixLabel(String current)505     private static String fixLabel(String current) {
506         if (!current.startsWith(BASE)) {
507             return current;
508         }
509         int rest = current.charAt(BASE.length());
510         if (0x2800 < rest && rest <= 0x28FF) { // stroke count
511             return (rest-0x2800) + "\u5283";
512         }
513         return current.substring(BASE.length());
514     }
515 
516     /**
517      * This method is called to get the index exemplars. Normally these come from the locale directly,
518      * but if they aren't available, we have to synthesize them.
519      */
addIndexExemplars(ULocale locale)520     private void addIndexExemplars(ULocale locale) {
521         UnicodeSet exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_INDEX);
522         if (exemplars != null) {
523             initialLabels.addAll(exemplars);
524             return;
525         }
526 
527         // The locale data did not include explicit Index characters.
528         // Synthesize a set of them from the locale's standard exemplar characters.
529         exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_STANDARD);
530 
531         exemplars = exemplars.cloneAsThawed();
532         // question: should we add auxiliary exemplars?
533         if (exemplars.containsSome('a', 'z') || exemplars.size() == 0) {
534             exemplars.addAll('a', 'z');
535         }
536         if (exemplars.containsSome(0xAC00, 0xD7A3)) {  // Hangul syllables
537             // cut down to small list
538             exemplars.remove(0xAC00, 0xD7A3).
539                 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
540                 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
541                 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
542                 add(0xD30C).add(0xD558);
543         }
544         if (exemplars.containsSome(0x1200, 0x137F)) {  // Ethiopic block
545             // cut down to small list
546             // make use of the fact that Ethiopic is allocated in 8's, where
547             // the base is 0 mod 8.
548             UnicodeSet ethiopic = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
549             UnicodeSetIterator it = new UnicodeSetIterator(ethiopic);
550             while (it.next() && it.codepoint != UnicodeSetIterator.IS_STRING) {
551                 if ((it.codepoint & 0x7) != 0) {
552                     exemplars.remove(it.codepoint);
553                 }
554             }
555         }
556 
557         // Upper-case any that aren't already so.
558         //   (We only do this for synthesized index characters.)
559         for (String item : exemplars) {
560             initialLabels.add(UCharacter.toUpperCase(locale, item));
561         }
562     }
563 
564     /**
565      * Add Chinese index characters from the tailoring.
566      */
addChineseIndexCharacters()567     private boolean addChineseIndexCharacters() {
568         UnicodeSet contractions = new UnicodeSet();
569         try {
570             collatorPrimaryOnly.internalAddContractions(BASE.charAt(0), contractions);
571         } catch (Exception e) {
572             return false;
573         }
574         if (contractions.isEmpty()) { return false; }
575         initialLabels.addAll(contractions);
576         for (String s : contractions) {
577             assert(s.startsWith(BASE));
578             char c = s.charAt(s.length() - 1);
579             if (0x41 <= c && c <= 0x5A) {  // A-Z
580                 // There are Pinyin labels, add ASCII A-Z labels as well.
581                 initialLabels.add(0x41, 0x5A);  // A-Z
582                 break;
583             }
584         }
585         return true;
586     }
587 
588     /**
589      * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
590      * <p>This is used to test whether contractions sort differently from their components.
591      */
separated(String item)592     private String separated(String item) {
593         StringBuilder result = new StringBuilder();
594         // add a CGJ except within surrogates
595         char last = item.charAt(0);
596         result.append(last);
597         for (int i = 1; i < item.length(); ++i) {
598             char ch = item.charAt(i);
599             if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) {
600                 result.append(CGJ);
601             }
602             result.append(ch);
603             last = ch;
604         }
605         return result.toString();
606     }
607 
608     /**
609      * Builds an immutable, thread-safe version of this instance, without data records.
610      *
611      * @return an immutable index instance
612      * @stable ICU 51
613      */
buildImmutableIndex()614     public ImmutableIndex<V> buildImmutableIndex() {
615         // The current AlphabeticIndex Java code never modifies the bucket list once built.
616         // If it contains no records, we can use it.
617         // addRecord() sets buckets=null rather than inserting the new record into it.
618         BucketList<V> immutableBucketList;
619         if (inputList != null && !inputList.isEmpty()) {
620             // We need a bucket list with no records.
621             immutableBucketList = createBucketList();
622         } else {
623             if (buckets == null) {
624                 buckets = createBucketList();
625             }
626             immutableBucketList = buckets;
627         }
628         return new ImmutableIndex<V>(immutableBucketList, collatorPrimaryOnly);
629     }
630 
631     /**
632      * Get the labels.
633      *
634      * @return The list of bucket labels, after processing.
635      * @stable ICU 4.8
636      */
getBucketLabels()637     public List<String> getBucketLabels() {
638         initBuckets();
639         ArrayList<String> result = new ArrayList<String>();
640         for (Bucket<V> bucket : buckets) {
641             result.add(bucket.getLabel());
642         }
643         return result;
644     }
645 
646     /**
647      * Get a clone of the collator used internally. Note that for performance reasons, the clone is only done once, and
648      * then stored. The next time it is accessed, the same instance is returned.
649      * <p>
650      * <b><i>Don't use this method across threads if you are changing the settings on the collator, at least not without
651      * synchronizing.</i></b>
652      *
653      * @return a clone of the collator used internally
654      * @stable ICU 4.8
655      */
getCollator()656     public RuleBasedCollator getCollator() {
657         if (collatorExternal == null) {
658             try {
659                 collatorExternal = (RuleBasedCollator) (collatorOriginal.clone());
660             } catch (Exception e) {
661                 // should never happen
662                 throw new IllegalStateException("Collator cannot be cloned", e);
663             }
664         }
665         return collatorExternal;
666     }
667 
668     /**
669      * Add a record (name and data) to the index. The name will be used to sort the items into buckets, and to sort
670      * within the bucket. Two records may have the same name. When they do, the sort order is according to the order added:
671      * the first added comes first.
672      *
673      * @param name
674      *            Name, such as a name
675      * @param data
676      *            Data, such as an address or link
677      * @return this, for chaining
678      * @stable ICU 4.8
679      */
addRecord(CharSequence name, V data)680     public AlphabeticIndex<V> addRecord(CharSequence name, V data) {
681         // TODO instead of invalidating, just add to unprocessed list.
682         buckets = null; // invalidate old bucketlist
683         if (inputList == null) {
684             inputList = new ArrayList<Record<V>>();
685         }
686         inputList.add(new Record<V>(name, data));
687         return this;
688     }
689 
690     /**
691      * Get the bucket number for the given name. This routine permits callers to implement their own bucket handling
692      * mechanisms, including client-server handling. For example, when a new name is created on the client, it can ask
693      * the server for the bucket for that name, and the sortkey (using getCollator). Once the client has that
694      * information, it can put the name into the right bucket, and sort it within that bucket, without having access to
695      * the index or collator.
696      * <p>
697      * Note that the bucket number (and sort key) are only valid for the settings of the current AlphabeticIndex; if
698      * those are changed, then the bucket number and sort key must be regenerated.
699      *
700      * @param name
701      *            Name, such as a name
702      * @return the bucket index for the name
703      * @stable ICU 4.8
704      */
getBucketIndex(CharSequence name)705     public int getBucketIndex(CharSequence name) {
706         initBuckets();
707         return buckets.getBucketIndex(name, collatorPrimaryOnly);
708     }
709 
710     /**
711      * Clear the index.
712      *
713      * @return this, for chaining
714      * @stable ICU 4.8
715      */
clearRecords()716     public AlphabeticIndex<V> clearRecords() {
717         if (inputList != null && !inputList.isEmpty()) {
718             inputList.clear();
719             buckets = null;
720         }
721         return this;
722     }
723 
724     /**
725      * Return the number of buckets in the index. This will be the same as the number of labels, plus buckets for the underflow, overflow, and inflow(s).
726      *
727      * @return number of buckets
728      * @stable ICU 4.8
729      */
getBucketCount()730     public int getBucketCount() {
731         initBuckets();
732         return buckets.getBucketCount();
733     }
734 
735     /**
736      * Return the number of records in the index: that is, the total number of distinct <name,data> pairs added with addRecord(...), over all the buckets.
737      *
738      * @return total number of records in buckets
739      * @stable ICU 4.8
740      */
getRecordCount()741     public int getRecordCount() {
742         return inputList != null ? inputList.size() : 0;
743     }
744 
745     /**
746      * Return an iterator over the buckets.
747      *
748      * @return iterator over buckets.
749      * @stable ICU 4.8
750      */
iterator()751     public Iterator<Bucket<V>> iterator() {
752         initBuckets();
753         return buckets.iterator();
754     }
755 
756     /**
757      * Creates an index, and buckets and sorts the list of records into the index.
758      */
initBuckets()759     private void initBuckets() {
760         if (buckets != null) {
761             return;
762         }
763         buckets = createBucketList();
764         if (inputList == null || inputList.isEmpty()) {
765             return;
766         }
767 
768         // Sort the records by name.
769         // Stable sort preserves input order of collation duplicates.
770         Collections.sort(inputList, recordComparator);
771 
772         // Now, we traverse all of the input, which is now sorted.
773         // If the item doesn't go in the current bucket, we find the next bucket that contains it.
774         // This makes the process order n*log(n), since we just sort the list and then do a linear process.
775         // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
776         // we need to improve it for that case.
777 
778         Iterator<Bucket<V>> bucketIterator = buckets.fullIterator();
779         Bucket<V> currentBucket = bucketIterator.next();
780         Bucket<V> nextBucket;
781         String upperBoundary;
782         if (bucketIterator.hasNext()) {
783             nextBucket = bucketIterator.next();
784             upperBoundary = nextBucket.lowerBoundary;
785         } else {
786             nextBucket = null;
787             upperBoundary = null;
788         }
789         for (Record<V> r : inputList) {
790             // if the current bucket isn't the right one, find the one that is
791             // We have a special flag for the last bucket so that we don't look any further
792             while (upperBoundary != null &&
793                     collatorPrimaryOnly.compare(r.name, upperBoundary) >= 0) {
794                 currentBucket = nextBucket;
795                 // now reset the boundary that we compare against
796                 if (bucketIterator.hasNext()) {
797                     nextBucket = bucketIterator.next();
798                     upperBoundary = nextBucket.lowerBoundary;
799                 } else {
800                     upperBoundary = null;
801                 }
802             }
803             // now put the record into the bucket.
804             Bucket<V> bucket = currentBucket;
805             if (bucket.displayBucket != null) {
806                 bucket = bucket.displayBucket;
807             }
808             if (bucket.records == null) {
809                 bucket.records = new ArrayList<Record<V>>();
810             }
811             bucket.records.add(r);
812         }
813     }
814 
815     private int maxLabelCount = 99;
816 
817     /**
818      * Returns true if one index character string is "better" than the other.
819      * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
820      * better, and otherwise binary-less-than is better.
821      */
isOneLabelBetterThanOther(Normalizer2 nfkdNormalizer, String one, String other)822     private static boolean isOneLabelBetterThanOther(Normalizer2 nfkdNormalizer, String one, String other) {
823         // This is called with primary-equal strings, but never with one.equals(other).
824         String n1 = nfkdNormalizer.normalize(one);
825         String n2 = nfkdNormalizer.normalize(other);
826         int result = n1.codePointCount(0, n1.length()) - n2.codePointCount(0, n2.length());
827         if (result != 0) {
828             return result < 0;
829         }
830         result = binaryCmp.compare(n1, n2);
831         if (result != 0) {
832             return result < 0;
833         }
834         return binaryCmp.compare(one, other) < 0;
835     }
836 
837     /**
838      * A (name, data) pair, to be sorted by name into one of the index buckets.
839      * The user data is not used by the index implementation.
840      *
841      * @stable ICU 4.8
842      */
843     public static class Record<V> {
844         private final CharSequence name;
845         private final V data;
846 
Record(CharSequence name, V data)847         private Record(CharSequence name, V data) {
848             this.name = name;
849             this.data = data;
850         }
851 
852         /**
853          * Get the name
854          *
855          * @return the name
856          * @stable ICU 4.8
857          */
getName()858         public CharSequence getName() {
859             return name;
860         }
861 
862         /**
863          * Get the data
864          *
865          * @return the data
866          * @stable ICU 4.8
867          */
getData()868         public V getData() {
869             return data;
870         }
871 
872         /**
873          * Standard toString()
874          * @stable ICU 4.8
875          */
toString()876         public String toString() {
877             return name + "=" + data;
878         }
879     }
880 
881     /**
882      * An index "bucket" with a label string and type.
883      * It is referenced by {@link AlphabeticIndex#getBucketIndex(CharSequence)}
884      * and {@link AlphabeticIndex.ImmutableIndex#getBucketIndex(CharSequence)},
885      * returned by {@link AlphabeticIndex.ImmutableIndex#getBucket(int)},
886      * and {@link AlphabeticIndex#addRecord(CharSequence, Object)} adds a record
887      * into a bucket according to the record's name.
888      *
889      * @param <V>
890      *            Data type
891      * @stable ICU 4.8
892      */
893     public static class Bucket<V> implements Iterable<Record<V>> {
894         private final String label;
895         private final String lowerBoundary;
896         private final LabelType labelType;
897         private Bucket<V> displayBucket;
898         private int displayIndex;
899         private List<Record<V>> records;
900 
901         /**
902          * Type of the label
903          *
904          * @stable ICU 4.8
905          */
906         public enum LabelType {
907             /**
908              * Normal
909              * @stable ICU 4.8
910              */
911             NORMAL,
912             /**
913              * Underflow (before the first)
914              * @stable ICU 4.8
915              */
916             UNDERFLOW,
917             /**
918              * Inflow (between scripts)
919              * @stable ICU 4.8
920              */
921             INFLOW,
922             /**
923              * Overflow (after the last)
924              * @stable ICU 4.8
925              */
926             OVERFLOW
927         }
928 
929         /**
930          * Set up the bucket.
931          *
932          * @param label
933          *            label for the bucket
934          * @param labelType
935          *            is an underflow, overflow, or inflow bucket
936          * @stable ICU 4.8
937          */
Bucket(String label, String lowerBoundary, LabelType labelType)938         private Bucket(String label, String lowerBoundary, LabelType labelType) {
939             this.label = label;
940             this.lowerBoundary = lowerBoundary;
941             this.labelType = labelType;
942         }
943 
944         /**
945          * Get the label
946          *
947          * @return label for the bucket
948          * @stable ICU 4.8
949          */
getLabel()950         public String getLabel() {
951             return label;
952         }
953 
954         /**
955          * Is a normal, underflow, overflow, or inflow bucket
956          *
957          * @return is an underflow, overflow, or inflow bucket
958          * @stable ICU 4.8
959          */
getLabelType()960         public LabelType getLabelType() {
961             return labelType;
962         }
963 
964         /**
965          * Get the number of records in the bucket.
966          *
967          * @return number of records in bucket
968          * @stable ICU 4.8
969          */
size()970         public int size() {
971             return records == null ? 0 : records.size();
972         }
973 
974         /**
975          * Iterator over the records in the bucket
976          * @stable ICU 4.8
977          */
iterator()978         public Iterator<Record<V>> iterator() {
979             if (records == null) {
980                 return Collections.<Record<V>>emptyList().iterator();
981             }
982             return records.iterator();
983         }
984 
985         /**
986          * Standard toString()
987          * @stable ICU 4.8
988          */
989         @Override
toString()990         public String toString() {
991             return "{" +
992             "labelType=" + labelType
993             + ", " +
994             "lowerBoundary=" + lowerBoundary
995             + ", " +
996             "label=" + label
997             + "}"
998             ;
999         }
1000     }
1001 
createBucketList()1002     private BucketList<V> createBucketList() {
1003         // Initialize indexCharacters.
1004         List<String> indexCharacters = initLabels();
1005 
1006         // Variables for hasMultiplePrimaryWeights().
1007         long variableTop;
1008         if (collatorPrimaryOnly.isAlternateHandlingShifted()) {
1009             variableTop = collatorPrimaryOnly.getVariableTop() & 0xffffffffL;
1010         } else {
1011             variableTop = 0;
1012         }
1013         boolean hasInvisibleBuckets = false;
1014 
1015         // Helper arrays for Chinese Pinyin collation.
1016         @SuppressWarnings({ "rawtypes", "unchecked" })
1017         Bucket<V>[] asciiBuckets = new Bucket[26];
1018         @SuppressWarnings({ "rawtypes", "unchecked" })
1019         Bucket<V>[] pinyinBuckets = new Bucket[26];
1020         boolean hasPinyin = false;
1021 
1022         ArrayList<Bucket<V>> bucketList = new ArrayList<Bucket<V>>();
1023 
1024         // underflow bucket
1025         bucketList.add(new Bucket<V>(getUnderflowLabel(), "", LabelType.UNDERFLOW));
1026 
1027         // fix up the list, adding underflow, additions, overflow
1028         // Insert inflow labels as needed.
1029         int scriptIndex = -1;
1030         String scriptUpperBoundary = "";
1031         for (String current : indexCharacters) {
1032             if (collatorPrimaryOnly.compare(current, scriptUpperBoundary) >= 0) {
1033                 // We crossed the script boundary into a new script.
1034                 String inflowBoundary = scriptUpperBoundary;
1035                 boolean skippedScript = false;
1036                 for (;;) {
1037                     scriptUpperBoundary = firstCharsInScripts.get(++scriptIndex);
1038                     if (collatorPrimaryOnly.compare(current, scriptUpperBoundary) < 0) {
1039                         break;
1040                     }
1041                     skippedScript = true;
1042                 }
1043                 if (skippedScript && bucketList.size() > 1) {
1044                     // We are skipping one or more scripts,
1045                     // and we are not just getting out of the underflow label.
1046                     bucketList.add(new Bucket<V>(getInflowLabel(), inflowBoundary,
1047                             LabelType.INFLOW));
1048                 }
1049             }
1050             // Add a bucket with the current label.
1051             Bucket<V> bucket = new Bucket<V>(fixLabel(current), current, LabelType.NORMAL);
1052             bucketList.add(bucket);
1053             // Remember ASCII and Pinyin buckets for Pinyin redirects.
1054             char c;
1055             if (current.length() == 1 && 'A' <= (c = current.charAt(0)) && c <= 'Z') {
1056                 asciiBuckets[c - 'A'] = bucket;
1057             } else if (current.length() == BASE.length() + 1 && current.startsWith(BASE) &&
1058                     'A' <= (c = current.charAt(BASE.length())) && c <= 'Z') {
1059                 pinyinBuckets[c - 'A'] = bucket;
1060                 hasPinyin = true;
1061             }
1062             // Check for multiple primary weights.
1063             if (!current.startsWith(BASE) &&
1064                     hasMultiplePrimaryWeights(collatorPrimaryOnly, variableTop, current) &&
1065                     !current.endsWith("\uffff")) {
1066                 // "Æ" or "Sch" etc.
1067                 for (int i = bucketList.size() - 2;; --i) {
1068                     Bucket<V> singleBucket = bucketList.get(i);
1069                     if (singleBucket.labelType != LabelType.NORMAL) {
1070                         // There is no single-character bucket since the last
1071                         // underflow or inflow label.
1072                         break;
1073                     }
1074                     if (singleBucket.displayBucket == null &&
1075                             !hasMultiplePrimaryWeights(collatorPrimaryOnly, variableTop, singleBucket.lowerBoundary)) {
1076                         // Add an invisible bucket that redirects strings greater than the expansion
1077                         // to the previous single-character bucket.
1078                         // For example, after ... Q R S Sch we add Sch\uFFFF->S
1079                         // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
1080                         bucket = new Bucket<V>("", current + "\uFFFF", LabelType.NORMAL);
1081                         bucket.displayBucket = singleBucket;
1082                         bucketList.add(bucket);
1083                         hasInvisibleBuckets = true;
1084                         break;
1085                     }
1086                 }
1087             }
1088         }
1089         if (bucketList.size() == 1) {
1090             // No real labels, show only the underflow label.
1091             return new BucketList<V>(bucketList, bucketList);
1092         }
1093         // overflow bucket
1094         bucketList.add(new Bucket<V>(getOverflowLabel(), scriptUpperBoundary, LabelType.OVERFLOW)); // final
1095 
1096         if (hasPinyin) {
1097             // Redirect Pinyin buckets.
1098             Bucket<V> asciiBucket = null;
1099             for (int i = 0; i < 26; ++i) {
1100                 if (asciiBuckets[i] != null) {
1101                     asciiBucket = asciiBuckets[i];
1102                 }
1103                 if (pinyinBuckets[i] != null && asciiBucket != null) {
1104                     pinyinBuckets[i].displayBucket = asciiBucket;
1105                     hasInvisibleBuckets = true;
1106                 }
1107             }
1108         }
1109 
1110         if (!hasInvisibleBuckets) {
1111             return new BucketList<V>(bucketList, bucketList);
1112         }
1113         // Merge inflow buckets that are visually adjacent.
1114         // Iterate backwards: Merge inflow into overflow rather than the other way around.
1115         int i = bucketList.size() - 1;
1116         Bucket<V> nextBucket = bucketList.get(i);
1117         while (--i > 0) {
1118             Bucket<V> bucket = bucketList.get(i);
1119             if (bucket.displayBucket != null) {
1120                 continue;  // skip invisible buckets
1121             }
1122             if (bucket.labelType == LabelType.INFLOW) {
1123                 if (nextBucket.labelType != LabelType.NORMAL) {
1124                     bucket.displayBucket = nextBucket;
1125                     continue;
1126                 }
1127             }
1128             nextBucket = bucket;
1129         }
1130 
1131         ArrayList<Bucket<V>> publicBucketList = new ArrayList<Bucket<V>>();
1132         for (Bucket<V> bucket : bucketList) {
1133             if (bucket.displayBucket == null) {
1134                 publicBucketList.add(bucket);
1135             }
1136         }
1137         return new BucketList<V>(bucketList, publicBucketList);
1138     }
1139 
1140     private static class BucketList<V> implements Iterable<Bucket<V>> {
1141         private final ArrayList<Bucket<V>> bucketList;
1142         private final List<Bucket<V>> immutableVisibleList;
1143 
BucketList(ArrayList<Bucket<V>> bucketList, ArrayList<Bucket<V>> publicBucketList)1144         private BucketList(ArrayList<Bucket<V>> bucketList, ArrayList<Bucket<V>> publicBucketList) {
1145             this.bucketList = bucketList;
1146 
1147             int displayIndex = 0;
1148             for (Bucket<V> bucket : publicBucketList) {
1149                 bucket.displayIndex = displayIndex++;
1150             }
1151             immutableVisibleList = Collections.unmodifiableList(publicBucketList);
1152         }
1153 
getBucketCount()1154         private int getBucketCount() {
1155             return immutableVisibleList.size();
1156         }
1157 
getBucketIndex(CharSequence name, Collator collatorPrimaryOnly)1158         private int getBucketIndex(CharSequence name, Collator collatorPrimaryOnly) {
1159             // binary search
1160             int start = 0;
1161             int limit = bucketList.size();
1162             while ((start + 1) < limit) {
1163                 int i = (start + limit) / 2;
1164                 Bucket<V> bucket = bucketList.get(i);
1165                 int nameVsBucket = collatorPrimaryOnly.compare(name, bucket.lowerBoundary);
1166                 if (nameVsBucket < 0) {
1167                     limit = i;
1168                 } else {
1169                     start = i;
1170                 }
1171             }
1172             Bucket<V> bucket = bucketList.get(start);
1173             if (bucket.displayBucket != null) {
1174                 bucket = bucket.displayBucket;
1175             }
1176             return bucket.displayIndex;
1177         }
1178 
1179         /**
1180          * Private iterator over all the buckets, visible and invisible
1181          */
fullIterator()1182         private Iterator<Bucket<V>> fullIterator() {
1183             return bucketList.iterator();
1184         }
1185 
1186         /**
1187          * Iterator over just the visible buckets.
1188          */
iterator()1189         public Iterator<Bucket<V>> iterator() {
1190             return immutableVisibleList.iterator(); // use immutable list to prevent remove().
1191         }
1192     }
1193 
hasMultiplePrimaryWeights( RuleBasedCollator coll, long variableTop, String s)1194     private static boolean hasMultiplePrimaryWeights(
1195             RuleBasedCollator coll, long variableTop, String s) {
1196         long[] ces = coll.internalGetCEs(s);
1197         boolean seenPrimary = false;
1198         for (int i = 0; i < ces.length; ++i) {
1199             long ce = ces[i];
1200             long p = ce >>> 32;
1201             if (p > variableTop) {
1202                 // not primary ignorable
1203                 if (seenPrimary) {
1204                     return true;
1205                 }
1206                 seenPrimary = true;
1207             }
1208         }
1209         return false;
1210     }
1211 
1212     // TODO: Surely we have at least a ticket for porting these mask values to UCharacter.java?!
1213     private static final int GC_LU_MASK = 1 << UCharacter.UPPERCASE_LETTER;
1214     private static final int GC_LL_MASK = 1 << UCharacter.LOWERCASE_LETTER;
1215     private static final int GC_LT_MASK = 1 << UCharacter.TITLECASE_LETTER;
1216     private static final int GC_LM_MASK = 1 << UCharacter.MODIFIER_LETTER;
1217     private static final int GC_LO_MASK = 1 << UCharacter.OTHER_LETTER;
1218     private static final int GC_L_MASK =
1219             GC_LU_MASK|GC_LL_MASK|GC_LT_MASK|GC_LM_MASK|GC_LO_MASK;
1220     private static final int GC_CN_MASK = 1 << UCharacter.GENERAL_OTHER_TYPES;
1221 
1222     /**
1223      * Return a list of the first character in each script. Only exposed for testing.
1224      *
1225      * @return list of first characters in each script
1226      * @internal
1227      * @deprecated This API is ICU internal, only for testing.
1228      */
1229     @Deprecated
getFirstCharactersInScripts()1230     public List<String> getFirstCharactersInScripts() {
1231         List<String> dest = new ArrayList<String>(200);
1232         // Fetch the script-first-primary contractions which are defined in the root collator.
1233         // They all start with U+FDD1.
1234         UnicodeSet set = new UnicodeSet();
1235         collatorPrimaryOnly.internalAddContractions(0xFDD1, set);
1236         if (set.isEmpty()) {
1237             throw new UnsupportedOperationException(
1238                     "AlphabeticIndex requires script-first-primary contractions");
1239         }
1240         for (String boundary : set) {
1241             int gcMask = 1 << UCharacter.getType(boundary.codePointAt(1));
1242             if ((gcMask & (GC_L_MASK | GC_CN_MASK)) == 0) {
1243                 // Ignore boundaries for the special reordering groups.
1244                 // Take only those for "real scripts" (where the sample character is a Letter,
1245                 // and the one for unassigned implicit weights (Cn).
1246                 continue;
1247             }
1248             dest.add(boundary);
1249         }
1250         return dest;
1251     }
1252 }
1253