1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  *******************************************************************************
5  * Copyright (C) 2008-2016, Google Inc, International Business Machines Corporation
6  * and others. All Rights Reserved.
7  *******************************************************************************
8  */
9 package com.ibm.icu.text;
10 
11 import java.util.ArrayList;
12 import java.util.Collections;
13 import java.util.Comparator;
14 import java.util.Iterator;
15 import java.util.List;
16 import java.util.Locale;
17 
18 import com.ibm.icu.lang.UCharacter;
19 import com.ibm.icu.text.AlphabeticIndex.Bucket;
20 import com.ibm.icu.text.AlphabeticIndex.Bucket.LabelType;
21 import com.ibm.icu.util.LocaleData;
22 import com.ibm.icu.util.ULocale;
23 
24 /**
25  * AlphabeticIndex supports the creation of a UI index appropriate for a given language.
26  * It can support either direct use, or use with a client that doesn't support localized collation.
27  * The following is an example of what an index might look like in a UI:
28  *
29  * <pre>
30  *  <b>... A B C D E F G H I J K L M N O P Q R S T U V W X Y Z  ...</b>
31  *
32  *  <b>A</b>
33  *     Addison
34  *     Albertson
35  *     Azensky
36  *  <b>B</b>
37  *     Baecker
38  *  ...
39  * </pre>
40  *
41  * The class can generate a list of labels for use as a UI "index", that is, a list of
42  * clickable characters (or character sequences) that allow the user to see a segment
43  * (bucket) of a larger "target" list. That is, each label corresponds to a bucket in
44  * the target list, where everything in the bucket is greater than or equal to the character
45  * (according to the locale's collation). Strings can be added to the index;
46  * they will be in sorted order in the right bucket.
47  * <p>
48  * The class also supports having buckets for strings before the first (underflow),
49  * after the last (overflow), and between scripts (inflow). For example, if the index
50  * is constructed with labels for Russian and English, Greek characters would fall
51  * into an inflow bucket between the other two scripts.
52  *
53  * <p><em>Note:</em> If you expect to have a lot of ASCII or Latin characters
54  * as well as characters from the user's language,
55  * then it is a good idea to call addLabels(ULocale.English).
56  *
57  * <h2>Direct Use</h2>
58  * <p>The following shows an example of building an index directly.
59  *  The "show..." methods below are just to illustrate usage.
60  *
61  * <pre>
62  * // Create a simple index where the values for the strings are Integers, and add the strings
63  *
64  * AlphabeticIndex&lt;Integer&gt; index = new AlphabeticIndex&lt;Integer&gt;(desiredLocale).addLabels(additionalLocale);
65  * int counter = 0;
66  * for (String item : test) {
67  *     index.addRecord(item, counter++);
68  * }
69  * ...
70  * // Show index at top. We could skip or gray out empty buckets
71  *
72  * for (AlphabeticIndex.Bucket&lt;Integer&gt; bucket : index) {
73  *     if (showAll || bucket.size() != 0) {
74  *         showLabelAtTop(UI, bucket.getLabel());
75  *     }
76  * }
77  *  ...
78  * // Show the buckets with their contents, skipping empty buckets
79  *
80  * for (AlphabeticIndex.Bucket&lt;Integer&gt; bucket : index) {
81  *     if (bucket.size() != 0) {
82  *         showLabelInList(UI, bucket.getLabel());
83  *         for (AlphabeticIndex.Record&lt;Integer&gt; item : bucket) {
84  *             showIndexedItem(UI, item.getName(), item.getData());
85  *         }
86  * </pre>
87  *
88  * The caller can build different UIs using this class.
89  * For example, an index character could be omitted or grayed-out
90  * if its bucket is empty. Small buckets could also be combined based on size, such as:
91  *
92  * <pre>
93  * <b>... A-F G-N O-Z ...</b>
94  * </pre>
95  *
96  * <h2>Client Support</h2>
97  * <p>Callers can also use the {@link AlphabeticIndex.ImmutableIndex}, or the AlphabeticIndex itself,
98  * to support sorting on a client that doesn't support AlphabeticIndex functionality.
99  *
100  * <p>The ImmutableIndex is both immutable and thread-safe.
101  * The corresponding AlphabeticIndex methods are not thread-safe because
102  * they "lazily" build the index buckets.
103  * <ul>
104  * <li>ImmutableIndex.getBucket(index) provides random access to all
105  *     buckets and their labels and label types.
106  * <li>AlphabeticIndex.getBucketLabels() or the bucket iterator on either class
107  *     can be used to get a list of the labels,
108  *     such as "...", "A", "B",..., and send that list to the client.
109  * <li>When the client has a new name, it sends that name to the server.
110  * The server needs to call the following methods,
111  * and communicate the bucketIndex and collationKey back to the client.
112  *
113  * <pre>
114  * int bucketIndex = index.getBucketIndex(name);
115  * String label = immutableIndex.getBucket(bucketIndex).getLabel();  // optional
116  * RawCollationKey collationKey = collator.getRawCollationKey(name, null);
117  * </pre>
118  *
119  * <li>The client would put the name (and associated information) into its bucket for bucketIndex. The collationKey is a
120  * sequence of bytes that can be compared with a binary compare, and produce the right localized result.</li>
121  * </ul>
122  *
123  * @author Mark Davis
124  * @stable ICU 4.8
125  */
126 public final class AlphabeticIndex<V> implements Iterable<Bucket<V>> {
127     /**
128      * Prefix string for Chinese index buckets.
129      * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
130      */
131     private static final String BASE = "\uFDD0";
132 
133     private static final char CGJ = '\u034F';
134 
135     private static final Comparator<String> binaryCmp = new UTF16.StringComparator(true, false, 0);
136 
137     private final RuleBasedCollator collatorOriginal;
138     private final RuleBasedCollator collatorPrimaryOnly;
139     private RuleBasedCollator collatorExternal;
140 
141     // Comparator for records, so that the Record class can be static.
142     private final Comparator<Record<V>> recordComparator = new Comparator<Record<V>>() {
143         @Override
144         public int compare(Record<V> o1, Record<V> o2) {
145             return collatorOriginal.compare(o1.name, o2.name);
146         }
147     };
148 
149     private final List<String> firstCharsInScripts;
150 
151     // We accumulate these as we build up the input parameters
152     private final UnicodeSet initialLabels = new UnicodeSet();
153     private List<Record<V>> inputList;
154 
155     // Lazy evaluated: null means that we have not built yet.
156     private BucketList<V> buckets;
157 
158     private String overflowLabel = "\u2026";
159     private String underflowLabel = "\u2026";
160     private String inflowLabel = "\u2026";
161 
162     /**
163      * Immutable, thread-safe version of {@link AlphabeticIndex}.
164      * This class provides thread-safe methods for bucketing,
165      * and random access to buckets and their properties,
166      * but does not offer adding records to the index.
167      *
168      * @param <V> The Record value type is unused. It can be omitted for this class
169      * if it was omitted for the AlphabeticIndex that built it.
170      * @stable ICU 51
171      */
172     public static final class ImmutableIndex<V> implements Iterable<Bucket<V>> {
173         private final BucketList<V> buckets;
174         private final Collator collatorPrimaryOnly;
175 
ImmutableIndex(BucketList<V> bucketList, Collator collatorPrimaryOnly)176         private ImmutableIndex(BucketList<V> bucketList, Collator collatorPrimaryOnly) {
177             this.buckets = bucketList;
178             this.collatorPrimaryOnly = collatorPrimaryOnly;
179         }
180 
181         /**
182          * Returns the number of index buckets and labels, including underflow/inflow/overflow.
183          *
184          * @return the number of index buckets
185          * @stable ICU 51
186          */
getBucketCount()187         public int getBucketCount() {
188             return buckets.getBucketCount();
189         }
190 
191         /**
192          * Finds the index bucket for the given name and returns the number of that bucket.
193          * Use {@link #getBucket(int)} to get the bucket's properties.
194          *
195          * @param name the string to be sorted into an index bucket
196          * @return the bucket number for the name
197          * @stable ICU 51
198          */
getBucketIndex(CharSequence name)199         public int getBucketIndex(CharSequence name) {
200             return buckets.getBucketIndex(name, collatorPrimaryOnly);
201         }
202 
203         /**
204          * Returns the index-th bucket. Returns null if the index is out of range.
205          *
206          * @param index bucket number
207          * @return the index-th bucket
208          * @stable ICU 51
209          */
getBucket(int index)210         public Bucket<V> getBucket(int index) {
211             if (0 <= index && index < buckets.getBucketCount()) {
212                 return buckets.immutableVisibleList.get(index);
213             } else {
214                 return null;
215             }
216         }
217 
218         /**
219          * {@inheritDoc}
220          * @stable ICU 51
221          */
222         @Override
iterator()223         public Iterator<Bucket<V>> iterator() {
224             return buckets.iterator();
225         }
226     }
227 
228     /**
229      * Create the index object.
230      *
231      * @param locale
232      *            The locale for the index.
233      * @stable ICU 4.8
234      */
AlphabeticIndex(ULocale locale)235     public AlphabeticIndex(ULocale locale) {
236         this(locale, null);
237     }
238 
239     /**
240      * Create the index object.
241      *
242      * @param locale
243      *            The locale for the index.
244      * @stable ICU 4.8
245      */
AlphabeticIndex(Locale locale)246     public AlphabeticIndex(Locale locale) {
247         this(ULocale.forLocale(locale), null);
248     }
249 
250     /**
251      * Create an AlphabeticIndex that uses a specific collator.
252      *
253      * <p>The index will be created with no labels; the addLabels() function must be called
254      * after creation to add the desired labels to the index.
255      *
256      * <p>The index will work directly with the supplied collator. If the caller will need to
257      * continue working with the collator it should be cloned first, so that the
258      * collator provided to the AlphabeticIndex remains unchanged after creation of the index.
259      *
260      * @param collator The collator to use to order the contents of this index.
261      * @stable ICU 51
262      */
AlphabeticIndex(RuleBasedCollator collator)263     public AlphabeticIndex(RuleBasedCollator collator) {
264         this(null, collator);
265     }
266 
267     /**
268      * Internal constructor containing implementation used by public constructors.
269      */
AlphabeticIndex(ULocale locale, RuleBasedCollator collator)270     private AlphabeticIndex(ULocale locale, RuleBasedCollator collator) {
271         collatorOriginal = collator != null ? collator : (RuleBasedCollator) Collator.getInstance(locale);
272         try {
273             collatorPrimaryOnly = collatorOriginal.cloneAsThawed();
274         } catch (Exception e) {
275             // should never happen
276             throw new IllegalStateException("Collator cannot be cloned", e);
277         }
278         collatorPrimaryOnly.setStrength(Collator.PRIMARY);
279         collatorPrimaryOnly.freeze();
280 
281         firstCharsInScripts = getFirstCharactersInScripts();
282         Collections.sort(firstCharsInScripts, collatorPrimaryOnly);
283         // Guard against a degenerate collator where
284         // some script boundary strings are primary ignorable.
285         for (;;) {
286             if (firstCharsInScripts.isEmpty()) {
287                 throw new IllegalArgumentException(
288                         "AlphabeticIndex requires some non-ignorable script boundary strings");
289             }
290             if (collatorPrimaryOnly.compare(firstCharsInScripts.get(0), "") == 0) {
291                 firstCharsInScripts.remove(0);
292             } else {
293                 break;
294             }
295         }
296 
297         // Chinese index characters, which are specific to each of the several Chinese tailorings,
298         // take precedence over the single locale data exemplar set per language.
299         if (!addChineseIndexCharacters() && locale != null) {
300             addIndexExemplars(locale);
301         }
302     }
303 
304     /**
305      * Add more index characters (aside from what are in the locale)
306      * @param additions additional characters to add to the index, such as A-Z.
307      * @return this, for chaining
308      * @stable ICU 4.8
309      */
addLabels(UnicodeSet additions)310     public AlphabeticIndex<V> addLabels(UnicodeSet additions) {
311         initialLabels.addAll(additions);
312         buckets = null;
313         return this;
314     }
315 
316     /**
317      * Add more index characters (aside from what are in the locale)
318      * @param additions additional characters to add to the index, such as those in Swedish.
319      * @return this, for chaining
320      * @stable ICU 4.8
321      */
addLabels(ULocale... additions)322     public AlphabeticIndex<V> addLabels(ULocale... additions) {
323         for (ULocale addition : additions) {
324             addIndexExemplars(addition);
325         }
326         buckets = null;
327         return this;
328     }
329 
330     /**
331      * Add more index characters (aside from what are in the locale)
332      * @param additions additional characters to add to the index, such as those in Swedish.
333      * @return this, for chaining
334      * @stable ICU 4.8
335      */
addLabels(Locale... additions)336     public AlphabeticIndex<V> addLabels(Locale... additions) {
337         for (Locale addition : additions) {
338             addIndexExemplars(ULocale.forLocale(addition));
339         }
340         buckets = null;
341         return this;
342     }
343 
344     /**
345      * Set the overflow label
346      * @param overflowLabel see class description
347      * @return this, for chaining
348      * @stable ICU 4.8
349      */
setOverflowLabel(String overflowLabel)350     public AlphabeticIndex<V> setOverflowLabel(String overflowLabel) {
351         this.overflowLabel = overflowLabel;
352         buckets = null;
353         return this;
354     }
355 
356     /**
357      * Get the default label used in the IndexCharacters' locale for underflow, eg the last item in: X Y Z ...
358      *
359      * @return underflow label
360      * @stable ICU 4.8
361      */
getUnderflowLabel()362     public String getUnderflowLabel() {
363         return underflowLabel; // TODO get localized version
364     }
365 
366 
367     /**
368      * Set the underflowLabel label
369      * @param underflowLabel see class description
370      * @return this, for chaining
371      * @stable ICU 4.8
372      */
setUnderflowLabel(String underflowLabel)373     public AlphabeticIndex<V> setUnderflowLabel(String underflowLabel) {
374         this.underflowLabel = underflowLabel;
375         buckets = null;
376         return this;
377     }
378 
379     /**
380      * Get the default label used in the IndexCharacters' locale for overflow, eg the first item in: ... A B C
381      *
382      * @return overflow label
383      * @stable ICU 4.8
384      */
getOverflowLabel()385     public String getOverflowLabel() {
386         return overflowLabel; // TODO get localized version
387     }
388 
389 
390     /**
391      * Set the inflowLabel label
392      * @param inflowLabel see class description
393      * @return this, for chaining
394      * @stable ICU 4.8
395      */
setInflowLabel(String inflowLabel)396     public AlphabeticIndex<V> setInflowLabel(String inflowLabel) {
397         this.inflowLabel = inflowLabel;
398         buckets = null;
399         return this;
400     }
401 
402     /**
403      * Get the default label used for abbreviated buckets <i>between</i> other labels. For example, consider the labels
404      * for Latin and Greek are used: X Y Z ... &#x0391; &#x0392; &#x0393;.
405      *
406      * @return inflow label
407      * @stable ICU 4.8
408      */
getInflowLabel()409     public String getInflowLabel() {
410         return inflowLabel; // TODO get localized version
411     }
412 
413 
414     /**
415      * Get the limit on the number of labels in the index. The number of buckets can be slightly larger: see getBucketCount().
416      *
417      * @return maxLabelCount maximum number of labels.
418      * @stable ICU 4.8
419      */
getMaxLabelCount()420     public int getMaxLabelCount() {
421         return maxLabelCount;
422     }
423 
424     /**
425      * Set a limit on the number of labels in the index. The number of buckets can be slightly larger: see
426      * getBucketCount().
427      *
428      * @param maxLabelCount Set the maximum number of labels. Currently, if the number is exceeded, then every
429      *         nth item is removed to bring the count down. A more sophisticated mechanism may be available in the
430      *         future.
431      * @return this, for chaining
432      * @stable ICU 4.8
433      */
setMaxLabelCount(int maxLabelCount)434     public AlphabeticIndex<V> setMaxLabelCount(int maxLabelCount) {
435         this.maxLabelCount = maxLabelCount;
436         buckets = null;
437         return this;
438     }
439 
440     /**
441      * Determine the best labels to use. This is based on the exemplars, but we also process to make sure that they are unique,
442      * and sort differently, and that the overall list is small enough.
443      */
initLabels()444     private List<String> initLabels() {
445         Normalizer2 nfkdNormalizer = Normalizer2.getNFKDInstance();
446         List<String> indexCharacters = new ArrayList<String>();
447 
448         String firstScriptBoundary = firstCharsInScripts.get(0);
449         String overflowBoundary = firstCharsInScripts.get(firstCharsInScripts.size() - 1);
450 
451         // We make a sorted array of elements.
452         // Some of the input may be redundant.
453         // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
454         // We filter out those cases.
455         for (String item : initialLabels) {
456             boolean checkDistinct;
457             if (!UTF16.hasMoreCodePointsThan(item, 1)) {
458                 checkDistinct = false;
459             } else if(item.charAt(item.length() - 1) == '*' &&
460                     item.charAt(item.length() - 2) != '*') {
461                 // Use a label if it is marked with one trailing star,
462                 // even if the label string sorts the same when all contractions are suppressed.
463                 item = item.substring(0, item.length() - 1);
464                 checkDistinct = false;
465             } else {
466                 checkDistinct = true;
467             }
468             if (collatorPrimaryOnly.compare(item, firstScriptBoundary) < 0) {
469                 // Ignore a primary-ignorable or non-alphabetic index character.
470             } else if (collatorPrimaryOnly.compare(item, overflowBoundary) >= 0) {
471                 // Ignore an index character that will land in the overflow bucket.
472             } else if (checkDistinct && collatorPrimaryOnly.compare(item, separated(item)) == 0) {
473                 // Ignore a multi-code point index character that does not sort distinctly
474                 // from the sequence of its separate characters.
475             } else {
476                 int insertionPoint = Collections.binarySearch(indexCharacters, item, collatorPrimaryOnly);
477                 if (insertionPoint < 0) {
478                     indexCharacters.add(~insertionPoint, item);
479                 } else {
480                     String itemAlreadyIn = indexCharacters.get(insertionPoint);
481                     if (isOneLabelBetterThanOther(nfkdNormalizer, item, itemAlreadyIn)) {
482                         indexCharacters.set(insertionPoint, item);
483                     }
484                 }
485             }
486         }
487 
488         // if the result is still too large, cut down to maxLabelCount elements, by removing every nth element
489 
490         final int size = indexCharacters.size() - 1;
491         if (size > maxLabelCount) {
492             int count = 0;
493             int old = -1;
494             for (Iterator<String> it = indexCharacters.iterator(); it.hasNext();) {
495                 ++count;
496                 it.next();
497                 final int bump = count * maxLabelCount / size;
498                 if (bump == old) {
499                     it.remove();
500                 } else {
501                     old = bump;
502                 }
503             }
504         }
505 
506         return indexCharacters;
507     }
508 
fixLabel(String current)509     private static String fixLabel(String current) {
510         if (!current.startsWith(BASE)) {
511             return current;
512         }
513         int rest = current.charAt(BASE.length());
514         if (0x2800 < rest && rest <= 0x28FF) { // stroke count
515             return (rest-0x2800) + "\u5283";
516         }
517         return current.substring(BASE.length());
518     }
519 
520     /**
521      * This method is called to get the index exemplars. Normally these come from the locale directly,
522      * but if they aren't available, we have to synthesize them.
523      */
addIndexExemplars(ULocale locale)524     private void addIndexExemplars(ULocale locale) {
525         UnicodeSet exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_INDEX);
526         if (exemplars != null && !exemplars.isEmpty()) {
527             initialLabels.addAll(exemplars);
528             return;
529         }
530 
531         // The locale data did not include explicit Index characters.
532         // Synthesize a set of them from the locale's standard exemplar characters.
533         exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_STANDARD);
534 
535         exemplars = exemplars.cloneAsThawed();
536         // question: should we add auxiliary exemplars?
537         if (exemplars.containsSome('a', 'z') || exemplars.isEmpty()) {
538             exemplars.addAll('a', 'z');
539         }
540         if (exemplars.containsSome(0xAC00, 0xD7A3)) {  // Hangul syllables
541             // cut down to small list
542             exemplars.remove(0xAC00, 0xD7A3).
543                 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
544                 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
545                 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
546                 add(0xD30C).add(0xD558);
547         }
548         if (exemplars.containsSome(0x1200, 0x137F)) {  // Ethiopic block
549             // cut down to small list
550             // make use of the fact that Ethiopic is allocated in 8's, where
551             // the base is 0 mod 8.
552             UnicodeSet ethiopic = new UnicodeSet("[ሀለሐመሠረሰሸቀቈቐቘበቨተቸኀኈነኘአከኰኸዀወዐዘዠየደዸጀገጐጘጠጨጰጸፀፈፐፘ]");
553             ethiopic.retainAll(exemplars);
554             exemplars.remove('ሀ', 0x137F).addAll(ethiopic);
555         }
556 
557         // Upper-case any that aren't already so.
558         //   (We only do this for synthesized index characters.)
559         for (String item : exemplars) {
560             initialLabels.add(UCharacter.toUpperCase(locale, item));
561         }
562     }
563 
564     /**
565      * Add Chinese index characters from the tailoring.
566      */
addChineseIndexCharacters()567     private boolean addChineseIndexCharacters() {
568         UnicodeSet contractions = new UnicodeSet();
569         try {
570             collatorPrimaryOnly.internalAddContractions(BASE.charAt(0), contractions);
571         } catch (Exception e) {
572             return false;
573         }
574         if (contractions.isEmpty()) { return false; }
575         initialLabels.addAll(contractions);
576         for (String s : contractions) {
577             assert(s.startsWith(BASE));
578             char c = s.charAt(s.length() - 1);
579             if (0x41 <= c && c <= 0x5A) {  // A-Z
580                 // There are Pinyin labels, add ASCII A-Z labels as well.
581                 initialLabels.add(0x41, 0x5A);  // A-Z
582                 break;
583             }
584         }
585         return true;
586     }
587 
588     /**
589      * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
590      * <p>This is used to test whether contractions sort differently from their components.
591      */
separated(String item)592     private String separated(String item) {
593         StringBuilder result = new StringBuilder();
594         // add a CGJ except within surrogates
595         char last = item.charAt(0);
596         result.append(last);
597         for (int i = 1; i < item.length(); ++i) {
598             char ch = item.charAt(i);
599             if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) {
600                 result.append(CGJ);
601             }
602             result.append(ch);
603             last = ch;
604         }
605         return result.toString();
606     }
607 
608     /**
609      * Builds an immutable, thread-safe version of this instance, without data records.
610      *
611      * @return an immutable index instance
612      * @stable ICU 51
613      */
buildImmutableIndex()614     public ImmutableIndex<V> buildImmutableIndex() {
615         // The current AlphabeticIndex Java code never modifies the bucket list once built.
616         // If it contains no records, we can use it.
617         // addRecord() sets buckets=null rather than inserting the new record into it.
618         BucketList<V> immutableBucketList;
619         if (inputList != null && !inputList.isEmpty()) {
620             // We need a bucket list with no records.
621             immutableBucketList = createBucketList();
622         } else {
623             if (buckets == null) {
624                 buckets = createBucketList();
625             }
626             immutableBucketList = buckets;
627         }
628         return new ImmutableIndex<V>(immutableBucketList, collatorPrimaryOnly);
629     }
630 
631     /**
632      * Get the labels.
633      *
634      * @return The list of bucket labels, after processing.
635      * @stable ICU 4.8
636      */
getBucketLabels()637     public List<String> getBucketLabels() {
638         initBuckets();
639         ArrayList<String> result = new ArrayList<String>();
640         for (Bucket<V> bucket : buckets) {
641             result.add(bucket.getLabel());
642         }
643         return result;
644     }
645 
646     /**
647      * Get a clone of the collator used internally. Note that for performance reasons, the clone is only done once, and
648      * then stored. The next time it is accessed, the same instance is returned.
649      * <p>
650      * <b><i>Don't use this method across threads if you are changing the settings on the collator, at least not without
651      * synchronizing.</i></b>
652      *
653      * @return a clone of the collator used internally
654      * @stable ICU 4.8
655      */
getCollator()656     public RuleBasedCollator getCollator() {
657         if (collatorExternal == null) {
658             try {
659                 collatorExternal = (RuleBasedCollator) (collatorOriginal.clone());
660             } catch (Exception e) {
661                 // should never happen
662                 throw new IllegalStateException("Collator cannot be cloned", e);
663             }
664         }
665         return collatorExternal;
666     }
667 
668     /**
669      * Add a record (name and data) to the index. The name will be used to sort the items into buckets, and to sort
670      * within the bucket. Two records may have the same name. When they do, the sort order is according to the order added:
671      * the first added comes first.
672      *
673      * @param name
674      *            Name, such as a name
675      * @param data
676      *            Data, such as an address or link
677      * @return this, for chaining
678      * @stable ICU 4.8
679      */
addRecord(CharSequence name, V data)680     public AlphabeticIndex<V> addRecord(CharSequence name, V data) {
681         // TODO instead of invalidating, just add to unprocessed list.
682         buckets = null; // invalidate old bucketlist
683         if (inputList == null) {
684             inputList = new ArrayList<Record<V>>();
685         }
686         inputList.add(new Record<V>(name, data));
687         return this;
688     }
689 
690     /**
691      * Get the bucket number for the given name. This routine permits callers to implement their own bucket handling
692      * mechanisms, including client-server handling. For example, when a new name is created on the client, it can ask
693      * the server for the bucket for that name, and the sortkey (using getCollator). Once the client has that
694      * information, it can put the name into the right bucket, and sort it within that bucket, without having access to
695      * the index or collator.
696      * <p>
697      * Note that the bucket number (and sort key) are only valid for the settings of the current AlphabeticIndex; if
698      * those are changed, then the bucket number and sort key must be regenerated.
699      *
700      * @param name
701      *            Name, such as a name
702      * @return the bucket index for the name
703      * @stable ICU 4.8
704      */
getBucketIndex(CharSequence name)705     public int getBucketIndex(CharSequence name) {
706         initBuckets();
707         return buckets.getBucketIndex(name, collatorPrimaryOnly);
708     }
709 
710     /**
711      * Clear the index.
712      *
713      * @return this, for chaining
714      * @stable ICU 4.8
715      */
clearRecords()716     public AlphabeticIndex<V> clearRecords() {
717         if (inputList != null && !inputList.isEmpty()) {
718             inputList.clear();
719             buckets = null;
720         }
721         return this;
722     }
723 
724     /**
725      * Return the number of buckets in the index. This will be the same as the number of labels, plus buckets for the underflow, overflow, and inflow(s).
726      *
727      * @return number of buckets
728      * @stable ICU 4.8
729      */
getBucketCount()730     public int getBucketCount() {
731         initBuckets();
732         return buckets.getBucketCount();
733     }
734 
735     /**
736      * Return the number of records in the index: that is, the total number of distinct &lt;name,data&gt; pairs added with addRecord(...), over all the buckets.
737      *
738      * @return total number of records in buckets
739      * @stable ICU 4.8
740      */
getRecordCount()741     public int getRecordCount() {
742         return inputList != null ? inputList.size() : 0;
743     }
744 
745     /**
746      * Return an iterator over the buckets.
747      *
748      * @return iterator over buckets.
749      * @stable ICU 4.8
750      */
751     @Override
iterator()752     public Iterator<Bucket<V>> iterator() {
753         initBuckets();
754         return buckets.iterator();
755     }
756 
757     /**
758      * Creates an index, and buckets and sorts the list of records into the index.
759      */
initBuckets()760     private void initBuckets() {
761         if (buckets != null) {
762             return;
763         }
764         buckets = createBucketList();
765         if (inputList == null || inputList.isEmpty()) {
766             return;
767         }
768 
769         // Sort the records by name.
770         // Stable sort preserves input order of collation duplicates.
771         Collections.sort(inputList, recordComparator);
772 
773         // Now, we traverse all of the input, which is now sorted.
774         // If the item doesn't go in the current bucket, we find the next bucket that contains it.
775         // This makes the process order n*log(n), since we just sort the list and then do a linear process.
776         // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
777         // we need to improve it for that case.
778 
779         Iterator<Bucket<V>> bucketIterator = buckets.fullIterator();
780         Bucket<V> currentBucket = bucketIterator.next();
781         Bucket<V> nextBucket;
782         String upperBoundary;
783         if (bucketIterator.hasNext()) {
784             nextBucket = bucketIterator.next();
785             upperBoundary = nextBucket.lowerBoundary;
786         } else {
787             nextBucket = null;
788             upperBoundary = null;
789         }
790         for (Record<V> r : inputList) {
791             // if the current bucket isn't the right one, find the one that is
792             // We have a special flag for the last bucket so that we don't look any further
793             while (upperBoundary != null &&
794                     collatorPrimaryOnly.compare(r.name, upperBoundary) >= 0) {
795                 currentBucket = nextBucket;
796                 // now reset the boundary that we compare against
797                 if (bucketIterator.hasNext()) {
798                     nextBucket = bucketIterator.next();
799                     upperBoundary = nextBucket.lowerBoundary;
800                 } else {
801                     upperBoundary = null;
802                 }
803             }
804             // now put the record into the bucket.
805             Bucket<V> bucket = currentBucket;
806             if (bucket.displayBucket != null) {
807                 bucket = bucket.displayBucket;
808             }
809             if (bucket.records == null) {
810                 bucket.records = new ArrayList<Record<V>>();
811             }
812             bucket.records.add(r);
813         }
814     }
815 
816     private int maxLabelCount = 99;
817 
818     /**
819      * Returns true if one index character string is "better" than the other.
820      * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
821      * better, and otherwise binary-less-than is better.
822      */
isOneLabelBetterThanOther(Normalizer2 nfkdNormalizer, String one, String other)823     private static boolean isOneLabelBetterThanOther(Normalizer2 nfkdNormalizer, String one, String other) {
824         // This is called with primary-equal strings, but never with one.equals(other).
825         String n1 = nfkdNormalizer.normalize(one);
826         String n2 = nfkdNormalizer.normalize(other);
827         int result = n1.codePointCount(0, n1.length()) - n2.codePointCount(0, n2.length());
828         if (result != 0) {
829             return result < 0;
830         }
831         result = binaryCmp.compare(n1, n2);
832         if (result != 0) {
833             return result < 0;
834         }
835         return binaryCmp.compare(one, other) < 0;
836     }
837 
838     /**
839      * A (name, data) pair, to be sorted by name into one of the index buckets.
840      * The user data is not used by the index implementation.
841      *
842      * @stable ICU 4.8
843      */
844     public static class Record<V> {
845         private final CharSequence name;
846         private final V data;
847 
Record(CharSequence name, V data)848         private Record(CharSequence name, V data) {
849             this.name = name;
850             this.data = data;
851         }
852 
853         /**
854          * Get the name
855          *
856          * @return the name
857          * @stable ICU 4.8
858          */
getName()859         public CharSequence getName() {
860             return name;
861         }
862 
863         /**
864          * Get the data
865          *
866          * @return the data
867          * @stable ICU 4.8
868          */
getData()869         public V getData() {
870             return data;
871         }
872 
873         /**
874          * Standard toString()
875          * @stable ICU 4.8
876          */
877         @Override
toString()878         public String toString() {
879             return name + "=" + data;
880         }
881     }
882 
883     /**
884      * An index "bucket" with a label string and type.
885      * It is referenced by {@link AlphabeticIndex#getBucketIndex(CharSequence)}
886      * and {@link AlphabeticIndex.ImmutableIndex#getBucketIndex(CharSequence)},
887      * returned by {@link AlphabeticIndex.ImmutableIndex#getBucket(int)},
888      * and {@link AlphabeticIndex#addRecord(CharSequence, Object)} adds a record
889      * into a bucket according to the record's name.
890      *
891      * @param <V>
892      *            Data type
893      * @stable ICU 4.8
894      */
895     public static class Bucket<V> implements Iterable<Record<V>> {
896         private final String label;
897         private final String lowerBoundary;
898         private final LabelType labelType;
899         private Bucket<V> displayBucket;
900         private int displayIndex;
901         private List<Record<V>> records;
902 
903         /**
904          * Type of the label
905          *
906          * @stable ICU 4.8
907          */
908         public enum LabelType {
909             /**
910              * Normal
911              * @stable ICU 4.8
912              */
913             NORMAL,
914             /**
915              * Underflow (before the first)
916              * @stable ICU 4.8
917              */
918             UNDERFLOW,
919             /**
920              * Inflow (between scripts)
921              * @stable ICU 4.8
922              */
923             INFLOW,
924             /**
925              * Overflow (after the last)
926              * @stable ICU 4.8
927              */
928             OVERFLOW
929         }
930 
931         /**
932          * Set up the bucket.
933          *
934          * @param label
935          *            label for the bucket
936          * @param labelType
937          *            is an underflow, overflow, or inflow bucket
938          * @stable ICU 4.8
939          */
Bucket(String label, String lowerBoundary, LabelType labelType)940         private Bucket(String label, String lowerBoundary, LabelType labelType) {
941             this.label = label;
942             this.lowerBoundary = lowerBoundary;
943             this.labelType = labelType;
944         }
945 
946         /**
947          * Get the label
948          *
949          * @return label for the bucket
950          * @stable ICU 4.8
951          */
getLabel()952         public String getLabel() {
953             return label;
954         }
955 
956         /**
957          * Is a normal, underflow, overflow, or inflow bucket
958          *
959          * @return is an underflow, overflow, or inflow bucket
960          * @stable ICU 4.8
961          */
getLabelType()962         public LabelType getLabelType() {
963             return labelType;
964         }
965 
966         /**
967          * Get the number of records in the bucket.
968          *
969          * @return number of records in bucket
970          * @stable ICU 4.8
971          */
size()972         public int size() {
973             return records == null ? 0 : records.size();
974         }
975 
976         /**
977          * Iterator over the records in the bucket
978          * @stable ICU 4.8
979          */
980         @Override
iterator()981         public Iterator<Record<V>> iterator() {
982             if (records == null) {
983                 return Collections.<Record<V>>emptyList().iterator();
984             }
985             return records.iterator();
986         }
987 
988         /**
989          * Standard toString()
990          * @stable ICU 4.8
991          */
992         @Override
toString()993         public String toString() {
994             return "{" +
995             "labelType=" + labelType
996             + ", " +
997             "lowerBoundary=" + lowerBoundary
998             + ", " +
999             "label=" + label
1000             + "}"
1001             ;
1002         }
1003     }
1004 
createBucketList()1005     private BucketList<V> createBucketList() {
1006         // Initialize indexCharacters.
1007         List<String> indexCharacters = initLabels();
1008 
1009         // Variables for hasMultiplePrimaryWeights().
1010         long variableTop;
1011         if (collatorPrimaryOnly.isAlternateHandlingShifted()) {
1012             variableTop = collatorPrimaryOnly.getVariableTop() & 0xffffffffL;
1013         } else {
1014             variableTop = 0;
1015         }
1016         boolean hasInvisibleBuckets = false;
1017 
1018         // Helper arrays for Chinese Pinyin collation.
1019         @SuppressWarnings({ "rawtypes", "unchecked" })
1020         Bucket<V>[] asciiBuckets = new Bucket[26];
1021         @SuppressWarnings({ "rawtypes", "unchecked" })
1022         Bucket<V>[] pinyinBuckets = new Bucket[26];
1023         boolean hasPinyin = false;
1024 
1025         ArrayList<Bucket<V>> bucketList = new ArrayList<Bucket<V>>();
1026 
1027         // underflow bucket
1028         bucketList.add(new Bucket<V>(getUnderflowLabel(), "", LabelType.UNDERFLOW));
1029 
1030         // fix up the list, adding underflow, additions, overflow
1031         // Insert inflow labels as needed.
1032         int scriptIndex = -1;
1033         String scriptUpperBoundary = "";
1034         for (String current : indexCharacters) {
1035             if (collatorPrimaryOnly.compare(current, scriptUpperBoundary) >= 0) {
1036                 // We crossed the script boundary into a new script.
1037                 String inflowBoundary = scriptUpperBoundary;
1038                 boolean skippedScript = false;
1039                 for (;;) {
1040                     scriptUpperBoundary = firstCharsInScripts.get(++scriptIndex);
1041                     if (collatorPrimaryOnly.compare(current, scriptUpperBoundary) < 0) {
1042                         break;
1043                     }
1044                     skippedScript = true;
1045                 }
1046                 if (skippedScript && bucketList.size() > 1) {
1047                     // We are skipping one or more scripts,
1048                     // and we are not just getting out of the underflow label.
1049                     bucketList.add(new Bucket<V>(getInflowLabel(), inflowBoundary,
1050                             LabelType.INFLOW));
1051                 }
1052             }
1053             // Add a bucket with the current label.
1054             Bucket<V> bucket = new Bucket<V>(fixLabel(current), current, LabelType.NORMAL);
1055             bucketList.add(bucket);
1056             // Remember ASCII and Pinyin buckets for Pinyin redirects.
1057             char c;
1058             if (current.length() == 1 && 'A' <= (c = current.charAt(0)) && c <= 'Z') {
1059                 asciiBuckets[c - 'A'] = bucket;
1060             } else if (current.length() == BASE.length() + 1 && current.startsWith(BASE) &&
1061                     'A' <= (c = current.charAt(BASE.length())) && c <= 'Z') {
1062                 pinyinBuckets[c - 'A'] = bucket;
1063                 hasPinyin = true;
1064             }
1065             // Check for multiple primary weights.
1066             if (!current.startsWith(BASE) &&
1067                     hasMultiplePrimaryWeights(collatorPrimaryOnly, variableTop, current) &&
1068                     !current.endsWith("\uffff")) {
1069                 // "Æ" or "Sch" etc.
1070                 for (int i = bucketList.size() - 2;; --i) {
1071                     Bucket<V> singleBucket = bucketList.get(i);
1072                     if (singleBucket.labelType != LabelType.NORMAL) {
1073                         // There is no single-character bucket since the last
1074                         // underflow or inflow label.
1075                         break;
1076                     }
1077                     if (singleBucket.displayBucket == null &&
1078                             !hasMultiplePrimaryWeights(collatorPrimaryOnly, variableTop, singleBucket.lowerBoundary)) {
1079                         // Add an invisible bucket that redirects strings greater than the expansion
1080                         // to the previous single-character bucket.
1081                         // For example, after ... Q R S Sch we add Sch\uFFFF->S
1082                         // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
1083                         bucket = new Bucket<V>("", current + "\uFFFF", LabelType.NORMAL);
1084                         bucket.displayBucket = singleBucket;
1085                         bucketList.add(bucket);
1086                         hasInvisibleBuckets = true;
1087                         break;
1088                     }
1089                 }
1090             }
1091         }
1092         if (bucketList.size() == 1) {
1093             // No real labels, show only the underflow label.
1094             return new BucketList<V>(bucketList, bucketList);
1095         }
1096         // overflow bucket
1097         bucketList.add(new Bucket<V>(getOverflowLabel(), scriptUpperBoundary, LabelType.OVERFLOW)); // final
1098 
1099         if (hasPinyin) {
1100             // Redirect Pinyin buckets.
1101             Bucket<V> asciiBucket = null;
1102             for (int i = 0; i < 26; ++i) {
1103                 if (asciiBuckets[i] != null) {
1104                     asciiBucket = asciiBuckets[i];
1105                 }
1106                 if (pinyinBuckets[i] != null && asciiBucket != null) {
1107                     pinyinBuckets[i].displayBucket = asciiBucket;
1108                     hasInvisibleBuckets = true;
1109                 }
1110             }
1111         }
1112 
1113         if (!hasInvisibleBuckets) {
1114             return new BucketList<V>(bucketList, bucketList);
1115         }
1116         // Merge inflow buckets that are visually adjacent.
1117         // Iterate backwards: Merge inflow into overflow rather than the other way around.
1118         int i = bucketList.size() - 1;
1119         Bucket<V> nextBucket = bucketList.get(i);
1120         while (--i > 0) {
1121             Bucket<V> bucket = bucketList.get(i);
1122             if (bucket.displayBucket != null) {
1123                 continue;  // skip invisible buckets
1124             }
1125             if (bucket.labelType == LabelType.INFLOW) {
1126                 if (nextBucket.labelType != LabelType.NORMAL) {
1127                     bucket.displayBucket = nextBucket;
1128                     continue;
1129                 }
1130             }
1131             nextBucket = bucket;
1132         }
1133 
1134         ArrayList<Bucket<V>> publicBucketList = new ArrayList<Bucket<V>>();
1135         for (Bucket<V> bucket : bucketList) {
1136             if (bucket.displayBucket == null) {
1137                 publicBucketList.add(bucket);
1138             }
1139         }
1140         return new BucketList<V>(bucketList, publicBucketList);
1141     }
1142 
1143     private static class BucketList<V> implements Iterable<Bucket<V>> {
1144         private final ArrayList<Bucket<V>> bucketList;
1145         private final List<Bucket<V>> immutableVisibleList;
1146 
BucketList(ArrayList<Bucket<V>> bucketList, ArrayList<Bucket<V>> publicBucketList)1147         private BucketList(ArrayList<Bucket<V>> bucketList, ArrayList<Bucket<V>> publicBucketList) {
1148             this.bucketList = bucketList;
1149 
1150             int displayIndex = 0;
1151             for (Bucket<V> bucket : publicBucketList) {
1152                 bucket.displayIndex = displayIndex++;
1153             }
1154             immutableVisibleList = Collections.unmodifiableList(publicBucketList);
1155         }
1156 
getBucketCount()1157         private int getBucketCount() {
1158             return immutableVisibleList.size();
1159         }
1160 
getBucketIndex(CharSequence name, Collator collatorPrimaryOnly)1161         private int getBucketIndex(CharSequence name, Collator collatorPrimaryOnly) {
1162             // binary search
1163             int start = 0;
1164             int limit = bucketList.size();
1165             while ((start + 1) < limit) {
1166                 int i = (start + limit) / 2;
1167                 Bucket<V> bucket = bucketList.get(i);
1168                 int nameVsBucket = collatorPrimaryOnly.compare(name, bucket.lowerBoundary);
1169                 if (nameVsBucket < 0) {
1170                     limit = i;
1171                 } else {
1172                     start = i;
1173                 }
1174             }
1175             Bucket<V> bucket = bucketList.get(start);
1176             if (bucket.displayBucket != null) {
1177                 bucket = bucket.displayBucket;
1178             }
1179             return bucket.displayIndex;
1180         }
1181 
1182         /**
1183          * Private iterator over all the buckets, visible and invisible
1184          */
fullIterator()1185         private Iterator<Bucket<V>> fullIterator() {
1186             return bucketList.iterator();
1187         }
1188 
1189         /**
1190          * Iterator over just the visible buckets.
1191          */
1192         @Override
iterator()1193         public Iterator<Bucket<V>> iterator() {
1194             return immutableVisibleList.iterator(); // use immutable list to prevent remove().
1195         }
1196     }
1197 
hasMultiplePrimaryWeights( RuleBasedCollator coll, long variableTop, String s)1198     private static boolean hasMultiplePrimaryWeights(
1199             RuleBasedCollator coll, long variableTop, String s) {
1200         long[] ces = coll.internalGetCEs(s);
1201         boolean seenPrimary = false;
1202         for (int i = 0; i < ces.length; ++i) {
1203             long ce = ces[i];
1204             long p = ce >>> 32;
1205             if (p > variableTop) {
1206                 // not primary ignorable
1207                 if (seenPrimary) {
1208                     return true;
1209                 }
1210                 seenPrimary = true;
1211             }
1212         }
1213         return false;
1214     }
1215 
1216     // TODO: Surely we have at least a ticket for porting these mask values to UCharacter.java?!
1217     private static final int GC_LU_MASK = 1 << UCharacter.UPPERCASE_LETTER;
1218     private static final int GC_LL_MASK = 1 << UCharacter.LOWERCASE_LETTER;
1219     private static final int GC_LT_MASK = 1 << UCharacter.TITLECASE_LETTER;
1220     private static final int GC_LM_MASK = 1 << UCharacter.MODIFIER_LETTER;
1221     private static final int GC_LO_MASK = 1 << UCharacter.OTHER_LETTER;
1222     private static final int GC_L_MASK =
1223             GC_LU_MASK|GC_LL_MASK|GC_LT_MASK|GC_LM_MASK|GC_LO_MASK;
1224     private static final int GC_CN_MASK = 1 << UCharacter.GENERAL_OTHER_TYPES;
1225 
1226     /**
1227      * Return a list of the first character in each script. Only exposed for testing.
1228      *
1229      * @return list of first characters in each script
1230      * @internal
1231      * @deprecated This API is ICU internal, only for testing.
1232      */
1233     @Deprecated
getFirstCharactersInScripts()1234     public List<String> getFirstCharactersInScripts() {
1235         List<String> dest = new ArrayList<String>(200);
1236         // Fetch the script-first-primary contractions which are defined in the root collator.
1237         // They all start with U+FDD1.
1238         UnicodeSet set = new UnicodeSet();
1239         collatorPrimaryOnly.internalAddContractions(0xFDD1, set);
1240         if (set.isEmpty()) {
1241             throw new UnsupportedOperationException(
1242                     "AlphabeticIndex requires script-first-primary contractions");
1243         }
1244         for (String boundary : set) {
1245             int gcMask = 1 << UCharacter.getType(boundary.codePointAt(1));
1246             if ((gcMask & (GC_L_MASK | GC_CN_MASK)) == 0) {
1247                 // Ignore boundaries for the special reordering groups.
1248                 // Take only those for "real scripts" (where the sample character is a Letter,
1249                 // and the one for unassigned implicit weights (Cn).
1250                 continue;
1251             }
1252             dest.add(boundary);
1253         }
1254         return dest;
1255     }
1256 }
1257