1 /* 2 ******************************************************************************* 3 * Copyright (C) 2008-2014, Google Inc, International Business Machines Corporation 4 * and others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 package com.ibm.icu.text; 8 9 import java.util.ArrayList; 10 import java.util.Collections; 11 import java.util.Comparator; 12 import java.util.Iterator; 13 import java.util.List; 14 import java.util.Locale; 15 16 import com.ibm.icu.lang.UCharacter; 17 import com.ibm.icu.text.AlphabeticIndex.Bucket; 18 import com.ibm.icu.text.AlphabeticIndex.Bucket.LabelType; 19 import com.ibm.icu.util.LocaleData; 20 import com.ibm.icu.util.ULocale; 21 22 /** 23 * AlphabeticIndex supports the creation of a UI index appropriate for a given language. 24 * It can support either direct use, or use with a client that doesn't support localized collation. 25 * The following is an example of what an index might look like in a UI: 26 * 27 * <pre> 28 * <b>... A B C D E F G H I J K L M N O P Q R S T U V W X Y Z ...</b> 29 * 30 * <b>A</b> 31 * Addison 32 * Albertson 33 * Azensky 34 * <b>B</b> 35 * Baecker 36 * ... 37 * </pre> 38 * 39 * The class can generate a list of labels for use as a UI "index", that is, a list of 40 * clickable characters (or character sequences) that allow the user to see a segment 41 * (bucket) of a larger "target" list. That is, each label corresponds to a bucket in 42 * the target list, where everything in the bucket is greater than or equal to the character 43 * (according to the locale's collation). Strings can be added to the index; 44 * they will be in sorted order in the right bucket.</p> 45 * <p> 46 * The class also supports having buckets for strings before the first (underflow), 47 * after the last (overflow), and between scripts (inflow). For example, if the index 48 * is constructed with labels for Russian and English, Greek characters would fall 49 * into an inflow bucket between the other two scripts.</p> 50 * 51 * <p><em>Note:</em> If you expect to have a lot of ASCII or Latin characters 52 * as well as characters from the user's language, 53 * then it is a good idea to call addLabels(ULocale.English).</p> 54 * 55 * <h2>Direct Use</h2> 56 * <p>The following shows an example of building an index directly. 57 * The "show..." methods below are just to illustrate usage. 58 * 59 * <pre> 60 * // Create a simple index where the values for the strings are Integers, and add the strings 61 * 62 * AlphabeticIndex<Integer> index = new AlphabeticIndex<Integer>(desiredLocale).addLabels(additionalLocale); 63 * int counter = 0; 64 * for (String item : test) { 65 * index.addRecord(item, counter++); 66 * } 67 * ... 68 * // Show index at top. We could skip or gray out empty buckets 69 * 70 * for (AlphabeticIndex.Bucket<Integer> bucket : index) { 71 * if (showAll || bucket.size() != 0) { 72 * showLabelAtTop(UI, bucket.getLabel()); 73 * } 74 * } 75 * ... 76 * // Show the buckets with their contents, skipping empty buckets 77 * 78 * for (AlphabeticIndex.Bucket<Integer> bucket : index) { 79 * if (bucket.size() != 0) { 80 * showLabelInList(UI, bucket.getLabel()); 81 * for (AlphabeticIndex.Record<Integer> item : bucket) { 82 * showIndexedItem(UI, item.getName(), item.getData()); 83 * } 84 * </pre> 85 * 86 * The caller can build different UIs using this class. 87 * For example, an index character could be omitted or grayed-out 88 * if its bucket is empty. Small buckets could also be combined based on size, such as: 89 * 90 * <pre> 91 * <b>... A-F G-N O-Z ...</b> 92 * </pre> 93 * 94 * <h2>Client Support</h2> 95 * <p>Callers can also use the {@link AlphabeticIndex.ImmutableIndex}, or the AlphabeticIndex itself, 96 * to support sorting on a client that doesn't support AlphabeticIndex functionality. 97 * 98 * <p>The ImmutableIndex is both immutable and thread-safe. 99 * The corresponding AlphabeticIndex methods are not thread-safe because 100 * they "lazily" build the index buckets. 101 * <ul> 102 * <li>ImmutableIndex.getBucket(index) provides random access to all 103 * buckets and their labels and label types. 104 * <li>AlphabeticIndex.getBucketLabels() or the bucket iterator on either class 105 * can be used to get a list of the labels, 106 * such as "...", "A", "B",..., and send that list to the client. 107 * <li>When the client has a new name, it sends that name to the server. 108 * The server needs to call the following methods, 109 * and communicate the bucketIndex and collationKey back to the client. 110 * 111 * <pre> 112 * int bucketIndex = index.getBucketIndex(name); 113 * String label = immutableIndex.getBucket(bucketIndex).getLabel(); // optional 114 * RawCollationKey collationKey = collator.getRawCollationKey(name, null); 115 * </pre> 116 * 117 * <li>The client would put the name (and associated information) into its bucket for bucketIndex. The collationKey is a 118 * sequence of bytes that can be compared with a binary compare, and produce the right localized result.</li> 119 * </ul> 120 * 121 * @author Mark Davis 122 * @stable ICU 4.8 123 */ 124 public final class AlphabeticIndex<V> implements Iterable<Bucket<V>> { 125 /** 126 * Prefix string for Chinese index buckets. 127 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes 128 */ 129 private static final String BASE = "\uFDD0"; 130 131 private static final char CGJ = '\u034F'; 132 133 private static final Comparator<String> binaryCmp = new UTF16.StringComparator(true, false, 0); 134 135 private final RuleBasedCollator collatorOriginal; 136 private final RuleBasedCollator collatorPrimaryOnly; 137 private RuleBasedCollator collatorExternal; 138 139 // Comparator for records, so that the Record class can be static. 140 private final Comparator<Record<V>> recordComparator = new Comparator<Record<V>>() { 141 public int compare(Record<V> o1, Record<V> o2) { 142 return collatorOriginal.compare(o1.name, o2.name); 143 } 144 }; 145 146 private final List<String> firstCharsInScripts; 147 148 // We accumulate these as we build up the input parameters 149 private final UnicodeSet initialLabels = new UnicodeSet(); 150 private List<Record<V>> inputList; 151 152 // Lazy evaluated: null means that we have not built yet. 153 private BucketList<V> buckets; 154 155 private String overflowLabel = "\u2026"; 156 private String underflowLabel = "\u2026"; 157 private String inflowLabel = "\u2026"; 158 159 /** 160 * Immutable, thread-safe version of {@link AlphabeticIndex}. 161 * This class provides thread-safe methods for bucketing, 162 * and random access to buckets and their properties, 163 * but does not offer adding records to the index. 164 * 165 * @param <V> The Record value type is unused. It can be omitted for this class 166 * if it was omitted for the AlphabeticIndex that built it. 167 * @stable ICU 51 168 */ 169 public static final class ImmutableIndex<V> implements Iterable<Bucket<V>> { 170 private final BucketList<V> buckets; 171 private final Collator collatorPrimaryOnly; 172 ImmutableIndex(BucketList<V> bucketList, Collator collatorPrimaryOnly)173 private ImmutableIndex(BucketList<V> bucketList, Collator collatorPrimaryOnly) { 174 this.buckets = bucketList; 175 this.collatorPrimaryOnly = collatorPrimaryOnly; 176 } 177 178 /** 179 * Returns the number of index buckets and labels, including underflow/inflow/overflow. 180 * 181 * @return the number of index buckets 182 * @stable ICU 51 183 */ getBucketCount()184 public int getBucketCount() { 185 return buckets.getBucketCount(); 186 } 187 188 /** 189 * Finds the index bucket for the given name and returns the number of that bucket. 190 * Use {@link #getBucket(int)} to get the bucket's properties. 191 * 192 * @param name the string to be sorted into an index bucket 193 * @return the bucket number for the name 194 * @stable ICU 51 195 */ getBucketIndex(CharSequence name)196 public int getBucketIndex(CharSequence name) { 197 return buckets.getBucketIndex(name, collatorPrimaryOnly); 198 } 199 200 /** 201 * Returns the index-th bucket. Returns null if the index is out of range. 202 * 203 * @param index bucket number 204 * @return the index-th bucket 205 * @stable ICU 51 206 */ getBucket(int index)207 public Bucket<V> getBucket(int index) { 208 if (0 <= index && index < buckets.getBucketCount()) { 209 return buckets.immutableVisibleList.get(index); 210 } else { 211 return null; 212 } 213 } 214 215 /** 216 * {@inheritDoc} 217 * @stable ICU 51 218 */ iterator()219 public Iterator<Bucket<V>> iterator() { 220 return buckets.iterator(); 221 } 222 } 223 224 /** 225 * Create the index object. 226 * 227 * @param locale 228 * The locale for the index. 229 * @stable ICU 4.8 230 */ AlphabeticIndex(ULocale locale)231 public AlphabeticIndex(ULocale locale) { 232 this(locale, null); 233 } 234 235 /** 236 * Create the index object. 237 * 238 * @param locale 239 * The locale for the index. 240 * @stable ICU 4.8 241 */ AlphabeticIndex(Locale locale)242 public AlphabeticIndex(Locale locale) { 243 this(ULocale.forLocale(locale), null); 244 } 245 246 /** 247 * Create an AlphabeticIndex that uses a specific collator. 248 * 249 * <p>The index will be created with no labels; the addLabels() function must be called 250 * after creation to add the desired labels to the index. 251 * 252 * <p>The index will work directly with the supplied collator. If the caller will need to 253 * continue working with the collator it should be cloned first, so that the 254 * collator provided to the AlphabeticIndex remains unchanged after creation of the index. 255 * 256 * @param collator The collator to use to order the contents of this index. 257 * @stable ICU 51 258 */ AlphabeticIndex(RuleBasedCollator collator)259 public AlphabeticIndex(RuleBasedCollator collator) { 260 this(null, collator); 261 } 262 263 /** 264 * Internal constructor containing implementation used by public constructors. 265 */ AlphabeticIndex(ULocale locale, RuleBasedCollator collator)266 private AlphabeticIndex(ULocale locale, RuleBasedCollator collator) { 267 collatorOriginal = collator != null ? collator : (RuleBasedCollator) Collator.getInstance(locale); 268 try { 269 collatorPrimaryOnly = collatorOriginal.cloneAsThawed(); 270 } catch (Exception e) { 271 // should never happen 272 throw new IllegalStateException("Collator cannot be cloned", e); 273 } 274 collatorPrimaryOnly.setStrength(Collator.PRIMARY); 275 collatorPrimaryOnly.freeze(); 276 277 firstCharsInScripts = getFirstCharactersInScripts(); 278 Collections.sort(firstCharsInScripts, collatorPrimaryOnly); 279 // Guard against a degenerate collator where 280 // some script boundary strings are primary ignorable. 281 for (;;) { 282 if (firstCharsInScripts.isEmpty()) { 283 throw new IllegalArgumentException( 284 "AlphabeticIndex requires some non-ignorable script boundary strings"); 285 } 286 if (collatorPrimaryOnly.compare(firstCharsInScripts.get(0), "") == 0) { 287 firstCharsInScripts.remove(0); 288 } else { 289 break; 290 } 291 } 292 293 // Chinese index characters, which are specific to each of the several Chinese tailorings, 294 // take precedence over the single locale data exemplar set per language. 295 if (!addChineseIndexCharacters() && locale != null) { 296 addIndexExemplars(locale); 297 } 298 } 299 300 /** 301 * Add more index characters (aside from what are in the locale) 302 * @param additions additional characters to add to the index, such as A-Z. 303 * @return this, for chaining 304 * @stable ICU 4.8 305 */ addLabels(UnicodeSet additions)306 public AlphabeticIndex<V> addLabels(UnicodeSet additions) { 307 initialLabels.addAll(additions); 308 buckets = null; 309 return this; 310 } 311 312 /** 313 * Add more index characters (aside from what are in the locale) 314 * @param additions additional characters to add to the index, such as those in Swedish. 315 * @return this, for chaining 316 * @stable ICU 4.8 317 */ addLabels(ULocale... additions)318 public AlphabeticIndex<V> addLabels(ULocale... additions) { 319 for (ULocale addition : additions) { 320 addIndexExemplars(addition); 321 } 322 buckets = null; 323 return this; 324 } 325 326 /** 327 * Add more index characters (aside from what are in the locale) 328 * @param additions additional characters to add to the index, such as those in Swedish. 329 * @return this, for chaining 330 * @stable ICU 4.8 331 */ addLabels(Locale... additions)332 public AlphabeticIndex<V> addLabels(Locale... additions) { 333 for (Locale addition : additions) { 334 addIndexExemplars(ULocale.forLocale(addition)); 335 } 336 buckets = null; 337 return this; 338 } 339 340 /** 341 * Set the overflow label 342 * @param overflowLabel see class description 343 * @return this, for chaining 344 * @stable ICU 4.8 345 */ setOverflowLabel(String overflowLabel)346 public AlphabeticIndex<V> setOverflowLabel(String overflowLabel) { 347 this.overflowLabel = overflowLabel; 348 buckets = null; 349 return this; 350 } 351 352 /** 353 * Get the default label used in the IndexCharacters' locale for underflow, eg the last item in: X Y Z ... 354 * 355 * @return underflow label 356 * @stable ICU 4.8 357 */ getUnderflowLabel()358 public String getUnderflowLabel() { 359 return underflowLabel; // TODO get localized version 360 } 361 362 363 /** 364 * Set the underflowLabel label 365 * @param underflowLabel see class description 366 * @return this, for chaining 367 * @stable ICU 4.8 368 */ setUnderflowLabel(String underflowLabel)369 public AlphabeticIndex<V> setUnderflowLabel(String underflowLabel) { 370 this.underflowLabel = underflowLabel; 371 buckets = null; 372 return this; 373 } 374 375 /** 376 * Get the default label used in the IndexCharacters' locale for overflow, eg the first item in: ... A B C 377 * 378 * @return overflow label 379 * @stable ICU 4.8 380 */ getOverflowLabel()381 public String getOverflowLabel() { 382 return overflowLabel; // TODO get localized version 383 } 384 385 386 /** 387 * Set the inflowLabel label 388 * @param inflowLabel see class description 389 * @return this, for chaining 390 * @stable ICU 4.8 391 */ setInflowLabel(String inflowLabel)392 public AlphabeticIndex<V> setInflowLabel(String inflowLabel) { 393 this.inflowLabel = inflowLabel; 394 buckets = null; 395 return this; 396 } 397 398 /** 399 * Get the default label used for abbreviated buckets <i>between</i> other labels. For example, consider the labels 400 * for Latin and Greek are used: X Y Z ... Α Β Γ. 401 * 402 * @return inflow label 403 * @stable ICU 4.8 404 */ getInflowLabel()405 public String getInflowLabel() { 406 return inflowLabel; // TODO get localized version 407 } 408 409 410 /** 411 * Get the limit on the number of labels in the index. The number of buckets can be slightly larger: see getBucketCount(). 412 * 413 * @return maxLabelCount maximum number of labels. 414 * @stable ICU 4.8 415 */ getMaxLabelCount()416 public int getMaxLabelCount() { 417 return maxLabelCount; 418 } 419 420 /** 421 * Set a limit on the number of labels in the index. The number of buckets can be slightly larger: see 422 * getBucketCount(). 423 * 424 * @param maxLabelCount Set the maximum number of labels. Currently, if the number is exceeded, then every 425 * nth item is removed to bring the count down. A more sophisticated mechanism may be available in the 426 * future. 427 * @return this, for chaining 428 * @stable ICU 4.8 429 */ setMaxLabelCount(int maxLabelCount)430 public AlphabeticIndex<V> setMaxLabelCount(int maxLabelCount) { 431 this.maxLabelCount = maxLabelCount; 432 buckets = null; 433 return this; 434 } 435 436 /** 437 * Determine the best labels to use. This is based on the exemplars, but we also process to make sure that they are unique, 438 * and sort differently, and that the overall list is small enough. 439 */ initLabels()440 private List<String> initLabels() { 441 Normalizer2 nfkdNormalizer = Normalizer2.getNFKDInstance(); 442 List<String> indexCharacters = new ArrayList<String>(); 443 444 String firstScriptBoundary = firstCharsInScripts.get(0); 445 String overflowBoundary = firstCharsInScripts.get(firstCharsInScripts.size() - 1); 446 447 // We make a sorted array of elements. 448 // Some of the input may be redundant. 449 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h". 450 // We filter out those cases. 451 for (String item : initialLabels) { 452 boolean checkDistinct; 453 if (!UTF16.hasMoreCodePointsThan(item, 1)) { 454 checkDistinct = false; 455 } else if(item.charAt(item.length() - 1) == '*' && 456 item.charAt(item.length() - 2) != '*') { 457 // Use a label if it is marked with one trailing star, 458 // even if the label string sorts the same when all contractions are suppressed. 459 item = item.substring(0, item.length() - 1); 460 checkDistinct = false; 461 } else { 462 checkDistinct = true; 463 } 464 if (collatorPrimaryOnly.compare(item, firstScriptBoundary) < 0) { 465 // Ignore a primary-ignorable or non-alphabetic index character. 466 } else if (collatorPrimaryOnly.compare(item, overflowBoundary) >= 0) { 467 // Ignore an index character that will land in the overflow bucket. 468 } else if (checkDistinct && collatorPrimaryOnly.compare(item, separated(item)) == 0) { 469 // Ignore a multi-code point index character that does not sort distinctly 470 // from the sequence of its separate characters. 471 } else { 472 int insertionPoint = Collections.binarySearch(indexCharacters, item, collatorPrimaryOnly); 473 if (insertionPoint < 0) { 474 indexCharacters.add(~insertionPoint, item); 475 } else { 476 String itemAlreadyIn = indexCharacters.get(insertionPoint); 477 if (isOneLabelBetterThanOther(nfkdNormalizer, item, itemAlreadyIn)) { 478 indexCharacters.set(insertionPoint, item); 479 } 480 } 481 } 482 } 483 484 // if the result is still too large, cut down to maxLabelCount elements, by removing every nth element 485 486 final int size = indexCharacters.size() - 1; 487 if (size > maxLabelCount) { 488 int count = 0; 489 int old = -1; 490 for (Iterator<String> it = indexCharacters.iterator(); it.hasNext();) { 491 ++count; 492 it.next(); 493 final int bump = count * maxLabelCount / size; 494 if (bump == old) { 495 it.remove(); 496 } else { 497 old = bump; 498 } 499 } 500 } 501 502 return indexCharacters; 503 } 504 fixLabel(String current)505 private static String fixLabel(String current) { 506 if (!current.startsWith(BASE)) { 507 return current; 508 } 509 int rest = current.charAt(BASE.length()); 510 if (0x2800 < rest && rest <= 0x28FF) { // stroke count 511 return (rest-0x2800) + "\u5283"; 512 } 513 return current.substring(BASE.length()); 514 } 515 516 /** 517 * This method is called to get the index exemplars. Normally these come from the locale directly, 518 * but if they aren't available, we have to synthesize them. 519 */ addIndexExemplars(ULocale locale)520 private void addIndexExemplars(ULocale locale) { 521 UnicodeSet exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_INDEX); 522 if (exemplars != null) { 523 initialLabels.addAll(exemplars); 524 return; 525 } 526 527 // The locale data did not include explicit Index characters. 528 // Synthesize a set of them from the locale's standard exemplar characters. 529 exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_STANDARD); 530 531 exemplars = exemplars.cloneAsThawed(); 532 // question: should we add auxiliary exemplars? 533 if (exemplars.containsSome('a', 'z') || exemplars.size() == 0) { 534 exemplars.addAll('a', 'z'); 535 } 536 if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables 537 // cut down to small list 538 exemplars.remove(0xAC00, 0xD7A3). 539 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C). 540 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544). 541 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0). 542 add(0xD30C).add(0xD558); 543 } 544 if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block 545 // cut down to small list 546 // make use of the fact that Ethiopic is allocated in 8's, where 547 // the base is 0 mod 8. 548 UnicodeSet ethiopic = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"); 549 UnicodeSetIterator it = new UnicodeSetIterator(ethiopic); 550 while (it.next() && it.codepoint != UnicodeSetIterator.IS_STRING) { 551 if ((it.codepoint & 0x7) != 0) { 552 exemplars.remove(it.codepoint); 553 } 554 } 555 } 556 557 // Upper-case any that aren't already so. 558 // (We only do this for synthesized index characters.) 559 for (String item : exemplars) { 560 initialLabels.add(UCharacter.toUpperCase(locale, item)); 561 } 562 } 563 564 /** 565 * Add Chinese index characters from the tailoring. 566 */ addChineseIndexCharacters()567 private boolean addChineseIndexCharacters() { 568 UnicodeSet contractions = new UnicodeSet(); 569 try { 570 collatorPrimaryOnly.internalAddContractions(BASE.charAt(0), contractions); 571 } catch (Exception e) { 572 return false; 573 } 574 if (contractions.isEmpty()) { return false; } 575 initialLabels.addAll(contractions); 576 for (String s : contractions) { 577 assert(s.startsWith(BASE)); 578 char c = s.charAt(s.length() - 1); 579 if (0x41 <= c && c <= 0x5A) { // A-Z 580 // There are Pinyin labels, add ASCII A-Z labels as well. 581 initialLabels.add(0x41, 0x5A); // A-Z 582 break; 583 } 584 } 585 return true; 586 } 587 588 /** 589 * Return the string with interspersed CGJs. Input must have more than 2 codepoints. 590 * <p>This is used to test whether contractions sort differently from their components. 591 */ separated(String item)592 private String separated(String item) { 593 StringBuilder result = new StringBuilder(); 594 // add a CGJ except within surrogates 595 char last = item.charAt(0); 596 result.append(last); 597 for (int i = 1; i < item.length(); ++i) { 598 char ch = item.charAt(i); 599 if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) { 600 result.append(CGJ); 601 } 602 result.append(ch); 603 last = ch; 604 } 605 return result.toString(); 606 } 607 608 /** 609 * Builds an immutable, thread-safe version of this instance, without data records. 610 * 611 * @return an immutable index instance 612 * @stable ICU 51 613 */ buildImmutableIndex()614 public ImmutableIndex<V> buildImmutableIndex() { 615 // The current AlphabeticIndex Java code never modifies the bucket list once built. 616 // If it contains no records, we can use it. 617 // addRecord() sets buckets=null rather than inserting the new record into it. 618 BucketList<V> immutableBucketList; 619 if (inputList != null && !inputList.isEmpty()) { 620 // We need a bucket list with no records. 621 immutableBucketList = createBucketList(); 622 } else { 623 if (buckets == null) { 624 buckets = createBucketList(); 625 } 626 immutableBucketList = buckets; 627 } 628 return new ImmutableIndex<V>(immutableBucketList, collatorPrimaryOnly); 629 } 630 631 /** 632 * Get the labels. 633 * 634 * @return The list of bucket labels, after processing. 635 * @stable ICU 4.8 636 */ getBucketLabels()637 public List<String> getBucketLabels() { 638 initBuckets(); 639 ArrayList<String> result = new ArrayList<String>(); 640 for (Bucket<V> bucket : buckets) { 641 result.add(bucket.getLabel()); 642 } 643 return result; 644 } 645 646 /** 647 * Get a clone of the collator used internally. Note that for performance reasons, the clone is only done once, and 648 * then stored. The next time it is accessed, the same instance is returned. 649 * <p> 650 * <b><i>Don't use this method across threads if you are changing the settings on the collator, at least not without 651 * synchronizing.</i></b> 652 * 653 * @return a clone of the collator used internally 654 * @stable ICU 4.8 655 */ getCollator()656 public RuleBasedCollator getCollator() { 657 if (collatorExternal == null) { 658 try { 659 collatorExternal = (RuleBasedCollator) (collatorOriginal.clone()); 660 } catch (Exception e) { 661 // should never happen 662 throw new IllegalStateException("Collator cannot be cloned", e); 663 } 664 } 665 return collatorExternal; 666 } 667 668 /** 669 * Add a record (name and data) to the index. The name will be used to sort the items into buckets, and to sort 670 * within the bucket. Two records may have the same name. When they do, the sort order is according to the order added: 671 * the first added comes first. 672 * 673 * @param name 674 * Name, such as a name 675 * @param data 676 * Data, such as an address or link 677 * @return this, for chaining 678 * @stable ICU 4.8 679 */ addRecord(CharSequence name, V data)680 public AlphabeticIndex<V> addRecord(CharSequence name, V data) { 681 // TODO instead of invalidating, just add to unprocessed list. 682 buckets = null; // invalidate old bucketlist 683 if (inputList == null) { 684 inputList = new ArrayList<Record<V>>(); 685 } 686 inputList.add(new Record<V>(name, data)); 687 return this; 688 } 689 690 /** 691 * Get the bucket number for the given name. This routine permits callers to implement their own bucket handling 692 * mechanisms, including client-server handling. For example, when a new name is created on the client, it can ask 693 * the server for the bucket for that name, and the sortkey (using getCollator). Once the client has that 694 * information, it can put the name into the right bucket, and sort it within that bucket, without having access to 695 * the index or collator. 696 * <p> 697 * Note that the bucket number (and sort key) are only valid for the settings of the current AlphabeticIndex; if 698 * those are changed, then the bucket number and sort key must be regenerated. 699 * 700 * @param name 701 * Name, such as a name 702 * @return the bucket index for the name 703 * @stable ICU 4.8 704 */ getBucketIndex(CharSequence name)705 public int getBucketIndex(CharSequence name) { 706 initBuckets(); 707 return buckets.getBucketIndex(name, collatorPrimaryOnly); 708 } 709 710 /** 711 * Clear the index. 712 * 713 * @return this, for chaining 714 * @stable ICU 4.8 715 */ clearRecords()716 public AlphabeticIndex<V> clearRecords() { 717 if (inputList != null && !inputList.isEmpty()) { 718 inputList.clear(); 719 buckets = null; 720 } 721 return this; 722 } 723 724 /** 725 * Return the number of buckets in the index. This will be the same as the number of labels, plus buckets for the underflow, overflow, and inflow(s). 726 * 727 * @return number of buckets 728 * @stable ICU 4.8 729 */ getBucketCount()730 public int getBucketCount() { 731 initBuckets(); 732 return buckets.getBucketCount(); 733 } 734 735 /** 736 * Return the number of records in the index: that is, the total number of distinct <name,data> pairs added with addRecord(...), over all the buckets. 737 * 738 * @return total number of records in buckets 739 * @stable ICU 4.8 740 */ getRecordCount()741 public int getRecordCount() { 742 return inputList != null ? inputList.size() : 0; 743 } 744 745 /** 746 * Return an iterator over the buckets. 747 * 748 * @return iterator over buckets. 749 * @stable ICU 4.8 750 */ iterator()751 public Iterator<Bucket<V>> iterator() { 752 initBuckets(); 753 return buckets.iterator(); 754 } 755 756 /** 757 * Creates an index, and buckets and sorts the list of records into the index. 758 */ initBuckets()759 private void initBuckets() { 760 if (buckets != null) { 761 return; 762 } 763 buckets = createBucketList(); 764 if (inputList == null || inputList.isEmpty()) { 765 return; 766 } 767 768 // Sort the records by name. 769 // Stable sort preserves input order of collation duplicates. 770 Collections.sort(inputList, recordComparator); 771 772 // Now, we traverse all of the input, which is now sorted. 773 // If the item doesn't go in the current bucket, we find the next bucket that contains it. 774 // This makes the process order n*log(n), since we just sort the list and then do a linear process. 775 // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so 776 // we need to improve it for that case. 777 778 Iterator<Bucket<V>> bucketIterator = buckets.fullIterator(); 779 Bucket<V> currentBucket = bucketIterator.next(); 780 Bucket<V> nextBucket; 781 String upperBoundary; 782 if (bucketIterator.hasNext()) { 783 nextBucket = bucketIterator.next(); 784 upperBoundary = nextBucket.lowerBoundary; 785 } else { 786 nextBucket = null; 787 upperBoundary = null; 788 } 789 for (Record<V> r : inputList) { 790 // if the current bucket isn't the right one, find the one that is 791 // We have a special flag for the last bucket so that we don't look any further 792 while (upperBoundary != null && 793 collatorPrimaryOnly.compare(r.name, upperBoundary) >= 0) { 794 currentBucket = nextBucket; 795 // now reset the boundary that we compare against 796 if (bucketIterator.hasNext()) { 797 nextBucket = bucketIterator.next(); 798 upperBoundary = nextBucket.lowerBoundary; 799 } else { 800 upperBoundary = null; 801 } 802 } 803 // now put the record into the bucket. 804 Bucket<V> bucket = currentBucket; 805 if (bucket.displayBucket != null) { 806 bucket = bucket.displayBucket; 807 } 808 if (bucket.records == null) { 809 bucket.records = new ArrayList<Record<V>>(); 810 } 811 bucket.records.add(r); 812 } 813 } 814 815 private int maxLabelCount = 99; 816 817 /** 818 * Returns true if one index character string is "better" than the other. 819 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is 820 * better, and otherwise binary-less-than is better. 821 */ isOneLabelBetterThanOther(Normalizer2 nfkdNormalizer, String one, String other)822 private static boolean isOneLabelBetterThanOther(Normalizer2 nfkdNormalizer, String one, String other) { 823 // This is called with primary-equal strings, but never with one.equals(other). 824 String n1 = nfkdNormalizer.normalize(one); 825 String n2 = nfkdNormalizer.normalize(other); 826 int result = n1.codePointCount(0, n1.length()) - n2.codePointCount(0, n2.length()); 827 if (result != 0) { 828 return result < 0; 829 } 830 result = binaryCmp.compare(n1, n2); 831 if (result != 0) { 832 return result < 0; 833 } 834 return binaryCmp.compare(one, other) < 0; 835 } 836 837 /** 838 * A (name, data) pair, to be sorted by name into one of the index buckets. 839 * The user data is not used by the index implementation. 840 * 841 * @stable ICU 4.8 842 */ 843 public static class Record<V> { 844 private final CharSequence name; 845 private final V data; 846 Record(CharSequence name, V data)847 private Record(CharSequence name, V data) { 848 this.name = name; 849 this.data = data; 850 } 851 852 /** 853 * Get the name 854 * 855 * @return the name 856 * @stable ICU 4.8 857 */ getName()858 public CharSequence getName() { 859 return name; 860 } 861 862 /** 863 * Get the data 864 * 865 * @return the data 866 * @stable ICU 4.8 867 */ getData()868 public V getData() { 869 return data; 870 } 871 872 /** 873 * Standard toString() 874 * @stable ICU 4.8 875 */ toString()876 public String toString() { 877 return name + "=" + data; 878 } 879 } 880 881 /** 882 * An index "bucket" with a label string and type. 883 * It is referenced by {@link AlphabeticIndex#getBucketIndex(CharSequence)} 884 * and {@link AlphabeticIndex.ImmutableIndex#getBucketIndex(CharSequence)}, 885 * returned by {@link AlphabeticIndex.ImmutableIndex#getBucket(int)}, 886 * and {@link AlphabeticIndex#addRecord(CharSequence, Object)} adds a record 887 * into a bucket according to the record's name. 888 * 889 * @param <V> 890 * Data type 891 * @stable ICU 4.8 892 */ 893 public static class Bucket<V> implements Iterable<Record<V>> { 894 private final String label; 895 private final String lowerBoundary; 896 private final LabelType labelType; 897 private Bucket<V> displayBucket; 898 private int displayIndex; 899 private List<Record<V>> records; 900 901 /** 902 * Type of the label 903 * 904 * @stable ICU 4.8 905 */ 906 public enum LabelType { 907 /** 908 * Normal 909 * @stable ICU 4.8 910 */ 911 NORMAL, 912 /** 913 * Underflow (before the first) 914 * @stable ICU 4.8 915 */ 916 UNDERFLOW, 917 /** 918 * Inflow (between scripts) 919 * @stable ICU 4.8 920 */ 921 INFLOW, 922 /** 923 * Overflow (after the last) 924 * @stable ICU 4.8 925 */ 926 OVERFLOW 927 } 928 929 /** 930 * Set up the bucket. 931 * 932 * @param label 933 * label for the bucket 934 * @param labelType 935 * is an underflow, overflow, or inflow bucket 936 * @stable ICU 4.8 937 */ Bucket(String label, String lowerBoundary, LabelType labelType)938 private Bucket(String label, String lowerBoundary, LabelType labelType) { 939 this.label = label; 940 this.lowerBoundary = lowerBoundary; 941 this.labelType = labelType; 942 } 943 944 /** 945 * Get the label 946 * 947 * @return label for the bucket 948 * @stable ICU 4.8 949 */ getLabel()950 public String getLabel() { 951 return label; 952 } 953 954 /** 955 * Is a normal, underflow, overflow, or inflow bucket 956 * 957 * @return is an underflow, overflow, or inflow bucket 958 * @stable ICU 4.8 959 */ getLabelType()960 public LabelType getLabelType() { 961 return labelType; 962 } 963 964 /** 965 * Get the number of records in the bucket. 966 * 967 * @return number of records in bucket 968 * @stable ICU 4.8 969 */ size()970 public int size() { 971 return records == null ? 0 : records.size(); 972 } 973 974 /** 975 * Iterator over the records in the bucket 976 * @stable ICU 4.8 977 */ iterator()978 public Iterator<Record<V>> iterator() { 979 if (records == null) { 980 return Collections.<Record<V>>emptyList().iterator(); 981 } 982 return records.iterator(); 983 } 984 985 /** 986 * Standard toString() 987 * @stable ICU 4.8 988 */ 989 @Override toString()990 public String toString() { 991 return "{" + 992 "labelType=" + labelType 993 + ", " + 994 "lowerBoundary=" + lowerBoundary 995 + ", " + 996 "label=" + label 997 + "}" 998 ; 999 } 1000 } 1001 createBucketList()1002 private BucketList<V> createBucketList() { 1003 // Initialize indexCharacters. 1004 List<String> indexCharacters = initLabels(); 1005 1006 // Variables for hasMultiplePrimaryWeights(). 1007 long variableTop; 1008 if (collatorPrimaryOnly.isAlternateHandlingShifted()) { 1009 variableTop = collatorPrimaryOnly.getVariableTop() & 0xffffffffL; 1010 } else { 1011 variableTop = 0; 1012 } 1013 boolean hasInvisibleBuckets = false; 1014 1015 // Helper arrays for Chinese Pinyin collation. 1016 @SuppressWarnings({ "rawtypes", "unchecked" }) 1017 Bucket<V>[] asciiBuckets = new Bucket[26]; 1018 @SuppressWarnings({ "rawtypes", "unchecked" }) 1019 Bucket<V>[] pinyinBuckets = new Bucket[26]; 1020 boolean hasPinyin = false; 1021 1022 ArrayList<Bucket<V>> bucketList = new ArrayList<Bucket<V>>(); 1023 1024 // underflow bucket 1025 bucketList.add(new Bucket<V>(getUnderflowLabel(), "", LabelType.UNDERFLOW)); 1026 1027 // fix up the list, adding underflow, additions, overflow 1028 // Insert inflow labels as needed. 1029 int scriptIndex = -1; 1030 String scriptUpperBoundary = ""; 1031 for (String current : indexCharacters) { 1032 if (collatorPrimaryOnly.compare(current, scriptUpperBoundary) >= 0) { 1033 // We crossed the script boundary into a new script. 1034 String inflowBoundary = scriptUpperBoundary; 1035 boolean skippedScript = false; 1036 for (;;) { 1037 scriptUpperBoundary = firstCharsInScripts.get(++scriptIndex); 1038 if (collatorPrimaryOnly.compare(current, scriptUpperBoundary) < 0) { 1039 break; 1040 } 1041 skippedScript = true; 1042 } 1043 if (skippedScript && bucketList.size() > 1) { 1044 // We are skipping one or more scripts, 1045 // and we are not just getting out of the underflow label. 1046 bucketList.add(new Bucket<V>(getInflowLabel(), inflowBoundary, 1047 LabelType.INFLOW)); 1048 } 1049 } 1050 // Add a bucket with the current label. 1051 Bucket<V> bucket = new Bucket<V>(fixLabel(current), current, LabelType.NORMAL); 1052 bucketList.add(bucket); 1053 // Remember ASCII and Pinyin buckets for Pinyin redirects. 1054 char c; 1055 if (current.length() == 1 && 'A' <= (c = current.charAt(0)) && c <= 'Z') { 1056 asciiBuckets[c - 'A'] = bucket; 1057 } else if (current.length() == BASE.length() + 1 && current.startsWith(BASE) && 1058 'A' <= (c = current.charAt(BASE.length())) && c <= 'Z') { 1059 pinyinBuckets[c - 'A'] = bucket; 1060 hasPinyin = true; 1061 } 1062 // Check for multiple primary weights. 1063 if (!current.startsWith(BASE) && 1064 hasMultiplePrimaryWeights(collatorPrimaryOnly, variableTop, current) && 1065 !current.endsWith("\uffff")) { 1066 // "Æ" or "Sch" etc. 1067 for (int i = bucketList.size() - 2;; --i) { 1068 Bucket<V> singleBucket = bucketList.get(i); 1069 if (singleBucket.labelType != LabelType.NORMAL) { 1070 // There is no single-character bucket since the last 1071 // underflow or inflow label. 1072 break; 1073 } 1074 if (singleBucket.displayBucket == null && 1075 !hasMultiplePrimaryWeights(collatorPrimaryOnly, variableTop, singleBucket.lowerBoundary)) { 1076 // Add an invisible bucket that redirects strings greater than the expansion 1077 // to the previous single-character bucket. 1078 // For example, after ... Q R S Sch we add Sch\uFFFF->S 1079 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S. 1080 bucket = new Bucket<V>("", current + "\uFFFF", LabelType.NORMAL); 1081 bucket.displayBucket = singleBucket; 1082 bucketList.add(bucket); 1083 hasInvisibleBuckets = true; 1084 break; 1085 } 1086 } 1087 } 1088 } 1089 if (bucketList.size() == 1) { 1090 // No real labels, show only the underflow label. 1091 return new BucketList<V>(bucketList, bucketList); 1092 } 1093 // overflow bucket 1094 bucketList.add(new Bucket<V>(getOverflowLabel(), scriptUpperBoundary, LabelType.OVERFLOW)); // final 1095 1096 if (hasPinyin) { 1097 // Redirect Pinyin buckets. 1098 Bucket<V> asciiBucket = null; 1099 for (int i = 0; i < 26; ++i) { 1100 if (asciiBuckets[i] != null) { 1101 asciiBucket = asciiBuckets[i]; 1102 } 1103 if (pinyinBuckets[i] != null && asciiBucket != null) { 1104 pinyinBuckets[i].displayBucket = asciiBucket; 1105 hasInvisibleBuckets = true; 1106 } 1107 } 1108 } 1109 1110 if (!hasInvisibleBuckets) { 1111 return new BucketList<V>(bucketList, bucketList); 1112 } 1113 // Merge inflow buckets that are visually adjacent. 1114 // Iterate backwards: Merge inflow into overflow rather than the other way around. 1115 int i = bucketList.size() - 1; 1116 Bucket<V> nextBucket = bucketList.get(i); 1117 while (--i > 0) { 1118 Bucket<V> bucket = bucketList.get(i); 1119 if (bucket.displayBucket != null) { 1120 continue; // skip invisible buckets 1121 } 1122 if (bucket.labelType == LabelType.INFLOW) { 1123 if (nextBucket.labelType != LabelType.NORMAL) { 1124 bucket.displayBucket = nextBucket; 1125 continue; 1126 } 1127 } 1128 nextBucket = bucket; 1129 } 1130 1131 ArrayList<Bucket<V>> publicBucketList = new ArrayList<Bucket<V>>(); 1132 for (Bucket<V> bucket : bucketList) { 1133 if (bucket.displayBucket == null) { 1134 publicBucketList.add(bucket); 1135 } 1136 } 1137 return new BucketList<V>(bucketList, publicBucketList); 1138 } 1139 1140 private static class BucketList<V> implements Iterable<Bucket<V>> { 1141 private final ArrayList<Bucket<V>> bucketList; 1142 private final List<Bucket<V>> immutableVisibleList; 1143 BucketList(ArrayList<Bucket<V>> bucketList, ArrayList<Bucket<V>> publicBucketList)1144 private BucketList(ArrayList<Bucket<V>> bucketList, ArrayList<Bucket<V>> publicBucketList) { 1145 this.bucketList = bucketList; 1146 1147 int displayIndex = 0; 1148 for (Bucket<V> bucket : publicBucketList) { 1149 bucket.displayIndex = displayIndex++; 1150 } 1151 immutableVisibleList = Collections.unmodifiableList(publicBucketList); 1152 } 1153 getBucketCount()1154 private int getBucketCount() { 1155 return immutableVisibleList.size(); 1156 } 1157 getBucketIndex(CharSequence name, Collator collatorPrimaryOnly)1158 private int getBucketIndex(CharSequence name, Collator collatorPrimaryOnly) { 1159 // binary search 1160 int start = 0; 1161 int limit = bucketList.size(); 1162 while ((start + 1) < limit) { 1163 int i = (start + limit) / 2; 1164 Bucket<V> bucket = bucketList.get(i); 1165 int nameVsBucket = collatorPrimaryOnly.compare(name, bucket.lowerBoundary); 1166 if (nameVsBucket < 0) { 1167 limit = i; 1168 } else { 1169 start = i; 1170 } 1171 } 1172 Bucket<V> bucket = bucketList.get(start); 1173 if (bucket.displayBucket != null) { 1174 bucket = bucket.displayBucket; 1175 } 1176 return bucket.displayIndex; 1177 } 1178 1179 /** 1180 * Private iterator over all the buckets, visible and invisible 1181 */ fullIterator()1182 private Iterator<Bucket<V>> fullIterator() { 1183 return bucketList.iterator(); 1184 } 1185 1186 /** 1187 * Iterator over just the visible buckets. 1188 */ iterator()1189 public Iterator<Bucket<V>> iterator() { 1190 return immutableVisibleList.iterator(); // use immutable list to prevent remove(). 1191 } 1192 } 1193 hasMultiplePrimaryWeights( RuleBasedCollator coll, long variableTop, String s)1194 private static boolean hasMultiplePrimaryWeights( 1195 RuleBasedCollator coll, long variableTop, String s) { 1196 long[] ces = coll.internalGetCEs(s); 1197 boolean seenPrimary = false; 1198 for (int i = 0; i < ces.length; ++i) { 1199 long ce = ces[i]; 1200 long p = ce >>> 32; 1201 if (p > variableTop) { 1202 // not primary ignorable 1203 if (seenPrimary) { 1204 return true; 1205 } 1206 seenPrimary = true; 1207 } 1208 } 1209 return false; 1210 } 1211 1212 // TODO: Surely we have at least a ticket for porting these mask values to UCharacter.java?! 1213 private static final int GC_LU_MASK = 1 << UCharacter.UPPERCASE_LETTER; 1214 private static final int GC_LL_MASK = 1 << UCharacter.LOWERCASE_LETTER; 1215 private static final int GC_LT_MASK = 1 << UCharacter.TITLECASE_LETTER; 1216 private static final int GC_LM_MASK = 1 << UCharacter.MODIFIER_LETTER; 1217 private static final int GC_LO_MASK = 1 << UCharacter.OTHER_LETTER; 1218 private static final int GC_L_MASK = 1219 GC_LU_MASK|GC_LL_MASK|GC_LT_MASK|GC_LM_MASK|GC_LO_MASK; 1220 private static final int GC_CN_MASK = 1 << UCharacter.GENERAL_OTHER_TYPES; 1221 1222 /** 1223 * Return a list of the first character in each script. Only exposed for testing. 1224 * 1225 * @return list of first characters in each script 1226 * @internal 1227 * @deprecated This API is ICU internal, only for testing. 1228 */ 1229 @Deprecated getFirstCharactersInScripts()1230 public List<String> getFirstCharactersInScripts() { 1231 List<String> dest = new ArrayList<String>(200); 1232 // Fetch the script-first-primary contractions which are defined in the root collator. 1233 // They all start with U+FDD1. 1234 UnicodeSet set = new UnicodeSet(); 1235 collatorPrimaryOnly.internalAddContractions(0xFDD1, set); 1236 if (set.isEmpty()) { 1237 throw new UnsupportedOperationException( 1238 "AlphabeticIndex requires script-first-primary contractions"); 1239 } 1240 for (String boundary : set) { 1241 int gcMask = 1 << UCharacter.getType(boundary.codePointAt(1)); 1242 if ((gcMask & (GC_L_MASK | GC_CN_MASK)) == 0) { 1243 // Ignore boundaries for the special reordering groups. 1244 // Take only those for "real scripts" (where the sample character is a Letter, 1245 // and the one for unassigned implicit weights (Cn). 1246 continue; 1247 } 1248 dest.add(boundary); 1249 } 1250 return dest; 1251 } 1252 } 1253