1 /* 2 ******************************************************************************* 3 * Copyright (C) 1996-2014, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 ******************************************************************************* 6 */ 7 package com.ibm.icu.dev.util; 8 9 import java.util.ArrayList; 10 import java.util.Collection; 11 import java.util.Collections; 12 import java.util.Comparator; 13 import java.util.HashSet; 14 import java.util.Iterator; 15 import java.util.LinkedHashSet; 16 import java.util.Map; 17 import java.util.Map.Entry; 18 import java.util.Set; 19 import java.util.TreeMap; 20 import java.util.TreeSet; 21 22 import com.ibm.icu.impl.Utility; 23 import com.ibm.icu.text.StringTransform; 24 import com.ibm.icu.text.UTF16; 25 import com.ibm.icu.text.UnicodeSet; 26 import com.ibm.icu.text.UnicodeSetIterator; 27 import com.ibm.icu.util.Freezable; 28 29 /** 30 * Class for mapping Unicode characters and strings to values, optimized for single code points, 31 * where ranges of code points have the same value. 32 * Much smaller storage than using HashMap, and much faster and more compact than 33 * a list of UnicodeSets. The API design mimics Map<String,T> but can't extend it due to some 34 * necessary changes (much as UnicodeSet mimics Set<String>). Note that nulls are not permitted as values; 35 * that is, a put(x,null) is the same as remove(x).<br> 36 * At this point "" is also not allowed as a key, although that may change. 37 * @author markdavis 38 */ 39 40 public final class UnicodeMap<T> implements Cloneable, Freezable, StringTransform, Iterable<String> { 41 /** 42 * For serialization 43 */ 44 //private static final long serialVersionUID = -6540936876295804105L; 45 static final boolean ASSERTIONS = false; 46 static final long GROWTH_PERCENT = 200; // 100 is no growth! 47 static final long GROWTH_GAP = 10; // extra bump! 48 49 private int length; 50 // two parallel arrays to save memory. Wish Java had structs. 51 private int[] transitions; 52 /* package private */ T[] values; 53 54 private LinkedHashSet<T> availableValues = new LinkedHashSet<T>(); 55 private transient boolean staleAvailableValues; 56 57 private transient boolean errorOnReset; 58 private volatile transient boolean locked; 59 private int lastIndex; 60 private Map<String,T> stringMap; 61 clear()62 { clear(); } 63 clear()64 public UnicodeMap<T> clear() { 65 if (locked) { 66 throw new UnsupportedOperationException("Attempt to modify locked object"); 67 } 68 length = 2; 69 transitions = new int[] {0,0x110000,0,0,0,0,0,0,0,0}; 70 values = (T[]) new Object[10]; 71 72 availableValues.clear(); 73 staleAvailableValues = false; 74 75 errorOnReset = false; 76 lastIndex = 0; 77 stringMap = null; 78 return this; 79 } 80 81 /* Boilerplate */ equals(Object other)82 public boolean equals(Object other) { 83 if (other == null) return false; 84 try { 85 UnicodeMap that = (UnicodeMap) other; 86 if (length != that.length) return false; 87 for (int i = 0; i < length-1; ++i) { 88 if (transitions[i] != that.transitions[i]) return false; 89 if (!areEqual(values[i], that.values[i])) return false; 90 } 91 return true; 92 } catch (ClassCastException e) { 93 return false; 94 } 95 } 96 areEqual(Object a , Object b)97 public static boolean areEqual(Object a , Object b) { 98 if (a == b) return true; 99 if (a == null || b == null) return false; 100 return a.equals(b); 101 } 102 hashCode()103 public int hashCode() { 104 int result = length; 105 // TODO might want to abbreviate this for speed. 106 for (int i = 0; i < length-1; ++i) { 107 result = 37*result + transitions[i]; 108 result = 37*result + values[i].hashCode(); 109 } 110 return result; 111 } 112 113 /** 114 * Standard clone. Warning, as with Collections, does not do deep clone. 115 */ cloneAsThawed()116 public UnicodeMap<T> cloneAsThawed() { 117 UnicodeMap<T> that = new UnicodeMap<T>(); 118 that.length = length; 119 that.transitions = (int[]) transitions.clone(); 120 that.values = (T[]) values.clone(); 121 that.availableValues = new LinkedHashSet<T>(availableValues); 122 that.locked = false; 123 return that; 124 } 125 126 /* for internal consistency checking */ 127 _checkInvariants()128 void _checkInvariants() { 129 if (length < 2 130 || length > transitions.length 131 || transitions.length != values.length) { 132 throw new IllegalArgumentException("Invariant failed: Lengths bad"); 133 } 134 for (int i = 1; i < length-1; ++i) { 135 if (areEqual(values[i-1], values[i])) { 136 throw new IllegalArgumentException("Invariant failed: values shared at " 137 + "\t" + Utility.hex(i-1) + ": <" + values[i-1] + ">" 138 + "\t" + Utility.hex(i) + ": <" + values[i] + ">" 139 ); 140 } 141 } 142 if (transitions[0] != 0 || transitions[length-1] != 0x110000) { 143 throw new IllegalArgumentException("Invariant failed: bounds set wrong"); 144 } 145 for (int i = 1; i < length-1; ++i) { 146 if (transitions[i-1] >= transitions[i]) { 147 throw new IllegalArgumentException("Invariant failed: not monotonic" 148 + "\t" + Utility.hex(i-1) + ": " + transitions[i-1] 149 + "\t" + Utility.hex(i) + ": " + transitions[i] 150 ); 151 } 152 } 153 } 154 155 /** 156 * Finds an index such that inversionList[i] <= codepoint < inversionList[i+1] 157 * Assumes that 0 <= codepoint <= 0x10FFFF 158 * @param codepoint 159 * @return the index 160 */ _findIndex(int c)161 private int _findIndex(int c) { 162 int lo = 0; 163 int hi = length - 1; 164 int i = (lo + hi) >>> 1; 165 // invariant: c >= list[lo] 166 // invariant: c < list[hi] 167 while (i != lo) { 168 if (c < transitions[i]) { 169 hi = i; 170 } else { 171 lo = i; 172 } 173 i = (lo + hi) >>> 1; 174 } 175 if (ASSERTIONS) _checkFind(c, lo); 176 return lo; 177 } 178 _checkFind(int codepoint, int value)179 private void _checkFind(int codepoint, int value) { 180 int other = __findIndex(codepoint); 181 if (other != value) { 182 throw new IllegalArgumentException("Invariant failed: binary search" 183 + "\t" + Utility.hex(codepoint) + ": " + value 184 + "\tshould be: " + other); 185 } 186 } 187 __findIndex(int codepoint)188 private int __findIndex(int codepoint) { 189 for (int i = length-1; i > 0; --i) { 190 if (transitions[i] <= codepoint) return i; 191 } 192 return 0; 193 } 194 195 /* 196 * Try indexed lookup 197 198 static final int SHIFT = 8; 199 int[] starts = new int[0x10FFFF>>SHIFT]; // lowest transition index where codepoint>>x can be found 200 boolean startsValid = false; 201 private int findIndex(int codepoint) { 202 if (!startsValid) { 203 int start = 0; 204 for (int i = 1; i < length; ++i) { 205 206 } 207 } 208 for (int i = length-1; i > 0; --i) { 209 if (transitions[i] <= codepoint) return i; 210 } 211 return 0; 212 } 213 */ 214 215 /** 216 * Remove the items from index through index+count-1. 217 * Logically reduces the size of the internal arrays. 218 * @param index 219 * @param count 220 */ _removeAt(int index, int count)221 private void _removeAt(int index, int count) { 222 for (int i = index + count; i < length; ++i) { 223 transitions[i-count] = transitions[i]; 224 values[i-count] = values[i]; 225 } 226 length -= count; 227 } 228 /** 229 * Add a gap from index to index+count-1. 230 * The values there are undefined, and must be set. 231 * Logically grows arrays to accomodate. Actual growth is limited 232 * @param index 233 * @param count 234 */ _insertGapAt(int index, int count)235 private void _insertGapAt(int index, int count) { 236 int newLength = length + count; 237 int[] oldtransitions = transitions; 238 T[] oldvalues = values; 239 if (newLength > transitions.length) { 240 int allocation = (int) (GROWTH_GAP + (newLength * GROWTH_PERCENT) / 100); 241 transitions = new int[allocation]; 242 values = (T[]) new Object[allocation]; 243 for (int i = 0; i < index; ++i) { 244 transitions[i] = oldtransitions[i]; 245 values[i] = oldvalues[i]; 246 } 247 } 248 for (int i = length - 1; i >= index; --i) { 249 transitions[i+count] = oldtransitions[i]; 250 values[i+count] = oldvalues[i]; 251 } 252 length = newLength; 253 } 254 255 /** 256 * Associates code point with value. Removes any previous association. 257 * All code that calls this MUST check for frozen first! 258 * @param codepoint 259 * @param value 260 * @return this, for chaining 261 */ _put(int codepoint, T value)262 private UnicodeMap _put(int codepoint, T value) { 263 // Warning: baseIndex is an invariant; must 264 // be defined such that transitions[baseIndex] < codepoint 265 // at end of this routine. 266 int baseIndex; 267 if (transitions[lastIndex] <= codepoint 268 && codepoint < transitions[lastIndex+1]) { 269 baseIndex = lastIndex; 270 } else { 271 baseIndex = _findIndex(codepoint); 272 } 273 int limitIndex = baseIndex + 1; 274 // cases are (a) value is already set 275 if (areEqual(values[baseIndex], value)) return this; 276 if (locked) { 277 throw new UnsupportedOperationException("Attempt to modify locked object"); 278 } 279 if (errorOnReset && values[baseIndex] != null) { 280 throw new UnsupportedOperationException("Attempt to reset value for " + Utility.hex(codepoint) 281 + " when that is disallowed. Old: " + values[baseIndex] + "; New: " + value); 282 } 283 284 // adjust the available values 285 staleAvailableValues = true; 286 availableValues.add(value); // add if not there already 287 288 int baseCP = transitions[baseIndex]; 289 int limitCP = transitions[limitIndex]; 290 // we now start walking through the difference case, 291 // based on whether we are at the start or end of range 292 // and whether the range is a single character or multiple 293 294 if (baseCP == codepoint) { 295 // CASE: At very start of range 296 boolean connectsWithPrevious = 297 baseIndex != 0 && areEqual(value, values[baseIndex-1]); 298 299 if (limitCP == codepoint + 1) { 300 // CASE: Single codepoint range 301 boolean connectsWithFollowing = 302 baseIndex < length - 2 && areEqual(value, values[limitIndex]); // was -1 303 304 if (connectsWithPrevious) { 305 // A1a connects with previous & following, so remove index 306 if (connectsWithFollowing) { 307 _removeAt(baseIndex, 2); 308 } else { 309 _removeAt(baseIndex, 1); // extend previous 310 } 311 --baseIndex; // fix up 312 } else if (connectsWithFollowing) { 313 _removeAt(baseIndex, 1); // extend following backwards 314 transitions[baseIndex] = codepoint; 315 } else { 316 // doesn't connect on either side, just reset 317 values[baseIndex] = value; 318 } 319 } else if (connectsWithPrevious) { 320 // A.1: start of multi codepoint range 321 // if connects 322 ++transitions[baseIndex]; // extend previous 323 } else { 324 // otherwise insert new transition 325 transitions[baseIndex] = codepoint+1; // fix following range 326 _insertGapAt(baseIndex, 1); 327 values[baseIndex] = value; 328 transitions[baseIndex] = codepoint; 329 } 330 } else if (limitCP == codepoint + 1) { 331 // CASE: at end of range 332 // if connects, just back up range 333 boolean connectsWithFollowing = 334 baseIndex < length - 2 && areEqual(value, values[limitIndex]); // was -1 335 336 if (connectsWithFollowing) { 337 --transitions[limitIndex]; 338 return this; 339 } else { 340 _insertGapAt(limitIndex, 1); 341 transitions[limitIndex] = codepoint; 342 values[limitIndex] = value; 343 } 344 } else { 345 // CASE: in middle of range 346 // insert gap, then set the new range 347 _insertGapAt(++baseIndex,2); 348 transitions[baseIndex] = codepoint; 349 values[baseIndex] = value; 350 transitions[baseIndex+1] = codepoint + 1; 351 values[baseIndex+1] = values[baseIndex-1]; // copy lower range values 352 } 353 lastIndex = baseIndex; // store for next time 354 return this; 355 } 356 357 private UnicodeMap _putAll(int startCodePoint, int endCodePoint, T value) { 358 // TODO optimize 359 for (int i = startCodePoint; i <= endCodePoint; ++i) { 360 _put(i, value); 361 if (ASSERTIONS) _checkInvariants(); 362 } 363 return this; 364 } 365 366 /** 367 * Sets the codepoint value. 368 * @param codepoint 369 * @param value 370 * @return this (for chaining) 371 */ 372 public UnicodeMap<T> put(int codepoint, T value) { 373 if (codepoint < 0 || codepoint > 0x10FFFF) { 374 throw new IllegalArgumentException("Codepoint out of range: " + codepoint); 375 } 376 _put(codepoint, value); 377 if (ASSERTIONS) _checkInvariants(); 378 return this; 379 } 380 381 /** 382 * Sets the codepoint value. 383 * @param codepoint 384 * @param value 385 * @return this (for chaining) 386 */ 387 public UnicodeMap<T> put(String string, T value) { 388 int v = UnicodeSet.getSingleCodePoint(string); 389 if (v == Integer.MAX_VALUE) { 390 if (locked) { 391 throw new UnsupportedOperationException("Attempt to modify locked object"); 392 } 393 if (value != null) { 394 if (stringMap == null) { 395 stringMap = new TreeMap<String,T>(); 396 } 397 stringMap.put(string, value); 398 staleAvailableValues = true; 399 } else if (stringMap != null) { 400 if (stringMap.remove(string) != null) { 401 staleAvailableValues = true; 402 } 403 } 404 return this; 405 } 406 return put(v, value); 407 } 408 409 /** 410 * Adds bunch o' codepoints; otherwise like put. 411 * @param codepoints 412 * @param value 413 * @return this (for chaining) 414 */ 415 public UnicodeMap<T> putAll(UnicodeSet codepoints, T value) { 416 UnicodeSetIterator it = new UnicodeSetIterator(codepoints); 417 while (it.nextRange()) { 418 if (it.string == null) { 419 _putAll(it.codepoint, it.codepointEnd, value); 420 } else { 421 put(it.string, value); 422 } 423 } 424 return this; 425 } 426 427 /** 428 * Adds bunch o' codepoints; otherwise like add. 429 * @param startCodePoint 430 * @param endCodePoint 431 * @param value 432 * @return this (for chaining) 433 */ 434 public UnicodeMap<T> putAll(int startCodePoint, int endCodePoint, T value) { 435 if (locked) { 436 throw new UnsupportedOperationException("Attempt to modify locked object"); 437 } 438 if (startCodePoint < 0 || endCodePoint > 0x10FFFF) { 439 throw new IllegalArgumentException("Codepoint out of range: " 440 + Utility.hex(startCodePoint) + ".." + Utility.hex(endCodePoint)); 441 } 442 return _putAll(startCodePoint, endCodePoint, value); 443 } 444 445 /** 446 * Add all the (main) values from a UnicodeMap 447 * @param unicodeMap the property to add to the map 448 * @return this (for chaining) 449 */ 450 public UnicodeMap<T> putAll(UnicodeMap<T> unicodeMap) { 451 for (int i = 0; i < unicodeMap.length; ++i) { 452 T value = unicodeMap.values[i]; 453 if (value != null) { 454 _putAll(unicodeMap.transitions[i], unicodeMap.transitions[i+1]-1, value); 455 } 456 if (ASSERTIONS) _checkInvariants(); 457 } 458 if (unicodeMap.stringMap != null && !unicodeMap.stringMap.isEmpty()) { 459 if (stringMap == null) { 460 stringMap = new TreeMap<String,T>(); 461 } 462 stringMap.putAll(unicodeMap.stringMap); 463 } 464 return this; 465 } 466 467 /** 468 * Add all the (main) values from a Unicode property 469 * @param prop the property to add to the map 470 * @return this (for chaining) 471 */ 472 public UnicodeMap<T> putAllFiltered(UnicodeMap<T> prop, UnicodeSet filter) { 473 // TODO optimize 474 for (UnicodeSetIterator it = new UnicodeSetIterator(filter); it.next();) { 475 if (it.codepoint != UnicodeSetIterator.IS_STRING) { 476 T value = prop.getValue(it.codepoint); 477 if (value != null) { 478 _put(it.codepoint, value); 479 } 480 } 481 } 482 // now do the strings 483 for (String key : filter.strings()) { 484 T value = prop.get(key); 485 if (value != null) { 486 put(key, value); 487 } 488 } 489 return this; 490 } 491 492 /** 493 * Set the currently unmapped Unicode code points to the given value. 494 * @param value the value to set 495 * @return this (for chaining) 496 */ 497 public UnicodeMap<T> setMissing(T value) { 498 // fast path, if value not yet present 499 if (!getAvailableValues().contains(value)) { 500 staleAvailableValues = true; 501 availableValues.add(value); 502 for (int i = 0; i < length; ++i) { 503 if (values[i] == null) values[i] = value; 504 } 505 return this; 506 } else { 507 return putAll(keySet(null), value); 508 } 509 } 510 /** 511 * Returns the keyset consisting of all the keys that would produce the given value. Deposits into 512 * result if it is not null. Remember to clear if you just want 513 * the new values. 514 */ 515 public UnicodeSet keySet(T value, UnicodeSet result) { 516 if (result == null) result = new UnicodeSet(); 517 for (int i = 0; i < length - 1; ++i) { 518 if (areEqual(value, values[i])) { 519 result.add(transitions[i], transitions[i+1]-1); 520 } 521 } 522 if (value != null && stringMap != null) { 523 for (String key : stringMap.keySet()) { 524 T newValue = stringMap.get(key); 525 if (value.equals(newValue)) { 526 result.add((String)key); 527 } 528 } 529 } 530 return result; 531 } 532 533 /** 534 * Returns the keyset consisting of all the keys that would produce the given value. 535 * the new values. 536 */ 537 public UnicodeSet keySet(T value) { 538 return keySet(value,null); 539 } 540 541 /** 542 * Returns the keyset consisting of all the keys that would produce (non-null) values. 543 */ 544 public UnicodeSet keySet() { 545 UnicodeSet result = new UnicodeSet(); 546 for (int i = 0; i < length - 1; ++i) { 547 if (values[i] != null) { 548 result.add(transitions[i], transitions[i+1]-1); 549 } 550 } 551 if (stringMap != null) { 552 result.addAll(stringMap.keySet()); 553 } 554 return result; 555 } 556 557 /** 558 * Returns the list of possible values. Deposits each non-null value into 559 * result. Creates result if it is null. Remember to clear result if 560 * you are not appending to existing collection. 561 * @param result 562 * @return result 563 */ 564 public <U extends Collection<T>> U values(U result) { 565 if (staleAvailableValues) { 566 // collect all the current values 567 // retain them in the availableValues 568 Set<T> temp = new HashSet<T>(); 569 for (int i = 0; i < length - 1; ++i) { 570 if (values[i] != null) temp.add(values[i]); 571 } 572 availableValues.retainAll(temp); 573 if (stringMap != null) { 574 availableValues.addAll(stringMap.values()); 575 } 576 staleAvailableValues = false; 577 } 578 if (result == null) { 579 result = (U) new ArrayList<T>(availableValues.size()); 580 } 581 result.addAll(availableValues); 582 return result; 583 } 584 585 /** 586 * Convenience method 587 */ 588 public Collection<T> values() { 589 return getAvailableValues(null); 590 } 591 /** 592 * Gets the value associated with a given code point. 593 * Returns null, if there is no such value. 594 * @param codepoint 595 * @return the value 596 */ 597 public T get(int codepoint) { 598 if (codepoint < 0 || codepoint > 0x10FFFF) { 599 throw new IllegalArgumentException("Codepoint out of range: " + codepoint); 600 } 601 return values[_findIndex(codepoint)]; 602 } 603 604 /** 605 * Gets the value associated with a given code point. 606 * Returns null, if there is no such value. 607 * @param codepoint 608 * @return the value 609 */ 610 public T get(String value) { 611 if (UTF16.hasMoreCodePointsThan(value, 1)) { 612 if (stringMap == null) { 613 return null; 614 } 615 return stringMap.get(value); 616 } 617 return getValue(UTF16.charAt(value, 0)); 618 } 619 620 621 /** 622 * Change a new string from the source string according to the mappings. 623 * For each code point cp, if getValue(cp) is null, append the character, otherwise append getValue(cp).toString() 624 * TODO: extend to strings 625 * @param source 626 * @return 627 */ 628 public String transform(String source) { 629 StringBuffer result = new StringBuffer(); 630 int cp; 631 for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) { 632 cp = UTF16.charAt(source, i); 633 T mResult = getValue(cp); 634 if (mResult != null) { 635 result.append(mResult); 636 } else { 637 UTF16.append(result, cp); 638 } 639 } 640 return result.toString(); 641 } 642 643 /** 644 * Used to add complex values, where the value isn't replaced but in some sense composed 645 * @author markdavis 646 */ 647 public abstract static class Composer<T> { 648 /** 649 * This will be called with either a string or a code point. The result is the new value for that item. 650 * If the codepoint is used, the string is null; if the string is used, the codepoint is -1. 651 * @param a 652 * @param b 653 */ 654 public abstract T compose(int codePoint, String string, T a, T b); 655 } 656 657 public UnicodeMap<T> composeWith(UnicodeMap<T> other, Composer<T> composer) { 658 for (T value : other.getAvailableValues()) { 659 UnicodeSet set = other.keySet(value); 660 composeWith(set, value, composer); 661 } 662 return this; 663 } 664 665 public UnicodeMap<T> composeWith(UnicodeSet set, T value, Composer<T> composer) { 666 for (UnicodeSetIterator it = new UnicodeSetIterator(set); it.next();) { 667 int i = it.codepoint; 668 if (i == UnicodeSetIterator.IS_STRING) { 669 String s = it.string; 670 T v1 = getValue(s); 671 T v3 = composer.compose(-1, s, v1, value); 672 if (v1 != v3 && (v1 == null || !v1.equals(v3))) { 673 put(s, v3); 674 } 675 } else { 676 T v1 = getValue(i); 677 T v3 = composer.compose(i, null, v1, value); 678 if (v1 != v3 && (v1 == null || !v1.equals(v3))) { 679 put(i, v3); 680 } 681 } 682 } 683 return this; 684 } 685 686 public String toString() { 687 return toString(null); 688 } 689 690 public String toString(Comparator<T> collected) { 691 StringBuffer result = new StringBuffer(); 692 if (collected == null) { 693 for (int i = 0; i < length-1; ++i) { 694 T value = values[i]; 695 if (value == null) continue; 696 int start = transitions[i]; 697 int end = transitions[i+1]-1; 698 result.append(Utility.hex(start)); 699 if (start != end) result.append("-").append(Utility.hex(end)); 700 result.append("=").append(value.toString()).append("\n"); 701 } 702 if (stringMap != null) { 703 for (String s : stringMap.keySet()) { 704 result.append(Utility.hex(s)).append("=").append(stringMap.get(s).toString()).append("\n"); 705 } 706 } 707 } else { 708 Set<T> set = values(new TreeSet<T>(collected)); 709 for (Iterator<T> it = set.iterator(); it.hasNext();) { 710 T value = it.next(); 711 UnicodeSet s = keySet(value); 712 result.append(value).append("=").append(s.toString()).append("\n"); 713 } 714 } 715 return result.toString(); 716 } 717 /** 718 * @return Returns the errorOnReset value. 719 */ 720 public boolean getErrorOnReset() { 721 return errorOnReset; 722 } 723 /** 724 * Puts the UnicodeMap into a state whereby new mappings are accepted, but changes to old mappings cause an exception. 725 * @param errorOnReset The errorOnReset to set. 726 */ 727 public UnicodeMap<T> setErrorOnReset(boolean errorOnReset) { 728 this.errorOnReset = errorOnReset; 729 return this; 730 } 731 732 /* (non-Javadoc) 733 * @see com.ibm.icu.dev.test.util.Freezable#isFrozen() 734 */ 735 public boolean isFrozen() { 736 // TODO Auto-generated method stub 737 return locked; 738 } 739 740 /* (non-Javadoc) 741 * @see com.ibm.icu.dev.test.util.Freezable#lock() 742 */ 743 public UnicodeMap<T> freeze() { 744 locked = true; 745 return this; 746 } 747 748 /** 749 * Utility to find the maximal common prefix of two strings. 750 * TODO: fix supplemental support 751 */ 752 static public int findCommonPrefix(String last, String s) { 753 int minLen = Math.min(last.length(), s.length()); 754 for (int i = 0; i < minLen; ++i) { 755 if (last.charAt(i) != s.charAt(i)) return i; 756 } 757 return minLen; 758 } 759 760 /** 761 * Get the number of ranges; used for getRangeStart/End. The ranges together cover all of the single-codepoint keys in the UnicodeMap. Other keys can be gotten with getStrings(). 762 */ 763 public int getRangeCount() { 764 return length-1; 765 } 766 767 /** 768 * Get the start of a range. All code points between start and end are in the UnicodeMap's keyset. 769 */ 770 public int getRangeStart(int range) { 771 return transitions[range]; 772 } 773 774 /** 775 * Get the start of a range. All code points between start and end are in the UnicodeMap's keyset. 776 */ 777 public int getRangeEnd(int range) { 778 return transitions[range+1] - 1; 779 } 780 781 /** 782 * Get the value for the range. 783 */ 784 public T getRangeValue(int range) { 785 return values[range]; 786 } 787 788 /** 789 * Get the strings that are not in the ranges. Returns null if there are none. 790 * @return 791 */ 792 public Set<String> getNonRangeStrings() { 793 if (stringMap == null || stringMap.isEmpty()) { 794 return null; 795 } 796 return Collections.unmodifiableSet(stringMap.keySet()); 797 } 798 799 static final boolean DEBUG_WRITE = false; 800 801 /* (non-Javadoc) 802 * @see java.util.Map#containsKey(java.lang.Object) 803 */ 804 public boolean containsKey(String key) { 805 return getValue(key) != null; 806 } 807 808 /* (non-Javadoc) 809 * @see java.util.Map#containsKey(java.lang.Object) 810 */ 811 public boolean containsKey(int key) { 812 return getValue(key) != null; 813 } 814 815 /* (non-Javadoc) 816 * @see java.util.Map#containsValue(java.lang.Object) 817 */ 818 public boolean containsValue(T value) { 819 // TODO Optimize 820 return getAvailableValues().contains(value); 821 } 822 823 /* (non-Javadoc) 824 * @see java.util.Map#isEmpty() 825 */ 826 public boolean isEmpty() { 827 return size() == 0; 828 } 829 830 /* (non-Javadoc) 831 * @see java.util.Map#putAll(java.util.Map) 832 */ 833 public UnicodeMap<T> putAll(Map<? extends String, ? extends T> map) { 834 for (String key : map.keySet()) { 835 put(key,map.get(key)); 836 } 837 return this; 838 } 839 840 /** 841 * Utility for extracting map 842 */ 843 public UnicodeMap<T> putAllIn(Map<? super String, ? super T> map) { 844 for (String key : keySet()) { 845 map.put(key, get(key)); 846 } 847 return this; 848 } 849 850 /* (non-Javadoc) 851 * @see java.util.Map#remove(java.lang.Object) 852 */ 853 public UnicodeMap<T> remove(String key) { 854 return put(key, null); 855 } 856 857 /* (non-Javadoc) 858 * @see java.util.Map#remove(java.lang.Object) 859 */ 860 public UnicodeMap<T> remove(int key) { 861 return put(key, null); 862 } 863 864 /* (non-Javadoc) 865 * @see java.util.Map#size() 866 */ 867 public int size() { 868 int result = stringMap == null ? 0 : stringMap.size(); 869 for (int i = 0; i < length-1; ++i) { 870 T value = values[i]; 871 if (value == null) continue; 872 result += transitions[i+1] - transitions[i]; 873 } 874 return result; 875 } 876 877 /* (non-Javadoc) 878 * @see java.util.Map#entrySet() 879 */ 880 public Iterable<Entry<String,T>> entrySet() { 881 return new EntrySetX(); 882 } 883 884 private class EntrySetX implements Iterable<Entry<String, T>> { 885 public Iterator<Entry<String, T>> iterator() { 886 return new IteratorX(); 887 } 888 public String toString() { 889 StringBuffer b = new StringBuffer(); 890 for (Iterator it = iterator(); it.hasNext();) { 891 Object item = it.next(); 892 b.append(item.toString()).append(' '); 893 } 894 return b.toString(); 895 } 896 } 897 898 private class IteratorX implements Iterator<Entry<String, T>> { 899 Iterator<String> iterator = keySet().iterator(); 900 901 /* (non-Javadoc) 902 * @see java.util.Iterator#hasNext() 903 */ 904 public boolean hasNext() { 905 return iterator.hasNext(); 906 } 907 908 /* (non-Javadoc) 909 * @see java.util.Iterator#next() 910 */ 911 public Entry<String, T> next() { 912 String key = iterator.next(); 913 return new ImmutableEntry(key, get(key)); 914 } 915 916 /* (non-Javadoc) 917 * @see java.util.Iterator#remove() 918 */ 919 public void remove() { 920 throw new UnsupportedOperationException(); 921 } 922 923 } 924 925 /** 926 * Struct-like class used to iterate over a UnicodeMap in a for loop. 927 * If the value is a string, then codepoint == codepointEnd == -1. Otherwise the string is null; 928 * Caution: The contents may change during the iteration! 929 */ 930 public static class EntryRange<T> { 931 public int codepoint; 932 public int codepointEnd; 933 public String string; 934 public T value; 935 @Override 936 public String toString() { 937 return (string != null ? Utility.hex(string) 938 : Utility.hex(codepoint) + (codepoint == codepointEnd ? "" : ".." + Utility.hex(codepointEnd))) 939 + "=" + value; 940 } 941 } 942 943 /** 944 * Returns an Iterable over EntryRange, designed for efficient for loops over UnicodeMaps. 945 * Caution: For efficiency, the EntryRange may be reused, so the EntryRange may change on each iteration! 946 * The value is guaranteed never to be null. 947 * @return entry range, for for loops 948 */ 949 public Iterable<EntryRange<T>> entryRanges() { 950 return new EntryRanges(); 951 } 952 953 private class EntryRanges implements Iterable<EntryRange<T>>, Iterator<EntryRange<T>> { 954 private int pos; 955 private EntryRange<T> result = new EntryRange<T>(); 956 private int lastRealRange = values[length-2] == null ? length - 2 : length - 1; 957 private Iterator<Entry<String, T>> stringIterator = stringMap == null ? null : stringMap.entrySet().iterator(); 958 959 public Iterator<EntryRange<T>> iterator() { 960 return this; 961 } 962 public boolean hasNext() { 963 return pos < lastRealRange || (stringIterator != null && stringIterator.hasNext()); 964 } 965 public EntryRange<T> next() { 966 // a range may be null, but then the next one must not be (except the final range) 967 if (pos < lastRealRange) { 968 T temp = values[pos]; 969 if (temp == null) { 970 temp = values[++pos]; 971 } 972 result.codepoint = transitions[pos]; 973 result.codepointEnd = transitions[pos+1]-1; 974 result.string = null; 975 result.value = temp; 976 ++pos; 977 } else { 978 Entry<String, T> entry = stringIterator.next(); 979 result.codepoint = result.codepointEnd = -1; 980 result.string = entry.getKey(); 981 result.value = entry.getValue(); 982 } 983 return result; 984 } 985 public void remove() { 986 throw new UnsupportedOperationException(); 987 } 988 } 989 990 /* (non-Javadoc) 991 * @see java.lang.Iterable#iterator() 992 */ 993 public Iterator<String> iterator() { 994 return keySet().iterator(); 995 } 996 997 /** 998 * Old form for compatibility 999 */ 1000 public T getValue(String key) { 1001 return get(key); 1002 } 1003 1004 /** 1005 * Old form for compatibility 1006 */ 1007 public T getValue(int key) { 1008 // TODO Auto-generated method stub 1009 return get(key); 1010 } 1011 1012 /** 1013 * Old form for compatibility 1014 */ 1015 public Collection<T> getAvailableValues() { 1016 return values(); 1017 } 1018 1019 /** 1020 * Old form for compatibility 1021 */ 1022 public <U extends Collection<T>> U getAvailableValues(U result) { 1023 return values(result); 1024 } 1025 1026 /** 1027 * Old form for compatibility 1028 */ 1029 public UnicodeSet getSet(T value) { 1030 return keySet(value); 1031 } 1032 1033 /** 1034 * Old form for compatibility 1035 */ 1036 public UnicodeSet getSet(T value, UnicodeSet result) { 1037 return keySet(value, result); 1038 } 1039 1040 // This is to support compressed serialization. It works; just commented out for now as we shift to Generics 1041 // TODO Fix once generics are cleaned up. 1042 // // TODO Fix to serialize more than just strings. 1043 // // Only if all the items are strings will we do the following compression 1044 // // Otherwise we'll just use Java Serialization, bulky as it is 1045 // public void writeExternal(ObjectOutput out1) throws IOException { 1046 // DataOutputCompressor sc = new DataOutputCompressor(out1); 1047 // // if all objects are strings 1048 // Collection<T> availableVals = getAvailableValues(); 1049 // boolean allStrings = allAreString(availableVals); 1050 // sc.writeBoolean(allStrings); 1051 // Map object_index = new LinkedHashMap(); 1052 // if (allAreString(availableVals)) { 1053 // sc.writeStringSet(new TreeSet(availableVals), object_index); 1054 // } else { 1055 // sc.writeCollection(availableVals, object_index); 1056 // } 1057 // sc.writeUInt(length); 1058 // int lastTransition = -1; 1059 // int lastValueNumber = 0; 1060 // if (DEBUG_WRITE) System.out.println("Trans count: " + length); 1061 // for (int i = 0; i < length; ++i) { 1062 // int valueNumber = ((Integer)object_index.get(values[i])).intValue(); 1063 // if (DEBUG_WRITE) System.out.println("Trans: " + transitions[i] + ",\t" + valueNumber); 1064 // 1065 // int deltaTransition = transitions[i] - lastTransition; 1066 // lastTransition = transitions[i]; 1067 // int deltaValueNumber = valueNumber - lastValueNumber; 1068 // lastValueNumber = valueNumber; 1069 // 1070 // deltaValueNumber <<= 1; // make room for one bit 1071 // boolean canCombine = deltaTransition == 1; 1072 // if (canCombine) deltaValueNumber |= 1; 1073 // sc.writeInt(deltaValueNumber); 1074 // if (DEBUG_WRITE) System.out.println("deltaValueNumber: " + deltaValueNumber); 1075 // if (!canCombine) { 1076 // sc.writeUInt(deltaTransition); 1077 // if (DEBUG_WRITE) System.out.println("deltaTransition: " + deltaTransition); 1078 // } 1079 // } 1080 // sc.flush(); 1081 // } 1082 // 1083 // /** 1084 // * 1085 // */ 1086 // private boolean allAreString(Collection<T> availableValues2) { 1087 // //if (true) return false; 1088 // for (Iterator<T> it = availableValues2.iterator(); it.hasNext();) { 1089 // if (!(it.next() instanceof String)) return false; 1090 // } 1091 // return true; 1092 // } 1093 // 1094 // public void readExternal(ObjectInput in1) throws IOException, ClassNotFoundException { 1095 // DataInputCompressor sc = new DataInputCompressor(in1); 1096 // boolean allStrings = sc.readBoolean(); 1097 // T[] valuesList; 1098 // availableValues = new LinkedHashSet(); 1099 // if (allStrings) { 1100 // valuesList = sc.readStringSet(availableValues); 1101 // } else { 1102 // valuesList = sc.readCollection(availableValues); 1103 // } 1104 // length = sc.readUInt(); 1105 // transitions = new int[length]; 1106 // if (DEBUG_WRITE) System.out.println("Trans count: " + length); 1107 // values = (T[]) new Object[length]; 1108 // int currentTransition = -1; 1109 // int currentValue = 0; 1110 // int deltaTransition; 1111 // for (int i = 0; i < length; ++i) { 1112 // int temp = sc.readInt(); 1113 // if (DEBUG_WRITE) System.out.println("deltaValueNumber: " + temp); 1114 // boolean combined = (temp & 1) != 0; 1115 // temp >>= 1; 1116 // values[i] = valuesList[currentValue += temp]; 1117 // if (!combined) { 1118 // deltaTransition = sc.readUInt(); 1119 // if (DEBUG_WRITE) System.out.println("deltaTransition: " + deltaTransition); 1120 // } else { 1121 // deltaTransition = 1; 1122 // } 1123 // transitions[i] = currentTransition += deltaTransition; // delta value 1124 // if (DEBUG_WRITE) System.out.println("Trans: " + transitions[i] + ",\t" + currentValue); 1125 // } 1126 // } 1127 1128 public final UnicodeMap<T> removeAll(UnicodeSet set) { 1129 return putAll(set, null); 1130 } 1131 1132 public final UnicodeMap<T> removeAll(UnicodeMap<T> reference) { 1133 return removeRetainAll(reference, true); 1134 } 1135 1136 public final UnicodeMap<T> retainAll(UnicodeSet set) { 1137 UnicodeSet toNuke = new UnicodeSet(); 1138 // TODO Optimize 1139 for (EntryRange<T> ae : entryRanges()) { 1140 if (ae.string != null) { 1141 if (!set.contains(ae.string)) { 1142 toNuke.add(ae.string); 1143 } 1144 } else { 1145 for (int i = ae.codepoint; i <= ae.codepointEnd; ++i) { 1146 if (!set.contains(i)) { 1147 toNuke.add(i); 1148 } 1149 } 1150 } 1151 } 1152 return putAll(toNuke, null); 1153 } 1154 1155 public final UnicodeMap<T> retainAll(UnicodeMap<T> reference) { 1156 return removeRetainAll(reference, false); 1157 } 1158 1159 private final UnicodeMap<T> removeRetainAll(UnicodeMap<T> reference, boolean remove) { 1160 UnicodeSet toNuke = new UnicodeSet(); 1161 // TODO Optimize 1162 for (EntryRange<T> ae : entryRanges()) { 1163 if (ae.string != null) { 1164 if (ae.value.equals(reference.get(ae.string)) == remove) { 1165 toNuke.add(ae.string); 1166 } 1167 } else { 1168 for (int i = ae.codepoint; i <= ae.codepointEnd; ++i) { 1169 if (ae.value.equals(reference.get(i)) == remove) { 1170 toNuke.add(i); 1171 } 1172 } 1173 } 1174 } 1175 return putAll(toNuke, null); 1176 } 1177 1178 /** 1179 * Returns the keys that consist of multiple code points. 1180 * @return 1181 */ 1182 public Set<String> stringKeys() { 1183 return stringMap.keySet(); 1184 } 1185 } 1186