1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2009-2015, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 package com.ibm.icu.impl; 11 12 import java.io.DataOutputStream; 13 import java.io.IOException; 14 import java.io.InputStream; 15 import java.nio.ByteBuffer; 16 import java.nio.ByteOrder; 17 import java.util.Iterator; 18 import java.util.NoSuchElementException; 19 20 21 /** 22 * This is the interface and common implementation of a Unicode Trie2. 23 * It is a kind of compressed table that maps from Unicode code points (0..0x10ffff) 24 * to 16- or 32-bit integer values. It works best when there are ranges of 25 * characters with the same value, which is generally the case with Unicode 26 * character properties. 27 * 28 * This is the second common version of a Unicode trie (hence the name Trie2). 29 * 30 */ 31 public abstract class Trie2 implements Iterable<Trie2.Range> { 32 33 34 /** 35 * Create a Trie2 from its serialized form. Inverse of utrie2_serialize(). 36 * 37 * Reads from the current position and leaves the buffer after the end of the trie. 38 * 39 * The serialized format is identical between ICU4C and ICU4J, so this function 40 * will work with serialized Trie2s from either. 41 * 42 * The actual type of the returned Trie2 will be either Trie2_16 or Trie2_32, depending 43 * on the width of the data. 44 * 45 * To obtain the width of the Trie2, check the actual class type of the returned Trie2. 46 * Or use the createFromSerialized() function of Trie2_16 or Trie2_32, which will 47 * return only Tries of their specific type/size. 48 * 49 * The serialized Trie2 on the stream may be in either little or big endian byte order. 50 * This allows using serialized Tries from ICU4C without needing to consider the 51 * byte order of the system that created them. 52 * 53 * @param bytes a byte buffer to the serialized form of a UTrie2. 54 * @return An unserialized Trie2, ready for use. 55 * @throws IllegalArgumentException if the stream does not contain a serialized Trie2. 56 * @throws IOException if a read error occurs in the buffer. 57 * 58 */ createFromSerialized(ByteBuffer bytes)59 public static Trie2 createFromSerialized(ByteBuffer bytes) throws IOException { 60 // From ICU4C utrie2_impl.h 61 // * Trie2 data structure in serialized form: 62 // * 63 // * UTrie2Header header; 64 // * uint16_t index[header.index2Length]; 65 // * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...] 66 // * @internal 67 // */ 68 // typedef struct UTrie2Header { 69 // /** "Tri2" in big-endian US-ASCII (0x54726932) */ 70 // uint32_t signature; 71 72 // /** 73 // * options bit field: 74 // * 15.. 4 reserved (0) 75 // * 3.. 0 UTrie2ValueBits valueBits 76 // */ 77 // uint16_t options; 78 // 79 // /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH */ 80 // uint16_t indexLength; 81 // 82 // /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT */ 83 // uint16_t shiftedDataLength; 84 // 85 // /** Null index and data blocks, not shifted. */ 86 // uint16_t index2NullOffset, dataNullOffset; 87 // 88 // /** 89 // * First code point of the single-value range ending with U+10ffff, 90 // * rounded up and then shifted right by UTRIE2_SHIFT_1. 91 // */ 92 // uint16_t shiftedHighStart; 93 // } UTrie2Header; 94 95 ByteOrder outerByteOrder = bytes.order(); 96 try { 97 UTrie2Header header = new UTrie2Header(); 98 99 /* check the signature */ 100 header.signature = bytes.getInt(); 101 switch (header.signature) { 102 case 0x54726932: 103 // The buffer is already set to the trie data byte order. 104 break; 105 case 0x32697254: 106 // Temporarily reverse the byte order. 107 boolean isBigEndian = outerByteOrder == ByteOrder.BIG_ENDIAN; 108 bytes.order(isBigEndian ? ByteOrder.LITTLE_ENDIAN : ByteOrder.BIG_ENDIAN); 109 header.signature = 0x54726932; 110 break; 111 default: 112 throw new IllegalArgumentException("Buffer does not contain a serialized UTrie2"); 113 } 114 115 header.options = bytes.getChar(); 116 header.indexLength = bytes.getChar(); 117 header.shiftedDataLength = bytes.getChar(); 118 header.index2NullOffset = bytes.getChar(); 119 header.dataNullOffset = bytes.getChar(); 120 header.shiftedHighStart = bytes.getChar(); 121 122 // Trie2 data width - 0: 16 bits 123 // 1: 32 bits 124 if ((header.options & UTRIE2_OPTIONS_VALUE_BITS_MASK) > 1) { 125 throw new IllegalArgumentException("UTrie2 serialized format error."); 126 } 127 ValueWidth width; 128 Trie2 This; 129 if ((header.options & UTRIE2_OPTIONS_VALUE_BITS_MASK) == 0) { 130 width = ValueWidth.BITS_16; 131 This = new Trie2_16(); 132 } else { 133 width = ValueWidth.BITS_32; 134 This = new Trie2_32(); 135 } 136 This.header = header; 137 138 /* get the length values and offsets */ 139 This.indexLength = header.indexLength; 140 This.dataLength = header.shiftedDataLength << UTRIE2_INDEX_SHIFT; 141 This.index2NullOffset = header.index2NullOffset; 142 This.dataNullOffset = header.dataNullOffset; 143 This.highStart = header.shiftedHighStart << UTRIE2_SHIFT_1; 144 This.highValueIndex = This.dataLength - UTRIE2_DATA_GRANULARITY; 145 if (width == ValueWidth.BITS_16) { 146 This.highValueIndex += This.indexLength; 147 } 148 149 // Allocate the Trie2 index array. If the data width is 16 bits, the array also 150 // includes the space for the data. 151 152 int indexArraySize = This.indexLength; 153 if (width == ValueWidth.BITS_16) { 154 indexArraySize += This.dataLength; 155 } 156 157 /* Read in the index */ 158 This.index = ICUBinary.getChars(bytes, indexArraySize, 0); 159 160 /* Read in the data. 16 bit data goes in the same array as the index. 161 * 32 bit data goes in its own separate data array. 162 */ 163 if (width == ValueWidth.BITS_16) { 164 This.data16 = This.indexLength; 165 } else { 166 This.data32 = ICUBinary.getInts(bytes, This.dataLength, 0); 167 } 168 169 switch(width) { 170 case BITS_16: 171 This.data32 = null; 172 This.initialValue = This.index[This.dataNullOffset]; 173 This.errorValue = This.index[This.data16+UTRIE2_BAD_UTF8_DATA_OFFSET]; 174 break; 175 case BITS_32: 176 This.data16=0; 177 This.initialValue = This.data32[This.dataNullOffset]; 178 This.errorValue = This.data32[UTRIE2_BAD_UTF8_DATA_OFFSET]; 179 break; 180 default: 181 throw new IllegalArgumentException("UTrie2 serialized format error."); 182 } 183 184 return This; 185 } finally { 186 bytes.order(outerByteOrder); 187 } 188 } 189 190 /** 191 * Get the UTrie version from an InputStream containing the serialized form 192 * of either a Trie (version 1) or a Trie2 (version 2). 193 * 194 * @param is an InputStream containing the serialized form 195 * of a UTrie, version 1 or 2. The stream must support mark() and reset(). 196 * The position of the input stream will be left unchanged. 197 * @param littleEndianOk If FALSE, only big-endian (Java native) serialized forms are recognized. 198 * If TRUE, little-endian serialized forms are recognized as well. 199 * @return the Trie version of the serialized form, or 0 if it is not 200 * recognized as a serialized UTrie 201 * @throws IOException on errors in reading from the input stream. 202 */ getVersion(InputStream is, boolean littleEndianOk)203 public static int getVersion(InputStream is, boolean littleEndianOk) throws IOException { 204 if (! is.markSupported()) { 205 throw new IllegalArgumentException("Input stream must support mark()."); 206 } 207 is.mark(4); 208 byte sig[] = new byte[4]; 209 int read = is.read(sig); 210 is.reset(); 211 212 if (read != sig.length) { 213 return 0; 214 } 215 216 if (sig[0]=='T' && sig[1]=='r' && sig[2]=='i' && sig[3]=='e') { 217 return 1; 218 } 219 if (sig[0]=='T' && sig[1]=='r' && sig[2]=='i' && sig[3]=='2') { 220 return 2; 221 } 222 if (littleEndianOk) { 223 if (sig[0]=='e' && sig[1]=='i' && sig[2]=='r' && sig[3]=='T') { 224 return 1; 225 } 226 if (sig[0]=='2' && sig[1]=='i' && sig[2]=='r' && sig[3]=='T') { 227 return 2; 228 } 229 } 230 return 0; 231 } 232 233 /** 234 * Get the value for a code point as stored in the Trie2. 235 * 236 * @param codePoint the code point 237 * @return the value 238 */ get(int codePoint)239 abstract public int get(int codePoint); 240 241 242 /** 243 * Get the trie value for a UTF-16 code unit. 244 * 245 * A Trie2 stores two distinct values for input in the lead surrogate 246 * range, one for lead surrogates, which is the value that will be 247 * returned by this function, and a second value that is returned 248 * by Trie2.get(). 249 * 250 * For code units outside of the lead surrogate range, this function 251 * returns the same result as Trie2.get(). 252 * 253 * This function, together with the alternate value for lead surrogates, 254 * makes possible very efficient processing of UTF-16 strings without 255 * first converting surrogate pairs to their corresponding 32 bit code point 256 * values. 257 * 258 * At build-time, enumerate the contents of the Trie2 to see if there 259 * is non-trivial (non-initialValue) data for any of the supplementary 260 * code points associated with a lead surrogate. 261 * If so, then set a special (application-specific) value for the 262 * lead surrogate code _unit_, with Trie2Writable.setForLeadSurrogateCodeUnit(). 263 * 264 * At runtime, use Trie2.getFromU16SingleLead(). If there is non-trivial 265 * data and the code unit is a lead surrogate, then check if a trail surrogate 266 * follows. If so, assemble the supplementary code point and look up its value 267 * with Trie2.get(); otherwise reset the lead 268 * surrogate's value or do a code point lookup for it. 269 * 270 * If there is only trivial data for lead and trail surrogates, then processing 271 * can often skip them. For example, in normalization or case mapping 272 * all characters that do not have any mappings are simply copied as is. 273 * 274 * @param c the code point or lead surrogate value. 275 * @return the value 276 */ getFromU16SingleLead(char c)277 abstract public int getFromU16SingleLead(char c); 278 279 280 /** 281 * Equals function. Two Tries are equal if their contents are equal. 282 * The type need not be the same, so a Trie2Writable will be equal to 283 * (read-only) Trie2_16 or Trie2_32 so long as they are storing the same values. 284 * 285 */ 286 @Override equals(Object other)287 public final boolean equals(Object other) { 288 if(!(other instanceof Trie2)) { 289 return false; 290 } 291 Trie2 OtherTrie = (Trie2)other; 292 Range rangeFromOther; 293 294 Iterator<Trie2.Range> otherIter = OtherTrie.iterator(); 295 for (Trie2.Range rangeFromThis: this) { 296 if (otherIter.hasNext() == false) { 297 return false; 298 } 299 rangeFromOther = otherIter.next(); 300 if (!rangeFromThis.equals(rangeFromOther)) { 301 return false; 302 } 303 } 304 if (otherIter.hasNext()) { 305 return false; 306 } 307 308 if (errorValue != OtherTrie.errorValue || 309 initialValue != OtherTrie.initialValue) { 310 return false; 311 } 312 313 return true; 314 } 315 316 317 @Override hashCode()318 public int hashCode() { 319 if (fHash == 0) { 320 int hash = initHash(); 321 for (Range r: this) { 322 hash = hashInt(hash, r.hashCode()); 323 } 324 if (hash == 0) { 325 hash = 1; 326 } 327 fHash = hash; 328 } 329 return fHash; 330 } 331 332 /** 333 * When iterating over the contents of a Trie2, Elements of this type are produced. 334 * The iterator will return one item for each contiguous range of codepoints having the same value. 335 * 336 * When iterating, the same Trie2EnumRange object will be reused and returned for each range. 337 * If you need to retain complete iteration results, clone each returned Trie2EnumRange, 338 * or save the range in some other way, before advancing to the next iteration step. 339 */ 340 public static class Range { 341 public int startCodePoint; 342 public int endCodePoint; // Inclusive. 343 public int value; 344 public boolean leadSurrogate; 345 346 @Override equals(Object other)347 public boolean equals(Object other) { 348 if (other == null || !(other.getClass().equals(getClass()))) { 349 return false; 350 } 351 Range tother = (Range)other; 352 return this.startCodePoint == tother.startCodePoint && 353 this.endCodePoint == tother.endCodePoint && 354 this.value == tother.value && 355 this.leadSurrogate == tother.leadSurrogate; 356 } 357 358 359 @Override hashCode()360 public int hashCode() { 361 int h = initHash(); 362 h = hashUChar32(h, startCodePoint); 363 h = hashUChar32(h, endCodePoint); 364 h = hashInt(h, value); 365 h = hashByte(h, leadSurrogate? 1: 0); 366 return h; 367 } 368 } 369 370 371 /** 372 * Create an iterator over the value ranges in this Trie2. 373 * Values from the Trie2 are not remapped or filtered, but are returned as they 374 * are stored in the Trie2. 375 * 376 * @return an Iterator 377 */ 378 @Override iterator()379 public Iterator<Range> iterator() { 380 return iterator(defaultValueMapper); 381 } 382 383 private static ValueMapper defaultValueMapper = new ValueMapper() { 384 @Override 385 public int map(int in) { 386 return in; 387 } 388 }; 389 390 /** 391 * Create an iterator over the value ranges from this Trie2. 392 * Values from the Trie2 are passed through a caller-supplied remapping function, 393 * and it is the remapped values that determine the ranges that 394 * will be produced by the iterator. 395 * 396 * 397 * @param mapper provides a function to remap values obtained from the Trie2. 398 * @return an Iterator 399 */ iterator(ValueMapper mapper)400 public Iterator<Range> iterator(ValueMapper mapper) { 401 return new Trie2Iterator(mapper); 402 } 403 404 405 /** 406 * Create an iterator over the Trie2 values for the 1024=0x400 code points 407 * corresponding to a given lead surrogate. 408 * For example, for the lead surrogate U+D87E it will enumerate the values 409 * for [U+2F800..U+2FC00[. 410 * Used by data builder code that sets special lead surrogate code unit values 411 * for optimized UTF-16 string processing. 412 * 413 * Do not modify the Trie2 during the iteration. 414 * 415 * Except for the limited code point range, this functions just like Trie2.iterator(). 416 * 417 */ iteratorForLeadSurrogate(char lead, ValueMapper mapper)418 public Iterator<Range> iteratorForLeadSurrogate(char lead, ValueMapper mapper) { 419 return new Trie2Iterator(lead, mapper); 420 } 421 422 /** 423 * Create an iterator over the Trie2 values for the 1024=0x400 code points 424 * corresponding to a given lead surrogate. 425 * For example, for the lead surrogate U+D87E it will enumerate the values 426 * for [U+2F800..U+2FC00[. 427 * Used by data builder code that sets special lead surrogate code unit values 428 * for optimized UTF-16 string processing. 429 * 430 * Do not modify the Trie2 during the iteration. 431 * 432 * Except for the limited code point range, this functions just like Trie2.iterator(). 433 * 434 */ iteratorForLeadSurrogate(char lead)435 public Iterator<Range> iteratorForLeadSurrogate(char lead) { 436 return new Trie2Iterator(lead, defaultValueMapper); 437 } 438 439 /** 440 * When iterating over the contents of a Trie2, an instance of TrieValueMapper may 441 * be used to remap the values from the Trie2. The remapped values will be used 442 * both in determining the ranges of codepoints and as the value to be returned 443 * for each range. 444 * 445 * Example of use, with an anonymous subclass of TrieValueMapper: 446 * 447 * 448 * ValueMapper m = new ValueMapper() { 449 * int map(int in) {return in & 0x1f;}; 450 * } 451 * for (Iterator<Trie2EnumRange> iter = trie.iterator(m); i.hasNext(); ) { 452 * Trie2EnumRange r = i.next(); 453 * ... // Do something with the range r. 454 * } 455 * 456 */ 457 public interface ValueMapper { map(int originalVal)458 public int map(int originalVal); 459 } 460 461 462 /** 463 * Serialize a trie2 Header and Index onto an OutputStream. This is 464 * common code used for both the Trie2_16 and Trie2_32 serialize functions. 465 * @param dos the stream to which the serialized Trie2 data will be written. 466 * @return the number of bytes written. 467 */ serializeHeader(DataOutputStream dos)468 protected int serializeHeader(DataOutputStream dos) throws IOException { 469 // Write the header. It is already set and ready to use, having been 470 // created when the Trie2 was unserialized or when it was frozen. 471 int bytesWritten = 0; 472 473 dos.writeInt(header.signature); 474 dos.writeShort(header.options); 475 dos.writeShort(header.indexLength); 476 dos.writeShort(header.shiftedDataLength); 477 dos.writeShort(header.index2NullOffset); 478 dos.writeShort(header.dataNullOffset); 479 dos.writeShort(header.shiftedHighStart); 480 bytesWritten += 16; 481 482 // Write the index 483 int i; 484 for (i=0; i< header.indexLength; i++) { 485 dos.writeChar(index[i]); 486 } 487 bytesWritten += header.indexLength; 488 return bytesWritten; 489 } 490 491 492 /** 493 * Struct-like class for holding the results returned by a UTrie2 CharSequence iterator. 494 * The iteration walks over a CharSequence, and for each Unicode code point therein 495 * returns the character and its associated Trie2 value. 496 */ 497 public static class CharSequenceValues { 498 /** string index of the current code point. */ 499 public int index; 500 /** The code point at index. */ 501 public int codePoint; 502 /** The Trie2 value for the current code point */ 503 public int value; 504 } 505 506 507 /** 508 * Create an iterator that will produce the values from the Trie2 for 509 * the sequence of code points in an input text. 510 * 511 * @param text A text string to be iterated over. 512 * @param index The starting iteration position within the input text. 513 * @return the CharSequenceIterator 514 */ charSequenceIterator(CharSequence text, int index)515 public CharSequenceIterator charSequenceIterator(CharSequence text, int index) { 516 return new CharSequenceIterator(text, index); 517 } 518 519 // TODO: Survey usage of the equivalent of CharSequenceIterator in ICU4C 520 // and if there is none, remove it from here. 521 // Don't waste time testing and maintaining unused code. 522 523 /** 524 * An iterator that operates over an input CharSequence, and for each Unicode code point 525 * in the input returns the associated value from the Trie2. 526 * 527 * The iterator can move forwards or backwards, and can be reset to an arbitrary index. 528 * 529 * Note that Trie2_16 and Trie2_32 subclass Trie2.CharSequenceIterator. This is done 530 * only for performance reasons. It does require that any changes made here be propagated 531 * into the corresponding code in the subclasses. 532 */ 533 public class CharSequenceIterator implements Iterator<CharSequenceValues> { 534 /** 535 * Internal constructor. 536 */ CharSequenceIterator(CharSequence t, int index)537 CharSequenceIterator(CharSequence t, int index) { 538 text = t; 539 textLength = text.length(); 540 set(index); 541 } 542 543 private CharSequence text; 544 private int textLength; 545 private int index; 546 private Trie2.CharSequenceValues fResults = new Trie2.CharSequenceValues(); 547 548 set(int i)549 public void set(int i) { 550 if (i < 0 || i > textLength) { 551 throw new IndexOutOfBoundsException(); 552 } 553 index = i; 554 } 555 556 557 @Override hasNext()558 public final boolean hasNext() { 559 return index<textLength; 560 } 561 562 hasPrevious()563 public final boolean hasPrevious() { 564 return index>0; 565 } 566 567 568 @Override next()569 public Trie2.CharSequenceValues next() { 570 int c = Character.codePointAt(text, index); 571 int val = get(c); 572 573 fResults.index = index; 574 fResults.codePoint = c; 575 fResults.value = val; 576 index++; 577 if (c >= 0x10000) { 578 index++; 579 } 580 return fResults; 581 } 582 583 previous()584 public Trie2.CharSequenceValues previous() { 585 int c = Character.codePointBefore(text, index); 586 int val = get(c); 587 index--; 588 if (c >= 0x10000) { 589 index--; 590 } 591 fResults.index = index; 592 fResults.codePoint = c; 593 fResults.value = val; 594 return fResults; 595 } 596 597 /** 598 * Iterator.remove() is not supported by Trie2.CharSequenceIterator. 599 * @throws UnsupportedOperationException Always thrown because this operation is not supported 600 * @see java.util.Iterator#remove() 601 */ 602 @Override remove()603 public void remove() { 604 throw new UnsupportedOperationException("Trie2.CharSequenceIterator does not support remove()."); 605 } 606 } 607 608 609 //-------------------------------------------------------------------------------- 610 // 611 // Below this point are internal implementation items. No further public API. 612 // 613 //-------------------------------------------------------------------------------- 614 615 616 /** 617 * Selectors for the width of a UTrie2 data value. 618 */ 619 enum ValueWidth { 620 BITS_16, 621 BITS_32 622 } 623 624 /** 625 * Trie2 data structure in serialized form: 626 * 627 * UTrie2Header header; 628 * uint16_t index[header.index2Length]; 629 * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...] 630 * 631 * For Java, this is read from the stream into an instance of UTrie2Header. 632 * (The C version just places a struct over the raw serialized data.) 633 * 634 * @internal 635 */ 636 static class UTrie2Header { 637 /** "Tri2" in big-endian US-ASCII (0x54726932) */ 638 int signature; 639 640 /** 641 * options bit field (uint16_t): 642 * 15.. 4 reserved (0) 643 * 3.. 0 UTrie2ValueBits valueBits 644 */ 645 int options; 646 647 /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH (uint16_t) */ 648 int indexLength; 649 650 /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT (uint16_t) */ 651 int shiftedDataLength; 652 653 /** Null index and data blocks, not shifted. (uint16_t) */ 654 int index2NullOffset, dataNullOffset; 655 656 /** 657 * First code point of the single-value range ending with U+10ffff, 658 * rounded up and then shifted right by UTRIE2_SHIFT_1. (uint16_t) 659 */ 660 int shiftedHighStart; 661 } 662 663 // 664 // Data members of UTrie2. 665 // 666 UTrie2Header header; 667 char index[]; // Index array. Includes data for 16 bit Tries. 668 int data16; // Offset to data portion of the index array, if 16 bit data. 669 // zero if 32 bit data. 670 int data32[]; // NULL if 16b data is used via index 671 672 int indexLength; 673 int dataLength; 674 int index2NullOffset; // 0xffff if there is no dedicated index-2 null block 675 int initialValue; 676 677 /** Value returned for out-of-range code points and illegal UTF-8. */ 678 int errorValue; 679 680 /* Start of the last range which ends at U+10ffff, and its value. */ 681 int highStart; 682 int highValueIndex; 683 684 int dataNullOffset; 685 686 int fHash; // Zero if not yet computed. 687 // Shared by Trie2Writable, Trie2_16, Trie2_32. 688 // Thread safety: if two racing threads compute 689 // the same hash on a frozen Trie2, no damage is done. 690 691 692 /** 693 * Trie2 constants, defining shift widths, index array lengths, etc. 694 * 695 * These are needed for the runtime macros but users can treat these as 696 * implementation details and skip to the actual public API further below. 697 */ 698 699 static final int UTRIE2_OPTIONS_VALUE_BITS_MASK=0x000f; 700 701 702 /** Shift size for getting the index-1 table offset. */ 703 static final int UTRIE2_SHIFT_1=6+5; 704 705 /** Shift size for getting the index-2 table offset. */ 706 static final int UTRIE2_SHIFT_2=5; 707 708 /** 709 * Difference between the two shift sizes, 710 * for getting an index-1 offset from an index-2 offset. 6=11-5 711 */ 712 static final int UTRIE2_SHIFT_1_2=UTRIE2_SHIFT_1-UTRIE2_SHIFT_2; 713 714 /** 715 * Number of index-1 entries for the BMP. 32=0x20 716 * This part of the index-1 table is omitted from the serialized form. 717 */ 718 static final int UTRIE2_OMITTED_BMP_INDEX_1_LENGTH=0x10000>>UTRIE2_SHIFT_1; 719 720 /** Number of code points per index-1 table entry. 2048=0x800 */ 721 static final int UTRIE2_CP_PER_INDEX_1_ENTRY=1<<UTRIE2_SHIFT_1; 722 723 /** Number of entries in an index-2 block. 64=0x40 */ 724 static final int UTRIE2_INDEX_2_BLOCK_LENGTH=1<<UTRIE2_SHIFT_1_2; 725 726 /** Mask for getting the lower bits for the in-index-2-block offset. */ 727 static final int UTRIE2_INDEX_2_MASK=UTRIE2_INDEX_2_BLOCK_LENGTH-1; 728 729 /** Number of entries in a data block. 32=0x20 */ 730 static final int UTRIE2_DATA_BLOCK_LENGTH=1<<UTRIE2_SHIFT_2; 731 732 /** Mask for getting the lower bits for the in-data-block offset. */ 733 static final int UTRIE2_DATA_MASK=UTRIE2_DATA_BLOCK_LENGTH-1; 734 735 /** 736 * Shift size for shifting left the index array values. 737 * Increases possible data size with 16-bit index values at the cost 738 * of compactability. 739 * This requires data blocks to be aligned by UTRIE2_DATA_GRANULARITY. 740 */ 741 static final int UTRIE2_INDEX_SHIFT=2; 742 743 /** The alignment size of a data block. Also the granularity for compaction. */ 744 static final int UTRIE2_DATA_GRANULARITY=1<<UTRIE2_INDEX_SHIFT; 745 746 /* Fixed layout of the first part of the index array. ------------------- */ 747 748 /** 749 * The BMP part of the index-2 table is fixed and linear and starts at offset 0. 750 * Length=2048=0x800=0x10000>>UTRIE2_SHIFT_2. 751 */ 752 static final int UTRIE2_INDEX_2_OFFSET=0; 753 754 /** 755 * The part of the index-2 table for U+D800..U+DBFF stores values for 756 * lead surrogate code _units_ not code _points_. 757 * Values for lead surrogate code _points_ are indexed with this portion of the table. 758 * Length=32=0x20=0x400>>UTRIE2_SHIFT_2. (There are 1024=0x400 lead surrogates.) 759 */ 760 static final int UTRIE2_LSCP_INDEX_2_OFFSET=0x10000>>UTRIE2_SHIFT_2; 761 static final int UTRIE2_LSCP_INDEX_2_LENGTH=0x400>>UTRIE2_SHIFT_2; 762 763 /** Count the lengths of both BMP pieces. 2080=0x820 */ 764 static final int UTRIE2_INDEX_2_BMP_LENGTH=UTRIE2_LSCP_INDEX_2_OFFSET+UTRIE2_LSCP_INDEX_2_LENGTH; 765 766 /** 767 * The 2-byte UTF-8 version of the index-2 table follows at offset 2080=0x820. 768 * Length 32=0x20 for lead bytes C0..DF, regardless of UTRIE2_SHIFT_2. 769 */ 770 static final int UTRIE2_UTF8_2B_INDEX_2_OFFSET=UTRIE2_INDEX_2_BMP_LENGTH; 771 static final int UTRIE2_UTF8_2B_INDEX_2_LENGTH=0x800>>6; /* U+0800 is the first code point after 2-byte UTF-8 */ 772 773 /** 774 * The index-1 table, only used for supplementary code points, at offset 2112=0x840. 775 * Variable length, for code points up to highStart, where the last single-value range starts. 776 * Maximum length 512=0x200=0x100000>>UTRIE2_SHIFT_1. 777 * (For 0x100000 supplementary code points U+10000..U+10ffff.) 778 * 779 * The part of the index-2 table for supplementary code points starts 780 * after this index-1 table. 781 * 782 * Both the index-1 table and the following part of the index-2 table 783 * are omitted completely if there is only BMP data. 784 */ 785 static final int UTRIE2_INDEX_1_OFFSET=UTRIE2_UTF8_2B_INDEX_2_OFFSET+UTRIE2_UTF8_2B_INDEX_2_LENGTH; 786 static final int UTRIE2_MAX_INDEX_1_LENGTH=0x100000>>UTRIE2_SHIFT_1; 787 788 /* 789 * Fixed layout of the first part of the data array. ----------------------- 790 * Starts with 4 blocks (128=0x80 entries) for ASCII. 791 */ 792 793 /** 794 * The illegal-UTF-8 data block follows the ASCII block, at offset 128=0x80. 795 * Used with linear access for single bytes 0..0xbf for simple error handling. 796 * Length 64=0x40, not UTRIE2_DATA_BLOCK_LENGTH. 797 */ 798 static final int UTRIE2_BAD_UTF8_DATA_OFFSET=0x80; 799 800 /** The start of non-linear-ASCII data blocks, at offset 192=0xc0. */ 801 static final int UTRIE2_DATA_START_OFFSET=0xc0; 802 803 /* Building a Trie2 ---------------------------------------------------------- */ 804 805 /* 806 * These definitions are mostly needed by utrie2_builder.c, but also by 807 * utrie2_get32() and utrie2_enum(). 808 */ 809 810 /* 811 * At build time, leave a gap in the index-2 table, 812 * at least as long as the maximum lengths of the 2-byte UTF-8 index-2 table 813 * and the supplementary index-1 table. 814 * Round up to UTRIE2_INDEX_2_BLOCK_LENGTH for proper compacting. 815 */ 816 static final int UNEWTRIE2_INDEX_GAP_OFFSET = UTRIE2_INDEX_2_BMP_LENGTH; 817 static final int UNEWTRIE2_INDEX_GAP_LENGTH = 818 ((UTRIE2_UTF8_2B_INDEX_2_LENGTH + UTRIE2_MAX_INDEX_1_LENGTH) + UTRIE2_INDEX_2_MASK) & 819 ~UTRIE2_INDEX_2_MASK; 820 821 /** 822 * Maximum length of the build-time index-2 array. 823 * Maximum number of Unicode code points (0x110000) shifted right by UTRIE2_SHIFT_2, 824 * plus the part of the index-2 table for lead surrogate code points, 825 * plus the build-time index gap, 826 * plus the null index-2 block. 827 */ 828 static final int UNEWTRIE2_MAX_INDEX_2_LENGTH= 829 (0x110000>>UTRIE2_SHIFT_2)+ 830 UTRIE2_LSCP_INDEX_2_LENGTH+ 831 UNEWTRIE2_INDEX_GAP_LENGTH+ 832 UTRIE2_INDEX_2_BLOCK_LENGTH; 833 834 static final int UNEWTRIE2_INDEX_1_LENGTH = 0x110000>>UTRIE2_SHIFT_1; 835 836 /** 837 * Maximum length of the build-time data array. 838 * One entry per 0x110000 code points, plus the illegal-UTF-8 block and the null block, 839 * plus values for the 0x400 surrogate code units. 840 */ 841 static final int UNEWTRIE2_MAX_DATA_LENGTH = (0x110000+0x40+0x40+0x400); 842 843 844 845 /** 846 * Implementation class for an iterator over a Trie2. 847 * 848 * Iteration over a Trie2 first returns all of the ranges that are indexed by code points, 849 * then returns the special alternate values for the lead surrogates 850 * 851 * @internal 852 */ 853 class Trie2Iterator implements Iterator<Range> { 854 // The normal constructor that configures the iterator to cover the complete 855 // contents of the Trie2 Trie2Iterator(ValueMapper vm)856 Trie2Iterator(ValueMapper vm) { 857 mapper = vm; 858 nextStart = 0; 859 limitCP = 0x110000; 860 doLeadSurrogates = true; 861 } 862 863 // An alternate constructor that configures the iterator to cover only the 864 // code points corresponding to a particular Lead Surrogate value. Trie2Iterator(char leadSurrogate, ValueMapper vm)865 Trie2Iterator(char leadSurrogate, ValueMapper vm) { 866 if (leadSurrogate < 0xd800 || leadSurrogate > 0xdbff) { 867 throw new IllegalArgumentException("Bad lead surrogate value."); 868 } 869 mapper = vm; 870 nextStart = (leadSurrogate - 0xd7c0) << 10; 871 limitCP = nextStart + 0x400; 872 doLeadSurrogates = false; // Do not iterate over lead the special lead surrogate 873 // values after completing iteration over code points. 874 } 875 876 /** 877 * The main next() function for Trie2 iterators 878 * 879 */ 880 @Override next()881 public Range next() { 882 if (!hasNext()) { 883 throw new NoSuchElementException(); 884 } 885 if (nextStart >= limitCP) { 886 // Switch over from iterating normal code point values to 887 // doing the alternate lead-surrogate values. 888 doingCodePoints = false; 889 nextStart = 0xd800; 890 } 891 int endOfRange = 0; 892 int val = 0; 893 int mappedVal = 0; 894 895 if (doingCodePoints) { 896 // Iteration over code point values. 897 val = get(nextStart); 898 mappedVal = mapper.map(val); 899 endOfRange = rangeEnd(nextStart, limitCP, val); 900 // Loop once for each range in the Trie2 with the same raw (unmapped) value. 901 // Loop continues so long as the mapped values are the same. 902 for (;;) { 903 if (endOfRange >= limitCP-1) { 904 break; 905 } 906 val = get(endOfRange+1); 907 if (mapper.map(val) != mappedVal) { 908 break; 909 } 910 endOfRange = rangeEnd(endOfRange+1, limitCP, val); 911 } 912 } else { 913 // Iteration over the alternate lead surrogate values. 914 val = getFromU16SingleLead((char)nextStart); 915 mappedVal = mapper.map(val); 916 endOfRange = rangeEndLS((char)nextStart); 917 // Loop once for each range in the Trie2 with the same raw (unmapped) value. 918 // Loop continues so long as the mapped values are the same. 919 for (;;) { 920 if (endOfRange >= 0xdbff) { 921 break; 922 } 923 val = getFromU16SingleLead((char)(endOfRange+1)); 924 if (mapper.map(val) != mappedVal) { 925 break; 926 } 927 endOfRange = rangeEndLS((char)(endOfRange+1)); 928 } 929 } 930 returnValue.startCodePoint = nextStart; 931 returnValue.endCodePoint = endOfRange; 932 returnValue.value = mappedVal; 933 returnValue.leadSurrogate = !doingCodePoints; 934 nextStart = endOfRange+1; 935 return returnValue; 936 } 937 938 /** 939 * 940 */ 941 @Override hasNext()942 public boolean hasNext() { 943 return doingCodePoints && (doLeadSurrogates || nextStart < limitCP) || nextStart < 0xdc00; 944 } 945 946 @Override remove()947 public void remove() { 948 throw new UnsupportedOperationException(); 949 } 950 951 952 /** 953 * Find the last lead surrogate in a contiguous range with the 954 * same Trie2 value as the input character. 955 * 956 * Use the alternate Lead Surrogate values from the Trie2, 957 * not the code-point values. 958 * 959 * Note: Trie2_16 and Trie2_32 override this implementation with optimized versions, 960 * meaning that the implementation here is only being used with 961 * Trie2Writable. The code here is logically correct with any type 962 * of Trie2, however. 963 * 964 * @param c The character to begin with. 965 * @return The last contiguous character with the same value. 966 */ rangeEndLS(char startingLS)967 private int rangeEndLS(char startingLS) { 968 if (startingLS >= 0xdbff) { 969 return 0xdbff; 970 } 971 972 int c; 973 int val = getFromU16SingleLead(startingLS); 974 for (c = startingLS+1; c <= 0x0dbff; c++) { 975 if (getFromU16SingleLead((char)c) != val) { 976 break; 977 } 978 } 979 return c-1; 980 } 981 982 // 983 // Iteration State Variables 984 // 985 private ValueMapper mapper; 986 private Range returnValue = new Range(); 987 // The starting code point for the next range to be returned. 988 private int nextStart; 989 // The upper limit for the last normal range to be returned. Normally 0x110000, but 990 // may be lower when iterating over the code points for a single lead surrogate. 991 private int limitCP; 992 993 // True while iterating over the the Trie2 values for code points. 994 // False while iterating over the alternate values for lead surrogates. 995 private boolean doingCodePoints = true; 996 997 // True if the iterator should iterate the special values for lead surrogates in 998 // addition to the normal values for code points. 999 private boolean doLeadSurrogates = true; 1000 } 1001 1002 /** 1003 * Find the last character in a contiguous range of characters with the 1004 * same Trie2 value as the input character. 1005 * 1006 * @param c The character to begin with. 1007 * @return The last contiguous character with the same value. 1008 */ rangeEnd(int start, int limitp, int val)1009 int rangeEnd(int start, int limitp, int val) { 1010 int c; 1011 int limit = Math.min(highStart, limitp); 1012 1013 for (c = start+1; c < limit; c++) { 1014 if (get(c) != val) { 1015 break; 1016 } 1017 } 1018 if (c >= highStart) { 1019 c = limitp; 1020 } 1021 return c - 1; 1022 } 1023 1024 1025 // 1026 // Hashing implementation functions. FNV hash. Respected public domain algorithm. 1027 // initHash()1028 private static int initHash() { 1029 return 0x811c9DC5; // unsigned 2166136261 1030 } 1031 hashByte(int h, int b)1032 private static int hashByte(int h, int b) { 1033 h = h * 16777619; 1034 h = h ^ b; 1035 return h; 1036 } 1037 hashUChar32(int h, int c)1038 private static int hashUChar32(int h, int c) { 1039 h = Trie2.hashByte(h, c & 255); 1040 h = Trie2.hashByte(h, (c>>8) & 255); 1041 h = Trie2.hashByte(h, c>>16); 1042 return h; 1043 } 1044 hashInt(int h, int i)1045 private static int hashInt(int h, int i) { 1046 h = Trie2.hashByte(h, i & 255); 1047 h = Trie2.hashByte(h, (i>>8) & 255); 1048 h = Trie2.hashByte(h, (i>>16) & 255); 1049 h = Trie2.hashByte(h, (i>>24) & 255); 1050 return h; 1051 } 1052 1053 } 1054