// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 ******************************************************************************
 * Copyright (C) 1996-2015, International Business Machines Corporation and
 * others. All Rights Reserved.
 ******************************************************************************
 */

package com.ibm.icu.impl;

import java.nio.ByteBuffer;
import java.util.Arrays;

import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.UTF16;

/**
 * <p>A trie is a kind of compressed, serializable table of values
 * associated with Unicode code points (0..0x10ffff).</p>
 * <p>This class defines the basic structure of a trie and provides methods
 * to <b>retrieve the offsets to the actual data</b>.</p>
 * <p>Data will be in the form of an array of basic types, char or int.</p>
 * <p>The actual data format has to be specified by the user through the
 * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>
 * <p>This trie implementation is optimized for getting offsets while walking
 * forward through a UTF-16 string.
 * Therefore, the simplest and fastest access macros are the
 * fromLead() and fromOffsetTrail() methods.
 * The fromBMP() methods are a little more complicated; they get offsets even
 * for lead surrogate code points, while the fromLead() methods get special
 * "folded" offsets for lead surrogate code units if there is relevant data
 * associated with them.
 * From such a folded offset, an offset needs to be extracted to supply
 * to the fromOffsetTrail() methods.
 * To handle such supplementary code points, some offset information is kept
 * in the data.</p>
 * <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve
 * that offset from the folded value for the lead surrogate unit.</p>
 * <p>For examples of use, see com.ibm.icu.impl.CharTrie or
 * com.ibm.icu.impl.IntTrie.</p>
 * @author synwee
 * @see com.ibm.icu.impl.CharTrie
 * @see com.ibm.icu.impl.IntTrie
 * @since release 2.1, Jan 01 2002
 */
public abstract class Trie
{
    // public class declaration ----------------------------------------

    /**
     * Character data in com.ibm.icu.impl.Trie have different user-specified
     * formats for different purposes.
     * This interface specifies the method to be implemented in order for
     * com.ibm.icu.impl.Trie to extract the surrogate offset information
     * encapsulated within the data.
     */
    public static interface DataManipulate
    {
        /**
         * Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's
         * data the index array offset of the indexes for that lead surrogate.
         * @param value data value for a surrogate from the trie, including the
         *        folding offset
         * @return data offset or 0 if there is no data for the lead surrogate
         */
        public int getFoldingOffset(int value);
    }

    /**
     * Default implementation: the data value itself is used as the folding
     * offset.
     */
    private static class DefaultGetFoldingOffset implements DataManipulate {
        @Override
        public int getFoldingOffset(int value) {
            return value;
        }
    }

    // public methods --------------------------------------------------

    /**
     * Determines if this trie has a linear latin 1 array
     * @return true if this trie has a linear latin 1 array, false otherwise
     */
    public final boolean isLatin1Linear()
    {
        return m_isLatin1Linear_;
    }

    /**
     * Checks if the argument Trie has the same data as this Trie.
     * The Latin-1 flag, the options, the data length and the contents of the
     * index array are compared.
     * @param other Trie to check
     * @return true if the argument Trie has the same data as this Trie, false
     *         otherwise
     */
    ///CLOVER:OFF
    @Override
    public boolean equals(Object other)
    {
        if (other == this) {
            return true;
        }
        if (!(other instanceof Trie)) {
            return false;
        }
        Trie that = (Trie) other;
        return m_isLatin1Linear_ == that.m_isLatin1Linear_
            && m_options_ == that.m_options_
            && m_dataLength_ == that.m_dataLength_
            && Arrays.equals(m_index_, that.m_index_);
    }

    /**
     * Tries are not designed to be used as hash keys: with assertions enabled
     * this method fails, otherwise it returns a constant (which still honors
     * the equals/hashCode contract).
     * @return a constant value
     */
    @Override
    public int hashCode() {
        assert false : "hashCode not designed";
        return 42;
    }
    ///CLOVER:ON

    /**
     * Gets the serialized data file size of the Trie. This is used during
     * trie data reading for size checking purposes.
     * @return size of the serialized trie data file in terms of the number
     *         of bytes
     */
    public int getSerializedDataSize()
    {
        // header: signature, options, index length and data length
        int size = HEADER_LENGTH_;
        // index entries are 16 bits wide
        size += m_dataOffset_ << 1;
        // data entries are 16 bits (char trie) or 32 bits (int trie) wide
        size += m_dataLength_ << (isIntTrie() ? 2 : 1);
        return size;
    }

    // protected constructor -------------------------------------------

    /**
     * Trie constructor for CharTrie use.
     * Reads the four header ints (signature, options, index length, data
     * length) from the buffer and then calls {@link #unserialize(ByteBuffer)}.
     * Note that unserialize() is overridable and is invoked from this
     * constructor, i.e. before any subclass constructor body has run.
     * @param bytes data of an ICU data file, containing the trie
     * @param dataManipulate object containing the information to parse the
     *        trie data; if null, a default that returns the value unchanged
     *        is used
     * @throws IllegalArgumentException if the header fails authentication or
     *         declares a negative index or data length
     */
    protected Trie(ByteBuffer bytes, DataManipulate dataManipulate)
    {
        // Magic number to authenticate the data.
        int signature = bytes.getInt();
        m_options_ = bytes.getInt();

        if (!checkHeader(signature, m_options_)) {
            throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
        }

        m_dataManipulate_ = resolveDataManipulate(dataManipulate);
        m_isLatin1Linear_ =
            (m_options_ & HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
        m_dataOffset_ = bytes.getInt();
        m_dataLength_ = bytes.getInt();
        // Reject corrupt lengths here so that they are reported as a header
        // problem instead of failing later while reading the arrays.
        if (m_dataOffset_ < 0 || m_dataLength_ < 0) {
            throw new IllegalArgumentException(
                "ICU data file error: Trie header contains a negative index or data length");
        }
        unserialize(bytes);
    }

    /**
     * Trie constructor
     * @param index array to be used for index; it is stored, not copied
     * @param options used by the trie
     * @param dataManipulate object containing the information to parse the
     *        trie data; if null, a default that returns the value unchanged
     *        is used
     */
    protected Trie(char[] index, int options, DataManipulate dataManipulate)
    {
        m_options_ = options;
        m_dataManipulate_ = resolveDataManipulate(dataManipulate);
        m_isLatin1Linear_ =
            (m_options_ & HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
        m_index_ = index;
        m_dataOffset_ = m_index_.length;
    }


    // protected data members ------------------------------------------

    /**
     * Shift size for shifting right the input index. 1..9
     */
    protected static final int INDEX_STAGE_1_SHIFT_ = 5;
    /**
     * Lead surrogate code points' index displacement in the index array.
     * 0x10000-0xd800=0x2800
     * 0x2800 >> INDEX_STAGE_1_SHIFT_
     */
    protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> INDEX_STAGE_1_SHIFT_;
    /**
     * Shift size for shifting left the index array values.
     * Increases possible data size with 16-bit index values at the cost
     * of compactability.
     * This requires blocks of stage 2 data to be aligned by
     * DATA_GRANULARITY.
     * 0..INDEX_STAGE_1_SHIFT
     */
    protected static final int INDEX_STAGE_2_SHIFT_ = 2;
    /**
     * Number of data values in a stage 2 (data array) block.
     */
    protected static final int DATA_BLOCK_LENGTH = 1 << INDEX_STAGE_1_SHIFT_;
    /**
     * Mask for getting the lower bits from the input index.
     * DATA_BLOCK_LENGTH - 1.
     */
    protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
    /** Number of bits of a trail surrogate that are used in index table lookups. */
    protected static final int SURROGATE_BLOCK_BITS = 10 - INDEX_STAGE_1_SHIFT_;
    /**
     * Number of index (stage 1) entries per lead surrogate.
     * Same as number of index entries for 1024 trail surrogates,
     * ==0x400>>INDEX_STAGE_1_SHIFT_
     */
    protected static final int SURROGATE_BLOCK_COUNT = (1 << SURROGATE_BLOCK_BITS);
    /** Length of the BMP portion of the index (stage 1) array. */
    protected static final int BMP_INDEX_LENGTH = 0x10000 >> INDEX_STAGE_1_SHIFT_;
    /**
     * Surrogate mask to use when shifting offset to retrieve supplementary
     * values
     */
    protected static final int SURROGATE_MASK_ = 0x3FF;
    /**
     * Index or UTF16 characters
     */
    protected char[] m_index_;
    /**
     * Internal TrieValue which handles the parsing of the data value.
     * This class is to be implemented by the user
     */
    protected DataManipulate m_dataManipulate_;
    /**
     * Start index of the data portion of the trie. CharTrie combines
     * index and data into a char array, so this is used to indicate the
     * initial offset to the data portion.
     * Note this index always points to the initial value.
     */
    protected int m_dataOffset_;
    /**
     * Length of the data array
     */
    protected int m_dataLength_;

    // protected methods -----------------------------------------------

    /**
     * Gets the offset to the data which the surrogate pair points to.
     * @param lead lead surrogate
     * @param trail trailing surrogate
     * @return offset to data
     */
    protected abstract int getSurrogateOffset(char lead, char trail);

    /**
     * Gets the value at the argument index
     * @param index value at index will be retrieved
     * @return 32 bit value
     */
    protected abstract int getValue(int index);

    /**
     * Gets the default initial value
     * @return 32 bit value
     */
    protected abstract int getInitialValue();

    /**
     * Gets the offset to the data which the index ch after variable offset
     * points to.
     * Note for locating a non-supplementary character data offset, calling
     * <p>
     * getRawOffset(0, ch);
     * </p>
     * will do. Otherwise, for a supplementary character formed by the
     * surrogates lead and trail, getRawOffset() has to be called with the
     * index offset extracted from the lead surrogate's folded value (see
     * DataManipulate.getFoldingOffset(int) and getSurrogateOffset()).
     * @param offset index offset which ch is to start from
     * @param ch index to be used after offset
     * @return offset to the data
     */
    protected final int getRawOffset(int offset, char ch)
    {
        // stage 1: the upper bits of ch select the index entry, which is then
        // scaled up to the start of a data block
        int blockStart =
            m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)] << INDEX_STAGE_2_SHIFT_;
        // the lower bits of ch select the entry within that data block
        return blockStart + (ch & INDEX_STAGE_3_MASK_);
    }

    /**
     * Gets the offset to data which the BMP character points to
     * Treats a lead surrogate as a normal code point.
     * @param ch BMP character
     * @return offset to data
     */
    protected final int getBMPOffset(char ch)
    {
        boolean isLeadSurrogate = ch >= UTF16.LEAD_SURROGATE_MIN_VALUE
                                  && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE;
        // lead surrogate code points are looked up through the displaced part
        // of the index; everything else in the BMP is looked up directly
        return getRawOffset(isLeadSurrogate ? LEAD_INDEX_OFFSET_ : 0, ch);
    }

    /**
     * Gets the offset to the data which this lead surrogate character points
     * to.
     * Data at the returned offset may contain folding offset information for
     * the next trailing surrogate character.
     * @param ch lead surrogate character
     * @return offset to data
     */
    protected final int getLeadOffset(char ch)
    {
        return getRawOffset(0, ch);
    }

    /**
     * Internal trie getter from a code point.
     * Could be faster(?) but longer with
     *   if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }
     * Gets the offset to data which the codepoint points to
     * @param ch codepoint
     * @return offset to data, or -1 if ch is negative or greater than
     *         UCharacter.MAX_VALUE
     */
    protected final int getCodePointOffset(int ch)
    {
        // if ((ch >> 16) == 0) slower
        if (ch < 0) {
            return -1;
        }
        if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
            // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
            return getRawOffset(0, (char) ch);
        }
        if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
            // BMP codepoint
            return getBMPOffset((char) ch);
        }
        if (ch <= UCharacter.MAX_VALUE) {
            // look at the construction of supplementary characters
            // trail forms the ends of it.
            return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
                                      (char) (ch & SURROGATE_MASK_));
        }
        // out of range: signal the error with -1
        return -1;
    }

    /**
     * <p>Parses the byte buffer and creates the trie index with it.</p>
     * <p>The position of the input ByteBuffer must be right after the trie header.</p>
     * <p>This is overridden by the child classes.</p>
     * @param bytes buffer containing trie data
     */
    protected void unserialize(ByteBuffer bytes)
    {
        m_index_ = ICUBinary.getChars(bytes, m_dataOffset_, 0);
    }

    /**
     * Determines if this is a 32 bit trie
     * @return true if options specifies this is a 32 bit trie
     */
    protected final boolean isIntTrie()
    {
        return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0;
    }

    /**
     * Determines if this is a 16 bit trie
     * @return true if this is a 16 bit trie
     */
    protected final boolean isCharTrie()
    {
        return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
    }

    // header constants and private data members -----------------------

    // struct UTrieHeader {
    //     int32_t   signature;
    //     int32_t   options  (a bit field)
    //     int32_t   indexLength
    //     int32_t   dataLength

    /**
     * Size of Trie header in bytes
     */
    protected static final int HEADER_LENGTH_ = 4 * 4;
    /**
     * Latin 1 option mask
     */
    protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
    /**
     * Constant number to authenticate the byte block
     */
    protected static final int HEADER_SIGNATURE_ = 0x54726965;
    /**
     * Header option formatting
     */
    private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;
    protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
    protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;

    /**
     * Flag indicator for Latin quick access data block
     */
    private final boolean m_isLatin1Linear_;

    /**
     * <p>Trie options field.</p>
     * <p>options bit field:<br>
     * 9     1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>
     * 8     0 = 16-bit data, 1 = 32-bit data<br>
     * 7..4  INDEX_STAGE_2_SHIFT   // 0..INDEX_STAGE_1_SHIFT<br>
     * 3..0  INDEX_STAGE_1_SHIFT   // 1..9<br>
     */
    private final int m_options_;

    // private methods ---------------------------------------------------

    /**
     * Returns the caller-supplied data manipulator, or the default one (which
     * returns the value unchanged) if none was supplied.
     * @param dataManipulate caller-supplied manipulator, may be null
     * @return a non-null manipulator
     */
    private static DataManipulate resolveDataManipulate(DataManipulate dataManipulate)
    {
        return dataManipulate != null ? dataManipulate
                                      : new DefaultGetFoldingOffset();
    }

    /**
     * Authenticates raw data header.
     * Checks the signature and the two shift values stored in the options.
     * @param signature magic number read from the header
     * @param options options bit field read from the header
     * @return true if the header is authenticated valid
     */
    private static boolean checkHeader(int signature, int options)
    {
        // check the signature
        // Trie in big-endian US-ASCII (0x54726965).
        // Magic number to authenticate the data.
        if (signature != HEADER_SIGNATURE_) {
            return false;
        }

        // bits 3..0 must hold the stage 1 shift, bits 7..4 the stage 2 shift
        int stage1Shift = options & HEADER_OPTIONS_SHIFT_MASK_;
        int stage2Shift = (options >> HEADER_OPTIONS_INDEX_SHIFT_)
                          & HEADER_OPTIONS_SHIFT_MASK_;
        return stage1Shift == INDEX_STAGE_1_SHIFT_
            && stage2Shift == INDEX_STAGE_2_SHIFT_;
    }
}