// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 ******************************************************************************
 * Copyright (C) 1996-2015, International Business Machines Corporation and
 * others. All Rights Reserved.
 ******************************************************************************
 */

package com.ibm.icu.impl;

import java.nio.ByteBuffer;

import com.ibm.icu.text.UTF16;

/**
 * Trie implementation which stores data in char, 16 bits.
 * @author synwee
 * @see com.ibm.icu.impl.Trie
 * @since release 2.1, Jan 01 2002
 */

// note that i need to handle the block calculations later, since chartrie
// in icu4c uses the same index array.
public class CharTrie extends Trie
{
    // public constructors ---------------------------------------------

    /**
     * <p>Creates a new Trie with the settings for the trie data.</p>
     * <p>Unserialize the 32-bit-aligned input buffer and use the data for the
     * trie.</p>
     * @param bytes data of an ICU data file, containing the trie
     * @param dataManipulate object which provides methods to parse the char
     *                        data
     * @throws IllegalArgumentException if the data read does not describe a
     *                                  char trie
     */
    public CharTrie(ByteBuffer bytes, DataManipulate dataManipulate) {
        super(bytes, dataManipulate);

        if (!isCharTrie()) {
            throw new IllegalArgumentException(
                               "Data given does not belong to a char trie.");
        }
    }

    /**
     * Make a dummy CharTrie.
     * A dummy trie is an empty runtime trie, used when a real data trie cannot
     * be loaded.
     *
     * The trie always returns the initialValue,
     * or the leadUnitValue for lead surrogate code points.
     * The Latin-1 part is always set up to be linear.
     *
     * @param initialValue the initial value that is set for all code points
     * @param leadUnitValue the value for lead surrogate code _units_ that do not
     *                      have associated supplementary data
     * @param dataManipulate object which provides methods to parse the char data
     */
    @SuppressWarnings("all") // No way to ignore dead code warning specifically - see eclipse bug#282770
    public CharTrie(int initialValue, int leadUnitValue, DataManipulate dataManipulate) {
        super(new char[BMP_INDEX_LENGTH+SURROGATE_BLOCK_COUNT], HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_, dataManipulate);

        /* calculate the actual size of the dummy trie data */

        /* max(Latin-1, block 0) */
        int latin1Length = INDEX_STAGE_1_SHIFT_<=8 ? 256 : DATA_BLOCK_LENGTH;
        int dataLength = latin1Length;
        if (leadUnitValue != initialValue) {
            /* one extra block holds the lead surrogate value */
            dataLength += DATA_BLOCK_LENGTH;
        }
        m_data_ = new char[dataLength];
        m_dataLength_ = dataLength;

        m_initialValue_ = (char)initialValue;

        /* fill the index and data arrays */

        /* indexes are preset to 0 (block 0) */

        /* Latin-1 data */
        for (int i = 0; i < latin1Length; ++i) {
            m_data_[i] = (char)initialValue;
        }

        if (leadUnitValue != initialValue) {
            /* indexes for lead surrogate code units to the block after Latin-1 */
            char block = (char)(latin1Length >> INDEX_STAGE_2_SHIFT_);
            int indexLimit = 0xdc00 >> INDEX_STAGE_1_SHIFT_;
            for (int i = 0xd800 >> INDEX_STAGE_1_SHIFT_; i < indexLimit; ++i) {
                m_index_[i] = block;
            }

            /* data for lead surrogate code units */
            int dataLimit = latin1Length + DATA_BLOCK_LENGTH;
            for (int i = latin1Length; i < dataLimit; ++i) {
                m_data_[i] = (char)leadUnitValue;
            }
        }
    }

    // public methods --------------------------------------------------

    /**
     * Gets the value associated with the codepoint.
     * If no value is associated with the codepoint, a default value will be
     * returned.
     * @param ch codepoint
     * @return trie data value for the codepoint, or the initial value if no
     *         data offset can be determined for it
     */
    public final char getCodePointValue(int ch)
    {
        int offset;

        // fastpath for U+0000..U+D7FF
        if (0 <= ch && ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
            // copy of getRawOffset()
            offset = (m_index_[ch >> INDEX_STAGE_1_SHIFT_] << INDEX_STAGE_2_SHIFT_)
                    + (ch & INDEX_STAGE_3_MASK_);
            return m_data_[offset];
        }

        // handle U+D800..U+10FFFF
        offset = getCodePointOffset(ch);

        // a negative offset signals an error, in this case we return the
        // default value: m_initialValue_
        return (offset >= 0) ? m_data_[offset] : m_initialValue_;
    }

    /**
     * Gets the value to the data which this lead surrogate character points
     * to.
     * Returned data may contain folding offset information for the next
     * trailing surrogate character.
     * This method does not guarantee correct results for trail surrogates.
     * @param ch lead surrogate character
     * @return data value
     */
    public final char getLeadValue(char ch)
    {
        return m_data_[getLeadOffset(ch)];
    }

    /**
     * Get the value associated with the BMP code point.
     * Lead surrogate code points are treated as normal code points, with
     * unfolded values that may differ from getLeadValue() results.
     * @param ch the input BMP code point
     * @return trie data value associated with the BMP codepoint
     */
    public final char getBMPValue(char ch)
    {
        return m_data_[getBMPOffset(ch)];
    }

    /**
     * Get the value associated with a pair of surrogates.
     * @param lead a lead surrogate
     * @param trail a trail surrogate
     * @return trie data value for the surrogate pair, or the initial value if
     *         no data offset can be determined for it
     */
    public final char getSurrogateValue(char lead, char trail)
    {
        int offset = getSurrogateOffset(lead, trail);
        // getSurrogateOffset() reports failure as -1, so every non-negative
        // offset (including 0) is a usable position in m_data_; this matches
        // the check in getCodePointValue().
        if (offset >= 0) {
            return m_data_[offset];
        }
        return m_initialValue_;
    }

    /**
     * <p>Get a value from a folding offset (from the value of a lead surrogate)
     * and a trail surrogate.</p>
     * <p>If the lead value yields no positive folding offset, the initial
     * value is returned.</p>
     * @param leadvalue value associated with the lead surrogate which contains
     *        the folding offset
     * @param trail surrogate
     * @return trie data value associated with the trail character
     * @throws NullPointerException if this trie has no DataManipulate object
     */
    public final char getTrailValue(int leadvalue, char trail)
    {
        if (m_dataManipulate_ == null) {
            throw new NullPointerException(
                             "The field DataManipulate in this Trie is null");
        }
        int offset = m_dataManipulate_.getFoldingOffset(leadvalue);
        if (offset > 0) {
            return m_data_[getRawOffset(offset,
                                        (char)(trail & SURROGATE_MASK_))];
        }
        return m_initialValue_;
    }

    /**
     * <p>Gets the latin 1 fast path value.</p>
     * <p>Note this only works if latin 1 characters have their own linear
     * array.</p>
     * @param ch latin 1 characters
     * @return value associated with latin character
     */
    public final char getLatin1LinearValue(char ch)
    {
        return m_data_[INDEX_STAGE_3_MASK_ + 1 + m_dataOffset_ + ch];
    }

    /**
     * Checks if the argument Trie has the same data as this Trie
     * @param other Trie to check
     * @return true if the argument Trie has the same data as this Trie, false
     *         otherwise
     */
    ///CLOVER:OFF
    @Override
    public boolean equals(Object other)
    {
        boolean result = super.equals(other);
        if (result && other instanceof CharTrie) {
            CharTrie othertrie = (CharTrie)other;
            return m_initialValue_ == othertrie.m_initialValue_;
        }
        return false;
    }

    /**
     * Not designed for use: trips an assertion when assertions are enabled,
     * otherwise returns a constant.
     * @return the constant 42
     */
    @Override
    public int hashCode() {
        assert false : "hashCode not designed";
        return 42;
    }
    ///CLOVER:ON

    // protected methods -----------------------------------------------

    /**
     * <p>Parses the byte buffer and stores its trie content into a index and
     * data array</p>
     * <p>Index and data are read as one char array that both m_index_ and
     * m_data_ refer to; the initial value is the char at m_dataOffset_.</p>
     * @param bytes buffer containing trie data
     */
    @Override
    protected final void unserialize(ByteBuffer bytes)
    {
        int indexDataLength = m_dataOffset_ + m_dataLength_;
        m_index_ = ICUBinary.getChars(bytes, indexDataLength, 0);
        m_data_           = m_index_;
        m_initialValue_   = m_data_[m_dataOffset_];
    }

    /**
     * Gets the offset to the data which the surrogate pair points to.
     * @param lead lead surrogate
     * @param trail trailing surrogate
     * @return offset to data, or -1 if the lead surrogate's value carries no
     *         positive folding offset
     * @throws NullPointerException if this trie has no DataManipulate object
     */
    @Override
    protected final int getSurrogateOffset(char lead, char trail)
    {
        if (m_dataManipulate_ == null) {
            throw new NullPointerException(
                             "The field DataManipulate in this Trie is null");
        }

        // get fold position for the next trail surrogate
        int offset = m_dataManipulate_.getFoldingOffset(getLeadValue(lead));

        // get the real data from the folded lead/trail units
        if (offset > 0) {
            return getRawOffset(offset, (char)(trail & SURROGATE_MASK_));
        }

        // return -1 if there is an error, in this case we return the default
        // value: m_initialValue_
        return -1;
    }

    /**
     * Gets the value at the argument index.
     * For use internally in TrieIterator.
     * @param index value at index will be retrieved
     * @return the 16 bit data value at the index, widened to int
     * @see com.ibm.icu.impl.TrieIterator
     */
    @Override
    protected final int getValue(int index)
    {
        return m_data_[index];
    }

    /**
     * Gets the default initial value
     * @return the 16 bit initial value, widened to int
     */
    @Override
    protected final int getInitialValue()
    {
        return m_initialValue_;
    }

    // private data members --------------------------------------------

    /**
     * Default value
     */
    private char m_initialValue_;
    /**
     * Array of char data
     */
    private char m_data_[];
}