1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  ******************************************************************************
5  * Copyright (C) 1996-2015, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  ******************************************************************************
8  */
9 
10 package com.ibm.icu.impl;
11 
import java.nio.ByteBuffer;
import java.util.Arrays;

import com.ibm.icu.text.UTF16;
15 
16 /**
17  * Trie implementation which stores data in char, 16 bits.
18  * @author synwee
19  * @see com.ibm.icu.impl.Trie
20  * @since release 2.1, Jan 01 2002
21  */
22 
23  // note that i need to handle the block calculations later, since chartrie
24  // in icu4c uses the same index array.
25 public class CharTrie extends Trie
26 {
27     // public constructors ---------------------------------------------
28 
29     /**
30      * <p>Creates a new Trie with the settings for the trie data.</p>
31      * <p>Unserialize the 32-bit-aligned input buffer and use the data for the
32      * trie.</p>
33      * @param bytes data of an ICU data file, containing the trie
34      * @param dataManipulate object which provides methods to parse the char
35      *                        data
36      */
CharTrie(ByteBuffer bytes, DataManipulate dataManipulate)37     public CharTrie(ByteBuffer bytes, DataManipulate dataManipulate) {
38         super(bytes, dataManipulate);
39 
40         if (!isCharTrie()) {
41             throw new IllegalArgumentException(
42                                "Data given does not belong to a char trie.");
43         }
44     }
45 
46     /**
47      * Make a dummy CharTrie.
48      * A dummy trie is an empty runtime trie, used when a real data trie cannot
49      * be loaded.
50      *
51      * The trie always returns the initialValue,
52      * or the leadUnitValue for lead surrogate code points.
53      * The Latin-1 part is always set up to be linear.
54      *
55      * @param initialValue the initial value that is set for all code points
56      * @param leadUnitValue the value for lead surrogate code _units_ that do not
57      *                      have associated supplementary data
58      * @param dataManipulate object which provides methods to parse the char data
59      */
60     @SuppressWarnings("all") // No way to ignore dead code warning specifically - see eclipse bug#282770
CharTrie(int initialValue, int leadUnitValue, DataManipulate dataManipulate)61     public CharTrie(int initialValue, int leadUnitValue, DataManipulate dataManipulate) {
62         super(new char[BMP_INDEX_LENGTH+SURROGATE_BLOCK_COUNT], HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_, dataManipulate);
63 
64         int dataLength, latin1Length, i, limit;
65         char block;
66 
67         /* calculate the actual size of the dummy trie data */
68 
69         /* max(Latin-1, block 0) */
70         dataLength=latin1Length= INDEX_STAGE_1_SHIFT_<=8 ? 256 : DATA_BLOCK_LENGTH;
71         if(leadUnitValue!=initialValue) {
72             dataLength+=DATA_BLOCK_LENGTH;
73         }
74         m_data_=new char[dataLength];
75         m_dataLength_=dataLength;
76 
77         m_initialValue_=(char)initialValue;
78 
79         /* fill the index and data arrays */
80 
81         /* indexes are preset to 0 (block 0) */
82 
83         /* Latin-1 data */
84         for(i=0; i<latin1Length; ++i) {
85             m_data_[i]=(char)initialValue;
86         }
87 
88         if(leadUnitValue!=initialValue) {
89             /* indexes for lead surrogate code units to the block after Latin-1 */
90             block=(char)(latin1Length>>INDEX_STAGE_2_SHIFT_);
91             i=0xd800>>INDEX_STAGE_1_SHIFT_;
92             limit=0xdc00>>INDEX_STAGE_1_SHIFT_;
93             for(; i<limit; ++i) {
94                 m_index_[i]=block;
95             }
96 
97             /* data for lead surrogate code units */
98             limit=latin1Length+DATA_BLOCK_LENGTH;
99             for(i=latin1Length; i<limit; ++i) {
100                 m_data_[i]=(char)leadUnitValue;
101             }
102         }
103     }
104 
105     // public methods --------------------------------------------------
106 
107     /**
108     * Gets the value associated with the codepoint.
109     * If no value is associated with the codepoint, a default value will be
110     * returned.
111     * @param ch codepoint
112     * @return offset to data
113     */
getCodePointValue(int ch)114     public final char getCodePointValue(int ch)
115     {
116         int offset;
117 
118         // fastpath for U+0000..U+D7FF
119         if(0 <= ch && ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
120             // copy of getRawOffset()
121             offset = (m_index_[ch >> INDEX_STAGE_1_SHIFT_] << INDEX_STAGE_2_SHIFT_)
122                     + (ch & INDEX_STAGE_3_MASK_);
123             return m_data_[offset];
124         }
125 
126         // handle U+D800..U+10FFFF
127         offset = getCodePointOffset(ch);
128 
129         // return -1 if there is an error, in this case we return the default
130         // value: m_initialValue_
131         return (offset >= 0) ? m_data_[offset] : m_initialValue_;
132     }
133 
134     /**
135     * Gets the value to the data which this lead surrogate character points
136     * to.
137     * Returned data may contain folding offset information for the next
138     * trailing surrogate character.
139     * This method does not guarantee correct results for trail surrogates.
140     * @param ch lead surrogate character
141     * @return data value
142     */
getLeadValue(char ch)143     public final char getLeadValue(char ch)
144     {
145        return m_data_[getLeadOffset(ch)];
146     }
147 
148     /**
149     * Get the value associated with the BMP code point.
150     * Lead surrogate code points are treated as normal code points, with
151     * unfolded values that may differ from getLeadValue() results.
152     * @param ch the input BMP code point
153     * @return trie data value associated with the BMP codepoint
154     */
getBMPValue(char ch)155     public final char getBMPValue(char ch)
156     {
157         return m_data_[getBMPOffset(ch)];
158     }
159 
160     /**
161     * Get the value associated with a pair of surrogates.
162     * @param lead a lead surrogate
163     * @param trail a trail surrogate
164     */
getSurrogateValue(char lead, char trail)165     public final char getSurrogateValue(char lead, char trail)
166     {
167         int offset = getSurrogateOffset(lead, trail);
168         if (offset > 0) {
169             return m_data_[offset];
170         }
171         return m_initialValue_;
172     }
173 
174     /**
175     * <p>Get a value from a folding offset (from the value of a lead surrogate)
176     * and a trail surrogate.</p>
177     * <p>If the
178     * @param leadvalue value associated with the lead surrogate which contains
179     *        the folding offset
180     * @param trail surrogate
181     * @return trie data value associated with the trail character
182     */
getTrailValue(int leadvalue, char trail)183     public final char getTrailValue(int leadvalue, char trail)
184     {
185         if (m_dataManipulate_ == null) {
186             throw new NullPointerException(
187                              "The field DataManipulate in this Trie is null");
188         }
189         int offset = m_dataManipulate_.getFoldingOffset(leadvalue);
190         if (offset > 0) {
191             return m_data_[getRawOffset(offset,
192                                         (char)(trail & SURROGATE_MASK_))];
193         }
194         return m_initialValue_;
195     }
196 
197     /**
198      * <p>Gets the latin 1 fast path value.</p>
199      * <p>Note this only works if latin 1 characters have their own linear
200      * array.</p>
201      * @param ch latin 1 characters
202      * @return value associated with latin character
203      */
getLatin1LinearValue(char ch)204     public final char getLatin1LinearValue(char ch)
205     {
206         return m_data_[INDEX_STAGE_3_MASK_ + 1 + m_dataOffset_ + ch];
207     }
208 
209     /**
210      * Checks if the argument Trie has the same data as this Trie
211      * @param other Trie to check
212      * @return true if the argument Trie has the same data as this Trie, false
213      *         otherwise
214      */
215     ///CLOVER:OFF
216     @Override
equals(Object other)217     public boolean equals(Object other)
218     {
219         boolean result = super.equals(other);
220         if (result && other instanceof CharTrie) {
221             CharTrie othertrie = (CharTrie)other;
222             return m_initialValue_ == othertrie.m_initialValue_;
223         }
224         return false;
225     }
226 
227     @Override
hashCode()228     public int hashCode() {
229         assert false : "hashCode not designed";
230         return 42;
231     }
232     ///CLOVER:ON
233 
234     // protected methods -----------------------------------------------
235 
236     /**
237      * <p>Parses the byte buffer and stores its trie content into a index and
238      * data array</p>
239      * @param bytes buffer containing trie data
240      */
241     @Override
unserialize(ByteBuffer bytes)242     protected final void unserialize(ByteBuffer bytes)
243     {
244         int indexDataLength = m_dataOffset_ + m_dataLength_;
245         m_index_ = ICUBinary.getChars(bytes, indexDataLength, 0);
246         m_data_           = m_index_;
247         m_initialValue_   = m_data_[m_dataOffset_];
248     }
249 
250     /**
251     * Gets the offset to the data which the surrogate pair points to.
252     * @param lead lead surrogate
253     * @param trail trailing surrogate
254     * @return offset to data
255     */
256     @Override
getSurrogateOffset(char lead, char trail)257     protected final int getSurrogateOffset(char lead, char trail)
258     {
259         if (m_dataManipulate_ == null) {
260             throw new NullPointerException(
261                              "The field DataManipulate in this Trie is null");
262         }
263 
264         // get fold position for the next trail surrogate
265         int offset = m_dataManipulate_.getFoldingOffset(getLeadValue(lead));
266 
267         // get the real data from the folded lead/trail units
268         if (offset > 0) {
269             return getRawOffset(offset, (char)(trail & SURROGATE_MASK_));
270         }
271 
272         // return -1 if there is an error, in this case we return the default
273         // value: m_initialValue_
274         return -1;
275     }
276 
277     /**
278     * Gets the value at the argument index.
279     * For use internally in TrieIterator.
280     * @param index value at index will be retrieved
281     * @return 32 bit value
282     * @see com.ibm.icu.impl.TrieIterator
283     */
284     @Override
getValue(int index)285     protected final int getValue(int index)
286     {
287         return m_data_[index];
288     }
289 
290     /**
291     * Gets the default initial value
292     * @return 32 bit value
293     */
294     @Override
getInitialValue()295     protected final int getInitialValue()
296     {
297         return m_initialValue_;
298     }
299 
300     // private data members --------------------------------------------
301 
302     /**
303     * Default value
304     */
305     private char m_initialValue_;
306     /**
307     * Array of char data
308     */
309     private char m_data_[];
310 }
311