1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4 ******************************************************************************
5 * Copyright (C) 1996-2010, International Business Machines Corporation and   *
6 * others. All Rights Reserved.                                               *
7 ******************************************************************************
8 */
9 
10 package com.ibm.icu.impl;
11 
12 import java.util.Arrays;
13 
14 import com.ibm.icu.lang.UCharacter;
15 
16 /**
17  * Builder class to manipulate and generate a trie.
18  * This is useful for ICU data in primitive types.
19  * Provides a compact way to store information that is indexed by Unicode
20  * values, such as character properties, types, keyboard values, etc. This is
21  * very useful when you have a block of Unicode data that contains significant
22  * values while the rest of the Unicode data is unused in the application or
23  * when you have a lot of redundance, such as where all 21,000 Han ideographs
24  * have the same value.  However, lookup is much faster than a hash table.
25  * A trie of any primitive data type serves two purposes:
26  * <UL type = round>
27  *     <LI>Fast access of the indexed values.
28  *     <LI>Smaller memory footprint.
29  * </UL>
30  * This is a direct port from the ICU4C version
31  * @author             Syn Wee Quek
32  */
33 public class TrieBuilder
34 {
35     // public data member ----------------------------------------------
36 
37     /**
38      * Number of data values in a stage 2 (data array) block. 2, 4, 8, ..,
39      * 0x200
40      */
41     public static final int DATA_BLOCK_LENGTH = 1 << Trie.INDEX_STAGE_1_SHIFT_;
42 
43     // public class declaration ----------------------------------------
44 
45     /**
46      * Character data in com.ibm.impl.Trie have different user-specified format
47      * for different purposes.
48      * This interface specifies methods to be implemented in order for
49      * com.ibm.impl.Trie, to surrogate offset information encapsulated within
50      * the data.
51      */
52     public static interface DataManipulate
53     {
54         /**
55          * Build-time trie callback function, used with serialize().
56          * This function calculates a lead surrogate's value including a
57          * folding offset from the 1024 supplementary code points
58          * [start..start+1024[ .
59          * It is U+10000 <= start <= U+10fc00 and (start&0x3ff)==0.
60          * The folding offset is provided by the caller.
61          * It is offset=UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT
62          * with n=0..1023.
63          * Instead of the offset itself, n can be stored in 10 bits - or fewer
64          * if it can be assumed that few lead surrogates have associated data.
65          * The returned value must be
66          *  - not zero if and only if there is relevant data for the
67          *                        corresponding 1024 supplementary code points
68          *  - such that UTrie.getFoldingOffset(UNewTrieGetFoldedValue(...,
69          *                                                    offset))==offset
70          * @return a folded value, or 0 if there is no relevant data for the
71          *         lead surrogate.
72          */
getFoldedValue(int start, int offset)73         public int getFoldedValue(int start, int offset);
74     }
75 
76     // public methods ----------------------------------------------------
77 
78     /**
79      * Checks if the character belongs to a zero block in the trie
80      * @param ch codepoint which data is to be retrieved
81      * @return true if ch is in the zero block
82      */
isInZeroBlock(int ch)83     public boolean isInZeroBlock(int ch)
84     {
85         // valid, uncompacted trie and valid c?
86         if (m_isCompacted_ || ch > UCharacter.MAX_VALUE
87             || ch < UCharacter.MIN_VALUE) {
88             return true;
89         }
90 
91         return m_index_[ch >> SHIFT_] == 0;
92     }
93 
94     // package private method -----------------------------------------------
95 
96     // protected data member -----------------------------------------------
97 
98     /**
99      * Index values at build-time are 32 bits wide for easier processing.
100      * Bit 31 is set if the data block is used by multiple index values
101      * (from setRange()).
102      */
103     protected int m_index_[];
104     protected int m_indexLength_;
105     protected int m_dataCapacity_;
106     protected int m_dataLength_;
107     protected boolean m_isLatin1Linear_;
108     protected boolean m_isCompacted_;
109     /**
110      * Map of adjusted indexes, used in utrie_compact().
111      * Maps from original indexes to new ones.
112      */
113     protected int m_map_[];
114 
115     /**
116      * Shift size for shifting right the input index. 1..9
117      */
118     protected static final int SHIFT_ = Trie.INDEX_STAGE_1_SHIFT_;
119     /**
120      * Length of the index (stage 1) array before folding.
121      * Maximum number of Unicode code points (0x110000) shifted right by
122      * SHIFT.
123      */
124     protected static final int MAX_INDEX_LENGTH_ = (0x110000 >> SHIFT_);
125     /**
126      * Length of the BMP portion of the index (stage 1) array.
127      */
128     protected static final int BMP_INDEX_LENGTH_ = 0x10000 >> SHIFT_;
129     /**
130      * Number of index (stage 1) entries per lead surrogate.
131      * Same as number of indexe entries for 1024 trail surrogates,
132      * ==0x400>>UTRIE_SHIFT
133      * 10 - SHIFT == Number of bits of a trail surrogate that are used in
134      *               index table lookups.
135      */
136     protected static final int SURROGATE_BLOCK_COUNT_ = 1 << (10 - SHIFT_);
137     /**
138      * Mask for getting the lower bits from the input index.
139      * DATA_BLOCK_LENGTH - 1.
140      */
141     protected static final int MASK_ = Trie.INDEX_STAGE_3_MASK_;
142     /**
143      * Shift size for shifting left the index array values.
144      * Increases possible data size with 16-bit index values at the cost
145      * of compactability.
146      * This requires blocks of stage 2 data to be aligned by UTRIE_DATA_GRANULARITY.
147      * 0..UTRIE_SHIFT
148      */
149     protected static final int INDEX_SHIFT_ = Trie.INDEX_STAGE_2_SHIFT_;
150     /**
151      * Maximum length of the runtime data (stage 2) array.
152      * Limited by 16-bit index values that are left-shifted by INDEX_SHIFT_.
153      */
154     protected static final int MAX_DATA_LENGTH_ = (0x10000 << INDEX_SHIFT_);
155     /**
156      * Shifting to position the index value in options
157      */
158     protected static final int OPTIONS_INDEX_SHIFT_ = 4;
159     /**
160      * If set, then the data (stage 2) array is 32 bits wide.
161      */
162     protected static final int OPTIONS_DATA_IS_32_BIT_ = 0x100;
163     /**
164      * If set, then Latin-1 data (for U+0000..U+00ff) is stored in the data
165      * (stage 2) array as a simple, linear array at data + DATA_BLOCK_LENGTH.
166      */
167     protected static final int OPTIONS_LATIN1_IS_LINEAR_ = 0x200;
168     /**
169      * The alignment size of a stage 2 data block. Also the granularity for
170      * compaction.
171      */
172     protected static final int DATA_GRANULARITY_ = 1 << INDEX_SHIFT_;
173 
174     // protected constructor ----------------------------------------------
175 
TrieBuilder()176     protected TrieBuilder()
177     {
178         m_index_ = new int[MAX_INDEX_LENGTH_];
179         m_map_ = new int[MAX_BUILD_TIME_DATA_LENGTH_ >> SHIFT_];
180         m_isLatin1Linear_ = false;
181         m_isCompacted_ = false;
182         m_indexLength_ = MAX_INDEX_LENGTH_;
183     }
184 
TrieBuilder(TrieBuilder table)185     protected TrieBuilder(TrieBuilder table)
186     {
187         m_index_ = new int[MAX_INDEX_LENGTH_];
188         m_indexLength_ = table.m_indexLength_;
189         System.arraycopy(table.m_index_, 0, m_index_, 0, m_indexLength_);
190         m_dataCapacity_ = table.m_dataCapacity_;
191         m_dataLength_ = table.m_dataLength_;
192         m_map_ = new int[table.m_map_.length];
193         System.arraycopy(table.m_map_, 0, m_map_, 0, m_map_.length);
194         m_isLatin1Linear_ = table.m_isLatin1Linear_;
195         m_isCompacted_ = table.m_isCompacted_;
196     }
197 
198     // protected functions ------------------------------------------------
199 
200     /**
201      * Compare two sections of an array for equality.
202      */
equal_int(int[] array, int start1, int start2, int length)203     protected static final boolean equal_int(int[] array, int start1, int start2, int length) {
204         while(length>0 && array[start1]==array[start2]) {
205             ++start1;
206             ++start2;
207             --length;
208         }
209         return length==0;
210     }
211 
212     /**
213      * Set a value in the trie index map to indicate which data block
214      * is referenced and which one is not.
215      * utrie_compact() will remove data blocks that are not used at all.
216      * Set
217      * - 0 if it is used
218      * - -1 if it is not used
219      */
findUnusedBlocks()220     protected void findUnusedBlocks()
221     {
222         // fill the entire map with "not used"
223         Arrays.fill(m_map_, 0xff);
224 
225         // mark each block that _is_ used with 0
226         for (int i = 0; i < m_indexLength_; ++ i) {
227             m_map_[Math.abs(m_index_[i]) >> SHIFT_] = 0;
228         }
229 
230         // never move the all-initial-value block 0
231         m_map_[0] = 0;
232     }
233 
234     /**
235      * Finds the same index block as the otherBlock
236      * @param index array
237      * @param indexLength size of index
238      * @param otherBlock
239      * @return same index block
240      */
findSameIndexBlock(int index[], int indexLength, int otherBlock)241     protected static final int findSameIndexBlock(int index[], int indexLength,
242                                                   int otherBlock)
243     {
244         for (int block = BMP_INDEX_LENGTH_; block < indexLength;
245              block += SURROGATE_BLOCK_COUNT_) {
246             if(equal_int(index, block, otherBlock, SURROGATE_BLOCK_COUNT_)) {
247                 return block;
248             }
249         }
250         return indexLength;
251     }
252 
253     // private data member ------------------------------------------------
254 
255     /**
256      * Maximum length of the build-time data (stage 2) array.
257      * The maximum length is 0x110000 + DATA_BLOCK_LENGTH + 0x400.
258      * (Number of Unicode code points + one all-initial-value block +
259      *  possible duplicate entries for 1024 lead surrogates.)
260      */
261     private static final int MAX_BUILD_TIME_DATA_LENGTH_ =
262         0x110000 + DATA_BLOCK_LENGTH + 0x400;
263 }
264