1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ****************************************************************************** 5 * Copyright (C) 1996-2010, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ****************************************************************************** 8 */ 9 10 package com.ibm.icu.impl; 11 12 import java.util.Arrays; 13 14 import com.ibm.icu.lang.UCharacter; 15 16 /** 17 * Builder class to manipulate and generate a trie. 18 * This is useful for ICU data in primitive types. 19 * Provides a compact way to store information that is indexed by Unicode 20 * values, such as character properties, types, keyboard values, etc. This is 21 * very useful when you have a block of Unicode data that contains significant 22 * values while the rest of the Unicode data is unused in the application or 23 * when you have a lot of redundance, such as where all 21,000 Han ideographs 24 * have the same value. However, lookup is much faster than a hash table. 25 * A trie of any primitive data type serves two purposes: 26 * <UL type = round> 27 * <LI>Fast access of the indexed values. 28 * <LI>Smaller memory footprint. 29 * </UL> 30 * This is a direct port from the ICU4C version 31 * @author Syn Wee Quek 32 */ 33 public class TrieBuilder 34 { 35 // public data member ---------------------------------------------- 36 37 /** 38 * Number of data values in a stage 2 (data array) block. 2, 4, 8, .., 39 * 0x200 40 */ 41 public static final int DATA_BLOCK_LENGTH = 1 << Trie.INDEX_STAGE_1_SHIFT_; 42 43 // public class declaration ---------------------------------------- 44 45 /** 46 * Character data in com.ibm.impl.Trie have different user-specified format 47 * for different purposes. 48 * This interface specifies methods to be implemented in order for 49 * com.ibm.impl.Trie, to surrogate offset information encapsulated within 50 * the data. 51 */ 52 public static interface DataManipulate 53 { 54 /** 55 * Build-time trie callback function, used with serialize(). 56 * This function calculates a lead surrogate's value including a 57 * folding offset from the 1024 supplementary code points 58 * [start..start+1024[ . 59 * It is U+10000 <= start <= U+10fc00 and (start&0x3ff)==0. 60 * The folding offset is provided by the caller. 61 * It is offset=UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT 62 * with n=0..1023. 63 * Instead of the offset itself, n can be stored in 10 bits - or fewer 64 * if it can be assumed that few lead surrogates have associated data. 65 * The returned value must be 66 * - not zero if and only if there is relevant data for the 67 * corresponding 1024 supplementary code points 68 * - such that UTrie.getFoldingOffset(UNewTrieGetFoldedValue(..., 69 * offset))==offset 70 * @return a folded value, or 0 if there is no relevant data for the 71 * lead surrogate. 72 */ getFoldedValue(int start, int offset)73 public int getFoldedValue(int start, int offset); 74 } 75 76 // public methods ---------------------------------------------------- 77 78 /** 79 * Checks if the character belongs to a zero block in the trie 80 * @param ch codepoint which data is to be retrieved 81 * @return true if ch is in the zero block 82 */ isInZeroBlock(int ch)83 public boolean isInZeroBlock(int ch) 84 { 85 // valid, uncompacted trie and valid c? 86 if (m_isCompacted_ || ch > UCharacter.MAX_VALUE 87 || ch < UCharacter.MIN_VALUE) { 88 return true; 89 } 90 91 return m_index_[ch >> SHIFT_] == 0; 92 } 93 94 // package private method ----------------------------------------------- 95 96 // protected data member ----------------------------------------------- 97 98 /** 99 * Index values at build-time are 32 bits wide for easier processing. 100 * Bit 31 is set if the data block is used by multiple index values 101 * (from setRange()). 102 */ 103 protected int m_index_[]; 104 protected int m_indexLength_; 105 protected int m_dataCapacity_; 106 protected int m_dataLength_; 107 protected boolean m_isLatin1Linear_; 108 protected boolean m_isCompacted_; 109 /** 110 * Map of adjusted indexes, used in utrie_compact(). 111 * Maps from original indexes to new ones. 112 */ 113 protected int m_map_[]; 114 115 /** 116 * Shift size for shifting right the input index. 1..9 117 */ 118 protected static final int SHIFT_ = Trie.INDEX_STAGE_1_SHIFT_; 119 /** 120 * Length of the index (stage 1) array before folding. 121 * Maximum number of Unicode code points (0x110000) shifted right by 122 * SHIFT. 123 */ 124 protected static final int MAX_INDEX_LENGTH_ = (0x110000 >> SHIFT_); 125 /** 126 * Length of the BMP portion of the index (stage 1) array. 127 */ 128 protected static final int BMP_INDEX_LENGTH_ = 0x10000 >> SHIFT_; 129 /** 130 * Number of index (stage 1) entries per lead surrogate. 131 * Same as number of indexe entries for 1024 trail surrogates, 132 * ==0x400>>UTRIE_SHIFT 133 * 10 - SHIFT == Number of bits of a trail surrogate that are used in 134 * index table lookups. 135 */ 136 protected static final int SURROGATE_BLOCK_COUNT_ = 1 << (10 - SHIFT_); 137 /** 138 * Mask for getting the lower bits from the input index. 139 * DATA_BLOCK_LENGTH - 1. 140 */ 141 protected static final int MASK_ = Trie.INDEX_STAGE_3_MASK_; 142 /** 143 * Shift size for shifting left the index array values. 144 * Increases possible data size with 16-bit index values at the cost 145 * of compactability. 146 * This requires blocks of stage 2 data to be aligned by UTRIE_DATA_GRANULARITY. 147 * 0..UTRIE_SHIFT 148 */ 149 protected static final int INDEX_SHIFT_ = Trie.INDEX_STAGE_2_SHIFT_; 150 /** 151 * Maximum length of the runtime data (stage 2) array. 152 * Limited by 16-bit index values that are left-shifted by INDEX_SHIFT_. 153 */ 154 protected static final int MAX_DATA_LENGTH_ = (0x10000 << INDEX_SHIFT_); 155 /** 156 * Shifting to position the index value in options 157 */ 158 protected static final int OPTIONS_INDEX_SHIFT_ = 4; 159 /** 160 * If set, then the data (stage 2) array is 32 bits wide. 161 */ 162 protected static final int OPTIONS_DATA_IS_32_BIT_ = 0x100; 163 /** 164 * If set, then Latin-1 data (for U+0000..U+00ff) is stored in the data 165 * (stage 2) array as a simple, linear array at data + DATA_BLOCK_LENGTH. 166 */ 167 protected static final int OPTIONS_LATIN1_IS_LINEAR_ = 0x200; 168 /** 169 * The alignment size of a stage 2 data block. Also the granularity for 170 * compaction. 171 */ 172 protected static final int DATA_GRANULARITY_ = 1 << INDEX_SHIFT_; 173 174 // protected constructor ---------------------------------------------- 175 TrieBuilder()176 protected TrieBuilder() 177 { 178 m_index_ = new int[MAX_INDEX_LENGTH_]; 179 m_map_ = new int[MAX_BUILD_TIME_DATA_LENGTH_ >> SHIFT_]; 180 m_isLatin1Linear_ = false; 181 m_isCompacted_ = false; 182 m_indexLength_ = MAX_INDEX_LENGTH_; 183 } 184 TrieBuilder(TrieBuilder table)185 protected TrieBuilder(TrieBuilder table) 186 { 187 m_index_ = new int[MAX_INDEX_LENGTH_]; 188 m_indexLength_ = table.m_indexLength_; 189 System.arraycopy(table.m_index_, 0, m_index_, 0, m_indexLength_); 190 m_dataCapacity_ = table.m_dataCapacity_; 191 m_dataLength_ = table.m_dataLength_; 192 m_map_ = new int[table.m_map_.length]; 193 System.arraycopy(table.m_map_, 0, m_map_, 0, m_map_.length); 194 m_isLatin1Linear_ = table.m_isLatin1Linear_; 195 m_isCompacted_ = table.m_isCompacted_; 196 } 197 198 // protected functions ------------------------------------------------ 199 200 /** 201 * Compare two sections of an array for equality. 202 */ equal_int(int[] array, int start1, int start2, int length)203 protected static final boolean equal_int(int[] array, int start1, int start2, int length) { 204 while(length>0 && array[start1]==array[start2]) { 205 ++start1; 206 ++start2; 207 --length; 208 } 209 return length==0; 210 } 211 212 /** 213 * Set a value in the trie index map to indicate which data block 214 * is referenced and which one is not. 215 * utrie_compact() will remove data blocks that are not used at all. 216 * Set 217 * - 0 if it is used 218 * - -1 if it is not used 219 */ findUnusedBlocks()220 protected void findUnusedBlocks() 221 { 222 // fill the entire map with "not used" 223 Arrays.fill(m_map_, 0xff); 224 225 // mark each block that _is_ used with 0 226 for (int i = 0; i < m_indexLength_; ++ i) { 227 m_map_[Math.abs(m_index_[i]) >> SHIFT_] = 0; 228 } 229 230 // never move the all-initial-value block 0 231 m_map_[0] = 0; 232 } 233 234 /** 235 * Finds the same index block as the otherBlock 236 * @param index array 237 * @param indexLength size of index 238 * @param otherBlock 239 * @return same index block 240 */ findSameIndexBlock(int index[], int indexLength, int otherBlock)241 protected static final int findSameIndexBlock(int index[], int indexLength, 242 int otherBlock) 243 { 244 for (int block = BMP_INDEX_LENGTH_; block < indexLength; 245 block += SURROGATE_BLOCK_COUNT_) { 246 if(equal_int(index, block, otherBlock, SURROGATE_BLOCK_COUNT_)) { 247 return block; 248 } 249 } 250 return indexLength; 251 } 252 253 // private data member ------------------------------------------------ 254 255 /** 256 * Maximum length of the build-time data (stage 2) array. 257 * The maximum length is 0x110000 + DATA_BLOCK_LENGTH + 0x400. 258 * (Number of Unicode code points + one all-initial-value block + 259 * possible duplicate entries for 1024 lead surrogates.) 260 */ 261 private static final int MAX_BUILD_TIME_DATA_LENGTH_ = 262 0x110000 + DATA_BLOCK_LENGTH + 0x400; 263 } 264