1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2013-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * collationdatareader.h 9 * 10 * created on: 2013feb07 11 * created by: Markus W. Scherer 12 */ 13 14 #ifndef __COLLATIONDATAREADER_H__ 15 #define __COLLATIONDATAREADER_H__ 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_COLLATION 20 21 #include "unicode/udata.h" 22 23 struct UDataMemory; 24 25 U_NAMESPACE_BEGIN 26 27 struct CollationTailoring; 28 29 /** 30 * Collation binary data reader. 31 */ 32 struct U_I18N_API CollationDataReader /* all static */ { 33 // The following constants are also copied into source/common/ucol_swp.cpp. 34 // Keep them in sync! 35 enum { 36 /** 37 * Number of int32_t indexes. 38 * 39 * Can be 2 if there are only options. 40 * Can be 7 or 8 if there are only options and a script reordering. 41 * The loader treats any index>=indexes[IX_INDEXES_LENGTH] as 0. 42 */ 43 IX_INDEXES_LENGTH, // 0 44 /** 45 * Bits 31..24: numericPrimary, for numeric collation 46 * 23..16: fast Latin format version (0 = no fast Latin table) 47 * 15.. 0: options bit set 48 */ 49 IX_OPTIONS, 50 IX_RESERVED2, 51 IX_RESERVED3, 52 53 /** Array offset to Jamo CE32s in ce32s[], or <0 if none. */ 54 IX_JAMO_CE32S_START, // 4 55 56 // Byte offsets from the start of the data, after the generic header. 57 // The indexes[] are at byte offset 0, other data follows. 58 // Each data item is aligned properly. 59 // The data items should be in descending order of unit size, 60 // to minimize the need for padding. 61 // Each item's byte length is given by the difference between its offset and 62 // the next index/offset value. 63 /** Byte offset to int32_t reorderCodes[]. 
*/ 64 IX_REORDER_CODES_OFFSET, 65 /** 66 * Byte offset to uint8_t reorderTable[]. 67 * Empty table if <256 bytes (padding only). 68 * Otherwise 256 bytes or more (with padding). 69 */ 70 IX_REORDER_TABLE_OFFSET, 71 /** Byte offset to the collation trie. Its length is a multiple of 8 bytes. */ 72 IX_TRIE_OFFSET, 73 74 IX_RESERVED8_OFFSET, // 8 75 /** Byte offset to int64_t ces[]. */ 76 IX_CES_OFFSET, 77 IX_RESERVED10_OFFSET, 78 /** Byte offset to uint32_t ce32s[]. */ 79 IX_CE32S_OFFSET, 80 81 /** Byte offset to uint32_t rootElements[]. */ 82 IX_ROOT_ELEMENTS_OFFSET, // 12 83 /** Byte offset to UChar *contexts[]. */ 84 IX_CONTEXTS_OFFSET, 85 /** Byte offset to uint16_t [] with serialized unsafeBackwardSet. */ 86 IX_UNSAFE_BWD_OFFSET, 87 /** Byte offset to uint16_t fastLatinTable[]. */ 88 IX_FAST_LATIN_TABLE_OFFSET, 89 90 /** Byte offset to uint16_t scripts[]. */ 91 IX_SCRIPTS_OFFSET, // 16 92 /** 93 * Byte offset to UBool compressibleBytes[]. 94 * Empty table if <256 bytes (padding only). 95 * Otherwise 256 bytes or more (with padding). 96 */ 97 IX_COMPRESSIBLE_BYTES_OFFSET, 98 IX_RESERVED18_OFFSET, 99 IX_TOTAL_SIZE 100 }; 101 102 static void read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength, 103 CollationTailoring &tailoring, UErrorCode &errorCode); 104 105 static UBool U_CALLCONV 106 isAcceptable(void *context, const char *type, const char *name, const UDataInfo *pInfo); 107 108 private: 109 CollationDataReader(); // no constructor 110 }; 111 112 /* 113 * Format of collation data (ucadata.icu, binary data in coll/ *.res files). 114 * Format version 5. 115 * 116 * The root collation data is stored in the ucadata.icu file. 117 * Tailorings are stored inside .res resource bundle files, with a complete file header. 118 * 119 * Collation data begins with a standard ICU data file header 120 * (DataHeader, see ucmndata.h and unicode/udata.h). 
121 * The UDataInfo.dataVersion field contains the UCA and other version numbers, 122 * see the comments for CollationTailoring.version. 123 * 124 * After the header, the file contains the following parts. 125 * Constants are defined as enum values of the CollationDataReader class. 126 * See also the Collation class. 127 * 128 * int32_t indexes[indexesLength]; 129 * The indexes array has variable length. 130 * Some tailorings only need the length and the options, 131 * others only add reorderCodes and the reorderTable, 132 * some need to store mappings. 133 * Only as many indexes are stored as needed to read all of the data. 134 * 135 * Index 0: indexesLength 136 * Index 1: numericPrimary, CollationFastLatin::VERSION, and options: see IX_OPTIONS 137 * Index 2..3: Unused/reserved/0. 138 * Index 4: Index into the ce32s array where the CE32s of the conjoining Jamo 139 * are stored in a short, contiguous part of the ce32s array. 140 * 141 * Indexes 5..19 are byte offsets in ascending order. 142 * Each byte offset marks the start of the next part in the data file, 143 * and the end of the previous one. 144 * When two consecutive byte offsets are the same (or too short), 145 * then the corresponding part is empty. 146 * Byte offsets are offsets from after the header, 147 * that is, from the beginning of the indexes[]. 148 * Each part starts at an offset with proper alignment for its data. 149 * If necessary, the previous part may include padding bytes to achieve this alignment. 150 * The last byte offset that is stored in the indexes indicates the total size of the data 151 * (starting with the indexes). 152 * 153 * int32_t reorderCodes[]; -- empty in root 154 * The list of script and reordering codes. 155 * 156 * Beginning with format version 5, this array may optionally 157 * have trailing entries with a full list of reorder ranges 158 * as described for CollationSettings::reorderRanges. 159 * 160 * Script or reorder codes are first and do not exceed 16-bit values. 
161 * Range limits are stored in the upper 16 bits, and are never 0. 162 * Split this array into reorder codes and ranges at the first entry 163 * with non-zero upper 16 bits. 164 * 165 * If the ranges are missing but needed for split-reordered primary lead bytes, 166 * then they are regenerated at load time. 167 * 168 * uint8_t reorderTable[256]; -- empty in root; can be longer to include padding bytes 169 * Primary-weight lead byte permutation table. 170 * Normally present when the reorderCodes are, but can be built at load time. 171 * 172 * Beginning with format version 5, a 0 entry at a non-zero index 173 * (which is otherwise an illegal value) 174 * means that the primary lead byte is "split" 175 * (there are different offsets for primaries that share that lead byte) 176 * and the reordering offset must be determined via the reorder ranges 177 * that are either stored as part of the reorderCodes array 178 * or regenerated at load time. 179 * 180 * UTrie2 trie; -- see utrie2_impl.h and utrie2.h 181 * The trie holds the main collation data. Each code point is mapped to a 32-bit value. 182 * It encodes a simple collation element (CE) in compact form, unless bits 7..6 are both set, 183 * in which case it is a special CE32 and contains a 4-bit tag and further data. 184 * See the Collation class for details. 185 * 186 * The trie has a value for each lead surrogate code unit with some bits encoding 187 * collective properties of the 1024 supplementary characters whose UTF-16 form starts with 188 * the lead surrogate. See Collation::LEAD_SURROGATE_TAG. 189 * 190 * int64_t ces[]; 191 * 64-bit CEs and expansions that cannot be stored in a more compact form. 192 * 193 * uint32_t ce32s[]; 194 * CE32s for expansions in compact form, and for characters whose trie values 195 * contain special data. 196 * 197 * uint32_t rootElements[]; -- empty in all tailorings 198 * Compact storage for all of the CEs that occur in the root collation. 
199 * See the CollationRootElements class. 200 * 201 * UChar *contexts[]; 202 * Serialized UCharsTrie structures with prefix (pre-context) and contraction mappings. 203 * 204 * uint16_t unsafeBackwardSet[]; -- see UnicodeSet::serialize() 205 * Serialized form of characters that are unsafe when iterating backwards, 206 * and at the end of an identical string prefix. 207 * Back up to a safe character. 208 * Lead surrogates are "unsafe" when any of their corresponding supplementary 209 * code points are unsafe. 210 * Does not include [:^lccc=0:][:^tccc=0:]. 211 * For each tailoring, the root unsafeBackwardSet is subtracted. 212 * (As a result, in many tailorings no set needs to be stored.) 213 * 214 * uint16_t fastLatinTable[]; 215 * Optional optimization for Latin text. 216 * See the CollationFastLatin class. 217 * 218 * uint16_t scripts[]; -- empty in all tailorings 219 * Format version 5: 220 * uint16_t numScripts; 221 * uint16_t scriptsIndex[numScripts+16]; 222 * uint16_t scriptStarts[]; 223 * See CollationData::numScripts etc. 224 * 225 * Format version 4: 226 * Table of the reordering groups with their first and last lead bytes, 227 * and their script and reordering codes. 228 * See CollationData::scripts. 229 * 230 * UBool compressibleBytes[]; -- empty in all tailorings 231 * Flag for getSortKey(), indicating primary weight lead bytes that are compressible. 232 * 233 * ----------------- 234 * Changes for formatVersion 5 (ICU 55) 235 * 236 * Reordering moves single scripts, not groups of scripts. 237 * Reorder ranges are optionally appended to the reorderCodes, 238 * and a 0 entry in the reorderTable indicates a split lead byte. 239 * The scripts data has a new format. 240 * 241 * The rootElements may contain secondary and tertiary weights below common=05. 242 * (Used for small Hiragana letters.) 243 * Where it occurs, there is also an explicit unit with common secondary & tertiary weights. 
244 * There are no other data structure changes, but builder code needs to be able to handle such data. 245 * 246 * The collation element for the merge separator code point U+FFFE 247 * does not necessarily have special, unique secondary/tertiary weights any more. 248 */ 249 250 U_NAMESPACE_END 251 252 #endif // !UCONFIG_NO_COLLATION 253 #endif // __COLLATIONDATAREADER_H__ 254