1 /*
2 *******************************************************************************
3 * Copyright (C) 2013-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationdatareader.cpp
7 *
8 * created on: 2013feb07
9 * created by: Markus W. Scherer
10 */
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_COLLATION
15
16 #include "unicode/ucol.h"
17 #include "unicode/udata.h"
18 #include "unicode/uscript.h"
19 #include "cmemory.h"
20 #include "collation.h"
21 #include "collationdata.h"
22 #include "collationdatareader.h"
23 #include "collationfastlatin.h"
24 #include "collationkeys.h"
25 #include "collationrootelements.h"
26 #include "collationsettings.h"
27 #include "collationtailoring.h"
28 #include "normalizer2impl.h"
29 #include "uassert.h"
30 #include "ucmndata.h"
31 #include "utrie2.h"
32
33 U_NAMESPACE_BEGIN
34
35 namespace {
36
// Returns indexes[i] when slot i exists (0 <= i < length), otherwise -1.
//
// indexes: the indexes[] array at the start of the collation data.
// length:  number of valid slots in indexes[].
// i:       slot to read.
//
// read() computes each part's byte length as
// getIndex(index + 1) - getIndex(index), so two missing slots yield
// (-1) - (-1) == 0, which read() treats as "part not present".
// A negative slot number is handled like a missing slot instead of
// reading in front of the array.
int32_t getIndex(const int32_t *indexes, int32_t length, int32_t i) {
    return (0 <= i && i < length) ? indexes[i] : -1;
}
40
41 } // namespace
42
43 void
read(const CollationTailoring * base,const uint8_t * inBytes,int32_t inLength,CollationTailoring & tailoring,UErrorCode & errorCode)44 CollationDataReader::read(const CollationTailoring *base, const uint8_t *inBytes, int32_t inLength,
45 CollationTailoring &tailoring, UErrorCode &errorCode) {
46 if(U_FAILURE(errorCode)) { return; }
47 if(base != NULL) {
48 if(inBytes == NULL || (0 <= inLength && inLength < 24)) {
49 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
50 return;
51 }
52 const DataHeader *header = reinterpret_cast<const DataHeader *>(inBytes);
53 if(!(header->dataHeader.magic1 == 0xda && header->dataHeader.magic2 == 0x27 &&
54 isAcceptable(tailoring.version, NULL, NULL, &header->info))) {
55 errorCode = U_INVALID_FORMAT_ERROR;
56 return;
57 }
58 if(base->getUCAVersion() != tailoring.getUCAVersion()) {
59 errorCode = U_COLLATOR_VERSION_MISMATCH;
60 return;
61 }
62 int32_t headerLength = header->dataHeader.headerSize;
63 inBytes += headerLength;
64 if(inLength >= 0) {
65 inLength -= headerLength;
66 }
67 }
68
69 if(inBytes == NULL || (0 <= inLength && inLength < 8)) {
70 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
71 return;
72 }
73 const int32_t *inIndexes = reinterpret_cast<const int32_t *>(inBytes);
74 int32_t indexesLength = inIndexes[IX_INDEXES_LENGTH];
75 if(indexesLength < 2 || (0 <= inLength && inLength < indexesLength * 4)) {
76 errorCode = U_INVALID_FORMAT_ERROR; // Not enough indexes.
77 return;
78 }
79
80 // Assume that the tailoring data is in initial state,
81 // with NULL pointers and 0 lengths.
82
83 // Set pointers to non-empty data parts.
84 // Do this in order of their byte offsets. (Should help porting to Java.)
85
86 int32_t index; // one of the indexes[] slots
87 int32_t offset; // byte offset for the index part
88 int32_t length; // number of bytes in the index part
89
90 if(indexesLength > IX_TOTAL_SIZE) {
91 length = inIndexes[IX_TOTAL_SIZE];
92 } else if(indexesLength > IX_REORDER_CODES_OFFSET) {
93 length = inIndexes[indexesLength - 1];
94 } else {
95 length = 0; // only indexes, and inLength was already checked for them
96 }
97 if(0 <= inLength && inLength < length) {
98 errorCode = U_INVALID_FORMAT_ERROR;
99 return;
100 }
101
102 const CollationData *baseData = base == NULL ? NULL : base->data;
103 const int32_t *reorderCodes = NULL;
104 int32_t reorderCodesLength = 0;
105 const uint32_t *reorderRanges = NULL;
106 int32_t reorderRangesLength = 0;
107 index = IX_REORDER_CODES_OFFSET;
108 offset = getIndex(inIndexes, indexesLength, index);
109 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
110 if(length >= 4) {
111 if(baseData == NULL) {
112 // We assume for collation settings that
113 // the base data does not have a reordering.
114 errorCode = U_INVALID_FORMAT_ERROR;
115 return;
116 }
117 reorderCodes = reinterpret_cast<const int32_t *>(inBytes + offset);
118 reorderCodesLength = length / 4;
119
120 // The reorderRanges (if any) are the trailing reorderCodes entries.
121 // Split the array at the boundary.
122 // Script or reorder codes do not exceed 16-bit values.
123 // Range limits are stored in the upper 16 bits, and are never 0.
124 while(reorderRangesLength < reorderCodesLength &&
125 (reorderCodes[reorderCodesLength - reorderRangesLength - 1] & 0xffff0000) != 0) {
126 ++reorderRangesLength;
127 }
128 U_ASSERT(reorderRangesLength < reorderCodesLength);
129 if(reorderRangesLength != 0) {
130 reorderCodesLength -= reorderRangesLength;
131 reorderRanges = reinterpret_cast<const uint32_t *>(reorderCodes + reorderCodesLength);
132 }
133 }
134
135 // There should be a reorder table only if there are reorder codes.
136 // However, when there are reorder codes the reorder table may be omitted to reduce
137 // the data size.
138 const uint8_t *reorderTable = NULL;
139 index = IX_REORDER_TABLE_OFFSET;
140 offset = getIndex(inIndexes, indexesLength, index);
141 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
142 if(length >= 256) {
143 if(reorderCodesLength == 0) {
144 errorCode = U_INVALID_FORMAT_ERROR; // Reordering table without reordering codes.
145 return;
146 }
147 reorderTable = inBytes + offset;
148 } else {
149 // If we have reorder codes, then build the reorderTable at the end,
150 // when the CollationData is otherwise complete.
151 }
152
153 if(baseData != NULL && baseData->numericPrimary != (inIndexes[IX_OPTIONS] & 0xff000000)) {
154 errorCode = U_INVALID_FORMAT_ERROR;
155 return;
156 }
157 CollationData *data = NULL; // Remains NULL if there are no mappings.
158
159 index = IX_TRIE_OFFSET;
160 offset = getIndex(inIndexes, indexesLength, index);
161 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
162 if(length >= 8) {
163 if(!tailoring.ensureOwnedData(errorCode)) { return; }
164 data = tailoring.ownedData;
165 data->base = baseData;
166 data->numericPrimary = inIndexes[IX_OPTIONS] & 0xff000000;
167 data->trie = tailoring.trie = utrie2_openFromSerialized(
168 UTRIE2_32_VALUE_BITS, inBytes + offset, length, NULL,
169 &errorCode);
170 if(U_FAILURE(errorCode)) { return; }
171 } else if(baseData != NULL) {
172 // Use the base data. Only the settings are tailored.
173 tailoring.data = baseData;
174 } else {
175 errorCode = U_INVALID_FORMAT_ERROR; // No mappings.
176 return;
177 }
178
179 index = IX_CES_OFFSET;
180 offset = getIndex(inIndexes, indexesLength, index);
181 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
182 if(length >= 8) {
183 if(data == NULL) {
184 errorCode = U_INVALID_FORMAT_ERROR; // Tailored ces without tailored trie.
185 return;
186 }
187 data->ces = reinterpret_cast<const int64_t *>(inBytes + offset);
188 data->cesLength = length / 8;
189 }
190
191 index = IX_CE32S_OFFSET;
192 offset = getIndex(inIndexes, indexesLength, index);
193 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
194 if(length >= 4) {
195 if(data == NULL) {
196 errorCode = U_INVALID_FORMAT_ERROR; // Tailored ce32s without tailored trie.
197 return;
198 }
199 data->ce32s = reinterpret_cast<const uint32_t *>(inBytes + offset);
200 data->ce32sLength = length / 4;
201 }
202
203 int32_t jamoCE32sStart = getIndex(inIndexes, indexesLength, IX_JAMO_CE32S_START);
204 if(jamoCE32sStart >= 0) {
205 if(data == NULL || data->ce32s == NULL) {
206 errorCode = U_INVALID_FORMAT_ERROR; // Index into non-existent ce32s[].
207 return;
208 }
209 data->jamoCE32s = data->ce32s + jamoCE32sStart;
210 } else if(data == NULL) {
211 // Nothing to do.
212 } else if(baseData != NULL) {
213 data->jamoCE32s = baseData->jamoCE32s;
214 } else {
215 errorCode = U_INVALID_FORMAT_ERROR; // No Jamo CE32s for Hangul processing.
216 return;
217 }
218
219 index = IX_ROOT_ELEMENTS_OFFSET;
220 offset = getIndex(inIndexes, indexesLength, index);
221 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
222 if(length >= 4) {
223 length /= 4;
224 if(data == NULL || length <= CollationRootElements::IX_SEC_TER_BOUNDARIES) {
225 errorCode = U_INVALID_FORMAT_ERROR;
226 return;
227 }
228 data->rootElements = reinterpret_cast<const uint32_t *>(inBytes + offset);
229 data->rootElementsLength = length;
230 uint32_t commonSecTer = data->rootElements[CollationRootElements::IX_COMMON_SEC_AND_TER_CE];
231 if(commonSecTer != Collation::COMMON_SEC_AND_TER_CE) {
232 errorCode = U_INVALID_FORMAT_ERROR;
233 return;
234 }
235 uint32_t secTerBoundaries = data->rootElements[CollationRootElements::IX_SEC_TER_BOUNDARIES];
236 if((secTerBoundaries >> 24) < CollationKeys::SEC_COMMON_HIGH) {
237 // [fixed last secondary common byte] is too low,
238 // and secondary weights would collide with compressed common secondaries.
239 errorCode = U_INVALID_FORMAT_ERROR;
240 return;
241 }
242 }
243
244 index = IX_CONTEXTS_OFFSET;
245 offset = getIndex(inIndexes, indexesLength, index);
246 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
247 if(length >= 2) {
248 if(data == NULL) {
249 errorCode = U_INVALID_FORMAT_ERROR; // Tailored contexts without tailored trie.
250 return;
251 }
252 data->contexts = reinterpret_cast<const UChar *>(inBytes + offset);
253 data->contextsLength = length / 2;
254 }
255
256 index = IX_UNSAFE_BWD_OFFSET;
257 offset = getIndex(inIndexes, indexesLength, index);
258 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
259 if(length >= 2) {
260 if(data == NULL) {
261 errorCode = U_INVALID_FORMAT_ERROR;
262 return;
263 }
264 if(baseData == NULL) {
265 // Create the unsafe-backward set for the root collator.
266 // Include all non-zero combining marks and trail surrogates.
267 // We do this at load time, rather than at build time,
268 // to simplify Unicode version bootstrapping:
269 // The root data builder only needs the new FractionalUCA.txt data,
270 // but it need not be built with a version of ICU already updated to
271 // the corresponding new Unicode Character Database.
272 //
273 // The following is an optimized version of
274 // new UnicodeSet("[[:^lccc=0:][\\udc00-\\udfff]]").
275 // It is faster and requires fewer code dependencies.
276 tailoring.unsafeBackwardSet = new UnicodeSet(0xdc00, 0xdfff); // trail surrogates
277 if(tailoring.unsafeBackwardSet == NULL) {
278 errorCode = U_MEMORY_ALLOCATION_ERROR;
279 return;
280 }
281 data->nfcImpl.addLcccChars(*tailoring.unsafeBackwardSet);
282 } else {
283 // Clone the root collator's set contents.
284 tailoring.unsafeBackwardSet = static_cast<UnicodeSet *>(
285 baseData->unsafeBackwardSet->cloneAsThawed());
286 if(tailoring.unsafeBackwardSet == NULL) {
287 errorCode = U_MEMORY_ALLOCATION_ERROR;
288 return;
289 }
290 }
291 // Add the ranges from the data file to the unsafe-backward set.
292 USerializedSet sset;
293 const uint16_t *unsafeData = reinterpret_cast<const uint16_t *>(inBytes + offset);
294 if(!uset_getSerializedSet(&sset, unsafeData, length / 2)) {
295 errorCode = U_INVALID_FORMAT_ERROR;
296 return;
297 }
298 int32_t count = uset_getSerializedRangeCount(&sset);
299 for(int32_t i = 0; i < count; ++i) {
300 UChar32 start, end;
301 uset_getSerializedRange(&sset, i, &start, &end);
302 tailoring.unsafeBackwardSet->add(start, end);
303 }
304 // Mark each lead surrogate as "unsafe"
305 // if any of its 1024 associated supplementary code points is "unsafe".
306 UChar32 c = 0x10000;
307 for(UChar lead = 0xd800; lead < 0xdc00; ++lead, c += 0x400) {
308 if(!tailoring.unsafeBackwardSet->containsNone(c, c + 0x3ff)) {
309 tailoring.unsafeBackwardSet->add(lead);
310 }
311 }
312 tailoring.unsafeBackwardSet->freeze();
313 data->unsafeBackwardSet = tailoring.unsafeBackwardSet;
314 } else if(data == NULL) {
315 // Nothing to do.
316 } else if(baseData != NULL) {
317 // No tailoring-specific data: Alias the root collator's set.
318 data->unsafeBackwardSet = baseData->unsafeBackwardSet;
319 } else {
320 errorCode = U_INVALID_FORMAT_ERROR; // No unsafeBackwardSet.
321 return;
322 }
323
324 // If the fast Latin format version is different,
325 // or the version is set to 0 for "no fast Latin table",
326 // then just always use the normal string comparison path.
327 if(data != NULL) {
328 data->fastLatinTable = NULL;
329 data->fastLatinTableLength = 0;
330 if(((inIndexes[IX_OPTIONS] >> 16) & 0xff) == CollationFastLatin::VERSION) {
331 index = IX_FAST_LATIN_TABLE_OFFSET;
332 offset = getIndex(inIndexes, indexesLength, index);
333 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
334 if(length >= 2) {
335 data->fastLatinTable = reinterpret_cast<const uint16_t *>(inBytes + offset);
336 data->fastLatinTableLength = length / 2;
337 if((*data->fastLatinTable >> 8) != CollationFastLatin::VERSION) {
338 errorCode = U_INVALID_FORMAT_ERROR; // header vs. table version mismatch
339 return;
340 }
341 } else if(baseData != NULL) {
342 data->fastLatinTable = baseData->fastLatinTable;
343 data->fastLatinTableLength = baseData->fastLatinTableLength;
344 }
345 }
346 }
347
348 index = IX_SCRIPTS_OFFSET;
349 offset = getIndex(inIndexes, indexesLength, index);
350 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
351 if(length >= 2) {
352 if(data == NULL) {
353 errorCode = U_INVALID_FORMAT_ERROR;
354 return;
355 }
356 const uint16_t *scripts = reinterpret_cast<const uint16_t *>(inBytes + offset);
357 int32_t scriptsLength = length / 2;
358 data->numScripts = scripts[0];
359 // There must be enough entries for both arrays, including more than two range starts.
360 data->scriptStartsLength = scriptsLength - (1 + data->numScripts + 16);
361 if(data->scriptStartsLength <= 2 ||
362 CollationData::MAX_NUM_SCRIPT_RANGES < data->scriptStartsLength) {
363 errorCode = U_INVALID_FORMAT_ERROR;
364 return;
365 }
366 data->scriptsIndex = scripts + 1;
367 data->scriptStarts = scripts + 1 + data->numScripts + 16;
368 if(!(data->scriptStarts[0] == 0 &&
369 data->scriptStarts[1] == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8) &&
370 data->scriptStarts[data->scriptStartsLength - 1] ==
371 (Collation::TRAIL_WEIGHT_BYTE << 8))) {
372 errorCode = U_INVALID_FORMAT_ERROR;
373 return;
374 }
375 } else if(data == NULL) {
376 // Nothing to do.
377 } else if(baseData != NULL) {
378 data->numScripts = baseData->numScripts;
379 data->scriptsIndex = baseData->scriptsIndex;
380 data->scriptStarts = baseData->scriptStarts;
381 data->scriptStartsLength = baseData->scriptStartsLength;
382 }
383
384 index = IX_COMPRESSIBLE_BYTES_OFFSET;
385 offset = getIndex(inIndexes, indexesLength, index);
386 length = getIndex(inIndexes, indexesLength, index + 1) - offset;
387 if(length >= 256) {
388 if(data == NULL) {
389 errorCode = U_INVALID_FORMAT_ERROR;
390 return;
391 }
392 data->compressibleBytes = reinterpret_cast<const UBool *>(inBytes + offset);
393 } else if(data == NULL) {
394 // Nothing to do.
395 } else if(baseData != NULL) {
396 data->compressibleBytes = baseData->compressibleBytes;
397 } else {
398 errorCode = U_INVALID_FORMAT_ERROR; // No compressibleBytes[].
399 return;
400 }
401
402 const CollationSettings &ts = *tailoring.settings;
403 int32_t options = inIndexes[IX_OPTIONS] & 0xffff;
404 uint16_t fastLatinPrimaries[CollationFastLatin::LATIN_LIMIT];
405 int32_t fastLatinOptions = CollationFastLatin::getOptions(
406 tailoring.data, ts, fastLatinPrimaries, UPRV_LENGTHOF(fastLatinPrimaries));
407 if(options == ts.options && ts.variableTop != 0 &&
408 reorderCodesLength == ts.reorderCodesLength &&
409 uprv_memcmp(reorderCodes, ts.reorderCodes, reorderCodesLength * 4) == 0 &&
410 fastLatinOptions == ts.fastLatinOptions &&
411 (fastLatinOptions < 0 ||
412 uprv_memcmp(fastLatinPrimaries, ts.fastLatinPrimaries,
413 sizeof(fastLatinPrimaries)) == 0)) {
414 return;
415 }
416
417 CollationSettings *settings = SharedObject::copyOnWrite(tailoring.settings);
418 if(settings == NULL) {
419 errorCode = U_MEMORY_ALLOCATION_ERROR;
420 return;
421 }
422 settings->options = options;
423 // Set variableTop from options and scripts data.
424 settings->variableTop = tailoring.data->getLastPrimaryForGroup(
425 UCOL_REORDER_CODE_FIRST + settings->getMaxVariable());
426 if(settings->variableTop == 0) {
427 errorCode = U_INVALID_FORMAT_ERROR;
428 return;
429 }
430
431 if(reorderCodesLength != 0) {
432 settings->aliasReordering(*baseData, reorderCodes, reorderCodesLength,
433 reorderRanges, reorderRangesLength,
434 reorderTable, errorCode);
435 }
436
437 settings->fastLatinOptions = CollationFastLatin::getOptions(
438 tailoring.data, *settings,
439 settings->fastLatinPrimaries, UPRV_LENGTHOF(settings->fastLatinPrimaries));
440 }
441
442 UBool U_CALLCONV
isAcceptable(void * context,const char *,const char *,const UDataInfo * pInfo)443 CollationDataReader::isAcceptable(void *context,
444 const char * /* type */, const char * /*name*/,
445 const UDataInfo *pInfo) {
446 if(
447 pInfo->size >= 20 &&
448 pInfo->isBigEndian == U_IS_BIG_ENDIAN &&
449 pInfo->charsetFamily == U_CHARSET_FAMILY &&
450 pInfo->dataFormat[0] == 0x55 && // dataFormat="UCol"
451 pInfo->dataFormat[1] == 0x43 &&
452 pInfo->dataFormat[2] == 0x6f &&
453 pInfo->dataFormat[3] == 0x6c &&
454 pInfo->formatVersion[0] == 5
455 ) {
456 UVersionInfo *version = static_cast<UVersionInfo *>(context);
457 if(version != NULL) {
458 uprv_memcpy(version, pInfo->dataVersion, 4);
459 }
460 return TRUE;
461 } else {
462 return FALSE;
463 }
464 }
465
466 U_NAMESPACE_END
467
468 #endif // !UCONFIG_NO_COLLATION
469