1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2013-2015, International Business Machines
6 * Corporation and others.  All Rights Reserved.
7 *******************************************************************************
8 * collationdatawriter.cpp
9 *
10 * created on: 2013aug06
11 * created by: Markus W. Scherer
12 */
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_COLLATION
17 
18 #include "unicode/tblcoll.h"
19 #include "unicode/udata.h"
20 #include "unicode/uniset.h"
21 #include "cmemory.h"
22 #include "collationdata.h"
23 #include "collationdatabuilder.h"
24 #include "collationdatareader.h"
25 #include "collationdatawriter.h"
26 #include "collationfastlatin.h"
27 #include "collationsettings.h"
28 #include "collationtailoring.h"
29 #include "uassert.h"
30 #include "ucmndata.h"
31 
32 U_NAMESPACE_BEGIN
33 
34 uint8_t *
cloneRuleData(int32_t & length,UErrorCode & errorCode) const35 RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const {
36     if(U_FAILURE(errorCode)) { return NULL; }
37     LocalMemory<uint8_t> buffer((uint8_t *)uprv_malloc(20000));
38     if(buffer.isNull()) {
39         errorCode = U_MEMORY_ALLOCATION_ERROR;
40         return NULL;
41     }
42     length = cloneBinary(buffer.getAlias(), 20000, errorCode);
43     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
44         if(buffer.allocateInsteadAndCopy(length, 0) == NULL) {
45             errorCode = U_MEMORY_ALLOCATION_ERROR;
46             return NULL;
47         }
48         errorCode = U_ZERO_ERROR;
49         length = cloneBinary(buffer.getAlias(), length, errorCode);
50     }
51     if(U_FAILURE(errorCode)) { return NULL; }
52     return buffer.orphan();
53 }
54 
55 int32_t
cloneBinary(uint8_t * dest,int32_t capacity,UErrorCode & errorCode) const56 RuleBasedCollator::cloneBinary(uint8_t *dest, int32_t capacity, UErrorCode &errorCode) const {
57     int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1];
58     return CollationDataWriter::writeTailoring(
59             *tailoring, *settings, indexes, dest, capacity,
60             errorCode);
61 }
62 
63 static const UDataInfo dataInfo = {
64     sizeof(UDataInfo),
65     0,
66 
67     U_IS_BIG_ENDIAN,
68     U_CHARSET_FAMILY,
69     U_SIZEOF_UCHAR,
70     0,
71 
72     { 0x55, 0x43, 0x6f, 0x6c },         // dataFormat="UCol"
73     { 5, 0, 0, 0 },                     // formatVersion
74     { 6, 3, 0, 0 }                      // dataVersion
75 };
76 
77 int32_t
writeBase(const CollationData & data,const CollationSettings & settings,const void * rootElements,int32_t rootElementsLength,int32_t indexes[],uint8_t * dest,int32_t capacity,UErrorCode & errorCode)78 CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings,
79                                const void *rootElements, int32_t rootElementsLength,
80                                int32_t indexes[], uint8_t *dest, int32_t capacity,
81                                UErrorCode &errorCode) {
82     return write(TRUE, NULL,
83                  data, settings,
84                  rootElements, rootElementsLength,
85                  indexes, dest, capacity, errorCode);
86 }
87 
88 int32_t
writeTailoring(const CollationTailoring & t,const CollationSettings & settings,int32_t indexes[],uint8_t * dest,int32_t capacity,UErrorCode & errorCode)89 CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings,
90                                     int32_t indexes[], uint8_t *dest, int32_t capacity,
91                                     UErrorCode &errorCode) {
92     return write(FALSE, t.version,
93                  *t.data, settings,
94                  NULL, 0,
95                  indexes, dest, capacity, errorCode);
96 }
97 
/**
 * Shared implementation behind writeBase() and writeTailoring().
 *
 * Decides which data items are present, computes the byte offset of each item
 * into indexes[], and, if everything fits into dest, writes the indexes
 * followed by the data items. The trie and the unsafe-backward set are
 * serialized straight into dest while the offsets are being computed;
 * all other items are copied at the end.
 *
 * @param isBase TRUE when called from writeBase(): no DataHeader is written
 *               here, and the scripts and compressibleBytes items are included
 * @param dataVersion copied into the DataHeader; only read when isBase is FALSE
 * @param data the collation data to serialize
 * @param settings supplies the options, reorder codes and reorder table
 * @param rootElements copied as rootElementsLength * 4 bytes
 * @param rootElementsLength number of 4-byte units in rootElements
 * @param indexes output array; entries through IX_TOTAL_SIZE are assigned here
 *                even when fewer than that are copied to dest
 * @param dest output buffer; may be NULL only if capacity is 0
 * @param capacity size of dest in bytes; must not be negative
 * @param errorCode in/out ICU error code. Set to U_ILLEGAL_ARGUMENT_ERROR for
 *                  a bad dest/capacity combination, and to
 *                  U_BUFFER_OVERFLOW_ERROR if the output does not fit.
 * @return header size plus the size of the indexes and all data items in bytes,
 *         also when U_BUFFER_OVERFLOW_ERROR is set; 0 on any other failure
 */
98 int32_t
write(UBool isBase,const UVersionInfo dataVersion,const CollationData & data,const CollationSettings & settings,const void * rootElements,int32_t rootElementsLength,int32_t indexes[],uint8_t * dest,int32_t capacity,UErrorCode & errorCode)99 CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
100                            const CollationData &data, const CollationSettings &settings,
101                            const void *rootElements, int32_t rootElementsLength,
102                            int32_t indexes[], uint8_t *dest, int32_t capacity,
103                            UErrorCode &errorCode) {
104     if(U_FAILURE(errorCode)) { return 0; }
105     if(capacity < 0 || (capacity > 0 && dest == NULL)) {
106         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
107         return 0;
108     }
109 
110     // Figure out which data items to write before settling on
111     // the indexes length and writing offsets.
112     // For any data item, we need to write the start and limit offsets,
113     // so the indexes length must be at least index-of-start-offset + 2.
114     int32_t indexesLength;
115     UBool hasMappings;
116     UnicodeSet unsafeBackwardSet;
117     const CollationData *baseData = data.base;
118 
    // The fast Latin version is stored above the low 16 bits;
    // it is OR-ed into indexes[IX_OPTIONS] together with settings.options below.
119     int32_t fastLatinVersion;
120     if(data.fastLatinTable != NULL) {
121         fastLatinVersion = (int32_t)CollationFastLatin::VERSION << 16;
122     } else {
123         fastLatinVersion = 0;
124     }
125     int32_t fastLatinTableLength = 0;
126 
127     if(isBase) {
128         // For the root collator, we write an even number of indexes
129         // so that we start with an 8-aligned offset.
130         indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1;
131         U_ASSERT(settings.reorderCodesLength == 0);
132         hasMappings = TRUE;
133         unsafeBackwardSet = *data.unsafeBackwardSet;
134         fastLatinTableLength = data.fastLatinTableLength;
135     } else if(baseData == NULL) {
        // Not the base, and the data has no base pointer: no mappings are written.
136         hasMappings = FALSE;
137         if(settings.reorderCodesLength == 0) {
138             // only options
139             indexesLength = CollationDataReader::IX_OPTIONS + 1;  // no limit offset here
140         } else {
141             // only options, reorder codes, and the reorder table
142             indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2;
143         }
144     } else {
145         hasMappings = TRUE;
146         // Tailored mappings, and what else?
147         // Check in ascending order of optional tailoring data items.
148         indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2;
149         if(data.contextsLength != 0) {
150             indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2;
151         }
        // Write only what this data's set has in addition to the base data's set.
152         unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet);
153         if(!unsafeBackwardSet.isEmpty()) {
154             indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2;
155         }
        // Write the fast Latin table only if it is not the base data's table.
156         if(data.fastLatinTable != baseData->fastLatinTable) {
157             fastLatinTableLength = data.fastLatinTableLength;
158             indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2;
159         }
160     }
161 
162     UVector32 codesAndRanges(errorCode);
163     const int32_t *reorderCodes = settings.reorderCodes;
164     int32_t reorderCodesLength = settings.reorderCodesLength;
165     if(settings.hasReordering() &&
166             CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) {
167         // Rebuild the full list of reorder ranges.
168         // The list in the settings is truncated for efficiency.
169         data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode);
170         // Write the codes, then the ranges.
171         for(int32_t i = 0; i < reorderCodesLength; ++i) {
172             codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode);
173         }
174         if(U_FAILURE(errorCode)) { return 0; }
        // From here on, the "reorder codes" item is the codes followed by the ranges.
175         reorderCodes = codesAndRanges.getBuffer();
176         reorderCodesLength = codesAndRanges.size();
177     }
178 
179     int32_t headerSize;
180     if(isBase) {
181         headerSize = 0;  // udata_create() writes the header
182     } else {
183         DataHeader header;
184         header.dataHeader.magic1 = 0xda;
185         header.dataHeader.magic2 = 0x27;
186         uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo));
        // Replace the placeholder version from dataInfo with the caller's version.
187         uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo));
188         headerSize = (int32_t)sizeof(header);
189         U_ASSERT((headerSize & 3) == 0);  // multiple of 4 bytes
190         if(hasMappings && data.cesLength != 0) {
191             // Sum of the sizes of the data items which are
192             // not automatically multiples of 8 bytes and which are placed before the CEs.
193             int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4;
194             if((sum & 7) != 0) {
195                 // We need to add padding somewhere so that the 64-bit CEs are 8-aligned.
196                 // We add to the header size here.
197                 // Alternatively, we could increment the indexesLength
198                 // or add a few bytes to the reorderTable.
199                 headerSize += 4;
200             }
201         }
202         header.dataHeader.headerSize = (uint16_t)headerSize;
203         if(headerSize <= capacity) {
204             uprv_memcpy(dest, &header, sizeof(header));
205             // Write 00 bytes so that the padding is not mistaken for a copyright string.
206             uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header));
            // From here on, dest and capacity refer to the space after the header,
            // matching the offsets which count from the start of the indexes.
207             dest += headerSize;
208             capacity -= headerSize;
209         } else {
            // Not even the header fits: nothing more is written,
            // but all sizes are still computed so that the overflow check
            // further down reports the required length.
210             dest = NULL;
211             capacity = 0;
212         }
213     }
214 
215     indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength;
216     U_ASSERT((settings.options & ~0xffff) == 0);
217     indexes[CollationDataReader::IX_OPTIONS] =
218             data.numericPrimary | fastLatinVersion | settings.options;
219     indexes[CollationDataReader::IX_RESERVED2] = 0;
220     indexes[CollationDataReader::IX_RESERVED3] = 0;
221 
222     // Byte offsets of data items all start from the start of the indexes.
223     // We add the headerSize at the very end.
224     int32_t totalSize = indexesLength * 4;
225 
    // Position of the Jamo CE32s relative to the start of the ce32s array, or -1 if not written.
226     if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) {
227         indexes[CollationDataReader::IX_JAMO_CE32S_START] = data.jamoCE32s - data.ce32s;
228     } else {
229         indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1;
230     }
231 
    // Each item below records its start offset and then advances totalSize by its size.
    // An absent item leaves totalSize unchanged, so its start equals the next item's start.
232     indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize;
233     totalSize += reorderCodesLength * 4;
234 
235     indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize;
236     if(settings.reorderTable != NULL) {
237         totalSize += 256;
238     }
239 
240     indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize;
241     if(hasMappings) {
        // A separate error code is used so that a buffer overflow from the
        // serializer does not abort the size computation.
242         UErrorCode errorCode2 = U_ZERO_ERROR;
243         int32_t length;
244         if(totalSize < capacity) {
            // There is room at this offset: serialize directly into dest.
245             length = utrie2_serialize(data.trie, dest + totalSize,
246                                       capacity - totalSize, &errorCode2);
247         } else {
            // No room: call without a buffer, only to obtain the length.
248             length = utrie2_serialize(data.trie, NULL, 0, &errorCode2);
249         }
250         if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
251             errorCode = errorCode2;
252             return 0;
253         }
254         // The trie size should be a multiple of 8 bytes due to the way
255         // compactIndex2(UNewTrie2 *trie) currently works.
256         U_ASSERT((length & 7) == 0);
257         totalSize += length;
258     }
259 
260     indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize;
261     indexes[CollationDataReader::IX_CES_OFFSET] = totalSize;
262     if(hasMappings && data.cesLength != 0) {
263         U_ASSERT(((headerSize + totalSize) & 7) == 0);
264         totalSize += data.cesLength * 8;
265     }
266 
267     indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize;
268     indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize;
269     if(hasMappings) {
270         totalSize += data.ce32sLength * 4;
271     }
272 
273     indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize;
274     totalSize += rootElementsLength * 4;
275 
276     indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize;
277     if(hasMappings) {
278         totalSize += data.contextsLength * 2;
279     }
280 
281     indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize;
282     if(hasMappings && !unsafeBackwardSet.isEmpty()) {
        // Same scheme as for the trie: serialize in place if there is room,
        // otherwise only obtain the length.
283         UErrorCode errorCode2 = U_ZERO_ERROR;
284         int32_t length;
285         if(totalSize < capacity) {
286             uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize);
            // The capacity is passed, and the length returned, in 16-bit units.
287             length = unsafeBackwardSet.serialize(
288                     p, (capacity - totalSize) / 2, errorCode2);
289         } else {
290             length = unsafeBackwardSet.serialize(NULL, 0, errorCode2);
291         }
292         if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
293             errorCode = errorCode2;
294             return 0;
295         }
296         totalSize += length * 2;
297     }
298 
299     indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize;
300     totalSize += fastLatinTableLength * 2;
301 
302     UnicodeString scripts;
303     indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize;
304     if(isBase) {
        // Assemble the scripts item in a string of 16-bit units:
        // numScripts, then numScripts + 16 units from scriptsIndex, then the scriptStarts.
305         scripts.append((UChar)data.numScripts);
306         scripts.append(reinterpret_cast<const UChar *>(data.scriptsIndex), data.numScripts + 16);
307         scripts.append(reinterpret_cast<const UChar *>(data.scriptStarts), data.scriptStartsLength);
308         totalSize += scripts.length() * 2;
309     }
310 
311     indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize;
312     if(isBase) {
313         totalSize += 256;
314     }
315 
316     indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize;
317     indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize;
318 
    // capacity no longer includes the header here (see above),
    // so it is compared with totalSize alone.
319     if(totalSize > capacity) {
320         errorCode = U_BUFFER_OVERFLOW_ERROR;
321         return headerSize + totalSize;
322     }
323 
    // Everything fits: write the indexes, then copy each remaining item
    // to the offset recorded for it. copyData() skips items of size 0.
324     uprv_memcpy(dest, indexes, indexesLength * 4);
325     copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest);
326     copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest);
327     // The trie has already been serialized into the dest buffer.
328     copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest);
329     copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest);
330     copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest);
331     copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest);
332     // The unsafeBackwardSet has already been serialized into the dest buffer.
333     copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest);
334     copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest);
335     copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest);
336 
337     return headerSize + totalSize;
338 }
339 
340 void
copyData(const int32_t indexes[],int32_t startIndex,const void * src,uint8_t * dest)341 CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex,
342                               const void *src, uint8_t *dest) {
343     int32_t start = indexes[startIndex];
344     int32_t limit = indexes[startIndex + 1];
345     if(start < limit) {
346         uprv_memcpy(dest + start, src, limit - start);
347     }
348 }
349 
350 U_NAMESPACE_END
351 
352 #endif  // !UCONFIG_NO_COLLATION
353