1 /*
2 *******************************************************************************
3 * Copyright (C) 2013-2015, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 * collationdatawriter.cpp
7 *
8 * created on: 2013aug06
9 * created by: Markus W. Scherer
10 */
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_COLLATION
15 
16 #include "unicode/tblcoll.h"
17 #include "unicode/udata.h"
18 #include "unicode/uniset.h"
19 #include "cmemory.h"
20 #include "collationdata.h"
21 #include "collationdatabuilder.h"
22 #include "collationdatareader.h"
23 #include "collationdatawriter.h"
24 #include "collationfastlatin.h"
25 #include "collationsettings.h"
26 #include "collationtailoring.h"
27 #include "uassert.h"
28 #include "ucmndata.h"
29 
30 U_NAMESPACE_BEGIN
31 
32 uint8_t *
cloneRuleData(int32_t & length,UErrorCode & errorCode) const33 RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const {
34     if(U_FAILURE(errorCode)) { return NULL; }
35     LocalMemory<uint8_t> buffer((uint8_t *)uprv_malloc(20000));
36     if(buffer.isNull()) {
37         errorCode = U_MEMORY_ALLOCATION_ERROR;
38         return NULL;
39     }
40     length = cloneBinary(buffer.getAlias(), 20000, errorCode);
41     if(errorCode == U_BUFFER_OVERFLOW_ERROR) {
42         if(buffer.allocateInsteadAndCopy(length, 0) == NULL) {
43             errorCode = U_MEMORY_ALLOCATION_ERROR;
44             return NULL;
45         }
46         errorCode = U_ZERO_ERROR;
47         length = cloneBinary(buffer.getAlias(), length, errorCode);
48     }
49     if(U_FAILURE(errorCode)) { return NULL; }
50     return buffer.orphan();
51 }
52 
53 int32_t
cloneBinary(uint8_t * dest,int32_t capacity,UErrorCode & errorCode) const54 RuleBasedCollator::cloneBinary(uint8_t *dest, int32_t capacity, UErrorCode &errorCode) const {
55     int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1];
56     return CollationDataWriter::writeTailoring(
57             *tailoring, *settings, indexes, dest, capacity,
58             errorCode);
59 }
60 
61 static const UDataInfo dataInfo = {
62     sizeof(UDataInfo),
63     0,
64 
65     U_IS_BIG_ENDIAN,
66     U_CHARSET_FAMILY,
67     U_SIZEOF_UCHAR,
68     0,
69 
70     { 0x55, 0x43, 0x6f, 0x6c },         // dataFormat="UCol"
71     { 5, 0, 0, 0 },                     // formatVersion
72     { 6, 3, 0, 0 }                      // dataVersion
73 };
74 
75 int32_t
writeBase(const CollationData & data,const CollationSettings & settings,const void * rootElements,int32_t rootElementsLength,int32_t indexes[],uint8_t * dest,int32_t capacity,UErrorCode & errorCode)76 CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings,
77                                const void *rootElements, int32_t rootElementsLength,
78                                int32_t indexes[], uint8_t *dest, int32_t capacity,
79                                UErrorCode &errorCode) {
80     return write(TRUE, NULL,
81                  data, settings,
82                  rootElements, rootElementsLength,
83                  indexes, dest, capacity, errorCode);
84 }
85 
86 int32_t
writeTailoring(const CollationTailoring & t,const CollationSettings & settings,int32_t indexes[],uint8_t * dest,int32_t capacity,UErrorCode & errorCode)87 CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings,
88                                     int32_t indexes[], uint8_t *dest, int32_t capacity,
89                                     UErrorCode &errorCode) {
90     return write(FALSE, t.version,
91                  *t.data, settings,
92                  NULL, 0,
93                  indexes, dest, capacity, errorCode);
94 }
95 
96 int32_t
write(UBool isBase,const UVersionInfo dataVersion,const CollationData & data,const CollationSettings & settings,const void * rootElements,int32_t rootElementsLength,int32_t indexes[],uint8_t * dest,int32_t capacity,UErrorCode & errorCode)97 CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion,
98                            const CollationData &data, const CollationSettings &settings,
99                            const void *rootElements, int32_t rootElementsLength,
100                            int32_t indexes[], uint8_t *dest, int32_t capacity,
101                            UErrorCode &errorCode) {
102     if(U_FAILURE(errorCode)) { return 0; }
103     if(capacity < 0 || (capacity > 0 && dest == NULL)) {
104         errorCode = U_ILLEGAL_ARGUMENT_ERROR;
105         return 0;
106     }
107 
108     // Figure out which data items to write before settling on
109     // the indexes length and writing offsets.
110     // For any data item, we need to write the start and limit offsets,
111     // so the indexes length must be at least index-of-start-offset + 2.
112     int32_t indexesLength;
113     UBool hasMappings;
114     UnicodeSet unsafeBackwardSet;
115     const CollationData *baseData = data.base;
116 
117     int32_t fastLatinVersion;
118     if(data.fastLatinTable != NULL) {
119         fastLatinVersion = (int32_t)CollationFastLatin::VERSION << 16;
120     } else {
121         fastLatinVersion = 0;
122     }
123     int32_t fastLatinTableLength = 0;
124 
125     if(isBase) {
126         // For the root collator, we write an even number of indexes
127         // so that we start with an 8-aligned offset.
128         indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1;
129         U_ASSERT(settings.reorderCodesLength == 0);
130         hasMappings = TRUE;
131         unsafeBackwardSet = *data.unsafeBackwardSet;
132         fastLatinTableLength = data.fastLatinTableLength;
133     } else if(baseData == NULL) {
134         hasMappings = FALSE;
135         if(settings.reorderCodesLength == 0) {
136             // only options
137             indexesLength = CollationDataReader::IX_OPTIONS + 1;  // no limit offset here
138         } else {
139             // only options, reorder codes, and the reorder table
140             indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2;
141         }
142     } else {
143         hasMappings = TRUE;
144         // Tailored mappings, and what else?
145         // Check in ascending order of optional tailoring data items.
146         indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2;
147         if(data.contextsLength != 0) {
148             indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2;
149         }
150         unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet);
151         if(!unsafeBackwardSet.isEmpty()) {
152             indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2;
153         }
154         if(data.fastLatinTable != baseData->fastLatinTable) {
155             fastLatinTableLength = data.fastLatinTableLength;
156             indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2;
157         }
158     }
159 
160     UVector32 codesAndRanges(errorCode);
161     const int32_t *reorderCodes = settings.reorderCodes;
162     int32_t reorderCodesLength = settings.reorderCodesLength;
163     if(settings.hasReordering() &&
164             CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) {
165         // Rebuild the full list of reorder ranges.
166         // The list in the settings is truncated for efficiency.
167         data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode);
168         // Write the codes, then the ranges.
169         for(int32_t i = 0; i < reorderCodesLength; ++i) {
170             codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode);
171         }
172         if(U_FAILURE(errorCode)) { return 0; }
173         reorderCodes = codesAndRanges.getBuffer();
174         reorderCodesLength = codesAndRanges.size();
175     }
176 
177     int32_t headerSize;
178     if(isBase) {
179         headerSize = 0;  // udata_create() writes the header
180     } else {
181         DataHeader header;
182         header.dataHeader.magic1 = 0xda;
183         header.dataHeader.magic2 = 0x27;
184         uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo));
185         uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo));
186         headerSize = (int32_t)sizeof(header);
187         U_ASSERT((headerSize & 3) == 0);  // multiple of 4 bytes
188         if(hasMappings && data.cesLength != 0) {
189             // Sum of the sizes of the data items which are
190             // not automatically multiples of 8 bytes and which are placed before the CEs.
191             int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4;
192             if((sum & 7) != 0) {
193                 // We need to add padding somewhere so that the 64-bit CEs are 8-aligned.
194                 // We add to the header size here.
195                 // Alternatively, we could increment the indexesLength
196                 // or add a few bytes to the reorderTable.
197                 headerSize += 4;
198             }
199         }
200         header.dataHeader.headerSize = (uint16_t)headerSize;
201         if(headerSize <= capacity) {
202             uprv_memcpy(dest, &header, sizeof(header));
203             // Write 00 bytes so that the padding is not mistaken for a copyright string.
204             uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header));
205             dest += headerSize;
206             capacity -= headerSize;
207         } else {
208             dest = NULL;
209             capacity = 0;
210         }
211     }
212 
213     indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength;
214     U_ASSERT((settings.options & ~0xffff) == 0);
215     indexes[CollationDataReader::IX_OPTIONS] =
216             data.numericPrimary | fastLatinVersion | settings.options;
217     indexes[CollationDataReader::IX_RESERVED2] = 0;
218     indexes[CollationDataReader::IX_RESERVED3] = 0;
219 
220     // Byte offsets of data items all start from the start of the indexes.
221     // We add the headerSize at the very end.
222     int32_t totalSize = indexesLength * 4;
223 
224     if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) {
225         indexes[CollationDataReader::IX_JAMO_CE32S_START] = data.jamoCE32s - data.ce32s;
226     } else {
227         indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1;
228     }
229 
230     indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize;
231     totalSize += reorderCodesLength * 4;
232 
233     indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize;
234     if(settings.reorderTable != NULL) {
235         totalSize += 256;
236     }
237 
238     indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize;
239     if(hasMappings) {
240         UErrorCode errorCode2 = U_ZERO_ERROR;
241         int32_t length;
242         if(totalSize < capacity) {
243             length = utrie2_serialize(data.trie, dest + totalSize,
244                                       capacity - totalSize, &errorCode2);
245         } else {
246             length = utrie2_serialize(data.trie, NULL, 0, &errorCode2);
247         }
248         if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
249             errorCode = errorCode2;
250             return 0;
251         }
252         // The trie size should be a multiple of 8 bytes due to the way
253         // compactIndex2(UNewTrie2 *trie) currently works.
254         U_ASSERT((length & 7) == 0);
255         totalSize += length;
256     }
257 
258     indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize;
259     indexes[CollationDataReader::IX_CES_OFFSET] = totalSize;
260     if(hasMappings && data.cesLength != 0) {
261         U_ASSERT(((headerSize + totalSize) & 7) == 0);
262         totalSize += data.cesLength * 8;
263     }
264 
265     indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize;
266     indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize;
267     if(hasMappings) {
268         totalSize += data.ce32sLength * 4;
269     }
270 
271     indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize;
272     totalSize += rootElementsLength * 4;
273 
274     indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize;
275     if(hasMappings) {
276         totalSize += data.contextsLength * 2;
277     }
278 
279     indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize;
280     if(hasMappings && !unsafeBackwardSet.isEmpty()) {
281         UErrorCode errorCode2 = U_ZERO_ERROR;
282         int32_t length;
283         if(totalSize < capacity) {
284             uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize);
285             length = unsafeBackwardSet.serialize(
286                     p, (capacity - totalSize) / 2, errorCode2);
287         } else {
288             length = unsafeBackwardSet.serialize(NULL, 0, errorCode2);
289         }
290         if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) {
291             errorCode = errorCode2;
292             return 0;
293         }
294         totalSize += length * 2;
295     }
296 
297     indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize;
298     totalSize += fastLatinTableLength * 2;
299 
300     UnicodeString scripts;
301     indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize;
302     if(isBase) {
303         scripts.append((UChar)data.numScripts);
304         scripts.append(reinterpret_cast<const UChar *>(data.scriptsIndex), data.numScripts + 16);
305         scripts.append(reinterpret_cast<const UChar *>(data.scriptStarts), data.scriptStartsLength);
306         totalSize += scripts.length() * 2;
307     }
308 
309     indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize;
310     if(isBase) {
311         totalSize += 256;
312     }
313 
314     indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize;
315     indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize;
316 
317     if(totalSize > capacity) {
318         errorCode = U_BUFFER_OVERFLOW_ERROR;
319         return headerSize + totalSize;
320     }
321 
322     uprv_memcpy(dest, indexes, indexesLength * 4);
323     copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest);
324     copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest);
325     // The trie has already been serialized into the dest buffer.
326     copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest);
327     copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest);
328     copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest);
329     copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest);
330     // The unsafeBackwardSet has already been serialized into the dest buffer.
331     copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest);
332     copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest);
333     copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest);
334 
335     return headerSize + totalSize;
336 }
337 
338 void
copyData(const int32_t indexes[],int32_t startIndex,const void * src,uint8_t * dest)339 CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex,
340                               const void *src, uint8_t *dest) {
341     int32_t start = indexes[startIndex];
342     int32_t limit = indexes[startIndex + 1];
343     if(start < limit) {
344         uprv_memcpy(dest + start, src, limit - start);
345     }
346 }
347 
348 U_NAMESPACE_END
349 
350 #endif  // !UCONFIG_NO_COLLATION
351