1 /*
2 *******************************************************************************
3 * Copyright (C) 2012-2015, International Business Machines
4 * Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 * collationdata.cpp
7 *
8 * created on: 2012jul28
9 * created by: Markus W. Scherer
10 */
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_COLLATION
15 
16 #include "unicode/ucol.h"
17 #include "unicode/udata.h"
18 #include "unicode/uscript.h"
19 #include "cmemory.h"
20 #include "collation.h"
21 #include "collationdata.h"
22 #include "uassert.h"
23 #include "utrie2.h"
24 #include "uvectr32.h"
25 
26 U_NAMESPACE_BEGIN
27 
28 uint32_t
getIndirectCE32(uint32_t ce32) const29 CollationData::getIndirectCE32(uint32_t ce32) const {
30     U_ASSERT(Collation::isSpecialCE32(ce32));
31     int32_t tag = Collation::tagFromCE32(ce32);
32     if(tag == Collation::DIGIT_TAG) {
33         // Fetch the non-numeric-collation CE32.
34         ce32 = ce32s[Collation::indexFromCE32(ce32)];
35     } else if(tag == Collation::LEAD_SURROGATE_TAG) {
36         ce32 = Collation::UNASSIGNED_CE32;
37     } else if(tag == Collation::U0000_TAG) {
38         // Fetch the normal ce32 for U+0000.
39         ce32 = ce32s[0];
40     }
41     return ce32;
42 }
43 
44 uint32_t
getFinalCE32(uint32_t ce32) const45 CollationData::getFinalCE32(uint32_t ce32) const {
46     if(Collation::isSpecialCE32(ce32)) {
47         ce32 = getIndirectCE32(ce32);
48     }
49     return ce32;
50 }
51 
52 int64_t
getSingleCE(UChar32 c,UErrorCode & errorCode) const53 CollationData::getSingleCE(UChar32 c, UErrorCode &errorCode) const {
54     if(U_FAILURE(errorCode)) { return 0; }
55     // Keep parallel with CollationDataBuilder::getSingleCE().
56     const CollationData *d;
57     uint32_t ce32 = getCE32(c);
58     if(ce32 == Collation::FALLBACK_CE32) {
59         d = base;
60         ce32 = base->getCE32(c);
61     } else {
62         d = this;
63     }
64     while(Collation::isSpecialCE32(ce32)) {
65         switch(Collation::tagFromCE32(ce32)) {
66         case Collation::LATIN_EXPANSION_TAG:
67         case Collation::BUILDER_DATA_TAG:
68         case Collation::PREFIX_TAG:
69         case Collation::CONTRACTION_TAG:
70         case Collation::HANGUL_TAG:
71         case Collation::LEAD_SURROGATE_TAG:
72             errorCode = U_UNSUPPORTED_ERROR;
73             return 0;
74         case Collation::FALLBACK_TAG:
75         case Collation::RESERVED_TAG_3:
76             errorCode = U_INTERNAL_PROGRAM_ERROR;
77             return 0;
78         case Collation::LONG_PRIMARY_TAG:
79             return Collation::ceFromLongPrimaryCE32(ce32);
80         case Collation::LONG_SECONDARY_TAG:
81             return Collation::ceFromLongSecondaryCE32(ce32);
82         case Collation::EXPANSION32_TAG:
83             if(Collation::lengthFromCE32(ce32) == 1) {
84                 ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
85                 break;
86             } else {
87                 errorCode = U_UNSUPPORTED_ERROR;
88                 return 0;
89             }
90         case Collation::EXPANSION_TAG: {
91             if(Collation::lengthFromCE32(ce32) == 1) {
92                 return d->ces[Collation::indexFromCE32(ce32)];
93             } else {
94                 errorCode = U_UNSUPPORTED_ERROR;
95                 return 0;
96             }
97         }
98         case Collation::DIGIT_TAG:
99             // Fetch the non-numeric-collation CE32 and continue.
100             ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
101             break;
102         case Collation::U0000_TAG:
103             U_ASSERT(c == 0);
104             // Fetch the normal ce32 for U+0000 and continue.
105             ce32 = d->ce32s[0];
106             break;
107         case Collation::OFFSET_TAG:
108             return d->getCEFromOffsetCE32(c, ce32);
109         case Collation::IMPLICIT_TAG:
110             return Collation::unassignedCEFromCodePoint(c);
111         }
112     }
113     return Collation::ceFromSimpleCE32(ce32);
114 }
115 
116 uint32_t
getFirstPrimaryForGroup(int32_t script) const117 CollationData::getFirstPrimaryForGroup(int32_t script) const {
118     int32_t index = getScriptIndex(script);
119     return index == 0 ? 0 : (uint32_t)scriptStarts[index] << 16;
120 }
121 
122 uint32_t
getLastPrimaryForGroup(int32_t script) const123 CollationData::getLastPrimaryForGroup(int32_t script) const {
124     int32_t index = getScriptIndex(script);
125     if(index == 0) {
126         return 0;
127     }
128     uint32_t limit = scriptStarts[index + 1];
129     return (limit << 16) - 1;
130 }
131 
132 int32_t
getGroupForPrimary(uint32_t p) const133 CollationData::getGroupForPrimary(uint32_t p) const {
134     p >>= 16;
135     if(p < scriptStarts[1] || scriptStarts[scriptStartsLength - 1] <= p) {
136         return -1;
137     }
138     int32_t index = 1;
139     while(p >= scriptStarts[index + 1]) { ++index; }
140     for(int32_t i = 0; i < numScripts; ++i) {
141         if(scriptsIndex[i] == index) {
142             return i;
143         }
144     }
145     for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
146         if(scriptsIndex[numScripts + i] == index) {
147             return UCOL_REORDER_CODE_FIRST + i;
148         }
149     }
150     return -1;
151 }
152 
153 int32_t
getScriptIndex(int32_t script) const154 CollationData::getScriptIndex(int32_t script) const {
155     if(script < 0) {
156         return 0;
157     } else if(script < numScripts) {
158         return scriptsIndex[script];
159     } else if(script < UCOL_REORDER_CODE_FIRST) {
160         return 0;
161     } else {
162         script -= UCOL_REORDER_CODE_FIRST;
163         if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
164             return scriptsIndex[numScripts + script];
165         } else {
166             return 0;
167         }
168     }
169 }
170 
171 int32_t
getEquivalentScripts(int32_t script,int32_t dest[],int32_t capacity,UErrorCode & errorCode) const172 CollationData::getEquivalentScripts(int32_t script,
173                                     int32_t dest[], int32_t capacity,
174                                     UErrorCode &errorCode) const {
175     if(U_FAILURE(errorCode)) { return 0; }
176     int32_t index = getScriptIndex(script);
177     if(index == 0) { return 0; }
178     if(script >= UCOL_REORDER_CODE_FIRST) {
179         // Special groups have no aliases.
180         if(capacity > 0) {
181             dest[0] = script;
182         } else {
183             errorCode = U_BUFFER_OVERFLOW_ERROR;
184         }
185         return 1;
186     }
187 
188     int32_t length = 0;
189     for(int32_t i = 0; i < numScripts; ++i) {
190         if(scriptsIndex[i] == index) {
191             if(length < capacity) {
192                 dest[length] = i;
193             }
194             ++length;
195         }
196     }
197     if(length > capacity) {
198         errorCode = U_BUFFER_OVERFLOW_ERROR;
199     }
200     return length;
201 }
202 
203 void
makeReorderRanges(const int32_t * reorder,int32_t length,UVector32 & ranges,UErrorCode & errorCode) const204 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
205                                  UVector32 &ranges, UErrorCode &errorCode) const {
206     makeReorderRanges(reorder, length, FALSE, ranges, errorCode);
207 }
208 
209 void
makeReorderRanges(const int32_t * reorder,int32_t length,UBool latinMustMove,UVector32 & ranges,UErrorCode & errorCode) const210 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
211                                  UBool latinMustMove,
212                                  UVector32 &ranges, UErrorCode &errorCode) const {
213     if(U_FAILURE(errorCode)) { return; }
214     ranges.removeAllElements();
215     if(length == 0 || (length == 1 && reorder[0] == USCRIPT_UNKNOWN)) {
216         return;
217     }
218 
219     // Maps each script-or-group range to a new lead byte.
220     uint8_t table[MAX_NUM_SCRIPT_RANGES];
221     uprv_memset(table, 0, sizeof(table));
222 
223     {
224         // Set "don't care" values for reserved ranges.
225         int32_t index = scriptsIndex[
226                 numScripts + REORDER_RESERVED_BEFORE_LATIN - UCOL_REORDER_CODE_FIRST];
227         if(index != 0) {
228             table[index] = 0xff;
229         }
230         index = scriptsIndex[
231                 numScripts + REORDER_RESERVED_AFTER_LATIN - UCOL_REORDER_CODE_FIRST];
232         if(index != 0) {
233             table[index] = 0xff;
234         }
235     }
236 
237     // Never reorder special low and high primary lead bytes.
238     U_ASSERT(scriptStartsLength >= 2);
239     U_ASSERT(scriptStarts[0] == 0);
240     int32_t lowStart = scriptStarts[1];
241     U_ASSERT(lowStart == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8));
242     int32_t highLimit = scriptStarts[scriptStartsLength - 1];
243     U_ASSERT(highLimit == (Collation::TRAIL_WEIGHT_BYTE << 8));
244 
245     // Get the set of special reorder codes in the input list.
246     // This supports a fixed number of special reorder codes;
247     // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.
248     uint32_t specials = 0;
249     for(int32_t i = 0; i < length; ++i) {
250         int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST;
251         if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
252             specials |= (uint32_t)1 << reorderCode;
253         }
254     }
255 
256     // Start the reordering with the special low reorder codes that do not occur in the input.
257     for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
258         int32_t index = scriptsIndex[numScripts + i];
259         if(index != 0 && (specials & ((uint32_t)1 << i)) == 0) {
260             lowStart = addLowScriptRange(table, index, lowStart);
261         }
262     }
263 
264     // Skip the reserved range before Latin if Latin is the first script,
265     // so that we do not move it unnecessarily.
266     int32_t skippedReserved = 0;
267     if(specials == 0 && reorder[0] == USCRIPT_LATIN && !latinMustMove) {
268         int32_t index = scriptsIndex[USCRIPT_LATIN];
269         U_ASSERT(index != 0);
270         int32_t start = scriptStarts[index];
271         U_ASSERT(lowStart <= start);
272         skippedReserved = start - lowStart;
273         lowStart = start;
274     }
275 
276     // Reorder according to the input scripts, continuing from the bottom of the primary range.
277     int32_t originalLength = length;  // length will be decremented if "others" is in the list.
278     UBool hasReorderToEnd = FALSE;
279     for(int32_t i = 0; i < length;) {
280         int32_t script = reorder[i++];
281         if(script == USCRIPT_UNKNOWN) {
282             // Put the remaining scripts at the top.
283             hasReorderToEnd = TRUE;
284             while(i < length) {
285                 script = reorder[--length];
286                 if(script == USCRIPT_UNKNOWN ||  // Must occur at most once.
287                         script == UCOL_REORDER_CODE_DEFAULT) {
288                     errorCode = U_ILLEGAL_ARGUMENT_ERROR;
289                     return;
290                 }
291                 int32_t index = getScriptIndex(script);
292                 if(index == 0) { continue; }
293                 if(table[index] != 0) {  // Duplicate or equivalent script.
294                     errorCode = U_ILLEGAL_ARGUMENT_ERROR;
295                     return;
296                 }
297                 highLimit = addHighScriptRange(table, index, highLimit);
298             }
299             break;
300         }
301         if(script == UCOL_REORDER_CODE_DEFAULT) {
302             // The default code must be the only one in the list, and that is handled by the caller.
303             // Otherwise it must not be used.
304             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
305             return;
306         }
307         int32_t index = getScriptIndex(script);
308         if(index == 0) { continue; }
309         if(table[index] != 0) {  // Duplicate or equivalent script.
310             errorCode = U_ILLEGAL_ARGUMENT_ERROR;
311             return;
312         }
313         lowStart = addLowScriptRange(table, index, lowStart);
314     }
315 
316     // Put all remaining scripts into the middle.
317     for(int32_t i = 1; i < scriptStartsLength - 1; ++i) {
318         int32_t leadByte = table[i];
319         if(leadByte != 0) { continue; }
320         int32_t start = scriptStarts[i];
321         if(!hasReorderToEnd && start > lowStart) {
322             // No need to move this script.
323             lowStart = start;
324         }
325         lowStart = addLowScriptRange(table, i, lowStart);
326     }
327     if(lowStart > highLimit) {
328         if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
329             // Try not skipping the before-Latin reserved range.
330             makeReorderRanges(reorder, originalLength, TRUE, ranges, errorCode);
331             return;
332         }
333         // We need more primary lead bytes than available, despite the reserved ranges.
334         errorCode = U_BUFFER_OVERFLOW_ERROR;
335         return;
336     }
337 
338     // Turn lead bytes into a list of (limit, offset) pairs.
339     // Encode each pair in one list element:
340     // Upper 16 bits = limit, lower 16 = signed lead byte offset.
341     int32_t offset = 0;
342     for(int32_t i = 1;; ++i) {
343         int32_t nextOffset = offset;
344         while(i < scriptStartsLength - 1) {
345             int32_t newLeadByte = table[i];
346             if(newLeadByte == 0xff) {
347                 // "Don't care" lead byte for reserved range, continue with current offset.
348             } else {
349                 nextOffset = newLeadByte - (scriptStarts[i] >> 8);
350                 if(nextOffset != offset) { break; }
351             }
352             ++i;
353         }
354         if(offset != 0 || i < scriptStartsLength - 1) {
355             ranges.addElement(((int32_t)scriptStarts[i] << 16) | (offset & 0xffff), errorCode);
356         }
357         if(i == scriptStartsLength - 1) { break; }
358         offset = nextOffset;
359     }
360 }
361 
362 int32_t
addLowScriptRange(uint8_t table[],int32_t index,int32_t lowStart) const363 CollationData::addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const {
364     int32_t start = scriptStarts[index];
365     if((start & 0xff) < (lowStart & 0xff)) {
366         lowStart += 0x100;
367     }
368     table[index] = (uint8_t)(lowStart >> 8);
369     int32_t limit = scriptStarts[index + 1];
370     lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff);
371     return lowStart;
372 }
373 
374 int32_t
addHighScriptRange(uint8_t table[],int32_t index,int32_t highLimit) const375 CollationData::addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const {
376     int32_t limit = scriptStarts[index + 1];
377     if((limit & 0xff) > (highLimit & 0xff)) {
378         highLimit -= 0x100;
379     }
380     int32_t start = scriptStarts[index];
381     highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff);
382     table[index] = (uint8_t)(highLimit >> 8);
383     return highLimit;
384 }
385 
386 U_NAMESPACE_END
387 
388 #endif  // !UCONFIG_NO_COLLATION
389