1 /*
2 *******************************************************************************
3 * Copyright (C) 2012-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 *******************************************************************************
6 * collationdata.cpp
7 *
8 * created on: 2012jul28
9 * created by: Markus W. Scherer
10 */
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_COLLATION
15
16 #include "unicode/ucol.h"
17 #include "unicode/udata.h"
18 #include "unicode/uscript.h"
19 #include "cmemory.h"
20 #include "collation.h"
21 #include "collationdata.h"
22 #include "uassert.h"
23 #include "utrie2.h"
24 #include "uvectr32.h"
25
26 U_NAMESPACE_BEGIN
27
28 uint32_t
getIndirectCE32(uint32_t ce32) const29 CollationData::getIndirectCE32(uint32_t ce32) const {
30 U_ASSERT(Collation::isSpecialCE32(ce32));
31 int32_t tag = Collation::tagFromCE32(ce32);
32 if(tag == Collation::DIGIT_TAG) {
33 // Fetch the non-numeric-collation CE32.
34 ce32 = ce32s[Collation::indexFromCE32(ce32)];
35 } else if(tag == Collation::LEAD_SURROGATE_TAG) {
36 ce32 = Collation::UNASSIGNED_CE32;
37 } else if(tag == Collation::U0000_TAG) {
38 // Fetch the normal ce32 for U+0000.
39 ce32 = ce32s[0];
40 }
41 return ce32;
42 }
43
44 uint32_t
getFinalCE32(uint32_t ce32) const45 CollationData::getFinalCE32(uint32_t ce32) const {
46 if(Collation::isSpecialCE32(ce32)) {
47 ce32 = getIndirectCE32(ce32);
48 }
49 return ce32;
50 }
51
52 int64_t
getSingleCE(UChar32 c,UErrorCode & errorCode) const53 CollationData::getSingleCE(UChar32 c, UErrorCode &errorCode) const {
54 if(U_FAILURE(errorCode)) { return 0; }
55 // Keep parallel with CollationDataBuilder::getSingleCE().
56 const CollationData *d;
57 uint32_t ce32 = getCE32(c);
58 if(ce32 == Collation::FALLBACK_CE32) {
59 d = base;
60 ce32 = base->getCE32(c);
61 } else {
62 d = this;
63 }
64 while(Collation::isSpecialCE32(ce32)) {
65 switch(Collation::tagFromCE32(ce32)) {
66 case Collation::LATIN_EXPANSION_TAG:
67 case Collation::BUILDER_DATA_TAG:
68 case Collation::PREFIX_TAG:
69 case Collation::CONTRACTION_TAG:
70 case Collation::HANGUL_TAG:
71 case Collation::LEAD_SURROGATE_TAG:
72 errorCode = U_UNSUPPORTED_ERROR;
73 return 0;
74 case Collation::FALLBACK_TAG:
75 case Collation::RESERVED_TAG_3:
76 errorCode = U_INTERNAL_PROGRAM_ERROR;
77 return 0;
78 case Collation::LONG_PRIMARY_TAG:
79 return Collation::ceFromLongPrimaryCE32(ce32);
80 case Collation::LONG_SECONDARY_TAG:
81 return Collation::ceFromLongSecondaryCE32(ce32);
82 case Collation::EXPANSION32_TAG:
83 if(Collation::lengthFromCE32(ce32) == 1) {
84 ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
85 break;
86 } else {
87 errorCode = U_UNSUPPORTED_ERROR;
88 return 0;
89 }
90 case Collation::EXPANSION_TAG: {
91 if(Collation::lengthFromCE32(ce32) == 1) {
92 return d->ces[Collation::indexFromCE32(ce32)];
93 } else {
94 errorCode = U_UNSUPPORTED_ERROR;
95 return 0;
96 }
97 }
98 case Collation::DIGIT_TAG:
99 // Fetch the non-numeric-collation CE32 and continue.
100 ce32 = d->ce32s[Collation::indexFromCE32(ce32)];
101 break;
102 case Collation::U0000_TAG:
103 U_ASSERT(c == 0);
104 // Fetch the normal ce32 for U+0000 and continue.
105 ce32 = d->ce32s[0];
106 break;
107 case Collation::OFFSET_TAG:
108 return d->getCEFromOffsetCE32(c, ce32);
109 case Collation::IMPLICIT_TAG:
110 return Collation::unassignedCEFromCodePoint(c);
111 }
112 }
113 return Collation::ceFromSimpleCE32(ce32);
114 }
115
116 uint32_t
getFirstPrimaryForGroup(int32_t script) const117 CollationData::getFirstPrimaryForGroup(int32_t script) const {
118 int32_t index = getScriptIndex(script);
119 return index == 0 ? 0 : (uint32_t)scriptStarts[index] << 16;
120 }
121
122 uint32_t
getLastPrimaryForGroup(int32_t script) const123 CollationData::getLastPrimaryForGroup(int32_t script) const {
124 int32_t index = getScriptIndex(script);
125 if(index == 0) {
126 return 0;
127 }
128 uint32_t limit = scriptStarts[index + 1];
129 return (limit << 16) - 1;
130 }
131
132 int32_t
getGroupForPrimary(uint32_t p) const133 CollationData::getGroupForPrimary(uint32_t p) const {
134 p >>= 16;
135 if(p < scriptStarts[1] || scriptStarts[scriptStartsLength - 1] <= p) {
136 return -1;
137 }
138 int32_t index = 1;
139 while(p >= scriptStarts[index + 1]) { ++index; }
140 for(int32_t i = 0; i < numScripts; ++i) {
141 if(scriptsIndex[i] == index) {
142 return i;
143 }
144 }
145 for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
146 if(scriptsIndex[numScripts + i] == index) {
147 return UCOL_REORDER_CODE_FIRST + i;
148 }
149 }
150 return -1;
151 }
152
153 int32_t
getScriptIndex(int32_t script) const154 CollationData::getScriptIndex(int32_t script) const {
155 if(script < 0) {
156 return 0;
157 } else if(script < numScripts) {
158 return scriptsIndex[script];
159 } else if(script < UCOL_REORDER_CODE_FIRST) {
160 return 0;
161 } else {
162 script -= UCOL_REORDER_CODE_FIRST;
163 if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
164 return scriptsIndex[numScripts + script];
165 } else {
166 return 0;
167 }
168 }
169 }
170
171 int32_t
getEquivalentScripts(int32_t script,int32_t dest[],int32_t capacity,UErrorCode & errorCode) const172 CollationData::getEquivalentScripts(int32_t script,
173 int32_t dest[], int32_t capacity,
174 UErrorCode &errorCode) const {
175 if(U_FAILURE(errorCode)) { return 0; }
176 int32_t index = getScriptIndex(script);
177 if(index == 0) { return 0; }
178 if(script >= UCOL_REORDER_CODE_FIRST) {
179 // Special groups have no aliases.
180 if(capacity > 0) {
181 dest[0] = script;
182 } else {
183 errorCode = U_BUFFER_OVERFLOW_ERROR;
184 }
185 return 1;
186 }
187
188 int32_t length = 0;
189 for(int32_t i = 0; i < numScripts; ++i) {
190 if(scriptsIndex[i] == index) {
191 if(length < capacity) {
192 dest[length] = i;
193 }
194 ++length;
195 }
196 }
197 if(length > capacity) {
198 errorCode = U_BUFFER_OVERFLOW_ERROR;
199 }
200 return length;
201 }
202
203 void
makeReorderRanges(const int32_t * reorder,int32_t length,UVector32 & ranges,UErrorCode & errorCode) const204 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
205 UVector32 &ranges, UErrorCode &errorCode) const {
206 makeReorderRanges(reorder, length, FALSE, ranges, errorCode);
207 }
208
209 void
makeReorderRanges(const int32_t * reorder,int32_t length,UBool latinMustMove,UVector32 & ranges,UErrorCode & errorCode) const210 CollationData::makeReorderRanges(const int32_t *reorder, int32_t length,
211 UBool latinMustMove,
212 UVector32 &ranges, UErrorCode &errorCode) const {
213 if(U_FAILURE(errorCode)) { return; }
214 ranges.removeAllElements();
215 if(length == 0 || (length == 1 && reorder[0] == USCRIPT_UNKNOWN)) {
216 return;
217 }
218
219 // Maps each script-or-group range to a new lead byte.
220 uint8_t table[MAX_NUM_SCRIPT_RANGES];
221 uprv_memset(table, 0, sizeof(table));
222
223 {
224 // Set "don't care" values for reserved ranges.
225 int32_t index = scriptsIndex[
226 numScripts + REORDER_RESERVED_BEFORE_LATIN - UCOL_REORDER_CODE_FIRST];
227 if(index != 0) {
228 table[index] = 0xff;
229 }
230 index = scriptsIndex[
231 numScripts + REORDER_RESERVED_AFTER_LATIN - UCOL_REORDER_CODE_FIRST];
232 if(index != 0) {
233 table[index] = 0xff;
234 }
235 }
236
237 // Never reorder special low and high primary lead bytes.
238 U_ASSERT(scriptStartsLength >= 2);
239 U_ASSERT(scriptStarts[0] == 0);
240 int32_t lowStart = scriptStarts[1];
241 U_ASSERT(lowStart == ((Collation::MERGE_SEPARATOR_BYTE + 1) << 8));
242 int32_t highLimit = scriptStarts[scriptStartsLength - 1];
243 U_ASSERT(highLimit == (Collation::TRAIL_WEIGHT_BYTE << 8));
244
245 // Get the set of special reorder codes in the input list.
246 // This supports a fixed number of special reorder codes;
247 // it works for data with codes beyond UCOL_REORDER_CODE_LIMIT.
248 uint32_t specials = 0;
249 for(int32_t i = 0; i < length; ++i) {
250 int32_t reorderCode = reorder[i] - UCOL_REORDER_CODE_FIRST;
251 if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
252 specials |= (uint32_t)1 << reorderCode;
253 }
254 }
255
256 // Start the reordering with the special low reorder codes that do not occur in the input.
257 for(int32_t i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
258 int32_t index = scriptsIndex[numScripts + i];
259 if(index != 0 && (specials & ((uint32_t)1 << i)) == 0) {
260 lowStart = addLowScriptRange(table, index, lowStart);
261 }
262 }
263
264 // Skip the reserved range before Latin if Latin is the first script,
265 // so that we do not move it unnecessarily.
266 int32_t skippedReserved = 0;
267 if(specials == 0 && reorder[0] == USCRIPT_LATIN && !latinMustMove) {
268 int32_t index = scriptsIndex[USCRIPT_LATIN];
269 U_ASSERT(index != 0);
270 int32_t start = scriptStarts[index];
271 U_ASSERT(lowStart <= start);
272 skippedReserved = start - lowStart;
273 lowStart = start;
274 }
275
276 // Reorder according to the input scripts, continuing from the bottom of the primary range.
277 int32_t originalLength = length; // length will be decremented if "others" is in the list.
278 UBool hasReorderToEnd = FALSE;
279 for(int32_t i = 0; i < length;) {
280 int32_t script = reorder[i++];
281 if(script == USCRIPT_UNKNOWN) {
282 // Put the remaining scripts at the top.
283 hasReorderToEnd = TRUE;
284 while(i < length) {
285 script = reorder[--length];
286 if(script == USCRIPT_UNKNOWN || // Must occur at most once.
287 script == UCOL_REORDER_CODE_DEFAULT) {
288 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
289 return;
290 }
291 int32_t index = getScriptIndex(script);
292 if(index == 0) { continue; }
293 if(table[index] != 0) { // Duplicate or equivalent script.
294 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
295 return;
296 }
297 highLimit = addHighScriptRange(table, index, highLimit);
298 }
299 break;
300 }
301 if(script == UCOL_REORDER_CODE_DEFAULT) {
302 // The default code must be the only one in the list, and that is handled by the caller.
303 // Otherwise it must not be used.
304 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
305 return;
306 }
307 int32_t index = getScriptIndex(script);
308 if(index == 0) { continue; }
309 if(table[index] != 0) { // Duplicate or equivalent script.
310 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
311 return;
312 }
313 lowStart = addLowScriptRange(table, index, lowStart);
314 }
315
316 // Put all remaining scripts into the middle.
317 for(int32_t i = 1; i < scriptStartsLength - 1; ++i) {
318 int32_t leadByte = table[i];
319 if(leadByte != 0) { continue; }
320 int32_t start = scriptStarts[i];
321 if(!hasReorderToEnd && start > lowStart) {
322 // No need to move this script.
323 lowStart = start;
324 }
325 lowStart = addLowScriptRange(table, i, lowStart);
326 }
327 if(lowStart > highLimit) {
328 if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
329 // Try not skipping the before-Latin reserved range.
330 makeReorderRanges(reorder, originalLength, TRUE, ranges, errorCode);
331 return;
332 }
333 // We need more primary lead bytes than available, despite the reserved ranges.
334 errorCode = U_BUFFER_OVERFLOW_ERROR;
335 return;
336 }
337
338 // Turn lead bytes into a list of (limit, offset) pairs.
339 // Encode each pair in one list element:
340 // Upper 16 bits = limit, lower 16 = signed lead byte offset.
341 int32_t offset = 0;
342 for(int32_t i = 1;; ++i) {
343 int32_t nextOffset = offset;
344 while(i < scriptStartsLength - 1) {
345 int32_t newLeadByte = table[i];
346 if(newLeadByte == 0xff) {
347 // "Don't care" lead byte for reserved range, continue with current offset.
348 } else {
349 nextOffset = newLeadByte - (scriptStarts[i] >> 8);
350 if(nextOffset != offset) { break; }
351 }
352 ++i;
353 }
354 if(offset != 0 || i < scriptStartsLength - 1) {
355 ranges.addElement(((int32_t)scriptStarts[i] << 16) | (offset & 0xffff), errorCode);
356 }
357 if(i == scriptStartsLength - 1) { break; }
358 offset = nextOffset;
359 }
360 }
361
362 int32_t
addLowScriptRange(uint8_t table[],int32_t index,int32_t lowStart) const363 CollationData::addLowScriptRange(uint8_t table[], int32_t index, int32_t lowStart) const {
364 int32_t start = scriptStarts[index];
365 if((start & 0xff) < (lowStart & 0xff)) {
366 lowStart += 0x100;
367 }
368 table[index] = (uint8_t)(lowStart >> 8);
369 int32_t limit = scriptStarts[index + 1];
370 lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff);
371 return lowStart;
372 }
373
374 int32_t
addHighScriptRange(uint8_t table[],int32_t index,int32_t highLimit) const375 CollationData::addHighScriptRange(uint8_t table[], int32_t index, int32_t highLimit) const {
376 int32_t limit = scriptStarts[index + 1];
377 if((limit & 0xff) > (highLimit & 0xff)) {
378 highLimit -= 0x100;
379 }
380 int32_t start = scriptStarts[index];
381 highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff);
382 table[index] = (uint8_t)(highLimit >> 8);
383 return highLimit;
384 }
385
386 U_NAMESPACE_END
387
388 #endif // !UCONFIG_NO_COLLATION
389