1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  ******************************************************************************
5  *   Copyright (C) 1996-2016, International Business Machines
6  *   Corporation and others.  All Rights Reserved.
7  ******************************************************************************
8  */
9 
10 #include "unicode/utypes.h"
11 
12 #if !UCONFIG_NO_COLLATION
13 
14 #include "unicode/unistr.h"
15 #include "unicode/usearch.h"
16 
17 #include "cmemory.h"
18 #include "unicode/coll.h"
19 #include "unicode/tblcoll.h"
20 #include "unicode/coleitr.h"
21 #include "unicode/ucoleitr.h"
22 
23 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
24 
25 #include "unicode/uniset.h"
26 #include "unicode/uset.h"
27 #include "unicode/usetiter.h"
28 #include "unicode/ustring.h"
29 #include "hash.h"
30 #include "normalizer2impl.h"
31 #include "uhash.h"
32 #include "usrchimp.h"
33 #include "uassert.h"
34 
35 #include "colldata.h"
36 
37 #define NEW_ARRAY(type, count) (type *) uprv_malloc((size_t)(count) * sizeof(type))
38 #define DELETE_ARRAY(array) uprv_free((void *) (array))
39 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (size_t)(count) * sizeof (src)[0])
40 
CEList(UCollator * coll,const UnicodeString & string,UErrorCode & status)41 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
42     : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
43 {
44     UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
45     UCollationStrength strength = ucol_getStrength(coll);
46     UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==  UCOL_SHIFTED;
47     uint32_t variableTop = ucol_getVariableTop(coll, &status);
48     uint32_t strengthMask = 0;
49     int32_t order;
50 
51     if (U_FAILURE(status)) {
52         return;
53     }
54 
55     // **** only set flag if string has Han(gul) ****
56     // ucol_forceHanImplicit(elems, &status); -- removed for ticket #10476
57 
58     switch (strength)
59     {
60     default:
61         strengthMask |= UCOL_TERTIARYORDERMASK;
62         U_FALLTHROUGH;
63     case UCOL_SECONDARY:
64         strengthMask |= UCOL_SECONDARYORDERMASK;
65         U_FALLTHROUGH;
66     case UCOL_PRIMARY:
67         strengthMask |= UCOL_PRIMARYORDERMASK;
68     }
69 
70     ces = ceBuffer;
71 
72     while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
73         UBool cont = isContinuation(order);
74 
75         order &= strengthMask;
76 
77         if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
78             if (strength >= UCOL_QUATERNARY) {
79                 order &= UCOL_PRIMARYORDERMASK;
80             } else {
81                 order = UCOL_IGNORABLE;
82             }
83         }
84 
85         if (order == UCOL_IGNORABLE) {
86             continue;
87         }
88 
89         if (cont) {
90             order |= UCOL_CONTINUATION_MARKER;
91         }
92 
93         add(order, status);
94     }
95 
96     ucol_closeElements(elems);
97 }
98 
~CEList()99 CEList::~CEList()
100 {
101     if (ces != ceBuffer) {
102         DELETE_ARRAY(ces);
103     }
104 }
105 
add(uint32_t ce,UErrorCode & status)106 void CEList::add(uint32_t ce, UErrorCode &status)
107 {
108     if (U_FAILURE(status)) {
109         return;
110     }
111 
112     if (listSize >= listMax) {
113         int32_t newMax = listMax + CELIST_BUFFER_SIZE;
114         uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
115 
116         if (newCEs == NULL) {
117             status = U_MEMORY_ALLOCATION_ERROR;
118             return;
119         }
120 
121         uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
122 
123         if (ces != ceBuffer) {
124             DELETE_ARRAY(ces);
125         }
126 
127         ces = newCEs;
128         listMax = newMax;
129     }
130 
131     ces[listSize++] = ce;
132 }
133 
get(int32_t index) const134 uint32_t CEList::get(int32_t index) const
135 {
136     if (index >= 0 && index < listSize) {
137         return ces[index];
138     }
139 
140     return (uint32_t)UCOL_NULLORDER;
141 }
142 
operator [](int32_t index) const143 uint32_t &CEList::operator[](int32_t index) const
144 {
145     return ces[index];
146 }
147 
matchesAt(int32_t offset,const CEList * other) const148 UBool CEList::matchesAt(int32_t offset, const CEList *other) const
149 {
150     if (other == NULL || listSize - offset < other->size()) {
151         return FALSE;
152     }
153 
154     for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
155         if (ces[i] != (*other)[j]) {
156             return FALSE;
157         }
158     }
159 
160     return TRUE;
161 }
162 
size() const163 int32_t CEList::size() const
164 {
165     return listSize;
166 }
167 
StringList(UErrorCode & status)168 StringList::StringList(UErrorCode &status)
169     : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
170 {
171     if (U_FAILURE(status)) {
172         return;
173     }
174 
175     strings = new UnicodeString [listMax];
176 
177     if (strings == NULL) {
178         status = U_MEMORY_ALLOCATION_ERROR;
179         return;
180     }
181 }
182 
~StringList()183 StringList::~StringList()
184 {
185     delete[] strings;
186 }
187 
add(const UnicodeString * string,UErrorCode & status)188 void StringList::add(const UnicodeString *string, UErrorCode &status)
189 {
190     if (U_FAILURE(status)) {
191         return;
192     }
193     if (listSize >= listMax) {
194         int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
195         UnicodeString *newStrings = new UnicodeString[newMax];
196         if (newStrings == NULL) {
197             status = U_MEMORY_ALLOCATION_ERROR;
198             return;
199         }
200         for (int32_t i=0; i<listSize; ++i) {
201             newStrings[i] = strings[i];
202         }
203         delete[] strings;
204         strings = newStrings;
205         listMax = newMax;
206     }
207 
208     // The ctor initialized all the strings in
209     // the array to empty strings, so this
210     // is the same as copying the source string.
211     strings[listSize++].append(*string);
212 }
213 
add(const UChar * chars,int32_t count,UErrorCode & status)214 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
215 {
216     const UnicodeString string(chars, count);
217 
218     add(&string, status);
219 }
220 
get(int32_t index) const221 const UnicodeString *StringList::get(int32_t index) const
222 {
223     if (index >= 0 && index < listSize) {
224         return &strings[index];
225     }
226 
227     return NULL;
228 }
229 
size() const230 int32_t StringList::size() const
231 {
232     return listSize;
233 }
234 
235 
236 U_CDECL_BEGIN
237 static void U_CALLCONV
deleteStringList(void * obj)238 deleteStringList(void *obj)
239 {
240     StringList *strings = (StringList *) obj;
241 
242     delete strings;
243 }
244 U_CDECL_END
245 
246 class CEToStringsMap
247 {
248 public:
249     CEToStringsMap(UErrorCode &status);
250     ~CEToStringsMap();
251 
252     void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
253     StringList *getStringList(uint32_t ce) const;
254 
255 private:
256     void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
257     UHashtable *map;
258 };
259 
CEToStringsMap(UErrorCode & status)260 CEToStringsMap::CEToStringsMap(UErrorCode &status)
261     : map(NULL)
262 {
263     if (U_FAILURE(status)) {
264         return;
265     }
266 
267     map = uhash_open(uhash_hashLong, uhash_compareLong,
268                      uhash_compareCaselessUnicodeString,
269                      &status);
270 
271     if (U_FAILURE(status)) {
272         return;
273     }
274 
275     uhash_setValueDeleter(map, deleteStringList);
276 }
277 
~CEToStringsMap()278 CEToStringsMap::~CEToStringsMap()
279 {
280     uhash_close(map);
281 }
282 
put(uint32_t ce,UnicodeString * string,UErrorCode & status)283 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
284 {
285     StringList *strings = getStringList(ce);
286 
287     if (strings == NULL) {
288         strings = new StringList(status);
289 
290         if (strings == NULL || U_FAILURE(status)) {
291             status = U_MEMORY_ALLOCATION_ERROR;
292             return;
293         }
294 
295         putStringList(ce, strings, status);
296     }
297 
298     strings->add(string, status);
299 }
300 
getStringList(uint32_t ce) const301 StringList *CEToStringsMap::getStringList(uint32_t ce) const
302 {
303     return (StringList *) uhash_iget(map, ce);
304 }
305 
putStringList(uint32_t ce,StringList * stringList,UErrorCode & status)306 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
307 {
308     uhash_iput(map, ce, (void *) stringList, &status);
309 }
310 
311 #define CLONE_COLLATOR
312 
CollData(UCollator * collator,UErrorCode & status)313 CollData::CollData(UCollator *collator, UErrorCode &status)
314     : coll(NULL), ceToCharsStartingWith(NULL)
315 {
316     // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
317     // i.e. other, control, private use, format, surrogate
318     U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
319     U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
320     USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
321 
322     // Han ext. A, Han, Jamo, Hangul, Han Ext. B
323     // i.e. all the characers we handle implicitly
324     U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
325     U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
326     USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
327 
328     if (U_FAILURE(status)) {
329         return;
330     }
331 
332     USet *expansions   = uset_openEmpty();
333     USet *contractions = uset_openEmpty();
334     int32_t itemCount;
335 
336     ceToCharsStartingWith = new CEToStringsMap(status);
337 
338     if (U_FAILURE(status)) {
339         goto bail;
340     }
341 
342 #ifdef CLONE_COLLATOR
343     coll = ucol_safeClone(collator, NULL, NULL, &status);
344 
345     if (U_FAILURE(status)) {
346         goto bail;
347     }
348 #else
349     coll = collator;
350 #endif
351 
352     ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
353 
354     uset_addAll(charsToTest, contractions);
355     uset_addAll(charsToTest, expansions);
356     uset_removeAll(charsToTest, charsToRemove);
357 
358     itemCount = uset_getItemCount(charsToTest);
359     for(int32_t item = 0; item < itemCount; item += 1) {
360         UChar32 start = 0, end = 0;
361         UChar buffer[16];
362         int32_t len = uset_getItem(charsToTest, item, &start, &end,
363                                    buffer, 16, &status);
364 
365         if (len == 0) {
366             for (UChar32 ch = start; ch <= end; ch += 1) {
367                 UnicodeString *st = new UnicodeString(ch);
368 
369                 if (st == NULL) {
370                     status = U_MEMORY_ALLOCATION_ERROR;
371                     break;
372                 }
373 
374                 CEList *ceList = new CEList(coll, *st, status);
375 
376                 ceToCharsStartingWith->put(ceList->get(0), st, status);
377 
378                 delete ceList;
379                 delete st;
380             }
381         } else if (len > 0) {
382             UnicodeString *st = new UnicodeString(buffer, len);
383 
384             if (st == NULL) {
385                 status = U_MEMORY_ALLOCATION_ERROR;
386                 break;
387             }
388 
389             CEList *ceList = new CEList(coll, *st, status);
390 
391             ceToCharsStartingWith->put(ceList->get(0), st, status);
392 
393             delete ceList;
394             delete st;
395         } else {
396             // shouldn't happen...
397         }
398 
399         if (U_FAILURE(status)) {
400              break;
401         }
402     }
403 
404 bail:
405     uset_close(contractions);
406     uset_close(expansions);
407     uset_close(charsToRemove);
408     uset_close(charsToTest);
409 
410     if (U_FAILURE(status)) {
411         return;
412     }
413 
414     UnicodeSet hanRanges(UNICODE_STRING_SIMPLE("[:Unified_Ideograph:]"), status);
415     if (U_FAILURE(status)) {
416         return;
417     }
418     UnicodeSetIterator hanIter(hanRanges);
419     UnicodeString hanString;
420     while(hanIter.nextRange()) {
421         hanString.append(hanIter.getCodepoint());
422         hanString.append(hanIter.getCodepointEnd());
423     }
424     // TODO: Why U+11FF? The old code had an outdated UCOL_LAST_T_JAMO=0x11F9,
425     // but as of Unicode 6.3 the 11xx block is filled,
426     // and there are also more Jamo T at U+D7CB..U+D7FB.
427     // Maybe use [:HST=T:] and look for the end of the last range?
428     // Maybe use script boundary mappings instead of this code??
429     UChar  jamoRanges[] = {Hangul::JAMO_L_BASE, Hangul::JAMO_V_BASE, Hangul::JAMO_T_BASE + 1, 0x11FF};
430      UnicodeString jamoString(FALSE, jamoRanges, UPRV_LENGTHOF(jamoRanges));
431      CEList hanList(coll, hanString, status);
432      CEList jamoList(coll, jamoString, status);
433      int32_t j = 0;
434 
435      if (U_FAILURE(status)) {
436          return;
437      }
438 
439      for (int32_t c = 0; c < jamoList.size(); c += 1) {
440          uint32_t jce = jamoList[c];
441 
442          if (! isContinuation(jce)) {
443              jamoLimits[j++] = jce;
444          }
445      }
446 
447      jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
448 
449      minHan = 0xFFFFFFFF;
450      maxHan = 0;
451 
452      for(int32_t h = 0; h < hanList.size(); h += 2) {
453          uint32_t han = (uint32_t) hanList[h];
454 
455          if (han < minHan) {
456              minHan = han;
457          }
458 
459          if (han > maxHan) {
460              maxHan = han;
461          }
462      }
463 
464      maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
465 }
466 
~CollData()467 CollData::~CollData()
468 {
469 #ifdef CLONE_COLLATOR
470    ucol_close(coll);
471 #endif
472 
473    delete ceToCharsStartingWith;
474 }
475 
getCollator() const476 UCollator *CollData::getCollator() const
477 {
478     return coll;
479 }
480 
getStringList(int32_t ce) const481 const StringList *CollData::getStringList(int32_t ce) const
482 {
483     return ceToCharsStartingWith->getStringList(ce);
484 }
485 
getCEList(const UnicodeString * string) const486 const CEList *CollData::getCEList(const UnicodeString *string) const
487 {
488     UErrorCode status = U_ZERO_ERROR;
489     const CEList *list = new CEList(coll, *string, status);
490 
491     if (U_FAILURE(status)) {
492         delete list;
493         list = NULL;
494     }
495 
496     return list;
497 }
498 
freeCEList(const CEList * list)499 void CollData::freeCEList(const CEList *list)
500 {
501     delete list;
502 }
503 
minLengthInChars(const CEList * ceList,int32_t offset,int32_t * history) const504 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
505 {
506     // find out shortest string for the longest sequence of ces.
507     // this can probably be folded with the minLengthCache...
508 
509     if (history[offset] >= 0) {
510         return history[offset];
511     }
512 
513     uint32_t ce = ceList->get(offset);
514     int32_t maxOffset = ceList->size();
515     int32_t shortestLength = INT32_MAX;
516     const StringList *strings = ceToCharsStartingWith->getStringList(ce);
517 
518     if (strings != NULL) {
519         int32_t stringCount = strings->size();
520 
521         for (int32_t s = 0; s < stringCount; s += 1) {
522             const UnicodeString *string = strings->get(s);
523             UErrorCode status = U_ZERO_ERROR;
524             const CEList *ceList2 = new CEList(coll, *string, status);
525 
526             if (U_FAILURE(status)) {
527                 delete ceList2;
528                 ceList2 = NULL;
529             }
530 
531             if (ceList->matchesAt(offset, ceList2)) {
532                 U_ASSERT(ceList2 != NULL);
533                 int32_t clength = ceList2->size();
534                 int32_t slength = string->length();
535                 int32_t roffset = offset + clength;
536                 int32_t rlength = 0;
537 
538                 if (roffset < maxOffset) {
539                     rlength = minLengthInChars(ceList, roffset, history);
540 
541                     if (rlength <= 0) {
542                     // delete before continue to avoid memory leak.
543                         delete ceList2;
544 
545                         // ignore any dead ends
546                         continue;
547                     }
548                 }
549 
550                 if (shortestLength > slength + rlength) {
551                     shortestLength = slength + rlength;
552                 }
553             }
554 
555             delete ceList2;
556         }
557     }
558 
559     if (shortestLength == INT32_MAX) {
560         // No matching strings at this offset. See if
561         // the CE is in a range we can handle manually.
562         if (ce >= minHan && ce < maxHan) {
563             // all han have implicit orders which
564             // generate two CEs.
565             int32_t roffset = offset + 2;
566             int32_t rlength = 0;
567 
568           //history[roffset++] = -1;
569           //history[roffset++] = 1;
570 
571             if (roffset < maxOffset) {
572                 rlength = minLengthInChars(ceList, roffset, history);
573             }
574 
575             if (rlength < 0) {
576                 return -1;
577             }
578 
579             shortestLength = 1 + rlength;
580             goto have_shortest;
581         } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
582             int32_t roffset = offset;
583             int32_t rlength = 0;
584 
585             // **** this loop may not handle archaic Hangul correctly ****
586             for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
587                 uint32_t jce = ceList->get(roffset);
588 
589                 // Some Jamo have 24-bit primary order; skip the
590                 // 2nd CE. This should always be OK because if
591                 // we're still in the loop all we've seen are
592                 // a series of Jamo in LVT order.
593                 if (isContinuation(jce)) {
594                     continue;
595                 }
596 
597                 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
598                     break;
599                 }
600             }
601 
602             if (roffset == offset) {
603                 // we started with a non-L Jamo...
604                 // just say it comes from a single character
605                 roffset += 1;
606 
607                 // See if the single Jamo has a 24-bit order.
608                 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
609                     roffset += 1;
610                 }
611             }
612 
613             if (roffset < maxOffset) {
614                 rlength = minLengthInChars(ceList, roffset, history);
615             }
616 
617             if (rlength < 0) {
618                 return -1;
619             }
620 
621             shortestLength = 1 + rlength;
622             goto have_shortest;
623         }
624 
625         // Can't handle it manually either. Just move on.
626         return -1;
627     }
628 
629 have_shortest:
630     history[offset] = shortestLength;
631 
632     return shortestLength;
633 }
634 
minLengthInChars(const CEList * ceList,int32_t offset) const635 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
636 {
637     int32_t clength = ceList->size();
638     int32_t *history = NEW_ARRAY(int32_t, clength);
639 
640     for (int32_t i = 0; i < clength; i += 1) {
641         history[i] = -1;
642     }
643 
644     int32_t minLength = minLengthInChars(ceList, offset, history);
645 
646     DELETE_ARRAY(history);
647 
648     return minLength;
649 }
650 
651 #endif // #if !UCONFIG_NO_COLLATION
652