1 /*
2 **********************************************************************
3 *   Copyright (C) 2001-2014 IBM and others. All rights reserved.
4 **********************************************************************
5 *   Date        Name        Description
6 *  07/02/2001   synwee      Creation.
7 **********************************************************************
8 */
9 
10 #include "unicode/utypes.h"
11 
12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
13 
14 #include "unicode/usearch.h"
15 #include "unicode/ustring.h"
16 #include "unicode/uchar.h"
17 #include "unicode/utf16.h"
18 #include "normalizer2impl.h"
19 #include "usrchimp.h"
20 #include "cmemory.h"
21 #include "ucln_in.h"
22 #include "uassert.h"
23 #include "ustr_imp.h"
24 
25 U_NAMESPACE_USE
26 
27 // don't use Boyer-Moore
28 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
29 #define BOYER_MOORE 0
30 
31 // internal definition ---------------------------------------------------
32 
33 #define LAST_BYTE_MASK_          0xFF
34 #define SECOND_LAST_BYTE_SHIFT_  8
35 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
36 
37 static const Normalizer2Impl *g_nfcImpl = NULL;
38 
39 // internal methods -------------------------------------------------
40 
41 /**
42 * Fast collation element iterator setOffset.
43 * This function does not check for bounds.
44 * @param coleiter collation element iterator
45 * @param offset to set
46 */
47 static
setColEIterOffset(UCollationElements * elems,int32_t offset)48 inline void setColEIterOffset(UCollationElements *elems,
49                       int32_t             offset)
50 {
51     // Note: Not "fast" any more after the 2013 collation rewrite.
52     // We do not want to expose more internals than necessary.
53     UErrorCode status = U_ZERO_ERROR;
54     ucol_setOffset(elems, offset, &status);
55 }
56 
57 /**
58 * Getting the mask for collation strength
59 * @param strength collation strength
60 * @return collation element mask
61 */
62 static
getMask(UCollationStrength strength)63 inline uint32_t getMask(UCollationStrength strength)
64 {
65     switch (strength)
66     {
67     case UCOL_PRIMARY:
68         return UCOL_PRIMARYORDERMASK;
69     case UCOL_SECONDARY:
70         return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
71     default:
72         return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
73                UCOL_PRIMARYORDERMASK;
74     }
75 }
76 
77 /**
78 * This is to squeeze the 21bit ces into a 256 table
79 * @param ce collation element
80 * @return collapsed version of the collation element
81 */
82 static
hash(uint32_t ce)83 inline int hash(uint32_t ce)
84 {
85     // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
86     // well with the new collation where most of the latin 1 characters
87     // are of the value xx000xxx. their hashes will most of the time be 0
88     // to be discussed on the hash algo.
89     return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
90 }
91 
92 U_CDECL_BEGIN
93 static UBool U_CALLCONV
usearch_cleanup(void)94 usearch_cleanup(void) {
95     g_nfcImpl = NULL;
96     return TRUE;
97 }
98 U_CDECL_END
99 
100 /**
101 * Initializing the fcd tables.
102 * Internal method, status assumed to be a success.
103 * @param status output error if any, caller to check status before calling
104 *               method, status assumed to be success when passed in.
105 */
106 static
initializeFCD(UErrorCode * status)107 inline void initializeFCD(UErrorCode *status)
108 {
109     if (g_nfcImpl == NULL) {
110         g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
111         ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
112     }
113 }
114 
115 /**
116 * Gets the fcd value for a character at the argument index.
117 * This method takes into accounts of the supplementary characters.
118 * @param str UTF16 string where character for fcd retrieval resides
119 * @param offset position of the character whose fcd is to be retrieved, to be
120 *               overwritten with the next character position, taking
121 *               surrogate characters into consideration.
122 * @param strlength length of the argument string
123 * @return fcd value
124 */
125 static
getFCD(const UChar * str,int32_t * offset,int32_t strlength)126 uint16_t getFCD(const UChar   *str, int32_t *offset,
127                              int32_t  strlength)
128 {
129     const UChar *temp = str + *offset;
130     uint16_t    result = g_nfcImpl->nextFCD16(temp, str + strlength);
131     *offset = (int32_t)(temp - str);
132     return result;
133 }
134 
135 /**
136 * Getting the modified collation elements taking into account the collation
137 * attributes
138 * @param strsrch string search data
139 * @param sourcece
140 * @return the modified collation element
141 */
142 static
getCE(const UStringSearch * strsrch,uint32_t sourcece)143 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
144 {
145     // note for tertiary we can't use the collator->tertiaryMask, that
146     // is a preprocessed mask that takes into account case options. since
147     // we are only concerned with exact matches, we don't need that.
148     sourcece &= strsrch->ceMask;
149 
150     if (strsrch->toShift) {
151         // alternate handling here, since only the 16 most significant digits
152         // is only used, we can safely do a compare without masking
153         // if the ce is a variable, we mask and get only the primary values
154         // no shifting to quartenary is required since all primary values
155         // less than variabletop will need to be masked off anyway.
156         if (strsrch->variableTop > sourcece) {
157             if (strsrch->strength >= UCOL_QUATERNARY) {
158                 sourcece &= UCOL_PRIMARYORDERMASK;
159             }
160             else {
161                 sourcece = UCOL_IGNORABLE;
162             }
163         }
164     } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
165         sourcece = 0xFFFF;
166     }
167 
168     return sourcece;
169 }
170 
171 /**
172 * Allocate a memory and returns NULL if it failed.
173 * Internal method, status assumed to be a success.
174 * @param size to allocate
175 * @param status output error if any, caller to check status before calling
176 *               method, status assumed to be success when passed in.
177 * @return newly allocated array, NULL otherwise
178 */
179 static
allocateMemory(uint32_t size,UErrorCode * status)180 inline void * allocateMemory(uint32_t size, UErrorCode *status)
181 {
182     uint32_t *result = (uint32_t *)uprv_malloc(size);
183     if (result == NULL) {
184         *status = U_MEMORY_ALLOCATION_ERROR;
185     }
186     return result;
187 }
188 
189 /**
190 * Adds a uint32_t value to a destination array.
191 * Creates a new array if we run out of space. The caller will have to
192 * manually deallocate the newly allocated array.
193 * Internal method, status assumed to be success, caller has to check status
194 * before calling this method. destination not to be NULL and has at least
195 * size destinationlength.
196 * @param destination target array
197 * @param offset destination offset to add value
198 * @param destinationlength target array size, return value for the new size
199 * @param value to be added
200 * @param increments incremental size expected
201 * @param status output error if any, caller to check status before calling
202 *               method, status assumed to be success when passed in.
203 * @return new destination array, destination if there was no new allocation
204 */
205 static
addTouint32_tArray(int32_t * destination,uint32_t offset,uint32_t * destinationlength,uint32_t value,uint32_t increments,UErrorCode * status)206 inline int32_t * addTouint32_tArray(int32_t    *destination,
207                                     uint32_t    offset,
208                                     uint32_t   *destinationlength,
209                                     uint32_t    value,
210                                     uint32_t    increments,
211                                     UErrorCode *status)
212 {
213     uint32_t newlength = *destinationlength;
214     if (offset + 1 == newlength) {
215         newlength += increments;
216         int32_t *temp = (int32_t *)allocateMemory(
217                                          sizeof(int32_t) * newlength, status);
218         if (U_FAILURE(*status)) {
219             return NULL;
220         }
221         uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
222         *destinationlength = newlength;
223         destination        = temp;
224     }
225     destination[offset] = value;
226     return destination;
227 }
228 
229 /**
230 * Adds a uint64_t value to a destination array.
231 * Creates a new array if we run out of space. The caller will have to
232 * manually deallocate the newly allocated array.
233 * Internal method, status assumed to be success, caller has to check status
234 * before calling this method. destination not to be NULL and has at least
235 * size destinationlength.
236 * @param destination target array
237 * @param offset destination offset to add value
238 * @param destinationlength target array size, return value for the new size
239 * @param value to be added
240 * @param increments incremental size expected
241 * @param status output error if any, caller to check status before calling
242 *               method, status assumed to be success when passed in.
243 * @return new destination array, destination if there was no new allocation
244 */
245 static
addTouint64_tArray(int64_t * destination,uint32_t offset,uint32_t * destinationlength,uint64_t value,uint32_t increments,UErrorCode * status)246 inline int64_t * addTouint64_tArray(int64_t    *destination,
247                                     uint32_t    offset,
248                                     uint32_t   *destinationlength,
249                                     uint64_t    value,
250                                     uint32_t    increments,
251                                     UErrorCode *status)
252 {
253     uint32_t newlength = *destinationlength;
254     if (offset + 1 == newlength) {
255         newlength += increments;
256         int64_t *temp = (int64_t *)allocateMemory(
257                                          sizeof(int64_t) * newlength, status);
258 
259         if (U_FAILURE(*status)) {
260             return NULL;
261         }
262 
263         uprv_memcpy(temp, destination, sizeof(int64_t) * offset);
264         *destinationlength = newlength;
265         destination        = temp;
266     }
267 
268     destination[offset] = value;
269 
270     return destination;
271 }
272 
273 /**
274 * Initializing the ce table for a pattern.
275 * Stores non-ignorable collation keys.
276 * Table size will be estimated by the size of the pattern text. Table
277 * expansion will be perform as we go along. Adding 1 to ensure that the table
278 * size definitely increases.
279 * Internal method, status assumed to be a success.
280 * @param strsrch string search data
281 * @param status output error if any, caller to check status before calling
282 *               method, status assumed to be success when passed in.
283 * @return total number of expansions
284 */
285 static
initializePatternCETable(UStringSearch * strsrch,UErrorCode * status)286 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
287                                          UErrorCode    *status)
288 {
289     UPattern *pattern            = &(strsrch->pattern);
290     uint32_t  cetablesize        = INITIAL_ARRAY_SIZE_;
291     int32_t  *cetable            = pattern->cesBuffer;
292     uint32_t  patternlength      = pattern->textLength;
293     UCollationElements *coleiter = strsrch->utilIter;
294 
295     if (coleiter == NULL) {
296         coleiter = ucol_openElements(strsrch->collator, pattern->text,
297                                      patternlength, status);
298         // status will be checked in ucol_next(..) later and if it is an
299         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
300         // returned.
301         strsrch->utilIter = coleiter;
302     }
303     else {
304         ucol_setText(coleiter, pattern->text, pattern->textLength, status);
305     }
306     if(U_FAILURE(*status)) {
307         return 0;
308     }
309 
310     if (pattern->ces != cetable && pattern->ces) {
311         uprv_free(pattern->ces);
312     }
313 
314     uint16_t  offset      = 0;
315     uint16_t  result      = 0;
316     int32_t   ce;
317 
318     while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
319            U_SUCCESS(*status)) {
320         uint32_t newce = getCE(strsrch, ce);
321         if (newce) {
322             int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
323                                   newce,
324                                   patternlength - ucol_getOffset(coleiter) + 1,
325                                   status);
326             if (U_FAILURE(*status)) {
327                 return 0;
328             }
329             offset ++;
330             if (cetable != temp && cetable != pattern->cesBuffer) {
331                 uprv_free(cetable);
332             }
333             cetable = temp;
334         }
335         result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
336     }
337 
338     cetable[offset]   = 0;
339     pattern->ces       = cetable;
340     pattern->cesLength = offset;
341 
342     return result;
343 }
344 
345 /**
346 * Initializing the pce table for a pattern.
347 * Stores non-ignorable collation keys.
348 * Table size will be estimated by the size of the pattern text. Table
349 * expansion will be perform as we go along. Adding 1 to ensure that the table
350 * size definitely increases.
351 * Internal method, status assumed to be a success.
352 * @param strsrch string search data
353 * @param status output error if any, caller to check status before calling
354 *               method, status assumed to be success when passed in.
355 * @return total number of expansions
356 */
357 static
initializePatternPCETable(UStringSearch * strsrch,UErrorCode * status)358 inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
359                                           UErrorCode    *status)
360 {
361     UPattern *pattern            = &(strsrch->pattern);
362     uint32_t  pcetablesize       = INITIAL_ARRAY_SIZE_;
363     int64_t  *pcetable           = pattern->pcesBuffer;
364     uint32_t  patternlength      = pattern->textLength;
365     UCollationElements *coleiter = strsrch->utilIter;
366 
367     if (coleiter == NULL) {
368         coleiter = ucol_openElements(strsrch->collator, pattern->text,
369                                      patternlength, status);
370         // status will be checked in ucol_next(..) later and if it is an
371         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
372         // returned.
373         strsrch->utilIter = coleiter;
374     } else {
375         ucol_setText(coleiter, pattern->text, pattern->textLength, status);
376     }
377     if(U_FAILURE(*status)) {
378         return 0;
379     }
380 
381     if (pattern->pces != pcetable && pattern->pces != NULL) {
382         uprv_free(pattern->pces);
383     }
384 
385     uint16_t  offset = 0;
386     uint16_t  result = 0;
387     int64_t   pce;
388 
389     icu::UCollationPCE iter(coleiter);
390 
391     // ** Should processed CEs be signed or unsigned?
392     // ** (the rest of the code in this file seems to play fast-and-loose with
393     // **  whether a CE is signed or unsigned. For example, look at routine above this one.)
394     while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
395            U_SUCCESS(*status)) {
396         int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
397                               pce,
398                               patternlength - ucol_getOffset(coleiter) + 1,
399                               status);
400 
401         if (U_FAILURE(*status)) {
402             return 0;
403         }
404 
405         offset += 1;
406 
407         if (pcetable != temp && pcetable != pattern->pcesBuffer) {
408             uprv_free(pcetable);
409         }
410 
411         pcetable = temp;
412         //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
413     }
414 
415     pcetable[offset]   = 0;
416     pattern->pces       = pcetable;
417     pattern->pcesLength = offset;
418 
419     return result;
420 }
421 
422 /**
423 * Initializes the pattern struct.
424 * Internal method, status assumed to be success.
425 * @param strsrch UStringSearch data storage
426 * @param status output error if any, caller to check status before calling
427 *               method, status assumed to be success when passed in.
428 * @return expansionsize the total expansion size of the pattern
429 */
430 static
initializePattern(UStringSearch * strsrch,UErrorCode * status)431 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
432 {
433     if (U_FAILURE(*status)) { return 0; }
434           UPattern   *pattern     = &(strsrch->pattern);
435     const UChar      *patterntext = pattern->text;
436           int32_t     length      = pattern->textLength;
437           int32_t index       = 0;
438 
439     // Since the strength is primary, accents are ignored in the pattern.
440     if (strsrch->strength == UCOL_PRIMARY) {
441         pattern->hasPrefixAccents = 0;
442         pattern->hasSuffixAccents = 0;
443     } else {
444         pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
445                                                          SECOND_LAST_BYTE_SHIFT_;
446         index = length;
447         U16_BACK_1(patterntext, 0, index);
448         pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
449                                                                  LAST_BYTE_MASK_;
450     }
451 
452     // ** HACK **
453     if (strsrch->pattern.pces != NULL) {
454         if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
455             uprv_free(strsrch->pattern.pces);
456         }
457 
458         strsrch->pattern.pces = NULL;
459     }
460 
461     // since intializePattern is an internal method status is a success.
462     return initializePatternCETable(strsrch, status);
463 }
464 
465 /**
466 * Initializing shift tables, with the default values.
467 * If a corresponding default value is 0, the shift table is not set.
468 * @param shift table for forwards shift
469 * @param backshift table for backwards shift
470 * @param cetable table containing pattern ce
471 * @param cesize size of the pattern ces
472 * @param expansionsize total size of the expansions
473 * @param defaultforward the default forward value
474 * @param defaultbackward the default backward value
475 */
476 static
setShiftTable(int16_t shift[],int16_t backshift[],int32_t * cetable,int32_t cesize,int16_t expansionsize,int16_t defaultforward,int16_t defaultbackward)477 inline void setShiftTable(int16_t   shift[], int16_t backshift[],
478                           int32_t  *cetable, int32_t cesize,
479                           int16_t   expansionsize,
480                           int16_t   defaultforward,
481                           int16_t   defaultbackward)
482 {
483     // estimate the value to shift. to do that we estimate the smallest
484     // number of characters to give the relevant ces, ie approximately
485     // the number of ces minus their expansion, since expansions can come
486     // from a character.
487     int32_t count;
488     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
489         shift[count] = defaultforward;
490     }
491     cesize --; // down to the last index
492     for (count = 0; count < cesize; count ++) {
493         // number of ces from right of array to the count
494         int temp = defaultforward - count - 1;
495         shift[hash(cetable[count])] = temp > 1 ? temp : 1;
496     }
497     shift[hash(cetable[cesize])] = 1;
498     // for ignorables we just shift by one. see test examples.
499     shift[hash(0)] = 1;
500 
501     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
502         backshift[count] = defaultbackward;
503     }
504     for (count = cesize; count > 0; count --) {
505         // the original value count does not seem to work
506         backshift[hash(cetable[count])] = count > expansionsize ?
507                                           (int16_t)(count - expansionsize) : 1;
508     }
509     backshift[hash(cetable[0])] = 1;
510     backshift[hash(0)] = 1;
511 }
512 
513 /**
514 * Building of the pattern collation element list and the boyer moore strsrch
515 * table.
516 * The canonical match will only be performed after the default match fails.
517 * For both cases we need to remember the size of the composed and decomposed
518 * versions of the string. Since the Boyer-Moore shift calculations shifts by
519 * a number of characters in the text and tries to match the pattern from that
520 * offset, the shift value can not be too large in case we miss some
521 * characters. To choose a right shift size, we estimate the NFC form of the
522 * and use its size as a shift guide. The NFC form should be the small
523 * possible representation of the pattern. Anyways, we'll err on the smaller
524 * shift size. Hence the calculation for minlength.
525 * Canonical match will be performed slightly differently. We'll split the
526 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
527 * the first and last base character (MS), the ending accents (EA). Matches
528 * will be done on MS first, and only when we match MS then some processing
529 * will be required for the prefix and end accents in order to determine if
530 * they match PA and EA. Hence the default shift values
531 * for the canonical match will take the size of either end's accent into
532 * consideration. Forwards search will take the end accents into consideration
533 * for the default shift values and the backwards search will take the prefix
534 * accents into consideration.
535 * If pattern has no non-ignorable ce, we return a illegal argument error.
536 * Internal method, status assumed to be success.
537 * @param strsrch UStringSearch data storage
538 * @param status  for output errors if it occurs, status is assumed to be a
539 *                success when it is passed in.
540 */
541 static
initialize(UStringSearch * strsrch,UErrorCode * status)542 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
543 {
544     int16_t expandlength  = initializePattern(strsrch, status);
545     if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) {
546         UPattern *pattern = &strsrch->pattern;
547         int32_t   cesize  = pattern->cesLength;
548 
549         int16_t minlength = cesize > expandlength
550                             ? (int16_t)cesize - expandlength : 1;
551         pattern->defaultShiftSize    = minlength;
552         setShiftTable(pattern->shift, pattern->backShift, pattern->ces,
553                       cesize, expandlength, minlength, minlength);
554         return;
555     }
556     strsrch->pattern.defaultShiftSize = 0;
557 }
558 
559 #if BOYER_MOORE
560 /**
561 * Check to make sure that the match length is at the end of the character by
562 * using the breakiterator.
563 * @param strsrch string search data
564 * @param start target text start offset
565 * @param end target text end offset
566 */
567 static
checkBreakBoundary(const UStringSearch * strsrch,int32_t *,int32_t * end)568 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
569                                int32_t *end)
570 {
571 #if !UCONFIG_NO_BREAK_ITERATION
572     UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
573     if (breakiterator) {
574         int32_t matchend = *end;
575         //int32_t matchstart = *start;
576 
577         if (!ubrk_isBoundary(breakiterator, matchend)) {
578             *end = ubrk_following(breakiterator, matchend);
579         }
580 
581         /* Check the start of the matched text to make sure it doesn't have any accents
582          * before it.  This code may not be necessary and so it is commented out */
583         /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
584             *start = ubrk_preceding(breakiterator, matchstart);
585         }*/
586     }
587 #endif
588 }
589 
590 /**
591 * Determine whether the target text in UStringSearch bounded by the offset
592 * start and end is one or more whole units of text as
593 * determined by the breakiterator in UStringSearch.
594 * @param strsrch string search data
595 * @param start target text start offset
596 * @param end target text end offset
597 */
598 static
isBreakUnit(const UStringSearch * strsrch,int32_t start,int32_t end)599 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
600                                int32_t    end)
601 {
602 #if !UCONFIG_NO_BREAK_ITERATION
603     UBreakIterator *breakiterator = strsrch->search->breakIter;
604     //TODO: Add here.
605     if (breakiterator) {
606         int32_t startindex = ubrk_first(breakiterator);
607         int32_t endindex   = ubrk_last(breakiterator);
608 
609         // out-of-range indexes are never boundary positions
610         if (start < startindex || start > endindex ||
611             end < startindex || end > endindex) {
612             return FALSE;
613         }
614         // otherwise, we can use following() on the position before the
615         // specified one and return true of the position we get back is the
616         // one the user specified
617         UBool result = (start == startindex ||
618                 ubrk_following(breakiterator, start - 1) == start) &&
619                (end == endindex ||
620                 ubrk_following(breakiterator, end - 1) == end);
621         if (result) {
622             // iterates the individual ces
623                   UCollationElements *coleiter  = strsrch->utilIter;
624             const UChar              *text      = strsrch->search->text +
625                                                                       start;
626                   UErrorCode          status    = U_ZERO_ERROR;
627             ucol_setText(coleiter, text, end - start, &status);
628             for (int32_t count = 0; count < strsrch->pattern.cesLength;
629                  count ++) {
630                 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
631                 if (ce == UCOL_IGNORABLE) {
632                     count --;
633                     continue;
634                 }
635                 if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) {
636                     return FALSE;
637                 }
638             }
639             int32_t nextce = ucol_next(coleiter, &status);
640             while (ucol_getOffset(coleiter) == (end - start)
641                    && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
642                 nextce = ucol_next(coleiter, &status);
643             }
644             if (ucol_getOffset(coleiter) == (end - start)
645                 && nextce != UCOL_NULLORDER) {
646                 // extra collation elements at the end of the match
647                 return FALSE;
648             }
649         }
650         return result;
651     }
652 #endif
653     return TRUE;
654 }
655 
656 /**
657 * Getting the next base character offset if current offset is an accent,
658 * or the current offset if the current character contains a base character.
659 * accents the following base character will be returned
660 * @param text string
661 * @param textoffset current offset
662 * @param textlength length of text string
663 * @return the next base character or the current offset
664 *         if the current character is contains a base character.
665 */
666 static
getNextBaseOffset(const UChar * text,int32_t textoffset,int32_t textlength)667 inline int32_t getNextBaseOffset(const UChar       *text,
668                                            int32_t  textoffset,
669                                            int32_t      textlength)
670 {
671     if (textoffset < textlength) {
672         int32_t temp = textoffset;
673         if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
674             while (temp < textlength) {
675                 int32_t result = temp;
676                 if ((getFCD(text, &temp, textlength) >>
677                      SECOND_LAST_BYTE_SHIFT_) == 0) {
678                     return result;
679                 }
680             }
681             return textlength;
682         }
683     }
684     return textoffset;
685 }
686 
687 /**
688 * Gets the next base character offset depending on the string search pattern
689 * data
690 * @param strsrch string search data
691 * @param textoffset current offset, one offset away from the last character
692 *                   to search for.
693 * @return start index of the next base character or the current offset
694 *         if the current character is contains a base character.
695 */
696 static
getNextUStringSearchBaseOffset(UStringSearch * strsrch,int32_t textoffset)697 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
698                                                   int32_t    textoffset)
699 {
700     int32_t textlength = strsrch->search->textLength;
701     if (strsrch->pattern.hasSuffixAccents &&
702         textoffset < textlength) {
703               int32_t  temp       = textoffset;
704         const UChar       *text       = strsrch->search->text;
705         U16_BACK_1(text, 0, temp);
706         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
707             return getNextBaseOffset(text, textoffset, textlength);
708         }
709     }
710     return textoffset;
711 }
712 
713 /**
714 * Shifting the collation element iterator position forward to prepare for
715 * a following match. If the last character is a unsafe character, we'll only
716 * shift by 1 to capture contractions, normalization etc.
717 * Internal method, status assumed to be success.
718 * @param text strsrch string search data
719 * @param textoffset start text position to do search
720 * @param ce the text ce which failed the match.
721 * @param patternceindex index of the ce within the pattern ce buffer which
722 *        failed the match
723 * @return final offset
724 */
725 static
shiftForward(UStringSearch * strsrch,int32_t textoffset,int32_t ce,int32_t patternceindex)726 inline int32_t shiftForward(UStringSearch *strsrch,
727                                 int32_t    textoffset,
728                                 int32_t       ce,
729                                 int32_t        patternceindex)
730 {
731     UPattern *pattern = &(strsrch->pattern);
732     if (ce != UCOL_NULLORDER) {
733         int32_t shift = pattern->shift[hash(ce)];
734         // this is to adjust for characters in the middle of the
735         // substring for matching that failed.
736         int32_t adjust = pattern->cesLength - patternceindex;
737         if (adjust > 1 && shift >= adjust) {
738             shift -= adjust - 1;
739         }
740         textoffset += shift;
741     }
742     else {
743         textoffset += pattern->defaultShiftSize;
744     }
745 
746     textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
747     // check for unsafe characters
748     // * if it is the start or middle of a contraction: to be done after
749     //   a initial match is found
750     // * thai or lao base consonant character: similar to contraction
751     // * high surrogate character: similar to contraction
752     // * next character is a accent: shift to the next base character
753     return textoffset;
754 }
755 #endif // #if BOYER_MOORE
756 
757 /**
758 * sets match not found
759 * @param strsrch string search data
760 */
761 static
setMatchNotFound(UStringSearch * strsrch)762 inline void setMatchNotFound(UStringSearch *strsrch)
763 {
764     // this method resets the match result regardless of the error status.
765     strsrch->search->matchedIndex = USEARCH_DONE;
766     strsrch->search->matchedLength = 0;
767     if (strsrch->search->isForwardSearching) {
768         setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
769     }
770     else {
771         setColEIterOffset(strsrch->textIter, 0);
772     }
773 }
774 
775 #if BOYER_MOORE
776 /**
777 * Gets the offset to the next safe point in text.
778 * ie. not the middle of a contraction, swappable characters or supplementary
779 * characters.
780 * @param collator collation sata
781 * @param text string to work with
782 * @param textoffset offset in string
783 * @param textlength length of text string
784 * @return offset to the next safe character
785 */
786 static
getNextSafeOffset(const UCollator * collator,const UChar * text,int32_t textoffset,int32_t textlength)787 inline int32_t getNextSafeOffset(const UCollator   *collator,
788                                      const UChar       *text,
789                                            int32_t  textoffset,
790                                            int32_t      textlength)
791 {
792     int32_t result = textoffset; // first contraction character
793     while (result != textlength && ucol_unsafeCP(text[result], collator)) {
794         result ++;
795     }
796     return result;
797 }
798 
799 /**
800 * This checks for accents in the potential match started with a .
801 * composite character.
802 * This is really painful... we have to check that composite character do not
803 * have any extra accents. We have to normalize the potential match and find
804 * the immediate decomposed character before the match.
805 * The first composite character would have been taken care of by the fcd
806 * checks in checkForwardExactMatch.
807 * This is the slow path after the fcd of the first character and
808 * the last character has been checked by checkForwardExactMatch and we
809 * determine that the potential match has extra non-ignorable preceding
810 * ces.
811 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
812 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
813 * Note here that accents checking are slow and cautioned in the API docs.
814 * Internal method, status assumed to be a success, caller should check status
815 * before calling this method
816 * @param strsrch string search data
817 * @param start index of the potential unfriendly composite character
818 * @param end index of the potential unfriendly composite character
819 * @param status output error status if any.
820 * @return TRUE if there is non-ignorable accents before at the beginning
821 *              of the match, FALSE otherwise.
822 */
823 
824 static
checkExtraMatchAccents(const UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)825 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
826                                    int32_t    end,
827                                    UErrorCode    *status)
828 {
829     UBool result = FALSE;
830     if (strsrch->pattern.hasPrefixAccents) {
831               int32_t  length = end - start;
832               int32_t  offset = 0;
833         const UChar       *text   = strsrch->search->text + start;
834 
835         U16_FWD_1(text, offset, length);
836         // we are only concerned with the first composite character
837         if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
838             int32_t safeoffset = getNextSafeOffset(strsrch->collator,
839                                                        text, 0, length);
840             if (safeoffset != length) {
841                 safeoffset ++;
842             }
843             UChar   *norm = NULL;
844             UChar    buffer[INITIAL_ARRAY_SIZE_];
845             int32_t  size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
846                                             buffer, INITIAL_ARRAY_SIZE_,
847                                             status);
848             if (U_FAILURE(*status)) {
849                 return FALSE;
850             }
851             if (size >= INITIAL_ARRAY_SIZE_) {
852                 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
853                                                status);
854                 // if allocation failed, status will be set to
855                 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
856                 // checks for it.
857                 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
858                                        size, status);
859                 if (U_FAILURE(*status) && norm != NULL) {
860                     uprv_free(norm);
861                     return FALSE;
862                 }
863             }
864             else {
865                 norm = buffer;
866             }
867 
868             UCollationElements *coleiter  = strsrch->utilIter;
869             ucol_setText(coleiter, norm, size, status);
870             uint32_t            firstce   = strsrch->pattern.ces[0];
871             UBool               ignorable = TRUE;
872             uint32_t            ce        = UCOL_IGNORABLE;
873             while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
874                 offset = ucol_getOffset(coleiter);
875                 if (ce != firstce && ce != UCOL_IGNORABLE) {
876                     ignorable = FALSE;
877                 }
878                 ce = ucol_next(coleiter, status);
879             }
880             UChar32 codepoint;
881             U16_PREV(norm, 0, offset, codepoint);
882             result = !ignorable && (u_getCombiningClass(codepoint) != 0);
883 
884             if (norm != buffer) {
885                 uprv_free(norm);
886             }
887         }
888     }
889 
890     return result;
891 }
892 
893 /**
894 * Used by exact matches, checks if there are accents before the match.
895 * This is really painful... we have to check that composite characters at
896 * the start of the matches have to not have any extra accents.
897 * We check the FCD of the character first, if it starts with an accent and
898 * the first pattern ce does not match the first ce of the character, we bail.
899 * Otherwise we try normalizing the first composite
900 * character and find the immediate decomposed character before the match to
901 * see if it is an non-ignorable accent.
902 * Now normalizing the first composite character is enough because we ensure
903 * that when the match is passed in here with extra beginning ces, the
904 * first or last ce that match has to occur within the first character.
905 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
906 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
907 * Note here that accents checking are slow and cautioned in the API docs.
908 * @param strsrch string search data
909 * @param start offset
910 * @param end offset
911 * @return TRUE if there are accents on either side of the match,
912 *         FALSE otherwise
913 */
914 static
hasAccentsBeforeMatch(const UStringSearch * strsrch,int32_t start,int32_t end)915 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
916                                   int32_t    end)
917 {
918     if (strsrch->pattern.hasPrefixAccents) {
919         UCollationElements *coleiter  = strsrch->textIter;
920         UErrorCode          status    = U_ZERO_ERROR;
921         // we have been iterating forwards previously
922         uint32_t            ignorable = TRUE;
923         int32_t             firstce   = strsrch->pattern.ces[0];
924 
925         setColEIterOffset(coleiter, start);
926         int32_t ce  = getCE(strsrch, ucol_next(coleiter, &status));
927         if (U_FAILURE(status)) {
928             return TRUE;
929         }
930         while (ce != firstce) {
931             if (ce != UCOL_IGNORABLE) {
932                 ignorable = FALSE;
933             }
934             ce = getCE(strsrch, ucol_next(coleiter, &status));
935             if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
936                 return TRUE;
937             }
938         }
939         if (!ignorable && inNormBuf(coleiter)) {
940             // within normalization buffer, discontiguous handled here
941             return TRUE;
942         }
943 
944         // within text
945         int32_t temp = start;
946         // original code
947         // accent = (getFCD(strsrch->search->text, &temp,
948         //                  strsrch->search->textLength)
949         //            >> SECOND_LAST_BYTE_SHIFT_);
950         // however this code does not work well with VC7 .net in release mode.
951         // maybe the inlines for getFCD combined with shifting has bugs in
952         // VC7. anyways this is a work around.
953         UBool accent = getFCD(strsrch->search->text, &temp,
954                               strsrch->search->textLength) > 0xFF;
955         if (!accent) {
956             return checkExtraMatchAccents(strsrch, start, end, &status);
957         }
958         if (!ignorable) {
959             return TRUE;
960         }
961         if (start > 0) {
962             temp = start;
963             U16_BACK_1(strsrch->search->text, 0, temp);
964             if (getFCD(strsrch->search->text, &temp,
965                        strsrch->search->textLength) & LAST_BYTE_MASK_) {
966                 setColEIterOffset(coleiter, start);
967                 ce = ucol_previous(coleiter, &status);
968                 if (U_FAILURE(status) ||
969                     (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
970                     return TRUE;
971                 }
972             }
973         }
974     }
975 
976     return FALSE;
977 }
978 
979 /**
980 * Used by exact matches, checks if there are accents bounding the match.
981 * Note this is the initial boundary check. If the potential match
982 * starts or ends with composite characters, the accents in those
983 * characters will be determined later.
984 * Not doing backwards iteration here, since discontiguos contraction for
985 * backwards collation element iterator, use up too many characters.
986 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
987 * should fail since there is a acute at the end of \u01FA
988 * Note here that accents checking are slow and cautioned in the API docs.
989 * @param strsrch string search data
990 * @param start offset of match
991 * @param end end offset of the match
992 * @return TRUE if there are accents on either side of the match,
993 *         FALSE otherwise
994 */
995 static
hasAccentsAfterMatch(const UStringSearch * strsrch,int32_t start,int32_t end)996 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
997                                  int32_t    end)
998 {
999     if (strsrch->pattern.hasSuffixAccents) {
1000         const UChar       *text       = strsrch->search->text;
1001               int32_t  temp       = end;
1002               int32_t      textlength = strsrch->search->textLength;
1003         U16_BACK_1(text, 0, temp);
1004         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
1005             int32_t             firstce  = strsrch->pattern.ces[0];
1006             UCollationElements *coleiter = strsrch->textIter;
1007             UErrorCode          status   = U_ZERO_ERROR;
1008             int32_t ce;
1009             setColEIterOffset(coleiter, start);
1010             while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
1011                 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
1012                     return TRUE;
1013                 }
1014             }
1015             int32_t count = 1;
1016             while (count < strsrch->pattern.cesLength) {
1017                 if (getCE(strsrch, ucol_next(coleiter, &status))
1018                     == UCOL_IGNORABLE) {
1019                     // Thai can give an ignorable here.
1020                     count --;
1021                 }
1022                 if (U_FAILURE(status)) {
1023                     return TRUE;
1024                 }
1025                 count ++;
1026             }
1027 
1028             ce = ucol_next(coleiter, &status);
1029             if (U_FAILURE(status)) {
1030                 return TRUE;
1031             }
1032             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1033                 ce = getCE(strsrch, ce);
1034             }
1035             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1036                 if (ucol_getOffset(coleiter) <= end) {
1037                     return TRUE;
1038                 }
1039                 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
1040                     return TRUE;
1041                 }
1042             }
1043         }
1044     }
1045     return FALSE;
1046 }
1047 #endif // #if BOYER_MOORE
1048 
1049 /**
1050 * Checks if the offset runs out of the text string
1051 * @param offset
1052 * @param textlength of the text string
1053 * @return TRUE if offset is out of bounds, FALSE otherwise
1054 */
1055 static
isOutOfBounds(int32_t textlength,int32_t offset)1056 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
1057 {
1058     return offset < 0 || offset > textlength;
1059 }
1060 
1061 /**
1062 * Checks for identical match
1063 * @param strsrch string search data
1064 * @param start offset of possible match
1065 * @param end offset of possible match
1066 * @return TRUE if identical match is found
1067 */
1068 static
checkIdentical(const UStringSearch * strsrch,int32_t start,int32_t end)1069 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
1070                                   int32_t    end)
1071 {
1072     if (strsrch->strength != UCOL_IDENTICAL) {
1073         return TRUE;
1074     }
1075 
1076     // Note: We could use Normalizer::compare() or similar, but for short strings
1077     // which may not be in FCD it might be faster to just NFD them.
1078     UErrorCode status = U_ZERO_ERROR;
1079     UnicodeString t2, p2;
1080     strsrch->nfd->normalize(
1081         UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
1082     strsrch->nfd->normalize(
1083         UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
1084     // return FALSE if NFD failed
1085     return U_SUCCESS(status) && t2 == p2;
1086 }
1087 
1088 #if BOYER_MOORE
1089 /**
1090 * Checks to see if the match is repeated
1091 * @param strsrch string search data
1092 * @param start new match start index
1093 * @param end new match end index
1094 * @return TRUE if the the match is repeated, FALSE otherwise
1095 */
1096 static
checkRepeatedMatch(UStringSearch * strsrch,int32_t start,int32_t end)1097 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
1098                                 int32_t    start,
1099                                 int32_t    end)
1100 {
1101     int32_t lastmatchindex = strsrch->search->matchedIndex;
1102     UBool       result;
1103     if (lastmatchindex == USEARCH_DONE) {
1104         return FALSE;
1105     }
1106     if (strsrch->search->isForwardSearching) {
1107         result = start <= lastmatchindex;
1108     }
1109     else {
1110         result = start >= lastmatchindex;
1111     }
1112     if (!result && !strsrch->search->isOverlap) {
1113         if (strsrch->search->isForwardSearching) {
1114             result = start < lastmatchindex + strsrch->search->matchedLength;
1115         }
1116         else {
1117             result = end > lastmatchindex;
1118         }
1119     }
1120     return result;
1121 }
1122 
1123 /**
1124 * Gets the collation element iterator's current offset.
1125 * @param coleiter collation element iterator
1126 * @param forwards flag TRUE if we are moving in th forwards direction
1127 * @return current offset
1128 */
1129 static
getColElemIterOffset(const UCollationElements * coleiter,UBool forwards)1130 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
1131                                               UBool               forwards)
1132 {
1133     int32_t result = ucol_getOffset(coleiter);
1134     // intricacies of the the backwards collation element iterator
1135     if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
1136         result ++;
1137     }
1138     return result;
1139 }
1140 
1141 /**
1142 * Checks match for contraction.
1143 * If the match ends with a partial contraction we fail.
1144 * If the match starts too far off (because of backwards iteration) we try to
1145 * chip off the extra characters depending on whether a breakiterator has
1146 * been used.
1147 * Internal method, error assumed to be success, caller has to check status
1148 * before calling this method.
1149 * @param strsrch string search data
1150 * @param start offset of potential match, to be modified if necessary
1151 * @param end offset of potential match, to be modified if necessary
1152 * @param status output error status if any
1153 * @return TRUE if match passes the contraction test, FALSE otherwise
1154 */
1155 
1156 static
checkNextExactContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)1157 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
1158                                      int32_t   *start,
1159                                      int32_t   *end, UErrorCode  *status)
1160 {
1161           UCollationElements *coleiter   = strsrch->textIter;
1162           int32_t             textlength = strsrch->search->textLength;
1163           int32_t             temp       = *start;
1164     const UCollator          *collator   = strsrch->collator;
1165     const UChar              *text       = strsrch->search->text;
1166     // This part checks if either ends of the match contains potential
1167     // contraction. If so we'll have to iterate through them
1168     // The start contraction needs to be checked since ucol_previous dumps
1169     // all characters till the first safe character into the buffer.
1170     // *start + 1 is used to test for the unsafe characters instead of *start
1171     // because ucol_prev takes all unsafe characters till the first safe
1172     // character ie *start. so by testing *start + 1, we can estimate if
1173     // excess prefix characters has been included in the potential search
1174     // results.
1175     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1176         (*start + 1 < textlength
1177          && ucol_unsafeCP(text[*start + 1], collator))) {
1178         int32_t expansion  = getExpansionPrefix(coleiter);
1179         UBool   expandflag = expansion > 0;
1180         setColEIterOffset(coleiter, *start);
1181         while (expansion > 0) {
1182             // getting rid of the redundant ce, caused by setOffset.
1183             // since backward contraction/expansion may have extra ces if we
1184             // are in the normalization buffer, hasAccentsBeforeMatch would
1185             // have taken care of it.
1186             // E.g. the character \u01FA will have an expansion of 3, but if
1187             // we are only looking for acute and ring \u030A and \u0301, we'll
1188             // have to skip the first ce in the expansion buffer.
1189             ucol_next(coleiter, status);
1190             if (U_FAILURE(*status)) {
1191                 return FALSE;
1192             }
1193             if (ucol_getOffset(coleiter) != temp) {
1194                 *start = temp;
1195                 temp  = ucol_getOffset(coleiter);
1196             }
1197             expansion --;
1198         }
1199 
1200         int32_t  *patternce       = strsrch->pattern.ces;
1201         int32_t   patterncelength = strsrch->pattern.cesLength;
1202         int32_t   count           = 0;
1203         while (count < patterncelength) {
1204             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1205             if (ce == UCOL_IGNORABLE) {
1206                 continue;
1207             }
1208             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1209                 *start = temp;
1210                 temp   = ucol_getOffset(coleiter);
1211             }
1212             if (U_FAILURE(*status) || ce != patternce[count]) {
1213                 (*end) ++;
1214                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1215                 return FALSE;
1216             }
1217             count ++;
1218         }
1219     }
1220     return TRUE;
1221 }
1222 
1223 /**
1224 * Checks and sets the match information if found.
1225 * Checks
1226 * <ul>
1227 * <li> the potential match does not repeat the previous match
1228 * <li> boundaries are correct
1229 * <li> exact matches has no extra accents
1230 * <li> identical matchesb
1231 * <li> potential match does not end in the middle of a contraction
1232 * <\ul>
1233 * Otherwise the offset will be shifted to the next character.
1234 * Internal method, status assumed to be success, caller has to check status
1235 * before calling this method.
1236 * @param strsrch string search data
1237 * @param textoffset offset in the collation element text. the returned value
1238 *        will be the truncated end offset of the match or the new start
1239 *        search offset.
1240 * @param status output error status if any
1241 * @return TRUE if the match is valid, FALSE otherwise
1242 */
1243 static
checkNextExactMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)1244 inline UBool checkNextExactMatch(UStringSearch *strsrch,
1245                                  int32_t   *textoffset, UErrorCode *status)
1246 {
1247     UCollationElements *coleiter = strsrch->textIter;
1248     int32_t         start    = getColElemIterOffset(coleiter, FALSE);
1249 
1250     if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1251         return FALSE;
1252     }
1253 
1254     // this totally matches, however we need to check if it is repeating
1255     if (!isBreakUnit(strsrch, start, *textoffset) ||
1256         checkRepeatedMatch(strsrch, start, *textoffset) ||
1257         hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
1258         !checkIdentical(strsrch, start, *textoffset) ||
1259         hasAccentsAfterMatch(strsrch, start, *textoffset)) {
1260 
1261         (*textoffset) ++;
1262         *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
1263         return FALSE;
1264     }
1265 
1266     //Add breakiterator boundary check for primary strength search.
1267     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
1268         checkBreakBoundary(strsrch, &start, textoffset);
1269     }
1270 
1271     // totally match, we will get rid of the ending ignorables.
1272     strsrch->search->matchedIndex  = start;
1273     strsrch->search->matchedLength = *textoffset - start;
1274     return TRUE;
1275 }
1276 
1277 /**
1278 * Getting the previous base character offset, or the current offset if the
1279 * current character is a base character
1280 * @param text string
1281 * @param textoffset one offset after the current character
1282 * @return the offset of the next character after the base character or the first
1283 *         composed character with accents
1284 */
1285 static
getPreviousBaseOffset(const UChar * text,int32_t textoffset)1286 inline int32_t getPreviousBaseOffset(const UChar       *text,
1287                                                int32_t  textoffset)
1288 {
1289     if (textoffset > 0) {
1290         for (;;) {
1291             int32_t result = textoffset;
1292             U16_BACK_1(text, 0, textoffset);
1293             int32_t temp = textoffset;
1294             uint16_t fcd = getFCD(text, &temp, result);
1295             if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1296                 if (fcd & LAST_BYTE_MASK_) {
1297                     return textoffset;
1298                 }
1299                 return result;
1300             }
1301             if (textoffset == 0) {
1302                 return 0;
1303             }
1304         }
1305     }
1306     return textoffset;
1307 }
1308 
1309 /**
1310 * Getting the indexes of the accents that are not blocked in the argument
1311 * accent array
1312 * @param accents array of accents in nfd terminated by a 0.
1313 * @param accentsindex array of indexes of the accents that are not blocked
1314 */
1315 static
getUnblockedAccentIndex(UChar * accents,int32_t * accentsindex)1316 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1317 {
1318     int32_t index     = 0;
1319     int32_t     length    = u_strlen(accents);
1320     UChar32     codepoint = 0;
1321     int         cclass    = 0;
1322     int         result    = 0;
1323     int32_t temp;
1324     while (index < length) {
1325         temp = index;
1326         U16_NEXT(accents, index, length, codepoint);
1327         if (u_getCombiningClass(codepoint) != cclass) {
1328             cclass        = u_getCombiningClass(codepoint);
1329             accentsindex[result] = temp;
1330             result ++;
1331         }
1332     }
1333     accentsindex[result] = length;
1334     return result;
1335 }
1336 
1337 /**
1338 * Appends 3 UChar arrays to a destination array.
1339 * Creates a new array if we run out of space. The caller will have to
1340 * manually deallocate the newly allocated array.
1341 * Internal method, status assumed to be success, caller has to check status
1342 * before calling this method. destination not to be NULL and has at least
1343 * size destinationlength.
1344 * @param destination target array
1345 * @param destinationlength target array size, returning the appended length
1346 * @param source1 null-terminated first array
1347 * @param source2 second array
1348 * @param source2length length of seond array
1349 * @param source3 null-terminated third array
1350 * @param status error status if any
1351 * @return new destination array, destination if there was no new allocation
1352 */
1353 static
addToUCharArray(UChar * destination,int32_t * destinationlength,const UChar * source1,const UChar * source2,int32_t source2length,const UChar * source3,UErrorCode * status)1354 inline UChar * addToUCharArray(      UChar      *destination,
1355                                      int32_t    *destinationlength,
1356                                const UChar      *source1,
1357                                const UChar      *source2,
1358                                      int32_t     source2length,
1359                                const UChar      *source3,
1360                                      UErrorCode *status)
1361 {
1362     int32_t source1length = source1 ? u_strlen(source1) : 0;
1363     int32_t source3length = source3 ? u_strlen(source3) : 0;
1364     if (*destinationlength < source1length + source2length + source3length +
1365                                                                            1)
1366     {
1367         destination = (UChar *)allocateMemory(
1368           (source1length + source2length + source3length + 1) * sizeof(UChar),
1369           status);
1370         // if error allocating memory, status will be
1371         // U_MEMORY_ALLOCATION_ERROR
1372         if (U_FAILURE(*status)) {
1373             *destinationlength = 0;
1374             return NULL;
1375         }
1376     }
1377     if (source1length != 0) {
1378         uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
1379     }
1380     if (source2length != 0) {
1381         uprv_memcpy(destination + source1length, source2,
1382                     sizeof(UChar) * source2length);
1383     }
1384     if (source3length != 0) {
1385         uprv_memcpy(destination + source1length + source2length, source3,
1386                     sizeof(UChar) * source3length);
1387     }
1388     *destinationlength = source1length + source2length + source3length;
1389     return destination;
1390 }
1391 
1392 /**
1393 * Running through a collation element iterator to see if the contents matches
1394 * pattern in string search data
1395 * @param strsrch string search data
1396 * @param coleiter collation element iterator
1397 * @return TRUE if a match if found, FALSE otherwise
1398 */
1399 static
checkCollationMatch(const UStringSearch * strsrch,UCollationElements * coleiter)1400 inline UBool checkCollationMatch(const UStringSearch      *strsrch,
1401                                        UCollationElements *coleiter)
1402 {
1403     int         patternceindex = strsrch->pattern.cesLength;
1404     int32_t    *patternce      = strsrch->pattern.ces;
1405     UErrorCode  status = U_ZERO_ERROR;
1406     while (patternceindex > 0) {
1407         int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
1408         if (ce == UCOL_IGNORABLE) {
1409             continue;
1410         }
1411         if (U_FAILURE(status) || ce != *patternce) {
1412             return FALSE;
1413         }
1414         patternce ++;
1415         patternceindex --;
1416     }
1417     return TRUE;
1418 }
1419 
1420 /**
1421 * Rearranges the front accents to try matching.
1422 * Prefix accents in the text will be grouped according to their combining
1423 * class and the groups will be mixed and matched to try find the perfect
1424 * match with the pattern.
1425 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1426 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1427 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1428 *         "\u0301\u0325".
1429 * step 2: check if any of the generated substrings matches the pattern.
1430 * Internal method, status is assumed to be success, caller has to check status
1431 * before calling this method.
1432 * @param strsrch string search match
1433 * @param start first offset of the accents to start searching
1434 * @param end start of the last accent set
1435 * @param status output error status if any
1436 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1437 *         offset of the match. Note this start includes all preceding accents.
1438 */
1439 static
doNextCanonicalPrefixMatch(UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)1440 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1441                                        int32_t    start,
1442                                        int32_t    end,
1443                                        UErrorCode    *status)
1444 {
1445     const UChar       *text       = strsrch->search->text;
1446           int32_t      textlength = strsrch->search->textLength;
1447           int32_t  tempstart  = start;
1448 
1449     if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1450         // die... failed at a base character
1451         return USEARCH_DONE;
1452     }
1453 
1454     int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1455     start = getPreviousBaseOffset(text, tempstart);
1456 
1457     UChar       accents[INITIAL_ARRAY_SIZE_];
1458     // normalizing the offensive string
1459     unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1460                     INITIAL_ARRAY_SIZE_, status);
1461     if (U_FAILURE(*status)) {
1462         return USEARCH_DONE;
1463     }
1464 
1465     int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
1466     int32_t         accentsize = getUnblockedAccentIndex(accents,
1467                                                                  accentsindex);
1468     int32_t         count      = (2 << (accentsize - 1)) - 1;
1469     UChar               buffer[INITIAL_ARRAY_SIZE_];
1470     UCollationElements *coleiter   = strsrch->utilIter;
1471     while (U_SUCCESS(*status) && count > 0) {
1472         UChar *rearrange = strsrch->canonicalPrefixAccents;
1473         // copy the base characters
1474         for (int k = 0; k < accentsindex[0]; k ++) {
1475             *rearrange ++ = accents[k];
1476         }
1477         // forming all possible canonical rearrangement by dropping
1478         // sets of accents
1479         for (int i = 0; i <= accentsize - 1; i ++) {
1480             int32_t mask = 1 << (accentsize - i - 1);
1481             if (count & mask) {
1482                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1483                     *rearrange ++ = accents[j];
1484                 }
1485             }
1486         }
1487         *rearrange = 0;
1488         int32_t  matchsize = INITIAL_ARRAY_SIZE_;
1489         UChar   *match     = addToUCharArray(buffer, &matchsize,
1490                                            strsrch->canonicalPrefixAccents,
1491                                            strsrch->search->text + offset,
1492                                            end - offset,
1493                                            strsrch->canonicalSuffixAccents,
1494                                            status);
1495 
1496         // if status is a failure, ucol_setText does nothing.
1497         // run the collator iterator through this match
1498         ucol_setText(coleiter, match, matchsize, status);
1499         if (U_SUCCESS(*status)) {
1500             if (checkCollationMatch(strsrch, coleiter)) {
1501                 if (match != buffer) {
1502                     uprv_free(match);
1503                 }
1504                 return start;
1505             }
1506         }
1507         count --;
1508     }
1509     return USEARCH_DONE;
1510 }
1511 
1512 /**
1513 * Gets the offset to the safe point in text before textoffset.
1514 * ie. not the middle of a contraction, swappable characters or supplementary
1515 * characters.
1516 * @param collator collation sata
1517 * @param text string to work with
1518 * @param textoffset offset in string
1519 * @param textlength length of text string
1520 * @return offset to the previous safe character
1521 */
1522 static
getPreviousSafeOffset(const UCollator * collator,const UChar * text,int32_t textoffset)1523 inline uint32_t getPreviousSafeOffset(const UCollator   *collator,
1524                                       const UChar       *text,
1525                                             int32_t  textoffset)
1526 {
1527     int32_t result = textoffset; // first contraction character
1528     while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1529         result --;
1530     }
1531     if (result != 0) {
1532         // the first contraction character is consider unsafe here
1533         result --;
1534     }
1535     return result;
1536 }
1537 
1538 /**
1539 * Cleaning up after we passed the safe zone
1540 * @param strsrch string search data
1541 * @param safetext safe text array
1542 * @param safebuffer safe text buffer
1543 * @param coleiter collation element iterator for safe text
1544 */
1545 static
cleanUpSafeText(const UStringSearch * strsrch,UChar * safetext,UChar * safebuffer)1546 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1547                                   UChar         *safebuffer)
1548 {
1549     if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1550     {
1551        uprv_free(safetext);
1552     }
1553 }
1554 
1555 /**
1556 * Take the rearranged end accents and tries matching. If match failed at
1557 * a seperate preceding set of accents (seperated from the rearranged on by
1558 * at least a base character) then we rearrange the preceding accents and
1559 * tries matching again.
1560 * We allow skipping of the ends of the accent set if the ces do not match.
1561 * However if the failure is found before the accent set, it fails.
1562 * Internal method, status assumed to be success, caller has to check status
1563 * before calling this method.
1564 * @param strsrch string search data
1565 * @param textoffset of the start of the rearranged accent
1566 * @param status output error status if any
1567 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1568 *         offset of the match. Note this start includes all preceding accents.
1569 */
1570 static
doNextCanonicalSuffixMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)1571 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
1572                                        int32_t    textoffset,
1573                                        UErrorCode    *status)
1574 {
1575     const UChar              *text           = strsrch->search->text;
1576     const UCollator          *collator       = strsrch->collator;
1577           int32_t             safelength     = 0;
1578           UChar              *safetext;
1579           int32_t             safetextlength;
1580           UChar               safebuffer[INITIAL_ARRAY_SIZE_];
1581           UCollationElements *coleiter       = strsrch->utilIter;
1582           int32_t         safeoffset     = textoffset;
1583 
1584     if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
1585                                          collator)) {
1586         safeoffset     = getPreviousSafeOffset(collator, text, textoffset);
1587         safelength     = textoffset - safeoffset;
1588         safetextlength = INITIAL_ARRAY_SIZE_;
1589         safetext       = addToUCharArray(safebuffer, &safetextlength, NULL,
1590                                          text + safeoffset, safelength,
1591                                          strsrch->canonicalSuffixAccents,
1592                                          status);
1593     }
1594     else {
1595         safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1596         safetext       = strsrch->canonicalSuffixAccents;
1597     }
1598 
1599     // if status is a failure, ucol_setText does nothing
1600     ucol_setText(coleiter, safetext, safetextlength, status);
1601     // status checked in loop below
1602 
1603     int32_t  *ce        = strsrch->pattern.ces;
1604     int32_t   celength  = strsrch->pattern.cesLength;
1605     int       ceindex   = celength - 1;
1606     UBool     isSafe    = TRUE; // indication flag for position in safe zone
1607 
1608     while (ceindex >= 0) {
1609         int32_t textce = ucol_previous(coleiter, status);
1610         if (U_FAILURE(*status)) {
1611             if (isSafe) {
1612                 cleanUpSafeText(strsrch, safetext, safebuffer);
1613             }
1614             return USEARCH_DONE;
1615         }
1616         if (textce == UCOL_NULLORDER) {
1617             // check if we have passed the safe buffer
1618             if (coleiter == strsrch->textIter) {
1619                 cleanUpSafeText(strsrch, safetext, safebuffer);
1620                 return USEARCH_DONE;
1621             }
1622             cleanUpSafeText(strsrch, safetext, safebuffer);
1623             safetext = safebuffer;
1624             coleiter = strsrch->textIter;
1625             setColEIterOffset(coleiter, safeoffset);
1626             // status checked at the start of the loop
1627             isSafe = FALSE;
1628             continue;
1629         }
1630         textce = getCE(strsrch, textce);
1631         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1632             // do the beginning stuff
1633             int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1634             if (isSafe && failedoffset >= safelength) {
1635                 // alas... no hope. failed at rearranged accent set
1636                 cleanUpSafeText(strsrch, safetext, safebuffer);
1637                 return USEARCH_DONE;
1638             }
1639             else {
1640                 if (isSafe) {
1641                     failedoffset += safeoffset;
1642                     cleanUpSafeText(strsrch, safetext, safebuffer);
1643                 }
1644 
1645                 // try rearranging the front accents
1646                 int32_t result = doNextCanonicalPrefixMatch(strsrch,
1647                                         failedoffset, textoffset, status);
1648                 if (result != USEARCH_DONE) {
1649                     // if status is a failure, ucol_setOffset does nothing
1650                     setColEIterOffset(strsrch->textIter, result);
1651                 }
1652                 if (U_FAILURE(*status)) {
1653                     return USEARCH_DONE;
1654                 }
1655                 return result;
1656             }
1657         }
1658         if (textce == ce[ceindex]) {
1659             ceindex --;
1660         }
1661     }
1662     // set offset here
1663     if (isSafe) {
1664         int32_t result     = getColElemIterOffset(coleiter, FALSE);
1665         // sets the text iterator here with the correct expansion and offset
1666         int32_t    leftoverces = getExpansionPrefix(coleiter);
1667         cleanUpSafeText(strsrch, safetext, safebuffer);
1668         if (result >= safelength) {
1669             result = textoffset;
1670         }
1671         else {
1672             result += safeoffset;
1673         }
1674         setColEIterOffset(strsrch->textIter, result);
1675         strsrch->textIter->iteratordata_.toReturn =
1676                        setExpansionPrefix(strsrch->textIter, leftoverces);
1677         return result;
1678     }
1679 
1680     return ucol_getOffset(coleiter);
1681 }
1682 
1683 /**
1684 * Trying out the substring and sees if it can be a canonical match.
1685 * This will try normalizing the end accents and arranging them into canonical
1686 * equivalents and check their corresponding ces with the pattern ce.
1687 * Suffix accents in the text will be grouped according to their combining
1688 * class and the groups will be mixed and matched to try find the perfect
1689 * match with the pattern.
1690 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1691 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1692 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1693 *         "\u0301\u0325".
1694 * step 2: check if any of the generated substrings matches the pattern.
1695 * Internal method, status assumed to be success, caller has to check status
1696 * before calling this method.
1697 * @param strsrch string search data
1698 * @param textoffset end offset in the collation element text that ends with
1699 *                   the accents to be rearranged
1700 * @param status error status if any
1701 * @return TRUE if the match is valid, FALSE otherwise
1702 */
1703 static
doNextCanonicalMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)1704 UBool doNextCanonicalMatch(UStringSearch *strsrch,
1705                            int32_t    textoffset,
1706                            UErrorCode    *status)
1707 {
1708     const UChar       *text = strsrch->search->text;
1709           int32_t  temp = textoffset;
1710     U16_BACK_1(text, 0, temp);
1711     if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1712         UCollationElements *coleiter = strsrch->textIter;
1713         int32_t         offset   = getColElemIterOffset(coleiter, FALSE);
1714         if (strsrch->pattern.hasPrefixAccents) {
1715             offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1716                                                 status);
1717             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1718                 setColEIterOffset(coleiter, offset);
1719                 return TRUE;
1720             }
1721         }
1722         return FALSE;
1723     }
1724 
1725     if (!strsrch->pattern.hasSuffixAccents) {
1726         return FALSE;
1727     }
1728 
1729     UChar       accents[INITIAL_ARRAY_SIZE_];
1730     // offset to the last base character in substring to search
1731     int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1732     // normalizing the offensive string
1733     unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1734                                0, accents, INITIAL_ARRAY_SIZE_, status);
1735     // status checked in loop below
1736 
1737     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1738     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1739 
1740     // 2 power n - 1 plus the full set of accents
1741     int32_t  count = (2 << (size - 1)) - 1;
1742     while (U_SUCCESS(*status) && count > 0) {
1743         UChar *rearrange = strsrch->canonicalSuffixAccents;
1744         // copy the base characters
1745         for (int k = 0; k < accentsindex[0]; k ++) {
1746             *rearrange ++ = accents[k];
1747         }
1748         // forming all possible canonical rearrangement by dropping
1749         // sets of accents
1750         for (int i = 0; i <= size - 1; i ++) {
1751             int32_t mask = 1 << (size - i - 1);
1752             if (count & mask) {
1753                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1754                     *rearrange ++ = accents[j];
1755                 }
1756             }
1757         }
1758         *rearrange = 0;
1759         int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1760                                                         status);
1761         if (offset != USEARCH_DONE) {
1762             return TRUE; // match found
1763         }
1764         count --;
1765     }
1766     return FALSE;
1767 }
1768 
1769 /**
1770 * Gets the previous base character offset depending on the string search
1771 * pattern data
1772 * @param strsrch string search data
1773 * @param textoffset current offset, current character
1774 * @return the offset of the next character after this base character or itself
1775 *         if it is a composed character with accents
1776 */
1777 static
getPreviousUStringSearchBaseOffset(UStringSearch * strsrch,int32_t textoffset)1778 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1779                                                       int32_t textoffset)
1780 {
1781     if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1782         const UChar       *text = strsrch->search->text;
1783               int32_t  offset = textoffset;
1784         if (getFCD(text, &offset, strsrch->search->textLength) >>
1785                                                    SECOND_LAST_BYTE_SHIFT_) {
1786             return getPreviousBaseOffset(text, textoffset);
1787         }
1788     }
1789     return textoffset;
1790 }
1791 
1792 /**
1793 * Checks match for contraction.
1794 * If the match ends with a partial contraction we fail.
1795 * If the match starts too far off (because of backwards iteration) we try to
1796 * chip off the extra characters
1797 * Internal method, status assumed to be success, caller has to check status
1798 * before calling this method.
1799 * @param strsrch string search data
1800 * @param start offset of potential match, to be modified if necessary
1801 * @param end offset of potential match, to be modified if necessary
1802 * @param status output error status if any
1803 * @return TRUE if match passes the contraction test, FALSE otherwise
1804 */
1805 static
checkNextCanonicalContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)1806 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1807                                          int32_t   *start,
1808                                          int32_t   *end,
1809                                          UErrorCode    *status)
1810 {
1811           UCollationElements *coleiter   = strsrch->textIter;
1812           int32_t             textlength = strsrch->search->textLength;
1813           int32_t         temp       = *start;
1814     const UCollator          *collator   = strsrch->collator;
1815     const UChar              *text       = strsrch->search->text;
1816     // This part checks if either ends of the match contains potential
1817     // contraction. If so we'll have to iterate through them
1818     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1819         (*start + 1 < textlength
1820          && ucol_unsafeCP(text[*start + 1], collator))) {
1821         int32_t expansion  = getExpansionPrefix(coleiter);
1822         UBool   expandflag = expansion > 0;
1823         setColEIterOffset(coleiter, *start);
1824         while (expansion > 0) {
1825             // getting rid of the redundant ce, caused by setOffset.
1826             // since backward contraction/expansion may have extra ces if we
1827             // are in the normalization buffer, hasAccentsBeforeMatch would
1828             // have taken care of it.
1829             // E.g. the character \u01FA will have an expansion of 3, but if
1830             // we are only looking for acute and ring \u030A and \u0301, we'll
1831             // have to skip the first ce in the expansion buffer.
1832             ucol_next(coleiter, status);
1833             if (U_FAILURE(*status)) {
1834                 return FALSE;
1835             }
1836             if (ucol_getOffset(coleiter) != temp) {
1837                 *start = temp;
1838                 temp  = ucol_getOffset(coleiter);
1839             }
1840             expansion --;
1841         }
1842 
1843         int32_t  *patternce       = strsrch->pattern.ces;
1844         int32_t   patterncelength = strsrch->pattern.cesLength;
1845         int32_t   count           = 0;
1846         int32_t   textlength      = strsrch->search->textLength;
1847         while (count < patterncelength) {
1848             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1849             // status checked below, note that if status is a failure
1850             // ucol_next returns UCOL_NULLORDER
1851             if (ce == UCOL_IGNORABLE) {
1852                 continue;
1853             }
1854             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1855                 *start = temp;
1856                 temp   = ucol_getOffset(coleiter);
1857             }
1858 
1859             if (count == 0 && ce != patternce[0]) {
1860                 // accents may have extra starting ces, this occurs when a
1861                 // pure accent pattern is matched without rearrangement
1862                 // text \u0325\u0300 and looking for \u0300
1863                 int32_t expected = patternce[0];
1864                 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1865                     ce = getCE(strsrch, ucol_next(coleiter, status));
1866                     while (U_SUCCESS(*status) && ce != expected &&
1867                            ce != UCOL_NULLORDER &&
1868                            ucol_getOffset(coleiter) <= *end) {
1869                         ce = getCE(strsrch, ucol_next(coleiter, status));
1870                     }
1871                 }
1872             }
1873             if (U_FAILURE(*status) || ce != patternce[count]) {
1874                 (*end) ++;
1875                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1876                 return FALSE;
1877             }
1878             count ++;
1879         }
1880     }
1881     return TRUE;
1882 }
1883 
1884 /**
1885 * Checks and sets the match information if found.
1886 * Checks
1887 * <ul>
1888 * <li> the potential match does not repeat the previous match
1889 * <li> boundaries are correct
1890 * <li> potential match does not end in the middle of a contraction
1891 * <li> identical matches
1892 * <\ul>
1893 * Otherwise the offset will be shifted to the next character.
1894 * Internal method, status assumed to be success, caller has to check the
1895 * status before calling this method.
1896 * @param strsrch string search data
1897 * @param textoffset offset in the collation element text. the returned value
1898 *        will be the truncated end offset of the match or the new start
1899 *        search offset.
1900 * @param status output error status if any
1901 * @return TRUE if the match is valid, FALSE otherwise
1902 */
1903 static
checkNextCanonicalMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)1904 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1905                                      int32_t   *textoffset,
1906                                      UErrorCode    *status)
1907 {
1908     // to ensure that the start and ends are not composite characters
1909     UCollationElements *coleiter = strsrch->textIter;
1910     // if we have a canonical accent match
1911     if ((strsrch->pattern.hasSuffixAccents &&
1912         strsrch->canonicalSuffixAccents[0]) ||
1913         (strsrch->pattern.hasPrefixAccents &&
1914         strsrch->canonicalPrefixAccents[0])) {
1915         strsrch->search->matchedIndex  = getPreviousUStringSearchBaseOffset(
1916                                                     strsrch,
1917                                                     ucol_getOffset(coleiter));
1918         strsrch->search->matchedLength = *textoffset -
1919                                                 strsrch->search->matchedIndex;
1920         return TRUE;
1921     }
1922 
1923     int32_t start = getColElemIterOffset(coleiter, FALSE);
1924     if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
1925                                             status) || U_FAILURE(*status)) {
1926         return FALSE;
1927     }
1928 
1929     start = getPreviousUStringSearchBaseOffset(strsrch, start);
1930     // this totally matches, however we need to check if it is repeating
1931     if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1932         !isBreakUnit(strsrch, start, *textoffset) ||
1933         !checkIdentical(strsrch, start, *textoffset)) {
1934         (*textoffset) ++;
1935         *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
1936                                         strsrch->search->textLength);
1937         return FALSE;
1938     }
1939 
1940     strsrch->search->matchedIndex  = start;
1941     strsrch->search->matchedLength = *textoffset - start;
1942     return TRUE;
1943 }
1944 
1945 /**
1946 * Shifting the collation element iterator position forward to prepare for
1947 * a preceding match. If the first character is a unsafe character, we'll only
1948 * shift by 1 to capture contractions, normalization etc.
1949 * Internal method, status assumed to be success, caller has to check status
1950 * before calling this method.
1951 * @param text strsrch string search data
1952 * @param textoffset start text position to do search
1953 * @param ce the text ce which failed the match.
1954 * @param patternceindex index of the ce within the pattern ce buffer which
1955 *        failed the match
1956 * @return final offset
1957 */
1958 static
reverseShift(UStringSearch * strsrch,int32_t textoffset,int32_t ce,int32_t patternceindex)1959 inline int32_t reverseShift(UStringSearch *strsrch,
1960                                 int32_t    textoffset,
1961                                 int32_t       ce,
1962                                 int32_t        patternceindex)
1963 {
1964     if (strsrch->search->isOverlap) {
1965         if (textoffset != strsrch->search->textLength) {
1966             textoffset --;
1967         }
1968         else {
1969             textoffset -= strsrch->pattern.defaultShiftSize;
1970         }
1971     }
1972     else {
1973         if (ce != UCOL_NULLORDER) {
1974             int32_t shift = strsrch->pattern.backShift[hash(ce)];
1975 
1976             // this is to adjust for characters in the middle of the substring
1977             // for matching that failed.
1978             int32_t adjust = patternceindex;
1979             if (adjust > 1 && shift > adjust) {
1980                 shift -= adjust - 1;
1981             }
1982             textoffset -= shift;
1983         }
1984         else {
1985             textoffset -= strsrch->pattern.defaultShiftSize;
1986         }
1987     }
1988     textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1989     return textoffset;
1990 }
1991 
1992 /**
1993 * Checks match for contraction.
1994 * If the match starts with a partial contraction we fail.
1995 * Internal method, status assumed to be success, caller has to check status
1996 * before calling this method.
1997 * @param strsrch string search data
1998 * @param start offset of potential match, to be modified if necessary
1999 * @param end offset of potential match, to be modified if necessary
2000 * @param status output error status if any
2001 * @return TRUE if match passes the contraction test, FALSE otherwise
2002 */
2003 static
checkPreviousExactContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)2004 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
2005                                      int32_t   *start,
2006                                      int32_t   *end, UErrorCode  *status)
2007 {
2008           UCollationElements *coleiter   = strsrch->textIter;
2009           int32_t             textlength = strsrch->search->textLength;
2010           int32_t             temp       = *end;
2011     const UCollator          *collator   = strsrch->collator;
2012     const UChar              *text       = strsrch->search->text;
2013     // This part checks if either if the start of the match contains potential
2014     // contraction. If so we'll have to iterate through them
2015     // Since we used ucol_next while previously looking for the potential
2016     // match, this guarantees that our end will not be a partial contraction,
2017     // or a partial supplementary character.
2018     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2019         int32_t expansion  = getExpansionSuffix(coleiter);
2020         UBool   expandflag = expansion > 0;
2021         setColEIterOffset(coleiter, *end);
2022         while (U_SUCCESS(*status) && expansion > 0) {
2023             // getting rid of the redundant ce
2024             // since forward contraction/expansion may have extra ces
2025             // if we are in the normalization buffer, hasAccentsBeforeMatch
2026             // would have taken care of it.
2027             // E.g. the character \u01FA will have an expansion of 3, but if
2028             // we are only looking for A ring A\u030A, we'll have to skip the
2029             // last ce in the expansion buffer
2030             ucol_previous(coleiter, status);
2031             if (U_FAILURE(*status)) {
2032                 return FALSE;
2033             }
2034             if (ucol_getOffset(coleiter) != temp) {
2035                 *end = temp;
2036                 temp  = ucol_getOffset(coleiter);
2037             }
2038             expansion --;
2039         }
2040 
2041         int32_t  *patternce       = strsrch->pattern.ces;
2042         int32_t   patterncelength = strsrch->pattern.cesLength;
2043         int32_t   count           = patterncelength;
2044         while (count > 0) {
2045             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2046             // status checked below, note that if status is a failure
2047             // ucol_previous returns UCOL_NULLORDER
2048             if (ce == UCOL_IGNORABLE) {
2049                 continue;
2050             }
2051             if (expandflag && count == 0 &&
2052                 getColElemIterOffset(coleiter, FALSE) != temp) {
2053                 *end = temp;
2054                 temp  = ucol_getOffset(coleiter);
2055             }
2056             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2057                 (*start) --;
2058                 *start = getPreviousBaseOffset(text, *start);
2059                 return FALSE;
2060             }
2061             count --;
2062         }
2063     }
2064     return TRUE;
2065 }
2066 
2067 /**
2068 * Checks and sets the match information if found.
2069 * Checks
2070 * <ul>
2071 * <li> the current match does not repeat the last match
2072 * <li> boundaries are correct
2073 * <li> exact matches has no extra accents
2074 * <li> identical matches
2075 * <\ul>
2076 * Otherwise the offset will be shifted to the preceding character.
2077 * Internal method, status assumed to be success, caller has to check status
2078 * before calling this method.
2079 * @param strsrch string search data
2080 * @param collator
2081 * @param coleiter collation element iterator
2082 * @param text string
2083 * @param textoffset offset in the collation element text. the returned value
2084 *        will be the truncated start offset of the match or the new start
2085 *        search offset.
2086 * @param status output error status if any
2087 * @return TRUE if the match is valid, FALSE otherwise
2088 */
2089 static
checkPreviousExactMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)2090 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
2091                                      int32_t   *textoffset,
2092                                      UErrorCode    *status)
2093 {
2094     // to ensure that the start and ends are not composite characters
2095     int32_t end = ucol_getOffset(strsrch->textIter);
2096     if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
2097         || U_FAILURE(*status)) {
2098             return FALSE;
2099     }
2100 
2101     // this totally matches, however we need to check if it is repeating
2102     // the old match
2103     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2104         !isBreakUnit(strsrch, *textoffset, end) ||
2105         hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
2106         !checkIdentical(strsrch, *textoffset, end) ||
2107         hasAccentsAfterMatch(strsrch, *textoffset, end)) {
2108         (*textoffset) --;
2109         *textoffset = getPreviousBaseOffset(strsrch->search->text,
2110                                             *textoffset);
2111         return FALSE;
2112     }
2113 
2114     //Add breakiterator boundary check for primary strength search.
2115     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
2116         checkBreakBoundary(strsrch, textoffset, &end);
2117     }
2118 
2119     strsrch->search->matchedIndex = *textoffset;
2120     strsrch->search->matchedLength = end - *textoffset;
2121     return TRUE;
2122 }
2123 
2124 /**
2125 * Rearranges the end accents to try matching.
2126 * Suffix accents in the text will be grouped according to their combining
2127 * class and the groups will be mixed and matched to try find the perfect
2128 * match with the pattern.
2129 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2130 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2131 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2132 *         "\u0301\u0325".
2133 * step 2: check if any of the generated substrings matches the pattern.
2134 * Internal method, status assumed to be success, user has to check status
2135 * before calling this method.
2136 * @param strsrch string search match
2137 * @param start offset of the first base character
2138 * @param end start of the last accent set
2139 * @param status only error status if any
2140 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2141 *         offset of the match. Note this start includes all following accents.
2142 */
2143 static
doPreviousCanonicalSuffixMatch(UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)2144 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
2145                                            int32_t    start,
2146                                            int32_t    end,
2147                                            UErrorCode    *status)
2148 {
2149     const UChar       *text       = strsrch->search->text;
2150           int32_t  tempend    = end;
2151 
2152     U16_BACK_1(text, 0, tempend);
2153     if (!(getFCD(text, &tempend, strsrch->search->textLength) &
2154                                                            LAST_BYTE_MASK_)) {
2155         // die... failed at a base character
2156         return USEARCH_DONE;
2157     }
2158     end = getNextBaseOffset(text, end, strsrch->search->textLength);
2159 
2160     if (U_SUCCESS(*status)) {
2161         UChar       accents[INITIAL_ARRAY_SIZE_];
2162         int32_t offset = getPreviousBaseOffset(text, end);
2163         // normalizing the offensive string
2164         unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
2165                         INITIAL_ARRAY_SIZE_, status);
2166 
2167         int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
2168         int32_t         accentsize = getUnblockedAccentIndex(accents,
2169                                                          accentsindex);
2170         int32_t         count      = (2 << (accentsize - 1)) - 1;
2171         UChar               buffer[INITIAL_ARRAY_SIZE_];
2172         UCollationElements *coleiter = strsrch->utilIter;
2173         while (U_SUCCESS(*status) && count > 0) {
2174             UChar *rearrange = strsrch->canonicalSuffixAccents;
2175             // copy the base characters
2176             for (int k = 0; k < accentsindex[0]; k ++) {
2177                 *rearrange ++ = accents[k];
2178             }
2179             // forming all possible canonical rearrangement by dropping
2180             // sets of accents
2181             for (int i = 0; i <= accentsize - 1; i ++) {
2182                 int32_t mask = 1 << (accentsize - i - 1);
2183                 if (count & mask) {
2184                     for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2185                         *rearrange ++ = accents[j];
2186                     }
2187                 }
2188             }
2189             *rearrange = 0;
2190             int32_t  matchsize = INITIAL_ARRAY_SIZE_;
2191             UChar   *match     = addToUCharArray(buffer, &matchsize,
2192                                            strsrch->canonicalPrefixAccents,
2193                                            strsrch->search->text + start,
2194                                            offset - start,
2195                                            strsrch->canonicalSuffixAccents,
2196                                            status);
2197 
2198             // run the collator iterator through this match
2199             // if status is a failure ucol_setText does nothing
2200             ucol_setText(coleiter, match, matchsize, status);
2201             if (U_SUCCESS(*status)) {
2202                 if (checkCollationMatch(strsrch, coleiter)) {
2203                     if (match != buffer) {
2204                         uprv_free(match);
2205                     }
2206                     return end;
2207                 }
2208             }
2209             count --;
2210         }
2211     }
2212     return USEARCH_DONE;
2213 }
2214 
2215 /**
2216 * Take the rearranged start accents and tries matching. If match failed at
2217 * a seperate following set of accents (seperated from the rearranged on by
2218 * at least a base character) then we rearrange the preceding accents and
2219 * tries matching again.
2220 * We allow skipping of the ends of the accent set if the ces do not match.
2221 * However if the failure is found before the accent set, it fails.
2222 * Internal method, status assumed to be success, caller has to check status
2223 * before calling this method.
2224 * @param strsrch string search data
2225 * @param textoffset of the ends of the rearranged accent
2226 * @param status output error status if any
2227 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2228 *         offset of the match. Note this start includes all following accents.
2229 */
2230 static
doPreviousCanonicalPrefixMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)2231 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
2232                                            int32_t    textoffset,
2233                                            UErrorCode    *status)
2234 {
2235     const UChar       *text       = strsrch->search->text;
2236     const UCollator   *collator   = strsrch->collator;
2237           int32_t      safelength = 0;
2238           UChar       *safetext;
2239           int32_t      safetextlength;
2240           UChar        safebuffer[INITIAL_ARRAY_SIZE_];
2241           int32_t  safeoffset = textoffset;
2242 
2243     if (textoffset &&
2244         ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2245                                  u_strlen(strsrch->canonicalPrefixAccents) - 1
2246                                          ], collator)) {
2247         safeoffset     = getNextSafeOffset(collator, text, textoffset,
2248                                            strsrch->search->textLength);
2249         safelength     = safeoffset - textoffset;
2250         safetextlength = INITIAL_ARRAY_SIZE_;
2251         safetext       = addToUCharArray(safebuffer, &safetextlength,
2252                                          strsrch->canonicalPrefixAccents,
2253                                          text + textoffset, safelength,
2254                                          NULL, status);
2255     }
2256     else {
2257         safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2258         safetext       = strsrch->canonicalPrefixAccents;
2259     }
2260 
2261     UCollationElements *coleiter = strsrch->utilIter;
2262      // if status is a failure, ucol_setText does nothing
2263     ucol_setText(coleiter, safetext, safetextlength, status);
2264     // status checked in loop below
2265 
2266     int32_t  *ce           = strsrch->pattern.ces;
2267     int32_t   celength     = strsrch->pattern.cesLength;
2268     int       ceindex      = 0;
2269     UBool     isSafe       = TRUE; // safe zone indication flag for position
2270     int32_t   prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
2271 
2272     while (ceindex < celength) {
2273         int32_t textce = ucol_next(coleiter, status);
2274         if (U_FAILURE(*status)) {
2275             if (isSafe) {
2276                 cleanUpSafeText(strsrch, safetext, safebuffer);
2277             }
2278             return USEARCH_DONE;
2279         }
2280         if (textce == UCOL_NULLORDER) {
2281             // check if we have passed the safe buffer
2282             if (coleiter == strsrch->textIter) {
2283                 cleanUpSafeText(strsrch, safetext, safebuffer);
2284                 return USEARCH_DONE;
2285             }
2286             cleanUpSafeText(strsrch, safetext, safebuffer);
2287             safetext = safebuffer;
2288             coleiter = strsrch->textIter;
2289             setColEIterOffset(coleiter, safeoffset);
2290             // status checked at the start of the loop
2291             isSafe = FALSE;
2292             continue;
2293         }
2294         textce = getCE(strsrch, textce);
2295         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2296             // do the beginning stuff
2297             int32_t failedoffset = ucol_getOffset(coleiter);
2298             if (isSafe && failedoffset <= prefixlength) {
2299                 // alas... no hope. failed at rearranged accent set
2300                 cleanUpSafeText(strsrch, safetext, safebuffer);
2301                 return USEARCH_DONE;
2302             }
2303             else {
2304                 if (isSafe) {
2305                     failedoffset = safeoffset - failedoffset;
2306                     cleanUpSafeText(strsrch, safetext, safebuffer);
2307                 }
2308 
2309                 // try rearranging the end accents
2310                 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
2311                                         textoffset, failedoffset, status);
2312                 if (result != USEARCH_DONE) {
2313                     // if status is a failure, ucol_setOffset does nothing
2314                     setColEIterOffset(strsrch->textIter, result);
2315                 }
2316                 if (U_FAILURE(*status)) {
2317                     return USEARCH_DONE;
2318                 }
2319                 return result;
2320             }
2321         }
2322         if (textce == ce[ceindex]) {
2323             ceindex ++;
2324         }
2325     }
2326     // set offset here
2327     if (isSafe) {
2328         int32_t result      = ucol_getOffset(coleiter);
2329         // sets the text iterator here with the correct expansion and offset
2330         int32_t     leftoverces = getExpansionSuffix(coleiter);
2331         cleanUpSafeText(strsrch, safetext, safebuffer);
2332         if (result <= prefixlength) {
2333             result = textoffset;
2334         }
2335         else {
2336             result = textoffset + (safeoffset - result);
2337         }
2338         setColEIterOffset(strsrch->textIter, result);
2339         setExpansionSuffix(strsrch->textIter, leftoverces);
2340         return result;
2341     }
2342 
2343     return ucol_getOffset(coleiter);
2344 }
2345 
2346 /**
2347 * Trying out the substring and sees if it can be a canonical match.
2348 * This will try normalizing the starting accents and arranging them into
2349 * canonical equivalents and check their corresponding ces with the pattern ce.
2350 * Prefix accents in the text will be grouped according to their combining
2351 * class and the groups will be mixed and matched to try find the perfect
2352 * match with the pattern.
2353 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2354 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2355 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2356 *         "\u0301\u0325".
2357 * step 2: check if any of the generated substrings matches the pattern.
2358 * Internal method, status assumed to be success, caller has to check status
2359 * before calling this method.
2360 * @param strsrch string search data
2361 * @param textoffset start offset in the collation element text that starts
2362 *                   with the accents to be rearranged
2363 * @param status output error status if any
2364 * @return TRUE if the match is valid, FALSE otherwise
2365 */
2366 static
doPreviousCanonicalMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)2367 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2368                                int32_t    textoffset,
2369                                UErrorCode    *status)
2370 {
2371     const UChar       *text       = strsrch->search->text;
2372           int32_t  temp       = textoffset;
2373           int32_t      textlength = strsrch->search->textLength;
2374     if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2375         UCollationElements *coleiter = strsrch->textIter;
2376         int32_t         offset   = ucol_getOffset(coleiter);
2377         if (strsrch->pattern.hasSuffixAccents) {
2378             offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
2379                                                     offset, status);
2380             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2381                 setColEIterOffset(coleiter, offset);
2382                 return TRUE;
2383             }
2384         }
2385         return FALSE;
2386     }
2387 
2388     if (!strsrch->pattern.hasPrefixAccents) {
2389         return FALSE;
2390     }
2391 
2392     UChar       accents[INITIAL_ARRAY_SIZE_];
2393     // offset to the last base character in substring to search
2394     int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2395     // normalizing the offensive string
2396     unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2397                                0, accents, INITIAL_ARRAY_SIZE_, status);
2398     // status checked in loop
2399 
2400     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2401     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2402 
2403     // 2 power n - 1 plus the full set of accents
2404     int32_t  count = (2 << (size - 1)) - 1;
2405     while (U_SUCCESS(*status) && count > 0) {
2406         UChar *rearrange = strsrch->canonicalPrefixAccents;
2407         // copy the base characters
2408         for (int k = 0; k < accentsindex[0]; k ++) {
2409             *rearrange ++ = accents[k];
2410         }
2411         // forming all possible canonical rearrangement by dropping
2412         // sets of accents
2413         for (int i = 0; i <= size - 1; i ++) {
2414             int32_t mask = 1 << (size - i - 1);
2415             if (count & mask) {
2416                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2417                     *rearrange ++ = accents[j];
2418                 }
2419             }
2420         }
2421         *rearrange = 0;
2422         int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
2423                                                           baseoffset, status);
2424         if (offset != USEARCH_DONE) {
2425             return TRUE; // match found
2426         }
2427         count --;
2428     }
2429     return FALSE;
2430 }
2431 
2432 /**
2433 * Checks match for contraction.
2434 * If the match starts with a partial contraction we fail.
2435 * Internal method, status assumed to be success, caller has to check status
2436 * before calling this method.
2437 * @param strsrch string search data
2438 * @param start offset of potential match, to be modified if necessary
2439 * @param end offset of potential match, to be modified if necessary
2440 * @param status only error status if any
2441 * @return TRUE if match passes the contraction test, FALSE otherwise
2442 */
2443 static
checkPreviousCanonicalContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)2444 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2445                                      int32_t   *start,
2446                                      int32_t   *end, UErrorCode  *status)
2447 {
2448           UCollationElements *coleiter   = strsrch->textIter;
2449           int32_t             textlength = strsrch->search->textLength;
2450           int32_t         temp       = *end;
2451     const UCollator          *collator   = strsrch->collator;
2452     const UChar              *text       = strsrch->search->text;
2453     // This part checks if either if the start of the match contains potential
2454     // contraction. If so we'll have to iterate through them
2455     // Since we used ucol_next while previously looking for the potential
2456     // match, this guarantees that our end will not be a partial contraction,
2457     // or a partial supplementary character.
2458     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2459         int32_t expansion  = getExpansionSuffix(coleiter);
2460         UBool   expandflag = expansion > 0;
2461         setColEIterOffset(coleiter, *end);
2462         while (expansion > 0) {
2463             // getting rid of the redundant ce
2464             // since forward contraction/expansion may have extra ces
2465             // if we are in the normalization buffer, hasAccentsBeforeMatch
2466             // would have taken care of it.
2467             // E.g. the character \u01FA will have an expansion of 3, but if
2468             // we are only looking for A ring A\u030A, we'll have to skip the
2469             // last ce in the expansion buffer
2470             ucol_previous(coleiter, status);
2471             if (U_FAILURE(*status)) {
2472                 return FALSE;
2473             }
2474             if (ucol_getOffset(coleiter) != temp) {
2475                 *end = temp;
2476                 temp  = ucol_getOffset(coleiter);
2477             }
2478             expansion --;
2479         }
2480 
2481         int32_t  *patternce       = strsrch->pattern.ces;
2482         int32_t   patterncelength = strsrch->pattern.cesLength;
2483         int32_t   count           = patterncelength;
2484         while (count > 0) {
2485             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2486             // status checked below, note that if status is a failure
2487             // ucol_previous returns UCOL_NULLORDER
2488             if (ce == UCOL_IGNORABLE) {
2489                 continue;
2490             }
2491             if (expandflag && count == 0 &&
2492                 getColElemIterOffset(coleiter, FALSE) != temp) {
2493                 *end = temp;
2494                 temp  = ucol_getOffset(coleiter);
2495             }
2496             if (count == patterncelength &&
2497                 ce != patternce[patterncelength - 1]) {
2498                 // accents may have extra starting ces, this occurs when a
2499                 // pure accent pattern is matched without rearrangement
2500                 int32_t    expected = patternce[patterncelength - 1];
2501                 U16_BACK_1(text, 0, *end);
2502                 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2503                     ce = getCE(strsrch, ucol_previous(coleiter, status));
2504                     while (U_SUCCESS(*status) && ce != expected &&
2505                            ce != UCOL_NULLORDER &&
2506                            ucol_getOffset(coleiter) <= *start) {
2507                         ce = getCE(strsrch, ucol_previous(coleiter, status));
2508                     }
2509                 }
2510             }
2511             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2512                 (*start) --;
2513                 *start = getPreviousBaseOffset(text, *start);
2514                 return FALSE;
2515             }
2516             count --;
2517         }
2518     }
2519     return TRUE;
2520 }
2521 
2522 /**
2523 * Checks and sets the match information if found.
2524 * Checks
2525 * <ul>
2526 * <li> the potential match does not repeat the previous match
2527 * <li> boundaries are correct
2528 * <li> potential match does not end in the middle of a contraction
2529 * <li> identical matches
2530 * <\ul>
2531 * Otherwise the offset will be shifted to the next character.
2532 * Internal method, status assumed to be success, caller has to check status
2533 * before calling this method.
2534 * @param strsrch string search data
2535 * @param textoffset offset in the collation element text. the returned value
2536 *        will be the truncated start offset of the match or the new start
2537 *        search offset.
2538 * @param status only error status if any
2539 * @return TRUE if the match is valid, FALSE otherwise
2540 */
2541 static
checkPreviousCanonicalMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)2542 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2543                                          int32_t   *textoffset,
2544                                          UErrorCode    *status)
2545 {
2546     // to ensure that the start and ends are not composite characters
2547     UCollationElements *coleiter = strsrch->textIter;
2548     // if we have a canonical accent match
2549     if ((strsrch->pattern.hasSuffixAccents &&
2550         strsrch->canonicalSuffixAccents[0]) ||
2551         (strsrch->pattern.hasPrefixAccents &&
2552         strsrch->canonicalPrefixAccents[0])) {
2553         strsrch->search->matchedIndex  = *textoffset;
2554         strsrch->search->matchedLength =
2555             getNextUStringSearchBaseOffset(strsrch,
2556                                       getColElemIterOffset(coleiter, FALSE))
2557             - *textoffset;
2558         return TRUE;
2559     }
2560 
2561     int32_t end = ucol_getOffset(coleiter);
2562     if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2563                                                 status) ||
2564          U_FAILURE(*status)) {
2565         return FALSE;
2566     }
2567 
2568     end = getNextUStringSearchBaseOffset(strsrch, end);
2569     // this totally matches, however we need to check if it is repeating
2570     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2571         !isBreakUnit(strsrch, *textoffset, end) ||
2572         !checkIdentical(strsrch, *textoffset, end)) {
2573         (*textoffset) --;
2574         *textoffset = getPreviousBaseOffset(strsrch->search->text,
2575                                             *textoffset);
2576         return FALSE;
2577     }
2578 
2579     strsrch->search->matchedIndex  = *textoffset;
2580     strsrch->search->matchedLength = end - *textoffset;
2581     return TRUE;
2582 }
2583 #endif // #if BOYER_MOORE
2584 
2585 // constructors and destructor -------------------------------------------
2586 
usearch_open(const UChar * pattern,int32_t patternlength,const UChar * text,int32_t textlength,const char * locale,UBreakIterator * breakiter,UErrorCode * status)2587 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2588                                           int32_t         patternlength,
2589                                     const UChar          *text,
2590                                           int32_t         textlength,
2591                                     const char           *locale,
2592                                           UBreakIterator *breakiter,
2593                                           UErrorCode     *status)
2594 {
2595     if (U_FAILURE(*status)) {
2596         return NULL;
2597     }
2598 #if UCONFIG_NO_BREAK_ITERATION
2599     if (breakiter != NULL) {
2600         *status = U_UNSUPPORTED_ERROR;
2601         return NULL;
2602     }
2603 #endif
2604     if (locale) {
2605         // ucol_open internally checks for status
2606         UCollator     *collator = ucol_open(locale, status);
2607         // pattern, text checks are done in usearch_openFromCollator
2608         UStringSearch *result   = usearch_openFromCollator(pattern,
2609                                               patternlength, text, textlength,
2610                                               collator, breakiter, status);
2611 
2612         if (result == NULL || U_FAILURE(*status)) {
2613             if (collator) {
2614                 ucol_close(collator);
2615             }
2616             return NULL;
2617         }
2618         else {
2619             result->ownCollator = TRUE;
2620         }
2621         return result;
2622     }
2623     *status = U_ILLEGAL_ARGUMENT_ERROR;
2624     return NULL;
2625 }
2626 
usearch_openFromCollator(const UChar * pattern,int32_t patternlength,const UChar * text,int32_t textlength,const UCollator * collator,UBreakIterator * breakiter,UErrorCode * status)2627 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2628                                   const UChar          *pattern,
2629                                         int32_t         patternlength,
2630                                   const UChar          *text,
2631                                         int32_t         textlength,
2632                                   const UCollator      *collator,
2633                                         UBreakIterator *breakiter,
2634                                         UErrorCode     *status)
2635 {
2636     if (U_FAILURE(*status)) {
2637         return NULL;
2638     }
2639 #if UCONFIG_NO_BREAK_ITERATION
2640     if (breakiter != NULL) {
2641         *status = U_UNSUPPORTED_ERROR;
2642         return NULL;
2643     }
2644 #endif
2645     if (pattern == NULL || text == NULL || collator == NULL) {
2646         *status = U_ILLEGAL_ARGUMENT_ERROR;
2647         return NULL;
2648     }
2649 
2650     // string search does not really work when numeric collation is turned on
2651     if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2652         *status = U_UNSUPPORTED_ERROR;
2653         return NULL;
2654     }
2655 
2656     if (U_SUCCESS(*status)) {
2657         initializeFCD(status);
2658         if (U_FAILURE(*status)) {
2659             return NULL;
2660         }
2661 
2662         UStringSearch *result;
2663         if (textlength == -1) {
2664             textlength = u_strlen(text);
2665         }
2666         if (patternlength == -1) {
2667             patternlength = u_strlen(pattern);
2668         }
2669         if (textlength <= 0 || patternlength <= 0) {
2670             *status = U_ILLEGAL_ARGUMENT_ERROR;
2671             return NULL;
2672         }
2673 
2674         result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2675         if (result == NULL) {
2676             *status = U_MEMORY_ALLOCATION_ERROR;
2677             return NULL;
2678         }
2679 
2680         result->collator    = collator;
2681         result->strength    = ucol_getStrength(collator);
2682         result->ceMask      = getMask(result->strength);
2683         result->toShift     =
2684              ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2685                                                             UCOL_SHIFTED;
2686         result->variableTop = ucol_getVariableTop(collator, status);
2687 
2688         result->nfd         = Normalizer2::getNFDInstance(*status);
2689 
2690         if (U_FAILURE(*status)) {
2691             uprv_free(result);
2692             return NULL;
2693         }
2694 
2695         result->search             = (USearch *)uprv_malloc(sizeof(USearch));
2696         if (result->search == NULL) {
2697             *status = U_MEMORY_ALLOCATION_ERROR;
2698             uprv_free(result);
2699             return NULL;
2700         }
2701 
2702         result->search->text       = text;
2703         result->search->textLength = textlength;
2704 
2705         result->pattern.text       = pattern;
2706         result->pattern.textLength = patternlength;
2707         result->pattern.ces         = NULL;
2708         result->pattern.pces        = NULL;
2709 
2710         result->search->breakIter  = breakiter;
2711 #if !UCONFIG_NO_BREAK_ITERATION
2712         result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
2713         if (breakiter) {
2714             ubrk_setText(breakiter, text, textlength, status);
2715         }
2716 #endif
2717 
2718         result->ownCollator           = FALSE;
2719         result->search->matchedLength = 0;
2720         result->search->matchedIndex  = USEARCH_DONE;
2721         result->utilIter              = NULL;
2722         result->textIter              = ucol_openElements(collator, text,
2723                                                           textlength, status);
2724         result->textProcessedIter     = NULL;
2725         if (U_FAILURE(*status)) {
2726             usearch_close(result);
2727             return NULL;
2728         }
2729 
2730         result->search->isOverlap          = FALSE;
2731         result->search->isCanonicalMatch   = FALSE;
2732         result->search->elementComparisonType = 0;
2733         result->search->isForwardSearching = TRUE;
2734         result->search->reset              = TRUE;
2735 
2736         initialize(result, status);
2737 
2738         if (U_FAILURE(*status)) {
2739             usearch_close(result);
2740             return NULL;
2741         }
2742 
2743         return result;
2744     }
2745     return NULL;
2746 }
2747 
usearch_close(UStringSearch * strsrch)2748 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2749 {
2750     if (strsrch) {
2751         if (strsrch->pattern.ces != strsrch->pattern.cesBuffer &&
2752             strsrch->pattern.ces) {
2753             uprv_free(strsrch->pattern.ces);
2754         }
2755 
2756         if (strsrch->pattern.pces != NULL &&
2757             strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
2758             uprv_free(strsrch->pattern.pces);
2759         }
2760 
2761         delete strsrch->textProcessedIter;
2762         ucol_closeElements(strsrch->textIter);
2763         ucol_closeElements(strsrch->utilIter);
2764 
2765         if (strsrch->ownCollator && strsrch->collator) {
2766             ucol_close((UCollator *)strsrch->collator);
2767         }
2768 
2769 #if !UCONFIG_NO_BREAK_ITERATION
2770         if (strsrch->search->internalBreakIter) {
2771             ubrk_close(strsrch->search->internalBreakIter);
2772         }
2773 #endif
2774 
2775         uprv_free(strsrch->search);
2776         uprv_free(strsrch);
2777     }
2778 }
2779 
2780 namespace {
2781 
initTextProcessedIter(UStringSearch * strsrch,UErrorCode * status)2782 UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
2783     if (U_FAILURE(*status)) { return FALSE; }
2784     if (strsrch->textProcessedIter == NULL) {
2785         strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
2786         if (strsrch->textProcessedIter == NULL) {
2787             *status = U_MEMORY_ALLOCATION_ERROR;
2788             return FALSE;
2789         }
2790     } else {
2791         strsrch->textProcessedIter->init(strsrch->textIter);
2792     }
2793     return TRUE;
2794 }
2795 
2796 }
2797 
2798 // set and get methods --------------------------------------------------
2799 
usearch_setOffset(UStringSearch * strsrch,int32_t position,UErrorCode * status)2800 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2801                                         int32_t    position,
2802                                         UErrorCode    *status)
2803 {
2804     if (U_SUCCESS(*status) && strsrch) {
2805         if (isOutOfBounds(strsrch->search->textLength, position)) {
2806             *status = U_INDEX_OUTOFBOUNDS_ERROR;
2807         }
2808         else {
2809             setColEIterOffset(strsrch->textIter, position);
2810         }
2811         strsrch->search->matchedIndex  = USEARCH_DONE;
2812         strsrch->search->matchedLength = 0;
2813         strsrch->search->reset         = FALSE;
2814     }
2815 }
2816 
usearch_getOffset(const UStringSearch * strsrch)2817 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2818 {
2819     if (strsrch) {
2820         int32_t result = ucol_getOffset(strsrch->textIter);
2821         if (isOutOfBounds(strsrch->search->textLength, result)) {
2822             return USEARCH_DONE;
2823         }
2824         return result;
2825     }
2826     return USEARCH_DONE;
2827 }
2828 
usearch_setAttribute(UStringSearch * strsrch,USearchAttribute attribute,USearchAttributeValue value,UErrorCode * status)2829 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2830                                  USearchAttribute attribute,
2831                                  USearchAttributeValue value,
2832                                  UErrorCode *status)
2833 {
2834     if (U_SUCCESS(*status) && strsrch) {
2835         switch (attribute)
2836         {
2837         case USEARCH_OVERLAP :
2838             strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2839             break;
2840         case USEARCH_CANONICAL_MATCH :
2841             strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
2842                                                                       FALSE);
2843             break;
2844         case USEARCH_ELEMENT_COMPARISON :
2845             if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2846                 strsrch->search->elementComparisonType = (int16_t)value;
2847             } else {
2848                 strsrch->search->elementComparisonType = 0;
2849             }
2850             break;
2851         case USEARCH_ATTRIBUTE_COUNT :
2852         default:
2853             *status = U_ILLEGAL_ARGUMENT_ERROR;
2854         }
2855     }
2856     if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2857         *status = U_ILLEGAL_ARGUMENT_ERROR;
2858     }
2859 }
2860 
usearch_getAttribute(const UStringSearch * strsrch,USearchAttribute attribute)2861 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2862                                                 const UStringSearch *strsrch,
2863                                                 USearchAttribute attribute)
2864 {
2865     if (strsrch) {
2866         switch (attribute) {
2867         case USEARCH_OVERLAP :
2868             return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
2869                                                         USEARCH_OFF);
2870         case USEARCH_CANONICAL_MATCH :
2871             return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
2872                                                                USEARCH_OFF);
2873         case USEARCH_ELEMENT_COMPARISON :
2874             {
2875                 int16_t value = strsrch->search->elementComparisonType;
2876                 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2877                     return (USearchAttributeValue)value;
2878                 } else {
2879                     return USEARCH_STANDARD_ELEMENT_COMPARISON;
2880                 }
2881             }
2882         case USEARCH_ATTRIBUTE_COUNT :
2883             return USEARCH_DEFAULT;
2884         }
2885     }
2886     return USEARCH_DEFAULT;
2887 }
2888 
usearch_getMatchedStart(const UStringSearch * strsrch)2889 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2890                                                 const UStringSearch *strsrch)
2891 {
2892     if (strsrch == NULL) {
2893         return USEARCH_DONE;
2894     }
2895     return strsrch->search->matchedIndex;
2896 }
2897 
2898 
usearch_getMatchedText(const UStringSearch * strsrch,UChar * result,int32_t resultCapacity,UErrorCode * status)2899 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2900                                             UChar         *result,
2901                                             int32_t        resultCapacity,
2902                                             UErrorCode    *status)
2903 {
2904     if (U_FAILURE(*status)) {
2905         return USEARCH_DONE;
2906     }
2907     if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
2908         result == NULL)) {
2909         *status = U_ILLEGAL_ARGUMENT_ERROR;
2910         return USEARCH_DONE;
2911     }
2912 
2913     int32_t     copylength = strsrch->search->matchedLength;
2914     int32_t copyindex  = strsrch->search->matchedIndex;
2915     if (copyindex == USEARCH_DONE) {
2916         u_terminateUChars(result, resultCapacity, 0, status);
2917         return USEARCH_DONE;
2918     }
2919 
2920     if (resultCapacity < copylength) {
2921         copylength = resultCapacity;
2922     }
2923     if (copylength > 0) {
2924         uprv_memcpy(result, strsrch->search->text + copyindex,
2925                     copylength * sizeof(UChar));
2926     }
2927     return u_terminateUChars(result, resultCapacity,
2928                              strsrch->search->matchedLength, status);
2929 }
2930 
usearch_getMatchedLength(const UStringSearch * strsrch)2931 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2932                                               const UStringSearch *strsrch)
2933 {
2934     if (strsrch) {
2935         return strsrch->search->matchedLength;
2936     }
2937     return USEARCH_DONE;
2938 }
2939 
2940 #if !UCONFIG_NO_BREAK_ITERATION
2941 
usearch_setBreakIterator(UStringSearch * strsrch,UBreakIterator * breakiter,UErrorCode * status)2942 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,
2943                                                UBreakIterator *breakiter,
2944                                                UErrorCode     *status)
2945 {
2946     if (U_SUCCESS(*status) && strsrch) {
2947         strsrch->search->breakIter = breakiter;
2948         if (breakiter) {
2949             ubrk_setText(breakiter, strsrch->search->text,
2950                          strsrch->search->textLength, status);
2951         }
2952     }
2953 }
2954 
2955 U_CAPI const UBreakIterator* U_EXPORT2
usearch_getBreakIterator(const UStringSearch * strsrch)2956 usearch_getBreakIterator(const UStringSearch *strsrch)
2957 {
2958     if (strsrch) {
2959         return strsrch->search->breakIter;
2960     }
2961     return NULL;
2962 }
2963 
2964 #endif
2965 
usearch_setText(UStringSearch * strsrch,const UChar * text,int32_t textlength,UErrorCode * status)2966 U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,
2967                                       const UChar         *text,
2968                                             int32_t        textlength,
2969                                             UErrorCode    *status)
2970 {
2971     if (U_SUCCESS(*status)) {
2972         if (strsrch == NULL || text == NULL || textlength < -1 ||
2973             textlength == 0) {
2974             *status = U_ILLEGAL_ARGUMENT_ERROR;
2975         }
2976         else {
2977             if (textlength == -1) {
2978                 textlength = u_strlen(text);
2979             }
2980             strsrch->search->text       = text;
2981             strsrch->search->textLength = textlength;
2982             ucol_setText(strsrch->textIter, text, textlength, status);
2983             strsrch->search->matchedIndex  = USEARCH_DONE;
2984             strsrch->search->matchedLength = 0;
2985             strsrch->search->reset         = TRUE;
2986 #if !UCONFIG_NO_BREAK_ITERATION
2987             if (strsrch->search->breakIter != NULL) {
2988                 ubrk_setText(strsrch->search->breakIter, text,
2989                              textlength, status);
2990             }
2991             ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
2992 #endif
2993         }
2994     }
2995 }
2996 
usearch_getText(const UStringSearch * strsrch,int32_t * length)2997 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
2998                                                      int32_t       *length)
2999 {
3000     if (strsrch) {
3001         *length = strsrch->search->textLength;
3002         return strsrch->search->text;
3003     }
3004     return NULL;
3005 }
3006 
usearch_setCollator(UStringSearch * strsrch,const UCollator * collator,UErrorCode * status)3007 U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,
3008                                           const UCollator     *collator,
3009                                                 UErrorCode    *status)
3010 {
3011     if (U_SUCCESS(*status)) {
3012         if (collator == NULL) {
3013             *status = U_ILLEGAL_ARGUMENT_ERROR;
3014             return;
3015         }
3016 
3017         if (strsrch) {
3018             delete strsrch->textProcessedIter;
3019             strsrch->textProcessedIter = NULL;
3020             ucol_closeElements(strsrch->textIter);
3021             ucol_closeElements(strsrch->utilIter);
3022             strsrch->textIter = strsrch->utilIter = NULL;
3023             if (strsrch->ownCollator && (strsrch->collator != collator)) {
3024                 ucol_close((UCollator *)strsrch->collator);
3025                 strsrch->ownCollator = FALSE;
3026             }
3027             strsrch->collator    = collator;
3028             strsrch->strength    = ucol_getStrength(collator);
3029             strsrch->ceMask      = getMask(strsrch->strength);
3030 #if !UCONFIG_NO_BREAK_ITERATION
3031             ubrk_close(strsrch->search->internalBreakIter);
3032             strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
3033                                                      strsrch->search->text, strsrch->search->textLength, status);
3034 #endif
3035             // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3036             strsrch->toShift     =
3037                ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
3038                                                                 UCOL_SHIFTED;
3039             // if status is a failure, ucol_getVariableTop returns 0
3040             strsrch->variableTop = ucol_getVariableTop(collator, status);
3041             strsrch->textIter = ucol_openElements(collator,
3042                                       strsrch->search->text,
3043                                       strsrch->search->textLength,
3044                                       status);
3045             strsrch->utilIter = ucol_openElements(
3046                     collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
3047             // initialize() _after_ setting the iterators for the new collator.
3048             initialize(strsrch, status);
3049         }
3050 
3051         // **** are these calls needed?
3052         // **** we call uprv_init_pce in initializePatternPCETable
3053         // **** and the CEIBuffer constructor...
3054 #if 0
3055         uprv_init_pce(strsrch->textIter);
3056         uprv_init_pce(strsrch->utilIter);
3057 #endif
3058     }
3059 }
3060 
usearch_getCollator(const UStringSearch * strsrch)3061 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
3062 {
3063     if (strsrch) {
3064         return (UCollator *)strsrch->collator;
3065     }
3066     return NULL;
3067 }
3068 
usearch_setPattern(UStringSearch * strsrch,const UChar * pattern,int32_t patternlength,UErrorCode * status)3069 U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,
3070                                          const UChar         *pattern,
3071                                                int32_t        patternlength,
3072                                                UErrorCode    *status)
3073 {
3074     if (U_SUCCESS(*status)) {
3075         if (strsrch == NULL || pattern == NULL) {
3076             *status = U_ILLEGAL_ARGUMENT_ERROR;
3077         }
3078         else {
3079             if (patternlength == -1) {
3080                 patternlength = u_strlen(pattern);
3081             }
3082             if (patternlength == 0) {
3083                 *status = U_ILLEGAL_ARGUMENT_ERROR;
3084                 return;
3085             }
3086             strsrch->pattern.text       = pattern;
3087             strsrch->pattern.textLength = patternlength;
3088             initialize(strsrch, status);
3089         }
3090     }
3091 }
3092 
3093 U_CAPI const UChar* U_EXPORT2
usearch_getPattern(const UStringSearch * strsrch,int32_t * length)3094 usearch_getPattern(const UStringSearch *strsrch,
3095                    int32_t       *length)
3096 {
3097     if (strsrch) {
3098         *length = strsrch->pattern.textLength;
3099         return strsrch->pattern.text;
3100     }
3101     return NULL;
3102 }
3103 
3104 // miscellanous methods --------------------------------------------------
3105 
usearch_first(UStringSearch * strsrch,UErrorCode * status)3106 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
3107                                            UErrorCode    *status)
3108 {
3109     if (strsrch && U_SUCCESS(*status)) {
3110         strsrch->search->isForwardSearching = TRUE;
3111         usearch_setOffset(strsrch, 0, status);
3112         if (U_SUCCESS(*status)) {
3113             return usearch_next(strsrch, status);
3114         }
3115     }
3116     return USEARCH_DONE;
3117 }
3118 
usearch_following(UStringSearch * strsrch,int32_t position,UErrorCode * status)3119 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
3120                                                int32_t    position,
3121                                                UErrorCode    *status)
3122 {
3123     if (strsrch && U_SUCCESS(*status)) {
3124         strsrch->search->isForwardSearching = TRUE;
3125         // position checked in usearch_setOffset
3126         usearch_setOffset(strsrch, position, status);
3127         if (U_SUCCESS(*status)) {
3128             return usearch_next(strsrch, status);
3129         }
3130     }
3131     return USEARCH_DONE;
3132 }
3133 
usearch_last(UStringSearch * strsrch,UErrorCode * status)3134 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
3135                                           UErrorCode    *status)
3136 {
3137     if (strsrch && U_SUCCESS(*status)) {
3138         strsrch->search->isForwardSearching = FALSE;
3139         usearch_setOffset(strsrch, strsrch->search->textLength, status);
3140         if (U_SUCCESS(*status)) {
3141             return usearch_previous(strsrch, status);
3142         }
3143     }
3144     return USEARCH_DONE;
3145 }
3146 
usearch_preceding(UStringSearch * strsrch,int32_t position,UErrorCode * status)3147 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
3148                                                int32_t    position,
3149                                                UErrorCode    *status)
3150 {
3151     if (strsrch && U_SUCCESS(*status)) {
3152         strsrch->search->isForwardSearching = FALSE;
3153         // position checked in usearch_setOffset
3154         usearch_setOffset(strsrch, position, status);
3155         if (U_SUCCESS(*status)) {
3156             return usearch_previous(strsrch, status);
3157         }
3158     }
3159     return USEARCH_DONE;
3160 }
3161 
3162 /**
3163 * If a direction switch is required, we'll count the number of ces till the
3164 * beginning of the collation element iterator and iterate forwards that
3165 * number of times. This is so that we get to the correct point within the
3166 * string to continue the search in. Imagine when we are in the middle of the
3167 * normalization buffer when the change in direction is request. arrrgghh....
3168 * After searching the offset within the collation element iterator will be
3169 * shifted to the start of the match. If a match is not found, the offset would
3170 * have been set to the end of the text string in the collation element
3171 * iterator.
3172 * Okay, here's my take on normalization buffer. The only time when there can
3173 * be 2 matches within the same normalization is when the pattern is consists
3174 * of all accents. But since the offset returned is from the text string, we
3175 * should not confuse the caller by returning the second match within the
3176 * same normalization buffer. If we do, the 2 results will have the same match
3177 * offsets, and that'll be confusing. I'll return the next match that doesn't
3178 * fall within the same normalization buffer. Note this does not affect the
3179 * results of matches spanning the text and the normalization buffer.
3180 * The position to start searching is taken from the collation element
3181 * iterator. Callers of this API would have to set the offset in the collation
3182 * element iterator before using this method.
3183 */
usearch_next(UStringSearch * strsrch,UErrorCode * status)3184 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
3185                                           UErrorCode    *status)
3186 {
3187     if (U_SUCCESS(*status) && strsrch) {
3188         // note offset is either equivalent to the start of the previous match
3189         // or is set by the user
3190         int32_t      offset       = usearch_getOffset(strsrch);
3191         USearch     *search       = strsrch->search;
3192         search->reset             = FALSE;
3193         int32_t      textlength   = search->textLength;
3194         if (search->isForwardSearching) {
3195 #if BOYER_MOORE
3196             if (offset == textlength
3197                 || (!search->isOverlap &&
3198                     (offset + strsrch->pattern.defaultShiftSize > textlength ||
3199                     (search->matchedIndex != USEARCH_DONE &&
3200                      offset + search->matchedLength >= textlength)))) {
3201                 // not enough characters to match
3202                 setMatchNotFound(strsrch);
3203                 return USEARCH_DONE;
3204             }
3205 #else
3206             if (offset == textlength ||
3207                 (! search->isOverlap &&
3208                 (search->matchedIndex != USEARCH_DONE &&
3209                 offset + search->matchedLength > textlength))) {
3210                     // not enough characters to match
3211                     setMatchNotFound(strsrch);
3212                     return USEARCH_DONE;
3213             }
3214 #endif
3215         }
3216         else {
3217             // switching direction.
3218             // if matchedIndex == USEARCH_DONE, it means that either a
3219             // setOffset has been called or that previous ran off the text
3220             // string. the iterator would have been set to offset 0 if a
3221             // match is not found.
3222             search->isForwardSearching = TRUE;
3223             if (search->matchedIndex != USEARCH_DONE) {
3224                 // there's no need to set the collation element iterator
3225                 // the next call to next will set the offset.
3226                 return search->matchedIndex;
3227             }
3228         }
3229 
3230         if (U_SUCCESS(*status)) {
3231             if (strsrch->pattern.cesLength == 0) {
3232                 if (search->matchedIndex == USEARCH_DONE) {
3233                     search->matchedIndex = offset;
3234                 }
3235                 else { // moves by codepoints
3236                     U16_FWD_1(search->text, search->matchedIndex, textlength);
3237                 }
3238 
3239                 search->matchedLength = 0;
3240                 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3241                 // status checked below
3242                 if (search->matchedIndex == textlength) {
3243                     search->matchedIndex = USEARCH_DONE;
3244                 }
3245             }
3246             else {
3247                 if (search->matchedLength > 0) {
3248                     // if matchlength is 0 we are at the start of the iteration
3249                     if (search->isOverlap) {
3250                         ucol_setOffset(strsrch->textIter, offset + 1, status);
3251                     }
3252                     else {
3253                         ucol_setOffset(strsrch->textIter,
3254                                        offset + search->matchedLength, status);
3255                     }
3256                 }
3257                 else {
3258                     // for boundary check purposes. this will ensure that the
3259                     // next match will not preceed the current offset
3260                     // note search->matchedIndex will always be set to something
3261                     // in the code
3262                     search->matchedIndex = offset - 1;
3263                 }
3264 
3265                 if (search->isCanonicalMatch) {
3266                     // can't use exact here since extra accents are allowed.
3267                     usearch_handleNextCanonical(strsrch, status);
3268                 }
3269                 else {
3270                     usearch_handleNextExact(strsrch, status);
3271                 }
3272             }
3273 
3274             if (U_FAILURE(*status)) {
3275                 return USEARCH_DONE;
3276             }
3277 
3278 #if !BOYER_MOORE
3279             if (search->matchedIndex == USEARCH_DONE) {
3280                 ucol_setOffset(strsrch->textIter, search->textLength, status);
3281             } else {
3282                 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
3283             }
3284 #endif
3285 
3286             return search->matchedIndex;
3287         }
3288     }
3289     return USEARCH_DONE;
3290 }
3291 
usearch_previous(UStringSearch * strsrch,UErrorCode * status)3292 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3293                                               UErrorCode *status)
3294 {
3295     if (U_SUCCESS(*status) && strsrch) {
3296         int32_t offset;
3297         USearch *search = strsrch->search;
3298         if (search->reset) {
3299             offset                     = search->textLength;
3300             search->isForwardSearching = FALSE;
3301             search->reset              = FALSE;
3302             setColEIterOffset(strsrch->textIter, offset);
3303         }
3304         else {
3305             offset = usearch_getOffset(strsrch);
3306         }
3307 
3308         int32_t matchedindex = search->matchedIndex;
3309         if (search->isForwardSearching == TRUE) {
3310             // switching direction.
3311             // if matchedIndex == USEARCH_DONE, it means that either a
3312             // setOffset has been called or that next ran off the text
3313             // string. the iterator would have been set to offset textLength if
3314             // a match is not found.
3315             search->isForwardSearching = FALSE;
3316             if (matchedindex != USEARCH_DONE) {
3317                 return matchedindex;
3318             }
3319         }
3320         else {
3321 #if BOYER_MOORE
3322             if (offset == 0 || matchedindex == 0 ||
3323                 (!search->isOverlap &&
3324                     (offset < strsrch->pattern.defaultShiftSize ||
3325                     (matchedindex != USEARCH_DONE &&
3326                     matchedindex < strsrch->pattern.defaultShiftSize)))) {
3327                 // not enough characters to match
3328                 setMatchNotFound(strsrch);
3329                 return USEARCH_DONE;
3330             }
3331 #else
3332             // Could check pattern length, but the
3333             // linear search will do the right thing
3334             if (offset == 0 || matchedindex == 0) {
3335                 setMatchNotFound(strsrch);
3336                 return USEARCH_DONE;
3337             }
3338 #endif
3339         }
3340 
3341         if (U_SUCCESS(*status)) {
3342             if (strsrch->pattern.cesLength == 0) {
3343                 search->matchedIndex =
3344                       (matchedindex == USEARCH_DONE ? offset : matchedindex);
3345                 if (search->matchedIndex == 0) {
3346                     setMatchNotFound(strsrch);
3347                     // status checked below
3348                 }
3349                 else { // move by codepoints
3350                     U16_BACK_1(search->text, 0, search->matchedIndex);
3351                     setColEIterOffset(strsrch->textIter, search->matchedIndex);
3352                     // status checked below
3353                     search->matchedLength = 0;
3354                 }
3355             }
3356             else {
3357                 if (strsrch->search->isCanonicalMatch) {
3358                     // can't use exact here since extra accents are allowed.
3359                     usearch_handlePreviousCanonical(strsrch, status);
3360                     // status checked below
3361                 }
3362                 else {
3363                     usearch_handlePreviousExact(strsrch, status);
3364                     // status checked below
3365                 }
3366             }
3367 
3368             if (U_FAILURE(*status)) {
3369                 return USEARCH_DONE;
3370             }
3371 
3372             return search->matchedIndex;
3373         }
3374     }
3375     return USEARCH_DONE;
3376 }
3377 
3378 
3379 
usearch_reset(UStringSearch * strsrch)3380 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3381 {
3382     /*
3383     reset is setting the attributes that are already in
3384     string search, hence all attributes in the collator should
3385     be retrieved without any problems
3386     */
3387     if (strsrch) {
3388         UErrorCode status            = U_ZERO_ERROR;
3389         UBool      sameCollAttribute = TRUE;
3390         uint32_t   ceMask;
3391         UBool      shift;
3392         uint32_t   varTop;
3393 
3394         // **** hack to deal w/ how processed CEs encode quaternary ****
3395         UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
3396         if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
3397             (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
3398                 sameCollAttribute = FALSE;
3399         }
3400 
3401         strsrch->strength    = ucol_getStrength(strsrch->collator);
3402         ceMask = getMask(strsrch->strength);
3403         if (strsrch->ceMask != ceMask) {
3404             strsrch->ceMask = ceMask;
3405             sameCollAttribute = FALSE;
3406         }
3407 
3408         // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3409         shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3410                                   &status) == UCOL_SHIFTED;
3411         if (strsrch->toShift != shift) {
3412             strsrch->toShift  = shift;
3413             sameCollAttribute = FALSE;
3414         }
3415 
3416         // if status is a failure, ucol_getVariableTop returns 0
3417         varTop = ucol_getVariableTop(strsrch->collator, &status);
3418         if (strsrch->variableTop != varTop) {
3419             strsrch->variableTop = varTop;
3420             sameCollAttribute    = FALSE;
3421         }
3422         if (!sameCollAttribute) {
3423             initialize(strsrch, &status);
3424         }
3425         ucol_setText(strsrch->textIter, strsrch->search->text,
3426                               strsrch->search->textLength,
3427                               &status);
3428         strsrch->search->matchedLength      = 0;
3429         strsrch->search->matchedIndex       = USEARCH_DONE;
3430         strsrch->search->isOverlap          = FALSE;
3431         strsrch->search->isCanonicalMatch   = FALSE;
3432         strsrch->search->elementComparisonType = 0;
3433         strsrch->search->isForwardSearching = TRUE;
3434         strsrch->search->reset              = TRUE;
3435     }
3436 }
3437 
3438 //
3439 //  CEI  Collation Element + source text index.
3440 //       These structs are kept in the circular buffer.
3441 //
3442 struct  CEI {
3443     int64_t ce;
3444     int32_t lowIndex;
3445     int32_t highIndex;
3446 };
3447 
3448 U_NAMESPACE_BEGIN
3449 
3450 namespace {
3451 //
3452 //  CEIBuffer   A circular buffer of CEs-with-index from the text being searched.
3453 //
3454 #define   DEFAULT_CEBUFFER_SIZE 96
3455 #define   CEBUFFER_EXTRA 32
3456 // Some typical max values to make buffer size more reasonable for asymmetric search.
3457 // #8694 is for a better long-term solution to allocation of this buffer.
3458 #define   MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3459 #define   MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3460 #define   MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
3461 struct CEIBuffer {
3462     CEI                  defBuf[DEFAULT_CEBUFFER_SIZE];
3463     CEI                 *buf;
3464     int32_t              bufSize;
3465     int32_t              firstIx;
3466     int32_t              limitIx;
3467     UCollationElements  *ceIter;
3468     UStringSearch       *strSearch;
3469 
3470 
3471 
3472                CEIBuffer(UStringSearch *ss, UErrorCode *status);
3473                ~CEIBuffer();
3474    const CEI   *get(int32_t index);
3475    const CEI   *getPrevious(int32_t index);
3476 };
3477 
3478 
CEIBuffer(UStringSearch * ss,UErrorCode * status)3479 CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) {
3480     buf = defBuf;
3481     strSearch = ss;
3482     bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA;
3483     if (ss->search->elementComparisonType != 0) {
3484         const UChar * patText = ss->pattern.text;
3485         if (patText) {
3486             const UChar * patTextLimit = patText + ss->pattern.textLength;
3487             while ( patText < patTextLimit ) {
3488                 UChar c = *patText++;
3489                 if (MIGHT_BE_JAMO_L(c)) {
3490                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
3491                 } else {
3492                     // No check for surrogates, we might allocate slightly more buffer than necessary.
3493                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
3494                 }
3495             }
3496         }
3497     }
3498     ceIter    = ss->textIter;
3499     firstIx = 0;
3500     limitIx = 0;
3501 
3502     if (!initTextProcessedIter(ss, status)) { return; }
3503 
3504     if (bufSize>DEFAULT_CEBUFFER_SIZE) {
3505         buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
3506         if (buf == NULL) {
3507             *status = U_MEMORY_ALLOCATION_ERROR;
3508         }
3509     }
3510 }
3511 
3512 // TODO: add a reset or init function so that allocated
3513 //       buffers can be retained & reused.
3514 
~CEIBuffer()3515 CEIBuffer::~CEIBuffer() {
3516     if (buf != defBuf) {
3517         uprv_free(buf);
3518     }
3519 }
3520 
3521 
3522 // Get the CE with the specified index.
3523 //   Index must be in the range
3524 //          n-history_size < index < n+1
3525 //   where n is the largest index to have been fetched by some previous call to this function.
3526 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3527 //
get(int32_t index)3528 const CEI *CEIBuffer::get(int32_t index) {
3529     int i = index % bufSize;
3530 
3531     if (index>=firstIx && index<limitIx) {
3532         // The request was for an entry already in our buffer.
3533         //  Just return it.
3534         return &buf[i];
3535     }
3536 
3537     // Caller is requesting a new, never accessed before, CE.
3538     //   Verify that it is the next one in sequence, which is all
3539     //   that is allowed.
3540     if (index != limitIx) {
3541         U_ASSERT(FALSE);
3542 
3543         return NULL;
3544     }
3545 
3546     // Manage the circular CE buffer indexing
3547     limitIx++;
3548 
3549     if (limitIx - firstIx >= bufSize) {
3550         // The buffer is full, knock out the lowest-indexed entry.
3551         firstIx++;
3552     }
3553 
3554     UErrorCode status = U_ZERO_ERROR;
3555 
3556     buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3557 
3558     return &buf[i];
3559 }
3560 
3561 // Get the CE with the specified index.
3562 //   Index must be in the range
3563 //          n-history_size < index < n+1
3564 //   where n is the largest index to have been fetched by some previous call to this function.
3565 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3566 //
getPrevious(int32_t index)3567 const CEI *CEIBuffer::getPrevious(int32_t index) {
3568     int i = index % bufSize;
3569 
3570     if (index>=firstIx && index<limitIx) {
3571         // The request was for an entry already in our buffer.
3572         //  Just return it.
3573         return &buf[i];
3574     }
3575 
3576     // Caller is requesting a new, never accessed before, CE.
3577     //   Verify that it is the next one in sequence, which is all
3578     //   that is allowed.
3579     if (index != limitIx) {
3580         U_ASSERT(FALSE);
3581 
3582         return NULL;
3583     }
3584 
3585     // Manage the circular CE buffer indexing
3586     limitIx++;
3587 
3588     if (limitIx - firstIx >= bufSize) {
3589         // The buffer is full, knock out the lowest-indexed entry.
3590         firstIx++;
3591     }
3592 
3593     UErrorCode status = U_ZERO_ERROR;
3594 
3595     buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3596 
3597     return &buf[i];
3598 }
3599 
3600 }
3601 
3602 U_NAMESPACE_END
3603 
3604 
3605 // #define USEARCH_DEBUG
3606 
3607 #ifdef USEARCH_DEBUG
3608 #include <stdio.h>
3609 #include <stdlib.h>
3610 #endif
3611 
3612 /*
3613  * Find the next break boundary after startIndex. If the UStringSearch object
3614  * has an external break iterator, use that. Otherwise use the internal character
3615  * break iterator.
3616  */
nextBoundaryAfter(UStringSearch * strsrch,int32_t startIndex)3617 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
3618 #if 0
3619     const UChar *text = strsrch->search->text;
3620     int32_t textLen   = strsrch->search->textLength;
3621 
3622     U_ASSERT(startIndex>=0);
3623     U_ASSERT(startIndex<=textLen);
3624 
3625     if (startIndex >= textLen) {
3626         return startIndex;
3627     }
3628 
3629     UChar32  c;
3630     int32_t  i = startIndex;
3631     U16_NEXT(text, i, textLen, c);
3632 
3633     // If we are on a control character, stop without looking for combining marks.
3634     //    Control characters do not combine.
3635     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3636     if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
3637         return i;
3638     }
3639 
3640     // The initial character was not a control, and can thus accept trailing
3641     //   combining characters.  Advance over however many of them there are.
3642     int32_t  indexOfLastCharChecked;
3643     for (;;) {
3644         indexOfLastCharChecked = i;
3645         if (i>=textLen) {
3646             break;
3647         }
3648         U16_NEXT(text, i, textLen, c);
3649         gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3650         if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3651             break;
3652         }
3653     }
3654     return indexOfLastCharChecked;
3655 #elif !UCONFIG_NO_BREAK_ITERATION
3656     UBreakIterator *breakiterator = strsrch->search->breakIter;
3657 
3658     if (breakiterator == NULL) {
3659         breakiterator = strsrch->search->internalBreakIter;
3660     }
3661 
3662     if (breakiterator != NULL) {
3663         return ubrk_following(breakiterator, startIndex);
3664     }
3665 
3666     return startIndex;
3667 #else
3668     // **** or should we use the original code? ****
3669     return startIndex;
3670 #endif
3671 
3672 }
3673 
3674 /*
3675  * Returns TRUE if index is on a break boundary. If the UStringSearch
3676  * has an external break iterator, test using that, otherwise test
3677  * using the internal character break iterator.
3678  */
isBreakBoundary(UStringSearch * strsrch,int32_t index)3679 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
3680 #if 0
3681     const UChar *text = strsrch->search->text;
3682     int32_t textLen   = strsrch->search->textLength;
3683 
3684     U_ASSERT(index>=0);
3685     U_ASSERT(index<=textLen);
3686 
3687     if (index>=textLen || index<=0) {
3688         return TRUE;
3689     }
3690 
3691     // If the character at the current index is not a GRAPHEME_EXTEND
3692     //    then we can not be within a combining sequence.
3693     UChar32  c;
3694     U16_GET(text, 0, index, textLen, c);
3695     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3696     if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3697         return TRUE;
3698     }
3699 
3700     // We are at a combining mark.  If the preceding character is anything
3701     //   except a CONTROL, CR or LF, we are in a combining sequence.
3702     U16_PREV(text, 0, index, c);
3703     gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3704     UBool combining =  !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
3705     return !combining;
3706 #elif !UCONFIG_NO_BREAK_ITERATION
3707     UBreakIterator *breakiterator = strsrch->search->breakIter;
3708 
3709     if (breakiterator == NULL) {
3710         breakiterator = strsrch->search->internalBreakIter;
3711     }
3712 
3713     return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
3714 #else
3715     // **** or use the original code? ****
3716     return TRUE;
3717 #endif
3718 }
3719 
3720 #if 0
3721 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
3722 {
3723 #if !UCONFIG_NO_BREAK_ITERATION
3724     UBreakIterator *breakiterator = strsrch->search->breakIter;
3725 
3726     if (breakiterator != NULL) {
3727         int32_t startindex = ubrk_first(breakiterator);
3728         int32_t endindex   = ubrk_last(breakiterator);
3729 
3730         // out-of-range indexes are never boundary positions
3731         if (start < startindex || start > endindex ||
3732             end < startindex || end > endindex) {
3733             return FALSE;
3734         }
3735 
3736         return ubrk_isBoundary(breakiterator, start) &&
3737                ubrk_isBoundary(breakiterator, end);
3738     }
3739 #endif
3740 
3741     return TRUE;
3742 }
3743 #endif
3744 
3745 typedef enum {
3746     U_CE_MATCH = -1,
3747     U_CE_NO_MATCH = 0,
3748     U_CE_SKIP_TARG,
3749     U_CE_SKIP_PATN
3750 } UCompareCEsResult;
3751 #define U_CE_LEVEL2_BASE 0x00000005
3752 #define U_CE_LEVEL3_BASE 0x00050000
3753 
compareCE64s(int64_t targCE,int64_t patCE,int16_t compareType)3754 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
3755     if (targCE == patCE) {
3756         return U_CE_MATCH;
3757     }
3758     if (compareType == 0) {
3759         return U_CE_NO_MATCH;
3760     }
3761 
3762     int64_t targCEshifted = targCE >> 32;
3763     int64_t patCEshifted = patCE >> 32;
3764     int64_t mask;
3765 
3766     mask = 0xFFFF0000;
3767     int32_t targLev1 = (int32_t)(targCEshifted & mask);
3768     int32_t patLev1 = (int32_t)(patCEshifted & mask);
3769     if ( targLev1 != patLev1 ) {
3770         if ( targLev1 == 0 ) {
3771             return U_CE_SKIP_TARG;
3772         }
3773         if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3774             return U_CE_SKIP_PATN;
3775         }
3776         return U_CE_NO_MATCH;
3777     }
3778 
3779     mask = 0x0000FFFF;
3780     int32_t targLev2 = (int32_t)(targCEshifted & mask);
3781     int32_t patLev2 = (int32_t)(patCEshifted & mask);
3782     if ( targLev2 != patLev2 ) {
3783         if ( targLev2 == 0 ) {
3784             return U_CE_SKIP_TARG;
3785         }
3786         if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3787             return U_CE_SKIP_PATN;
3788         }
3789         return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
3790             U_CE_MATCH: U_CE_NO_MATCH;
3791     }
3792 
3793     mask = 0xFFFF0000;
3794     int32_t targLev3 = (int32_t)(targCE & mask);
3795     int32_t patLev3 = (int32_t)(patCE & mask);
3796     if ( targLev3 != patLev3 ) {
3797         return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
3798             U_CE_MATCH: U_CE_NO_MATCH;
3799    }
3800 
3801     return U_CE_MATCH;
3802 }
3803 
3804 #if BOYER_MOORE
3805 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3806 #endif
3807 
usearch_search(UStringSearch * strsrch,int32_t startIdx,int32_t * matchStart,int32_t * matchLimit,UErrorCode * status)3808 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
3809                                        int32_t        startIdx,
3810                                        int32_t        *matchStart,
3811                                        int32_t        *matchLimit,
3812                                        UErrorCode     *status)
3813 {
3814     if (U_FAILURE(*status)) {
3815         return FALSE;
3816     }
3817 
3818     // TODO:  reject search patterns beginning with a combining char.
3819 
3820 #ifdef USEARCH_DEBUG
3821     if (getenv("USEARCH_DEBUG") != NULL) {
3822         printf("Pattern CEs\n");
3823         for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
3824             printf(" %8x", strsrch->pattern.ces[ii]);
3825         }
3826         printf("\n");
3827     }
3828 
3829 #endif
3830     // Input parameter sanity check.
3831     //  TODO:  should input indicies clip to the text length
3832     //         in the same way that UText does.
3833     if(strsrch->pattern.cesLength == 0         ||
3834        startIdx < 0                           ||
3835        startIdx > strsrch->search->textLength ||
3836        strsrch->pattern.ces == NULL) {
3837            *status = U_ILLEGAL_ARGUMENT_ERROR;
3838            return FALSE;
3839     }
3840 
3841     if (strsrch->pattern.pces == NULL) {
3842         initializePatternPCETable(strsrch, status);
3843     }
3844 
3845     ucol_setOffset(strsrch->textIter, startIdx, status);
3846     CEIBuffer ceb(strsrch, status);
3847 
3848 
3849     int32_t    targetIx = 0;
3850     const CEI *targetCEI = NULL;
3851     int32_t    patIx;
3852     UBool      found;
3853 
3854     int32_t  mStart = -1;
3855     int32_t  mLimit = -1;
3856     int32_t  minLimit;
3857     int32_t  maxLimit;
3858 
3859 
3860 
3861     // Outer loop moves over match starting positions in the
3862     //      target CE space.
3863     // Here we see the target as a sequence of collation elements, resulting from the following:
3864     // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3865     //    (for example, digraphs such as IJ may be broken into two characters).
3866     // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3867     //    16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3868     //    fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3869     //    CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3870     //    then the CE is deleted, so the following code sees only CEs that are relevant.
3871     // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3872     // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3873     // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3874     //
3875     for(targetIx=0; ; targetIx++)
3876     {
3877         found = TRUE;
3878         //  Inner loop checks for a match beginning at each
3879         //  position from the outer loop.
3880         int32_t targetIxOffset = 0;
3881         int64_t patCE = 0;
3882         // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3883         // (compared to the last CE fetched for the previous targetIx value) as we need to go
3884         // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3885         const CEI *firstCEI = ceb.get(targetIx);
3886         if (firstCEI == NULL) {
3887             *status = U_INTERNAL_PROGRAM_ERROR;
3888             found = FALSE;
3889             break;
3890         }
3891 
3892         for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) {
3893             patCE = strsrch->pattern.pces[patIx];
3894             targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
3895             //  Compare CE from target string with CE from the pattern.
3896             //    Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
3897             //    which will fail the compare, below.
3898             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
3899             if ( ceMatch == U_CE_NO_MATCH ) {
3900                 found = FALSE;
3901                 break;
3902             } else if ( ceMatch > U_CE_NO_MATCH ) {
3903                 if ( ceMatch == U_CE_SKIP_TARG ) {
3904                     // redo with same patCE, next targCE
3905                     patIx--;
3906                     targetIxOffset++;
3907                 } else { // ceMatch == U_CE_SKIP_PATN
3908                     // redo with same targCE, next patCE
3909                     targetIxOffset--;
3910                 }
3911             }
3912         }
3913         targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far
3914 
3915         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
3916             // No match at this targetIx.  Try again at the next.
3917             continue;
3918         }
3919 
3920         if (!found) {
3921             // No match at all, we have run off the end of the target text.
3922             break;
3923         }
3924 
3925 
3926         // We have found a match in CE space.
3927         // Now determine the bounds in string index space.
3928         //  There still is a chance of match failure if the CE range not correspond to
3929         //     an acceptable character range.
3930         //
3931         const CEI *lastCEI  = ceb.get(targetIx + targetIxOffset - 1);
3932 
3933         mStart   = firstCEI->lowIndex;
3934         minLimit = lastCEI->lowIndex;
3935 
3936         // Look at the CE following the match.  If it is UCOL_NULLORDER the match
3937         //   extended to the end of input, and the match is good.
3938 
3939         // Look at the high and low indices of the CE following the match. If
3940         // they are the same it means one of two things:
3941         //    1. The match extended to the last CE from the target text, which is OK, or
3942         //    2. The last CE that was part of the match is in an expansion that extends
3943         //       to the first CE after the match. In this case, we reject the match.
3944         const CEI *nextCEI = 0;
3945         if (strsrch->search->elementComparisonType == 0) {
3946             nextCEI  = ceb.get(targetIx + targetIxOffset);
3947             maxLimit = nextCEI->lowIndex;
3948             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
3949                 found = FALSE;
3950             }
3951         } else {
3952             for ( ; ; ++targetIxOffset ) {
3953                 nextCEI = ceb.get(targetIx + targetIxOffset);
3954                 maxLimit = nextCEI->lowIndex;
3955                 // If we are at the end of the target too, match succeeds
3956                 if (  nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
3957                     break;
3958                 }
3959                 // As long as the next CE has primary weight of 0,
3960                 // it is part of the last target element matched by the pattern;
3961                 // make sure it can be part of a match with the last patCE
3962                 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
3963                     UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
3964                     if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
3965                         found = FALSE;
3966                         break;
3967                     }
3968                 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
3969                 // target element, but it has non-zero primary weight => match fails
3970                 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
3971                     found = false;
3972                     break;
3973                 // Else the target CE is not part of an expansion of the last matched element, match succeeds
3974                 } else {
3975                     break;
3976                 }
3977             }
3978         }
3979 
3980 
3981         // Check for the start of the match being within a combining sequence.
3982         //   This can happen if the pattern itself begins with a combining char, and
3983         //   the match found combining marks in the target text that were attached
3984         //    to something else.
3985         //   This type of match should be rejected for not completely consuming a
3986         //   combining sequence.
3987         if (!isBreakBoundary(strsrch, mStart)) {
3988             found = FALSE;
3989         }
3990 
3991         // Check for the start of the match being within an Collation Element Expansion,
3992         //   meaning that the first char of the match is only partially matched.
3993         //   With exapnsions, the first CE will report the index of the source
3994         //   character, and all subsequent (expansions) CEs will report the source index of the
3995         //    _following_ character.
3996         int32_t secondIx = firstCEI->highIndex;
3997         if (mStart == secondIx) {
3998             found = FALSE;
3999         }
4000 
4001         //  Advance the match end position to the first acceptable match boundary.
4002         //    This advances the index over any combining charcters.
4003         mLimit = maxLimit;
4004         if (minLimit < maxLimit) {
4005             // When the last CE's low index is same with its high index, the CE is likely
4006             // a part of expansion. In this case, the index is located just after the
4007             // character corresponding to the CEs compared above. If the index is right
4008             // at the break boundary, move the position to the next boundary will result
4009             // incorrect match length when there are ignorable characters exist between
4010             // the position and the next character produces CE(s). See ticket#8482.
4011             if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
4012                 mLimit = minLimit;
4013             } else {
4014                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4015                 if (nba >= lastCEI->highIndex) {
4016                     mLimit = nba;
4017                 }
4018             }
4019         }
4020 
4021     #ifdef USEARCH_DEBUG
4022         if (getenv("USEARCH_DEBUG") != NULL) {
4023             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4024         }
4025     #endif
4026 
4027         // If advancing to the end of a combining sequence in character indexing space
4028         //   advanced us beyond the end of the match in CE space, reject this match.
4029         if (mLimit > maxLimit) {
4030             found = FALSE;
4031         }
4032 
4033         if (!isBreakBoundary(strsrch, mLimit)) {
4034             found = FALSE;
4035         }
4036 
4037         if (! checkIdentical(strsrch, mStart, mLimit)) {
4038             found = FALSE;
4039         }
4040 
4041         if (found) {
4042             break;
4043         }
4044     }
4045 
4046     #ifdef USEARCH_DEBUG
4047     if (getenv("USEARCH_DEBUG") != NULL) {
4048         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4049         int32_t  lastToPrint = ceb.limitIx+2;
4050         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4051             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4052         }
4053         printf("\n%s\n", found? "match found" : "no match");
4054     }
4055     #endif
4056 
4057     // All Done.  Store back the match bounds to the caller.
4058     //
4059     if (found==FALSE) {
4060         mLimit = -1;
4061         mStart = -1;
4062     }
4063 
4064     if (matchStart != NULL) {
4065         *matchStart= mStart;
4066     }
4067 
4068     if (matchLimit != NULL) {
4069         *matchLimit = mLimit;
4070     }
4071 
4072     return found;
4073 }
4074 
usearch_searchBackwards(UStringSearch * strsrch,int32_t startIdx,int32_t * matchStart,int32_t * matchLimit,UErrorCode * status)4075 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
4076                                                 int32_t        startIdx,
4077                                                 int32_t        *matchStart,
4078                                                 int32_t        *matchLimit,
4079                                                 UErrorCode     *status)
4080 {
4081     if (U_FAILURE(*status)) {
4082         return FALSE;
4083     }
4084 
4085     // TODO:  reject search patterns beginning with a combining char.
4086 
4087 #ifdef USEARCH_DEBUG
4088     if (getenv("USEARCH_DEBUG") != NULL) {
4089         printf("Pattern CEs\n");
4090         for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
4091             printf(" %8x", strsrch->pattern.ces[ii]);
4092         }
4093         printf("\n");
4094     }
4095 
4096 #endif
4097     // Input parameter sanity check.
4098     //  TODO:  should input indicies clip to the text length
4099     //         in the same way that UText does.
4100     if(strsrch->pattern.cesLength == 0         ||
4101        startIdx < 0                           ||
4102        startIdx > strsrch->search->textLength ||
4103        strsrch->pattern.ces == NULL) {
4104            *status = U_ILLEGAL_ARGUMENT_ERROR;
4105            return FALSE;
4106     }
4107 
4108     if (strsrch->pattern.pces == NULL) {
4109         initializePatternPCETable(strsrch, status);
4110     }
4111 
4112     CEIBuffer ceb(strsrch, status);
4113     int32_t    targetIx = 0;
4114 
4115     /*
4116      * Pre-load the buffer with the CE's for the grapheme
4117      * after our starting position so that we're sure that
4118      * we can look at the CE following the match when we
4119      * check the match boundaries.
4120      *
4121      * This will also pre-fetch the first CE that we'll
4122      * consider for the match.
4123      */
4124     if (startIdx < strsrch->search->textLength) {
4125         UBreakIterator *bi = strsrch->search->internalBreakIter;
4126         int32_t next = ubrk_following(bi, startIdx);
4127 
4128         ucol_setOffset(strsrch->textIter, next, status);
4129 
4130         for (targetIx = 0; ; targetIx += 1) {
4131             if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
4132                 break;
4133             }
4134         }
4135     } else {
4136         ucol_setOffset(strsrch->textIter, startIdx, status);
4137     }
4138 
4139 
4140     const CEI *targetCEI = NULL;
4141     int32_t    patIx;
4142     UBool      found;
4143 
4144     int32_t  limitIx = targetIx;
4145     int32_t  mStart = -1;
4146     int32_t  mLimit = -1;
4147     int32_t  minLimit;
4148     int32_t  maxLimit;
4149 
4150 
4151 
4152     // Outer loop moves over match starting positions in the
4153     //      target CE space.
4154     // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4155     // But  patIx is 0 at the beginning of the pattern and increases toward the end.
4156     // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4157     // and the beginning of the base text.
4158     for(targetIx = limitIx; ; targetIx += 1)
4159     {
4160         found = TRUE;
4161         // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4162         // (compared to the last CE fetched for the previous targetIx value) as we need to go
4163         // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4164         const CEI *lastCEI  = ceb.getPrevious(targetIx);
4165         if (lastCEI == NULL) {
4166             *status = U_INTERNAL_PROGRAM_ERROR;
4167             found = FALSE;
4168              break;
4169         }
4170         //  Inner loop checks for a match beginning at each
4171         //  position from the outer loop.
4172         int32_t targetIxOffset = 0;
4173         for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) {
4174             int64_t patCE = strsrch->pattern.pces[patIx];
4175 
4176             targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset);
4177             //  Compare CE from target string with CE from the pattern.
4178             //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4179             //    which will fail the compare, below.
4180             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
4181             if ( ceMatch == U_CE_NO_MATCH ) {
4182                 found = FALSE;
4183                 break;
4184             } else if ( ceMatch > U_CE_NO_MATCH ) {
4185                 if ( ceMatch == U_CE_SKIP_TARG ) {
4186                     // redo with same patCE, next targCE
4187                     patIx++;
4188                     targetIxOffset++;
4189                 } else { // ceMatch == U_CE_SKIP_PATN
4190                     // redo with same targCE, next patCE
4191                     targetIxOffset--;
4192                 }
4193             }
4194         }
4195 
4196         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
4197             // No match at this targetIx.  Try again at the next.
4198             continue;
4199         }
4200 
4201         if (!found) {
4202             // No match at all, we have run off the end of the target text.
4203             break;
4204         }
4205 
4206 
4207         // We have found a match in CE space.
4208         // Now determine the bounds in string index space.
4209         //  There still is a chance of match failure if the CE range not correspond to
4210         //     an acceptable character range.
4211         //
4212         const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset);
4213         mStart   = firstCEI->lowIndex;
4214 
4215         // Check for the start of the match being within a combining sequence.
4216         //   This can happen if the pattern itself begins with a combining char, and
4217         //   the match found combining marks in the target text that were attached
4218         //    to something else.
4219         //   This type of match should be rejected for not completely consuming a
4220         //   combining sequence.
4221         if (!isBreakBoundary(strsrch, mStart)) {
4222             found = FALSE;
4223         }
4224 
4225         // Look at the high index of the first CE in the match. If it's the same as the
4226         // low index, the first CE in the match is in the middle of an expansion.
4227         if (mStart == firstCEI->highIndex) {
4228             found = FALSE;
4229         }
4230 
4231 
4232         minLimit = lastCEI->lowIndex;
4233 
4234         if (targetIx > 0) {
4235             // Look at the CE following the match.  If it is UCOL_NULLORDER the match
4236             //   extended to the end of input, and the match is good.
4237 
4238             // Look at the high and low indices of the CE following the match. If
4239             // they are the same it means one of two things:
4240             //    1. The match extended to the last CE from the target text, which is OK, or
4241             //    2. The last CE that was part of the match is in an expansion that extends
4242             //       to the first CE after the match. In this case, we reject the match.
4243             const CEI *nextCEI  = ceb.getPrevious(targetIx - 1);
4244 
4245             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
4246                 found = FALSE;
4247             }
4248 
4249             mLimit = maxLimit = nextCEI->lowIndex;
4250 
4251             //  Advance the match end position to the first acceptable match boundary.
4252             //    This advances the index over any combining charcters.
4253             if (minLimit < maxLimit) {
4254                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4255 
4256                 if (nba >= lastCEI->highIndex) {
4257                     mLimit = nba;
4258                 }
4259             }
4260 
4261             // If advancing to the end of a combining sequence in character indexing space
4262             //   advanced us beyond the end of the match in CE space, reject this match.
4263             if (mLimit > maxLimit) {
4264                 found = FALSE;
4265             }
4266 
4267             // Make sure the end of the match is on a break boundary
4268             if (!isBreakBoundary(strsrch, mLimit)) {
4269                 found = FALSE;
4270             }
4271 
4272         } else {
4273             // No non-ignorable CEs after this point.
4274             // The maximum position is detected by boundary after
4275             // the last non-ignorable CE. Combining sequence
4276             // across the start index will be truncated.
4277             int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4278             mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
4279         }
4280 
4281     #ifdef USEARCH_DEBUG
4282         if (getenv("USEARCH_DEBUG") != NULL) {
4283             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4284         }
4285     #endif
4286 
4287 
4288         if (! checkIdentical(strsrch, mStart, mLimit)) {
4289             found = FALSE;
4290         }
4291 
4292         if (found) {
4293             break;
4294         }
4295     }
4296 
4297     #ifdef USEARCH_DEBUG
4298     if (getenv("USEARCH_DEBUG") != NULL) {
4299         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4300         int32_t  lastToPrint = ceb.limitIx+2;
4301         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4302             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4303         }
4304         printf("\n%s\n", found? "match found" : "no match");
4305     }
4306     #endif
4307 
4308     // All Done.  Store back the match bounds to the caller.
4309     //
4310     if (found==FALSE) {
4311         mLimit = -1;
4312         mStart = -1;
4313     }
4314 
4315     if (matchStart != NULL) {
4316         *matchStart= mStart;
4317     }
4318 
4319     if (matchLimit != NULL) {
4320         *matchLimit = mLimit;
4321     }
4322 
4323     return found;
4324 }
4325 
4326 // internal use methods declared in usrchimp.h -----------------------------
4327 
usearch_handleNextExact(UStringSearch * strsrch,UErrorCode * status)4328 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
4329 {
4330     if (U_FAILURE(*status)) {
4331         setMatchNotFound(strsrch);
4332         return FALSE;
4333     }
4334 
4335 #if BOYER_MOORE
4336     UCollationElements *coleiter        = strsrch->textIter;
4337     int32_t             textlength      = strsrch->search->textLength;
4338     int32_t            *patternce       = strsrch->pattern.ces;
4339     int32_t             patterncelength = strsrch->pattern.cesLength;
4340     int32_t             textoffset      = ucol_getOffset(coleiter);
4341 
4342     // status used in setting coleiter offset, since offset is checked in
4343     // shiftForward before setting the coleiter offset, status never
4344     // a failure
4345     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4346                               patterncelength);
4347     while (textoffset <= textlength)
4348     {
4349         uint32_t    patternceindex = patterncelength - 1;
4350         int32_t     targetce;
4351         UBool       found          = FALSE;
4352         int32_t    lastce          = UCOL_NULLORDER;
4353 
4354         setColEIterOffset(coleiter, textoffset);
4355 
4356         for (;;) {
4357             // finding the last pattern ce match, imagine composite characters
4358             // for example: search for pattern A in text \u00C0
4359             // we'll have to skip \u0300 the grave first before we get to A
4360             targetce = ucol_previous(coleiter, status);
4361             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4362                 found = FALSE;
4363                 break;
4364             }
4365             targetce = getCE(strsrch, targetce);
4366             if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
4367                 // this is for the text \u0315\u0300 that requires
4368                 // normalization and pattern \u0300, where \u0315 is ignorable
4369                 continue;
4370             }
4371             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4372                 lastce = targetce;
4373             }
4374             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4375             if (targetce == patternce[patternceindex]) {
4376                 // the first ce can be a contraction
4377                 found = TRUE;
4378                 break;
4379             }
4380             if (!hasExpansion(coleiter)) {
4381                 found = FALSE;
4382                 break;
4383             }
4384         }
4385 
4386         //targetce = lastce;
4387 
4388         while (found && patternceindex > 0) {
4389             lastce = targetce;
4390             targetce    = ucol_previous(coleiter, status);
4391             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4392                 found = FALSE;
4393                 break;
4394             }
4395             targetce    = getCE(strsrch, targetce);
4396             if (targetce == UCOL_IGNORABLE) {
4397                 continue;
4398             }
4399 
4400             patternceindex --;
4401             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4402             found = found && targetce == patternce[patternceindex];
4403         }
4404 
4405         targetce = lastce;
4406 
4407         if (!found) {
4408             if (U_FAILURE(*status)) {
4409                 break;
4410             }
4411             textoffset = shiftForward(strsrch, textoffset, lastce,
4412                                       patternceindex);
4413             // status checked at loop.
4414             patternceindex = patterncelength;
4415             continue;
4416         }
4417 
4418         if (checkNextExactMatch(strsrch, &textoffset, status)) {
4419             // status checked in ucol_setOffset
4420             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4421             return TRUE;
4422         }
4423     }
4424     setMatchNotFound(strsrch);
4425     return FALSE;
4426 #else
4427     int32_t textOffset = ucol_getOffset(strsrch->textIter);
4428     int32_t start = -1;
4429     int32_t end = -1;
4430 
4431     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4432         strsrch->search->matchedIndex  = start;
4433         strsrch->search->matchedLength = end - start;
4434         return TRUE;
4435     } else {
4436         setMatchNotFound(strsrch);
4437         return FALSE;
4438     }
4439 #endif
4440 }
4441 
usearch_handleNextCanonical(UStringSearch * strsrch,UErrorCode * status)4442 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
4443 {
4444     if (U_FAILURE(*status)) {
4445         setMatchNotFound(strsrch);
4446         return FALSE;
4447     }
4448 
4449 #if BOYER_MOORE
4450     UCollationElements *coleiter        = strsrch->textIter;
4451     int32_t             textlength      = strsrch->search->textLength;
4452     int32_t            *patternce       = strsrch->pattern.ces;
4453     int32_t             patterncelength = strsrch->pattern.cesLength;
4454     int32_t             textoffset      = ucol_getOffset(coleiter);
4455     UBool               hasPatternAccents =
4456        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4457 
4458     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4459                               patterncelength);
4460     strsrch->canonicalPrefixAccents[0] = 0;
4461     strsrch->canonicalSuffixAccents[0] = 0;
4462 
4463     while (textoffset <= textlength)
4464     {
4465         int32_t     patternceindex = patterncelength - 1;
4466         int32_t     targetce;
4467         UBool       found          = FALSE;
4468         int32_t     lastce         = UCOL_NULLORDER;
4469 
4470         setColEIterOffset(coleiter, textoffset);
4471 
4472         for (;;) {
4473             // finding the last pattern ce match, imagine composite characters
4474             // for example: search for pattern A in text \u00C0
4475             // we'll have to skip \u0300 the grave first before we get to A
4476             targetce = ucol_previous(coleiter, status);
4477             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4478                 found = FALSE;
4479                 break;
4480             }
4481             targetce = getCE(strsrch, targetce);
4482             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4483                 lastce = targetce;
4484             }
4485             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4486             if (targetce == patternce[patternceindex]) {
4487                 // the first ce can be a contraction
4488                 found = TRUE;
4489                 break;
4490             }
4491             if (!hasExpansion(coleiter)) {
4492                 found = FALSE;
4493                 break;
4494             }
4495         }
4496 
4497         while (found && patternceindex > 0) {
4498             targetce    = ucol_previous(coleiter, status);
4499             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4500                 found = FALSE;
4501                 break;
4502             }
4503             targetce    = getCE(strsrch, targetce);
4504             if (targetce == UCOL_IGNORABLE) {
4505                 continue;
4506             }
4507 
4508             patternceindex --;
4509             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4510             found = found && targetce == patternce[patternceindex];
4511         }
4512 
4513         // initializing the rearranged accent array
4514         if (hasPatternAccents && !found) {
4515             strsrch->canonicalPrefixAccents[0] = 0;
4516             strsrch->canonicalSuffixAccents[0] = 0;
4517             if (U_FAILURE(*status)) {
4518                 break;
4519             }
4520             found = doNextCanonicalMatch(strsrch, textoffset, status);
4521         }
4522 
4523         if (!found) {
4524             if (U_FAILURE(*status)) {
4525                 break;
4526             }
4527             textoffset = shiftForward(strsrch, textoffset, lastce,
4528                                       patternceindex);
4529             // status checked at loop
4530             patternceindex = patterncelength;
4531             continue;
4532         }
4533 
4534         if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
4535             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4536             return TRUE;
4537         }
4538     }
4539     setMatchNotFound(strsrch);
4540     return FALSE;
4541 #else
4542     int32_t textOffset = ucol_getOffset(strsrch->textIter);
4543     int32_t start = -1;
4544     int32_t end = -1;
4545 
4546     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4547         strsrch->search->matchedIndex  = start;
4548         strsrch->search->matchedLength = end - start;
4549         return TRUE;
4550     } else {
4551         setMatchNotFound(strsrch);
4552         return FALSE;
4553     }
4554 #endif
4555 }
4556 
usearch_handlePreviousExact(UStringSearch * strsrch,UErrorCode * status)4557 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
4558 {
4559     if (U_FAILURE(*status)) {
4560         setMatchNotFound(strsrch);
4561         return FALSE;
4562     }
4563 
4564 #if BOYER_MOORE
4565     UCollationElements *coleiter        = strsrch->textIter;
4566     int32_t            *patternce       = strsrch->pattern.ces;
4567     int32_t             patterncelength = strsrch->pattern.cesLength;
4568     int32_t             textoffset      = ucol_getOffset(coleiter);
4569 
4570     // shifting it check for setting offset
4571     // if setOffset is called previously or there was no previous match, we
4572     // leave the offset as it is.
4573     if (strsrch->search->matchedIndex != USEARCH_DONE) {
4574         textoffset = strsrch->search->matchedIndex;
4575     }
4576 
4577     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4578                               patterncelength);
4579 
4580     while (textoffset >= 0)
4581     {
4582         int32_t     patternceindex = 1;
4583         int32_t     targetce;
4584         UBool       found          = FALSE;
4585         int32_t     firstce        = UCOL_NULLORDER;
4586 
4587         // if status is a failure, ucol_setOffset does nothing
4588         setColEIterOffset(coleiter, textoffset);
4589 
4590         for (;;) {
4591             // finding the first pattern ce match, imagine composite
4592             // characters. for example: search for pattern \u0300 in text
4593             // \u00C0, we'll have to skip A first before we get to
4594             // \u0300 the grave accent
4595             targetce = ucol_next(coleiter, status);
4596             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4597                 found = FALSE;
4598                 break;
4599             }
4600             targetce = getCE(strsrch, targetce);
4601             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4602                 firstce = targetce;
4603             }
4604             if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
4605                 continue;
4606             }
4607             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4608             if (targetce == patternce[0]) {
4609                 found = TRUE;
4610                 break;
4611             }
4612             if (!hasExpansion(coleiter)) {
4613                 // checking for accents in composite character
4614                 found = FALSE;
4615                 break;
4616             }
4617         }
4618 
4619         //targetce = firstce;
4620 
4621         while (found && (patternceindex < patterncelength)) {
4622             firstce = targetce;
4623             targetce    = ucol_next(coleiter, status);
4624             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4625                 found = FALSE;
4626                 break;
4627             }
4628             targetce    = getCE(strsrch, targetce);
4629             if (targetce == UCOL_IGNORABLE) {
4630                 continue;
4631             }
4632 
4633             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4634             found = found && targetce == patternce[patternceindex];
4635             patternceindex ++;
4636         }
4637 
4638         targetce = firstce;
4639 
4640         if (!found) {
4641             if (U_FAILURE(*status)) {
4642                 break;
4643             }
4644 
4645             textoffset = reverseShift(strsrch, textoffset, targetce,
4646                                       patternceindex);
4647             patternceindex = 0;
4648             continue;
4649         }
4650 
4651         if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
4652             setColEIterOffset(coleiter, textoffset);
4653             return TRUE;
4654         }
4655     }
4656     setMatchNotFound(strsrch);
4657     return FALSE;
4658 #else
4659     int32_t textOffset;
4660 
4661     if (strsrch->search->isOverlap) {
4662         if (strsrch->search->matchedIndex != USEARCH_DONE) {
4663             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4664         } else {
4665             // move the start position at the end of possible match
4666             initializePatternPCETable(strsrch, status);
4667             if (!initTextProcessedIter(strsrch, status)) {
4668                 setMatchNotFound(strsrch);
4669                 return FALSE;
4670             }
4671             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4672                 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4673                 if (pce == UCOL_PROCESSED_NULLORDER) {
4674                     // at the end of the text
4675                     break;
4676                 }
4677             }
4678             if (U_FAILURE(*status)) {
4679                 setMatchNotFound(strsrch);
4680                 return FALSE;
4681             }
4682             textOffset = ucol_getOffset(strsrch->textIter);
4683         }
4684     } else {
4685         textOffset = ucol_getOffset(strsrch->textIter);
4686     }
4687 
4688     int32_t start = -1;
4689     int32_t end = -1;
4690 
4691     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4692         strsrch->search->matchedIndex = start;
4693         strsrch->search->matchedLength = end - start;
4694         return TRUE;
4695     } else {
4696         setMatchNotFound(strsrch);
4697         return FALSE;
4698     }
4699 #endif
4700 }
4701 
usearch_handlePreviousCanonical(UStringSearch * strsrch,UErrorCode * status)4702 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
4703                                       UErrorCode    *status)
4704 {
4705     if (U_FAILURE(*status)) {
4706         setMatchNotFound(strsrch);
4707         return FALSE;
4708     }
4709 
4710 #if BOYER_MOORE
4711     UCollationElements *coleiter        = strsrch->textIter;
4712     int32_t            *patternce       = strsrch->pattern.ces;
4713     int32_t             patterncelength = strsrch->pattern.cesLength;
4714     int32_t             textoffset      = ucol_getOffset(coleiter);
4715     UBool               hasPatternAccents =
4716        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4717 
4718     // shifting it check for setting offset
4719     // if setOffset is called previously or there was no previous match, we
4720     // leave the offset as it is.
4721     if (strsrch->search->matchedIndex != USEARCH_DONE) {
4722         textoffset = strsrch->search->matchedIndex;
4723     }
4724 
4725     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4726                               patterncelength);
4727     strsrch->canonicalPrefixAccents[0] = 0;
4728     strsrch->canonicalSuffixAccents[0] = 0;
4729 
4730     while (textoffset >= 0)
4731     {
4732         int32_t     patternceindex = 1;
4733         int32_t     targetce;
4734         UBool       found          = FALSE;
4735         int32_t     firstce        = UCOL_NULLORDER;
4736 
4737         setColEIterOffset(coleiter, textoffset);
4738         for (;;) {
4739             // finding the first pattern ce match, imagine composite
4740             // characters. for example: search for pattern \u0300 in text
4741             // \u00C0, we'll have to skip A first before we get to
4742             // \u0300 the grave accent
4743             targetce = ucol_next(coleiter, status);
4744             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4745                 found = FALSE;
4746                 break;
4747             }
4748             targetce = getCE(strsrch, targetce);
4749             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4750                 firstce = targetce;
4751             }
4752 
4753             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4754             if (targetce == patternce[0]) {
4755                 // the first ce can be a contraction
4756                 found = TRUE;
4757                 break;
4758             }
4759             if (!hasExpansion(coleiter)) {
4760                 // checking for accents in composite character
4761                 found = FALSE;
4762                 break;
4763             }
4764         }
4765 
4766         targetce = firstce;
4767 
4768         while (found && patternceindex < patterncelength) {
4769             targetce    = ucol_next(coleiter, status);
4770             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4771                 found = FALSE;
4772                 break;
4773             }
4774             targetce = getCE(strsrch, targetce);
4775             if (targetce == UCOL_IGNORABLE) {
4776                 continue;
4777             }
4778 
4779             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4780             found = found && targetce == patternce[patternceindex];
4781             patternceindex ++;
4782         }
4783 
4784         // initializing the rearranged accent array
4785         if (hasPatternAccents && !found) {
4786             strsrch->canonicalPrefixAccents[0] = 0;
4787             strsrch->canonicalSuffixAccents[0] = 0;
4788             if (U_FAILURE(*status)) {
4789                 break;
4790             }
4791             found = doPreviousCanonicalMatch(strsrch, textoffset, status);
4792         }
4793 
4794         if (!found) {
4795             if (U_FAILURE(*status)) {
4796                 break;
4797             }
4798             textoffset = reverseShift(strsrch, textoffset, targetce,
4799                                       patternceindex);
4800             patternceindex = 0;
4801             continue;
4802         }
4803 
4804         if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
4805             setColEIterOffset(coleiter, textoffset);
4806             return TRUE;
4807         }
4808     }
4809     setMatchNotFound(strsrch);
4810     return FALSE;
4811 #else
4812     int32_t textOffset;
4813 
4814     if (strsrch->search->isOverlap) {
4815         if (strsrch->search->matchedIndex != USEARCH_DONE) {
4816             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4817         } else {
4818             // move the start position at the end of possible match
4819             initializePatternPCETable(strsrch, status);
4820             if (!initTextProcessedIter(strsrch, status)) {
4821                 setMatchNotFound(strsrch);
4822                 return FALSE;
4823             }
4824             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4825                 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4826                 if (pce == UCOL_PROCESSED_NULLORDER) {
4827                     // at the end of the text
4828                     break;
4829                 }
4830             }
4831             if (U_FAILURE(*status)) {
4832                 setMatchNotFound(strsrch);
4833                 return FALSE;
4834             }
4835             textOffset = ucol_getOffset(strsrch->textIter);
4836         }
4837     } else {
4838         textOffset = ucol_getOffset(strsrch->textIter);
4839     }
4840 
4841     int32_t start = -1;
4842     int32_t end = -1;
4843 
4844     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4845         strsrch->search->matchedIndex = start;
4846         strsrch->search->matchedLength = end - start;
4847         return TRUE;
4848     } else {
4849         setMatchNotFound(strsrch);
4850         return FALSE;
4851     }
4852 #endif
4853 }
4854 
4855 #endif /* #if !UCONFIG_NO_COLLATION */
4856