1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (C) 2001-2015 IBM and others. All rights reserved.
6 **********************************************************************
7 *   Date        Name        Description
8 *  07/02/2001   synwee      Creation.
9 **********************************************************************
10 */
11 
12 #include "unicode/utypes.h"
13 
14 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
15 
16 #include "unicode/usearch.h"
17 #include "unicode/ustring.h"
18 #include "unicode/uchar.h"
19 #include "unicode/utf16.h"
20 #include "normalizer2impl.h"
21 #include "usrchimp.h"
22 #include "cmemory.h"
23 #include "ucln_in.h"
24 #include "uassert.h"
25 #include "ustr_imp.h"
26 
27 U_NAMESPACE_USE
28 
29 // don't use Boyer-Moore
30 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
31 #define BOYER_MOORE 0
32 
33 // internal definition ---------------------------------------------------
34 
35 #define LAST_BYTE_MASK_          0xFF
36 #define SECOND_LAST_BYTE_SHIFT_  8
37 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
38 
39 static const Normalizer2Impl *g_nfcImpl = NULL;
40 
41 // internal methods -------------------------------------------------
42 
43 /**
44 * Fast collation element iterator setOffset.
45 * This function does not check for bounds.
46 * @param coleiter collation element iterator
47 * @param offset to set
48 */
49 static
setColEIterOffset(UCollationElements * elems,int32_t offset)50 inline void setColEIterOffset(UCollationElements *elems,
51                       int32_t             offset)
52 {
53     // Note: Not "fast" any more after the 2013 collation rewrite.
54     // We do not want to expose more internals than necessary.
55     UErrorCode status = U_ZERO_ERROR;
56     ucol_setOffset(elems, offset, &status);
57 }
58 
59 /**
60 * Getting the mask for collation strength
61 * @param strength collation strength
62 * @return collation element mask
63 */
64 static
getMask(UCollationStrength strength)65 inline uint32_t getMask(UCollationStrength strength)
66 {
67     switch (strength)
68     {
69     case UCOL_PRIMARY:
70         return UCOL_PRIMARYORDERMASK;
71     case UCOL_SECONDARY:
72         return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
73     default:
74         return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
75                UCOL_PRIMARYORDERMASK;
76     }
77 }
78 
79 /**
80 * @param ce 32-bit collation element
81 * @return hash code
82 */
83 static
hashFromCE32(uint32_t ce)84 inline int hashFromCE32(uint32_t ce)
85 {
86     int hc = (int)(
87             ((((((ce >> 24) * 37) +
88             (ce >> 16)) * 37) +
89             (ce >> 8)) * 37) +
90             ce);
91     hc %= MAX_TABLE_SIZE_;
92     if (hc < 0) {
93         hc += MAX_TABLE_SIZE_;
94     }
95     return hc;
96 }
97 
98 U_CDECL_BEGIN
99 static UBool U_CALLCONV
usearch_cleanup(void)100 usearch_cleanup(void) {
101     g_nfcImpl = NULL;
102     return TRUE;
103 }
104 U_CDECL_END
105 
106 /**
107 * Initializing the fcd tables.
108 * Internal method, status assumed to be a success.
109 * @param status output error if any, caller to check status before calling
110 *               method, status assumed to be success when passed in.
111 */
112 static
initializeFCD(UErrorCode * status)113 inline void initializeFCD(UErrorCode *status)
114 {
115     if (g_nfcImpl == NULL) {
116         g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
117         ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
118     }
119 }
120 
121 /**
122 * Gets the fcd value for a character at the argument index.
123 * This method takes into accounts of the supplementary characters.
124 * @param str UTF16 string where character for fcd retrieval resides
125 * @param offset position of the character whose fcd is to be retrieved, to be
126 *               overwritten with the next character position, taking
127 *               surrogate characters into consideration.
128 * @param strlength length of the argument string
129 * @return fcd value
130 */
131 static
getFCD(const UChar * str,int32_t * offset,int32_t strlength)132 uint16_t getFCD(const UChar   *str, int32_t *offset,
133                              int32_t  strlength)
134 {
135     const UChar *temp = str + *offset;
136     uint16_t    result = g_nfcImpl->nextFCD16(temp, str + strlength);
137     *offset = (int32_t)(temp - str);
138     return result;
139 }
140 
141 /**
142 * Getting the modified collation elements taking into account the collation
143 * attributes
144 * @param strsrch string search data
145 * @param sourcece
146 * @return the modified collation element
147 */
148 static
getCE(const UStringSearch * strsrch,uint32_t sourcece)149 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
150 {
151     // note for tertiary we can't use the collator->tertiaryMask, that
152     // is a preprocessed mask that takes into account case options. since
153     // we are only concerned with exact matches, we don't need that.
154     sourcece &= strsrch->ceMask;
155 
156     if (strsrch->toShift) {
157         // alternate handling here, since only the 16 most significant digits
158         // is only used, we can safely do a compare without masking
159         // if the ce is a variable, we mask and get only the primary values
160         // no shifting to quartenary is required since all primary values
161         // less than variabletop will need to be masked off anyway.
162         if (strsrch->variableTop > sourcece) {
163             if (strsrch->strength >= UCOL_QUATERNARY) {
164                 sourcece &= UCOL_PRIMARYORDERMASK;
165             }
166             else {
167                 sourcece = UCOL_IGNORABLE;
168             }
169         }
170     } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
171         sourcece = 0xFFFF;
172     }
173 
174     return sourcece;
175 }
176 
177 /**
178 * Allocate a memory and returns NULL if it failed.
179 * Internal method, status assumed to be a success.
180 * @param size to allocate
181 * @param status output error if any, caller to check status before calling
182 *               method, status assumed to be success when passed in.
183 * @return newly allocated array, NULL otherwise
184 */
185 static
allocateMemory(uint32_t size,UErrorCode * status)186 inline void * allocateMemory(uint32_t size, UErrorCode *status)
187 {
188     uint32_t *result = (uint32_t *)uprv_malloc(size);
189     if (result == NULL) {
190         *status = U_MEMORY_ALLOCATION_ERROR;
191     }
192     return result;
193 }
194 
195 /**
196 * Adds a uint32_t value to a destination array.
197 * Creates a new array if we run out of space. The caller will have to
198 * manually deallocate the newly allocated array.
199 * Internal method, status assumed to be success, caller has to check status
200 * before calling this method. destination not to be NULL and has at least
201 * size destinationlength.
202 * @param destination target array
203 * @param offset destination offset to add value
204 * @param destinationlength target array size, return value for the new size
205 * @param value to be added
206 * @param increments incremental size expected
207 * @param status output error if any, caller to check status before calling
208 *               method, status assumed to be success when passed in.
209 * @return new destination array, destination if there was no new allocation
210 */
211 static
addTouint32_tArray(int32_t * destination,uint32_t offset,uint32_t * destinationlength,uint32_t value,uint32_t increments,UErrorCode * status)212 inline int32_t * addTouint32_tArray(int32_t    *destination,
213                                     uint32_t    offset,
214                                     uint32_t   *destinationlength,
215                                     uint32_t    value,
216                                     uint32_t    increments,
217                                     UErrorCode *status)
218 {
219     uint32_t newlength = *destinationlength;
220     if (offset + 1 == newlength) {
221         newlength += increments;
222         int32_t *temp = (int32_t *)allocateMemory(
223                                          sizeof(int32_t) * newlength, status);
224         if (U_FAILURE(*status)) {
225             return NULL;
226         }
227         uprv_memcpy(temp, destination, sizeof(int32_t) * (size_t)offset);
228         *destinationlength = newlength;
229         destination        = temp;
230     }
231     destination[offset] = value;
232     return destination;
233 }
234 
235 /**
236 * Adds a uint64_t value to a destination array.
237 * Creates a new array if we run out of space. The caller will have to
238 * manually deallocate the newly allocated array.
239 * Internal method, status assumed to be success, caller has to check status
240 * before calling this method. destination not to be NULL and has at least
241 * size destinationlength.
242 * @param destination target array
243 * @param offset destination offset to add value
244 * @param destinationlength target array size, return value for the new size
245 * @param value to be added
246 * @param increments incremental size expected
247 * @param status output error if any, caller to check status before calling
248 *               method, status assumed to be success when passed in.
249 * @return new destination array, destination if there was no new allocation
250 */
251 static
addTouint64_tArray(int64_t * destination,uint32_t offset,uint32_t * destinationlength,uint64_t value,uint32_t increments,UErrorCode * status)252 inline int64_t * addTouint64_tArray(int64_t    *destination,
253                                     uint32_t    offset,
254                                     uint32_t   *destinationlength,
255                                     uint64_t    value,
256                                     uint32_t    increments,
257                                     UErrorCode *status)
258 {
259     uint32_t newlength = *destinationlength;
260     if (offset + 1 == newlength) {
261         newlength += increments;
262         int64_t *temp = (int64_t *)allocateMemory(
263                                          sizeof(int64_t) * newlength, status);
264 
265         if (U_FAILURE(*status)) {
266             return NULL;
267         }
268 
269         uprv_memcpy(temp, destination, sizeof(int64_t) * (size_t)offset);
270         *destinationlength = newlength;
271         destination        = temp;
272     }
273 
274     destination[offset] = value;
275 
276     return destination;
277 }
278 
279 /**
280 * Initializing the ce table for a pattern.
281 * Stores non-ignorable collation keys.
282 * Table size will be estimated by the size of the pattern text. Table
283 * expansion will be perform as we go along. Adding 1 to ensure that the table
284 * size definitely increases.
285 * Internal method, status assumed to be a success.
286 * @param strsrch string search data
287 * @param status output error if any, caller to check status before calling
288 *               method, status assumed to be success when passed in.
289 * @return total number of expansions
290 */
291 static
initializePatternCETable(UStringSearch * strsrch,UErrorCode * status)292 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
293                                          UErrorCode    *status)
294 {
295     UPattern *pattern            = &(strsrch->pattern);
296     uint32_t  cetablesize        = INITIAL_ARRAY_SIZE_;
297     int32_t  *cetable            = pattern->cesBuffer;
298     uint32_t  patternlength      = pattern->textLength;
299     UCollationElements *coleiter = strsrch->utilIter;
300 
301     if (coleiter == NULL) {
302         coleiter = ucol_openElements(strsrch->collator, pattern->text,
303                                      patternlength, status);
304         // status will be checked in ucol_next(..) later and if it is an
305         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
306         // returned.
307         strsrch->utilIter = coleiter;
308     }
309     else {
310         ucol_setText(coleiter, pattern->text, pattern->textLength, status);
311     }
312     if(U_FAILURE(*status)) {
313         return 0;
314     }
315 
316     if (pattern->ces != cetable && pattern->ces) {
317         uprv_free(pattern->ces);
318     }
319 
320     uint16_t  offset      = 0;
321     uint16_t  result      = 0;
322     int32_t   ce;
323 
324     while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
325            U_SUCCESS(*status)) {
326         uint32_t newce = getCE(strsrch, ce);
327         if (newce) {
328             int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
329                                   newce,
330                                   patternlength - ucol_getOffset(coleiter) + 1,
331                                   status);
332             if (U_FAILURE(*status)) {
333                 return 0;
334             }
335             offset ++;
336             if (cetable != temp && cetable != pattern->cesBuffer) {
337                 uprv_free(cetable);
338             }
339             cetable = temp;
340         }
341         result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
342     }
343 
344     cetable[offset]   = 0;
345     pattern->ces       = cetable;
346     pattern->cesLength = offset;
347 
348     return result;
349 }
350 
351 /**
352 * Initializing the pce table for a pattern.
353 * Stores non-ignorable collation keys.
354 * Table size will be estimated by the size of the pattern text. Table
355 * expansion will be perform as we go along. Adding 1 to ensure that the table
356 * size definitely increases.
357 * Internal method, status assumed to be a success.
358 * @param strsrch string search data
359 * @param status output error if any, caller to check status before calling
360 *               method, status assumed to be success when passed in.
361 * @return total number of expansions
362 */
363 static
initializePatternPCETable(UStringSearch * strsrch,UErrorCode * status)364 inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
365                                           UErrorCode    *status)
366 {
367     UPattern *pattern            = &(strsrch->pattern);
368     uint32_t  pcetablesize       = INITIAL_ARRAY_SIZE_;
369     int64_t  *pcetable           = pattern->pcesBuffer;
370     uint32_t  patternlength      = pattern->textLength;
371     UCollationElements *coleiter = strsrch->utilIter;
372 
373     if (coleiter == NULL) {
374         coleiter = ucol_openElements(strsrch->collator, pattern->text,
375                                      patternlength, status);
376         // status will be checked in ucol_next(..) later and if it is an
377         // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
378         // returned.
379         strsrch->utilIter = coleiter;
380     } else {
381         ucol_setText(coleiter, pattern->text, pattern->textLength, status);
382     }
383     if(U_FAILURE(*status)) {
384         return 0;
385     }
386 
387     if (pattern->pces != pcetable && pattern->pces != NULL) {
388         uprv_free(pattern->pces);
389     }
390 
391     uint16_t  offset = 0;
392     uint16_t  result = 0;
393     int64_t   pce;
394 
395     icu::UCollationPCE iter(coleiter);
396 
397     // ** Should processed CEs be signed or unsigned?
398     // ** (the rest of the code in this file seems to play fast-and-loose with
399     // **  whether a CE is signed or unsigned. For example, look at routine above this one.)
400     while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
401            U_SUCCESS(*status)) {
402         int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
403                               pce,
404                               patternlength - ucol_getOffset(coleiter) + 1,
405                               status);
406 
407         if (U_FAILURE(*status)) {
408             return 0;
409         }
410 
411         offset += 1;
412 
413         if (pcetable != temp && pcetable != pattern->pcesBuffer) {
414             uprv_free(pcetable);
415         }
416 
417         pcetable = temp;
418         //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
419     }
420 
421     pcetable[offset]   = 0;
422     pattern->pces       = pcetable;
423     pattern->pcesLength = offset;
424 
425     return result;
426 }
427 
428 /**
429 * Initializes the pattern struct.
430 * Internal method, status assumed to be success.
431 * @param strsrch UStringSearch data storage
432 * @param status output error if any, caller to check status before calling
433 *               method, status assumed to be success when passed in.
434 * @return expansionsize the total expansion size of the pattern
435 */
436 static
initializePattern(UStringSearch * strsrch,UErrorCode * status)437 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
438 {
439     if (U_FAILURE(*status)) { return 0; }
440           UPattern   *pattern     = &(strsrch->pattern);
441     const UChar      *patterntext = pattern->text;
442           int32_t     length      = pattern->textLength;
443           int32_t index       = 0;
444 
445     // Since the strength is primary, accents are ignored in the pattern.
446     if (strsrch->strength == UCOL_PRIMARY) {
447         pattern->hasPrefixAccents = 0;
448         pattern->hasSuffixAccents = 0;
449     } else {
450         pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
451                                                          SECOND_LAST_BYTE_SHIFT_;
452         index = length;
453         U16_BACK_1(patterntext, 0, index);
454         pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
455                                                                  LAST_BYTE_MASK_;
456     }
457 
458     // ** HACK **
459     if (strsrch->pattern.pces != NULL) {
460         if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
461             uprv_free(strsrch->pattern.pces);
462         }
463 
464         strsrch->pattern.pces = NULL;
465     }
466 
467     // since intializePattern is an internal method status is a success.
468     return initializePatternCETable(strsrch, status);
469 }
470 
471 /**
472 * Initializing shift tables, with the default values.
473 * If a corresponding default value is 0, the shift table is not set.
474 * @param shift table for forwards shift
475 * @param backshift table for backwards shift
476 * @param cetable table containing pattern ce
477 * @param cesize size of the pattern ces
478 * @param expansionsize total size of the expansions
479 * @param defaultforward the default forward value
480 * @param defaultbackward the default backward value
481 */
482 static
setShiftTable(int16_t shift[],int16_t backshift[],int32_t * cetable,int32_t cesize,int16_t expansionsize,int16_t defaultforward,int16_t defaultbackward)483 inline void setShiftTable(int16_t   shift[], int16_t backshift[],
484                           int32_t  *cetable, int32_t cesize,
485                           int16_t   expansionsize,
486                           int16_t   defaultforward,
487                           int16_t   defaultbackward)
488 {
489     // estimate the value to shift. to do that we estimate the smallest
490     // number of characters to give the relevant ces, ie approximately
491     // the number of ces minus their expansion, since expansions can come
492     // from a character.
493     int32_t count;
494     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
495         shift[count] = defaultforward;
496     }
497     cesize --; // down to the last index
498     for (count = 0; count < cesize; count ++) {
499         // number of ces from right of array to the count
500         int temp = defaultforward - count - 1;
501         shift[hashFromCE32(cetable[count])] = temp > 1 ? static_cast<int16_t>(temp) : 1;
502     }
503     shift[hashFromCE32(cetable[cesize])] = 1;
504     // for ignorables we just shift by one. see test examples.
505     shift[hashFromCE32(0)] = 1;
506 
507     for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
508         backshift[count] = defaultbackward;
509     }
510     for (count = cesize; count > 0; count --) {
511         // the original value count does not seem to work
512         backshift[hashFromCE32(cetable[count])] = count > expansionsize ?
513                                           (int16_t)(count - expansionsize) : 1;
514     }
515     backshift[hashFromCE32(cetable[0])] = 1;
516     backshift[hashFromCE32(0)] = 1;
517 }
518 
519 /**
520 * Building of the pattern collation element list and the boyer moore strsrch
521 * table.
522 * The canonical match will only be performed after the default match fails.
523 * For both cases we need to remember the size of the composed and decomposed
524 * versions of the string. Since the Boyer-Moore shift calculations shifts by
525 * a number of characters in the text and tries to match the pattern from that
526 * offset, the shift value can not be too large in case we miss some
527 * characters. To choose a right shift size, we estimate the NFC form of the
528 * and use its size as a shift guide. The NFC form should be the small
529 * possible representation of the pattern. Anyways, we'll err on the smaller
530 * shift size. Hence the calculation for minlength.
531 * Canonical match will be performed slightly differently. We'll split the
532 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
533 * the first and last base character (MS), the ending accents (EA). Matches
534 * will be done on MS first, and only when we match MS then some processing
535 * will be required for the prefix and end accents in order to determine if
536 * they match PA and EA. Hence the default shift values
537 * for the canonical match will take the size of either end's accent into
538 * consideration. Forwards search will take the end accents into consideration
539 * for the default shift values and the backwards search will take the prefix
540 * accents into consideration.
541 * If pattern has no non-ignorable ce, we return a illegal argument error.
542 * Internal method, status assumed to be success.
543 * @param strsrch UStringSearch data storage
544 * @param status  for output errors if it occurs, status is assumed to be a
545 *                success when it is passed in.
546 */
547 static
initialize(UStringSearch * strsrch,UErrorCode * status)548 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
549 {
550     int16_t expandlength  = initializePattern(strsrch, status);
551     if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) {
552         UPattern *pattern = &strsrch->pattern;
553         int32_t   cesize  = pattern->cesLength;
554 
555         int16_t minlength = cesize > expandlength
556                             ? (int16_t)cesize - expandlength : 1;
557         pattern->defaultShiftSize    = minlength;
558         setShiftTable(pattern->shift, pattern->backShift, pattern->ces,
559                       cesize, expandlength, minlength, minlength);
560         return;
561     }
562     strsrch->pattern.defaultShiftSize = 0;
563 }
564 
565 #if BOYER_MOORE
566 /**
567 * Check to make sure that the match length is at the end of the character by
568 * using the breakiterator.
569 * @param strsrch string search data
570 * @param start target text start offset
571 * @param end target text end offset
572 */
573 static
checkBreakBoundary(const UStringSearch * strsrch,int32_t *,int32_t * end)574 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
575                                int32_t *end)
576 {
577 #if !UCONFIG_NO_BREAK_ITERATION
578     UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
579     if (breakiterator) {
580         int32_t matchend = *end;
581         //int32_t matchstart = *start;
582 
583         if (!ubrk_isBoundary(breakiterator, matchend)) {
584             *end = ubrk_following(breakiterator, matchend);
585         }
586 
587         /* Check the start of the matched text to make sure it doesn't have any accents
588          * before it.  This code may not be necessary and so it is commented out */
589         /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
590             *start = ubrk_preceding(breakiterator, matchstart);
591         }*/
592     }
593 #endif
594 }
595 
596 /**
597 * Determine whether the target text in UStringSearch bounded by the offset
598 * start and end is one or more whole units of text as
599 * determined by the breakiterator in UStringSearch.
600 * @param strsrch string search data
601 * @param start target text start offset
602 * @param end target text end offset
603 */
604 static
isBreakUnit(const UStringSearch * strsrch,int32_t start,int32_t end)605 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
606                                int32_t    end)
607 {
608 #if !UCONFIG_NO_BREAK_ITERATION
609     UBreakIterator *breakiterator = strsrch->search->breakIter;
610     //TODO: Add here.
611     if (breakiterator) {
612         int32_t startindex = ubrk_first(breakiterator);
613         int32_t endindex   = ubrk_last(breakiterator);
614 
615         // out-of-range indexes are never boundary positions
616         if (start < startindex || start > endindex ||
617             end < startindex || end > endindex) {
618             return FALSE;
619         }
620         // otherwise, we can use following() on the position before the
621         // specified one and return true of the position we get back is the
622         // one the user specified
623         UBool result = (start == startindex ||
624                 ubrk_following(breakiterator, start - 1) == start) &&
625                (end == endindex ||
626                 ubrk_following(breakiterator, end - 1) == end);
627         if (result) {
628             // iterates the individual ces
629                   UCollationElements *coleiter  = strsrch->utilIter;
630             const UChar              *text      = strsrch->search->text +
631                                                                       start;
632                   UErrorCode          status    = U_ZERO_ERROR;
633             ucol_setText(coleiter, text, end - start, &status);
634             for (int32_t count = 0; count < strsrch->pattern.cesLength;
635                  count ++) {
636                 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
637                 if (ce == UCOL_IGNORABLE) {
638                     count --;
639                     continue;
640                 }
641                 if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) {
642                     return FALSE;
643                 }
644             }
645             int32_t nextce = ucol_next(coleiter, &status);
646             while (ucol_getOffset(coleiter) == (end - start)
647                    && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
648                 nextce = ucol_next(coleiter, &status);
649             }
650             if (ucol_getOffset(coleiter) == (end - start)
651                 && nextce != UCOL_NULLORDER) {
652                 // extra collation elements at the end of the match
653                 return FALSE;
654             }
655         }
656         return result;
657     }
658 #endif
659     return TRUE;
660 }
661 
662 /**
663 * Getting the next base character offset if current offset is an accent,
664 * or the current offset if the current character contains a base character.
665 * accents the following base character will be returned
666 * @param text string
667 * @param textoffset current offset
668 * @param textlength length of text string
669 * @return the next base character or the current offset
670 *         if the current character is contains a base character.
671 */
672 static
getNextBaseOffset(const UChar * text,int32_t textoffset,int32_t textlength)673 inline int32_t getNextBaseOffset(const UChar       *text,
674                                            int32_t  textoffset,
675                                            int32_t      textlength)
676 {
677     if (textoffset < textlength) {
678         int32_t temp = textoffset;
679         if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
680             while (temp < textlength) {
681                 int32_t result = temp;
682                 if ((getFCD(text, &temp, textlength) >>
683                      SECOND_LAST_BYTE_SHIFT_) == 0) {
684                     return result;
685                 }
686             }
687             return textlength;
688         }
689     }
690     return textoffset;
691 }
692 
693 /**
694 * Gets the next base character offset depending on the string search pattern
695 * data
696 * @param strsrch string search data
697 * @param textoffset current offset, one offset away from the last character
698 *                   to search for.
699 * @return start index of the next base character or the current offset
700 *         if the current character is contains a base character.
701 */
702 static
getNextUStringSearchBaseOffset(UStringSearch * strsrch,int32_t textoffset)703 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
704                                                   int32_t    textoffset)
705 {
706     int32_t textlength = strsrch->search->textLength;
707     if (strsrch->pattern.hasSuffixAccents &&
708         textoffset < textlength) {
709               int32_t  temp       = textoffset;
710         const UChar       *text       = strsrch->search->text;
711         U16_BACK_1(text, 0, temp);
712         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
713             return getNextBaseOffset(text, textoffset, textlength);
714         }
715     }
716     return textoffset;
717 }
718 
719 /**
720 * Shifting the collation element iterator position forward to prepare for
721 * a following match. If the last character is a unsafe character, we'll only
722 * shift by 1 to capture contractions, normalization etc.
723 * Internal method, status assumed to be success.
724 * @param text strsrch string search data
725 * @param textoffset start text position to do search
726 * @param ce the text ce which failed the match.
727 * @param patternceindex index of the ce within the pattern ce buffer which
728 *        failed the match
729 * @return final offset
730 */
731 static
shiftForward(UStringSearch * strsrch,int32_t textoffset,int32_t ce,int32_t patternceindex)732 inline int32_t shiftForward(UStringSearch *strsrch,
733                                 int32_t    textoffset,
734                                 int32_t       ce,
735                                 int32_t        patternceindex)
736 {
737     UPattern *pattern = &(strsrch->pattern);
738     if (ce != UCOL_NULLORDER) {
739         int32_t shift = pattern->shift[hashFromCE32(ce)];
740         // this is to adjust for characters in the middle of the
741         // substring for matching that failed.
742         int32_t adjust = pattern->cesLength - patternceindex;
743         if (adjust > 1 && shift >= adjust) {
744             shift -= adjust - 1;
745         }
746         textoffset += shift;
747     }
748     else {
749         textoffset += pattern->defaultShiftSize;
750     }
751 
752     textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
753     // check for unsafe characters
754     // * if it is the start or middle of a contraction: to be done after
755     //   a initial match is found
756     // * thai or lao base consonant character: similar to contraction
757     // * high surrogate character: similar to contraction
758     // * next character is a accent: shift to the next base character
759     return textoffset;
760 }
761 #endif // #if BOYER_MOORE
762 
763 /**
764 * sets match not found
765 * @param strsrch string search data
766 */
767 static
setMatchNotFound(UStringSearch * strsrch)768 inline void setMatchNotFound(UStringSearch *strsrch)
769 {
770     // this method resets the match result regardless of the error status.
771     strsrch->search->matchedIndex = USEARCH_DONE;
772     strsrch->search->matchedLength = 0;
773     if (strsrch->search->isForwardSearching) {
774         setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
775     }
776     else {
777         setColEIterOffset(strsrch->textIter, 0);
778     }
779 }
780 
781 #if BOYER_MOORE
782 /**
783 * Gets the offset to the next safe point in text.
784 * ie. not the middle of a contraction, swappable characters or supplementary
785 * characters.
786 * @param collator collation sata
787 * @param text string to work with
788 * @param textoffset offset in string
789 * @param textlength length of text string
790 * @return offset to the next safe character
791 */
792 static
getNextSafeOffset(const UCollator * collator,const UChar * text,int32_t textoffset,int32_t textlength)793 inline int32_t getNextSafeOffset(const UCollator   *collator,
794                                      const UChar       *text,
795                                            int32_t  textoffset,
796                                            int32_t      textlength)
797 {
798     int32_t result = textoffset; // first contraction character
799     while (result != textlength && ucol_unsafeCP(text[result], collator)) {
800         result ++;
801     }
802     return result;
803 }
804 
805 /**
806 * This checks for accents in the potential match started with a .
807 * composite character.
808 * This is really painful... we have to check that composite character do not
809 * have any extra accents. We have to normalize the potential match and find
810 * the immediate decomposed character before the match.
811 * The first composite character would have been taken care of by the fcd
812 * checks in checkForwardExactMatch.
813 * This is the slow path after the fcd of the first character and
814 * the last character has been checked by checkForwardExactMatch and we
815 * determine that the potential match has extra non-ignorable preceding
816 * ces.
817 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
818 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
819 * Note here that accents checking are slow and cautioned in the API docs.
820 * Internal method, status assumed to be a success, caller should check status
821 * before calling this method
822 * @param strsrch string search data
823 * @param start index of the potential unfriendly composite character
824 * @param end index of the potential unfriendly composite character
825 * @param status output error status if any.
826 * @return TRUE if there is non-ignorable accents before at the beginning
827 *              of the match, FALSE otherwise.
828 */
829 
830 static
checkExtraMatchAccents(const UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)831 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
832                                    int32_t    end,
833                                    UErrorCode    *status)
834 {
835     UBool result = FALSE;
836     if (strsrch->pattern.hasPrefixAccents) {
837               int32_t  length = end - start;
838               int32_t  offset = 0;
839         const UChar       *text   = strsrch->search->text + start;
840 
841         U16_FWD_1(text, offset, length);
842         // we are only concerned with the first composite character
843         if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
844             int32_t safeoffset = getNextSafeOffset(strsrch->collator,
845                                                        text, 0, length);
846             if (safeoffset != length) {
847                 safeoffset ++;
848             }
849             UChar   *norm = NULL;
850             UChar    buffer[INITIAL_ARRAY_SIZE_];
851             int32_t  size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
852                                             buffer, INITIAL_ARRAY_SIZE_,
853                                             status);
854             if (U_FAILURE(*status)) {
855                 return FALSE;
856             }
857             if (size >= INITIAL_ARRAY_SIZE_) {
858                 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
859                                                status);
860                 // if allocation failed, status will be set to
861                 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
862                 // checks for it.
863                 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
864                                        size, status);
865                 if (U_FAILURE(*status) && norm != NULL) {
866                     uprv_free(norm);
867                     return FALSE;
868                 }
869             }
870             else {
871                 norm = buffer;
872             }
873 
874             UCollationElements *coleiter  = strsrch->utilIter;
875             ucol_setText(coleiter, norm, size, status);
876             uint32_t            firstce   = strsrch->pattern.ces[0];
877             UBool               ignorable = TRUE;
878             uint32_t            ce        = UCOL_IGNORABLE;
879             while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
880                 offset = ucol_getOffset(coleiter);
881                 if (ce != firstce && ce != UCOL_IGNORABLE) {
882                     ignorable = FALSE;
883                 }
884                 ce = ucol_next(coleiter, status);
885             }
886             UChar32 codepoint;
887             U16_PREV(norm, 0, offset, codepoint);
888             result = !ignorable && (u_getCombiningClass(codepoint) != 0);
889 
890             if (norm != buffer) {
891                 uprv_free(norm);
892             }
893         }
894     }
895 
896     return result;
897 }
898 
899 /**
900 * Used by exact matches, checks if there are accents before the match.
901 * This is really painful... we have to check that composite characters at
902 * the start of the matches have to not have any extra accents.
903 * We check the FCD of the character first, if it starts with an accent and
904 * the first pattern ce does not match the first ce of the character, we bail.
905 * Otherwise we try normalizing the first composite
906 * character and find the immediate decomposed character before the match to
907 * see if it is an non-ignorable accent.
908 * Now normalizing the first composite character is enough because we ensure
909 * that when the match is passed in here with extra beginning ces, the
910 * first or last ce that match has to occur within the first character.
911 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
912 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
913 * Note here that accents checking are slow and cautioned in the API docs.
914 * @param strsrch string search data
915 * @param start offset
916 * @param end offset
917 * @return TRUE if there are accents on either side of the match,
918 *         FALSE otherwise
919 */
920 static
hasAccentsBeforeMatch(const UStringSearch * strsrch,int32_t start,int32_t end)921 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
922                                   int32_t    end)
923 {
924     if (strsrch->pattern.hasPrefixAccents) {
925         UCollationElements *coleiter  = strsrch->textIter;
926         UErrorCode          status    = U_ZERO_ERROR;
927         // we have been iterating forwards previously
928         uint32_t            ignorable = TRUE;
929         int32_t             firstce   = strsrch->pattern.ces[0];
930 
931         setColEIterOffset(coleiter, start);
932         int32_t ce  = getCE(strsrch, ucol_next(coleiter, &status));
933         if (U_FAILURE(status)) {
934             return TRUE;
935         }
936         while (ce != firstce) {
937             if (ce != UCOL_IGNORABLE) {
938                 ignorable = FALSE;
939             }
940             ce = getCE(strsrch, ucol_next(coleiter, &status));
941             if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
942                 return TRUE;
943             }
944         }
945         if (!ignorable && inNormBuf(coleiter)) {
946             // within normalization buffer, discontiguous handled here
947             return TRUE;
948         }
949 
950         // within text
951         int32_t temp = start;
952         // original code
953         // accent = (getFCD(strsrch->search->text, &temp,
954         //                  strsrch->search->textLength)
955         //            >> SECOND_LAST_BYTE_SHIFT_);
956         // however this code does not work well with VC7 .net in release mode.
957         // maybe the inlines for getFCD combined with shifting has bugs in
958         // VC7. anyways this is a work around.
959         UBool accent = getFCD(strsrch->search->text, &temp,
960                               strsrch->search->textLength) > 0xFF;
961         if (!accent) {
962             return checkExtraMatchAccents(strsrch, start, end, &status);
963         }
964         if (!ignorable) {
965             return TRUE;
966         }
967         if (start > 0) {
968             temp = start;
969             U16_BACK_1(strsrch->search->text, 0, temp);
970             if (getFCD(strsrch->search->text, &temp,
971                        strsrch->search->textLength) & LAST_BYTE_MASK_) {
972                 setColEIterOffset(coleiter, start);
973                 ce = ucol_previous(coleiter, &status);
974                 if (U_FAILURE(status) ||
975                     (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
976                     return TRUE;
977                 }
978             }
979         }
980     }
981 
982     return FALSE;
983 }
984 
985 /**
986 * Used by exact matches, checks if there are accents bounding the match.
987 * Note this is the initial boundary check. If the potential match
988 * starts or ends with composite characters, the accents in those
989 * characters will be determined later.
990 * Not doing backwards iteration here, since discontiguos contraction for
991 * backwards collation element iterator, use up too many characters.
992 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
993 * should fail since there is a acute at the end of \u01FA
994 * Note here that accents checking are slow and cautioned in the API docs.
995 * @param strsrch string search data
996 * @param start offset of match
997 * @param end end offset of the match
998 * @return TRUE if there are accents on either side of the match,
999 *         FALSE otherwise
1000 */
1001 static
hasAccentsAfterMatch(const UStringSearch * strsrch,int32_t start,int32_t end)1002 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
1003                                  int32_t    end)
1004 {
1005     if (strsrch->pattern.hasSuffixAccents) {
1006         const UChar       *text       = strsrch->search->text;
1007               int32_t  temp       = end;
1008               int32_t      textlength = strsrch->search->textLength;
1009         U16_BACK_1(text, 0, temp);
1010         if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
1011             int32_t             firstce  = strsrch->pattern.ces[0];
1012             UCollationElements *coleiter = strsrch->textIter;
1013             UErrorCode          status   = U_ZERO_ERROR;
1014             int32_t ce;
1015             setColEIterOffset(coleiter, start);
1016             while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
1017                 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
1018                     return TRUE;
1019                 }
1020             }
1021             int32_t count = 1;
1022             while (count < strsrch->pattern.cesLength) {
1023                 if (getCE(strsrch, ucol_next(coleiter, &status))
1024                     == UCOL_IGNORABLE) {
1025                     // Thai can give an ignorable here.
1026                     count --;
1027                 }
1028                 if (U_FAILURE(status)) {
1029                     return TRUE;
1030                 }
1031                 count ++;
1032             }
1033 
1034             ce = ucol_next(coleiter, &status);
1035             if (U_FAILURE(status)) {
1036                 return TRUE;
1037             }
1038             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1039                 ce = getCE(strsrch, ce);
1040             }
1041             if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1042                 if (ucol_getOffset(coleiter) <= end) {
1043                     return TRUE;
1044                 }
1045                 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
1046                     return TRUE;
1047                 }
1048             }
1049         }
1050     }
1051     return FALSE;
1052 }
1053 #endif // #if BOYER_MOORE
1054 
1055 /**
1056 * Checks if the offset runs out of the text string
1057 * @param offset
1058 * @param textlength of the text string
1059 * @return TRUE if offset is out of bounds, FALSE otherwise
1060 */
1061 static
isOutOfBounds(int32_t textlength,int32_t offset)1062 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
1063 {
1064     return offset < 0 || offset > textlength;
1065 }
1066 
1067 /**
1068 * Checks for identical match
1069 * @param strsrch string search data
1070 * @param start offset of possible match
1071 * @param end offset of possible match
1072 * @return TRUE if identical match is found
1073 */
1074 static
checkIdentical(const UStringSearch * strsrch,int32_t start,int32_t end)1075 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
1076                                   int32_t    end)
1077 {
1078     if (strsrch->strength != UCOL_IDENTICAL) {
1079         return TRUE;
1080     }
1081 
1082     // Note: We could use Normalizer::compare() or similar, but for short strings
1083     // which may not be in FCD it might be faster to just NFD them.
1084     UErrorCode status = U_ZERO_ERROR;
1085     UnicodeString t2, p2;
1086     strsrch->nfd->normalize(
1087         UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
1088     strsrch->nfd->normalize(
1089         UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
1090     // return FALSE if NFD failed
1091     return U_SUCCESS(status) && t2 == p2;
1092 }
1093 
1094 #if BOYER_MOORE
1095 /**
1096 * Checks to see if the match is repeated
1097 * @param strsrch string search data
1098 * @param start new match start index
1099 * @param end new match end index
1100 * @return TRUE if the the match is repeated, FALSE otherwise
1101 */
1102 static
checkRepeatedMatch(UStringSearch * strsrch,int32_t start,int32_t end)1103 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
1104                                 int32_t    start,
1105                                 int32_t    end)
1106 {
1107     int32_t lastmatchindex = strsrch->search->matchedIndex;
1108     UBool       result;
1109     if (lastmatchindex == USEARCH_DONE) {
1110         return FALSE;
1111     }
1112     if (strsrch->search->isForwardSearching) {
1113         result = start <= lastmatchindex;
1114     }
1115     else {
1116         result = start >= lastmatchindex;
1117     }
1118     if (!result && !strsrch->search->isOverlap) {
1119         if (strsrch->search->isForwardSearching) {
1120             result = start < lastmatchindex + strsrch->search->matchedLength;
1121         }
1122         else {
1123             result = end > lastmatchindex;
1124         }
1125     }
1126     return result;
1127 }
1128 
1129 /**
1130 * Gets the collation element iterator's current offset.
1131 * @param coleiter collation element iterator
1132 * @param forwards flag TRUE if we are moving in th forwards direction
1133 * @return current offset
1134 */
1135 static
getColElemIterOffset(const UCollationElements * coleiter,UBool forwards)1136 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
1137                                               UBool               forwards)
1138 {
1139     int32_t result = ucol_getOffset(coleiter);
1140     // intricacies of the the backwards collation element iterator
1141     if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
1142         result ++;
1143     }
1144     return result;
1145 }
1146 
1147 /**
1148 * Checks match for contraction.
1149 * If the match ends with a partial contraction we fail.
1150 * If the match starts too far off (because of backwards iteration) we try to
1151 * chip off the extra characters depending on whether a breakiterator has
1152 * been used.
1153 * Internal method, error assumed to be success, caller has to check status
1154 * before calling this method.
1155 * @param strsrch string search data
1156 * @param start offset of potential match, to be modified if necessary
1157 * @param end offset of potential match, to be modified if necessary
1158 * @param status output error status if any
1159 * @return TRUE if match passes the contraction test, FALSE otherwise
1160 */
1161 
1162 static
checkNextExactContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)1163 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
1164                                      int32_t   *start,
1165                                      int32_t   *end, UErrorCode  *status)
1166 {
1167           UCollationElements *coleiter   = strsrch->textIter;
1168           int32_t             textlength = strsrch->search->textLength;
1169           int32_t             temp       = *start;
1170     const UCollator          *collator   = strsrch->collator;
1171     const UChar              *text       = strsrch->search->text;
1172     // This part checks if either ends of the match contains potential
1173     // contraction. If so we'll have to iterate through them
1174     // The start contraction needs to be checked since ucol_previous dumps
1175     // all characters till the first safe character into the buffer.
1176     // *start + 1 is used to test for the unsafe characters instead of *start
1177     // because ucol_prev takes all unsafe characters till the first safe
1178     // character ie *start. so by testing *start + 1, we can estimate if
1179     // excess prefix characters has been included in the potential search
1180     // results.
1181     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1182         (*start + 1 < textlength
1183          && ucol_unsafeCP(text[*start + 1], collator))) {
1184         int32_t expansion  = getExpansionPrefix(coleiter);
1185         UBool   expandflag = expansion > 0;
1186         setColEIterOffset(coleiter, *start);
1187         while (expansion > 0) {
1188             // getting rid of the redundant ce, caused by setOffset.
1189             // since backward contraction/expansion may have extra ces if we
1190             // are in the normalization buffer, hasAccentsBeforeMatch would
1191             // have taken care of it.
1192             // E.g. the character \u01FA will have an expansion of 3, but if
1193             // we are only looking for acute and ring \u030A and \u0301, we'll
1194             // have to skip the first ce in the expansion buffer.
1195             ucol_next(coleiter, status);
1196             if (U_FAILURE(*status)) {
1197                 return FALSE;
1198             }
1199             if (ucol_getOffset(coleiter) != temp) {
1200                 *start = temp;
1201                 temp  = ucol_getOffset(coleiter);
1202             }
1203             expansion --;
1204         }
1205 
1206         int32_t  *patternce       = strsrch->pattern.ces;
1207         int32_t   patterncelength = strsrch->pattern.cesLength;
1208         int32_t   count           = 0;
1209         while (count < patterncelength) {
1210             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1211             if (ce == UCOL_IGNORABLE) {
1212                 continue;
1213             }
1214             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1215                 *start = temp;
1216                 temp   = ucol_getOffset(coleiter);
1217             }
1218             if (U_FAILURE(*status) || ce != patternce[count]) {
1219                 (*end) ++;
1220                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1221                 return FALSE;
1222             }
1223             count ++;
1224         }
1225     }
1226     return TRUE;
1227 }
1228 
1229 /**
1230 * Checks and sets the match information if found.
1231 * Checks
1232 * <ul>
1233 * <li> the potential match does not repeat the previous match
1234 * <li> boundaries are correct
1235 * <li> exact matches has no extra accents
1236 * <li> identical matchesb
1237 * <li> potential match does not end in the middle of a contraction
1238 * <\ul>
1239 * Otherwise the offset will be shifted to the next character.
1240 * Internal method, status assumed to be success, caller has to check status
1241 * before calling this method.
1242 * @param strsrch string search data
1243 * @param textoffset offset in the collation element text. the returned value
1244 *        will be the truncated end offset of the match or the new start
1245 *        search offset.
1246 * @param status output error status if any
1247 * @return TRUE if the match is valid, FALSE otherwise
1248 */
1249 static
checkNextExactMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)1250 inline UBool checkNextExactMatch(UStringSearch *strsrch,
1251                                  int32_t   *textoffset, UErrorCode *status)
1252 {
1253     UCollationElements *coleiter = strsrch->textIter;
1254     int32_t         start    = getColElemIterOffset(coleiter, FALSE);
1255 
1256     if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1257         return FALSE;
1258     }
1259 
1260     // this totally matches, however we need to check if it is repeating
1261     if (!isBreakUnit(strsrch, start, *textoffset) ||
1262         checkRepeatedMatch(strsrch, start, *textoffset) ||
1263         hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
1264         !checkIdentical(strsrch, start, *textoffset) ||
1265         hasAccentsAfterMatch(strsrch, start, *textoffset)) {
1266 
1267         (*textoffset) ++;
1268         *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
1269         return FALSE;
1270     }
1271 
1272     //Add breakiterator boundary check for primary strength search.
1273     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
1274         checkBreakBoundary(strsrch, &start, textoffset);
1275     }
1276 
1277     // totally match, we will get rid of the ending ignorables.
1278     strsrch->search->matchedIndex  = start;
1279     strsrch->search->matchedLength = *textoffset - start;
1280     return TRUE;
1281 }
1282 
1283 /**
1284 * Getting the previous base character offset, or the current offset if the
1285 * current character is a base character
1286 * @param text string
1287 * @param textoffset one offset after the current character
1288 * @return the offset of the next character after the base character or the first
1289 *         composed character with accents
1290 */
1291 static
getPreviousBaseOffset(const UChar * text,int32_t textoffset)1292 inline int32_t getPreviousBaseOffset(const UChar       *text,
1293                                                int32_t  textoffset)
1294 {
1295     if (textoffset > 0) {
1296         for (;;) {
1297             int32_t result = textoffset;
1298             U16_BACK_1(text, 0, textoffset);
1299             int32_t temp = textoffset;
1300             uint16_t fcd = getFCD(text, &temp, result);
1301             if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1302                 if (fcd & LAST_BYTE_MASK_) {
1303                     return textoffset;
1304                 }
1305                 return result;
1306             }
1307             if (textoffset == 0) {
1308                 return 0;
1309             }
1310         }
1311     }
1312     return textoffset;
1313 }
1314 
1315 /**
1316 * Getting the indexes of the accents that are not blocked in the argument
1317 * accent array
1318 * @param accents array of accents in nfd terminated by a 0.
1319 * @param accentsindex array of indexes of the accents that are not blocked
1320 */
1321 static
getUnblockedAccentIndex(UChar * accents,int32_t * accentsindex)1322 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1323 {
1324     int32_t index     = 0;
1325     int32_t     length    = u_strlen(accents);
1326     UChar32     codepoint = 0;
1327     int         cclass    = 0;
1328     int         result    = 0;
1329     int32_t temp;
1330     while (index < length) {
1331         temp = index;
1332         U16_NEXT(accents, index, length, codepoint);
1333         if (u_getCombiningClass(codepoint) != cclass) {
1334             cclass        = u_getCombiningClass(codepoint);
1335             accentsindex[result] = temp;
1336             result ++;
1337         }
1338     }
1339     accentsindex[result] = length;
1340     return result;
1341 }
1342 
1343 /**
1344 * Appends 3 UChar arrays to a destination array.
1345 * Creates a new array if we run out of space. The caller will have to
1346 * manually deallocate the newly allocated array.
1347 * Internal method, status assumed to be success, caller has to check status
1348 * before calling this method. destination not to be NULL and has at least
1349 * size destinationlength.
1350 * @param destination target array
1351 * @param destinationlength target array size, returning the appended length
1352 * @param source1 null-terminated first array
1353 * @param source2 second array
1354 * @param source2length length of seond array
1355 * @param source3 null-terminated third array
1356 * @param status error status if any
1357 * @return new destination array, destination if there was no new allocation
1358 */
1359 static
addToUCharArray(UChar * destination,int32_t * destinationlength,const UChar * source1,const UChar * source2,int32_t source2length,const UChar * source3,UErrorCode * status)1360 inline UChar * addToUCharArray(      UChar      *destination,
1361                                      int32_t    *destinationlength,
1362                                const UChar      *source1,
1363                                const UChar      *source2,
1364                                      int32_t     source2length,
1365                                const UChar      *source3,
1366                                      UErrorCode *status)
1367 {
1368     int32_t source1length = source1 ? u_strlen(source1) : 0;
1369     int32_t source3length = source3 ? u_strlen(source3) : 0;
1370     if (*destinationlength < source1length + source2length + source3length +
1371                                                                            1)
1372     {
1373         destination = (UChar *)allocateMemory(
1374           (source1length + source2length + source3length + 1) * sizeof(UChar),
1375           status);
1376         // if error allocating memory, status will be
1377         // U_MEMORY_ALLOCATION_ERROR
1378         if (U_FAILURE(*status)) {
1379             *destinationlength = 0;
1380             return NULL;
1381         }
1382     }
1383     if (source1length != 0) {
1384         u_memcpy(destination, source1, source1length);
1385     }
1386     if (source2length != 0) {
1387         uprv_memcpy(destination + source1length, source2,
1388                     sizeof(UChar) * source2length);
1389     }
1390     if (source3length != 0) {
1391         uprv_memcpy(destination + source1length + source2length, source3,
1392                     sizeof(UChar) * source3length);
1393     }
1394     *destinationlength = source1length + source2length + source3length;
1395     return destination;
1396 }
1397 
1398 /**
1399 * Running through a collation element iterator to see if the contents matches
1400 * pattern in string search data
1401 * @param strsrch string search data
1402 * @param coleiter collation element iterator
1403 * @return TRUE if a match if found, FALSE otherwise
1404 */
1405 static
checkCollationMatch(const UStringSearch * strsrch,UCollationElements * coleiter)1406 inline UBool checkCollationMatch(const UStringSearch      *strsrch,
1407                                        UCollationElements *coleiter)
1408 {
1409     int         patternceindex = strsrch->pattern.cesLength;
1410     int32_t    *patternce      = strsrch->pattern.ces;
1411     UErrorCode  status = U_ZERO_ERROR;
1412     while (patternceindex > 0) {
1413         int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
1414         if (ce == UCOL_IGNORABLE) {
1415             continue;
1416         }
1417         if (U_FAILURE(status) || ce != *patternce) {
1418             return FALSE;
1419         }
1420         patternce ++;
1421         patternceindex --;
1422     }
1423     return TRUE;
1424 }
1425 
1426 /**
1427 * Rearranges the front accents to try matching.
1428 * Prefix accents in the text will be grouped according to their combining
1429 * class and the groups will be mixed and matched to try find the perfect
1430 * match with the pattern.
1431 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1432 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1433 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1434 *         "\u0301\u0325".
1435 * step 2: check if any of the generated substrings matches the pattern.
1436 * Internal method, status is assumed to be success, caller has to check status
1437 * before calling this method.
1438 * @param strsrch string search match
1439 * @param start first offset of the accents to start searching
1440 * @param end start of the last accent set
1441 * @param status output error status if any
1442 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1443 *         offset of the match. Note this start includes all preceding accents.
1444 */
1445 static
doNextCanonicalPrefixMatch(UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)1446 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1447                                        int32_t    start,
1448                                        int32_t    end,
1449                                        UErrorCode    *status)
1450 {
1451     const UChar       *text       = strsrch->search->text;
1452           int32_t      textlength = strsrch->search->textLength;
1453           int32_t  tempstart  = start;
1454 
1455     if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1456         // die... failed at a base character
1457         return USEARCH_DONE;
1458     }
1459 
1460     int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1461     start = getPreviousBaseOffset(text, tempstart);
1462 
1463     UChar       accents[INITIAL_ARRAY_SIZE_];
1464     // normalizing the offensive string
1465     unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1466                     INITIAL_ARRAY_SIZE_, status);
1467     if (U_FAILURE(*status)) {
1468         return USEARCH_DONE;
1469     }
1470 
1471     int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
1472     int32_t         accentsize = getUnblockedAccentIndex(accents,
1473                                                                  accentsindex);
1474     int32_t         count      = (2 << (accentsize - 1)) - 1;
1475     UChar               buffer[INITIAL_ARRAY_SIZE_];
1476     UCollationElements *coleiter   = strsrch->utilIter;
1477     while (U_SUCCESS(*status) && count > 0) {
1478         UChar *rearrange = strsrch->canonicalPrefixAccents;
1479         // copy the base characters
1480         for (int k = 0; k < accentsindex[0]; k ++) {
1481             *rearrange ++ = accents[k];
1482         }
1483         // forming all possible canonical rearrangement by dropping
1484         // sets of accents
1485         for (int i = 0; i <= accentsize - 1; i ++) {
1486             int32_t mask = 1 << (accentsize - i - 1);
1487             if (count & mask) {
1488                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1489                     *rearrange ++ = accents[j];
1490                 }
1491             }
1492         }
1493         *rearrange = 0;
1494         int32_t  matchsize = INITIAL_ARRAY_SIZE_;
1495         UChar   *match     = addToUCharArray(buffer, &matchsize,
1496                                            strsrch->canonicalPrefixAccents,
1497                                            strsrch->search->text + offset,
1498                                            end - offset,
1499                                            strsrch->canonicalSuffixAccents,
1500                                            status);
1501 
1502         // if status is a failure, ucol_setText does nothing.
1503         // run the collator iterator through this match
1504         ucol_setText(coleiter, match, matchsize, status);
1505         if (U_SUCCESS(*status)) {
1506             if (checkCollationMatch(strsrch, coleiter)) {
1507                 if (match != buffer) {
1508                     uprv_free(match);
1509                 }
1510                 return start;
1511             }
1512         }
1513         count --;
1514     }
1515     return USEARCH_DONE;
1516 }
1517 
1518 /**
1519 * Gets the offset to the safe point in text before textoffset.
1520 * ie. not the middle of a contraction, swappable characters or supplementary
1521 * characters.
1522 * @param collator collation sata
1523 * @param text string to work with
1524 * @param textoffset offset in string
1525 * @param textlength length of text string
1526 * @return offset to the previous safe character
1527 */
1528 static
getPreviousSafeOffset(const UCollator * collator,const UChar * text,int32_t textoffset)1529 inline uint32_t getPreviousSafeOffset(const UCollator   *collator,
1530                                       const UChar       *text,
1531                                             int32_t  textoffset)
1532 {
1533     int32_t result = textoffset; // first contraction character
1534     while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1535         result --;
1536     }
1537     if (result != 0) {
1538         // the first contraction character is consider unsafe here
1539         result --;
1540     }
1541     return result;
1542 }
1543 
1544 /**
1545 * Cleaning up after we passed the safe zone
1546 * @param strsrch string search data
1547 * @param safetext safe text array
1548 * @param safebuffer safe text buffer
1549 * @param coleiter collation element iterator for safe text
1550 */
1551 static
cleanUpSafeText(const UStringSearch * strsrch,UChar * safetext,UChar * safebuffer)1552 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1553                                   UChar         *safebuffer)
1554 {
1555     if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1556     {
1557        uprv_free(safetext);
1558     }
1559 }
1560 
1561 /**
1562 * Take the rearranged end accents and tries matching. If match failed at
1563 * a seperate preceding set of accents (seperated from the rearranged on by
1564 * at least a base character) then we rearrange the preceding accents and
1565 * tries matching again.
1566 * We allow skipping of the ends of the accent set if the ces do not match.
1567 * However if the failure is found before the accent set, it fails.
1568 * Internal method, status assumed to be success, caller has to check status
1569 * before calling this method.
1570 * @param strsrch string search data
1571 * @param textoffset of the start of the rearranged accent
1572 * @param status output error status if any
1573 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1574 *         offset of the match. Note this start includes all preceding accents.
1575 */
1576 static
doNextCanonicalSuffixMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)1577 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
1578                                        int32_t    textoffset,
1579                                        UErrorCode    *status)
1580 {
1581     const UChar              *text           = strsrch->search->text;
1582     const UCollator          *collator       = strsrch->collator;
1583           int32_t             safelength     = 0;
1584           UChar              *safetext;
1585           int32_t             safetextlength;
1586           UChar               safebuffer[INITIAL_ARRAY_SIZE_];
1587           UCollationElements *coleiter       = strsrch->utilIter;
1588           int32_t         safeoffset     = textoffset;
1589 
1590     if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
1591                                          collator)) {
1592         safeoffset     = getPreviousSafeOffset(collator, text, textoffset);
1593         safelength     = textoffset - safeoffset;
1594         safetextlength = INITIAL_ARRAY_SIZE_;
1595         safetext       = addToUCharArray(safebuffer, &safetextlength, NULL,
1596                                          text + safeoffset, safelength,
1597                                          strsrch->canonicalSuffixAccents,
1598                                          status);
1599     }
1600     else {
1601         safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1602         safetext       = strsrch->canonicalSuffixAccents;
1603     }
1604 
1605     // if status is a failure, ucol_setText does nothing
1606     ucol_setText(coleiter, safetext, safetextlength, status);
1607     // status checked in loop below
1608 
1609     int32_t  *ce        = strsrch->pattern.ces;
1610     int32_t   celength  = strsrch->pattern.cesLength;
1611     int       ceindex   = celength - 1;
1612     UBool     isSafe    = TRUE; // indication flag for position in safe zone
1613 
1614     while (ceindex >= 0) {
1615         int32_t textce = ucol_previous(coleiter, status);
1616         if (U_FAILURE(*status)) {
1617             if (isSafe) {
1618                 cleanUpSafeText(strsrch, safetext, safebuffer);
1619             }
1620             return USEARCH_DONE;
1621         }
1622         if (textce == UCOL_NULLORDER) {
1623             // check if we have passed the safe buffer
1624             if (coleiter == strsrch->textIter) {
1625                 cleanUpSafeText(strsrch, safetext, safebuffer);
1626                 return USEARCH_DONE;
1627             }
1628             cleanUpSafeText(strsrch, safetext, safebuffer);
1629             safetext = safebuffer;
1630             coleiter = strsrch->textIter;
1631             setColEIterOffset(coleiter, safeoffset);
1632             // status checked at the start of the loop
1633             isSafe = FALSE;
1634             continue;
1635         }
1636         textce = getCE(strsrch, textce);
1637         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1638             // do the beginning stuff
1639             int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1640             if (isSafe && failedoffset >= safelength) {
1641                 // alas... no hope. failed at rearranged accent set
1642                 cleanUpSafeText(strsrch, safetext, safebuffer);
1643                 return USEARCH_DONE;
1644             }
1645             else {
1646                 if (isSafe) {
1647                     failedoffset += safeoffset;
1648                     cleanUpSafeText(strsrch, safetext, safebuffer);
1649                 }
1650 
1651                 // try rearranging the front accents
1652                 int32_t result = doNextCanonicalPrefixMatch(strsrch,
1653                                         failedoffset, textoffset, status);
1654                 if (result != USEARCH_DONE) {
1655                     // if status is a failure, ucol_setOffset does nothing
1656                     setColEIterOffset(strsrch->textIter, result);
1657                 }
1658                 if (U_FAILURE(*status)) {
1659                     return USEARCH_DONE;
1660                 }
1661                 return result;
1662             }
1663         }
1664         if (textce == ce[ceindex]) {
1665             ceindex --;
1666         }
1667     }
1668     // set offset here
1669     if (isSafe) {
1670         int32_t result     = getColElemIterOffset(coleiter, FALSE);
1671         // sets the text iterator here with the correct expansion and offset
1672         int32_t    leftoverces = getExpansionPrefix(coleiter);
1673         cleanUpSafeText(strsrch, safetext, safebuffer);
1674         if (result >= safelength) {
1675             result = textoffset;
1676         }
1677         else {
1678             result += safeoffset;
1679         }
1680         setColEIterOffset(strsrch->textIter, result);
1681         strsrch->textIter->iteratordata_.toReturn =
1682                        setExpansionPrefix(strsrch->textIter, leftoverces);
1683         return result;
1684     }
1685 
1686     return ucol_getOffset(coleiter);
1687 }
1688 
1689 /**
1690 * Trying out the substring and sees if it can be a canonical match.
1691 * This will try normalizing the end accents and arranging them into canonical
1692 * equivalents and check their corresponding ces with the pattern ce.
1693 * Suffix accents in the text will be grouped according to their combining
1694 * class and the groups will be mixed and matched to try find the perfect
1695 * match with the pattern.
1696 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1697 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1698 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1699 *         "\u0301\u0325".
1700 * step 2: check if any of the generated substrings matches the pattern.
1701 * Internal method, status assumed to be success, caller has to check status
1702 * before calling this method.
1703 * @param strsrch string search data
1704 * @param textoffset end offset in the collation element text that ends with
1705 *                   the accents to be rearranged
1706 * @param status error status if any
1707 * @return TRUE if the match is valid, FALSE otherwise
1708 */
1709 static
doNextCanonicalMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)1710 UBool doNextCanonicalMatch(UStringSearch *strsrch,
1711                            int32_t    textoffset,
1712                            UErrorCode    *status)
1713 {
1714     const UChar       *text = strsrch->search->text;
1715           int32_t  temp = textoffset;
1716     U16_BACK_1(text, 0, temp);
1717     if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1718         UCollationElements *coleiter = strsrch->textIter;
1719         int32_t         offset   = getColElemIterOffset(coleiter, FALSE);
1720         if (strsrch->pattern.hasPrefixAccents) {
1721             offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1722                                                 status);
1723             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1724                 setColEIterOffset(coleiter, offset);
1725                 return TRUE;
1726             }
1727         }
1728         return FALSE;
1729     }
1730 
1731     if (!strsrch->pattern.hasSuffixAccents) {
1732         return FALSE;
1733     }
1734 
1735     UChar       accents[INITIAL_ARRAY_SIZE_];
1736     // offset to the last base character in substring to search
1737     int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1738     // normalizing the offensive string
1739     unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1740                                0, accents, INITIAL_ARRAY_SIZE_, status);
1741     // status checked in loop below
1742 
1743     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1744     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1745 
1746     // 2 power n - 1 plus the full set of accents
1747     int32_t  count = (2 << (size - 1)) - 1;
1748     while (U_SUCCESS(*status) && count > 0) {
1749         UChar *rearrange = strsrch->canonicalSuffixAccents;
1750         // copy the base characters
1751         for (int k = 0; k < accentsindex[0]; k ++) {
1752             *rearrange ++ = accents[k];
1753         }
1754         // forming all possible canonical rearrangement by dropping
1755         // sets of accents
1756         for (int i = 0; i <= size - 1; i ++) {
1757             int32_t mask = 1 << (size - i - 1);
1758             if (count & mask) {
1759                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1760                     *rearrange ++ = accents[j];
1761                 }
1762             }
1763         }
1764         *rearrange = 0;
1765         int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1766                                                         status);
1767         if (offset != USEARCH_DONE) {
1768             return TRUE; // match found
1769         }
1770         count --;
1771     }
1772     return FALSE;
1773 }
1774 
1775 /**
1776 * Gets the previous base character offset depending on the string search
1777 * pattern data
1778 * @param strsrch string search data
1779 * @param textoffset current offset, current character
1780 * @return the offset of the next character after this base character or itself
1781 *         if it is a composed character with accents
1782 */
1783 static
getPreviousUStringSearchBaseOffset(UStringSearch * strsrch,int32_t textoffset)1784 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1785                                                       int32_t textoffset)
1786 {
1787     if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1788         const UChar       *text = strsrch->search->text;
1789               int32_t  offset = textoffset;
1790         if (getFCD(text, &offset, strsrch->search->textLength) >>
1791                                                    SECOND_LAST_BYTE_SHIFT_) {
1792             return getPreviousBaseOffset(text, textoffset);
1793         }
1794     }
1795     return textoffset;
1796 }
1797 
1798 /**
1799 * Checks match for contraction.
1800 * If the match ends with a partial contraction we fail.
1801 * If the match starts too far off (because of backwards iteration) we try to
1802 * chip off the extra characters
1803 * Internal method, status assumed to be success, caller has to check status
1804 * before calling this method.
1805 * @param strsrch string search data
1806 * @param start offset of potential match, to be modified if necessary
1807 * @param end offset of potential match, to be modified if necessary
1808 * @param status output error status if any
1809 * @return TRUE if match passes the contraction test, FALSE otherwise
1810 */
1811 static
checkNextCanonicalContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)1812 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1813                                          int32_t   *start,
1814                                          int32_t   *end,
1815                                          UErrorCode    *status)
1816 {
1817           UCollationElements *coleiter   = strsrch->textIter;
1818           int32_t             textlength = strsrch->search->textLength;
1819           int32_t         temp       = *start;
1820     const UCollator          *collator   = strsrch->collator;
1821     const UChar              *text       = strsrch->search->text;
1822     // This part checks if either ends of the match contains potential
1823     // contraction. If so we'll have to iterate through them
1824     if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1825         (*start + 1 < textlength
1826          && ucol_unsafeCP(text[*start + 1], collator))) {
1827         int32_t expansion  = getExpansionPrefix(coleiter);
1828         UBool   expandflag = expansion > 0;
1829         setColEIterOffset(coleiter, *start);
1830         while (expansion > 0) {
1831             // getting rid of the redundant ce, caused by setOffset.
1832             // since backward contraction/expansion may have extra ces if we
1833             // are in the normalization buffer, hasAccentsBeforeMatch would
1834             // have taken care of it.
1835             // E.g. the character \u01FA will have an expansion of 3, but if
1836             // we are only looking for acute and ring \u030A and \u0301, we'll
1837             // have to skip the first ce in the expansion buffer.
1838             ucol_next(coleiter, status);
1839             if (U_FAILURE(*status)) {
1840                 return FALSE;
1841             }
1842             if (ucol_getOffset(coleiter) != temp) {
1843                 *start = temp;
1844                 temp  = ucol_getOffset(coleiter);
1845             }
1846             expansion --;
1847         }
1848 
1849         int32_t  *patternce       = strsrch->pattern.ces;
1850         int32_t   patterncelength = strsrch->pattern.cesLength;
1851         int32_t   count           = 0;
1852         int32_t   textlength      = strsrch->search->textLength;
1853         while (count < patterncelength) {
1854             int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1855             // status checked below, note that if status is a failure
1856             // ucol_next returns UCOL_NULLORDER
1857             if (ce == UCOL_IGNORABLE) {
1858                 continue;
1859             }
1860             if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1861                 *start = temp;
1862                 temp   = ucol_getOffset(coleiter);
1863             }
1864 
1865             if (count == 0 && ce != patternce[0]) {
1866                 // accents may have extra starting ces, this occurs when a
1867                 // pure accent pattern is matched without rearrangement
1868                 // text \u0325\u0300 and looking for \u0300
1869                 int32_t expected = patternce[0];
1870                 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1871                     ce = getCE(strsrch, ucol_next(coleiter, status));
1872                     while (U_SUCCESS(*status) && ce != expected &&
1873                            ce != UCOL_NULLORDER &&
1874                            ucol_getOffset(coleiter) <= *end) {
1875                         ce = getCE(strsrch, ucol_next(coleiter, status));
1876                     }
1877                 }
1878             }
1879             if (U_FAILURE(*status) || ce != patternce[count]) {
1880                 (*end) ++;
1881                 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1882                 return FALSE;
1883             }
1884             count ++;
1885         }
1886     }
1887     return TRUE;
1888 }
1889 
1890 /**
1891 * Checks and sets the match information if found.
1892 * Checks
1893 * <ul>
1894 * <li> the potential match does not repeat the previous match
1895 * <li> boundaries are correct
1896 * <li> potential match does not end in the middle of a contraction
1897 * <li> identical matches
1898 * <\ul>
1899 * Otherwise the offset will be shifted to the next character.
1900 * Internal method, status assumed to be success, caller has to check the
1901 * status before calling this method.
1902 * @param strsrch string search data
1903 * @param textoffset offset in the collation element text. the returned value
1904 *        will be the truncated end offset of the match or the new start
1905 *        search offset.
1906 * @param status output error status if any
1907 * @return TRUE if the match is valid, FALSE otherwise
1908 */
1909 static
checkNextCanonicalMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)1910 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1911                                      int32_t   *textoffset,
1912                                      UErrorCode    *status)
1913 {
1914     // to ensure that the start and ends are not composite characters
1915     UCollationElements *coleiter = strsrch->textIter;
1916     // if we have a canonical accent match
1917     if ((strsrch->pattern.hasSuffixAccents &&
1918         strsrch->canonicalSuffixAccents[0]) ||
1919         (strsrch->pattern.hasPrefixAccents &&
1920         strsrch->canonicalPrefixAccents[0])) {
1921         strsrch->search->matchedIndex  = getPreviousUStringSearchBaseOffset(
1922                                                     strsrch,
1923                                                     ucol_getOffset(coleiter));
1924         strsrch->search->matchedLength = *textoffset -
1925                                                 strsrch->search->matchedIndex;
1926         return TRUE;
1927     }
1928 
1929     int32_t start = getColElemIterOffset(coleiter, FALSE);
1930     if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
1931                                             status) || U_FAILURE(*status)) {
1932         return FALSE;
1933     }
1934 
1935     start = getPreviousUStringSearchBaseOffset(strsrch, start);
1936     // this totally matches, however we need to check if it is repeating
1937     if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1938         !isBreakUnit(strsrch, start, *textoffset) ||
1939         !checkIdentical(strsrch, start, *textoffset)) {
1940         (*textoffset) ++;
1941         *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
1942                                         strsrch->search->textLength);
1943         return FALSE;
1944     }
1945 
1946     strsrch->search->matchedIndex  = start;
1947     strsrch->search->matchedLength = *textoffset - start;
1948     return TRUE;
1949 }
1950 
1951 /**
1952 * Shifting the collation element iterator position forward to prepare for
1953 * a preceding match. If the first character is a unsafe character, we'll only
1954 * shift by 1 to capture contractions, normalization etc.
1955 * Internal method, status assumed to be success, caller has to check status
1956 * before calling this method.
1957 * @param text strsrch string search data
1958 * @param textoffset start text position to do search
1959 * @param ce the text ce which failed the match.
1960 * @param patternceindex index of the ce within the pattern ce buffer which
1961 *        failed the match
1962 * @return final offset
1963 */
1964 static
reverseShift(UStringSearch * strsrch,int32_t textoffset,int32_t ce,int32_t patternceindex)1965 inline int32_t reverseShift(UStringSearch *strsrch,
1966                                 int32_t    textoffset,
1967                                 int32_t       ce,
1968                                 int32_t        patternceindex)
1969 {
1970     if (strsrch->search->isOverlap) {
1971         if (textoffset != strsrch->search->textLength) {
1972             textoffset --;
1973         }
1974         else {
1975             textoffset -= strsrch->pattern.defaultShiftSize;
1976         }
1977     }
1978     else {
1979         if (ce != UCOL_NULLORDER) {
1980             int32_t shift = strsrch->pattern.backShift[hashFromCE32(ce)];
1981 
1982             // this is to adjust for characters in the middle of the substring
1983             // for matching that failed.
1984             int32_t adjust = patternceindex;
1985             if (adjust > 1 && shift > adjust) {
1986                 shift -= adjust - 1;
1987             }
1988             textoffset -= shift;
1989         }
1990         else {
1991             textoffset -= strsrch->pattern.defaultShiftSize;
1992         }
1993     }
1994     textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1995     return textoffset;
1996 }
1997 
1998 /**
1999 * Checks match for contraction.
2000 * If the match starts with a partial contraction we fail.
2001 * Internal method, status assumed to be success, caller has to check status
2002 * before calling this method.
2003 * @param strsrch string search data
2004 * @param start offset of potential match, to be modified if necessary
2005 * @param end offset of potential match, to be modified if necessary
2006 * @param status output error status if any
2007 * @return TRUE if match passes the contraction test, FALSE otherwise
2008 */
2009 static
checkPreviousExactContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)2010 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
2011                                      int32_t   *start,
2012                                      int32_t   *end, UErrorCode  *status)
2013 {
2014           UCollationElements *coleiter   = strsrch->textIter;
2015           int32_t             textlength = strsrch->search->textLength;
2016           int32_t             temp       = *end;
2017     const UCollator          *collator   = strsrch->collator;
2018     const UChar              *text       = strsrch->search->text;
2019     // This part checks if either if the start of the match contains potential
2020     // contraction. If so we'll have to iterate through them
2021     // Since we used ucol_next while previously looking for the potential
2022     // match, this guarantees that our end will not be a partial contraction,
2023     // or a partial supplementary character.
2024     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2025         int32_t expansion  = getExpansionSuffix(coleiter);
2026         UBool   expandflag = expansion > 0;
2027         setColEIterOffset(coleiter, *end);
2028         while (U_SUCCESS(*status) && expansion > 0) {
2029             // getting rid of the redundant ce
2030             // since forward contraction/expansion may have extra ces
2031             // if we are in the normalization buffer, hasAccentsBeforeMatch
2032             // would have taken care of it.
2033             // E.g. the character \u01FA will have an expansion of 3, but if
2034             // we are only looking for A ring A\u030A, we'll have to skip the
2035             // last ce in the expansion buffer
2036             ucol_previous(coleiter, status);
2037             if (U_FAILURE(*status)) {
2038                 return FALSE;
2039             }
2040             if (ucol_getOffset(coleiter) != temp) {
2041                 *end = temp;
2042                 temp  = ucol_getOffset(coleiter);
2043             }
2044             expansion --;
2045         }
2046 
2047         int32_t  *patternce       = strsrch->pattern.ces;
2048         int32_t   patterncelength = strsrch->pattern.cesLength;
2049         int32_t   count           = patterncelength;
2050         while (count > 0) {
2051             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2052             // status checked below, note that if status is a failure
2053             // ucol_previous returns UCOL_NULLORDER
2054             if (ce == UCOL_IGNORABLE) {
2055                 continue;
2056             }
2057             if (expandflag && count == 0 &&
2058                 getColElemIterOffset(coleiter, FALSE) != temp) {
2059                 *end = temp;
2060                 temp  = ucol_getOffset(coleiter);
2061             }
2062             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2063                 (*start) --;
2064                 *start = getPreviousBaseOffset(text, *start);
2065                 return FALSE;
2066             }
2067             count --;
2068         }
2069     }
2070     return TRUE;
2071 }
2072 
2073 /**
2074 * Checks and sets the match information if found.
2075 * Checks
2076 * <ul>
2077 * <li> the current match does not repeat the last match
2078 * <li> boundaries are correct
2079 * <li> exact matches has no extra accents
2080 * <li> identical matches
2081 * <\ul>
2082 * Otherwise the offset will be shifted to the preceding character.
2083 * Internal method, status assumed to be success, caller has to check status
2084 * before calling this method.
2085 * @param strsrch string search data
2086 * @param collator
2087 * @param coleiter collation element iterator
2088 * @param text string
2089 * @param textoffset offset in the collation element text. the returned value
2090 *        will be the truncated start offset of the match or the new start
2091 *        search offset.
2092 * @param status output error status if any
2093 * @return TRUE if the match is valid, FALSE otherwise
2094 */
2095 static
checkPreviousExactMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)2096 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
2097                                      int32_t   *textoffset,
2098                                      UErrorCode    *status)
2099 {
2100     // to ensure that the start and ends are not composite characters
2101     int32_t end = ucol_getOffset(strsrch->textIter);
2102     if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
2103         || U_FAILURE(*status)) {
2104             return FALSE;
2105     }
2106 
2107     // this totally matches, however we need to check if it is repeating
2108     // the old match
2109     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2110         !isBreakUnit(strsrch, *textoffset, end) ||
2111         hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
2112         !checkIdentical(strsrch, *textoffset, end) ||
2113         hasAccentsAfterMatch(strsrch, *textoffset, end)) {
2114         (*textoffset) --;
2115         *textoffset = getPreviousBaseOffset(strsrch->search->text,
2116                                             *textoffset);
2117         return FALSE;
2118     }
2119 
2120     //Add breakiterator boundary check for primary strength search.
2121     if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
2122         checkBreakBoundary(strsrch, textoffset, &end);
2123     }
2124 
2125     strsrch->search->matchedIndex = *textoffset;
2126     strsrch->search->matchedLength = end - *textoffset;
2127     return TRUE;
2128 }
2129 
2130 /**
2131 * Rearranges the end accents to try matching.
2132 * Suffix accents in the text will be grouped according to their combining
2133 * class and the groups will be mixed and matched to try find the perfect
2134 * match with the pattern.
2135 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2136 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2137 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2138 *         "\u0301\u0325".
2139 * step 2: check if any of the generated substrings matches the pattern.
2140 * Internal method, status assumed to be success, user has to check status
2141 * before calling this method.
2142 * @param strsrch string search match
2143 * @param start offset of the first base character
2144 * @param end start of the last accent set
2145 * @param status only error status if any
2146 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2147 *         offset of the match. Note this start includes all following accents.
2148 */
2149 static
doPreviousCanonicalSuffixMatch(UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)2150 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
2151                                            int32_t    start,
2152                                            int32_t    end,
2153                                            UErrorCode    *status)
2154 {
2155     const UChar       *text       = strsrch->search->text;
2156           int32_t  tempend    = end;
2157 
2158     U16_BACK_1(text, 0, tempend);
2159     if (!(getFCD(text, &tempend, strsrch->search->textLength) &
2160                                                            LAST_BYTE_MASK_)) {
2161         // die... failed at a base character
2162         return USEARCH_DONE;
2163     }
2164     end = getNextBaseOffset(text, end, strsrch->search->textLength);
2165 
2166     if (U_SUCCESS(*status)) {
2167         UChar       accents[INITIAL_ARRAY_SIZE_];
2168         int32_t offset = getPreviousBaseOffset(text, end);
2169         // normalizing the offensive string
2170         unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
2171                         INITIAL_ARRAY_SIZE_, status);
2172 
2173         int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
2174         int32_t         accentsize = getUnblockedAccentIndex(accents,
2175                                                          accentsindex);
2176         int32_t         count      = (2 << (accentsize - 1)) - 1;
2177         UChar               buffer[INITIAL_ARRAY_SIZE_];
2178         UCollationElements *coleiter = strsrch->utilIter;
2179         while (U_SUCCESS(*status) && count > 0) {
2180             UChar *rearrange = strsrch->canonicalSuffixAccents;
2181             // copy the base characters
2182             for (int k = 0; k < accentsindex[0]; k ++) {
2183                 *rearrange ++ = accents[k];
2184             }
2185             // forming all possible canonical rearrangement by dropping
2186             // sets of accents
2187             for (int i = 0; i <= accentsize - 1; i ++) {
2188                 int32_t mask = 1 << (accentsize - i - 1);
2189                 if (count & mask) {
2190                     for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2191                         *rearrange ++ = accents[j];
2192                     }
2193                 }
2194             }
2195             *rearrange = 0;
2196             int32_t  matchsize = INITIAL_ARRAY_SIZE_;
2197             UChar   *match     = addToUCharArray(buffer, &matchsize,
2198                                            strsrch->canonicalPrefixAccents,
2199                                            strsrch->search->text + start,
2200                                            offset - start,
2201                                            strsrch->canonicalSuffixAccents,
2202                                            status);
2203 
2204             // run the collator iterator through this match
2205             // if status is a failure ucol_setText does nothing
2206             ucol_setText(coleiter, match, matchsize, status);
2207             if (U_SUCCESS(*status)) {
2208                 if (checkCollationMatch(strsrch, coleiter)) {
2209                     if (match != buffer) {
2210                         uprv_free(match);
2211                     }
2212                     return end;
2213                 }
2214             }
2215             count --;
2216         }
2217     }
2218     return USEARCH_DONE;
2219 }
2220 
2221 /**
2222 * Take the rearranged start accents and tries matching. If match failed at
2223 * a seperate following set of accents (seperated from the rearranged on by
2224 * at least a base character) then we rearrange the preceding accents and
2225 * tries matching again.
2226 * We allow skipping of the ends of the accent set if the ces do not match.
2227 * However if the failure is found before the accent set, it fails.
2228 * Internal method, status assumed to be success, caller has to check status
2229 * before calling this method.
2230 * @param strsrch string search data
2231 * @param textoffset of the ends of the rearranged accent
2232 * @param status output error status if any
2233 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2234 *         offset of the match. Note this start includes all following accents.
2235 */
2236 static
doPreviousCanonicalPrefixMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)2237 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
2238                                            int32_t    textoffset,
2239                                            UErrorCode    *status)
2240 {
2241     const UChar       *text       = strsrch->search->text;
2242     const UCollator   *collator   = strsrch->collator;
2243           int32_t      safelength = 0;
2244           UChar       *safetext;
2245           int32_t      safetextlength;
2246           UChar        safebuffer[INITIAL_ARRAY_SIZE_];
2247           int32_t  safeoffset = textoffset;
2248 
2249     if (textoffset &&
2250         ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2251                                  u_strlen(strsrch->canonicalPrefixAccents) - 1
2252                                          ], collator)) {
2253         safeoffset     = getNextSafeOffset(collator, text, textoffset,
2254                                            strsrch->search->textLength);
2255         safelength     = safeoffset - textoffset;
2256         safetextlength = INITIAL_ARRAY_SIZE_;
2257         safetext       = addToUCharArray(safebuffer, &safetextlength,
2258                                          strsrch->canonicalPrefixAccents,
2259                                          text + textoffset, safelength,
2260                                          NULL, status);
2261     }
2262     else {
2263         safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2264         safetext       = strsrch->canonicalPrefixAccents;
2265     }
2266 
2267     UCollationElements *coleiter = strsrch->utilIter;
2268      // if status is a failure, ucol_setText does nothing
2269     ucol_setText(coleiter, safetext, safetextlength, status);
2270     // status checked in loop below
2271 
2272     int32_t  *ce           = strsrch->pattern.ces;
2273     int32_t   celength     = strsrch->pattern.cesLength;
2274     int       ceindex      = 0;
2275     UBool     isSafe       = TRUE; // safe zone indication flag for position
2276     int32_t   prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
2277 
2278     while (ceindex < celength) {
2279         int32_t textce = ucol_next(coleiter, status);
2280         if (U_FAILURE(*status)) {
2281             if (isSafe) {
2282                 cleanUpSafeText(strsrch, safetext, safebuffer);
2283             }
2284             return USEARCH_DONE;
2285         }
2286         if (textce == UCOL_NULLORDER) {
2287             // check if we have passed the safe buffer
2288             if (coleiter == strsrch->textIter) {
2289                 cleanUpSafeText(strsrch, safetext, safebuffer);
2290                 return USEARCH_DONE;
2291             }
2292             cleanUpSafeText(strsrch, safetext, safebuffer);
2293             safetext = safebuffer;
2294             coleiter = strsrch->textIter;
2295             setColEIterOffset(coleiter, safeoffset);
2296             // status checked at the start of the loop
2297             isSafe = FALSE;
2298             continue;
2299         }
2300         textce = getCE(strsrch, textce);
2301         if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2302             // do the beginning stuff
2303             int32_t failedoffset = ucol_getOffset(coleiter);
2304             if (isSafe && failedoffset <= prefixlength) {
2305                 // alas... no hope. failed at rearranged accent set
2306                 cleanUpSafeText(strsrch, safetext, safebuffer);
2307                 return USEARCH_DONE;
2308             }
2309             else {
2310                 if (isSafe) {
2311                     failedoffset = safeoffset - failedoffset;
2312                     cleanUpSafeText(strsrch, safetext, safebuffer);
2313                 }
2314 
2315                 // try rearranging the end accents
2316                 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
2317                                         textoffset, failedoffset, status);
2318                 if (result != USEARCH_DONE) {
2319                     // if status is a failure, ucol_setOffset does nothing
2320                     setColEIterOffset(strsrch->textIter, result);
2321                 }
2322                 if (U_FAILURE(*status)) {
2323                     return USEARCH_DONE;
2324                 }
2325                 return result;
2326             }
2327         }
2328         if (textce == ce[ceindex]) {
2329             ceindex ++;
2330         }
2331     }
2332     // set offset here
2333     if (isSafe) {
2334         int32_t result      = ucol_getOffset(coleiter);
2335         // sets the text iterator here with the correct expansion and offset
2336         int32_t     leftoverces = getExpansionSuffix(coleiter);
2337         cleanUpSafeText(strsrch, safetext, safebuffer);
2338         if (result <= prefixlength) {
2339             result = textoffset;
2340         }
2341         else {
2342             result = textoffset + (safeoffset - result);
2343         }
2344         setColEIterOffset(strsrch->textIter, result);
2345         setExpansionSuffix(strsrch->textIter, leftoverces);
2346         return result;
2347     }
2348 
2349     return ucol_getOffset(coleiter);
2350 }
2351 
2352 /**
2353 * Trying out the substring and sees if it can be a canonical match.
2354 * This will try normalizing the starting accents and arranging them into
2355 * canonical equivalents and check their corresponding ces with the pattern ce.
2356 * Prefix accents in the text will be grouped according to their combining
2357 * class and the groups will be mixed and matched to try find the perfect
2358 * match with the pattern.
2359 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2360 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2361 *         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2362 *         "\u0301\u0325".
2363 * step 2: check if any of the generated substrings matches the pattern.
2364 * Internal method, status assumed to be success, caller has to check status
2365 * before calling this method.
2366 * @param strsrch string search data
2367 * @param textoffset start offset in the collation element text that starts
2368 *                   with the accents to be rearranged
2369 * @param status output error status if any
2370 * @return TRUE if the match is valid, FALSE otherwise
2371 */
2372 static
doPreviousCanonicalMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)2373 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2374                                int32_t    textoffset,
2375                                UErrorCode    *status)
2376 {
2377     const UChar       *text       = strsrch->search->text;
2378           int32_t  temp       = textoffset;
2379           int32_t      textlength = strsrch->search->textLength;
2380     if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2381         UCollationElements *coleiter = strsrch->textIter;
2382         int32_t         offset   = ucol_getOffset(coleiter);
2383         if (strsrch->pattern.hasSuffixAccents) {
2384             offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
2385                                                     offset, status);
2386             if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2387                 setColEIterOffset(coleiter, offset);
2388                 return TRUE;
2389             }
2390         }
2391         return FALSE;
2392     }
2393 
2394     if (!strsrch->pattern.hasPrefixAccents) {
2395         return FALSE;
2396     }
2397 
2398     UChar       accents[INITIAL_ARRAY_SIZE_];
2399     // offset to the last base character in substring to search
2400     int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2401     // normalizing the offensive string
2402     unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2403                                0, accents, INITIAL_ARRAY_SIZE_, status);
2404     // status checked in loop
2405 
2406     int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2407     int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2408 
2409     // 2 power n - 1 plus the full set of accents
2410     int32_t  count = (2 << (size - 1)) - 1;
2411     while (U_SUCCESS(*status) && count > 0) {
2412         UChar *rearrange = strsrch->canonicalPrefixAccents;
2413         // copy the base characters
2414         for (int k = 0; k < accentsindex[0]; k ++) {
2415             *rearrange ++ = accents[k];
2416         }
2417         // forming all possible canonical rearrangement by dropping
2418         // sets of accents
2419         for (int i = 0; i <= size - 1; i ++) {
2420             int32_t mask = 1 << (size - i - 1);
2421             if (count & mask) {
2422                 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2423                     *rearrange ++ = accents[j];
2424                 }
2425             }
2426         }
2427         *rearrange = 0;
2428         int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
2429                                                           baseoffset, status);
2430         if (offset != USEARCH_DONE) {
2431             return TRUE; // match found
2432         }
2433         count --;
2434     }
2435     return FALSE;
2436 }
2437 
2438 /**
2439 * Checks match for contraction.
2440 * If the match starts with a partial contraction we fail.
2441 * Internal method, status assumed to be success, caller has to check status
2442 * before calling this method.
2443 * @param strsrch string search data
2444 * @param start offset of potential match, to be modified if necessary
2445 * @param end offset of potential match, to be modified if necessary
2446 * @param status only error status if any
2447 * @return TRUE if match passes the contraction test, FALSE otherwise
2448 */
2449 static
checkPreviousCanonicalContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)2450 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2451                                      int32_t   *start,
2452                                      int32_t   *end, UErrorCode  *status)
2453 {
2454           UCollationElements *coleiter   = strsrch->textIter;
2455           int32_t             textlength = strsrch->search->textLength;
2456           int32_t         temp       = *end;
2457     const UCollator          *collator   = strsrch->collator;
2458     const UChar              *text       = strsrch->search->text;
2459     // This part checks if either if the start of the match contains potential
2460     // contraction. If so we'll have to iterate through them
2461     // Since we used ucol_next while previously looking for the potential
2462     // match, this guarantees that our end will not be a partial contraction,
2463     // or a partial supplementary character.
2464     if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2465         int32_t expansion  = getExpansionSuffix(coleiter);
2466         UBool   expandflag = expansion > 0;
2467         setColEIterOffset(coleiter, *end);
2468         while (expansion > 0) {
2469             // getting rid of the redundant ce
2470             // since forward contraction/expansion may have extra ces
2471             // if we are in the normalization buffer, hasAccentsBeforeMatch
2472             // would have taken care of it.
2473             // E.g. the character \u01FA will have an expansion of 3, but if
2474             // we are only looking for A ring A\u030A, we'll have to skip the
2475             // last ce in the expansion buffer
2476             ucol_previous(coleiter, status);
2477             if (U_FAILURE(*status)) {
2478                 return FALSE;
2479             }
2480             if (ucol_getOffset(coleiter) != temp) {
2481                 *end = temp;
2482                 temp  = ucol_getOffset(coleiter);
2483             }
2484             expansion --;
2485         }
2486 
2487         int32_t  *patternce       = strsrch->pattern.ces;
2488         int32_t   patterncelength = strsrch->pattern.cesLength;
2489         int32_t   count           = patterncelength;
2490         while (count > 0) {
2491             int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2492             // status checked below, note that if status is a failure
2493             // ucol_previous returns UCOL_NULLORDER
2494             if (ce == UCOL_IGNORABLE) {
2495                 continue;
2496             }
2497             if (expandflag && count == 0 &&
2498                 getColElemIterOffset(coleiter, FALSE) != temp) {
2499                 *end = temp;
2500                 temp  = ucol_getOffset(coleiter);
2501             }
2502             if (count == patterncelength &&
2503                 ce != patternce[patterncelength - 1]) {
2504                 // accents may have extra starting ces, this occurs when a
2505                 // pure accent pattern is matched without rearrangement
2506                 int32_t    expected = patternce[patterncelength - 1];
2507                 U16_BACK_1(text, 0, *end);
2508                 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2509                     ce = getCE(strsrch, ucol_previous(coleiter, status));
2510                     while (U_SUCCESS(*status) && ce != expected &&
2511                            ce != UCOL_NULLORDER &&
2512                            ucol_getOffset(coleiter) <= *start) {
2513                         ce = getCE(strsrch, ucol_previous(coleiter, status));
2514                     }
2515                 }
2516             }
2517             if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2518                 (*start) --;
2519                 *start = getPreviousBaseOffset(text, *start);
2520                 return FALSE;
2521             }
2522             count --;
2523         }
2524     }
2525     return TRUE;
2526 }
2527 
2528 /**
2529 * Checks and sets the match information if found.
2530 * Checks
2531 * <ul>
2532 * <li> the potential match does not repeat the previous match
2533 * <li> boundaries are correct
2534 * <li> potential match does not end in the middle of a contraction
2535 * <li> identical matches
2536 * <\ul>
2537 * Otherwise the offset will be shifted to the next character.
2538 * Internal method, status assumed to be success, caller has to check status
2539 * before calling this method.
2540 * @param strsrch string search data
2541 * @param textoffset offset in the collation element text. the returned value
2542 *        will be the truncated start offset of the match or the new start
2543 *        search offset.
2544 * @param status only error status if any
2545 * @return TRUE if the match is valid, FALSE otherwise
2546 */
2547 static
checkPreviousCanonicalMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)2548 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2549                                          int32_t   *textoffset,
2550                                          UErrorCode    *status)
2551 {
2552     // to ensure that the start and ends are not composite characters
2553     UCollationElements *coleiter = strsrch->textIter;
2554     // if we have a canonical accent match
2555     if ((strsrch->pattern.hasSuffixAccents &&
2556         strsrch->canonicalSuffixAccents[0]) ||
2557         (strsrch->pattern.hasPrefixAccents &&
2558         strsrch->canonicalPrefixAccents[0])) {
2559         strsrch->search->matchedIndex  = *textoffset;
2560         strsrch->search->matchedLength =
2561             getNextUStringSearchBaseOffset(strsrch,
2562                                       getColElemIterOffset(coleiter, FALSE))
2563             - *textoffset;
2564         return TRUE;
2565     }
2566 
2567     int32_t end = ucol_getOffset(coleiter);
2568     if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2569                                                 status) ||
2570          U_FAILURE(*status)) {
2571         return FALSE;
2572     }
2573 
2574     end = getNextUStringSearchBaseOffset(strsrch, end);
2575     // this totally matches, however we need to check if it is repeating
2576     if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2577         !isBreakUnit(strsrch, *textoffset, end) ||
2578         !checkIdentical(strsrch, *textoffset, end)) {
2579         (*textoffset) --;
2580         *textoffset = getPreviousBaseOffset(strsrch->search->text,
2581                                             *textoffset);
2582         return FALSE;
2583     }
2584 
2585     strsrch->search->matchedIndex  = *textoffset;
2586     strsrch->search->matchedLength = end - *textoffset;
2587     return TRUE;
2588 }
2589 #endif // #if BOYER_MOORE
2590 
2591 // constructors and destructor -------------------------------------------
2592 
usearch_open(const UChar * pattern,int32_t patternlength,const UChar * text,int32_t textlength,const char * locale,UBreakIterator * breakiter,UErrorCode * status)2593 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2594                                           int32_t         patternlength,
2595                                     const UChar          *text,
2596                                           int32_t         textlength,
2597                                     const char           *locale,
2598                                           UBreakIterator *breakiter,
2599                                           UErrorCode     *status)
2600 {
2601     if (U_FAILURE(*status)) {
2602         return NULL;
2603     }
2604 #if UCONFIG_NO_BREAK_ITERATION
2605     if (breakiter != NULL) {
2606         *status = U_UNSUPPORTED_ERROR;
2607         return NULL;
2608     }
2609 #endif
2610     if (locale) {
2611         // ucol_open internally checks for status
2612         UCollator     *collator = ucol_open(locale, status);
2613         // pattern, text checks are done in usearch_openFromCollator
2614         UStringSearch *result   = usearch_openFromCollator(pattern,
2615                                               patternlength, text, textlength,
2616                                               collator, breakiter, status);
2617 
2618         if (result == NULL || U_FAILURE(*status)) {
2619             if (collator) {
2620                 ucol_close(collator);
2621             }
2622             return NULL;
2623         }
2624         else {
2625             result->ownCollator = TRUE;
2626         }
2627         return result;
2628     }
2629     *status = U_ILLEGAL_ARGUMENT_ERROR;
2630     return NULL;
2631 }
2632 
usearch_openFromCollator(const UChar * pattern,int32_t patternlength,const UChar * text,int32_t textlength,const UCollator * collator,UBreakIterator * breakiter,UErrorCode * status)2633 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2634                                   const UChar          *pattern,
2635                                         int32_t         patternlength,
2636                                   const UChar          *text,
2637                                         int32_t         textlength,
2638                                   const UCollator      *collator,
2639                                         UBreakIterator *breakiter,
2640                                         UErrorCode     *status)
2641 {
2642     if (U_FAILURE(*status)) {
2643         return NULL;
2644     }
2645 #if UCONFIG_NO_BREAK_ITERATION
2646     if (breakiter != NULL) {
2647         *status = U_UNSUPPORTED_ERROR;
2648         return NULL;
2649     }
2650 #endif
2651     if (pattern == NULL || text == NULL || collator == NULL) {
2652         *status = U_ILLEGAL_ARGUMENT_ERROR;
2653         return NULL;
2654     }
2655 
2656     // string search does not really work when numeric collation is turned on
2657     if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2658         *status = U_UNSUPPORTED_ERROR;
2659         return NULL;
2660     }
2661 
2662     if (U_SUCCESS(*status)) {
2663         initializeFCD(status);
2664         if (U_FAILURE(*status)) {
2665             return NULL;
2666         }
2667 
2668         UStringSearch *result;
2669         if (textlength == -1) {
2670             textlength = u_strlen(text);
2671         }
2672         if (patternlength == -1) {
2673             patternlength = u_strlen(pattern);
2674         }
2675         if (textlength <= 0 || patternlength <= 0) {
2676             *status = U_ILLEGAL_ARGUMENT_ERROR;
2677             return NULL;
2678         }
2679 
2680         result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2681         if (result == NULL) {
2682             *status = U_MEMORY_ALLOCATION_ERROR;
2683             return NULL;
2684         }
2685 
2686         result->collator    = collator;
2687         result->strength    = ucol_getStrength(collator);
2688         result->ceMask      = getMask(result->strength);
2689         result->toShift     =
2690              ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2691                                                             UCOL_SHIFTED;
2692         result->variableTop = ucol_getVariableTop(collator, status);
2693 
2694         result->nfd         = Normalizer2::getNFDInstance(*status);
2695 
2696         if (U_FAILURE(*status)) {
2697             uprv_free(result);
2698             return NULL;
2699         }
2700 
2701         result->search             = (USearch *)uprv_malloc(sizeof(USearch));
2702         if (result->search == NULL) {
2703             *status = U_MEMORY_ALLOCATION_ERROR;
2704             uprv_free(result);
2705             return NULL;
2706         }
2707 
2708         result->search->text       = text;
2709         result->search->textLength = textlength;
2710 
2711         result->pattern.text       = pattern;
2712         result->pattern.textLength = patternlength;
2713         result->pattern.ces         = NULL;
2714         result->pattern.pces        = NULL;
2715 
2716         result->search->breakIter  = breakiter;
2717 #if !UCONFIG_NO_BREAK_ITERATION
2718         result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
2719         if (breakiter) {
2720             ubrk_setText(breakiter, text, textlength, status);
2721         }
2722 #endif
2723 
2724         result->ownCollator           = FALSE;
2725         result->search->matchedLength = 0;
2726         result->search->matchedIndex  = USEARCH_DONE;
2727         result->utilIter              = NULL;
2728         result->textIter              = ucol_openElements(collator, text,
2729                                                           textlength, status);
2730         result->textProcessedIter     = NULL;
2731         if (U_FAILURE(*status)) {
2732             usearch_close(result);
2733             return NULL;
2734         }
2735 
2736         result->search->isOverlap          = FALSE;
2737         result->search->isCanonicalMatch   = FALSE;
2738         result->search->elementComparisonType = 0;
2739         result->search->isForwardSearching = TRUE;
2740         result->search->reset              = TRUE;
2741 
2742         initialize(result, status);
2743 
2744         if (U_FAILURE(*status)) {
2745             usearch_close(result);
2746             return NULL;
2747         }
2748 
2749         return result;
2750     }
2751     return NULL;
2752 }
2753 
usearch_close(UStringSearch * strsrch)2754 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2755 {
2756     if (strsrch) {
2757         if (strsrch->pattern.ces != strsrch->pattern.cesBuffer &&
2758             strsrch->pattern.ces) {
2759             uprv_free(strsrch->pattern.ces);
2760         }
2761 
2762         if (strsrch->pattern.pces != NULL &&
2763             strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
2764             uprv_free(strsrch->pattern.pces);
2765         }
2766 
2767         delete strsrch->textProcessedIter;
2768         ucol_closeElements(strsrch->textIter);
2769         ucol_closeElements(strsrch->utilIter);
2770 
2771         if (strsrch->ownCollator && strsrch->collator) {
2772             ucol_close((UCollator *)strsrch->collator);
2773         }
2774 
2775 #if !UCONFIG_NO_BREAK_ITERATION
2776         if (strsrch->search->internalBreakIter) {
2777             ubrk_close(strsrch->search->internalBreakIter);
2778         }
2779 #endif
2780 
2781         uprv_free(strsrch->search);
2782         uprv_free(strsrch);
2783     }
2784 }
2785 
2786 namespace {
2787 
initTextProcessedIter(UStringSearch * strsrch,UErrorCode * status)2788 UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
2789     if (U_FAILURE(*status)) { return FALSE; }
2790     if (strsrch->textProcessedIter == NULL) {
2791         strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
2792         if (strsrch->textProcessedIter == NULL) {
2793             *status = U_MEMORY_ALLOCATION_ERROR;
2794             return FALSE;
2795         }
2796     } else {
2797         strsrch->textProcessedIter->init(strsrch->textIter);
2798     }
2799     return TRUE;
2800 }
2801 
2802 }
2803 
2804 // set and get methods --------------------------------------------------
2805 
usearch_setOffset(UStringSearch * strsrch,int32_t position,UErrorCode * status)2806 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2807                                         int32_t    position,
2808                                         UErrorCode    *status)
2809 {
2810     if (U_SUCCESS(*status) && strsrch) {
2811         if (isOutOfBounds(strsrch->search->textLength, position)) {
2812             *status = U_INDEX_OUTOFBOUNDS_ERROR;
2813         }
2814         else {
2815             setColEIterOffset(strsrch->textIter, position);
2816         }
2817         strsrch->search->matchedIndex  = USEARCH_DONE;
2818         strsrch->search->matchedLength = 0;
2819         strsrch->search->reset         = FALSE;
2820     }
2821 }
2822 
usearch_getOffset(const UStringSearch * strsrch)2823 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2824 {
2825     if (strsrch) {
2826         int32_t result = ucol_getOffset(strsrch->textIter);
2827         if (isOutOfBounds(strsrch->search->textLength, result)) {
2828             return USEARCH_DONE;
2829         }
2830         return result;
2831     }
2832     return USEARCH_DONE;
2833 }
2834 
usearch_setAttribute(UStringSearch * strsrch,USearchAttribute attribute,USearchAttributeValue value,UErrorCode * status)2835 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2836                                  USearchAttribute attribute,
2837                                  USearchAttributeValue value,
2838                                  UErrorCode *status)
2839 {
2840     if (U_SUCCESS(*status) && strsrch) {
2841         switch (attribute)
2842         {
2843         case USEARCH_OVERLAP :
2844             strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2845             break;
2846         case USEARCH_CANONICAL_MATCH :
2847             strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
2848                                                                       FALSE);
2849             break;
2850         case USEARCH_ELEMENT_COMPARISON :
2851             if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2852                 strsrch->search->elementComparisonType = (int16_t)value;
2853             } else {
2854                 strsrch->search->elementComparisonType = 0;
2855             }
2856             break;
2857         case USEARCH_ATTRIBUTE_COUNT :
2858         default:
2859             *status = U_ILLEGAL_ARGUMENT_ERROR;
2860         }
2861     }
2862     if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2863         *status = U_ILLEGAL_ARGUMENT_ERROR;
2864     }
2865 }
2866 
usearch_getAttribute(const UStringSearch * strsrch,USearchAttribute attribute)2867 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2868                                                 const UStringSearch *strsrch,
2869                                                 USearchAttribute attribute)
2870 {
2871     if (strsrch) {
2872         switch (attribute) {
2873         case USEARCH_OVERLAP :
2874             return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
2875                                                         USEARCH_OFF);
2876         case USEARCH_CANONICAL_MATCH :
2877             return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
2878                                                                USEARCH_OFF);
2879         case USEARCH_ELEMENT_COMPARISON :
2880             {
2881                 int16_t value = strsrch->search->elementComparisonType;
2882                 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2883                     return (USearchAttributeValue)value;
2884                 } else {
2885                     return USEARCH_STANDARD_ELEMENT_COMPARISON;
2886                 }
2887             }
2888         case USEARCH_ATTRIBUTE_COUNT :
2889             return USEARCH_DEFAULT;
2890         }
2891     }
2892     return USEARCH_DEFAULT;
2893 }
2894 
usearch_getMatchedStart(const UStringSearch * strsrch)2895 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2896                                                 const UStringSearch *strsrch)
2897 {
2898     if (strsrch == NULL) {
2899         return USEARCH_DONE;
2900     }
2901     return strsrch->search->matchedIndex;
2902 }
2903 
2904 
usearch_getMatchedText(const UStringSearch * strsrch,UChar * result,int32_t resultCapacity,UErrorCode * status)2905 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2906                                             UChar         *result,
2907                                             int32_t        resultCapacity,
2908                                             UErrorCode    *status)
2909 {
2910     if (U_FAILURE(*status)) {
2911         return USEARCH_DONE;
2912     }
2913     if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
2914         result == NULL)) {
2915         *status = U_ILLEGAL_ARGUMENT_ERROR;
2916         return USEARCH_DONE;
2917     }
2918 
2919     int32_t     copylength = strsrch->search->matchedLength;
2920     int32_t copyindex  = strsrch->search->matchedIndex;
2921     if (copyindex == USEARCH_DONE) {
2922         u_terminateUChars(result, resultCapacity, 0, status);
2923         return USEARCH_DONE;
2924     }
2925 
2926     if (resultCapacity < copylength) {
2927         copylength = resultCapacity;
2928     }
2929     if (copylength > 0) {
2930         uprv_memcpy(result, strsrch->search->text + copyindex,
2931                     copylength * sizeof(UChar));
2932     }
2933     return u_terminateUChars(result, resultCapacity,
2934                              strsrch->search->matchedLength, status);
2935 }
2936 
usearch_getMatchedLength(const UStringSearch * strsrch)2937 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2938                                               const UStringSearch *strsrch)
2939 {
2940     if (strsrch) {
2941         return strsrch->search->matchedLength;
2942     }
2943     return USEARCH_DONE;
2944 }
2945 
2946 #if !UCONFIG_NO_BREAK_ITERATION
2947 
usearch_setBreakIterator(UStringSearch * strsrch,UBreakIterator * breakiter,UErrorCode * status)2948 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,
2949                                                UBreakIterator *breakiter,
2950                                                UErrorCode     *status)
2951 {
2952     if (U_SUCCESS(*status) && strsrch) {
2953         strsrch->search->breakIter = breakiter;
2954         if (breakiter) {
2955             ubrk_setText(breakiter, strsrch->search->text,
2956                          strsrch->search->textLength, status);
2957         }
2958     }
2959 }
2960 
2961 U_CAPI const UBreakIterator* U_EXPORT2
usearch_getBreakIterator(const UStringSearch * strsrch)2962 usearch_getBreakIterator(const UStringSearch *strsrch)
2963 {
2964     if (strsrch) {
2965         return strsrch->search->breakIter;
2966     }
2967     return NULL;
2968 }
2969 
2970 #endif
2971 
usearch_setText(UStringSearch * strsrch,const UChar * text,int32_t textlength,UErrorCode * status)2972 U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,
2973                                       const UChar         *text,
2974                                             int32_t        textlength,
2975                                             UErrorCode    *status)
2976 {
2977     if (U_SUCCESS(*status)) {
2978         if (strsrch == NULL || text == NULL || textlength < -1 ||
2979             textlength == 0) {
2980             *status = U_ILLEGAL_ARGUMENT_ERROR;
2981         }
2982         else {
2983             if (textlength == -1) {
2984                 textlength = u_strlen(text);
2985             }
2986             strsrch->search->text       = text;
2987             strsrch->search->textLength = textlength;
2988             ucol_setText(strsrch->textIter, text, textlength, status);
2989             strsrch->search->matchedIndex  = USEARCH_DONE;
2990             strsrch->search->matchedLength = 0;
2991             strsrch->search->reset         = TRUE;
2992 #if !UCONFIG_NO_BREAK_ITERATION
2993             if (strsrch->search->breakIter != NULL) {
2994                 ubrk_setText(strsrch->search->breakIter, text,
2995                              textlength, status);
2996             }
2997             ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
2998 #endif
2999         }
3000     }
3001 }
3002 
usearch_getText(const UStringSearch * strsrch,int32_t * length)3003 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
3004                                                      int32_t       *length)
3005 {
3006     if (strsrch) {
3007         *length = strsrch->search->textLength;
3008         return strsrch->search->text;
3009     }
3010     return NULL;
3011 }
3012 
usearch_setCollator(UStringSearch * strsrch,const UCollator * collator,UErrorCode * status)3013 U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,
3014                                           const UCollator     *collator,
3015                                                 UErrorCode    *status)
3016 {
3017     if (U_SUCCESS(*status)) {
3018         if (collator == NULL) {
3019             *status = U_ILLEGAL_ARGUMENT_ERROR;
3020             return;
3021         }
3022 
3023         if (strsrch) {
3024             delete strsrch->textProcessedIter;
3025             strsrch->textProcessedIter = NULL;
3026             ucol_closeElements(strsrch->textIter);
3027             ucol_closeElements(strsrch->utilIter);
3028             strsrch->textIter = strsrch->utilIter = NULL;
3029             if (strsrch->ownCollator && (strsrch->collator != collator)) {
3030                 ucol_close((UCollator *)strsrch->collator);
3031                 strsrch->ownCollator = FALSE;
3032             }
3033             strsrch->collator    = collator;
3034             strsrch->strength    = ucol_getStrength(collator);
3035             strsrch->ceMask      = getMask(strsrch->strength);
3036 #if !UCONFIG_NO_BREAK_ITERATION
3037             ubrk_close(strsrch->search->internalBreakIter);
3038             strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
3039                                                      strsrch->search->text, strsrch->search->textLength, status);
3040 #endif
3041             // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3042             strsrch->toShift     =
3043                ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
3044                                                                 UCOL_SHIFTED;
3045             // if status is a failure, ucol_getVariableTop returns 0
3046             strsrch->variableTop = ucol_getVariableTop(collator, status);
3047             strsrch->textIter = ucol_openElements(collator,
3048                                       strsrch->search->text,
3049                                       strsrch->search->textLength,
3050                                       status);
3051             strsrch->utilIter = ucol_openElements(
3052                     collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
3053             // initialize() _after_ setting the iterators for the new collator.
3054             initialize(strsrch, status);
3055         }
3056 
3057         // **** are these calls needed?
3058         // **** we call uprv_init_pce in initializePatternPCETable
3059         // **** and the CEIBuffer constructor...
3060 #if 0
3061         uprv_init_pce(strsrch->textIter);
3062         uprv_init_pce(strsrch->utilIter);
3063 #endif
3064     }
3065 }
3066 
usearch_getCollator(const UStringSearch * strsrch)3067 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
3068 {
3069     if (strsrch) {
3070         return (UCollator *)strsrch->collator;
3071     }
3072     return NULL;
3073 }
3074 
usearch_setPattern(UStringSearch * strsrch,const UChar * pattern,int32_t patternlength,UErrorCode * status)3075 U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,
3076                                          const UChar         *pattern,
3077                                                int32_t        patternlength,
3078                                                UErrorCode    *status)
3079 {
3080     if (U_SUCCESS(*status)) {
3081         if (strsrch == NULL || pattern == NULL) {
3082             *status = U_ILLEGAL_ARGUMENT_ERROR;
3083         }
3084         else {
3085             if (patternlength == -1) {
3086                 patternlength = u_strlen(pattern);
3087             }
3088             if (patternlength == 0) {
3089                 *status = U_ILLEGAL_ARGUMENT_ERROR;
3090                 return;
3091             }
3092             strsrch->pattern.text       = pattern;
3093             strsrch->pattern.textLength = patternlength;
3094             initialize(strsrch, status);
3095         }
3096     }
3097 }
3098 
3099 U_CAPI const UChar* U_EXPORT2
usearch_getPattern(const UStringSearch * strsrch,int32_t * length)3100 usearch_getPattern(const UStringSearch *strsrch,
3101                    int32_t       *length)
3102 {
3103     if (strsrch) {
3104         *length = strsrch->pattern.textLength;
3105         return strsrch->pattern.text;
3106     }
3107     return NULL;
3108 }
3109 
3110 // miscellanous methods --------------------------------------------------
3111 
usearch_first(UStringSearch * strsrch,UErrorCode * status)3112 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
3113                                            UErrorCode    *status)
3114 {
3115     if (strsrch && U_SUCCESS(*status)) {
3116         strsrch->search->isForwardSearching = TRUE;
3117         usearch_setOffset(strsrch, 0, status);
3118         if (U_SUCCESS(*status)) {
3119             return usearch_next(strsrch, status);
3120         }
3121     }
3122     return USEARCH_DONE;
3123 }
3124 
usearch_following(UStringSearch * strsrch,int32_t position,UErrorCode * status)3125 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
3126                                                int32_t    position,
3127                                                UErrorCode    *status)
3128 {
3129     if (strsrch && U_SUCCESS(*status)) {
3130         strsrch->search->isForwardSearching = TRUE;
3131         // position checked in usearch_setOffset
3132         usearch_setOffset(strsrch, position, status);
3133         if (U_SUCCESS(*status)) {
3134             return usearch_next(strsrch, status);
3135         }
3136     }
3137     return USEARCH_DONE;
3138 }
3139 
usearch_last(UStringSearch * strsrch,UErrorCode * status)3140 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
3141                                           UErrorCode    *status)
3142 {
3143     if (strsrch && U_SUCCESS(*status)) {
3144         strsrch->search->isForwardSearching = FALSE;
3145         usearch_setOffset(strsrch, strsrch->search->textLength, status);
3146         if (U_SUCCESS(*status)) {
3147             return usearch_previous(strsrch, status);
3148         }
3149     }
3150     return USEARCH_DONE;
3151 }
3152 
usearch_preceding(UStringSearch * strsrch,int32_t position,UErrorCode * status)3153 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
3154                                                int32_t    position,
3155                                                UErrorCode    *status)
3156 {
3157     if (strsrch && U_SUCCESS(*status)) {
3158         strsrch->search->isForwardSearching = FALSE;
3159         // position checked in usearch_setOffset
3160         usearch_setOffset(strsrch, position, status);
3161         if (U_SUCCESS(*status)) {
3162             return usearch_previous(strsrch, status);
3163         }
3164     }
3165     return USEARCH_DONE;
3166 }
3167 
3168 /**
3169 * If a direction switch is required, we'll count the number of ces till the
3170 * beginning of the collation element iterator and iterate forwards that
3171 * number of times. This is so that we get to the correct point within the
3172 * string to continue the search in. Imagine when we are in the middle of the
3173 * normalization buffer when the change in direction is request. arrrgghh....
3174 * After searching the offset within the collation element iterator will be
3175 * shifted to the start of the match. If a match is not found, the offset would
3176 * have been set to the end of the text string in the collation element
3177 * iterator.
3178 * Okay, here's my take on normalization buffer. The only time when there can
3179 * be 2 matches within the same normalization is when the pattern is consists
3180 * of all accents. But since the offset returned is from the text string, we
3181 * should not confuse the caller by returning the second match within the
3182 * same normalization buffer. If we do, the 2 results will have the same match
3183 * offsets, and that'll be confusing. I'll return the next match that doesn't
3184 * fall within the same normalization buffer. Note this does not affect the
3185 * results of matches spanning the text and the normalization buffer.
3186 * The position to start searching is taken from the collation element
3187 * iterator. Callers of this API would have to set the offset in the collation
3188 * element iterator before using this method.
3189 */
usearch_next(UStringSearch * strsrch,UErrorCode * status)3190 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
3191                                           UErrorCode    *status)
3192 {
3193     if (U_SUCCESS(*status) && strsrch) {
3194         // note offset is either equivalent to the start of the previous match
3195         // or is set by the user
3196         int32_t      offset       = usearch_getOffset(strsrch);
3197         USearch     *search       = strsrch->search;
3198         search->reset             = FALSE;
3199         int32_t      textlength   = search->textLength;
3200         if (search->isForwardSearching) {
3201 #if BOYER_MOORE
3202             if (offset == textlength
3203                 || (!search->isOverlap &&
3204                     (offset + strsrch->pattern.defaultShiftSize > textlength ||
3205                     (search->matchedIndex != USEARCH_DONE &&
3206                      offset + search->matchedLength >= textlength)))) {
3207                 // not enough characters to match
3208                 setMatchNotFound(strsrch);
3209                 return USEARCH_DONE;
3210             }
3211 #else
3212             if (offset == textlength ||
3213                 (! search->isOverlap &&
3214                 (search->matchedIndex != USEARCH_DONE &&
3215                 offset + search->matchedLength > textlength))) {
3216                     // not enough characters to match
3217                     setMatchNotFound(strsrch);
3218                     return USEARCH_DONE;
3219             }
3220 #endif
3221         }
3222         else {
3223             // switching direction.
3224             // if matchedIndex == USEARCH_DONE, it means that either a
3225             // setOffset has been called or that previous ran off the text
3226             // string. the iterator would have been set to offset 0 if a
3227             // match is not found.
3228             search->isForwardSearching = TRUE;
3229             if (search->matchedIndex != USEARCH_DONE) {
3230                 // there's no need to set the collation element iterator
3231                 // the next call to next will set the offset.
3232                 return search->matchedIndex;
3233             }
3234         }
3235 
3236         if (U_SUCCESS(*status)) {
3237             if (strsrch->pattern.cesLength == 0) {
3238                 if (search->matchedIndex == USEARCH_DONE) {
3239                     search->matchedIndex = offset;
3240                 }
3241                 else { // moves by codepoints
3242                     U16_FWD_1(search->text, search->matchedIndex, textlength);
3243                 }
3244 
3245                 search->matchedLength = 0;
3246                 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3247                 // status checked below
3248                 if (search->matchedIndex == textlength) {
3249                     search->matchedIndex = USEARCH_DONE;
3250                 }
3251             }
3252             else {
3253                 if (search->matchedLength > 0) {
3254                     // if matchlength is 0 we are at the start of the iteration
3255                     if (search->isOverlap) {
3256                         ucol_setOffset(strsrch->textIter, offset + 1, status);
3257                     }
3258                     else {
3259                         ucol_setOffset(strsrch->textIter,
3260                                        offset + search->matchedLength, status);
3261                     }
3262                 }
3263                 else {
3264                     // for boundary check purposes. this will ensure that the
3265                     // next match will not preceed the current offset
3266                     // note search->matchedIndex will always be set to something
3267                     // in the code
3268                     search->matchedIndex = offset - 1;
3269                 }
3270 
3271                 if (search->isCanonicalMatch) {
3272                     // can't use exact here since extra accents are allowed.
3273                     usearch_handleNextCanonical(strsrch, status);
3274                 }
3275                 else {
3276                     usearch_handleNextExact(strsrch, status);
3277                 }
3278             }
3279 
3280             if (U_FAILURE(*status)) {
3281                 return USEARCH_DONE;
3282             }
3283 
3284 #if !BOYER_MOORE
3285             if (search->matchedIndex == USEARCH_DONE) {
3286                 ucol_setOffset(strsrch->textIter, search->textLength, status);
3287             } else {
3288                 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
3289             }
3290 #endif
3291 
3292             return search->matchedIndex;
3293         }
3294     }
3295     return USEARCH_DONE;
3296 }
3297 
usearch_previous(UStringSearch * strsrch,UErrorCode * status)3298 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3299                                               UErrorCode *status)
3300 {
3301     if (U_SUCCESS(*status) && strsrch) {
3302         int32_t offset;
3303         USearch *search = strsrch->search;
3304         if (search->reset) {
3305             offset                     = search->textLength;
3306             search->isForwardSearching = FALSE;
3307             search->reset              = FALSE;
3308             setColEIterOffset(strsrch->textIter, offset);
3309         }
3310         else {
3311             offset = usearch_getOffset(strsrch);
3312         }
3313 
3314         int32_t matchedindex = search->matchedIndex;
3315         if (search->isForwardSearching == TRUE) {
3316             // switching direction.
3317             // if matchedIndex == USEARCH_DONE, it means that either a
3318             // setOffset has been called or that next ran off the text
3319             // string. the iterator would have been set to offset textLength if
3320             // a match is not found.
3321             search->isForwardSearching = FALSE;
3322             if (matchedindex != USEARCH_DONE) {
3323                 return matchedindex;
3324             }
3325         }
3326         else {
3327 #if BOYER_MOORE
3328             if (offset == 0 || matchedindex == 0 ||
3329                 (!search->isOverlap &&
3330                     (offset < strsrch->pattern.defaultShiftSize ||
3331                     (matchedindex != USEARCH_DONE &&
3332                     matchedindex < strsrch->pattern.defaultShiftSize)))) {
3333                 // not enough characters to match
3334                 setMatchNotFound(strsrch);
3335                 return USEARCH_DONE;
3336             }
3337 #else
3338             // Could check pattern length, but the
3339             // linear search will do the right thing
3340             if (offset == 0 || matchedindex == 0) {
3341                 setMatchNotFound(strsrch);
3342                 return USEARCH_DONE;
3343             }
3344 #endif
3345         }
3346 
3347         if (U_SUCCESS(*status)) {
3348             if (strsrch->pattern.cesLength == 0) {
3349                 search->matchedIndex =
3350                       (matchedindex == USEARCH_DONE ? offset : matchedindex);
3351                 if (search->matchedIndex == 0) {
3352                     setMatchNotFound(strsrch);
3353                     // status checked below
3354                 }
3355                 else { // move by codepoints
3356                     U16_BACK_1(search->text, 0, search->matchedIndex);
3357                     setColEIterOffset(strsrch->textIter, search->matchedIndex);
3358                     // status checked below
3359                     search->matchedLength = 0;
3360                 }
3361             }
3362             else {
3363                 if (strsrch->search->isCanonicalMatch) {
3364                     // can't use exact here since extra accents are allowed.
3365                     usearch_handlePreviousCanonical(strsrch, status);
3366                     // status checked below
3367                 }
3368                 else {
3369                     usearch_handlePreviousExact(strsrch, status);
3370                     // status checked below
3371                 }
3372             }
3373 
3374             if (U_FAILURE(*status)) {
3375                 return USEARCH_DONE;
3376             }
3377 
3378             return search->matchedIndex;
3379         }
3380     }
3381     return USEARCH_DONE;
3382 }
3383 
3384 
3385 
usearch_reset(UStringSearch * strsrch)3386 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3387 {
3388     /*
3389     reset is setting the attributes that are already in
3390     string search, hence all attributes in the collator should
3391     be retrieved without any problems
3392     */
3393     if (strsrch) {
3394         UErrorCode status            = U_ZERO_ERROR;
3395         UBool      sameCollAttribute = TRUE;
3396         uint32_t   ceMask;
3397         UBool      shift;
3398         uint32_t   varTop;
3399 
3400         // **** hack to deal w/ how processed CEs encode quaternary ****
3401         UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
3402         if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
3403             (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
3404                 sameCollAttribute = FALSE;
3405         }
3406 
3407         strsrch->strength    = ucol_getStrength(strsrch->collator);
3408         ceMask = getMask(strsrch->strength);
3409         if (strsrch->ceMask != ceMask) {
3410             strsrch->ceMask = ceMask;
3411             sameCollAttribute = FALSE;
3412         }
3413 
3414         // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3415         shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3416                                   &status) == UCOL_SHIFTED;
3417         if (strsrch->toShift != shift) {
3418             strsrch->toShift  = shift;
3419             sameCollAttribute = FALSE;
3420         }
3421 
3422         // if status is a failure, ucol_getVariableTop returns 0
3423         varTop = ucol_getVariableTop(strsrch->collator, &status);
3424         if (strsrch->variableTop != varTop) {
3425             strsrch->variableTop = varTop;
3426             sameCollAttribute    = FALSE;
3427         }
3428         if (!sameCollAttribute) {
3429             initialize(strsrch, &status);
3430         }
3431         ucol_setText(strsrch->textIter, strsrch->search->text,
3432                               strsrch->search->textLength,
3433                               &status);
3434         strsrch->search->matchedLength      = 0;
3435         strsrch->search->matchedIndex       = USEARCH_DONE;
3436         strsrch->search->isOverlap          = FALSE;
3437         strsrch->search->isCanonicalMatch   = FALSE;
3438         strsrch->search->elementComparisonType = 0;
3439         strsrch->search->isForwardSearching = TRUE;
3440         strsrch->search->reset              = TRUE;
3441     }
3442 }
3443 
3444 //
3445 //  CEI  Collation Element + source text index.
3446 //       These structs are kept in the circular buffer.
3447 //
3448 struct  CEI {
3449     int64_t ce;
3450     int32_t lowIndex;
3451     int32_t highIndex;
3452 };
3453 
3454 U_NAMESPACE_BEGIN
3455 
3456 namespace {
3457 //
3458 //  CEIBuffer   A circular buffer of CEs-with-index from the text being searched.
3459 //
3460 #define   DEFAULT_CEBUFFER_SIZE 96
3461 #define   CEBUFFER_EXTRA 32
3462 // Some typical max values to make buffer size more reasonable for asymmetric search.
3463 // #8694 is for a better long-term solution to allocation of this buffer.
3464 #define   MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3465 #define   MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3466 #define   MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
3467 struct CEIBuffer {
3468     CEI                  defBuf[DEFAULT_CEBUFFER_SIZE];
3469     CEI                 *buf;
3470     int32_t              bufSize;
3471     int32_t              firstIx;
3472     int32_t              limitIx;
3473     UCollationElements  *ceIter;
3474     UStringSearch       *strSearch;
3475 
3476 
3477 
3478                CEIBuffer(UStringSearch *ss, UErrorCode *status);
3479                ~CEIBuffer();
3480    const CEI   *get(int32_t index);
3481    const CEI   *getPrevious(int32_t index);
3482 };
3483 
3484 
CEIBuffer(UStringSearch * ss,UErrorCode * status)3485 CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) {
3486     buf = defBuf;
3487     strSearch = ss;
3488     bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA;
3489     if (ss->search->elementComparisonType != 0) {
3490         const UChar * patText = ss->pattern.text;
3491         if (patText) {
3492             const UChar * patTextLimit = patText + ss->pattern.textLength;
3493             while ( patText < patTextLimit ) {
3494                 UChar c = *patText++;
3495                 if (MIGHT_BE_JAMO_L(c)) {
3496                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
3497                 } else {
3498                     // No check for surrogates, we might allocate slightly more buffer than necessary.
3499                     bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
3500                 }
3501             }
3502         }
3503     }
3504     ceIter    = ss->textIter;
3505     firstIx = 0;
3506     limitIx = 0;
3507 
3508     if (!initTextProcessedIter(ss, status)) { return; }
3509 
3510     if (bufSize>DEFAULT_CEBUFFER_SIZE) {
3511         buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
3512         if (buf == NULL) {
3513             *status = U_MEMORY_ALLOCATION_ERROR;
3514         }
3515     }
3516 }
3517 
3518 // TODO: add a reset or init function so that allocated
3519 //       buffers can be retained & reused.
3520 
~CEIBuffer()3521 CEIBuffer::~CEIBuffer() {
3522     if (buf != defBuf) {
3523         uprv_free(buf);
3524     }
3525 }
3526 
3527 
3528 // Get the CE with the specified index.
3529 //   Index must be in the range
3530 //          n-history_size < index < n+1
3531 //   where n is the largest index to have been fetched by some previous call to this function.
3532 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3533 //
get(int32_t index)3534 const CEI *CEIBuffer::get(int32_t index) {
3535     int i = index % bufSize;
3536 
3537     if (index>=firstIx && index<limitIx) {
3538         // The request was for an entry already in our buffer.
3539         //  Just return it.
3540         return &buf[i];
3541     }
3542 
3543     // Caller is requesting a new, never accessed before, CE.
3544     //   Verify that it is the next one in sequence, which is all
3545     //   that is allowed.
3546     if (index != limitIx) {
3547         U_ASSERT(FALSE);
3548 
3549         return NULL;
3550     }
3551 
3552     // Manage the circular CE buffer indexing
3553     limitIx++;
3554 
3555     if (limitIx - firstIx >= bufSize) {
3556         // The buffer is full, knock out the lowest-indexed entry.
3557         firstIx++;
3558     }
3559 
3560     UErrorCode status = U_ZERO_ERROR;
3561 
3562     buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3563 
3564     return &buf[i];
3565 }
3566 
3567 // Get the CE with the specified index.
3568 //   Index must be in the range
3569 //          n-history_size < index < n+1
3570 //   where n is the largest index to have been fetched by some previous call to this function.
3571 //   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3572 //
getPrevious(int32_t index)3573 const CEI *CEIBuffer::getPrevious(int32_t index) {
3574     int i = index % bufSize;
3575 
3576     if (index>=firstIx && index<limitIx) {
3577         // The request was for an entry already in our buffer.
3578         //  Just return it.
3579         return &buf[i];
3580     }
3581 
3582     // Caller is requesting a new, never accessed before, CE.
3583     //   Verify that it is the next one in sequence, which is all
3584     //   that is allowed.
3585     if (index != limitIx) {
3586         U_ASSERT(FALSE);
3587 
3588         return NULL;
3589     }
3590 
3591     // Manage the circular CE buffer indexing
3592     limitIx++;
3593 
3594     if (limitIx - firstIx >= bufSize) {
3595         // The buffer is full, knock out the lowest-indexed entry.
3596         firstIx++;
3597     }
3598 
3599     UErrorCode status = U_ZERO_ERROR;
3600 
3601     buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3602 
3603     return &buf[i];
3604 }
3605 
3606 }
3607 
3608 U_NAMESPACE_END
3609 
3610 
3611 // #define USEARCH_DEBUG
3612 
3613 #ifdef USEARCH_DEBUG
3614 #include <stdio.h>
3615 #include <stdlib.h>
3616 #endif
3617 
3618 /*
3619  * Find the next break boundary after startIndex. If the UStringSearch object
3620  * has an external break iterator, use that. Otherwise use the internal character
3621  * break iterator.
3622  */
nextBoundaryAfter(UStringSearch * strsrch,int32_t startIndex)3623 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
3624 #if 0
3625     const UChar *text = strsrch->search->text;
3626     int32_t textLen   = strsrch->search->textLength;
3627 
3628     U_ASSERT(startIndex>=0);
3629     U_ASSERT(startIndex<=textLen);
3630 
3631     if (startIndex >= textLen) {
3632         return startIndex;
3633     }
3634 
3635     UChar32  c;
3636     int32_t  i = startIndex;
3637     U16_NEXT(text, i, textLen, c);
3638 
3639     // If we are on a control character, stop without looking for combining marks.
3640     //    Control characters do not combine.
3641     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3642     if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
3643         return i;
3644     }
3645 
3646     // The initial character was not a control, and can thus accept trailing
3647     //   combining characters.  Advance over however many of them there are.
3648     int32_t  indexOfLastCharChecked;
3649     for (;;) {
3650         indexOfLastCharChecked = i;
3651         if (i>=textLen) {
3652             break;
3653         }
3654         U16_NEXT(text, i, textLen, c);
3655         gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3656         if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3657             break;
3658         }
3659     }
3660     return indexOfLastCharChecked;
3661 #elif !UCONFIG_NO_BREAK_ITERATION
3662     UBreakIterator *breakiterator = strsrch->search->breakIter;
3663 
3664     if (breakiterator == NULL) {
3665         breakiterator = strsrch->search->internalBreakIter;
3666     }
3667 
3668     if (breakiterator != NULL) {
3669         return ubrk_following(breakiterator, startIndex);
3670     }
3671 
3672     return startIndex;
3673 #else
3674     // **** or should we use the original code? ****
3675     return startIndex;
3676 #endif
3677 
3678 }
3679 
3680 /*
3681  * Returns TRUE if index is on a break boundary. If the UStringSearch
3682  * has an external break iterator, test using that, otherwise test
3683  * using the internal character break iterator.
3684  */
isBreakBoundary(UStringSearch * strsrch,int32_t index)3685 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
3686 #if 0
3687     const UChar *text = strsrch->search->text;
3688     int32_t textLen   = strsrch->search->textLength;
3689 
3690     U_ASSERT(index>=0);
3691     U_ASSERT(index<=textLen);
3692 
3693     if (index>=textLen || index<=0) {
3694         return TRUE;
3695     }
3696 
3697     // If the character at the current index is not a GRAPHEME_EXTEND
3698     //    then we can not be within a combining sequence.
3699     UChar32  c;
3700     U16_GET(text, 0, index, textLen, c);
3701     int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3702     if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3703         return TRUE;
3704     }
3705 
3706     // We are at a combining mark.  If the preceding character is anything
3707     //   except a CONTROL, CR or LF, we are in a combining sequence.
3708     U16_PREV(text, 0, index, c);
3709     gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3710     UBool combining =  !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
3711     return !combining;
3712 #elif !UCONFIG_NO_BREAK_ITERATION
3713     UBreakIterator *breakiterator = strsrch->search->breakIter;
3714 
3715     if (breakiterator == NULL) {
3716         breakiterator = strsrch->search->internalBreakIter;
3717     }
3718 
3719     return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
3720 #else
3721     // **** or use the original code? ****
3722     return TRUE;
3723 #endif
3724 }
3725 
3726 #if 0
3727 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
3728 {
3729 #if !UCONFIG_NO_BREAK_ITERATION
3730     UBreakIterator *breakiterator = strsrch->search->breakIter;
3731 
3732     if (breakiterator != NULL) {
3733         int32_t startindex = ubrk_first(breakiterator);
3734         int32_t endindex   = ubrk_last(breakiterator);
3735 
3736         // out-of-range indexes are never boundary positions
3737         if (start < startindex || start > endindex ||
3738             end < startindex || end > endindex) {
3739             return FALSE;
3740         }
3741 
3742         return ubrk_isBoundary(breakiterator, start) &&
3743                ubrk_isBoundary(breakiterator, end);
3744     }
3745 #endif
3746 
3747     return TRUE;
3748 }
3749 #endif
3750 
3751 typedef enum {
3752     U_CE_MATCH = -1,
3753     U_CE_NO_MATCH = 0,
3754     U_CE_SKIP_TARG,
3755     U_CE_SKIP_PATN
3756 } UCompareCEsResult;
3757 #define U_CE_LEVEL2_BASE 0x00000005
3758 #define U_CE_LEVEL3_BASE 0x00050000
3759 
compareCE64s(int64_t targCE,int64_t patCE,int16_t compareType)3760 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
3761     if (targCE == patCE) {
3762         return U_CE_MATCH;
3763     }
3764     if (compareType == 0) {
3765         return U_CE_NO_MATCH;
3766     }
3767 
3768     int64_t targCEshifted = targCE >> 32;
3769     int64_t patCEshifted = patCE >> 32;
3770     int64_t mask;
3771 
3772     mask = 0xFFFF0000;
3773     int32_t targLev1 = (int32_t)(targCEshifted & mask);
3774     int32_t patLev1 = (int32_t)(patCEshifted & mask);
3775     if ( targLev1 != patLev1 ) {
3776         if ( targLev1 == 0 ) {
3777             return U_CE_SKIP_TARG;
3778         }
3779         if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3780             return U_CE_SKIP_PATN;
3781         }
3782         return U_CE_NO_MATCH;
3783     }
3784 
3785     mask = 0x0000FFFF;
3786     int32_t targLev2 = (int32_t)(targCEshifted & mask);
3787     int32_t patLev2 = (int32_t)(patCEshifted & mask);
3788     if ( targLev2 != patLev2 ) {
3789         if ( targLev2 == 0 ) {
3790             return U_CE_SKIP_TARG;
3791         }
3792         if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3793             return U_CE_SKIP_PATN;
3794         }
3795         return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
3796             U_CE_MATCH: U_CE_NO_MATCH;
3797     }
3798 
3799     mask = 0xFFFF0000;
3800     int32_t targLev3 = (int32_t)(targCE & mask);
3801     int32_t patLev3 = (int32_t)(patCE & mask);
3802     if ( targLev3 != patLev3 ) {
3803         return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
3804             U_CE_MATCH: U_CE_NO_MATCH;
3805    }
3806 
3807     return U_CE_MATCH;
3808 }
3809 
3810 #if BOYER_MOORE
3811 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3812 #endif
3813 
3814 namespace {
3815 
codePointAt(const USearch & search,int32_t index)3816 UChar32 codePointAt(const USearch &search, int32_t index) {
3817     if (index < search.textLength) {
3818         UChar32 c;
3819         U16_NEXT(search.text, index, search.textLength, c);
3820         return c;
3821     }
3822     return U_SENTINEL;
3823 }
3824 
codePointBefore(const USearch & search,int32_t index)3825 UChar32 codePointBefore(const USearch &search, int32_t index) {
3826     if (0 < index) {
3827         UChar32 c;
3828         U16_PREV(search.text, 0, index, c);
3829         return c;
3830     }
3831     return U_SENTINEL;
3832 }
3833 
3834 }  // namespace
3835 
usearch_search(UStringSearch * strsrch,int32_t startIdx,int32_t * matchStart,int32_t * matchLimit,UErrorCode * status)3836 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
3837                                        int32_t        startIdx,
3838                                        int32_t        *matchStart,
3839                                        int32_t        *matchLimit,
3840                                        UErrorCode     *status)
3841 {
3842     if (U_FAILURE(*status)) {
3843         return FALSE;
3844     }
3845 
3846     // TODO:  reject search patterns beginning with a combining char.
3847 
3848 #ifdef USEARCH_DEBUG
3849     if (getenv("USEARCH_DEBUG") != NULL) {
3850         printf("Pattern CEs\n");
3851         for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
3852             printf(" %8x", strsrch->pattern.ces[ii]);
3853         }
3854         printf("\n");
3855     }
3856 
3857 #endif
3858     // Input parameter sanity check.
3859     //  TODO:  should input indicies clip to the text length
3860     //         in the same way that UText does.
3861     if(strsrch->pattern.cesLength == 0         ||
3862        startIdx < 0                           ||
3863        startIdx > strsrch->search->textLength ||
3864        strsrch->pattern.ces == NULL) {
3865            *status = U_ILLEGAL_ARGUMENT_ERROR;
3866            return FALSE;
3867     }
3868 
3869     if (strsrch->pattern.pces == NULL) {
3870         initializePatternPCETable(strsrch, status);
3871     }
3872 
3873     ucol_setOffset(strsrch->textIter, startIdx, status);
3874     CEIBuffer ceb(strsrch, status);
3875 
3876 
3877     int32_t    targetIx = 0;
3878     const CEI *targetCEI = NULL;
3879     int32_t    patIx;
3880     UBool      found;
3881 
3882     int32_t  mStart = -1;
3883     int32_t  mLimit = -1;
3884     int32_t  minLimit;
3885     int32_t  maxLimit;
3886 
3887 
3888 
3889     // Outer loop moves over match starting positions in the
3890     //      target CE space.
3891     // Here we see the target as a sequence of collation elements, resulting from the following:
3892     // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3893     //    (for example, digraphs such as IJ may be broken into two characters).
3894     // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3895     //    16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3896     //    fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3897     //    CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3898     //    then the CE is deleted, so the following code sees only CEs that are relevant.
3899     // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3900     // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3901     // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3902     //
3903     for(targetIx=0; ; targetIx++)
3904     {
3905         found = TRUE;
3906         //  Inner loop checks for a match beginning at each
3907         //  position from the outer loop.
3908         int32_t targetIxOffset = 0;
3909         int64_t patCE = 0;
3910         // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3911         // (compared to the last CE fetched for the previous targetIx value) as we need to go
3912         // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3913         const CEI *firstCEI = ceb.get(targetIx);
3914         if (firstCEI == NULL) {
3915             *status = U_INTERNAL_PROGRAM_ERROR;
3916             found = FALSE;
3917             break;
3918         }
3919 
3920         for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) {
3921             patCE = strsrch->pattern.pces[patIx];
3922             targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
3923             //  Compare CE from target string with CE from the pattern.
3924             //    Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
3925             //    which will fail the compare, below.
3926             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
3927             if ( ceMatch == U_CE_NO_MATCH ) {
3928                 found = FALSE;
3929                 break;
3930             } else if ( ceMatch > U_CE_NO_MATCH ) {
3931                 if ( ceMatch == U_CE_SKIP_TARG ) {
3932                     // redo with same patCE, next targCE
3933                     patIx--;
3934                     targetIxOffset++;
3935                 } else { // ceMatch == U_CE_SKIP_PATN
3936                     // redo with same targCE, next patCE
3937                     targetIxOffset--;
3938                 }
3939             }
3940         }
3941         targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far
3942 
3943         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
3944             // No match at this targetIx.  Try again at the next.
3945             continue;
3946         }
3947 
3948         if (!found) {
3949             // No match at all, we have run off the end of the target text.
3950             break;
3951         }
3952 
3953 
3954         // We have found a match in CE space.
3955         // Now determine the bounds in string index space.
3956         //  There still is a chance of match failure if the CE range not correspond to
3957         //     an acceptable character range.
3958         //
3959         const CEI *lastCEI  = ceb.get(targetIx + targetIxOffset - 1);
3960 
3961         mStart   = firstCEI->lowIndex;
3962         minLimit = lastCEI->lowIndex;
3963 
3964         // Look at the CE following the match.  If it is UCOL_NULLORDER the match
3965         //   extended to the end of input, and the match is good.
3966 
3967         // Look at the high and low indices of the CE following the match. If
3968         // they are the same it means one of two things:
3969         //    1. The match extended to the last CE from the target text, which is OK, or
3970         //    2. The last CE that was part of the match is in an expansion that extends
3971         //       to the first CE after the match. In this case, we reject the match.
3972         const CEI *nextCEI = 0;
3973         if (strsrch->search->elementComparisonType == 0) {
3974             nextCEI  = ceb.get(targetIx + targetIxOffset);
3975             maxLimit = nextCEI->lowIndex;
3976             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
3977                 found = FALSE;
3978             }
3979         } else {
3980             for ( ; ; ++targetIxOffset ) {
3981                 nextCEI = ceb.get(targetIx + targetIxOffset);
3982                 maxLimit = nextCEI->lowIndex;
3983                 // If we are at the end of the target too, match succeeds
3984                 if (  nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
3985                     break;
3986                 }
3987                 // As long as the next CE has primary weight of 0,
3988                 // it is part of the last target element matched by the pattern;
3989                 // make sure it can be part of a match with the last patCE
3990                 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
3991                     UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
3992                     if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
3993                         found = FALSE;
3994                         break;
3995                     }
3996                 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
3997                 // target element, but it has non-zero primary weight => match fails
3998                 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
3999                     found = false;
4000                     break;
4001                 // Else the target CE is not part of an expansion of the last matched element, match succeeds
4002                 } else {
4003                     break;
4004                 }
4005             }
4006         }
4007 
4008 
4009         // Check for the start of the match being within a combining sequence.
4010         //   This can happen if the pattern itself begins with a combining char, and
4011         //   the match found combining marks in the target text that were attached
4012         //    to something else.
4013         //   This type of match should be rejected for not completely consuming a
4014         //   combining sequence.
4015         if (!isBreakBoundary(strsrch, mStart)) {
4016             found = FALSE;
4017         }
4018 
4019         // Check for the start of the match being within an Collation Element Expansion,
4020         //   meaning that the first char of the match is only partially matched.
4021         //   With exapnsions, the first CE will report the index of the source
4022         //   character, and all subsequent (expansions) CEs will report the source index of the
4023         //    _following_ character.
4024         int32_t secondIx = firstCEI->highIndex;
4025         if (mStart == secondIx) {
4026             found = FALSE;
4027         }
4028 
4029         // Allow matches to end in the middle of a grapheme cluster if the following
4030         // conditions are met; this is needed to make prefix search work properly in
4031         // Indic, see #11750
4032         // * the default breakIter is being used
4033         // * the next collation element after this combining sequence
4034         //   - has non-zero primary weight
4035         //   - corresponds to a separate character following the one at end of the current match
4036         //   (the second of these conditions, and perhaps both, may be redundant given the
4037         //   subsequent check for normalization boundary; however they are likely much faster
4038         //   tests in any case)
4039         // * the match limit is a normalization boundary
4040         UBool allowMidclusterMatch = FALSE;
4041         if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4042             allowMidclusterMatch =
4043                     strsrch->search->breakIter == NULL &&
4044                     nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4045                     maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4046                     (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
4047                         strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4048         }
4049         // If those conditions are met, then:
4050         // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4051         //   the match limit may be backed off to a previous break boundary. This handles
4052         //   cases in which mLimit includes target characters that are ignorable with current
4053         //   settings (such as space) and which extend beyond the pattern match.
4054         // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4055         // * do NOT require that match limit be on a breakIter boundary
4056 
4057         //  Advance the match end position to the first acceptable match boundary.
4058         //    This advances the index over any combining charcters.
4059         mLimit = maxLimit;
4060         if (minLimit < maxLimit) {
4061             // When the last CE's low index is same with its high index, the CE is likely
4062             // a part of expansion. In this case, the index is located just after the
4063             // character corresponding to the CEs compared above. If the index is right
4064             // at the break boundary, move the position to the next boundary will result
4065             // incorrect match length when there are ignorable characters exist between
4066             // the position and the next character produces CE(s). See ticket#8482.
4067             if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
4068                 mLimit = minLimit;
4069             } else {
4070                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4071                 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4072                 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4073                 // (i.e. we back off mLimit to the previous breakIterator boundary).
4074                 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4075                     mLimit = nba;
4076                 }
4077             }
4078         }
4079 
4080     #ifdef USEARCH_DEBUG
4081         if (getenv("USEARCH_DEBUG") != NULL) {
4082             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4083         }
4084     #endif
4085 
4086         if (!allowMidclusterMatch) {
4087             // If advancing to the end of a combining sequence in character indexing space
4088             //   advanced us beyond the end of the match in CE space, reject this match.
4089             if (mLimit > maxLimit) {
4090                 found = FALSE;
4091             }
4092 
4093             if (!isBreakBoundary(strsrch, mLimit)) {
4094                 found = FALSE;
4095             }
4096         }
4097 
4098         if (! checkIdentical(strsrch, mStart, mLimit)) {
4099             found = FALSE;
4100         }
4101 
4102         if (found) {
4103             break;
4104         }
4105     }
4106 
4107     #ifdef USEARCH_DEBUG
4108     if (getenv("USEARCH_DEBUG") != NULL) {
4109         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4110         int32_t  lastToPrint = ceb.limitIx+2;
4111         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4112             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4113         }
4114         printf("\n%s\n", found? "match found" : "no match");
4115     }
4116     #endif
4117 
4118     // All Done.  Store back the match bounds to the caller.
4119     //
4120     if (found==FALSE) {
4121         mLimit = -1;
4122         mStart = -1;
4123     }
4124 
4125     if (matchStart != NULL) {
4126         *matchStart= mStart;
4127     }
4128 
4129     if (matchLimit != NULL) {
4130         *matchLimit = mLimit;
4131     }
4132 
4133     return found;
4134 }
4135 
usearch_searchBackwards(UStringSearch * strsrch,int32_t startIdx,int32_t * matchStart,int32_t * matchLimit,UErrorCode * status)4136 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
4137                                                 int32_t        startIdx,
4138                                                 int32_t        *matchStart,
4139                                                 int32_t        *matchLimit,
4140                                                 UErrorCode     *status)
4141 {
4142     if (U_FAILURE(*status)) {
4143         return FALSE;
4144     }
4145 
4146     // TODO:  reject search patterns beginning with a combining char.
4147 
4148 #ifdef USEARCH_DEBUG
4149     if (getenv("USEARCH_DEBUG") != NULL) {
4150         printf("Pattern CEs\n");
4151         for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
4152             printf(" %8x", strsrch->pattern.ces[ii]);
4153         }
4154         printf("\n");
4155     }
4156 
4157 #endif
4158     // Input parameter sanity check.
4159     //  TODO:  should input indicies clip to the text length
4160     //         in the same way that UText does.
4161     if(strsrch->pattern.cesLength == 0         ||
4162        startIdx < 0                           ||
4163        startIdx > strsrch->search->textLength ||
4164        strsrch->pattern.ces == NULL) {
4165            *status = U_ILLEGAL_ARGUMENT_ERROR;
4166            return FALSE;
4167     }
4168 
4169     if (strsrch->pattern.pces == NULL) {
4170         initializePatternPCETable(strsrch, status);
4171     }
4172 
4173     CEIBuffer ceb(strsrch, status);
4174     int32_t    targetIx = 0;
4175 
4176     /*
4177      * Pre-load the buffer with the CE's for the grapheme
4178      * after our starting position so that we're sure that
4179      * we can look at the CE following the match when we
4180      * check the match boundaries.
4181      *
4182      * This will also pre-fetch the first CE that we'll
4183      * consider for the match.
4184      */
4185     if (startIdx < strsrch->search->textLength) {
4186         UBreakIterator *bi = strsrch->search->internalBreakIter;
4187         int32_t next = ubrk_following(bi, startIdx);
4188 
4189         ucol_setOffset(strsrch->textIter, next, status);
4190 
4191         for (targetIx = 0; ; targetIx += 1) {
4192             if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
4193                 break;
4194             }
4195         }
4196     } else {
4197         ucol_setOffset(strsrch->textIter, startIdx, status);
4198     }
4199 
4200 
4201     const CEI *targetCEI = NULL;
4202     int32_t    patIx;
4203     UBool      found;
4204 
4205     int32_t  limitIx = targetIx;
4206     int32_t  mStart = -1;
4207     int32_t  mLimit = -1;
4208     int32_t  minLimit;
4209     int32_t  maxLimit;
4210 
4211 
4212 
4213     // Outer loop moves over match starting positions in the
4214     //      target CE space.
4215     // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4216     // But  patIx is 0 at the beginning of the pattern and increases toward the end.
4217     // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4218     // and the beginning of the base text.
4219     for(targetIx = limitIx; ; targetIx += 1)
4220     {
4221         found = TRUE;
4222         // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4223         // (compared to the last CE fetched for the previous targetIx value) as we need to go
4224         // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4225         const CEI *lastCEI  = ceb.getPrevious(targetIx);
4226         if (lastCEI == NULL) {
4227             *status = U_INTERNAL_PROGRAM_ERROR;
4228             found = FALSE;
4229              break;
4230         }
4231         //  Inner loop checks for a match beginning at each
4232         //  position from the outer loop.
4233         int32_t targetIxOffset = 0;
4234         for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) {
4235             int64_t patCE = strsrch->pattern.pces[patIx];
4236 
4237             targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset);
4238             //  Compare CE from target string with CE from the pattern.
4239             //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4240             //    which will fail the compare, below.
4241             UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
4242             if ( ceMatch == U_CE_NO_MATCH ) {
4243                 found = FALSE;
4244                 break;
4245             } else if ( ceMatch > U_CE_NO_MATCH ) {
4246                 if ( ceMatch == U_CE_SKIP_TARG ) {
4247                     // redo with same patCE, next targCE
4248                     patIx++;
4249                     targetIxOffset++;
4250                 } else { // ceMatch == U_CE_SKIP_PATN
4251                     // redo with same targCE, next patCE
4252                     targetIxOffset--;
4253                 }
4254             }
4255         }
4256 
4257         if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
4258             // No match at this targetIx.  Try again at the next.
4259             continue;
4260         }
4261 
4262         if (!found) {
4263             // No match at all, we have run off the end of the target text.
4264             break;
4265         }
4266 
4267 
4268         // We have found a match in CE space.
4269         // Now determine the bounds in string index space.
4270         //  There still is a chance of match failure if the CE range not correspond to
4271         //     an acceptable character range.
4272         //
4273         const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset);
4274         mStart   = firstCEI->lowIndex;
4275 
4276         // Check for the start of the match being within a combining sequence.
4277         //   This can happen if the pattern itself begins with a combining char, and
4278         //   the match found combining marks in the target text that were attached
4279         //    to something else.
4280         //   This type of match should be rejected for not completely consuming a
4281         //   combining sequence.
4282         if (!isBreakBoundary(strsrch, mStart)) {
4283             found = FALSE;
4284         }
4285 
4286         // Look at the high index of the first CE in the match. If it's the same as the
4287         // low index, the first CE in the match is in the middle of an expansion.
4288         if (mStart == firstCEI->highIndex) {
4289             found = FALSE;
4290         }
4291 
4292 
4293         minLimit = lastCEI->lowIndex;
4294 
4295         if (targetIx > 0) {
4296             // Look at the CE following the match.  If it is UCOL_NULLORDER the match
4297             //   extended to the end of input, and the match is good.
4298 
4299             // Look at the high and low indices of the CE following the match. If
4300             // they are the same it means one of two things:
4301             //    1. The match extended to the last CE from the target text, which is OK, or
4302             //    2. The last CE that was part of the match is in an expansion that extends
4303             //       to the first CE after the match. In this case, we reject the match.
4304             const CEI *nextCEI  = ceb.getPrevious(targetIx - 1);
4305 
4306             if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
4307                 found = FALSE;
4308             }
4309 
4310             mLimit = maxLimit = nextCEI->lowIndex;
4311 
4312             // Allow matches to end in the middle of a grapheme cluster if the following
4313             // conditions are met; this is needed to make prefix search work properly in
4314             // Indic, see #11750
4315             // * the default breakIter is being used
4316             // * the next collation element after this combining sequence
4317             //   - has non-zero primary weight
4318             //   - corresponds to a separate character following the one at end of the current match
4319             //   (the second of these conditions, and perhaps both, may be redundant given the
4320             //   subsequent check for normalization boundary; however they are likely much faster
4321             //   tests in any case)
4322             // * the match limit is a normalization boundary
4323             UBool allowMidclusterMatch = FALSE;
4324             if (strsrch->search->text != NULL && strsrch->search->textLength > maxLimit) {
4325                 allowMidclusterMatch =
4326                         strsrch->search->breakIter == NULL &&
4327                         nextCEI != NULL && (((nextCEI->ce) >> 32) & 0xFFFF0000UL) != 0 &&
4328                         maxLimit >= lastCEI->highIndex && nextCEI->highIndex > maxLimit &&
4329                         (strsrch->nfd->hasBoundaryBefore(codePointAt(*strsrch->search, maxLimit)) ||
4330                             strsrch->nfd->hasBoundaryAfter(codePointBefore(*strsrch->search, maxLimit)));
4331             }
4332             // If those conditions are met, then:
4333             // * do NOT advance the candidate match limit (mLimit) to a break boundary; however
4334             //   the match limit may be backed off to a previous break boundary. This handles
4335             //   cases in which mLimit includes target characters that are ignorable with current
4336             //   settings (such as space) and which extend beyond the pattern match.
4337             // * do NOT require that end of the combining sequence not extend beyond the match in CE space
4338             // * do NOT require that match limit be on a breakIter boundary
4339 
4340             //  Advance the match end position to the first acceptable match boundary.
4341             //    This advances the index over any combining characters.
4342             if (minLimit < maxLimit) {
4343                 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4344                 // Note that we can have nba < maxLimit && nba >= minLImit, in which
4345                 // case we want to set mLimit to nba regardless of allowMidclusterMatch
4346                 // (i.e. we back off mLimit to the previous breakIterator boundary).
4347                 if (nba >= lastCEI->highIndex && (!allowMidclusterMatch || nba < maxLimit)) {
4348                     mLimit = nba;
4349                 }
4350             }
4351 
4352             if (!allowMidclusterMatch) {
4353                 // If advancing to the end of a combining sequence in character indexing space
4354                 //   advanced us beyond the end of the match in CE space, reject this match.
4355                 if (mLimit > maxLimit) {
4356                     found = FALSE;
4357                 }
4358 
4359                 // Make sure the end of the match is on a break boundary
4360                 if (!isBreakBoundary(strsrch, mLimit)) {
4361                     found = FALSE;
4362                 }
4363             }
4364 
4365         } else {
4366             // No non-ignorable CEs after this point.
4367             // The maximum position is detected by boundary after
4368             // the last non-ignorable CE. Combining sequence
4369             // across the start index will be truncated.
4370             int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4371             mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
4372         }
4373 
4374     #ifdef USEARCH_DEBUG
4375         if (getenv("USEARCH_DEBUG") != NULL) {
4376             printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4377         }
4378     #endif
4379 
4380 
4381         if (! checkIdentical(strsrch, mStart, mLimit)) {
4382             found = FALSE;
4383         }
4384 
4385         if (found) {
4386             break;
4387         }
4388     }
4389 
4390     #ifdef USEARCH_DEBUG
4391     if (getenv("USEARCH_DEBUG") != NULL) {
4392         printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4393         int32_t  lastToPrint = ceb.limitIx+2;
4394         for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4395             printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4396         }
4397         printf("\n%s\n", found? "match found" : "no match");
4398     }
4399     #endif
4400 
4401     // All Done.  Store back the match bounds to the caller.
4402     //
4403     if (found==FALSE) {
4404         mLimit = -1;
4405         mStart = -1;
4406     }
4407 
4408     if (matchStart != NULL) {
4409         *matchStart= mStart;
4410     }
4411 
4412     if (matchLimit != NULL) {
4413         *matchLimit = mLimit;
4414     }
4415 
4416     return found;
4417 }
4418 
4419 // internal use methods declared in usrchimp.h -----------------------------
4420 
usearch_handleNextExact(UStringSearch * strsrch,UErrorCode * status)4421 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
4422 {
4423     if (U_FAILURE(*status)) {
4424         setMatchNotFound(strsrch);
4425         return FALSE;
4426     }
4427 
4428 #if BOYER_MOORE
4429     UCollationElements *coleiter        = strsrch->textIter;
4430     int32_t             textlength      = strsrch->search->textLength;
4431     int32_t            *patternce       = strsrch->pattern.ces;
4432     int32_t             patterncelength = strsrch->pattern.cesLength;
4433     int32_t             textoffset      = ucol_getOffset(coleiter);
4434 
4435     // status used in setting coleiter offset, since offset is checked in
4436     // shiftForward before setting the coleiter offset, status never
4437     // a failure
4438     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4439                               patterncelength);
4440     while (textoffset <= textlength)
4441     {
4442         uint32_t    patternceindex = patterncelength - 1;
4443         int32_t     targetce;
4444         UBool       found          = FALSE;
4445         int32_t    lastce          = UCOL_NULLORDER;
4446 
4447         setColEIterOffset(coleiter, textoffset);
4448 
4449         for (;;) {
4450             // finding the last pattern ce match, imagine composite characters
4451             // for example: search for pattern A in text \u00C0
4452             // we'll have to skip \u0300 the grave first before we get to A
4453             targetce = ucol_previous(coleiter, status);
4454             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4455                 found = FALSE;
4456                 break;
4457             }
4458             targetce = getCE(strsrch, targetce);
4459             if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
4460                 // this is for the text \u0315\u0300 that requires
4461                 // normalization and pattern \u0300, where \u0315 is ignorable
4462                 continue;
4463             }
4464             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4465                 lastce = targetce;
4466             }
4467             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4468             if (targetce == patternce[patternceindex]) {
4469                 // the first ce can be a contraction
4470                 found = TRUE;
4471                 break;
4472             }
4473             if (!hasExpansion(coleiter)) {
4474                 found = FALSE;
4475                 break;
4476             }
4477         }
4478 
4479         //targetce = lastce;
4480 
4481         while (found && patternceindex > 0) {
4482             lastce = targetce;
4483             targetce    = ucol_previous(coleiter, status);
4484             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4485                 found = FALSE;
4486                 break;
4487             }
4488             targetce    = getCE(strsrch, targetce);
4489             if (targetce == UCOL_IGNORABLE) {
4490                 continue;
4491             }
4492 
4493             patternceindex --;
4494             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4495             found = found && targetce == patternce[patternceindex];
4496         }
4497 
4498         targetce = lastce;
4499 
4500         if (!found) {
4501             if (U_FAILURE(*status)) {
4502                 break;
4503             }
4504             textoffset = shiftForward(strsrch, textoffset, lastce,
4505                                       patternceindex);
4506             // status checked at loop.
4507             patternceindex = patterncelength;
4508             continue;
4509         }
4510 
4511         if (checkNextExactMatch(strsrch, &textoffset, status)) {
4512             // status checked in ucol_setOffset
4513             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4514             return TRUE;
4515         }
4516     }
4517     setMatchNotFound(strsrch);
4518     return FALSE;
4519 #else
4520     int32_t textOffset = ucol_getOffset(strsrch->textIter);
4521     int32_t start = -1;
4522     int32_t end = -1;
4523 
4524     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4525         strsrch->search->matchedIndex  = start;
4526         strsrch->search->matchedLength = end - start;
4527         return TRUE;
4528     } else {
4529         setMatchNotFound(strsrch);
4530         return FALSE;
4531     }
4532 #endif
4533 }
4534 
usearch_handleNextCanonical(UStringSearch * strsrch,UErrorCode * status)4535 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
4536 {
4537     if (U_FAILURE(*status)) {
4538         setMatchNotFound(strsrch);
4539         return FALSE;
4540     }
4541 
4542 #if BOYER_MOORE
4543     UCollationElements *coleiter        = strsrch->textIter;
4544     int32_t             textlength      = strsrch->search->textLength;
4545     int32_t            *patternce       = strsrch->pattern.ces;
4546     int32_t             patterncelength = strsrch->pattern.cesLength;
4547     int32_t             textoffset      = ucol_getOffset(coleiter);
4548     UBool               hasPatternAccents =
4549        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4550 
4551     textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4552                               patterncelength);
4553     strsrch->canonicalPrefixAccents[0] = 0;
4554     strsrch->canonicalSuffixAccents[0] = 0;
4555 
4556     while (textoffset <= textlength)
4557     {
4558         int32_t     patternceindex = patterncelength - 1;
4559         int32_t     targetce;
4560         UBool       found          = FALSE;
4561         int32_t     lastce         = UCOL_NULLORDER;
4562 
4563         setColEIterOffset(coleiter, textoffset);
4564 
4565         for (;;) {
4566             // finding the last pattern ce match, imagine composite characters
4567             // for example: search for pattern A in text \u00C0
4568             // we'll have to skip \u0300 the grave first before we get to A
4569             targetce = ucol_previous(coleiter, status);
4570             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4571                 found = FALSE;
4572                 break;
4573             }
4574             targetce = getCE(strsrch, targetce);
4575             if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4576                 lastce = targetce;
4577             }
4578             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4579             if (targetce == patternce[patternceindex]) {
4580                 // the first ce can be a contraction
4581                 found = TRUE;
4582                 break;
4583             }
4584             if (!hasExpansion(coleiter)) {
4585                 found = FALSE;
4586                 break;
4587             }
4588         }
4589 
4590         while (found && patternceindex > 0) {
4591             targetce    = ucol_previous(coleiter, status);
4592             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4593                 found = FALSE;
4594                 break;
4595             }
4596             targetce    = getCE(strsrch, targetce);
4597             if (targetce == UCOL_IGNORABLE) {
4598                 continue;
4599             }
4600 
4601             patternceindex --;
4602             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4603             found = found && targetce == patternce[patternceindex];
4604         }
4605 
4606         // initializing the rearranged accent array
4607         if (hasPatternAccents && !found) {
4608             strsrch->canonicalPrefixAccents[0] = 0;
4609             strsrch->canonicalSuffixAccents[0] = 0;
4610             if (U_FAILURE(*status)) {
4611                 break;
4612             }
4613             found = doNextCanonicalMatch(strsrch, textoffset, status);
4614         }
4615 
4616         if (!found) {
4617             if (U_FAILURE(*status)) {
4618                 break;
4619             }
4620             textoffset = shiftForward(strsrch, textoffset, lastce,
4621                                       patternceindex);
4622             // status checked at loop
4623             patternceindex = patterncelength;
4624             continue;
4625         }
4626 
4627         if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
4628             setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4629             return TRUE;
4630         }
4631     }
4632     setMatchNotFound(strsrch);
4633     return FALSE;
4634 #else
4635     int32_t textOffset = ucol_getOffset(strsrch->textIter);
4636     int32_t start = -1;
4637     int32_t end = -1;
4638 
4639     if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4640         strsrch->search->matchedIndex  = start;
4641         strsrch->search->matchedLength = end - start;
4642         return TRUE;
4643     } else {
4644         setMatchNotFound(strsrch);
4645         return FALSE;
4646     }
4647 #endif
4648 }
4649 
usearch_handlePreviousExact(UStringSearch * strsrch,UErrorCode * status)4650 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
4651 {
4652     if (U_FAILURE(*status)) {
4653         setMatchNotFound(strsrch);
4654         return FALSE;
4655     }
4656 
4657 #if BOYER_MOORE
4658     UCollationElements *coleiter        = strsrch->textIter;
4659     int32_t            *patternce       = strsrch->pattern.ces;
4660     int32_t             patterncelength = strsrch->pattern.cesLength;
4661     int32_t             textoffset      = ucol_getOffset(coleiter);
4662 
4663     // shifting it check for setting offset
4664     // if setOffset is called previously or there was no previous match, we
4665     // leave the offset as it is.
4666     if (strsrch->search->matchedIndex != USEARCH_DONE) {
4667         textoffset = strsrch->search->matchedIndex;
4668     }
4669 
4670     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4671                               patterncelength);
4672 
4673     while (textoffset >= 0)
4674     {
4675         int32_t     patternceindex = 1;
4676         int32_t     targetce;
4677         UBool       found          = FALSE;
4678         int32_t     firstce        = UCOL_NULLORDER;
4679 
4680         // if status is a failure, ucol_setOffset does nothing
4681         setColEIterOffset(coleiter, textoffset);
4682 
4683         for (;;) {
4684             // finding the first pattern ce match, imagine composite
4685             // characters. for example: search for pattern \u0300 in text
4686             // \u00C0, we'll have to skip A first before we get to
4687             // \u0300 the grave accent
4688             targetce = ucol_next(coleiter, status);
4689             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4690                 found = FALSE;
4691                 break;
4692             }
4693             targetce = getCE(strsrch, targetce);
4694             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4695                 firstce = targetce;
4696             }
4697             if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
4698                 continue;
4699             }
4700             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4701             if (targetce == patternce[0]) {
4702                 found = TRUE;
4703                 break;
4704             }
4705             if (!hasExpansion(coleiter)) {
4706                 // checking for accents in composite character
4707                 found = FALSE;
4708                 break;
4709             }
4710         }
4711 
4712         //targetce = firstce;
4713 
4714         while (found && (patternceindex < patterncelength)) {
4715             firstce = targetce;
4716             targetce    = ucol_next(coleiter, status);
4717             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4718                 found = FALSE;
4719                 break;
4720             }
4721             targetce    = getCE(strsrch, targetce);
4722             if (targetce == UCOL_IGNORABLE) {
4723                 continue;
4724             }
4725 
4726             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4727             found = found && targetce == patternce[patternceindex];
4728             patternceindex ++;
4729         }
4730 
4731         targetce = firstce;
4732 
4733         if (!found) {
4734             if (U_FAILURE(*status)) {
4735                 break;
4736             }
4737 
4738             textoffset = reverseShift(strsrch, textoffset, targetce,
4739                                       patternceindex);
4740             patternceindex = 0;
4741             continue;
4742         }
4743 
4744         if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
4745             setColEIterOffset(coleiter, textoffset);
4746             return TRUE;
4747         }
4748     }
4749     setMatchNotFound(strsrch);
4750     return FALSE;
4751 #else
4752     int32_t textOffset;
4753 
4754     if (strsrch->search->isOverlap) {
4755         if (strsrch->search->matchedIndex != USEARCH_DONE) {
4756             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4757         } else {
4758             // move the start position at the end of possible match
4759             initializePatternPCETable(strsrch, status);
4760             if (!initTextProcessedIter(strsrch, status)) {
4761                 setMatchNotFound(strsrch);
4762                 return FALSE;
4763             }
4764             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4765                 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4766                 if (pce == UCOL_PROCESSED_NULLORDER) {
4767                     // at the end of the text
4768                     break;
4769                 }
4770             }
4771             if (U_FAILURE(*status)) {
4772                 setMatchNotFound(strsrch);
4773                 return FALSE;
4774             }
4775             textOffset = ucol_getOffset(strsrch->textIter);
4776         }
4777     } else {
4778         textOffset = ucol_getOffset(strsrch->textIter);
4779     }
4780 
4781     int32_t start = -1;
4782     int32_t end = -1;
4783 
4784     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4785         strsrch->search->matchedIndex = start;
4786         strsrch->search->matchedLength = end - start;
4787         return TRUE;
4788     } else {
4789         setMatchNotFound(strsrch);
4790         return FALSE;
4791     }
4792 #endif
4793 }
4794 
usearch_handlePreviousCanonical(UStringSearch * strsrch,UErrorCode * status)4795 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
4796                                       UErrorCode    *status)
4797 {
4798     if (U_FAILURE(*status)) {
4799         setMatchNotFound(strsrch);
4800         return FALSE;
4801     }
4802 
4803 #if BOYER_MOORE
4804     UCollationElements *coleiter        = strsrch->textIter;
4805     int32_t            *patternce       = strsrch->pattern.ces;
4806     int32_t             patterncelength = strsrch->pattern.cesLength;
4807     int32_t             textoffset      = ucol_getOffset(coleiter);
4808     UBool               hasPatternAccents =
4809        strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4810 
4811     // shifting it check for setting offset
4812     // if setOffset is called previously or there was no previous match, we
4813     // leave the offset as it is.
4814     if (strsrch->search->matchedIndex != USEARCH_DONE) {
4815         textoffset = strsrch->search->matchedIndex;
4816     }
4817 
4818     textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4819                               patterncelength);
4820     strsrch->canonicalPrefixAccents[0] = 0;
4821     strsrch->canonicalSuffixAccents[0] = 0;
4822 
4823     while (textoffset >= 0)
4824     {
4825         int32_t     patternceindex = 1;
4826         int32_t     targetce;
4827         UBool       found          = FALSE;
4828         int32_t     firstce        = UCOL_NULLORDER;
4829 
4830         setColEIterOffset(coleiter, textoffset);
4831         for (;;) {
4832             // finding the first pattern ce match, imagine composite
4833             // characters. for example: search for pattern \u0300 in text
4834             // \u00C0, we'll have to skip A first before we get to
4835             // \u0300 the grave accent
4836             targetce = ucol_next(coleiter, status);
4837             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4838                 found = FALSE;
4839                 break;
4840             }
4841             targetce = getCE(strsrch, targetce);
4842             if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4843                 firstce = targetce;
4844             }
4845 
4846             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4847             if (targetce == patternce[0]) {
4848                 // the first ce can be a contraction
4849                 found = TRUE;
4850                 break;
4851             }
4852             if (!hasExpansion(coleiter)) {
4853                 // checking for accents in composite character
4854                 found = FALSE;
4855                 break;
4856             }
4857         }
4858 
4859         targetce = firstce;
4860 
4861         while (found && patternceindex < patterncelength) {
4862             targetce    = ucol_next(coleiter, status);
4863             if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4864                 found = FALSE;
4865                 break;
4866             }
4867             targetce = getCE(strsrch, targetce);
4868             if (targetce == UCOL_IGNORABLE) {
4869                 continue;
4870             }
4871 
4872             // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4873             found = found && targetce == patternce[patternceindex];
4874             patternceindex ++;
4875         }
4876 
4877         // initializing the rearranged accent array
4878         if (hasPatternAccents && !found) {
4879             strsrch->canonicalPrefixAccents[0] = 0;
4880             strsrch->canonicalSuffixAccents[0] = 0;
4881             if (U_FAILURE(*status)) {
4882                 break;
4883             }
4884             found = doPreviousCanonicalMatch(strsrch, textoffset, status);
4885         }
4886 
4887         if (!found) {
4888             if (U_FAILURE(*status)) {
4889                 break;
4890             }
4891             textoffset = reverseShift(strsrch, textoffset, targetce,
4892                                       patternceindex);
4893             patternceindex = 0;
4894             continue;
4895         }
4896 
4897         if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
4898             setColEIterOffset(coleiter, textoffset);
4899             return TRUE;
4900         }
4901     }
4902     setMatchNotFound(strsrch);
4903     return FALSE;
4904 #else
4905     int32_t textOffset;
4906 
4907     if (strsrch->search->isOverlap) {
4908         if (strsrch->search->matchedIndex != USEARCH_DONE) {
4909             textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4910         } else {
4911             // move the start position at the end of possible match
4912             initializePatternPCETable(strsrch, status);
4913             if (!initTextProcessedIter(strsrch, status)) {
4914                 setMatchNotFound(strsrch);
4915                 return FALSE;
4916             }
4917             for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4918                 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4919                 if (pce == UCOL_PROCESSED_NULLORDER) {
4920                     // at the end of the text
4921                     break;
4922                 }
4923             }
4924             if (U_FAILURE(*status)) {
4925                 setMatchNotFound(strsrch);
4926                 return FALSE;
4927             }
4928             textOffset = ucol_getOffset(strsrch->textIter);
4929         }
4930     } else {
4931         textOffset = ucol_getOffset(strsrch->textIter);
4932     }
4933 
4934     int32_t start = -1;
4935     int32_t end = -1;
4936 
4937     if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4938         strsrch->search->matchedIndex = start;
4939         strsrch->search->matchedLength = end - start;
4940         return TRUE;
4941     } else {
4942         setMatchNotFound(strsrch);
4943         return FALSE;
4944     }
4945 #endif
4946 }
4947 
4948 #endif /* #if !UCONFIG_NO_COLLATION */
4949