1 /*
2 **********************************************************************
3 * Copyright (C) 2001-2014 IBM and others. All rights reserved.
4 **********************************************************************
5 * Date Name Description
6 * 07/02/2001 synwee Creation.
7 **********************************************************************
8 */
9
10 #include "unicode/utypes.h"
11
12 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
13
14 #include "unicode/usearch.h"
15 #include "unicode/ustring.h"
16 #include "unicode/uchar.h"
17 #include "unicode/utf16.h"
18 #include "normalizer2impl.h"
19 #include "usrchimp.h"
20 #include "cmemory.h"
21 #include "ucln_in.h"
22 #include "uassert.h"
23 #include "ustr_imp.h"
24
25 U_NAMESPACE_USE
26
27 // don't use Boyer-Moore
28 // (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
29 #define BOYER_MOORE 0
30
31 // internal definition ---------------------------------------------------
32
33 #define LAST_BYTE_MASK_ 0xFF
34 #define SECOND_LAST_BYTE_SHIFT_ 8
35 #define SUPPLEMENTARY_MIN_VALUE_ 0x10000
36
37 static const Normalizer2Impl *g_nfcImpl = NULL;
38
39 // internal methods -------------------------------------------------
40
41 /**
42 * Fast collation element iterator setOffset.
43 * This function does not check for bounds.
44 * @param coleiter collation element iterator
45 * @param offset to set
46 */
47 static
setColEIterOffset(UCollationElements * elems,int32_t offset)48 inline void setColEIterOffset(UCollationElements *elems,
49 int32_t offset)
50 {
51 // Note: Not "fast" any more after the 2013 collation rewrite.
52 // We do not want to expose more internals than necessary.
53 UErrorCode status = U_ZERO_ERROR;
54 ucol_setOffset(elems, offset, &status);
55 }
56
57 /**
58 * Getting the mask for collation strength
59 * @param strength collation strength
60 * @return collation element mask
61 */
62 static
getMask(UCollationStrength strength)63 inline uint32_t getMask(UCollationStrength strength)
64 {
65 switch (strength)
66 {
67 case UCOL_PRIMARY:
68 return UCOL_PRIMARYORDERMASK;
69 case UCOL_SECONDARY:
70 return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
71 default:
72 return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
73 UCOL_PRIMARYORDERMASK;
74 }
75 }
76
77 /**
78 * This is to squeeze the 21bit ces into a 256 table
79 * @param ce collation element
80 * @return collapsed version of the collation element
81 */
82 static
hash(uint32_t ce)83 inline int hash(uint32_t ce)
84 {
85 // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
86 // well with the new collation where most of the latin 1 characters
87 // are of the value xx000xxx. their hashes will most of the time be 0
88 // to be discussed on the hash algo.
89 return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
90 }
91
92 U_CDECL_BEGIN
93 static UBool U_CALLCONV
usearch_cleanup(void)94 usearch_cleanup(void) {
95 g_nfcImpl = NULL;
96 return TRUE;
97 }
98 U_CDECL_END
99
100 /**
101 * Initializing the fcd tables.
102 * Internal method, status assumed to be a success.
103 * @param status output error if any, caller to check status before calling
104 * method, status assumed to be success when passed in.
105 */
106 static
initializeFCD(UErrorCode * status)107 inline void initializeFCD(UErrorCode *status)
108 {
109 if (g_nfcImpl == NULL) {
110 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
111 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
112 }
113 }
114
115 /**
116 * Gets the fcd value for a character at the argument index.
117 * This method takes into accounts of the supplementary characters.
118 * @param str UTF16 string where character for fcd retrieval resides
119 * @param offset position of the character whose fcd is to be retrieved, to be
120 * overwritten with the next character position, taking
121 * surrogate characters into consideration.
122 * @param strlength length of the argument string
123 * @return fcd value
124 */
125 static
getFCD(const UChar * str,int32_t * offset,int32_t strlength)126 uint16_t getFCD(const UChar *str, int32_t *offset,
127 int32_t strlength)
128 {
129 const UChar *temp = str + *offset;
130 uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength);
131 *offset = (int32_t)(temp - str);
132 return result;
133 }
134
135 /**
136 * Getting the modified collation elements taking into account the collation
137 * attributes
138 * @param strsrch string search data
139 * @param sourcece
140 * @return the modified collation element
141 */
142 static
getCE(const UStringSearch * strsrch,uint32_t sourcece)143 inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
144 {
145 // note for tertiary we can't use the collator->tertiaryMask, that
146 // is a preprocessed mask that takes into account case options. since
147 // we are only concerned with exact matches, we don't need that.
148 sourcece &= strsrch->ceMask;
149
150 if (strsrch->toShift) {
151 // alternate handling here, since only the 16 most significant digits
152 // is only used, we can safely do a compare without masking
153 // if the ce is a variable, we mask and get only the primary values
154 // no shifting to quartenary is required since all primary values
155 // less than variabletop will need to be masked off anyway.
156 if (strsrch->variableTop > sourcece) {
157 if (strsrch->strength >= UCOL_QUATERNARY) {
158 sourcece &= UCOL_PRIMARYORDERMASK;
159 }
160 else {
161 sourcece = UCOL_IGNORABLE;
162 }
163 }
164 } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
165 sourcece = 0xFFFF;
166 }
167
168 return sourcece;
169 }
170
171 /**
172 * Allocate a memory and returns NULL if it failed.
173 * Internal method, status assumed to be a success.
174 * @param size to allocate
175 * @param status output error if any, caller to check status before calling
176 * method, status assumed to be success when passed in.
177 * @return newly allocated array, NULL otherwise
178 */
179 static
allocateMemory(uint32_t size,UErrorCode * status)180 inline void * allocateMemory(uint32_t size, UErrorCode *status)
181 {
182 uint32_t *result = (uint32_t *)uprv_malloc(size);
183 if (result == NULL) {
184 *status = U_MEMORY_ALLOCATION_ERROR;
185 }
186 return result;
187 }
188
189 /**
190 * Adds a uint32_t value to a destination array.
191 * Creates a new array if we run out of space. The caller will have to
192 * manually deallocate the newly allocated array.
193 * Internal method, status assumed to be success, caller has to check status
194 * before calling this method. destination not to be NULL and has at least
195 * size destinationlength.
196 * @param destination target array
197 * @param offset destination offset to add value
198 * @param destinationlength target array size, return value for the new size
199 * @param value to be added
200 * @param increments incremental size expected
201 * @param status output error if any, caller to check status before calling
202 * method, status assumed to be success when passed in.
203 * @return new destination array, destination if there was no new allocation
204 */
205 static
addTouint32_tArray(int32_t * destination,uint32_t offset,uint32_t * destinationlength,uint32_t value,uint32_t increments,UErrorCode * status)206 inline int32_t * addTouint32_tArray(int32_t *destination,
207 uint32_t offset,
208 uint32_t *destinationlength,
209 uint32_t value,
210 uint32_t increments,
211 UErrorCode *status)
212 {
213 uint32_t newlength = *destinationlength;
214 if (offset + 1 == newlength) {
215 newlength += increments;
216 int32_t *temp = (int32_t *)allocateMemory(
217 sizeof(int32_t) * newlength, status);
218 if (U_FAILURE(*status)) {
219 return NULL;
220 }
221 uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
222 *destinationlength = newlength;
223 destination = temp;
224 }
225 destination[offset] = value;
226 return destination;
227 }
228
229 /**
230 * Adds a uint64_t value to a destination array.
231 * Creates a new array if we run out of space. The caller will have to
232 * manually deallocate the newly allocated array.
233 * Internal method, status assumed to be success, caller has to check status
234 * before calling this method. destination not to be NULL and has at least
235 * size destinationlength.
236 * @param destination target array
237 * @param offset destination offset to add value
238 * @param destinationlength target array size, return value for the new size
239 * @param value to be added
240 * @param increments incremental size expected
241 * @param status output error if any, caller to check status before calling
242 * method, status assumed to be success when passed in.
243 * @return new destination array, destination if there was no new allocation
244 */
245 static
addTouint64_tArray(int64_t * destination,uint32_t offset,uint32_t * destinationlength,uint64_t value,uint32_t increments,UErrorCode * status)246 inline int64_t * addTouint64_tArray(int64_t *destination,
247 uint32_t offset,
248 uint32_t *destinationlength,
249 uint64_t value,
250 uint32_t increments,
251 UErrorCode *status)
252 {
253 uint32_t newlength = *destinationlength;
254 if (offset + 1 == newlength) {
255 newlength += increments;
256 int64_t *temp = (int64_t *)allocateMemory(
257 sizeof(int64_t) * newlength, status);
258
259 if (U_FAILURE(*status)) {
260 return NULL;
261 }
262
263 uprv_memcpy(temp, destination, sizeof(int64_t) * offset);
264 *destinationlength = newlength;
265 destination = temp;
266 }
267
268 destination[offset] = value;
269
270 return destination;
271 }
272
273 /**
274 * Initializing the ce table for a pattern.
275 * Stores non-ignorable collation keys.
276 * Table size will be estimated by the size of the pattern text. Table
277 * expansion will be perform as we go along. Adding 1 to ensure that the table
278 * size definitely increases.
279 * Internal method, status assumed to be a success.
280 * @param strsrch string search data
281 * @param status output error if any, caller to check status before calling
282 * method, status assumed to be success when passed in.
283 * @return total number of expansions
284 */
285 static
initializePatternCETable(UStringSearch * strsrch,UErrorCode * status)286 inline uint16_t initializePatternCETable(UStringSearch *strsrch,
287 UErrorCode *status)
288 {
289 UPattern *pattern = &(strsrch->pattern);
290 uint32_t cetablesize = INITIAL_ARRAY_SIZE_;
291 int32_t *cetable = pattern->cesBuffer;
292 uint32_t patternlength = pattern->textLength;
293 UCollationElements *coleiter = strsrch->utilIter;
294
295 if (coleiter == NULL) {
296 coleiter = ucol_openElements(strsrch->collator, pattern->text,
297 patternlength, status);
298 // status will be checked in ucol_next(..) later and if it is an
299 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
300 // returned.
301 strsrch->utilIter = coleiter;
302 }
303 else {
304 ucol_setText(coleiter, pattern->text, pattern->textLength, status);
305 }
306 if(U_FAILURE(*status)) {
307 return 0;
308 }
309
310 if (pattern->ces != cetable && pattern->ces) {
311 uprv_free(pattern->ces);
312 }
313
314 uint16_t offset = 0;
315 uint16_t result = 0;
316 int32_t ce;
317
318 while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
319 U_SUCCESS(*status)) {
320 uint32_t newce = getCE(strsrch, ce);
321 if (newce) {
322 int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
323 newce,
324 patternlength - ucol_getOffset(coleiter) + 1,
325 status);
326 if (U_FAILURE(*status)) {
327 return 0;
328 }
329 offset ++;
330 if (cetable != temp && cetable != pattern->cesBuffer) {
331 uprv_free(cetable);
332 }
333 cetable = temp;
334 }
335 result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
336 }
337
338 cetable[offset] = 0;
339 pattern->ces = cetable;
340 pattern->cesLength = offset;
341
342 return result;
343 }
344
345 /**
346 * Initializing the pce table for a pattern.
347 * Stores non-ignorable collation keys.
348 * Table size will be estimated by the size of the pattern text. Table
349 * expansion will be perform as we go along. Adding 1 to ensure that the table
350 * size definitely increases.
351 * Internal method, status assumed to be a success.
352 * @param strsrch string search data
353 * @param status output error if any, caller to check status before calling
354 * method, status assumed to be success when passed in.
355 * @return total number of expansions
356 */
357 static
initializePatternPCETable(UStringSearch * strsrch,UErrorCode * status)358 inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
359 UErrorCode *status)
360 {
361 UPattern *pattern = &(strsrch->pattern);
362 uint32_t pcetablesize = INITIAL_ARRAY_SIZE_;
363 int64_t *pcetable = pattern->pcesBuffer;
364 uint32_t patternlength = pattern->textLength;
365 UCollationElements *coleiter = strsrch->utilIter;
366
367 if (coleiter == NULL) {
368 coleiter = ucol_openElements(strsrch->collator, pattern->text,
369 patternlength, status);
370 // status will be checked in ucol_next(..) later and if it is an
371 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
372 // returned.
373 strsrch->utilIter = coleiter;
374 } else {
375 ucol_setText(coleiter, pattern->text, pattern->textLength, status);
376 }
377 if(U_FAILURE(*status)) {
378 return 0;
379 }
380
381 if (pattern->pces != pcetable && pattern->pces != NULL) {
382 uprv_free(pattern->pces);
383 }
384
385 uint16_t offset = 0;
386 uint16_t result = 0;
387 int64_t pce;
388
389 icu::UCollationPCE iter(coleiter);
390
391 // ** Should processed CEs be signed or unsigned?
392 // ** (the rest of the code in this file seems to play fast-and-loose with
393 // ** whether a CE is signed or unsigned. For example, look at routine above this one.)
394 while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
395 U_SUCCESS(*status)) {
396 int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
397 pce,
398 patternlength - ucol_getOffset(coleiter) + 1,
399 status);
400
401 if (U_FAILURE(*status)) {
402 return 0;
403 }
404
405 offset += 1;
406
407 if (pcetable != temp && pcetable != pattern->pcesBuffer) {
408 uprv_free(pcetable);
409 }
410
411 pcetable = temp;
412 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
413 }
414
415 pcetable[offset] = 0;
416 pattern->pces = pcetable;
417 pattern->pcesLength = offset;
418
419 return result;
420 }
421
422 /**
423 * Initializes the pattern struct.
424 * Internal method, status assumed to be success.
425 * @param strsrch UStringSearch data storage
426 * @param status output error if any, caller to check status before calling
427 * method, status assumed to be success when passed in.
428 * @return expansionsize the total expansion size of the pattern
429 */
430 static
initializePattern(UStringSearch * strsrch,UErrorCode * status)431 inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
432 {
433 if (U_FAILURE(*status)) { return 0; }
434 UPattern *pattern = &(strsrch->pattern);
435 const UChar *patterntext = pattern->text;
436 int32_t length = pattern->textLength;
437 int32_t index = 0;
438
439 // Since the strength is primary, accents are ignored in the pattern.
440 if (strsrch->strength == UCOL_PRIMARY) {
441 pattern->hasPrefixAccents = 0;
442 pattern->hasSuffixAccents = 0;
443 } else {
444 pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
445 SECOND_LAST_BYTE_SHIFT_;
446 index = length;
447 U16_BACK_1(patterntext, 0, index);
448 pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
449 LAST_BYTE_MASK_;
450 }
451
452 // ** HACK **
453 if (strsrch->pattern.pces != NULL) {
454 if (strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
455 uprv_free(strsrch->pattern.pces);
456 }
457
458 strsrch->pattern.pces = NULL;
459 }
460
461 // since intializePattern is an internal method status is a success.
462 return initializePatternCETable(strsrch, status);
463 }
464
465 /**
466 * Initializing shift tables, with the default values.
467 * If a corresponding default value is 0, the shift table is not set.
468 * @param shift table for forwards shift
469 * @param backshift table for backwards shift
470 * @param cetable table containing pattern ce
471 * @param cesize size of the pattern ces
472 * @param expansionsize total size of the expansions
473 * @param defaultforward the default forward value
474 * @param defaultbackward the default backward value
475 */
476 static
setShiftTable(int16_t shift[],int16_t backshift[],int32_t * cetable,int32_t cesize,int16_t expansionsize,int16_t defaultforward,int16_t defaultbackward)477 inline void setShiftTable(int16_t shift[], int16_t backshift[],
478 int32_t *cetable, int32_t cesize,
479 int16_t expansionsize,
480 int16_t defaultforward,
481 int16_t defaultbackward)
482 {
483 // estimate the value to shift. to do that we estimate the smallest
484 // number of characters to give the relevant ces, ie approximately
485 // the number of ces minus their expansion, since expansions can come
486 // from a character.
487 int32_t count;
488 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
489 shift[count] = defaultforward;
490 }
491 cesize --; // down to the last index
492 for (count = 0; count < cesize; count ++) {
493 // number of ces from right of array to the count
494 int temp = defaultforward - count - 1;
495 shift[hash(cetable[count])] = temp > 1 ? temp : 1;
496 }
497 shift[hash(cetable[cesize])] = 1;
498 // for ignorables we just shift by one. see test examples.
499 shift[hash(0)] = 1;
500
501 for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
502 backshift[count] = defaultbackward;
503 }
504 for (count = cesize; count > 0; count --) {
505 // the original value count does not seem to work
506 backshift[hash(cetable[count])] = count > expansionsize ?
507 (int16_t)(count - expansionsize) : 1;
508 }
509 backshift[hash(cetable[0])] = 1;
510 backshift[hash(0)] = 1;
511 }
512
513 /**
514 * Building of the pattern collation element list and the boyer moore strsrch
515 * table.
516 * The canonical match will only be performed after the default match fails.
517 * For both cases we need to remember the size of the composed and decomposed
518 * versions of the string. Since the Boyer-Moore shift calculations shifts by
519 * a number of characters in the text and tries to match the pattern from that
520 * offset, the shift value can not be too large in case we miss some
521 * characters. To choose a right shift size, we estimate the NFC form of the
522 * and use its size as a shift guide. The NFC form should be the small
523 * possible representation of the pattern. Anyways, we'll err on the smaller
524 * shift size. Hence the calculation for minlength.
525 * Canonical match will be performed slightly differently. We'll split the
526 * pattern into 3 parts, the prefix accents (PA), the middle string bounded by
527 * the first and last base character (MS), the ending accents (EA). Matches
528 * will be done on MS first, and only when we match MS then some processing
529 * will be required for the prefix and end accents in order to determine if
530 * they match PA and EA. Hence the default shift values
531 * for the canonical match will take the size of either end's accent into
532 * consideration. Forwards search will take the end accents into consideration
533 * for the default shift values and the backwards search will take the prefix
534 * accents into consideration.
535 * If pattern has no non-ignorable ce, we return a illegal argument error.
536 * Internal method, status assumed to be success.
537 * @param strsrch UStringSearch data storage
538 * @param status for output errors if it occurs, status is assumed to be a
539 * success when it is passed in.
540 */
541 static
initialize(UStringSearch * strsrch,UErrorCode * status)542 inline void initialize(UStringSearch *strsrch, UErrorCode *status)
543 {
544 int16_t expandlength = initializePattern(strsrch, status);
545 if (U_SUCCESS(*status) && strsrch->pattern.cesLength > 0) {
546 UPattern *pattern = &strsrch->pattern;
547 int32_t cesize = pattern->cesLength;
548
549 int16_t minlength = cesize > expandlength
550 ? (int16_t)cesize - expandlength : 1;
551 pattern->defaultShiftSize = minlength;
552 setShiftTable(pattern->shift, pattern->backShift, pattern->ces,
553 cesize, expandlength, minlength, minlength);
554 return;
555 }
556 strsrch->pattern.defaultShiftSize = 0;
557 }
558
559 #if BOYER_MOORE
560 /**
561 * Check to make sure that the match length is at the end of the character by
562 * using the breakiterator.
563 * @param strsrch string search data
564 * @param start target text start offset
565 * @param end target text end offset
566 */
567 static
checkBreakBoundary(const UStringSearch * strsrch,int32_t *,int32_t * end)568 void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
569 int32_t *end)
570 {
571 #if !UCONFIG_NO_BREAK_ITERATION
572 UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
573 if (breakiterator) {
574 int32_t matchend = *end;
575 //int32_t matchstart = *start;
576
577 if (!ubrk_isBoundary(breakiterator, matchend)) {
578 *end = ubrk_following(breakiterator, matchend);
579 }
580
581 /* Check the start of the matched text to make sure it doesn't have any accents
582 * before it. This code may not be necessary and so it is commented out */
583 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
584 *start = ubrk_preceding(breakiterator, matchstart);
585 }*/
586 }
587 #endif
588 }
589
590 /**
591 * Determine whether the target text in UStringSearch bounded by the offset
592 * start and end is one or more whole units of text as
593 * determined by the breakiterator in UStringSearch.
594 * @param strsrch string search data
595 * @param start target text start offset
596 * @param end target text end offset
597 */
598 static
isBreakUnit(const UStringSearch * strsrch,int32_t start,int32_t end)599 UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
600 int32_t end)
601 {
602 #if !UCONFIG_NO_BREAK_ITERATION
603 UBreakIterator *breakiterator = strsrch->search->breakIter;
604 //TODO: Add here.
605 if (breakiterator) {
606 int32_t startindex = ubrk_first(breakiterator);
607 int32_t endindex = ubrk_last(breakiterator);
608
609 // out-of-range indexes are never boundary positions
610 if (start < startindex || start > endindex ||
611 end < startindex || end > endindex) {
612 return FALSE;
613 }
614 // otherwise, we can use following() on the position before the
615 // specified one and return true of the position we get back is the
616 // one the user specified
617 UBool result = (start == startindex ||
618 ubrk_following(breakiterator, start - 1) == start) &&
619 (end == endindex ||
620 ubrk_following(breakiterator, end - 1) == end);
621 if (result) {
622 // iterates the individual ces
623 UCollationElements *coleiter = strsrch->utilIter;
624 const UChar *text = strsrch->search->text +
625 start;
626 UErrorCode status = U_ZERO_ERROR;
627 ucol_setText(coleiter, text, end - start, &status);
628 for (int32_t count = 0; count < strsrch->pattern.cesLength;
629 count ++) {
630 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
631 if (ce == UCOL_IGNORABLE) {
632 count --;
633 continue;
634 }
635 if (U_FAILURE(status) || ce != strsrch->pattern.ces[count]) {
636 return FALSE;
637 }
638 }
639 int32_t nextce = ucol_next(coleiter, &status);
640 while (ucol_getOffset(coleiter) == (end - start)
641 && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
642 nextce = ucol_next(coleiter, &status);
643 }
644 if (ucol_getOffset(coleiter) == (end - start)
645 && nextce != UCOL_NULLORDER) {
646 // extra collation elements at the end of the match
647 return FALSE;
648 }
649 }
650 return result;
651 }
652 #endif
653 return TRUE;
654 }
655
656 /**
657 * Getting the next base character offset if current offset is an accent,
658 * or the current offset if the current character contains a base character.
659 * accents the following base character will be returned
660 * @param text string
661 * @param textoffset current offset
662 * @param textlength length of text string
663 * @return the next base character or the current offset
664 * if the current character is contains a base character.
665 */
666 static
getNextBaseOffset(const UChar * text,int32_t textoffset,int32_t textlength)667 inline int32_t getNextBaseOffset(const UChar *text,
668 int32_t textoffset,
669 int32_t textlength)
670 {
671 if (textoffset < textlength) {
672 int32_t temp = textoffset;
673 if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
674 while (temp < textlength) {
675 int32_t result = temp;
676 if ((getFCD(text, &temp, textlength) >>
677 SECOND_LAST_BYTE_SHIFT_) == 0) {
678 return result;
679 }
680 }
681 return textlength;
682 }
683 }
684 return textoffset;
685 }
686
687 /**
688 * Gets the next base character offset depending on the string search pattern
689 * data
690 * @param strsrch string search data
691 * @param textoffset current offset, one offset away from the last character
692 * to search for.
693 * @return start index of the next base character or the current offset
694 * if the current character is contains a base character.
695 */
696 static
getNextUStringSearchBaseOffset(UStringSearch * strsrch,int32_t textoffset)697 inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
698 int32_t textoffset)
699 {
700 int32_t textlength = strsrch->search->textLength;
701 if (strsrch->pattern.hasSuffixAccents &&
702 textoffset < textlength) {
703 int32_t temp = textoffset;
704 const UChar *text = strsrch->search->text;
705 U16_BACK_1(text, 0, temp);
706 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
707 return getNextBaseOffset(text, textoffset, textlength);
708 }
709 }
710 return textoffset;
711 }
712
713 /**
714 * Shifting the collation element iterator position forward to prepare for
715 * a following match. If the last character is a unsafe character, we'll only
716 * shift by 1 to capture contractions, normalization etc.
717 * Internal method, status assumed to be success.
718 * @param text strsrch string search data
719 * @param textoffset start text position to do search
720 * @param ce the text ce which failed the match.
721 * @param patternceindex index of the ce within the pattern ce buffer which
722 * failed the match
723 * @return final offset
724 */
725 static
shiftForward(UStringSearch * strsrch,int32_t textoffset,int32_t ce,int32_t patternceindex)726 inline int32_t shiftForward(UStringSearch *strsrch,
727 int32_t textoffset,
728 int32_t ce,
729 int32_t patternceindex)
730 {
731 UPattern *pattern = &(strsrch->pattern);
732 if (ce != UCOL_NULLORDER) {
733 int32_t shift = pattern->shift[hash(ce)];
734 // this is to adjust for characters in the middle of the
735 // substring for matching that failed.
736 int32_t adjust = pattern->cesLength - patternceindex;
737 if (adjust > 1 && shift >= adjust) {
738 shift -= adjust - 1;
739 }
740 textoffset += shift;
741 }
742 else {
743 textoffset += pattern->defaultShiftSize;
744 }
745
746 textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
747 // check for unsafe characters
748 // * if it is the start or middle of a contraction: to be done after
749 // a initial match is found
750 // * thai or lao base consonant character: similar to contraction
751 // * high surrogate character: similar to contraction
752 // * next character is a accent: shift to the next base character
753 return textoffset;
754 }
755 #endif // #if BOYER_MOORE
756
757 /**
758 * sets match not found
759 * @param strsrch string search data
760 */
761 static
setMatchNotFound(UStringSearch * strsrch)762 inline void setMatchNotFound(UStringSearch *strsrch)
763 {
764 // this method resets the match result regardless of the error status.
765 strsrch->search->matchedIndex = USEARCH_DONE;
766 strsrch->search->matchedLength = 0;
767 if (strsrch->search->isForwardSearching) {
768 setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
769 }
770 else {
771 setColEIterOffset(strsrch->textIter, 0);
772 }
773 }
774
775 #if BOYER_MOORE
776 /**
777 * Gets the offset to the next safe point in text.
778 * ie. not the middle of a contraction, swappable characters or supplementary
779 * characters.
780 * @param collator collation sata
781 * @param text string to work with
782 * @param textoffset offset in string
783 * @param textlength length of text string
784 * @return offset to the next safe character
785 */
786 static
getNextSafeOffset(const UCollator * collator,const UChar * text,int32_t textoffset,int32_t textlength)787 inline int32_t getNextSafeOffset(const UCollator *collator,
788 const UChar *text,
789 int32_t textoffset,
790 int32_t textlength)
791 {
792 int32_t result = textoffset; // first contraction character
793 while (result != textlength && ucol_unsafeCP(text[result], collator)) {
794 result ++;
795 }
796 return result;
797 }
798
799 /**
800 * This checks for accents in the potential match started with a .
801 * composite character.
802 * This is really painful... we have to check that composite character do not
803 * have any extra accents. We have to normalize the potential match and find
804 * the immediate decomposed character before the match.
805 * The first composite character would have been taken care of by the fcd
806 * checks in checkForwardExactMatch.
807 * This is the slow path after the fcd of the first character and
808 * the last character has been checked by checkForwardExactMatch and we
809 * determine that the potential match has extra non-ignorable preceding
810 * ces.
811 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
812 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
813 * Note here that accents checking are slow and cautioned in the API docs.
814 * Internal method, status assumed to be a success, caller should check status
815 * before calling this method
816 * @param strsrch string search data
817 * @param start index of the potential unfriendly composite character
818 * @param end index of the potential unfriendly composite character
819 * @param status output error status if any.
820 * @return TRUE if there is non-ignorable accents before at the beginning
821 * of the match, FALSE otherwise.
822 */
823
824 static
checkExtraMatchAccents(const UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)825 UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
826 int32_t end,
827 UErrorCode *status)
828 {
829 UBool result = FALSE;
830 if (strsrch->pattern.hasPrefixAccents) {
831 int32_t length = end - start;
832 int32_t offset = 0;
833 const UChar *text = strsrch->search->text + start;
834
835 U16_FWD_1(text, offset, length);
836 // we are only concerned with the first composite character
837 if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
838 int32_t safeoffset = getNextSafeOffset(strsrch->collator,
839 text, 0, length);
840 if (safeoffset != length) {
841 safeoffset ++;
842 }
843 UChar *norm = NULL;
844 UChar buffer[INITIAL_ARRAY_SIZE_];
845 int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
846 buffer, INITIAL_ARRAY_SIZE_,
847 status);
848 if (U_FAILURE(*status)) {
849 return FALSE;
850 }
851 if (size >= INITIAL_ARRAY_SIZE_) {
852 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
853 status);
854 // if allocation failed, status will be set to
855 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
856 // checks for it.
857 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
858 size, status);
859 if (U_FAILURE(*status) && norm != NULL) {
860 uprv_free(norm);
861 return FALSE;
862 }
863 }
864 else {
865 norm = buffer;
866 }
867
868 UCollationElements *coleiter = strsrch->utilIter;
869 ucol_setText(coleiter, norm, size, status);
870 uint32_t firstce = strsrch->pattern.ces[0];
871 UBool ignorable = TRUE;
872 uint32_t ce = UCOL_IGNORABLE;
873 while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
874 offset = ucol_getOffset(coleiter);
875 if (ce != firstce && ce != UCOL_IGNORABLE) {
876 ignorable = FALSE;
877 }
878 ce = ucol_next(coleiter, status);
879 }
880 UChar32 codepoint;
881 U16_PREV(norm, 0, offset, codepoint);
882 result = !ignorable && (u_getCombiningClass(codepoint) != 0);
883
884 if (norm != buffer) {
885 uprv_free(norm);
886 }
887 }
888 }
889
890 return result;
891 }
892
893 /**
894 * Used by exact matches, checks if there are accents before the match.
895 * This is really painful... we have to check that composite characters at
896 * the start of the matches have to not have any extra accents.
897 * We check the FCD of the character first, if it starts with an accent and
898 * the first pattern ce does not match the first ce of the character, we bail.
899 * Otherwise we try normalizing the first composite
900 * character and find the immediate decomposed character before the match to
901 * see if it is an non-ignorable accent.
902 * Now normalizing the first composite character is enough because we ensure
903 * that when the match is passed in here with extra beginning ces, the
904 * first or last ce that match has to occur within the first character.
905 * E.g. looking for \u0301 acute in \u01FA A ring above and acute,
906 * checkExtraMatchAccent should fail since there is a middle ring in \u01FA
907 * Note here that accents checking are slow and cautioned in the API docs.
908 * @param strsrch string search data
909 * @param start offset
910 * @param end offset
911 * @return TRUE if there are accents on either side of the match,
912 * FALSE otherwise
913 */
914 static
hasAccentsBeforeMatch(const UStringSearch * strsrch,int32_t start,int32_t end)915 UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
916 int32_t end)
917 {
918 if (strsrch->pattern.hasPrefixAccents) {
919 UCollationElements *coleiter = strsrch->textIter;
920 UErrorCode status = U_ZERO_ERROR;
921 // we have been iterating forwards previously
922 uint32_t ignorable = TRUE;
923 int32_t firstce = strsrch->pattern.ces[0];
924
925 setColEIterOffset(coleiter, start);
926 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
927 if (U_FAILURE(status)) {
928 return TRUE;
929 }
930 while (ce != firstce) {
931 if (ce != UCOL_IGNORABLE) {
932 ignorable = FALSE;
933 }
934 ce = getCE(strsrch, ucol_next(coleiter, &status));
935 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
936 return TRUE;
937 }
938 }
939 if (!ignorable && inNormBuf(coleiter)) {
940 // within normalization buffer, discontiguous handled here
941 return TRUE;
942 }
943
944 // within text
945 int32_t temp = start;
946 // original code
947 // accent = (getFCD(strsrch->search->text, &temp,
948 // strsrch->search->textLength)
949 // >> SECOND_LAST_BYTE_SHIFT_);
950 // however this code does not work well with VC7 .net in release mode.
951 // maybe the inlines for getFCD combined with shifting has bugs in
952 // VC7. anyways this is a work around.
953 UBool accent = getFCD(strsrch->search->text, &temp,
954 strsrch->search->textLength) > 0xFF;
955 if (!accent) {
956 return checkExtraMatchAccents(strsrch, start, end, &status);
957 }
958 if (!ignorable) {
959 return TRUE;
960 }
961 if (start > 0) {
962 temp = start;
963 U16_BACK_1(strsrch->search->text, 0, temp);
964 if (getFCD(strsrch->search->text, &temp,
965 strsrch->search->textLength) & LAST_BYTE_MASK_) {
966 setColEIterOffset(coleiter, start);
967 ce = ucol_previous(coleiter, &status);
968 if (U_FAILURE(status) ||
969 (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
970 return TRUE;
971 }
972 }
973 }
974 }
975
976 return FALSE;
977 }
978
979 /**
980 * Used by exact matches, checks if there are accents bounding the match.
981 * Note this is the initial boundary check. If the potential match
982 * starts or ends with composite characters, the accents in those
983 * characters will be determined later.
984 * Not doing backwards iteration here, since discontiguos contraction for
985 * backwards collation element iterator, use up too many characters.
986 * E.g. looking for \u030A ring in \u01FA A ring above and acute,
987 * should fail since there is a acute at the end of \u01FA
988 * Note here that accents checking are slow and cautioned in the API docs.
989 * @param strsrch string search data
990 * @param start offset of match
991 * @param end end offset of the match
992 * @return TRUE if there are accents on either side of the match,
993 * FALSE otherwise
994 */
995 static
hasAccentsAfterMatch(const UStringSearch * strsrch,int32_t start,int32_t end)996 UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
997 int32_t end)
998 {
999 if (strsrch->pattern.hasSuffixAccents) {
1000 const UChar *text = strsrch->search->text;
1001 int32_t temp = end;
1002 int32_t textlength = strsrch->search->textLength;
1003 U16_BACK_1(text, 0, temp);
1004 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
1005 int32_t firstce = strsrch->pattern.ces[0];
1006 UCollationElements *coleiter = strsrch->textIter;
1007 UErrorCode status = U_ZERO_ERROR;
1008 int32_t ce;
1009 setColEIterOffset(coleiter, start);
1010 while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
1011 if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
1012 return TRUE;
1013 }
1014 }
1015 int32_t count = 1;
1016 while (count < strsrch->pattern.cesLength) {
1017 if (getCE(strsrch, ucol_next(coleiter, &status))
1018 == UCOL_IGNORABLE) {
1019 // Thai can give an ignorable here.
1020 count --;
1021 }
1022 if (U_FAILURE(status)) {
1023 return TRUE;
1024 }
1025 count ++;
1026 }
1027
1028 ce = ucol_next(coleiter, &status);
1029 if (U_FAILURE(status)) {
1030 return TRUE;
1031 }
1032 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1033 ce = getCE(strsrch, ce);
1034 }
1035 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1036 if (ucol_getOffset(coleiter) <= end) {
1037 return TRUE;
1038 }
1039 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
1040 return TRUE;
1041 }
1042 }
1043 }
1044 }
1045 return FALSE;
1046 }
1047 #endif // #if BOYER_MOORE
1048
1049 /**
1050 * Checks if the offset runs out of the text string
1051 * @param offset
1052 * @param textlength of the text string
1053 * @return TRUE if offset is out of bounds, FALSE otherwise
1054 */
1055 static
isOutOfBounds(int32_t textlength,int32_t offset)1056 inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
1057 {
1058 return offset < 0 || offset > textlength;
1059 }
1060
1061 /**
1062 * Checks for identical match
1063 * @param strsrch string search data
1064 * @param start offset of possible match
1065 * @param end offset of possible match
1066 * @return TRUE if identical match is found
1067 */
1068 static
checkIdentical(const UStringSearch * strsrch,int32_t start,int32_t end)1069 inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
1070 int32_t end)
1071 {
1072 if (strsrch->strength != UCOL_IDENTICAL) {
1073 return TRUE;
1074 }
1075
1076 // Note: We could use Normalizer::compare() or similar, but for short strings
1077 // which may not be in FCD it might be faster to just NFD them.
1078 UErrorCode status = U_ZERO_ERROR;
1079 UnicodeString t2, p2;
1080 strsrch->nfd->normalize(
1081 UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
1082 strsrch->nfd->normalize(
1083 UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
1084 // return FALSE if NFD failed
1085 return U_SUCCESS(status) && t2 == p2;
1086 }
1087
1088 #if BOYER_MOORE
1089 /**
1090 * Checks to see if the match is repeated
1091 * @param strsrch string search data
1092 * @param start new match start index
1093 * @param end new match end index
1094 * @return TRUE if the the match is repeated, FALSE otherwise
1095 */
1096 static
checkRepeatedMatch(UStringSearch * strsrch,int32_t start,int32_t end)1097 inline UBool checkRepeatedMatch(UStringSearch *strsrch,
1098 int32_t start,
1099 int32_t end)
1100 {
1101 int32_t lastmatchindex = strsrch->search->matchedIndex;
1102 UBool result;
1103 if (lastmatchindex == USEARCH_DONE) {
1104 return FALSE;
1105 }
1106 if (strsrch->search->isForwardSearching) {
1107 result = start <= lastmatchindex;
1108 }
1109 else {
1110 result = start >= lastmatchindex;
1111 }
1112 if (!result && !strsrch->search->isOverlap) {
1113 if (strsrch->search->isForwardSearching) {
1114 result = start < lastmatchindex + strsrch->search->matchedLength;
1115 }
1116 else {
1117 result = end > lastmatchindex;
1118 }
1119 }
1120 return result;
1121 }
1122
1123 /**
1124 * Gets the collation element iterator's current offset.
1125 * @param coleiter collation element iterator
1126 * @param forwards flag TRUE if we are moving in th forwards direction
1127 * @return current offset
1128 */
1129 static
getColElemIterOffset(const UCollationElements * coleiter,UBool forwards)1130 inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
1131 UBool forwards)
1132 {
1133 int32_t result = ucol_getOffset(coleiter);
1134 // intricacies of the the backwards collation element iterator
1135 if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
1136 result ++;
1137 }
1138 return result;
1139 }
1140
1141 /**
1142 * Checks match for contraction.
1143 * If the match ends with a partial contraction we fail.
1144 * If the match starts too far off (because of backwards iteration) we try to
1145 * chip off the extra characters depending on whether a breakiterator has
1146 * been used.
1147 * Internal method, error assumed to be success, caller has to check status
1148 * before calling this method.
1149 * @param strsrch string search data
1150 * @param start offset of potential match, to be modified if necessary
1151 * @param end offset of potential match, to be modified if necessary
1152 * @param status output error status if any
1153 * @return TRUE if match passes the contraction test, FALSE otherwise
1154 */
1155
1156 static
checkNextExactContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)1157 UBool checkNextExactContractionMatch(UStringSearch *strsrch,
1158 int32_t *start,
1159 int32_t *end, UErrorCode *status)
1160 {
1161 UCollationElements *coleiter = strsrch->textIter;
1162 int32_t textlength = strsrch->search->textLength;
1163 int32_t temp = *start;
1164 const UCollator *collator = strsrch->collator;
1165 const UChar *text = strsrch->search->text;
1166 // This part checks if either ends of the match contains potential
1167 // contraction. If so we'll have to iterate through them
1168 // The start contraction needs to be checked since ucol_previous dumps
1169 // all characters till the first safe character into the buffer.
1170 // *start + 1 is used to test for the unsafe characters instead of *start
1171 // because ucol_prev takes all unsafe characters till the first safe
1172 // character ie *start. so by testing *start + 1, we can estimate if
1173 // excess prefix characters has been included in the potential search
1174 // results.
1175 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1176 (*start + 1 < textlength
1177 && ucol_unsafeCP(text[*start + 1], collator))) {
1178 int32_t expansion = getExpansionPrefix(coleiter);
1179 UBool expandflag = expansion > 0;
1180 setColEIterOffset(coleiter, *start);
1181 while (expansion > 0) {
1182 // getting rid of the redundant ce, caused by setOffset.
1183 // since backward contraction/expansion may have extra ces if we
1184 // are in the normalization buffer, hasAccentsBeforeMatch would
1185 // have taken care of it.
1186 // E.g. the character \u01FA will have an expansion of 3, but if
1187 // we are only looking for acute and ring \u030A and \u0301, we'll
1188 // have to skip the first ce in the expansion buffer.
1189 ucol_next(coleiter, status);
1190 if (U_FAILURE(*status)) {
1191 return FALSE;
1192 }
1193 if (ucol_getOffset(coleiter) != temp) {
1194 *start = temp;
1195 temp = ucol_getOffset(coleiter);
1196 }
1197 expansion --;
1198 }
1199
1200 int32_t *patternce = strsrch->pattern.ces;
1201 int32_t patterncelength = strsrch->pattern.cesLength;
1202 int32_t count = 0;
1203 while (count < patterncelength) {
1204 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1205 if (ce == UCOL_IGNORABLE) {
1206 continue;
1207 }
1208 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1209 *start = temp;
1210 temp = ucol_getOffset(coleiter);
1211 }
1212 if (U_FAILURE(*status) || ce != patternce[count]) {
1213 (*end) ++;
1214 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1215 return FALSE;
1216 }
1217 count ++;
1218 }
1219 }
1220 return TRUE;
1221 }
1222
1223 /**
1224 * Checks and sets the match information if found.
1225 * Checks
1226 * <ul>
1227 * <li> the potential match does not repeat the previous match
1228 * <li> boundaries are correct
1229 * <li> exact matches has no extra accents
1230 * <li> identical matchesb
1231 * <li> potential match does not end in the middle of a contraction
1232 * <\ul>
1233 * Otherwise the offset will be shifted to the next character.
1234 * Internal method, status assumed to be success, caller has to check status
1235 * before calling this method.
1236 * @param strsrch string search data
1237 * @param textoffset offset in the collation element text. the returned value
1238 * will be the truncated end offset of the match or the new start
1239 * search offset.
1240 * @param status output error status if any
1241 * @return TRUE if the match is valid, FALSE otherwise
1242 */
1243 static
checkNextExactMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)1244 inline UBool checkNextExactMatch(UStringSearch *strsrch,
1245 int32_t *textoffset, UErrorCode *status)
1246 {
1247 UCollationElements *coleiter = strsrch->textIter;
1248 int32_t start = getColElemIterOffset(coleiter, FALSE);
1249
1250 if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1251 return FALSE;
1252 }
1253
1254 // this totally matches, however we need to check if it is repeating
1255 if (!isBreakUnit(strsrch, start, *textoffset) ||
1256 checkRepeatedMatch(strsrch, start, *textoffset) ||
1257 hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
1258 !checkIdentical(strsrch, start, *textoffset) ||
1259 hasAccentsAfterMatch(strsrch, start, *textoffset)) {
1260
1261 (*textoffset) ++;
1262 *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
1263 return FALSE;
1264 }
1265
1266 //Add breakiterator boundary check for primary strength search.
1267 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
1268 checkBreakBoundary(strsrch, &start, textoffset);
1269 }
1270
1271 // totally match, we will get rid of the ending ignorables.
1272 strsrch->search->matchedIndex = start;
1273 strsrch->search->matchedLength = *textoffset - start;
1274 return TRUE;
1275 }
1276
1277 /**
1278 * Getting the previous base character offset, or the current offset if the
1279 * current character is a base character
1280 * @param text string
1281 * @param textoffset one offset after the current character
1282 * @return the offset of the next character after the base character or the first
1283 * composed character with accents
1284 */
1285 static
getPreviousBaseOffset(const UChar * text,int32_t textoffset)1286 inline int32_t getPreviousBaseOffset(const UChar *text,
1287 int32_t textoffset)
1288 {
1289 if (textoffset > 0) {
1290 for (;;) {
1291 int32_t result = textoffset;
1292 U16_BACK_1(text, 0, textoffset);
1293 int32_t temp = textoffset;
1294 uint16_t fcd = getFCD(text, &temp, result);
1295 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1296 if (fcd & LAST_BYTE_MASK_) {
1297 return textoffset;
1298 }
1299 return result;
1300 }
1301 if (textoffset == 0) {
1302 return 0;
1303 }
1304 }
1305 }
1306 return textoffset;
1307 }
1308
1309 /**
1310 * Getting the indexes of the accents that are not blocked in the argument
1311 * accent array
1312 * @param accents array of accents in nfd terminated by a 0.
1313 * @param accentsindex array of indexes of the accents that are not blocked
1314 */
1315 static
getUnblockedAccentIndex(UChar * accents,int32_t * accentsindex)1316 inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1317 {
1318 int32_t index = 0;
1319 int32_t length = u_strlen(accents);
1320 UChar32 codepoint = 0;
1321 int cclass = 0;
1322 int result = 0;
1323 int32_t temp;
1324 while (index < length) {
1325 temp = index;
1326 U16_NEXT(accents, index, length, codepoint);
1327 if (u_getCombiningClass(codepoint) != cclass) {
1328 cclass = u_getCombiningClass(codepoint);
1329 accentsindex[result] = temp;
1330 result ++;
1331 }
1332 }
1333 accentsindex[result] = length;
1334 return result;
1335 }
1336
1337 /**
1338 * Appends 3 UChar arrays to a destination array.
1339 * Creates a new array if we run out of space. The caller will have to
1340 * manually deallocate the newly allocated array.
1341 * Internal method, status assumed to be success, caller has to check status
1342 * before calling this method. destination not to be NULL and has at least
1343 * size destinationlength.
1344 * @param destination target array
1345 * @param destinationlength target array size, returning the appended length
1346 * @param source1 null-terminated first array
1347 * @param source2 second array
1348 * @param source2length length of seond array
1349 * @param source3 null-terminated third array
1350 * @param status error status if any
1351 * @return new destination array, destination if there was no new allocation
1352 */
1353 static
addToUCharArray(UChar * destination,int32_t * destinationlength,const UChar * source1,const UChar * source2,int32_t source2length,const UChar * source3,UErrorCode * status)1354 inline UChar * addToUCharArray( UChar *destination,
1355 int32_t *destinationlength,
1356 const UChar *source1,
1357 const UChar *source2,
1358 int32_t source2length,
1359 const UChar *source3,
1360 UErrorCode *status)
1361 {
1362 int32_t source1length = source1 ? u_strlen(source1) : 0;
1363 int32_t source3length = source3 ? u_strlen(source3) : 0;
1364 if (*destinationlength < source1length + source2length + source3length +
1365 1)
1366 {
1367 destination = (UChar *)allocateMemory(
1368 (source1length + source2length + source3length + 1) * sizeof(UChar),
1369 status);
1370 // if error allocating memory, status will be
1371 // U_MEMORY_ALLOCATION_ERROR
1372 if (U_FAILURE(*status)) {
1373 *destinationlength = 0;
1374 return NULL;
1375 }
1376 }
1377 if (source1length != 0) {
1378 uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
1379 }
1380 if (source2length != 0) {
1381 uprv_memcpy(destination + source1length, source2,
1382 sizeof(UChar) * source2length);
1383 }
1384 if (source3length != 0) {
1385 uprv_memcpy(destination + source1length + source2length, source3,
1386 sizeof(UChar) * source3length);
1387 }
1388 *destinationlength = source1length + source2length + source3length;
1389 return destination;
1390 }
1391
1392 /**
1393 * Running through a collation element iterator to see if the contents matches
1394 * pattern in string search data
1395 * @param strsrch string search data
1396 * @param coleiter collation element iterator
1397 * @return TRUE if a match if found, FALSE otherwise
1398 */
1399 static
checkCollationMatch(const UStringSearch * strsrch,UCollationElements * coleiter)1400 inline UBool checkCollationMatch(const UStringSearch *strsrch,
1401 UCollationElements *coleiter)
1402 {
1403 int patternceindex = strsrch->pattern.cesLength;
1404 int32_t *patternce = strsrch->pattern.ces;
1405 UErrorCode status = U_ZERO_ERROR;
1406 while (patternceindex > 0) {
1407 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
1408 if (ce == UCOL_IGNORABLE) {
1409 continue;
1410 }
1411 if (U_FAILURE(status) || ce != *patternce) {
1412 return FALSE;
1413 }
1414 patternce ++;
1415 patternceindex --;
1416 }
1417 return TRUE;
1418 }
1419
1420 /**
1421 * Rearranges the front accents to try matching.
1422 * Prefix accents in the text will be grouped according to their combining
1423 * class and the groups will be mixed and matched to try find the perfect
1424 * match with the pattern.
1425 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1426 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1427 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1428 * "\u0301\u0325".
1429 * step 2: check if any of the generated substrings matches the pattern.
1430 * Internal method, status is assumed to be success, caller has to check status
1431 * before calling this method.
1432 * @param strsrch string search match
1433 * @param start first offset of the accents to start searching
1434 * @param end start of the last accent set
1435 * @param status output error status if any
1436 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1437 * offset of the match. Note this start includes all preceding accents.
1438 */
1439 static
doNextCanonicalPrefixMatch(UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)1440 int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1441 int32_t start,
1442 int32_t end,
1443 UErrorCode *status)
1444 {
1445 const UChar *text = strsrch->search->text;
1446 int32_t textlength = strsrch->search->textLength;
1447 int32_t tempstart = start;
1448
1449 if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1450 // die... failed at a base character
1451 return USEARCH_DONE;
1452 }
1453
1454 int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1455 start = getPreviousBaseOffset(text, tempstart);
1456
1457 UChar accents[INITIAL_ARRAY_SIZE_];
1458 // normalizing the offensive string
1459 unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1460 INITIAL_ARRAY_SIZE_, status);
1461 if (U_FAILURE(*status)) {
1462 return USEARCH_DONE;
1463 }
1464
1465 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1466 int32_t accentsize = getUnblockedAccentIndex(accents,
1467 accentsindex);
1468 int32_t count = (2 << (accentsize - 1)) - 1;
1469 UChar buffer[INITIAL_ARRAY_SIZE_];
1470 UCollationElements *coleiter = strsrch->utilIter;
1471 while (U_SUCCESS(*status) && count > 0) {
1472 UChar *rearrange = strsrch->canonicalPrefixAccents;
1473 // copy the base characters
1474 for (int k = 0; k < accentsindex[0]; k ++) {
1475 *rearrange ++ = accents[k];
1476 }
1477 // forming all possible canonical rearrangement by dropping
1478 // sets of accents
1479 for (int i = 0; i <= accentsize - 1; i ++) {
1480 int32_t mask = 1 << (accentsize - i - 1);
1481 if (count & mask) {
1482 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1483 *rearrange ++ = accents[j];
1484 }
1485 }
1486 }
1487 *rearrange = 0;
1488 int32_t matchsize = INITIAL_ARRAY_SIZE_;
1489 UChar *match = addToUCharArray(buffer, &matchsize,
1490 strsrch->canonicalPrefixAccents,
1491 strsrch->search->text + offset,
1492 end - offset,
1493 strsrch->canonicalSuffixAccents,
1494 status);
1495
1496 // if status is a failure, ucol_setText does nothing.
1497 // run the collator iterator through this match
1498 ucol_setText(coleiter, match, matchsize, status);
1499 if (U_SUCCESS(*status)) {
1500 if (checkCollationMatch(strsrch, coleiter)) {
1501 if (match != buffer) {
1502 uprv_free(match);
1503 }
1504 return start;
1505 }
1506 }
1507 count --;
1508 }
1509 return USEARCH_DONE;
1510 }
1511
1512 /**
1513 * Gets the offset to the safe point in text before textoffset.
1514 * ie. not the middle of a contraction, swappable characters or supplementary
1515 * characters.
1516 * @param collator collation sata
1517 * @param text string to work with
1518 * @param textoffset offset in string
1519 * @param textlength length of text string
1520 * @return offset to the previous safe character
1521 */
1522 static
getPreviousSafeOffset(const UCollator * collator,const UChar * text,int32_t textoffset)1523 inline uint32_t getPreviousSafeOffset(const UCollator *collator,
1524 const UChar *text,
1525 int32_t textoffset)
1526 {
1527 int32_t result = textoffset; // first contraction character
1528 while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1529 result --;
1530 }
1531 if (result != 0) {
1532 // the first contraction character is consider unsafe here
1533 result --;
1534 }
1535 return result;
1536 }
1537
1538 /**
1539 * Cleaning up after we passed the safe zone
1540 * @param strsrch string search data
1541 * @param safetext safe text array
1542 * @param safebuffer safe text buffer
1543 * @param coleiter collation element iterator for safe text
1544 */
1545 static
cleanUpSafeText(const UStringSearch * strsrch,UChar * safetext,UChar * safebuffer)1546 inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1547 UChar *safebuffer)
1548 {
1549 if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1550 {
1551 uprv_free(safetext);
1552 }
1553 }
1554
1555 /**
1556 * Take the rearranged end accents and tries matching. If match failed at
1557 * a seperate preceding set of accents (seperated from the rearranged on by
1558 * at least a base character) then we rearrange the preceding accents and
1559 * tries matching again.
1560 * We allow skipping of the ends of the accent set if the ces do not match.
1561 * However if the failure is found before the accent set, it fails.
1562 * Internal method, status assumed to be success, caller has to check status
1563 * before calling this method.
1564 * @param strsrch string search data
1565 * @param textoffset of the start of the rearranged accent
1566 * @param status output error status if any
1567 * @return USEARCH_DONE if a match is not found, otherwise return the starting
1568 * offset of the match. Note this start includes all preceding accents.
1569 */
1570 static
doNextCanonicalSuffixMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)1571 int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
1572 int32_t textoffset,
1573 UErrorCode *status)
1574 {
1575 const UChar *text = strsrch->search->text;
1576 const UCollator *collator = strsrch->collator;
1577 int32_t safelength = 0;
1578 UChar *safetext;
1579 int32_t safetextlength;
1580 UChar safebuffer[INITIAL_ARRAY_SIZE_];
1581 UCollationElements *coleiter = strsrch->utilIter;
1582 int32_t safeoffset = textoffset;
1583
1584 if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
1585 collator)) {
1586 safeoffset = getPreviousSafeOffset(collator, text, textoffset);
1587 safelength = textoffset - safeoffset;
1588 safetextlength = INITIAL_ARRAY_SIZE_;
1589 safetext = addToUCharArray(safebuffer, &safetextlength, NULL,
1590 text + safeoffset, safelength,
1591 strsrch->canonicalSuffixAccents,
1592 status);
1593 }
1594 else {
1595 safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1596 safetext = strsrch->canonicalSuffixAccents;
1597 }
1598
1599 // if status is a failure, ucol_setText does nothing
1600 ucol_setText(coleiter, safetext, safetextlength, status);
1601 // status checked in loop below
1602
1603 int32_t *ce = strsrch->pattern.ces;
1604 int32_t celength = strsrch->pattern.cesLength;
1605 int ceindex = celength - 1;
1606 UBool isSafe = TRUE; // indication flag for position in safe zone
1607
1608 while (ceindex >= 0) {
1609 int32_t textce = ucol_previous(coleiter, status);
1610 if (U_FAILURE(*status)) {
1611 if (isSafe) {
1612 cleanUpSafeText(strsrch, safetext, safebuffer);
1613 }
1614 return USEARCH_DONE;
1615 }
1616 if (textce == UCOL_NULLORDER) {
1617 // check if we have passed the safe buffer
1618 if (coleiter == strsrch->textIter) {
1619 cleanUpSafeText(strsrch, safetext, safebuffer);
1620 return USEARCH_DONE;
1621 }
1622 cleanUpSafeText(strsrch, safetext, safebuffer);
1623 safetext = safebuffer;
1624 coleiter = strsrch->textIter;
1625 setColEIterOffset(coleiter, safeoffset);
1626 // status checked at the start of the loop
1627 isSafe = FALSE;
1628 continue;
1629 }
1630 textce = getCE(strsrch, textce);
1631 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1632 // do the beginning stuff
1633 int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1634 if (isSafe && failedoffset >= safelength) {
1635 // alas... no hope. failed at rearranged accent set
1636 cleanUpSafeText(strsrch, safetext, safebuffer);
1637 return USEARCH_DONE;
1638 }
1639 else {
1640 if (isSafe) {
1641 failedoffset += safeoffset;
1642 cleanUpSafeText(strsrch, safetext, safebuffer);
1643 }
1644
1645 // try rearranging the front accents
1646 int32_t result = doNextCanonicalPrefixMatch(strsrch,
1647 failedoffset, textoffset, status);
1648 if (result != USEARCH_DONE) {
1649 // if status is a failure, ucol_setOffset does nothing
1650 setColEIterOffset(strsrch->textIter, result);
1651 }
1652 if (U_FAILURE(*status)) {
1653 return USEARCH_DONE;
1654 }
1655 return result;
1656 }
1657 }
1658 if (textce == ce[ceindex]) {
1659 ceindex --;
1660 }
1661 }
1662 // set offset here
1663 if (isSafe) {
1664 int32_t result = getColElemIterOffset(coleiter, FALSE);
1665 // sets the text iterator here with the correct expansion and offset
1666 int32_t leftoverces = getExpansionPrefix(coleiter);
1667 cleanUpSafeText(strsrch, safetext, safebuffer);
1668 if (result >= safelength) {
1669 result = textoffset;
1670 }
1671 else {
1672 result += safeoffset;
1673 }
1674 setColEIterOffset(strsrch->textIter, result);
1675 strsrch->textIter->iteratordata_.toReturn =
1676 setExpansionPrefix(strsrch->textIter, leftoverces);
1677 return result;
1678 }
1679
1680 return ucol_getOffset(coleiter);
1681 }
1682
1683 /**
1684 * Trying out the substring and sees if it can be a canonical match.
1685 * This will try normalizing the end accents and arranging them into canonical
1686 * equivalents and check their corresponding ces with the pattern ce.
1687 * Suffix accents in the text will be grouped according to their combining
1688 * class and the groups will be mixed and matched to try find the perfect
1689 * match with the pattern.
1690 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1691 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1692 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1693 * "\u0301\u0325".
1694 * step 2: check if any of the generated substrings matches the pattern.
1695 * Internal method, status assumed to be success, caller has to check status
1696 * before calling this method.
1697 * @param strsrch string search data
1698 * @param textoffset end offset in the collation element text that ends with
1699 * the accents to be rearranged
1700 * @param status error status if any
1701 * @return TRUE if the match is valid, FALSE otherwise
1702 */
1703 static
doNextCanonicalMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)1704 UBool doNextCanonicalMatch(UStringSearch *strsrch,
1705 int32_t textoffset,
1706 UErrorCode *status)
1707 {
1708 const UChar *text = strsrch->search->text;
1709 int32_t temp = textoffset;
1710 U16_BACK_1(text, 0, temp);
1711 if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1712 UCollationElements *coleiter = strsrch->textIter;
1713 int32_t offset = getColElemIterOffset(coleiter, FALSE);
1714 if (strsrch->pattern.hasPrefixAccents) {
1715 offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1716 status);
1717 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1718 setColEIterOffset(coleiter, offset);
1719 return TRUE;
1720 }
1721 }
1722 return FALSE;
1723 }
1724
1725 if (!strsrch->pattern.hasSuffixAccents) {
1726 return FALSE;
1727 }
1728
1729 UChar accents[INITIAL_ARRAY_SIZE_];
1730 // offset to the last base character in substring to search
1731 int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1732 // normalizing the offensive string
1733 unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1734 0, accents, INITIAL_ARRAY_SIZE_, status);
1735 // status checked in loop below
1736
1737 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1738 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1739
1740 // 2 power n - 1 plus the full set of accents
1741 int32_t count = (2 << (size - 1)) - 1;
1742 while (U_SUCCESS(*status) && count > 0) {
1743 UChar *rearrange = strsrch->canonicalSuffixAccents;
1744 // copy the base characters
1745 for (int k = 0; k < accentsindex[0]; k ++) {
1746 *rearrange ++ = accents[k];
1747 }
1748 // forming all possible canonical rearrangement by dropping
1749 // sets of accents
1750 for (int i = 0; i <= size - 1; i ++) {
1751 int32_t mask = 1 << (size - i - 1);
1752 if (count & mask) {
1753 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1754 *rearrange ++ = accents[j];
1755 }
1756 }
1757 }
1758 *rearrange = 0;
1759 int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1760 status);
1761 if (offset != USEARCH_DONE) {
1762 return TRUE; // match found
1763 }
1764 count --;
1765 }
1766 return FALSE;
1767 }
1768
1769 /**
1770 * Gets the previous base character offset depending on the string search
1771 * pattern data
1772 * @param strsrch string search data
1773 * @param textoffset current offset, current character
1774 * @return the offset of the next character after this base character or itself
1775 * if it is a composed character with accents
1776 */
1777 static
getPreviousUStringSearchBaseOffset(UStringSearch * strsrch,int32_t textoffset)1778 inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1779 int32_t textoffset)
1780 {
1781 if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1782 const UChar *text = strsrch->search->text;
1783 int32_t offset = textoffset;
1784 if (getFCD(text, &offset, strsrch->search->textLength) >>
1785 SECOND_LAST_BYTE_SHIFT_) {
1786 return getPreviousBaseOffset(text, textoffset);
1787 }
1788 }
1789 return textoffset;
1790 }
1791
1792 /**
1793 * Checks match for contraction.
1794 * If the match ends with a partial contraction we fail.
1795 * If the match starts too far off (because of backwards iteration) we try to
1796 * chip off the extra characters
1797 * Internal method, status assumed to be success, caller has to check status
1798 * before calling this method.
1799 * @param strsrch string search data
1800 * @param start offset of potential match, to be modified if necessary
1801 * @param end offset of potential match, to be modified if necessary
1802 * @param status output error status if any
1803 * @return TRUE if match passes the contraction test, FALSE otherwise
1804 */
1805 static
checkNextCanonicalContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)1806 UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1807 int32_t *start,
1808 int32_t *end,
1809 UErrorCode *status)
1810 {
1811 UCollationElements *coleiter = strsrch->textIter;
1812 int32_t textlength = strsrch->search->textLength;
1813 int32_t temp = *start;
1814 const UCollator *collator = strsrch->collator;
1815 const UChar *text = strsrch->search->text;
1816 // This part checks if either ends of the match contains potential
1817 // contraction. If so we'll have to iterate through them
1818 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1819 (*start + 1 < textlength
1820 && ucol_unsafeCP(text[*start + 1], collator))) {
1821 int32_t expansion = getExpansionPrefix(coleiter);
1822 UBool expandflag = expansion > 0;
1823 setColEIterOffset(coleiter, *start);
1824 while (expansion > 0) {
1825 // getting rid of the redundant ce, caused by setOffset.
1826 // since backward contraction/expansion may have extra ces if we
1827 // are in the normalization buffer, hasAccentsBeforeMatch would
1828 // have taken care of it.
1829 // E.g. the character \u01FA will have an expansion of 3, but if
1830 // we are only looking for acute and ring \u030A and \u0301, we'll
1831 // have to skip the first ce in the expansion buffer.
1832 ucol_next(coleiter, status);
1833 if (U_FAILURE(*status)) {
1834 return FALSE;
1835 }
1836 if (ucol_getOffset(coleiter) != temp) {
1837 *start = temp;
1838 temp = ucol_getOffset(coleiter);
1839 }
1840 expansion --;
1841 }
1842
1843 int32_t *patternce = strsrch->pattern.ces;
1844 int32_t patterncelength = strsrch->pattern.cesLength;
1845 int32_t count = 0;
1846 int32_t textlength = strsrch->search->textLength;
1847 while (count < patterncelength) {
1848 int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1849 // status checked below, note that if status is a failure
1850 // ucol_next returns UCOL_NULLORDER
1851 if (ce == UCOL_IGNORABLE) {
1852 continue;
1853 }
1854 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1855 *start = temp;
1856 temp = ucol_getOffset(coleiter);
1857 }
1858
1859 if (count == 0 && ce != patternce[0]) {
1860 // accents may have extra starting ces, this occurs when a
1861 // pure accent pattern is matched without rearrangement
1862 // text \u0325\u0300 and looking for \u0300
1863 int32_t expected = patternce[0];
1864 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1865 ce = getCE(strsrch, ucol_next(coleiter, status));
1866 while (U_SUCCESS(*status) && ce != expected &&
1867 ce != UCOL_NULLORDER &&
1868 ucol_getOffset(coleiter) <= *end) {
1869 ce = getCE(strsrch, ucol_next(coleiter, status));
1870 }
1871 }
1872 }
1873 if (U_FAILURE(*status) || ce != patternce[count]) {
1874 (*end) ++;
1875 *end = getNextUStringSearchBaseOffset(strsrch, *end);
1876 return FALSE;
1877 }
1878 count ++;
1879 }
1880 }
1881 return TRUE;
1882 }
1883
1884 /**
1885 * Checks and sets the match information if found.
1886 * Checks
1887 * <ul>
1888 * <li> the potential match does not repeat the previous match
1889 * <li> boundaries are correct
1890 * <li> potential match does not end in the middle of a contraction
1891 * <li> identical matches
1892 * <\ul>
1893 * Otherwise the offset will be shifted to the next character.
1894 * Internal method, status assumed to be success, caller has to check the
1895 * status before calling this method.
1896 * @param strsrch string search data
1897 * @param textoffset offset in the collation element text. the returned value
1898 * will be the truncated end offset of the match or the new start
1899 * search offset.
1900 * @param status output error status if any
1901 * @return TRUE if the match is valid, FALSE otherwise
1902 */
1903 static
checkNextCanonicalMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)1904 inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1905 int32_t *textoffset,
1906 UErrorCode *status)
1907 {
1908 // to ensure that the start and ends are not composite characters
1909 UCollationElements *coleiter = strsrch->textIter;
1910 // if we have a canonical accent match
1911 if ((strsrch->pattern.hasSuffixAccents &&
1912 strsrch->canonicalSuffixAccents[0]) ||
1913 (strsrch->pattern.hasPrefixAccents &&
1914 strsrch->canonicalPrefixAccents[0])) {
1915 strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset(
1916 strsrch,
1917 ucol_getOffset(coleiter));
1918 strsrch->search->matchedLength = *textoffset -
1919 strsrch->search->matchedIndex;
1920 return TRUE;
1921 }
1922
1923 int32_t start = getColElemIterOffset(coleiter, FALSE);
1924 if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
1925 status) || U_FAILURE(*status)) {
1926 return FALSE;
1927 }
1928
1929 start = getPreviousUStringSearchBaseOffset(strsrch, start);
1930 // this totally matches, however we need to check if it is repeating
1931 if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1932 !isBreakUnit(strsrch, start, *textoffset) ||
1933 !checkIdentical(strsrch, start, *textoffset)) {
1934 (*textoffset) ++;
1935 *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
1936 strsrch->search->textLength);
1937 return FALSE;
1938 }
1939
1940 strsrch->search->matchedIndex = start;
1941 strsrch->search->matchedLength = *textoffset - start;
1942 return TRUE;
1943 }
1944
1945 /**
1946 * Shifting the collation element iterator position forward to prepare for
1947 * a preceding match. If the first character is a unsafe character, we'll only
1948 * shift by 1 to capture contractions, normalization etc.
1949 * Internal method, status assumed to be success, caller has to check status
1950 * before calling this method.
1951 * @param text strsrch string search data
1952 * @param textoffset start text position to do search
1953 * @param ce the text ce which failed the match.
1954 * @param patternceindex index of the ce within the pattern ce buffer which
1955 * failed the match
1956 * @return final offset
1957 */
1958 static
reverseShift(UStringSearch * strsrch,int32_t textoffset,int32_t ce,int32_t patternceindex)1959 inline int32_t reverseShift(UStringSearch *strsrch,
1960 int32_t textoffset,
1961 int32_t ce,
1962 int32_t patternceindex)
1963 {
1964 if (strsrch->search->isOverlap) {
1965 if (textoffset != strsrch->search->textLength) {
1966 textoffset --;
1967 }
1968 else {
1969 textoffset -= strsrch->pattern.defaultShiftSize;
1970 }
1971 }
1972 else {
1973 if (ce != UCOL_NULLORDER) {
1974 int32_t shift = strsrch->pattern.backShift[hash(ce)];
1975
1976 // this is to adjust for characters in the middle of the substring
1977 // for matching that failed.
1978 int32_t adjust = patternceindex;
1979 if (adjust > 1 && shift > adjust) {
1980 shift -= adjust - 1;
1981 }
1982 textoffset -= shift;
1983 }
1984 else {
1985 textoffset -= strsrch->pattern.defaultShiftSize;
1986 }
1987 }
1988 textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1989 return textoffset;
1990 }
1991
1992 /**
1993 * Checks match for contraction.
1994 * If the match starts with a partial contraction we fail.
1995 * Internal method, status assumed to be success, caller has to check status
1996 * before calling this method.
1997 * @param strsrch string search data
1998 * @param start offset of potential match, to be modified if necessary
1999 * @param end offset of potential match, to be modified if necessary
2000 * @param status output error status if any
2001 * @return TRUE if match passes the contraction test, FALSE otherwise
2002 */
2003 static
checkPreviousExactContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)2004 UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
2005 int32_t *start,
2006 int32_t *end, UErrorCode *status)
2007 {
2008 UCollationElements *coleiter = strsrch->textIter;
2009 int32_t textlength = strsrch->search->textLength;
2010 int32_t temp = *end;
2011 const UCollator *collator = strsrch->collator;
2012 const UChar *text = strsrch->search->text;
2013 // This part checks if either if the start of the match contains potential
2014 // contraction. If so we'll have to iterate through them
2015 // Since we used ucol_next while previously looking for the potential
2016 // match, this guarantees that our end will not be a partial contraction,
2017 // or a partial supplementary character.
2018 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2019 int32_t expansion = getExpansionSuffix(coleiter);
2020 UBool expandflag = expansion > 0;
2021 setColEIterOffset(coleiter, *end);
2022 while (U_SUCCESS(*status) && expansion > 0) {
2023 // getting rid of the redundant ce
2024 // since forward contraction/expansion may have extra ces
2025 // if we are in the normalization buffer, hasAccentsBeforeMatch
2026 // would have taken care of it.
2027 // E.g. the character \u01FA will have an expansion of 3, but if
2028 // we are only looking for A ring A\u030A, we'll have to skip the
2029 // last ce in the expansion buffer
2030 ucol_previous(coleiter, status);
2031 if (U_FAILURE(*status)) {
2032 return FALSE;
2033 }
2034 if (ucol_getOffset(coleiter) != temp) {
2035 *end = temp;
2036 temp = ucol_getOffset(coleiter);
2037 }
2038 expansion --;
2039 }
2040
2041 int32_t *patternce = strsrch->pattern.ces;
2042 int32_t patterncelength = strsrch->pattern.cesLength;
2043 int32_t count = patterncelength;
2044 while (count > 0) {
2045 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2046 // status checked below, note that if status is a failure
2047 // ucol_previous returns UCOL_NULLORDER
2048 if (ce == UCOL_IGNORABLE) {
2049 continue;
2050 }
2051 if (expandflag && count == 0 &&
2052 getColElemIterOffset(coleiter, FALSE) != temp) {
2053 *end = temp;
2054 temp = ucol_getOffset(coleiter);
2055 }
2056 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2057 (*start) --;
2058 *start = getPreviousBaseOffset(text, *start);
2059 return FALSE;
2060 }
2061 count --;
2062 }
2063 }
2064 return TRUE;
2065 }
2066
2067 /**
2068 * Checks and sets the match information if found.
2069 * Checks
2070 * <ul>
2071 * <li> the current match does not repeat the last match
2072 * <li> boundaries are correct
2073 * <li> exact matches has no extra accents
2074 * <li> identical matches
2075 * <\ul>
2076 * Otherwise the offset will be shifted to the preceding character.
2077 * Internal method, status assumed to be success, caller has to check status
2078 * before calling this method.
2079 * @param strsrch string search data
2080 * @param collator
2081 * @param coleiter collation element iterator
2082 * @param text string
2083 * @param textoffset offset in the collation element text. the returned value
2084 * will be the truncated start offset of the match or the new start
2085 * search offset.
2086 * @param status output error status if any
2087 * @return TRUE if the match is valid, FALSE otherwise
2088 */
2089 static
checkPreviousExactMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)2090 inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
2091 int32_t *textoffset,
2092 UErrorCode *status)
2093 {
2094 // to ensure that the start and ends are not composite characters
2095 int32_t end = ucol_getOffset(strsrch->textIter);
2096 if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
2097 || U_FAILURE(*status)) {
2098 return FALSE;
2099 }
2100
2101 // this totally matches, however we need to check if it is repeating
2102 // the old match
2103 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2104 !isBreakUnit(strsrch, *textoffset, end) ||
2105 hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
2106 !checkIdentical(strsrch, *textoffset, end) ||
2107 hasAccentsAfterMatch(strsrch, *textoffset, end)) {
2108 (*textoffset) --;
2109 *textoffset = getPreviousBaseOffset(strsrch->search->text,
2110 *textoffset);
2111 return FALSE;
2112 }
2113
2114 //Add breakiterator boundary check for primary strength search.
2115 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
2116 checkBreakBoundary(strsrch, textoffset, &end);
2117 }
2118
2119 strsrch->search->matchedIndex = *textoffset;
2120 strsrch->search->matchedLength = end - *textoffset;
2121 return TRUE;
2122 }
2123
2124 /**
2125 * Rearranges the end accents to try matching.
2126 * Suffix accents in the text will be grouped according to their combining
2127 * class and the groups will be mixed and matched to try find the perfect
2128 * match with the pattern.
2129 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2130 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2131 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2132 * "\u0301\u0325".
2133 * step 2: check if any of the generated substrings matches the pattern.
2134 * Internal method, status assumed to be success, user has to check status
2135 * before calling this method.
2136 * @param strsrch string search match
2137 * @param start offset of the first base character
2138 * @param end start of the last accent set
2139 * @param status only error status if any
2140 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2141 * offset of the match. Note this start includes all following accents.
2142 */
2143 static
doPreviousCanonicalSuffixMatch(UStringSearch * strsrch,int32_t start,int32_t end,UErrorCode * status)2144 int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
2145 int32_t start,
2146 int32_t end,
2147 UErrorCode *status)
2148 {
2149 const UChar *text = strsrch->search->text;
2150 int32_t tempend = end;
2151
2152 U16_BACK_1(text, 0, tempend);
2153 if (!(getFCD(text, &tempend, strsrch->search->textLength) &
2154 LAST_BYTE_MASK_)) {
2155 // die... failed at a base character
2156 return USEARCH_DONE;
2157 }
2158 end = getNextBaseOffset(text, end, strsrch->search->textLength);
2159
2160 if (U_SUCCESS(*status)) {
2161 UChar accents[INITIAL_ARRAY_SIZE_];
2162 int32_t offset = getPreviousBaseOffset(text, end);
2163 // normalizing the offensive string
2164 unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
2165 INITIAL_ARRAY_SIZE_, status);
2166
2167 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2168 int32_t accentsize = getUnblockedAccentIndex(accents,
2169 accentsindex);
2170 int32_t count = (2 << (accentsize - 1)) - 1;
2171 UChar buffer[INITIAL_ARRAY_SIZE_];
2172 UCollationElements *coleiter = strsrch->utilIter;
2173 while (U_SUCCESS(*status) && count > 0) {
2174 UChar *rearrange = strsrch->canonicalSuffixAccents;
2175 // copy the base characters
2176 for (int k = 0; k < accentsindex[0]; k ++) {
2177 *rearrange ++ = accents[k];
2178 }
2179 // forming all possible canonical rearrangement by dropping
2180 // sets of accents
2181 for (int i = 0; i <= accentsize - 1; i ++) {
2182 int32_t mask = 1 << (accentsize - i - 1);
2183 if (count & mask) {
2184 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2185 *rearrange ++ = accents[j];
2186 }
2187 }
2188 }
2189 *rearrange = 0;
2190 int32_t matchsize = INITIAL_ARRAY_SIZE_;
2191 UChar *match = addToUCharArray(buffer, &matchsize,
2192 strsrch->canonicalPrefixAccents,
2193 strsrch->search->text + start,
2194 offset - start,
2195 strsrch->canonicalSuffixAccents,
2196 status);
2197
2198 // run the collator iterator through this match
2199 // if status is a failure ucol_setText does nothing
2200 ucol_setText(coleiter, match, matchsize, status);
2201 if (U_SUCCESS(*status)) {
2202 if (checkCollationMatch(strsrch, coleiter)) {
2203 if (match != buffer) {
2204 uprv_free(match);
2205 }
2206 return end;
2207 }
2208 }
2209 count --;
2210 }
2211 }
2212 return USEARCH_DONE;
2213 }
2214
2215 /**
2216 * Take the rearranged start accents and tries matching. If match failed at
2217 * a seperate following set of accents (seperated from the rearranged on by
2218 * at least a base character) then we rearrange the preceding accents and
2219 * tries matching again.
2220 * We allow skipping of the ends of the accent set if the ces do not match.
2221 * However if the failure is found before the accent set, it fails.
2222 * Internal method, status assumed to be success, caller has to check status
2223 * before calling this method.
2224 * @param strsrch string search data
2225 * @param textoffset of the ends of the rearranged accent
2226 * @param status output error status if any
2227 * @return USEARCH_DONE if a match is not found, otherwise return the ending
2228 * offset of the match. Note this start includes all following accents.
2229 */
2230 static
doPreviousCanonicalPrefixMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)2231 int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
2232 int32_t textoffset,
2233 UErrorCode *status)
2234 {
2235 const UChar *text = strsrch->search->text;
2236 const UCollator *collator = strsrch->collator;
2237 int32_t safelength = 0;
2238 UChar *safetext;
2239 int32_t safetextlength;
2240 UChar safebuffer[INITIAL_ARRAY_SIZE_];
2241 int32_t safeoffset = textoffset;
2242
2243 if (textoffset &&
2244 ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2245 u_strlen(strsrch->canonicalPrefixAccents) - 1
2246 ], collator)) {
2247 safeoffset = getNextSafeOffset(collator, text, textoffset,
2248 strsrch->search->textLength);
2249 safelength = safeoffset - textoffset;
2250 safetextlength = INITIAL_ARRAY_SIZE_;
2251 safetext = addToUCharArray(safebuffer, &safetextlength,
2252 strsrch->canonicalPrefixAccents,
2253 text + textoffset, safelength,
2254 NULL, status);
2255 }
2256 else {
2257 safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2258 safetext = strsrch->canonicalPrefixAccents;
2259 }
2260
2261 UCollationElements *coleiter = strsrch->utilIter;
2262 // if status is a failure, ucol_setText does nothing
2263 ucol_setText(coleiter, safetext, safetextlength, status);
2264 // status checked in loop below
2265
2266 int32_t *ce = strsrch->pattern.ces;
2267 int32_t celength = strsrch->pattern.cesLength;
2268 int ceindex = 0;
2269 UBool isSafe = TRUE; // safe zone indication flag for position
2270 int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
2271
2272 while (ceindex < celength) {
2273 int32_t textce = ucol_next(coleiter, status);
2274 if (U_FAILURE(*status)) {
2275 if (isSafe) {
2276 cleanUpSafeText(strsrch, safetext, safebuffer);
2277 }
2278 return USEARCH_DONE;
2279 }
2280 if (textce == UCOL_NULLORDER) {
2281 // check if we have passed the safe buffer
2282 if (coleiter == strsrch->textIter) {
2283 cleanUpSafeText(strsrch, safetext, safebuffer);
2284 return USEARCH_DONE;
2285 }
2286 cleanUpSafeText(strsrch, safetext, safebuffer);
2287 safetext = safebuffer;
2288 coleiter = strsrch->textIter;
2289 setColEIterOffset(coleiter, safeoffset);
2290 // status checked at the start of the loop
2291 isSafe = FALSE;
2292 continue;
2293 }
2294 textce = getCE(strsrch, textce);
2295 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2296 // do the beginning stuff
2297 int32_t failedoffset = ucol_getOffset(coleiter);
2298 if (isSafe && failedoffset <= prefixlength) {
2299 // alas... no hope. failed at rearranged accent set
2300 cleanUpSafeText(strsrch, safetext, safebuffer);
2301 return USEARCH_DONE;
2302 }
2303 else {
2304 if (isSafe) {
2305 failedoffset = safeoffset - failedoffset;
2306 cleanUpSafeText(strsrch, safetext, safebuffer);
2307 }
2308
2309 // try rearranging the end accents
2310 int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
2311 textoffset, failedoffset, status);
2312 if (result != USEARCH_DONE) {
2313 // if status is a failure, ucol_setOffset does nothing
2314 setColEIterOffset(strsrch->textIter, result);
2315 }
2316 if (U_FAILURE(*status)) {
2317 return USEARCH_DONE;
2318 }
2319 return result;
2320 }
2321 }
2322 if (textce == ce[ceindex]) {
2323 ceindex ++;
2324 }
2325 }
2326 // set offset here
2327 if (isSafe) {
2328 int32_t result = ucol_getOffset(coleiter);
2329 // sets the text iterator here with the correct expansion and offset
2330 int32_t leftoverces = getExpansionSuffix(coleiter);
2331 cleanUpSafeText(strsrch, safetext, safebuffer);
2332 if (result <= prefixlength) {
2333 result = textoffset;
2334 }
2335 else {
2336 result = textoffset + (safeoffset - result);
2337 }
2338 setColEIterOffset(strsrch->textIter, result);
2339 setExpansionSuffix(strsrch->textIter, leftoverces);
2340 return result;
2341 }
2342
2343 return ucol_getOffset(coleiter);
2344 }
2345
2346 /**
2347 * Trying out the substring and sees if it can be a canonical match.
2348 * This will try normalizing the starting accents and arranging them into
2349 * canonical equivalents and check their corresponding ces with the pattern ce.
2350 * Prefix accents in the text will be grouped according to their combining
2351 * class and the groups will be mixed and matched to try find the perfect
2352 * match with the pattern.
2353 * So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2354 * step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2355 * "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2356 * "\u0301\u0325".
2357 * step 2: check if any of the generated substrings matches the pattern.
2358 * Internal method, status assumed to be success, caller has to check status
2359 * before calling this method.
2360 * @param strsrch string search data
2361 * @param textoffset start offset in the collation element text that starts
2362 * with the accents to be rearranged
2363 * @param status output error status if any
2364 * @return TRUE if the match is valid, FALSE otherwise
2365 */
2366 static
doPreviousCanonicalMatch(UStringSearch * strsrch,int32_t textoffset,UErrorCode * status)2367 UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2368 int32_t textoffset,
2369 UErrorCode *status)
2370 {
2371 const UChar *text = strsrch->search->text;
2372 int32_t temp = textoffset;
2373 int32_t textlength = strsrch->search->textLength;
2374 if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2375 UCollationElements *coleiter = strsrch->textIter;
2376 int32_t offset = ucol_getOffset(coleiter);
2377 if (strsrch->pattern.hasSuffixAccents) {
2378 offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
2379 offset, status);
2380 if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2381 setColEIterOffset(coleiter, offset);
2382 return TRUE;
2383 }
2384 }
2385 return FALSE;
2386 }
2387
2388 if (!strsrch->pattern.hasPrefixAccents) {
2389 return FALSE;
2390 }
2391
2392 UChar accents[INITIAL_ARRAY_SIZE_];
2393 // offset to the last base character in substring to search
2394 int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2395 // normalizing the offensive string
2396 unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2397 0, accents, INITIAL_ARRAY_SIZE_, status);
2398 // status checked in loop
2399
2400 int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2401 int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2402
2403 // 2 power n - 1 plus the full set of accents
2404 int32_t count = (2 << (size - 1)) - 1;
2405 while (U_SUCCESS(*status) && count > 0) {
2406 UChar *rearrange = strsrch->canonicalPrefixAccents;
2407 // copy the base characters
2408 for (int k = 0; k < accentsindex[0]; k ++) {
2409 *rearrange ++ = accents[k];
2410 }
2411 // forming all possible canonical rearrangement by dropping
2412 // sets of accents
2413 for (int i = 0; i <= size - 1; i ++) {
2414 int32_t mask = 1 << (size - i - 1);
2415 if (count & mask) {
2416 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2417 *rearrange ++ = accents[j];
2418 }
2419 }
2420 }
2421 *rearrange = 0;
2422 int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
2423 baseoffset, status);
2424 if (offset != USEARCH_DONE) {
2425 return TRUE; // match found
2426 }
2427 count --;
2428 }
2429 return FALSE;
2430 }
2431
2432 /**
2433 * Checks match for contraction.
2434 * If the match starts with a partial contraction we fail.
2435 * Internal method, status assumed to be success, caller has to check status
2436 * before calling this method.
2437 * @param strsrch string search data
2438 * @param start offset of potential match, to be modified if necessary
2439 * @param end offset of potential match, to be modified if necessary
2440 * @param status only error status if any
2441 * @return TRUE if match passes the contraction test, FALSE otherwise
2442 */
2443 static
checkPreviousCanonicalContractionMatch(UStringSearch * strsrch,int32_t * start,int32_t * end,UErrorCode * status)2444 UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2445 int32_t *start,
2446 int32_t *end, UErrorCode *status)
2447 {
2448 UCollationElements *coleiter = strsrch->textIter;
2449 int32_t textlength = strsrch->search->textLength;
2450 int32_t temp = *end;
2451 const UCollator *collator = strsrch->collator;
2452 const UChar *text = strsrch->search->text;
2453 // This part checks if either if the start of the match contains potential
2454 // contraction. If so we'll have to iterate through them
2455 // Since we used ucol_next while previously looking for the potential
2456 // match, this guarantees that our end will not be a partial contraction,
2457 // or a partial supplementary character.
2458 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2459 int32_t expansion = getExpansionSuffix(coleiter);
2460 UBool expandflag = expansion > 0;
2461 setColEIterOffset(coleiter, *end);
2462 while (expansion > 0) {
2463 // getting rid of the redundant ce
2464 // since forward contraction/expansion may have extra ces
2465 // if we are in the normalization buffer, hasAccentsBeforeMatch
2466 // would have taken care of it.
2467 // E.g. the character \u01FA will have an expansion of 3, but if
2468 // we are only looking for A ring A\u030A, we'll have to skip the
2469 // last ce in the expansion buffer
2470 ucol_previous(coleiter, status);
2471 if (U_FAILURE(*status)) {
2472 return FALSE;
2473 }
2474 if (ucol_getOffset(coleiter) != temp) {
2475 *end = temp;
2476 temp = ucol_getOffset(coleiter);
2477 }
2478 expansion --;
2479 }
2480
2481 int32_t *patternce = strsrch->pattern.ces;
2482 int32_t patterncelength = strsrch->pattern.cesLength;
2483 int32_t count = patterncelength;
2484 while (count > 0) {
2485 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2486 // status checked below, note that if status is a failure
2487 // ucol_previous returns UCOL_NULLORDER
2488 if (ce == UCOL_IGNORABLE) {
2489 continue;
2490 }
2491 if (expandflag && count == 0 &&
2492 getColElemIterOffset(coleiter, FALSE) != temp) {
2493 *end = temp;
2494 temp = ucol_getOffset(coleiter);
2495 }
2496 if (count == patterncelength &&
2497 ce != patternce[patterncelength - 1]) {
2498 // accents may have extra starting ces, this occurs when a
2499 // pure accent pattern is matched without rearrangement
2500 int32_t expected = patternce[patterncelength - 1];
2501 U16_BACK_1(text, 0, *end);
2502 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2503 ce = getCE(strsrch, ucol_previous(coleiter, status));
2504 while (U_SUCCESS(*status) && ce != expected &&
2505 ce != UCOL_NULLORDER &&
2506 ucol_getOffset(coleiter) <= *start) {
2507 ce = getCE(strsrch, ucol_previous(coleiter, status));
2508 }
2509 }
2510 }
2511 if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2512 (*start) --;
2513 *start = getPreviousBaseOffset(text, *start);
2514 return FALSE;
2515 }
2516 count --;
2517 }
2518 }
2519 return TRUE;
2520 }
2521
2522 /**
2523 * Checks and sets the match information if found.
2524 * Checks
2525 * <ul>
2526 * <li> the potential match does not repeat the previous match
2527 * <li> boundaries are correct
2528 * <li> potential match does not end in the middle of a contraction
2529 * <li> identical matches
2530 * <\ul>
2531 * Otherwise the offset will be shifted to the next character.
2532 * Internal method, status assumed to be success, caller has to check status
2533 * before calling this method.
2534 * @param strsrch string search data
2535 * @param textoffset offset in the collation element text. the returned value
2536 * will be the truncated start offset of the match or the new start
2537 * search offset.
2538 * @param status only error status if any
2539 * @return TRUE if the match is valid, FALSE otherwise
2540 */
2541 static
checkPreviousCanonicalMatch(UStringSearch * strsrch,int32_t * textoffset,UErrorCode * status)2542 inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2543 int32_t *textoffset,
2544 UErrorCode *status)
2545 {
2546 // to ensure that the start and ends are not composite characters
2547 UCollationElements *coleiter = strsrch->textIter;
2548 // if we have a canonical accent match
2549 if ((strsrch->pattern.hasSuffixAccents &&
2550 strsrch->canonicalSuffixAccents[0]) ||
2551 (strsrch->pattern.hasPrefixAccents &&
2552 strsrch->canonicalPrefixAccents[0])) {
2553 strsrch->search->matchedIndex = *textoffset;
2554 strsrch->search->matchedLength =
2555 getNextUStringSearchBaseOffset(strsrch,
2556 getColElemIterOffset(coleiter, FALSE))
2557 - *textoffset;
2558 return TRUE;
2559 }
2560
2561 int32_t end = ucol_getOffset(coleiter);
2562 if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2563 status) ||
2564 U_FAILURE(*status)) {
2565 return FALSE;
2566 }
2567
2568 end = getNextUStringSearchBaseOffset(strsrch, end);
2569 // this totally matches, however we need to check if it is repeating
2570 if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2571 !isBreakUnit(strsrch, *textoffset, end) ||
2572 !checkIdentical(strsrch, *textoffset, end)) {
2573 (*textoffset) --;
2574 *textoffset = getPreviousBaseOffset(strsrch->search->text,
2575 *textoffset);
2576 return FALSE;
2577 }
2578
2579 strsrch->search->matchedIndex = *textoffset;
2580 strsrch->search->matchedLength = end - *textoffset;
2581 return TRUE;
2582 }
2583 #endif // #if BOYER_MOORE
2584
2585 // constructors and destructor -------------------------------------------
2586
usearch_open(const UChar * pattern,int32_t patternlength,const UChar * text,int32_t textlength,const char * locale,UBreakIterator * breakiter,UErrorCode * status)2587 U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2588 int32_t patternlength,
2589 const UChar *text,
2590 int32_t textlength,
2591 const char *locale,
2592 UBreakIterator *breakiter,
2593 UErrorCode *status)
2594 {
2595 if (U_FAILURE(*status)) {
2596 return NULL;
2597 }
2598 #if UCONFIG_NO_BREAK_ITERATION
2599 if (breakiter != NULL) {
2600 *status = U_UNSUPPORTED_ERROR;
2601 return NULL;
2602 }
2603 #endif
2604 if (locale) {
2605 // ucol_open internally checks for status
2606 UCollator *collator = ucol_open(locale, status);
2607 // pattern, text checks are done in usearch_openFromCollator
2608 UStringSearch *result = usearch_openFromCollator(pattern,
2609 patternlength, text, textlength,
2610 collator, breakiter, status);
2611
2612 if (result == NULL || U_FAILURE(*status)) {
2613 if (collator) {
2614 ucol_close(collator);
2615 }
2616 return NULL;
2617 }
2618 else {
2619 result->ownCollator = TRUE;
2620 }
2621 return result;
2622 }
2623 *status = U_ILLEGAL_ARGUMENT_ERROR;
2624 return NULL;
2625 }
2626
usearch_openFromCollator(const UChar * pattern,int32_t patternlength,const UChar * text,int32_t textlength,const UCollator * collator,UBreakIterator * breakiter,UErrorCode * status)2627 U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2628 const UChar *pattern,
2629 int32_t patternlength,
2630 const UChar *text,
2631 int32_t textlength,
2632 const UCollator *collator,
2633 UBreakIterator *breakiter,
2634 UErrorCode *status)
2635 {
2636 if (U_FAILURE(*status)) {
2637 return NULL;
2638 }
2639 #if UCONFIG_NO_BREAK_ITERATION
2640 if (breakiter != NULL) {
2641 *status = U_UNSUPPORTED_ERROR;
2642 return NULL;
2643 }
2644 #endif
2645 if (pattern == NULL || text == NULL || collator == NULL) {
2646 *status = U_ILLEGAL_ARGUMENT_ERROR;
2647 return NULL;
2648 }
2649
2650 // string search does not really work when numeric collation is turned on
2651 if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2652 *status = U_UNSUPPORTED_ERROR;
2653 return NULL;
2654 }
2655
2656 if (U_SUCCESS(*status)) {
2657 initializeFCD(status);
2658 if (U_FAILURE(*status)) {
2659 return NULL;
2660 }
2661
2662 UStringSearch *result;
2663 if (textlength == -1) {
2664 textlength = u_strlen(text);
2665 }
2666 if (patternlength == -1) {
2667 patternlength = u_strlen(pattern);
2668 }
2669 if (textlength <= 0 || patternlength <= 0) {
2670 *status = U_ILLEGAL_ARGUMENT_ERROR;
2671 return NULL;
2672 }
2673
2674 result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2675 if (result == NULL) {
2676 *status = U_MEMORY_ALLOCATION_ERROR;
2677 return NULL;
2678 }
2679
2680 result->collator = collator;
2681 result->strength = ucol_getStrength(collator);
2682 result->ceMask = getMask(result->strength);
2683 result->toShift =
2684 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2685 UCOL_SHIFTED;
2686 result->variableTop = ucol_getVariableTop(collator, status);
2687
2688 result->nfd = Normalizer2::getNFDInstance(*status);
2689
2690 if (U_FAILURE(*status)) {
2691 uprv_free(result);
2692 return NULL;
2693 }
2694
2695 result->search = (USearch *)uprv_malloc(sizeof(USearch));
2696 if (result->search == NULL) {
2697 *status = U_MEMORY_ALLOCATION_ERROR;
2698 uprv_free(result);
2699 return NULL;
2700 }
2701
2702 result->search->text = text;
2703 result->search->textLength = textlength;
2704
2705 result->pattern.text = pattern;
2706 result->pattern.textLength = patternlength;
2707 result->pattern.ces = NULL;
2708 result->pattern.pces = NULL;
2709
2710 result->search->breakIter = breakiter;
2711 #if !UCONFIG_NO_BREAK_ITERATION
2712 result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
2713 if (breakiter) {
2714 ubrk_setText(breakiter, text, textlength, status);
2715 }
2716 #endif
2717
2718 result->ownCollator = FALSE;
2719 result->search->matchedLength = 0;
2720 result->search->matchedIndex = USEARCH_DONE;
2721 result->utilIter = NULL;
2722 result->textIter = ucol_openElements(collator, text,
2723 textlength, status);
2724 result->textProcessedIter = NULL;
2725 if (U_FAILURE(*status)) {
2726 usearch_close(result);
2727 return NULL;
2728 }
2729
2730 result->search->isOverlap = FALSE;
2731 result->search->isCanonicalMatch = FALSE;
2732 result->search->elementComparisonType = 0;
2733 result->search->isForwardSearching = TRUE;
2734 result->search->reset = TRUE;
2735
2736 initialize(result, status);
2737
2738 if (U_FAILURE(*status)) {
2739 usearch_close(result);
2740 return NULL;
2741 }
2742
2743 return result;
2744 }
2745 return NULL;
2746 }
2747
usearch_close(UStringSearch * strsrch)2748 U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2749 {
2750 if (strsrch) {
2751 if (strsrch->pattern.ces != strsrch->pattern.cesBuffer &&
2752 strsrch->pattern.ces) {
2753 uprv_free(strsrch->pattern.ces);
2754 }
2755
2756 if (strsrch->pattern.pces != NULL &&
2757 strsrch->pattern.pces != strsrch->pattern.pcesBuffer) {
2758 uprv_free(strsrch->pattern.pces);
2759 }
2760
2761 delete strsrch->textProcessedIter;
2762 ucol_closeElements(strsrch->textIter);
2763 ucol_closeElements(strsrch->utilIter);
2764
2765 if (strsrch->ownCollator && strsrch->collator) {
2766 ucol_close((UCollator *)strsrch->collator);
2767 }
2768
2769 #if !UCONFIG_NO_BREAK_ITERATION
2770 if (strsrch->search->internalBreakIter) {
2771 ubrk_close(strsrch->search->internalBreakIter);
2772 }
2773 #endif
2774
2775 uprv_free(strsrch->search);
2776 uprv_free(strsrch);
2777 }
2778 }
2779
2780 namespace {
2781
initTextProcessedIter(UStringSearch * strsrch,UErrorCode * status)2782 UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
2783 if (U_FAILURE(*status)) { return FALSE; }
2784 if (strsrch->textProcessedIter == NULL) {
2785 strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
2786 if (strsrch->textProcessedIter == NULL) {
2787 *status = U_MEMORY_ALLOCATION_ERROR;
2788 return FALSE;
2789 }
2790 } else {
2791 strsrch->textProcessedIter->init(strsrch->textIter);
2792 }
2793 return TRUE;
2794 }
2795
2796 }
2797
2798 // set and get methods --------------------------------------------------
2799
usearch_setOffset(UStringSearch * strsrch,int32_t position,UErrorCode * status)2800 U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2801 int32_t position,
2802 UErrorCode *status)
2803 {
2804 if (U_SUCCESS(*status) && strsrch) {
2805 if (isOutOfBounds(strsrch->search->textLength, position)) {
2806 *status = U_INDEX_OUTOFBOUNDS_ERROR;
2807 }
2808 else {
2809 setColEIterOffset(strsrch->textIter, position);
2810 }
2811 strsrch->search->matchedIndex = USEARCH_DONE;
2812 strsrch->search->matchedLength = 0;
2813 strsrch->search->reset = FALSE;
2814 }
2815 }
2816
usearch_getOffset(const UStringSearch * strsrch)2817 U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2818 {
2819 if (strsrch) {
2820 int32_t result = ucol_getOffset(strsrch->textIter);
2821 if (isOutOfBounds(strsrch->search->textLength, result)) {
2822 return USEARCH_DONE;
2823 }
2824 return result;
2825 }
2826 return USEARCH_DONE;
2827 }
2828
usearch_setAttribute(UStringSearch * strsrch,USearchAttribute attribute,USearchAttributeValue value,UErrorCode * status)2829 U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2830 USearchAttribute attribute,
2831 USearchAttributeValue value,
2832 UErrorCode *status)
2833 {
2834 if (U_SUCCESS(*status) && strsrch) {
2835 switch (attribute)
2836 {
2837 case USEARCH_OVERLAP :
2838 strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2839 break;
2840 case USEARCH_CANONICAL_MATCH :
2841 strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
2842 FALSE);
2843 break;
2844 case USEARCH_ELEMENT_COMPARISON :
2845 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2846 strsrch->search->elementComparisonType = (int16_t)value;
2847 } else {
2848 strsrch->search->elementComparisonType = 0;
2849 }
2850 break;
2851 case USEARCH_ATTRIBUTE_COUNT :
2852 default:
2853 *status = U_ILLEGAL_ARGUMENT_ERROR;
2854 }
2855 }
2856 if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2857 *status = U_ILLEGAL_ARGUMENT_ERROR;
2858 }
2859 }
2860
usearch_getAttribute(const UStringSearch * strsrch,USearchAttribute attribute)2861 U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2862 const UStringSearch *strsrch,
2863 USearchAttribute attribute)
2864 {
2865 if (strsrch) {
2866 switch (attribute) {
2867 case USEARCH_OVERLAP :
2868 return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
2869 USEARCH_OFF);
2870 case USEARCH_CANONICAL_MATCH :
2871 return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
2872 USEARCH_OFF);
2873 case USEARCH_ELEMENT_COMPARISON :
2874 {
2875 int16_t value = strsrch->search->elementComparisonType;
2876 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2877 return (USearchAttributeValue)value;
2878 } else {
2879 return USEARCH_STANDARD_ELEMENT_COMPARISON;
2880 }
2881 }
2882 case USEARCH_ATTRIBUTE_COUNT :
2883 return USEARCH_DEFAULT;
2884 }
2885 }
2886 return USEARCH_DEFAULT;
2887 }
2888
usearch_getMatchedStart(const UStringSearch * strsrch)2889 U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2890 const UStringSearch *strsrch)
2891 {
2892 if (strsrch == NULL) {
2893 return USEARCH_DONE;
2894 }
2895 return strsrch->search->matchedIndex;
2896 }
2897
2898
usearch_getMatchedText(const UStringSearch * strsrch,UChar * result,int32_t resultCapacity,UErrorCode * status)2899 U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2900 UChar *result,
2901 int32_t resultCapacity,
2902 UErrorCode *status)
2903 {
2904 if (U_FAILURE(*status)) {
2905 return USEARCH_DONE;
2906 }
2907 if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
2908 result == NULL)) {
2909 *status = U_ILLEGAL_ARGUMENT_ERROR;
2910 return USEARCH_DONE;
2911 }
2912
2913 int32_t copylength = strsrch->search->matchedLength;
2914 int32_t copyindex = strsrch->search->matchedIndex;
2915 if (copyindex == USEARCH_DONE) {
2916 u_terminateUChars(result, resultCapacity, 0, status);
2917 return USEARCH_DONE;
2918 }
2919
2920 if (resultCapacity < copylength) {
2921 copylength = resultCapacity;
2922 }
2923 if (copylength > 0) {
2924 uprv_memcpy(result, strsrch->search->text + copyindex,
2925 copylength * sizeof(UChar));
2926 }
2927 return u_terminateUChars(result, resultCapacity,
2928 strsrch->search->matchedLength, status);
2929 }
2930
usearch_getMatchedLength(const UStringSearch * strsrch)2931 U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2932 const UStringSearch *strsrch)
2933 {
2934 if (strsrch) {
2935 return strsrch->search->matchedLength;
2936 }
2937 return USEARCH_DONE;
2938 }
2939
2940 #if !UCONFIG_NO_BREAK_ITERATION
2941
usearch_setBreakIterator(UStringSearch * strsrch,UBreakIterator * breakiter,UErrorCode * status)2942 U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch,
2943 UBreakIterator *breakiter,
2944 UErrorCode *status)
2945 {
2946 if (U_SUCCESS(*status) && strsrch) {
2947 strsrch->search->breakIter = breakiter;
2948 if (breakiter) {
2949 ubrk_setText(breakiter, strsrch->search->text,
2950 strsrch->search->textLength, status);
2951 }
2952 }
2953 }
2954
2955 U_CAPI const UBreakIterator* U_EXPORT2
usearch_getBreakIterator(const UStringSearch * strsrch)2956 usearch_getBreakIterator(const UStringSearch *strsrch)
2957 {
2958 if (strsrch) {
2959 return strsrch->search->breakIter;
2960 }
2961 return NULL;
2962 }
2963
2964 #endif
2965
usearch_setText(UStringSearch * strsrch,const UChar * text,int32_t textlength,UErrorCode * status)2966 U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch,
2967 const UChar *text,
2968 int32_t textlength,
2969 UErrorCode *status)
2970 {
2971 if (U_SUCCESS(*status)) {
2972 if (strsrch == NULL || text == NULL || textlength < -1 ||
2973 textlength == 0) {
2974 *status = U_ILLEGAL_ARGUMENT_ERROR;
2975 }
2976 else {
2977 if (textlength == -1) {
2978 textlength = u_strlen(text);
2979 }
2980 strsrch->search->text = text;
2981 strsrch->search->textLength = textlength;
2982 ucol_setText(strsrch->textIter, text, textlength, status);
2983 strsrch->search->matchedIndex = USEARCH_DONE;
2984 strsrch->search->matchedLength = 0;
2985 strsrch->search->reset = TRUE;
2986 #if !UCONFIG_NO_BREAK_ITERATION
2987 if (strsrch->search->breakIter != NULL) {
2988 ubrk_setText(strsrch->search->breakIter, text,
2989 textlength, status);
2990 }
2991 ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
2992 #endif
2993 }
2994 }
2995 }
2996
usearch_getText(const UStringSearch * strsrch,int32_t * length)2997 U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
2998 int32_t *length)
2999 {
3000 if (strsrch) {
3001 *length = strsrch->search->textLength;
3002 return strsrch->search->text;
3003 }
3004 return NULL;
3005 }
3006
usearch_setCollator(UStringSearch * strsrch,const UCollator * collator,UErrorCode * status)3007 U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch,
3008 const UCollator *collator,
3009 UErrorCode *status)
3010 {
3011 if (U_SUCCESS(*status)) {
3012 if (collator == NULL) {
3013 *status = U_ILLEGAL_ARGUMENT_ERROR;
3014 return;
3015 }
3016
3017 if (strsrch) {
3018 delete strsrch->textProcessedIter;
3019 strsrch->textProcessedIter = NULL;
3020 ucol_closeElements(strsrch->textIter);
3021 ucol_closeElements(strsrch->utilIter);
3022 strsrch->textIter = strsrch->utilIter = NULL;
3023 if (strsrch->ownCollator && (strsrch->collator != collator)) {
3024 ucol_close((UCollator *)strsrch->collator);
3025 strsrch->ownCollator = FALSE;
3026 }
3027 strsrch->collator = collator;
3028 strsrch->strength = ucol_getStrength(collator);
3029 strsrch->ceMask = getMask(strsrch->strength);
3030 #if !UCONFIG_NO_BREAK_ITERATION
3031 ubrk_close(strsrch->search->internalBreakIter);
3032 strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
3033 strsrch->search->text, strsrch->search->textLength, status);
3034 #endif
3035 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3036 strsrch->toShift =
3037 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
3038 UCOL_SHIFTED;
3039 // if status is a failure, ucol_getVariableTop returns 0
3040 strsrch->variableTop = ucol_getVariableTop(collator, status);
3041 strsrch->textIter = ucol_openElements(collator,
3042 strsrch->search->text,
3043 strsrch->search->textLength,
3044 status);
3045 strsrch->utilIter = ucol_openElements(
3046 collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
3047 // initialize() _after_ setting the iterators for the new collator.
3048 initialize(strsrch, status);
3049 }
3050
3051 // **** are these calls needed?
3052 // **** we call uprv_init_pce in initializePatternPCETable
3053 // **** and the CEIBuffer constructor...
3054 #if 0
3055 uprv_init_pce(strsrch->textIter);
3056 uprv_init_pce(strsrch->utilIter);
3057 #endif
3058 }
3059 }
3060
usearch_getCollator(const UStringSearch * strsrch)3061 U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
3062 {
3063 if (strsrch) {
3064 return (UCollator *)strsrch->collator;
3065 }
3066 return NULL;
3067 }
3068
usearch_setPattern(UStringSearch * strsrch,const UChar * pattern,int32_t patternlength,UErrorCode * status)3069 U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch,
3070 const UChar *pattern,
3071 int32_t patternlength,
3072 UErrorCode *status)
3073 {
3074 if (U_SUCCESS(*status)) {
3075 if (strsrch == NULL || pattern == NULL) {
3076 *status = U_ILLEGAL_ARGUMENT_ERROR;
3077 }
3078 else {
3079 if (patternlength == -1) {
3080 patternlength = u_strlen(pattern);
3081 }
3082 if (patternlength == 0) {
3083 *status = U_ILLEGAL_ARGUMENT_ERROR;
3084 return;
3085 }
3086 strsrch->pattern.text = pattern;
3087 strsrch->pattern.textLength = patternlength;
3088 initialize(strsrch, status);
3089 }
3090 }
3091 }
3092
3093 U_CAPI const UChar* U_EXPORT2
usearch_getPattern(const UStringSearch * strsrch,int32_t * length)3094 usearch_getPattern(const UStringSearch *strsrch,
3095 int32_t *length)
3096 {
3097 if (strsrch) {
3098 *length = strsrch->pattern.textLength;
3099 return strsrch->pattern.text;
3100 }
3101 return NULL;
3102 }
3103
3104 // miscellanous methods --------------------------------------------------
3105
usearch_first(UStringSearch * strsrch,UErrorCode * status)3106 U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
3107 UErrorCode *status)
3108 {
3109 if (strsrch && U_SUCCESS(*status)) {
3110 strsrch->search->isForwardSearching = TRUE;
3111 usearch_setOffset(strsrch, 0, status);
3112 if (U_SUCCESS(*status)) {
3113 return usearch_next(strsrch, status);
3114 }
3115 }
3116 return USEARCH_DONE;
3117 }
3118
usearch_following(UStringSearch * strsrch,int32_t position,UErrorCode * status)3119 U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
3120 int32_t position,
3121 UErrorCode *status)
3122 {
3123 if (strsrch && U_SUCCESS(*status)) {
3124 strsrch->search->isForwardSearching = TRUE;
3125 // position checked in usearch_setOffset
3126 usearch_setOffset(strsrch, position, status);
3127 if (U_SUCCESS(*status)) {
3128 return usearch_next(strsrch, status);
3129 }
3130 }
3131 return USEARCH_DONE;
3132 }
3133
usearch_last(UStringSearch * strsrch,UErrorCode * status)3134 U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
3135 UErrorCode *status)
3136 {
3137 if (strsrch && U_SUCCESS(*status)) {
3138 strsrch->search->isForwardSearching = FALSE;
3139 usearch_setOffset(strsrch, strsrch->search->textLength, status);
3140 if (U_SUCCESS(*status)) {
3141 return usearch_previous(strsrch, status);
3142 }
3143 }
3144 return USEARCH_DONE;
3145 }
3146
usearch_preceding(UStringSearch * strsrch,int32_t position,UErrorCode * status)3147 U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
3148 int32_t position,
3149 UErrorCode *status)
3150 {
3151 if (strsrch && U_SUCCESS(*status)) {
3152 strsrch->search->isForwardSearching = FALSE;
3153 // position checked in usearch_setOffset
3154 usearch_setOffset(strsrch, position, status);
3155 if (U_SUCCESS(*status)) {
3156 return usearch_previous(strsrch, status);
3157 }
3158 }
3159 return USEARCH_DONE;
3160 }
3161
3162 /**
3163 * If a direction switch is required, we'll count the number of ces till the
3164 * beginning of the collation element iterator and iterate forwards that
3165 * number of times. This is so that we get to the correct point within the
3166 * string to continue the search in. Imagine when we are in the middle of the
3167 * normalization buffer when the change in direction is request. arrrgghh....
3168 * After searching the offset within the collation element iterator will be
3169 * shifted to the start of the match. If a match is not found, the offset would
3170 * have been set to the end of the text string in the collation element
3171 * iterator.
3172 * Okay, here's my take on normalization buffer. The only time when there can
3173 * be 2 matches within the same normalization is when the pattern is consists
3174 * of all accents. But since the offset returned is from the text string, we
3175 * should not confuse the caller by returning the second match within the
3176 * same normalization buffer. If we do, the 2 results will have the same match
3177 * offsets, and that'll be confusing. I'll return the next match that doesn't
3178 * fall within the same normalization buffer. Note this does not affect the
3179 * results of matches spanning the text and the normalization buffer.
3180 * The position to start searching is taken from the collation element
3181 * iterator. Callers of this API would have to set the offset in the collation
3182 * element iterator before using this method.
3183 */
usearch_next(UStringSearch * strsrch,UErrorCode * status)3184 U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
3185 UErrorCode *status)
3186 {
3187 if (U_SUCCESS(*status) && strsrch) {
3188 // note offset is either equivalent to the start of the previous match
3189 // or is set by the user
3190 int32_t offset = usearch_getOffset(strsrch);
3191 USearch *search = strsrch->search;
3192 search->reset = FALSE;
3193 int32_t textlength = search->textLength;
3194 if (search->isForwardSearching) {
3195 #if BOYER_MOORE
3196 if (offset == textlength
3197 || (!search->isOverlap &&
3198 (offset + strsrch->pattern.defaultShiftSize > textlength ||
3199 (search->matchedIndex != USEARCH_DONE &&
3200 offset + search->matchedLength >= textlength)))) {
3201 // not enough characters to match
3202 setMatchNotFound(strsrch);
3203 return USEARCH_DONE;
3204 }
3205 #else
3206 if (offset == textlength ||
3207 (! search->isOverlap &&
3208 (search->matchedIndex != USEARCH_DONE &&
3209 offset + search->matchedLength > textlength))) {
3210 // not enough characters to match
3211 setMatchNotFound(strsrch);
3212 return USEARCH_DONE;
3213 }
3214 #endif
3215 }
3216 else {
3217 // switching direction.
3218 // if matchedIndex == USEARCH_DONE, it means that either a
3219 // setOffset has been called or that previous ran off the text
3220 // string. the iterator would have been set to offset 0 if a
3221 // match is not found.
3222 search->isForwardSearching = TRUE;
3223 if (search->matchedIndex != USEARCH_DONE) {
3224 // there's no need to set the collation element iterator
3225 // the next call to next will set the offset.
3226 return search->matchedIndex;
3227 }
3228 }
3229
3230 if (U_SUCCESS(*status)) {
3231 if (strsrch->pattern.cesLength == 0) {
3232 if (search->matchedIndex == USEARCH_DONE) {
3233 search->matchedIndex = offset;
3234 }
3235 else { // moves by codepoints
3236 U16_FWD_1(search->text, search->matchedIndex, textlength);
3237 }
3238
3239 search->matchedLength = 0;
3240 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3241 // status checked below
3242 if (search->matchedIndex == textlength) {
3243 search->matchedIndex = USEARCH_DONE;
3244 }
3245 }
3246 else {
3247 if (search->matchedLength > 0) {
3248 // if matchlength is 0 we are at the start of the iteration
3249 if (search->isOverlap) {
3250 ucol_setOffset(strsrch->textIter, offset + 1, status);
3251 }
3252 else {
3253 ucol_setOffset(strsrch->textIter,
3254 offset + search->matchedLength, status);
3255 }
3256 }
3257 else {
3258 // for boundary check purposes. this will ensure that the
3259 // next match will not preceed the current offset
3260 // note search->matchedIndex will always be set to something
3261 // in the code
3262 search->matchedIndex = offset - 1;
3263 }
3264
3265 if (search->isCanonicalMatch) {
3266 // can't use exact here since extra accents are allowed.
3267 usearch_handleNextCanonical(strsrch, status);
3268 }
3269 else {
3270 usearch_handleNextExact(strsrch, status);
3271 }
3272 }
3273
3274 if (U_FAILURE(*status)) {
3275 return USEARCH_DONE;
3276 }
3277
3278 #if !BOYER_MOORE
3279 if (search->matchedIndex == USEARCH_DONE) {
3280 ucol_setOffset(strsrch->textIter, search->textLength, status);
3281 } else {
3282 ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
3283 }
3284 #endif
3285
3286 return search->matchedIndex;
3287 }
3288 }
3289 return USEARCH_DONE;
3290 }
3291
usearch_previous(UStringSearch * strsrch,UErrorCode * status)3292 U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3293 UErrorCode *status)
3294 {
3295 if (U_SUCCESS(*status) && strsrch) {
3296 int32_t offset;
3297 USearch *search = strsrch->search;
3298 if (search->reset) {
3299 offset = search->textLength;
3300 search->isForwardSearching = FALSE;
3301 search->reset = FALSE;
3302 setColEIterOffset(strsrch->textIter, offset);
3303 }
3304 else {
3305 offset = usearch_getOffset(strsrch);
3306 }
3307
3308 int32_t matchedindex = search->matchedIndex;
3309 if (search->isForwardSearching == TRUE) {
3310 // switching direction.
3311 // if matchedIndex == USEARCH_DONE, it means that either a
3312 // setOffset has been called or that next ran off the text
3313 // string. the iterator would have been set to offset textLength if
3314 // a match is not found.
3315 search->isForwardSearching = FALSE;
3316 if (matchedindex != USEARCH_DONE) {
3317 return matchedindex;
3318 }
3319 }
3320 else {
3321 #if BOYER_MOORE
3322 if (offset == 0 || matchedindex == 0 ||
3323 (!search->isOverlap &&
3324 (offset < strsrch->pattern.defaultShiftSize ||
3325 (matchedindex != USEARCH_DONE &&
3326 matchedindex < strsrch->pattern.defaultShiftSize)))) {
3327 // not enough characters to match
3328 setMatchNotFound(strsrch);
3329 return USEARCH_DONE;
3330 }
3331 #else
3332 // Could check pattern length, but the
3333 // linear search will do the right thing
3334 if (offset == 0 || matchedindex == 0) {
3335 setMatchNotFound(strsrch);
3336 return USEARCH_DONE;
3337 }
3338 #endif
3339 }
3340
3341 if (U_SUCCESS(*status)) {
3342 if (strsrch->pattern.cesLength == 0) {
3343 search->matchedIndex =
3344 (matchedindex == USEARCH_DONE ? offset : matchedindex);
3345 if (search->matchedIndex == 0) {
3346 setMatchNotFound(strsrch);
3347 // status checked below
3348 }
3349 else { // move by codepoints
3350 U16_BACK_1(search->text, 0, search->matchedIndex);
3351 setColEIterOffset(strsrch->textIter, search->matchedIndex);
3352 // status checked below
3353 search->matchedLength = 0;
3354 }
3355 }
3356 else {
3357 if (strsrch->search->isCanonicalMatch) {
3358 // can't use exact here since extra accents are allowed.
3359 usearch_handlePreviousCanonical(strsrch, status);
3360 // status checked below
3361 }
3362 else {
3363 usearch_handlePreviousExact(strsrch, status);
3364 // status checked below
3365 }
3366 }
3367
3368 if (U_FAILURE(*status)) {
3369 return USEARCH_DONE;
3370 }
3371
3372 return search->matchedIndex;
3373 }
3374 }
3375 return USEARCH_DONE;
3376 }
3377
3378
3379
usearch_reset(UStringSearch * strsrch)3380 U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3381 {
3382 /*
3383 reset is setting the attributes that are already in
3384 string search, hence all attributes in the collator should
3385 be retrieved without any problems
3386 */
3387 if (strsrch) {
3388 UErrorCode status = U_ZERO_ERROR;
3389 UBool sameCollAttribute = TRUE;
3390 uint32_t ceMask;
3391 UBool shift;
3392 uint32_t varTop;
3393
3394 // **** hack to deal w/ how processed CEs encode quaternary ****
3395 UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
3396 if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
3397 (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
3398 sameCollAttribute = FALSE;
3399 }
3400
3401 strsrch->strength = ucol_getStrength(strsrch->collator);
3402 ceMask = getMask(strsrch->strength);
3403 if (strsrch->ceMask != ceMask) {
3404 strsrch->ceMask = ceMask;
3405 sameCollAttribute = FALSE;
3406 }
3407
3408 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3409 shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3410 &status) == UCOL_SHIFTED;
3411 if (strsrch->toShift != shift) {
3412 strsrch->toShift = shift;
3413 sameCollAttribute = FALSE;
3414 }
3415
3416 // if status is a failure, ucol_getVariableTop returns 0
3417 varTop = ucol_getVariableTop(strsrch->collator, &status);
3418 if (strsrch->variableTop != varTop) {
3419 strsrch->variableTop = varTop;
3420 sameCollAttribute = FALSE;
3421 }
3422 if (!sameCollAttribute) {
3423 initialize(strsrch, &status);
3424 }
3425 ucol_setText(strsrch->textIter, strsrch->search->text,
3426 strsrch->search->textLength,
3427 &status);
3428 strsrch->search->matchedLength = 0;
3429 strsrch->search->matchedIndex = USEARCH_DONE;
3430 strsrch->search->isOverlap = FALSE;
3431 strsrch->search->isCanonicalMatch = FALSE;
3432 strsrch->search->elementComparisonType = 0;
3433 strsrch->search->isForwardSearching = TRUE;
3434 strsrch->search->reset = TRUE;
3435 }
3436 }
3437
3438 //
3439 // CEI Collation Element + source text index.
3440 // These structs are kept in the circular buffer.
3441 //
3442 struct CEI {
3443 int64_t ce;
3444 int32_t lowIndex;
3445 int32_t highIndex;
3446 };
3447
3448 U_NAMESPACE_BEGIN
3449
3450 namespace {
3451 //
3452 // CEIBuffer A circular buffer of CEs-with-index from the text being searched.
3453 //
3454 #define DEFAULT_CEBUFFER_SIZE 96
3455 #define CEBUFFER_EXTRA 32
3456 // Some typical max values to make buffer size more reasonable for asymmetric search.
3457 // #8694 is for a better long-term solution to allocation of this buffer.
3458 #define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3459 #define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3460 #define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
3461 struct CEIBuffer {
3462 CEI defBuf[DEFAULT_CEBUFFER_SIZE];
3463 CEI *buf;
3464 int32_t bufSize;
3465 int32_t firstIx;
3466 int32_t limitIx;
3467 UCollationElements *ceIter;
3468 UStringSearch *strSearch;
3469
3470
3471
3472 CEIBuffer(UStringSearch *ss, UErrorCode *status);
3473 ~CEIBuffer();
3474 const CEI *get(int32_t index);
3475 const CEI *getPrevious(int32_t index);
3476 };
3477
3478
CEIBuffer(UStringSearch * ss,UErrorCode * status)3479 CEIBuffer::CEIBuffer(UStringSearch *ss, UErrorCode *status) {
3480 buf = defBuf;
3481 strSearch = ss;
3482 bufSize = ss->pattern.pcesLength + CEBUFFER_EXTRA;
3483 if (ss->search->elementComparisonType != 0) {
3484 const UChar * patText = ss->pattern.text;
3485 if (patText) {
3486 const UChar * patTextLimit = patText + ss->pattern.textLength;
3487 while ( patText < patTextLimit ) {
3488 UChar c = *patText++;
3489 if (MIGHT_BE_JAMO_L(c)) {
3490 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
3491 } else {
3492 // No check for surrogates, we might allocate slightly more buffer than necessary.
3493 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
3494 }
3495 }
3496 }
3497 }
3498 ceIter = ss->textIter;
3499 firstIx = 0;
3500 limitIx = 0;
3501
3502 if (!initTextProcessedIter(ss, status)) { return; }
3503
3504 if (bufSize>DEFAULT_CEBUFFER_SIZE) {
3505 buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
3506 if (buf == NULL) {
3507 *status = U_MEMORY_ALLOCATION_ERROR;
3508 }
3509 }
3510 }
3511
3512 // TODO: add a reset or init function so that allocated
3513 // buffers can be retained & reused.
3514
~CEIBuffer()3515 CEIBuffer::~CEIBuffer() {
3516 if (buf != defBuf) {
3517 uprv_free(buf);
3518 }
3519 }
3520
3521
3522 // Get the CE with the specified index.
3523 // Index must be in the range
3524 // n-history_size < index < n+1
3525 // where n is the largest index to have been fetched by some previous call to this function.
3526 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3527 //
get(int32_t index)3528 const CEI *CEIBuffer::get(int32_t index) {
3529 int i = index % bufSize;
3530
3531 if (index>=firstIx && index<limitIx) {
3532 // The request was for an entry already in our buffer.
3533 // Just return it.
3534 return &buf[i];
3535 }
3536
3537 // Caller is requesting a new, never accessed before, CE.
3538 // Verify that it is the next one in sequence, which is all
3539 // that is allowed.
3540 if (index != limitIx) {
3541 U_ASSERT(FALSE);
3542
3543 return NULL;
3544 }
3545
3546 // Manage the circular CE buffer indexing
3547 limitIx++;
3548
3549 if (limitIx - firstIx >= bufSize) {
3550 // The buffer is full, knock out the lowest-indexed entry.
3551 firstIx++;
3552 }
3553
3554 UErrorCode status = U_ZERO_ERROR;
3555
3556 buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3557
3558 return &buf[i];
3559 }
3560
3561 // Get the CE with the specified index.
3562 // Index must be in the range
3563 // n-history_size < index < n+1
3564 // where n is the largest index to have been fetched by some previous call to this function.
3565 // The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3566 //
getPrevious(int32_t index)3567 const CEI *CEIBuffer::getPrevious(int32_t index) {
3568 int i = index % bufSize;
3569
3570 if (index>=firstIx && index<limitIx) {
3571 // The request was for an entry already in our buffer.
3572 // Just return it.
3573 return &buf[i];
3574 }
3575
3576 // Caller is requesting a new, never accessed before, CE.
3577 // Verify that it is the next one in sequence, which is all
3578 // that is allowed.
3579 if (index != limitIx) {
3580 U_ASSERT(FALSE);
3581
3582 return NULL;
3583 }
3584
3585 // Manage the circular CE buffer indexing
3586 limitIx++;
3587
3588 if (limitIx - firstIx >= bufSize) {
3589 // The buffer is full, knock out the lowest-indexed entry.
3590 firstIx++;
3591 }
3592
3593 UErrorCode status = U_ZERO_ERROR;
3594
3595 buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3596
3597 return &buf[i];
3598 }
3599
3600 }
3601
3602 U_NAMESPACE_END
3603
3604
3605 // #define USEARCH_DEBUG
3606
3607 #ifdef USEARCH_DEBUG
3608 #include <stdio.h>
3609 #include <stdlib.h>
3610 #endif
3611
3612 /*
3613 * Find the next break boundary after startIndex. If the UStringSearch object
3614 * has an external break iterator, use that. Otherwise use the internal character
3615 * break iterator.
3616 */
nextBoundaryAfter(UStringSearch * strsrch,int32_t startIndex)3617 static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
3618 #if 0
3619 const UChar *text = strsrch->search->text;
3620 int32_t textLen = strsrch->search->textLength;
3621
3622 U_ASSERT(startIndex>=0);
3623 U_ASSERT(startIndex<=textLen);
3624
3625 if (startIndex >= textLen) {
3626 return startIndex;
3627 }
3628
3629 UChar32 c;
3630 int32_t i = startIndex;
3631 U16_NEXT(text, i, textLen, c);
3632
3633 // If we are on a control character, stop without looking for combining marks.
3634 // Control characters do not combine.
3635 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3636 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
3637 return i;
3638 }
3639
3640 // The initial character was not a control, and can thus accept trailing
3641 // combining characters. Advance over however many of them there are.
3642 int32_t indexOfLastCharChecked;
3643 for (;;) {
3644 indexOfLastCharChecked = i;
3645 if (i>=textLen) {
3646 break;
3647 }
3648 U16_NEXT(text, i, textLen, c);
3649 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3650 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3651 break;
3652 }
3653 }
3654 return indexOfLastCharChecked;
3655 #elif !UCONFIG_NO_BREAK_ITERATION
3656 UBreakIterator *breakiterator = strsrch->search->breakIter;
3657
3658 if (breakiterator == NULL) {
3659 breakiterator = strsrch->search->internalBreakIter;
3660 }
3661
3662 if (breakiterator != NULL) {
3663 return ubrk_following(breakiterator, startIndex);
3664 }
3665
3666 return startIndex;
3667 #else
3668 // **** or should we use the original code? ****
3669 return startIndex;
3670 #endif
3671
3672 }
3673
3674 /*
3675 * Returns TRUE if index is on a break boundary. If the UStringSearch
3676 * has an external break iterator, test using that, otherwise test
3677 * using the internal character break iterator.
3678 */
isBreakBoundary(UStringSearch * strsrch,int32_t index)3679 static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
3680 #if 0
3681 const UChar *text = strsrch->search->text;
3682 int32_t textLen = strsrch->search->textLength;
3683
3684 U_ASSERT(index>=0);
3685 U_ASSERT(index<=textLen);
3686
3687 if (index>=textLen || index<=0) {
3688 return TRUE;
3689 }
3690
3691 // If the character at the current index is not a GRAPHEME_EXTEND
3692 // then we can not be within a combining sequence.
3693 UChar32 c;
3694 U16_GET(text, 0, index, textLen, c);
3695 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3696 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3697 return TRUE;
3698 }
3699
3700 // We are at a combining mark. If the preceding character is anything
3701 // except a CONTROL, CR or LF, we are in a combining sequence.
3702 U16_PREV(text, 0, index, c);
3703 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3704 UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
3705 return !combining;
3706 #elif !UCONFIG_NO_BREAK_ITERATION
3707 UBreakIterator *breakiterator = strsrch->search->breakIter;
3708
3709 if (breakiterator == NULL) {
3710 breakiterator = strsrch->search->internalBreakIter;
3711 }
3712
3713 return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
3714 #else
3715 // **** or use the original code? ****
3716 return TRUE;
3717 #endif
3718 }
3719
3720 #if 0
3721 static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
3722 {
3723 #if !UCONFIG_NO_BREAK_ITERATION
3724 UBreakIterator *breakiterator = strsrch->search->breakIter;
3725
3726 if (breakiterator != NULL) {
3727 int32_t startindex = ubrk_first(breakiterator);
3728 int32_t endindex = ubrk_last(breakiterator);
3729
3730 // out-of-range indexes are never boundary positions
3731 if (start < startindex || start > endindex ||
3732 end < startindex || end > endindex) {
3733 return FALSE;
3734 }
3735
3736 return ubrk_isBoundary(breakiterator, start) &&
3737 ubrk_isBoundary(breakiterator, end);
3738 }
3739 #endif
3740
3741 return TRUE;
3742 }
3743 #endif
3744
3745 typedef enum {
3746 U_CE_MATCH = -1,
3747 U_CE_NO_MATCH = 0,
3748 U_CE_SKIP_TARG,
3749 U_CE_SKIP_PATN
3750 } UCompareCEsResult;
3751 #define U_CE_LEVEL2_BASE 0x00000005
3752 #define U_CE_LEVEL3_BASE 0x00050000
3753
compareCE64s(int64_t targCE,int64_t patCE,int16_t compareType)3754 static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
3755 if (targCE == patCE) {
3756 return U_CE_MATCH;
3757 }
3758 if (compareType == 0) {
3759 return U_CE_NO_MATCH;
3760 }
3761
3762 int64_t targCEshifted = targCE >> 32;
3763 int64_t patCEshifted = patCE >> 32;
3764 int64_t mask;
3765
3766 mask = 0xFFFF0000;
3767 int32_t targLev1 = (int32_t)(targCEshifted & mask);
3768 int32_t patLev1 = (int32_t)(patCEshifted & mask);
3769 if ( targLev1 != patLev1 ) {
3770 if ( targLev1 == 0 ) {
3771 return U_CE_SKIP_TARG;
3772 }
3773 if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3774 return U_CE_SKIP_PATN;
3775 }
3776 return U_CE_NO_MATCH;
3777 }
3778
3779 mask = 0x0000FFFF;
3780 int32_t targLev2 = (int32_t)(targCEshifted & mask);
3781 int32_t patLev2 = (int32_t)(patCEshifted & mask);
3782 if ( targLev2 != patLev2 ) {
3783 if ( targLev2 == 0 ) {
3784 return U_CE_SKIP_TARG;
3785 }
3786 if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3787 return U_CE_SKIP_PATN;
3788 }
3789 return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
3790 U_CE_MATCH: U_CE_NO_MATCH;
3791 }
3792
3793 mask = 0xFFFF0000;
3794 int32_t targLev3 = (int32_t)(targCE & mask);
3795 int32_t patLev3 = (int32_t)(patCE & mask);
3796 if ( targLev3 != patLev3 ) {
3797 return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
3798 U_CE_MATCH: U_CE_NO_MATCH;
3799 }
3800
3801 return U_CE_MATCH;
3802 }
3803
3804 #if BOYER_MOORE
3805 // TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3806 #endif
3807
usearch_search(UStringSearch * strsrch,int32_t startIdx,int32_t * matchStart,int32_t * matchLimit,UErrorCode * status)3808 U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch,
3809 int32_t startIdx,
3810 int32_t *matchStart,
3811 int32_t *matchLimit,
3812 UErrorCode *status)
3813 {
3814 if (U_FAILURE(*status)) {
3815 return FALSE;
3816 }
3817
3818 // TODO: reject search patterns beginning with a combining char.
3819
3820 #ifdef USEARCH_DEBUG
3821 if (getenv("USEARCH_DEBUG") != NULL) {
3822 printf("Pattern CEs\n");
3823 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
3824 printf(" %8x", strsrch->pattern.ces[ii]);
3825 }
3826 printf("\n");
3827 }
3828
3829 #endif
3830 // Input parameter sanity check.
3831 // TODO: should input indicies clip to the text length
3832 // in the same way that UText does.
3833 if(strsrch->pattern.cesLength == 0 ||
3834 startIdx < 0 ||
3835 startIdx > strsrch->search->textLength ||
3836 strsrch->pattern.ces == NULL) {
3837 *status = U_ILLEGAL_ARGUMENT_ERROR;
3838 return FALSE;
3839 }
3840
3841 if (strsrch->pattern.pces == NULL) {
3842 initializePatternPCETable(strsrch, status);
3843 }
3844
3845 ucol_setOffset(strsrch->textIter, startIdx, status);
3846 CEIBuffer ceb(strsrch, status);
3847
3848
3849 int32_t targetIx = 0;
3850 const CEI *targetCEI = NULL;
3851 int32_t patIx;
3852 UBool found;
3853
3854 int32_t mStart = -1;
3855 int32_t mLimit = -1;
3856 int32_t minLimit;
3857 int32_t maxLimit;
3858
3859
3860
3861 // Outer loop moves over match starting positions in the
3862 // target CE space.
3863 // Here we see the target as a sequence of collation elements, resulting from the following:
3864 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3865 // (for example, digraphs such as IJ may be broken into two characters).
3866 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3867 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3868 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3869 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3870 // then the CE is deleted, so the following code sees only CEs that are relevant.
3871 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3872 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3873 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3874 //
3875 for(targetIx=0; ; targetIx++)
3876 {
3877 found = TRUE;
3878 // Inner loop checks for a match beginning at each
3879 // position from the outer loop.
3880 int32_t targetIxOffset = 0;
3881 int64_t patCE = 0;
3882 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3883 // (compared to the last CE fetched for the previous targetIx value) as we need to go
3884 // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3885 const CEI *firstCEI = ceb.get(targetIx);
3886 if (firstCEI == NULL) {
3887 *status = U_INTERNAL_PROGRAM_ERROR;
3888 found = FALSE;
3889 break;
3890 }
3891
3892 for (patIx=0; patIx<strsrch->pattern.pcesLength; patIx++) {
3893 patCE = strsrch->pattern.pces[patIx];
3894 targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
3895 // Compare CE from target string with CE from the pattern.
3896 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
3897 // which will fail the compare, below.
3898 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
3899 if ( ceMatch == U_CE_NO_MATCH ) {
3900 found = FALSE;
3901 break;
3902 } else if ( ceMatch > U_CE_NO_MATCH ) {
3903 if ( ceMatch == U_CE_SKIP_TARG ) {
3904 // redo with same patCE, next targCE
3905 patIx--;
3906 targetIxOffset++;
3907 } else { // ceMatch == U_CE_SKIP_PATN
3908 // redo with same targCE, next patCE
3909 targetIxOffset--;
3910 }
3911 }
3912 }
3913 targetIxOffset += strsrch->pattern.pcesLength; // this is now the offset in target CE space to end of the match so far
3914
3915 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
3916 // No match at this targetIx. Try again at the next.
3917 continue;
3918 }
3919
3920 if (!found) {
3921 // No match at all, we have run off the end of the target text.
3922 break;
3923 }
3924
3925
3926 // We have found a match in CE space.
3927 // Now determine the bounds in string index space.
3928 // There still is a chance of match failure if the CE range not correspond to
3929 // an acceptable character range.
3930 //
3931 const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - 1);
3932
3933 mStart = firstCEI->lowIndex;
3934 minLimit = lastCEI->lowIndex;
3935
3936 // Look at the CE following the match. If it is UCOL_NULLORDER the match
3937 // extended to the end of input, and the match is good.
3938
3939 // Look at the high and low indices of the CE following the match. If
3940 // they are the same it means one of two things:
3941 // 1. The match extended to the last CE from the target text, which is OK, or
3942 // 2. The last CE that was part of the match is in an expansion that extends
3943 // to the first CE after the match. In this case, we reject the match.
3944 const CEI *nextCEI = 0;
3945 if (strsrch->search->elementComparisonType == 0) {
3946 nextCEI = ceb.get(targetIx + targetIxOffset);
3947 maxLimit = nextCEI->lowIndex;
3948 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
3949 found = FALSE;
3950 }
3951 } else {
3952 for ( ; ; ++targetIxOffset ) {
3953 nextCEI = ceb.get(targetIx + targetIxOffset);
3954 maxLimit = nextCEI->lowIndex;
3955 // If we are at the end of the target too, match succeeds
3956 if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
3957 break;
3958 }
3959 // As long as the next CE has primary weight of 0,
3960 // it is part of the last target element matched by the pattern;
3961 // make sure it can be part of a match with the last patCE
3962 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
3963 UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
3964 if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
3965 found = FALSE;
3966 break;
3967 }
3968 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
3969 // target element, but it has non-zero primary weight => match fails
3970 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
3971 found = false;
3972 break;
3973 // Else the target CE is not part of an expansion of the last matched element, match succeeds
3974 } else {
3975 break;
3976 }
3977 }
3978 }
3979
3980
3981 // Check for the start of the match being within a combining sequence.
3982 // This can happen if the pattern itself begins with a combining char, and
3983 // the match found combining marks in the target text that were attached
3984 // to something else.
3985 // This type of match should be rejected for not completely consuming a
3986 // combining sequence.
3987 if (!isBreakBoundary(strsrch, mStart)) {
3988 found = FALSE;
3989 }
3990
3991 // Check for the start of the match being within an Collation Element Expansion,
3992 // meaning that the first char of the match is only partially matched.
3993 // With exapnsions, the first CE will report the index of the source
3994 // character, and all subsequent (expansions) CEs will report the source index of the
3995 // _following_ character.
3996 int32_t secondIx = firstCEI->highIndex;
3997 if (mStart == secondIx) {
3998 found = FALSE;
3999 }
4000
4001 // Advance the match end position to the first acceptable match boundary.
4002 // This advances the index over any combining charcters.
4003 mLimit = maxLimit;
4004 if (minLimit < maxLimit) {
4005 // When the last CE's low index is same with its high index, the CE is likely
4006 // a part of expansion. In this case, the index is located just after the
4007 // character corresponding to the CEs compared above. If the index is right
4008 // at the break boundary, move the position to the next boundary will result
4009 // incorrect match length when there are ignorable characters exist between
4010 // the position and the next character produces CE(s). See ticket#8482.
4011 if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
4012 mLimit = minLimit;
4013 } else {
4014 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4015 if (nba >= lastCEI->highIndex) {
4016 mLimit = nba;
4017 }
4018 }
4019 }
4020
4021 #ifdef USEARCH_DEBUG
4022 if (getenv("USEARCH_DEBUG") != NULL) {
4023 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4024 }
4025 #endif
4026
4027 // If advancing to the end of a combining sequence in character indexing space
4028 // advanced us beyond the end of the match in CE space, reject this match.
4029 if (mLimit > maxLimit) {
4030 found = FALSE;
4031 }
4032
4033 if (!isBreakBoundary(strsrch, mLimit)) {
4034 found = FALSE;
4035 }
4036
4037 if (! checkIdentical(strsrch, mStart, mLimit)) {
4038 found = FALSE;
4039 }
4040
4041 if (found) {
4042 break;
4043 }
4044 }
4045
4046 #ifdef USEARCH_DEBUG
4047 if (getenv("USEARCH_DEBUG") != NULL) {
4048 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4049 int32_t lastToPrint = ceb.limitIx+2;
4050 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4051 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4052 }
4053 printf("\n%s\n", found? "match found" : "no match");
4054 }
4055 #endif
4056
4057 // All Done. Store back the match bounds to the caller.
4058 //
4059 if (found==FALSE) {
4060 mLimit = -1;
4061 mStart = -1;
4062 }
4063
4064 if (matchStart != NULL) {
4065 *matchStart= mStart;
4066 }
4067
4068 if (matchLimit != NULL) {
4069 *matchLimit = mLimit;
4070 }
4071
4072 return found;
4073 }
4074
usearch_searchBackwards(UStringSearch * strsrch,int32_t startIdx,int32_t * matchStart,int32_t * matchLimit,UErrorCode * status)4075 U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch,
4076 int32_t startIdx,
4077 int32_t *matchStart,
4078 int32_t *matchLimit,
4079 UErrorCode *status)
4080 {
4081 if (U_FAILURE(*status)) {
4082 return FALSE;
4083 }
4084
4085 // TODO: reject search patterns beginning with a combining char.
4086
4087 #ifdef USEARCH_DEBUG
4088 if (getenv("USEARCH_DEBUG") != NULL) {
4089 printf("Pattern CEs\n");
4090 for (int ii=0; ii<strsrch->pattern.cesLength; ii++) {
4091 printf(" %8x", strsrch->pattern.ces[ii]);
4092 }
4093 printf("\n");
4094 }
4095
4096 #endif
4097 // Input parameter sanity check.
4098 // TODO: should input indicies clip to the text length
4099 // in the same way that UText does.
4100 if(strsrch->pattern.cesLength == 0 ||
4101 startIdx < 0 ||
4102 startIdx > strsrch->search->textLength ||
4103 strsrch->pattern.ces == NULL) {
4104 *status = U_ILLEGAL_ARGUMENT_ERROR;
4105 return FALSE;
4106 }
4107
4108 if (strsrch->pattern.pces == NULL) {
4109 initializePatternPCETable(strsrch, status);
4110 }
4111
4112 CEIBuffer ceb(strsrch, status);
4113 int32_t targetIx = 0;
4114
4115 /*
4116 * Pre-load the buffer with the CE's for the grapheme
4117 * after our starting position so that we're sure that
4118 * we can look at the CE following the match when we
4119 * check the match boundaries.
4120 *
4121 * This will also pre-fetch the first CE that we'll
4122 * consider for the match.
4123 */
4124 if (startIdx < strsrch->search->textLength) {
4125 UBreakIterator *bi = strsrch->search->internalBreakIter;
4126 int32_t next = ubrk_following(bi, startIdx);
4127
4128 ucol_setOffset(strsrch->textIter, next, status);
4129
4130 for (targetIx = 0; ; targetIx += 1) {
4131 if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
4132 break;
4133 }
4134 }
4135 } else {
4136 ucol_setOffset(strsrch->textIter, startIdx, status);
4137 }
4138
4139
4140 const CEI *targetCEI = NULL;
4141 int32_t patIx;
4142 UBool found;
4143
4144 int32_t limitIx = targetIx;
4145 int32_t mStart = -1;
4146 int32_t mLimit = -1;
4147 int32_t minLimit;
4148 int32_t maxLimit;
4149
4150
4151
4152 // Outer loop moves over match starting positions in the
4153 // target CE space.
4154 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4155 // But patIx is 0 at the beginning of the pattern and increases toward the end.
4156 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4157 // and the beginning of the base text.
4158 for(targetIx = limitIx; ; targetIx += 1)
4159 {
4160 found = TRUE;
4161 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4162 // (compared to the last CE fetched for the previous targetIx value) as we need to go
4163 // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4164 const CEI *lastCEI = ceb.getPrevious(targetIx);
4165 if (lastCEI == NULL) {
4166 *status = U_INTERNAL_PROGRAM_ERROR;
4167 found = FALSE;
4168 break;
4169 }
4170 // Inner loop checks for a match beginning at each
4171 // position from the outer loop.
4172 int32_t targetIxOffset = 0;
4173 for (patIx = strsrch->pattern.pcesLength - 1; patIx >= 0; patIx -= 1) {
4174 int64_t patCE = strsrch->pattern.pces[patIx];
4175
4176 targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 - patIx + targetIxOffset);
4177 // Compare CE from target string with CE from the pattern.
4178 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4179 // which will fail the compare, below.
4180 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
4181 if ( ceMatch == U_CE_NO_MATCH ) {
4182 found = FALSE;
4183 break;
4184 } else if ( ceMatch > U_CE_NO_MATCH ) {
4185 if ( ceMatch == U_CE_SKIP_TARG ) {
4186 // redo with same patCE, next targCE
4187 patIx++;
4188 targetIxOffset++;
4189 } else { // ceMatch == U_CE_SKIP_PATN
4190 // redo with same targCE, next patCE
4191 targetIxOffset--;
4192 }
4193 }
4194 }
4195
4196 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
4197 // No match at this targetIx. Try again at the next.
4198 continue;
4199 }
4200
4201 if (!found) {
4202 // No match at all, we have run off the end of the target text.
4203 break;
4204 }
4205
4206
4207 // We have found a match in CE space.
4208 // Now determine the bounds in string index space.
4209 // There still is a chance of match failure if the CE range not correspond to
4210 // an acceptable character range.
4211 //
4212 const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.pcesLength - 1 + targetIxOffset);
4213 mStart = firstCEI->lowIndex;
4214
4215 // Check for the start of the match being within a combining sequence.
4216 // This can happen if the pattern itself begins with a combining char, and
4217 // the match found combining marks in the target text that were attached
4218 // to something else.
4219 // This type of match should be rejected for not completely consuming a
4220 // combining sequence.
4221 if (!isBreakBoundary(strsrch, mStart)) {
4222 found = FALSE;
4223 }
4224
4225 // Look at the high index of the first CE in the match. If it's the same as the
4226 // low index, the first CE in the match is in the middle of an expansion.
4227 if (mStart == firstCEI->highIndex) {
4228 found = FALSE;
4229 }
4230
4231
4232 minLimit = lastCEI->lowIndex;
4233
4234 if (targetIx > 0) {
4235 // Look at the CE following the match. If it is UCOL_NULLORDER the match
4236 // extended to the end of input, and the match is good.
4237
4238 // Look at the high and low indices of the CE following the match. If
4239 // they are the same it means one of two things:
4240 // 1. The match extended to the last CE from the target text, which is OK, or
4241 // 2. The last CE that was part of the match is in an expansion that extends
4242 // to the first CE after the match. In this case, we reject the match.
4243 const CEI *nextCEI = ceb.getPrevious(targetIx - 1);
4244
4245 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
4246 found = FALSE;
4247 }
4248
4249 mLimit = maxLimit = nextCEI->lowIndex;
4250
4251 // Advance the match end position to the first acceptable match boundary.
4252 // This advances the index over any combining charcters.
4253 if (minLimit < maxLimit) {
4254 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4255
4256 if (nba >= lastCEI->highIndex) {
4257 mLimit = nba;
4258 }
4259 }
4260
4261 // If advancing to the end of a combining sequence in character indexing space
4262 // advanced us beyond the end of the match in CE space, reject this match.
4263 if (mLimit > maxLimit) {
4264 found = FALSE;
4265 }
4266
4267 // Make sure the end of the match is on a break boundary
4268 if (!isBreakBoundary(strsrch, mLimit)) {
4269 found = FALSE;
4270 }
4271
4272 } else {
4273 // No non-ignorable CEs after this point.
4274 // The maximum position is detected by boundary after
4275 // the last non-ignorable CE. Combining sequence
4276 // across the start index will be truncated.
4277 int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4278 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
4279 }
4280
4281 #ifdef USEARCH_DEBUG
4282 if (getenv("USEARCH_DEBUG") != NULL) {
4283 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4284 }
4285 #endif
4286
4287
4288 if (! checkIdentical(strsrch, mStart, mLimit)) {
4289 found = FALSE;
4290 }
4291
4292 if (found) {
4293 break;
4294 }
4295 }
4296
4297 #ifdef USEARCH_DEBUG
4298 if (getenv("USEARCH_DEBUG") != NULL) {
4299 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4300 int32_t lastToPrint = ceb.limitIx+2;
4301 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4302 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4303 }
4304 printf("\n%s\n", found? "match found" : "no match");
4305 }
4306 #endif
4307
4308 // All Done. Store back the match bounds to the caller.
4309 //
4310 if (found==FALSE) {
4311 mLimit = -1;
4312 mStart = -1;
4313 }
4314
4315 if (matchStart != NULL) {
4316 *matchStart= mStart;
4317 }
4318
4319 if (matchLimit != NULL) {
4320 *matchLimit = mLimit;
4321 }
4322
4323 return found;
4324 }
4325
4326 // internal use methods declared in usrchimp.h -----------------------------
4327
usearch_handleNextExact(UStringSearch * strsrch,UErrorCode * status)4328 UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
4329 {
4330 if (U_FAILURE(*status)) {
4331 setMatchNotFound(strsrch);
4332 return FALSE;
4333 }
4334
4335 #if BOYER_MOORE
4336 UCollationElements *coleiter = strsrch->textIter;
4337 int32_t textlength = strsrch->search->textLength;
4338 int32_t *patternce = strsrch->pattern.ces;
4339 int32_t patterncelength = strsrch->pattern.cesLength;
4340 int32_t textoffset = ucol_getOffset(coleiter);
4341
4342 // status used in setting coleiter offset, since offset is checked in
4343 // shiftForward before setting the coleiter offset, status never
4344 // a failure
4345 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4346 patterncelength);
4347 while (textoffset <= textlength)
4348 {
4349 uint32_t patternceindex = patterncelength - 1;
4350 int32_t targetce;
4351 UBool found = FALSE;
4352 int32_t lastce = UCOL_NULLORDER;
4353
4354 setColEIterOffset(coleiter, textoffset);
4355
4356 for (;;) {
4357 // finding the last pattern ce match, imagine composite characters
4358 // for example: search for pattern A in text \u00C0
4359 // we'll have to skip \u0300 the grave first before we get to A
4360 targetce = ucol_previous(coleiter, status);
4361 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4362 found = FALSE;
4363 break;
4364 }
4365 targetce = getCE(strsrch, targetce);
4366 if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
4367 // this is for the text \u0315\u0300 that requires
4368 // normalization and pattern \u0300, where \u0315 is ignorable
4369 continue;
4370 }
4371 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4372 lastce = targetce;
4373 }
4374 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4375 if (targetce == patternce[patternceindex]) {
4376 // the first ce can be a contraction
4377 found = TRUE;
4378 break;
4379 }
4380 if (!hasExpansion(coleiter)) {
4381 found = FALSE;
4382 break;
4383 }
4384 }
4385
4386 //targetce = lastce;
4387
4388 while (found && patternceindex > 0) {
4389 lastce = targetce;
4390 targetce = ucol_previous(coleiter, status);
4391 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4392 found = FALSE;
4393 break;
4394 }
4395 targetce = getCE(strsrch, targetce);
4396 if (targetce == UCOL_IGNORABLE) {
4397 continue;
4398 }
4399
4400 patternceindex --;
4401 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4402 found = found && targetce == patternce[patternceindex];
4403 }
4404
4405 targetce = lastce;
4406
4407 if (!found) {
4408 if (U_FAILURE(*status)) {
4409 break;
4410 }
4411 textoffset = shiftForward(strsrch, textoffset, lastce,
4412 patternceindex);
4413 // status checked at loop.
4414 patternceindex = patterncelength;
4415 continue;
4416 }
4417
4418 if (checkNextExactMatch(strsrch, &textoffset, status)) {
4419 // status checked in ucol_setOffset
4420 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4421 return TRUE;
4422 }
4423 }
4424 setMatchNotFound(strsrch);
4425 return FALSE;
4426 #else
4427 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4428 int32_t start = -1;
4429 int32_t end = -1;
4430
4431 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4432 strsrch->search->matchedIndex = start;
4433 strsrch->search->matchedLength = end - start;
4434 return TRUE;
4435 } else {
4436 setMatchNotFound(strsrch);
4437 return FALSE;
4438 }
4439 #endif
4440 }
4441
usearch_handleNextCanonical(UStringSearch * strsrch,UErrorCode * status)4442 UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
4443 {
4444 if (U_FAILURE(*status)) {
4445 setMatchNotFound(strsrch);
4446 return FALSE;
4447 }
4448
4449 #if BOYER_MOORE
4450 UCollationElements *coleiter = strsrch->textIter;
4451 int32_t textlength = strsrch->search->textLength;
4452 int32_t *patternce = strsrch->pattern.ces;
4453 int32_t patterncelength = strsrch->pattern.cesLength;
4454 int32_t textoffset = ucol_getOffset(coleiter);
4455 UBool hasPatternAccents =
4456 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4457
4458 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4459 patterncelength);
4460 strsrch->canonicalPrefixAccents[0] = 0;
4461 strsrch->canonicalSuffixAccents[0] = 0;
4462
4463 while (textoffset <= textlength)
4464 {
4465 int32_t patternceindex = patterncelength - 1;
4466 int32_t targetce;
4467 UBool found = FALSE;
4468 int32_t lastce = UCOL_NULLORDER;
4469
4470 setColEIterOffset(coleiter, textoffset);
4471
4472 for (;;) {
4473 // finding the last pattern ce match, imagine composite characters
4474 // for example: search for pattern A in text \u00C0
4475 // we'll have to skip \u0300 the grave first before we get to A
4476 targetce = ucol_previous(coleiter, status);
4477 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4478 found = FALSE;
4479 break;
4480 }
4481 targetce = getCE(strsrch, targetce);
4482 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4483 lastce = targetce;
4484 }
4485 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4486 if (targetce == patternce[patternceindex]) {
4487 // the first ce can be a contraction
4488 found = TRUE;
4489 break;
4490 }
4491 if (!hasExpansion(coleiter)) {
4492 found = FALSE;
4493 break;
4494 }
4495 }
4496
4497 while (found && patternceindex > 0) {
4498 targetce = ucol_previous(coleiter, status);
4499 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4500 found = FALSE;
4501 break;
4502 }
4503 targetce = getCE(strsrch, targetce);
4504 if (targetce == UCOL_IGNORABLE) {
4505 continue;
4506 }
4507
4508 patternceindex --;
4509 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4510 found = found && targetce == patternce[patternceindex];
4511 }
4512
4513 // initializing the rearranged accent array
4514 if (hasPatternAccents && !found) {
4515 strsrch->canonicalPrefixAccents[0] = 0;
4516 strsrch->canonicalSuffixAccents[0] = 0;
4517 if (U_FAILURE(*status)) {
4518 break;
4519 }
4520 found = doNextCanonicalMatch(strsrch, textoffset, status);
4521 }
4522
4523 if (!found) {
4524 if (U_FAILURE(*status)) {
4525 break;
4526 }
4527 textoffset = shiftForward(strsrch, textoffset, lastce,
4528 patternceindex);
4529 // status checked at loop
4530 patternceindex = patterncelength;
4531 continue;
4532 }
4533
4534 if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
4535 setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4536 return TRUE;
4537 }
4538 }
4539 setMatchNotFound(strsrch);
4540 return FALSE;
4541 #else
4542 int32_t textOffset = ucol_getOffset(strsrch->textIter);
4543 int32_t start = -1;
4544 int32_t end = -1;
4545
4546 if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4547 strsrch->search->matchedIndex = start;
4548 strsrch->search->matchedLength = end - start;
4549 return TRUE;
4550 } else {
4551 setMatchNotFound(strsrch);
4552 return FALSE;
4553 }
4554 #endif
4555 }
4556
usearch_handlePreviousExact(UStringSearch * strsrch,UErrorCode * status)4557 UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
4558 {
4559 if (U_FAILURE(*status)) {
4560 setMatchNotFound(strsrch);
4561 return FALSE;
4562 }
4563
4564 #if BOYER_MOORE
4565 UCollationElements *coleiter = strsrch->textIter;
4566 int32_t *patternce = strsrch->pattern.ces;
4567 int32_t patterncelength = strsrch->pattern.cesLength;
4568 int32_t textoffset = ucol_getOffset(coleiter);
4569
4570 // shifting it check for setting offset
4571 // if setOffset is called previously or there was no previous match, we
4572 // leave the offset as it is.
4573 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4574 textoffset = strsrch->search->matchedIndex;
4575 }
4576
4577 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4578 patterncelength);
4579
4580 while (textoffset >= 0)
4581 {
4582 int32_t patternceindex = 1;
4583 int32_t targetce;
4584 UBool found = FALSE;
4585 int32_t firstce = UCOL_NULLORDER;
4586
4587 // if status is a failure, ucol_setOffset does nothing
4588 setColEIterOffset(coleiter, textoffset);
4589
4590 for (;;) {
4591 // finding the first pattern ce match, imagine composite
4592 // characters. for example: search for pattern \u0300 in text
4593 // \u00C0, we'll have to skip A first before we get to
4594 // \u0300 the grave accent
4595 targetce = ucol_next(coleiter, status);
4596 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4597 found = FALSE;
4598 break;
4599 }
4600 targetce = getCE(strsrch, targetce);
4601 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4602 firstce = targetce;
4603 }
4604 if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
4605 continue;
4606 }
4607 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4608 if (targetce == patternce[0]) {
4609 found = TRUE;
4610 break;
4611 }
4612 if (!hasExpansion(coleiter)) {
4613 // checking for accents in composite character
4614 found = FALSE;
4615 break;
4616 }
4617 }
4618
4619 //targetce = firstce;
4620
4621 while (found && (patternceindex < patterncelength)) {
4622 firstce = targetce;
4623 targetce = ucol_next(coleiter, status);
4624 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4625 found = FALSE;
4626 break;
4627 }
4628 targetce = getCE(strsrch, targetce);
4629 if (targetce == UCOL_IGNORABLE) {
4630 continue;
4631 }
4632
4633 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4634 found = found && targetce == patternce[patternceindex];
4635 patternceindex ++;
4636 }
4637
4638 targetce = firstce;
4639
4640 if (!found) {
4641 if (U_FAILURE(*status)) {
4642 break;
4643 }
4644
4645 textoffset = reverseShift(strsrch, textoffset, targetce,
4646 patternceindex);
4647 patternceindex = 0;
4648 continue;
4649 }
4650
4651 if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
4652 setColEIterOffset(coleiter, textoffset);
4653 return TRUE;
4654 }
4655 }
4656 setMatchNotFound(strsrch);
4657 return FALSE;
4658 #else
4659 int32_t textOffset;
4660
4661 if (strsrch->search->isOverlap) {
4662 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4663 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4664 } else {
4665 // move the start position at the end of possible match
4666 initializePatternPCETable(strsrch, status);
4667 if (!initTextProcessedIter(strsrch, status)) {
4668 setMatchNotFound(strsrch);
4669 return FALSE;
4670 }
4671 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4672 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4673 if (pce == UCOL_PROCESSED_NULLORDER) {
4674 // at the end of the text
4675 break;
4676 }
4677 }
4678 if (U_FAILURE(*status)) {
4679 setMatchNotFound(strsrch);
4680 return FALSE;
4681 }
4682 textOffset = ucol_getOffset(strsrch->textIter);
4683 }
4684 } else {
4685 textOffset = ucol_getOffset(strsrch->textIter);
4686 }
4687
4688 int32_t start = -1;
4689 int32_t end = -1;
4690
4691 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4692 strsrch->search->matchedIndex = start;
4693 strsrch->search->matchedLength = end - start;
4694 return TRUE;
4695 } else {
4696 setMatchNotFound(strsrch);
4697 return FALSE;
4698 }
4699 #endif
4700 }
4701
usearch_handlePreviousCanonical(UStringSearch * strsrch,UErrorCode * status)4702 UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
4703 UErrorCode *status)
4704 {
4705 if (U_FAILURE(*status)) {
4706 setMatchNotFound(strsrch);
4707 return FALSE;
4708 }
4709
4710 #if BOYER_MOORE
4711 UCollationElements *coleiter = strsrch->textIter;
4712 int32_t *patternce = strsrch->pattern.ces;
4713 int32_t patterncelength = strsrch->pattern.cesLength;
4714 int32_t textoffset = ucol_getOffset(coleiter);
4715 UBool hasPatternAccents =
4716 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4717
4718 // shifting it check for setting offset
4719 // if setOffset is called previously or there was no previous match, we
4720 // leave the offset as it is.
4721 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4722 textoffset = strsrch->search->matchedIndex;
4723 }
4724
4725 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4726 patterncelength);
4727 strsrch->canonicalPrefixAccents[0] = 0;
4728 strsrch->canonicalSuffixAccents[0] = 0;
4729
4730 while (textoffset >= 0)
4731 {
4732 int32_t patternceindex = 1;
4733 int32_t targetce;
4734 UBool found = FALSE;
4735 int32_t firstce = UCOL_NULLORDER;
4736
4737 setColEIterOffset(coleiter, textoffset);
4738 for (;;) {
4739 // finding the first pattern ce match, imagine composite
4740 // characters. for example: search for pattern \u0300 in text
4741 // \u00C0, we'll have to skip A first before we get to
4742 // \u0300 the grave accent
4743 targetce = ucol_next(coleiter, status);
4744 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4745 found = FALSE;
4746 break;
4747 }
4748 targetce = getCE(strsrch, targetce);
4749 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4750 firstce = targetce;
4751 }
4752
4753 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4754 if (targetce == patternce[0]) {
4755 // the first ce can be a contraction
4756 found = TRUE;
4757 break;
4758 }
4759 if (!hasExpansion(coleiter)) {
4760 // checking for accents in composite character
4761 found = FALSE;
4762 break;
4763 }
4764 }
4765
4766 targetce = firstce;
4767
4768 while (found && patternceindex < patterncelength) {
4769 targetce = ucol_next(coleiter, status);
4770 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4771 found = FALSE;
4772 break;
4773 }
4774 targetce = getCE(strsrch, targetce);
4775 if (targetce == UCOL_IGNORABLE) {
4776 continue;
4777 }
4778
4779 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4780 found = found && targetce == patternce[patternceindex];
4781 patternceindex ++;
4782 }
4783
4784 // initializing the rearranged accent array
4785 if (hasPatternAccents && !found) {
4786 strsrch->canonicalPrefixAccents[0] = 0;
4787 strsrch->canonicalSuffixAccents[0] = 0;
4788 if (U_FAILURE(*status)) {
4789 break;
4790 }
4791 found = doPreviousCanonicalMatch(strsrch, textoffset, status);
4792 }
4793
4794 if (!found) {
4795 if (U_FAILURE(*status)) {
4796 break;
4797 }
4798 textoffset = reverseShift(strsrch, textoffset, targetce,
4799 patternceindex);
4800 patternceindex = 0;
4801 continue;
4802 }
4803
4804 if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
4805 setColEIterOffset(coleiter, textoffset);
4806 return TRUE;
4807 }
4808 }
4809 setMatchNotFound(strsrch);
4810 return FALSE;
4811 #else
4812 int32_t textOffset;
4813
4814 if (strsrch->search->isOverlap) {
4815 if (strsrch->search->matchedIndex != USEARCH_DONE) {
4816 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4817 } else {
4818 // move the start position at the end of possible match
4819 initializePatternPCETable(strsrch, status);
4820 if (!initTextProcessedIter(strsrch, status)) {
4821 setMatchNotFound(strsrch);
4822 return FALSE;
4823 }
4824 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.pcesLength - 1; nPCEs++) {
4825 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4826 if (pce == UCOL_PROCESSED_NULLORDER) {
4827 // at the end of the text
4828 break;
4829 }
4830 }
4831 if (U_FAILURE(*status)) {
4832 setMatchNotFound(strsrch);
4833 return FALSE;
4834 }
4835 textOffset = ucol_getOffset(strsrch->textIter);
4836 }
4837 } else {
4838 textOffset = ucol_getOffset(strsrch->textIter);
4839 }
4840
4841 int32_t start = -1;
4842 int32_t end = -1;
4843
4844 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4845 strsrch->search->matchedIndex = start;
4846 strsrch->search->matchedLength = end - start;
4847 return TRUE;
4848 } else {
4849 setMatchNotFound(strsrch);
4850 return FALSE;
4851 }
4852 #endif
4853 }
4854
4855 #endif /* #if !UCONFIG_NO_COLLATION */
4856