1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  **********************************************************************
5  *   Copyright (C) 2005-2016, International Business Machines
6  *   Corporation and others.  All Rights Reserved.
7  **********************************************************************
8  */
9 
10 #include "unicode/utypes.h"
11 
12 #if !UCONFIG_NO_COLLATION
13 
14 #include "cmemory.h"
15 #include "cstring.h"
16 #include "usrchimp.h"
17 
18 #include "unicode/coll.h"
19 #include "unicode/tblcoll.h"
20 #include "unicode/usearch.h"
21 #include "unicode/uset.h"
22 #include "unicode/ustring.h"
23 
24 #include "unicode/coleitr.h"
25 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
26 
27 #include "colldata.h"
28 #include "ssearch.h"
29 #include "xmlparser.h"
30 
31 #include <stdio.h>  // for sprintf
32 
33 char testId[100];
34 
35 #define TEST_ASSERT(x) UPRV_BLOCK_MACRO_BEGIN { \
36     if (!(x)) { \
37         errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId); \
38     } \
39 } UPRV_BLOCK_MACRO_END
40 
41 #define TEST_ASSERT_M(x, m) UPRV_BLOCK_MACRO_BEGIN { \
42     if (!(x)) { \
43         dataerrln("Failure in file %s, line %d.   \"%s\"", __FILE__, __LINE__, m); \
44         return; \
45     } \
46 } UPRV_BLOCK_MACRO_END
47 
48 #define TEST_ASSERT_SUCCESS(errcode) UPRV_BLOCK_MACRO_BEGIN { \
49     if (U_FAILURE(errcode)) { \
50         dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
51                   __FILE__, __LINE__, testId, u_errorName(errcode)); \
52     } \
53 } UPRV_BLOCK_MACRO_END
54 
55 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
56 #define DELETE_ARRAY(array) uprv_free((void *) (array))
57 
58 //---------------------------------------------------------------------------
59 //
60 //  Test class boilerplate
61 //
62 //---------------------------------------------------------------------------
SSearchTest()63 SSearchTest::SSearchTest()
64 {
65 }
66 
~SSearchTest()67 SSearchTest::~SSearchTest()
68 {
69 }
70 
runIndexedTest(int32_t index,UBool exec,const char * & name,char * params)71 void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params )
72 {
73     if (exec) logln("TestSuite SSearchTest: ");
74     switch (index) {
75 #if !UCONFIG_NO_BREAK_ITERATION
76        case 0: name = "searchTest";
77             if (exec) searchTest();
78             break;
79 
80         case 1: name = "offsetTest";
81             if (exec) offsetTest();
82             break;
83 
84         case 2: name = "monkeyTest";
85             if (exec) monkeyTest(params);
86             break;
87 
88         case 3: name = "sharpSTest";
89             if (exec) sharpSTest();
90             break;
91 
92         case 4: name = "goodSuffixTest";
93             if (exec) goodSuffixTest();
94             break;
95 
96         case 5: name = "searchTime";
97             if (exec) searchTime();
98             break;
99 #endif
100         default: name = "";
101             break; //needed to end loop
102     }
103 }
104 
105 
106 #if !UCONFIG_NO_BREAK_ITERATION
107 
108 #define PATH_BUFFER_SIZE 2048
getPath(char buffer[2048],const char * filename)109 const char *SSearchTest::getPath(char buffer[2048], const char *filename) {
110     UErrorCode status = U_ZERO_ERROR;
111     const char *testDataDirectory = IntlTest::getSourceTestData(status);
112 
113     if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >= PATH_BUFFER_SIZE) {
114         errln("ERROR: getPath() failed - %s", u_errorName(status));
115         return NULL;
116     }
117 
118     strcpy(buffer, testDataDirectory);
119     strcat(buffer, filename);
120     return buffer;
121 }
122 
123 
searchTest()124 void SSearchTest::searchTest()
125 {
126 #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
127     UErrorCode status = U_ZERO_ERROR;
128     char path[PATH_BUFFER_SIZE];
129     const char *testFilePath = getPath(path, "ssearch.xml");
130 
131     if (testFilePath == NULL) {
132         return; /* Couldn't get path: error message already output. */
133     }
134 
135     LocalPointer<UXMLParser> parser(UXMLParser::createParser(status));
136     TEST_ASSERT_SUCCESS(status);
137     LocalPointer<UXMLElement> root(parser->parseFile(testFilePath, status));
138     TEST_ASSERT_SUCCESS(status);
139     if (U_FAILURE(status)) {
140         return;
141     }
142 
143     const UnicodeString *debugTestCase = root->getAttribute("debug");
144     if (debugTestCase != NULL) {
145 //       setenv("USEARCH_DEBUG", "1", 1);
146     }
147 
148 
149     const UXMLElement *testCase;
150     int32_t tc = 0;
151 
152     while((testCase = root->nextChildElement(tc)) != NULL) {
153 
154         if (testCase->getTagName().compare("test-case") != 0) {
155             errln("ssearch, unrecognized XML Element in test file");
156             continue;
157         }
158         const UnicodeString *id       = testCase->getAttribute("id");
159         *testId = 0;
160         if (id != NULL) {
161             id->extract(0, id->length(), testId,  sizeof(testId), US_INV);
162         }
163 
164         // If debugging test case has been specified and this is not it, skip to next.
165         if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) {
166             continue;
167         }
168         //
169         //  Get the requested collation strength.
170         //    Default is tertiary if the XML attribute is missing from the test case.
171         //
172         const UnicodeString *strength = testCase->getAttribute("strength");
173         UColAttributeValue collatorStrength = UCOL_PRIMARY;
174         if      (strength==NULL)          { collatorStrength = UCOL_TERTIARY;}
175         else if (*strength=="PRIMARY")    { collatorStrength = UCOL_PRIMARY;}
176         else if (*strength=="SECONDARY")  { collatorStrength = UCOL_SECONDARY;}
177         else if (*strength=="TERTIARY")   { collatorStrength = UCOL_TERTIARY;}
178         else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;}
179         else if (*strength=="IDENTICAL")  { collatorStrength = UCOL_IDENTICAL;}
180         else {
181             // Bogus value supplied for strength.  Shouldn't happen, even from
182             //  typos, if the  XML source has been validated.
183             //  This assert is a little deceiving in that strength can be
184             //   any of the allowed values, not just TERTIARY, but it will
185             //   do the job of getting the error output.
186             TEST_ASSERT(*strength=="TERTIARY");
187         }
188 
189         //
190         // Get the collator normalization flag.  Default is UCOL_OFF.
191         //
192         UColAttributeValue normalize = UCOL_OFF;
193         const UnicodeString *norm = testCase->getAttribute("norm");
194         TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF");
195         if (norm!=NULL && *norm=="ON") {
196             normalize = UCOL_ON;
197         }
198 
199         //
200         // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
201         //
202         UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE;
203         const UnicodeString *alt = testCase->getAttribute("alternate_handling");
204         TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE");
205         if (alt != NULL && *alt == "SHIFTED") {
206             alternateHandling = UCOL_SHIFTED;
207         }
208 
209         const UnicodeString defLocale("en");
210         char  clocale[100];
211         const UnicodeString *locale   = testCase->getAttribute("locale");
212         if (locale == NULL || locale->length()==0) {
213             locale = &defLocale;
214         }
215         locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL);
216 
217 
218         UnicodeString  text;
219         UnicodeString  target;
220         UnicodeString  pattern;
221         int32_t        expectedMatchStart = -1;
222         int32_t        expectedMatchLimit = -1;
223         const UXMLElement  *n;
224         int32_t                nodeCount = 0;
225 
226         n = testCase->getChildElement("pattern");
227         TEST_ASSERT(n != NULL);
228         if (n==NULL) {
229             continue;
230         }
231         text = n->getText(FALSE);
232         text = text.unescape();
233         pattern.append(text);
234         nodeCount++;
235 
236         n = testCase->getChildElement("pre");
237         if (n!=NULL) {
238             text = n->getText(FALSE);
239             text = text.unescape();
240             target.append(text);
241             nodeCount++;
242         }
243 
244         n = testCase->getChildElement("m");
245         if (n!=NULL) {
246             expectedMatchStart = target.length();
247             text = n->getText(FALSE);
248             text = text.unescape();
249             target.append(text);
250             expectedMatchLimit = target.length();
251             nodeCount++;
252         }
253 
254         n = testCase->getChildElement("post");
255         if (n!=NULL) {
256             text = n->getText(FALSE);
257             text = text.unescape();
258             target.append(text);
259             nodeCount++;
260         }
261 
262         //  Check that there weren't extra things in the XML
263         TEST_ASSERT(nodeCount == testCase->countChildren());
264 
265         // Open a collator and StringSearch based on the parameters
266         //   obtained from the XML.
267         //
268         status = U_ZERO_ERROR;
269         LocalUCollatorPointer collator(ucol_open(clocale, &status));
270         ucol_setStrength(collator.getAlias(), collatorStrength);
271         ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
272         ucol_setAttribute(collator.getAlias(), UCOL_ALTERNATE_HANDLING, alternateHandling, &status);
273         LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
274                                                                target.getBuffer(), target.length(),
275                                                                collator.getAlias(),
276                                                                NULL,     // the break iterator
277                                                                &status));
278 
279         TEST_ASSERT_SUCCESS(status);
280         if (U_FAILURE(status)) {
281             continue;
282         }
283 
284         int32_t foundStart = 0;
285         int32_t foundLimit = 0;
286         UBool   foundMatch;
287 
288         //
289         // Do the search, check the match result against the expected results.
290         //
291         foundMatch= usearch_search(uss.getAlias(), 0, &foundStart, &foundLimit, &status);
292         TEST_ASSERT_SUCCESS(status);
293         if ((foundMatch && expectedMatchStart<0) ||
294             (foundStart != expectedMatchStart)   ||
295             (foundLimit != expectedMatchLimit)) {
296                 TEST_ASSERT(FALSE);   //  ouput generic error position
297                 infoln("Found, expected match start = %d, %d \n"
298                        "Found, expected match limit = %d, %d",
299                 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
300         }
301 
302         // In case there are other matches...
303         // (should we only do this if the test case passed?)
304         while (foundMatch) {
305             expectedMatchStart = foundStart;
306             expectedMatchLimit = foundLimit;
307 
308             foundMatch = usearch_search(uss.getAlias(), foundLimit, &foundStart, &foundLimit, &status);
309         }
310 
311         uss.adoptInstead(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
312             target.getBuffer(), target.length(),
313             collator.getAlias(),
314             NULL,
315             &status));
316 
317         //
318         // Do the backwards search, check the match result against the expected results.
319         //
320         foundMatch= usearch_searchBackwards(uss.getAlias(), target.length(), &foundStart, &foundLimit, &status);
321         TEST_ASSERT_SUCCESS(status);
322         if ((foundMatch && expectedMatchStart<0) ||
323             (foundStart != expectedMatchStart)   ||
324             (foundLimit != expectedMatchLimit)) {
325                 TEST_ASSERT(FALSE);   //  ouput generic error position
326                 infoln("Found, expected backwards match start = %d, %d \n"
327                        "Found, expected backwards match limit = %d, %d",
328                 foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
329         }
330     }
331 #endif
332 }
333 
334 struct Order
335 {
336     int32_t order;
337     int32_t lowOffset;
338     int32_t highOffset;
339 };
340 
341 class OrderList
342 {
343 public:
344     OrderList();
345     OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset = 0);
346     ~OrderList();
347 
348     int32_t size(void) const;
349     void add(int32_t order, int32_t low, int32_t high);
350     const Order *get(int32_t index) const;
351     int32_t getLowOffset(int32_t index) const;
352     int32_t getHighOffset(int32_t index) const;
353     int32_t getOrder(int32_t index) const;
354     void reverse(void);
355     UBool compare(const OrderList &other) const;
356     UBool matchesAt(int32_t offset, const OrderList &other) const;
357 
358 private:
359     Order *list;
360     int32_t listMax;
361     int32_t listSize;
362 };
363 
OrderList()364 OrderList::OrderList()
365   : list(NULL),  listMax(16), listSize(0)
366 {
367     list = new Order[listMax];
368 }
369 
OrderList(UCollator * coll,const UnicodeString & string,int32_t stringOffset)370 OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset)
371     : list(NULL), listMax(16), listSize(0)
372 {
373     UErrorCode status = U_ZERO_ERROR;
374     UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
375     uint32_t strengthMask = 0;
376     int32_t order, low, high;
377 
378     switch (ucol_getStrength(coll))
379     {
380     default:
381         strengthMask |= UCOL_TERTIARYORDERMASK;
382         U_FALLTHROUGH;
383     case UCOL_SECONDARY:
384         strengthMask |= UCOL_SECONDARYORDERMASK;
385         U_FALLTHROUGH;
386     case UCOL_PRIMARY:
387         strengthMask |= UCOL_PRIMARYORDERMASK;
388     }
389 
390     list = new Order[listMax];
391 
392     ucol_setOffset(elems, stringOffset, &status);
393 
394     do {
395         low   = ucol_getOffset(elems);
396         order = ucol_next(elems, &status);
397         high  = ucol_getOffset(elems);
398 
399         if (order != UCOL_NULLORDER) {
400             order &= strengthMask;
401         }
402 
403         if (order != UCOL_IGNORABLE) {
404             add(order, low, high);
405         }
406     } while (order != UCOL_NULLORDER);
407 
408     ucol_closeElements(elems);
409 }
410 
~OrderList()411 OrderList::~OrderList()
412 {
413     delete[] list;
414 }
415 
add(int32_t order,int32_t low,int32_t high)416 void OrderList::add(int32_t order, int32_t low, int32_t high)
417 {
418     if (listSize >= listMax) {
419         listMax *= 2;
420 
421         Order *newList = new Order[listMax];
422 
423         uprv_memcpy(newList, list, listSize * sizeof(Order));
424         delete[] list;
425         list = newList;
426     }
427 
428     list[listSize].order      = order;
429     list[listSize].lowOffset  = low;
430     list[listSize].highOffset = high;
431 
432     listSize += 1;
433 }
434 
get(int32_t index) const435 const Order *OrderList::get(int32_t index) const
436 {
437     if (index >= listSize) {
438         return NULL;
439     }
440 
441     return &list[index];
442 }
443 
getLowOffset(int32_t index) const444 int32_t OrderList::getLowOffset(int32_t index) const
445 {
446     const Order *order = get(index);
447 
448     if (order != NULL) {
449         return order->lowOffset;
450     }
451 
452     return -1;
453 }
454 
getHighOffset(int32_t index) const455 int32_t OrderList::getHighOffset(int32_t index) const
456 {
457     const Order *order = get(index);
458 
459     if (order != NULL) {
460         return order->highOffset;
461     }
462 
463     return -1;
464 }
465 
getOrder(int32_t index) const466 int32_t OrderList::getOrder(int32_t index) const
467 {
468     const Order *order = get(index);
469 
470     if (order != NULL) {
471         return order->order;
472     }
473 
474     return UCOL_NULLORDER;
475 }
476 
size() const477 int32_t OrderList::size() const
478 {
479     return listSize;
480 }
481 
reverse()482 void OrderList::reverse()
483 {
484     for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) {
485         Order swap = list[b];
486 
487         list[b] = list[f];
488         list[f] = swap;
489     }
490 }
491 
compare(const OrderList & other) const492 UBool OrderList::compare(const OrderList &other) const
493 {
494     if (listSize != other.listSize) {
495         return FALSE;
496     }
497 
498     for(int32_t i = 0; i < listSize; i += 1) {
499         if (list[i].order  != other.list[i].order ||
500             list[i].lowOffset != other.list[i].lowOffset ||
501             list[i].highOffset != other.list[i].highOffset) {
502                 return FALSE;
503         }
504     }
505 
506     return TRUE;
507 }
508 
matchesAt(int32_t offset,const OrderList & other) const509 UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const
510 {
511     // NOTE: sizes include the NULLORDER, which we don't want to compare.
512     int32_t otherSize = other.size() - 1;
513 
514     if (listSize - 1 - offset < otherSize) {
515         return FALSE;
516     }
517 
518     for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) {
519         if (getOrder(i) != other.getOrder(j)) {
520             return FALSE;
521         }
522     }
523 
524     return TRUE;
525 }
526 
printOffsets(char * buffer,OrderList & list)527 static char *printOffsets(char *buffer, OrderList &list)
528 {
529     int32_t size = list.size();
530     char *s = buffer;
531 
532     for(int32_t i = 0; i < size; i += 1) {
533         const Order *order = list.get(i);
534 
535         if (i != 0) {
536             s += sprintf(s, ", ");
537         }
538 
539         s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset);
540     }
541 
542     return buffer;
543 }
544 
printOrders(char * buffer,OrderList & list)545 static char *printOrders(char *buffer, OrderList &list)
546 {
547     int32_t size = list.size();
548     char *s = buffer;
549 
550     for(int32_t i = 0; i < size; i += 1) {
551         const Order *order = list.get(i);
552 
553         if (i != 0) {
554             s += sprintf(s, ", ");
555         }
556 
557         s += sprintf(s, "%8.8X", order->order);
558     }
559 
560     return buffer;
561 }
562 
offsetTest()563 void SSearchTest::offsetTest()
564 {
565     const char *test[] = {
566         // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous
567         // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71.
568         "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0",
569 
570         "\\ua191\\u16ef\\u2036\\u017a",
571 
572 #if 0
573         // This results in a complex interaction between contraction,
574         // expansion and normalization that confuses the backwards offset fixups.
575         "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
576 #endif
577 
578         "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
579         "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3",
580 
581         "\\u02FE\\u02FF"
582         "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F"
583         "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F"
584         "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F"
585         "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F"
586         "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E", // currently not working, see #8081
587 
588         "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318", // currently not working, see #8081
589         "a\\u02FF\\u0301\\u0316", // currently not working, see #8081
590         "a\\u02FF\\u0316\\u0301",
591         "a\\u0430\\u0301\\u0316",
592         "a\\u0430\\u0316\\u0301",
593         "abc\\u0E41\\u0301\\u0316",
594         "abc\\u0E41\\u0316\\u0301",
595         "\\u0E41\\u0301\\u0316",
596         "\\u0E41\\u0316\\u0301",
597         "a\\u0301\\u0316",
598         "a\\u0316\\u0301",
599         "\\uAC52\\uAC53",
600         "\\u34CA\\u34CB",
601         "\\u11ED\\u11EE",
602         "\\u30C3\\u30D0",
603         "p\\u00E9ch\\u00E9",
604         "a\\u0301\\u0325",
605         "a\\u0300\\u0325",
606         "a\\u0325\\u0300",
607         "A\\u0323\\u0300B",
608         "A\\u0300\\u0323B",
609         "A\\u0301\\u0323B",
610         "A\\u0302\\u0301\\u0323B",
611         "abc",
612         "ab\\u0300c",
613         "ab\\u0300\\u0323c",
614         " \\uD800\\uDC00\\uDC00",
615         "a\\uD800\\uDC00\\uDC00",
616         "A\\u0301\\u0301",
617         "A\\u0301\\u0323",
618         "A\\u0301\\u0323B",
619         "B\\u0301\\u0323C",
620         "A\\u0300\\u0323B",
621         "\\u0301A\\u0301\\u0301",
622         "abcd\\r\\u0301",
623         "p\\u00EAche",
624         "pe\\u0302che",
625     };
626 
627     int32_t testCount = UPRV_LENGTHOF(test);
628     UErrorCode status = U_ZERO_ERROR;
629     RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status);
630     if (U_FAILURE(status)) {
631         errcheckln(status, "Failed to create collator in offsetTest! - %s", u_errorName(status));
632         return;
633     }
634     char buffer[4096];  // A bit of a hack... just happens to be long enough for all the test cases...
635                         // We could allocate one that's the right size by (CE_count * 10) + 2
636                         // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]"
637 
638     col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
639 
640     for(int32_t i = 0; i < testCount; i += 1) {
641         UnicodeString ts = CharsToUnicodeString(test[i]);
642         CollationElementIterator *iter = col->createCollationElementIterator(ts);
643         OrderList forwardList;
644         OrderList backwardList;
645         int32_t order, low, high;
646 
647         do {
648             low   = iter->getOffset();
649             order = iter->next(status);
650             high  = iter->getOffset();
651 
652             forwardList.add(order, low, high);
653         } while (order != CollationElementIterator::NULLORDER);
654 
655         iter->reset();
656         iter->setOffset(ts.length(), status);
657 
658         backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(), iter->getOffset());
659 
660         do {
661             high  = iter->getOffset();
662             order = iter->previous(status);
663             low   = iter->getOffset();
664 
665             if (order == CollationElementIterator::NULLORDER) {
666                 break;
667             }
668 
669             backwardList.add(order, low, high);
670         } while (TRUE);
671 
672         backwardList.reverse();
673 
674         if (forwardList.compare(backwardList)) {
675             logln("Works with \"%s\"", test[i]);
676             logln("Forward offsets:  [%s]", printOffsets(buffer, forwardList));
677 //          logln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
678 
679             logln("Forward CEs:  [%s]", printOrders(buffer, forwardList));
680 //          logln("Backward CEs: [%s]", printOrders(buffer, backwardList));
681 
682             logln();
683         } else {
684             errln("Fails with \"%s\"", test[i]);
685             infoln("Forward offsets:  [%s]", printOffsets(buffer, forwardList));
686             infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
687 
688             infoln("Forward CEs:  [%s]", printOrders(buffer, forwardList));
689             infoln("Backward CEs: [%s]", printOrders(buffer, backwardList));
690 
691             infoln();
692         }
693         delete iter;
694     }
695     delete col;
696 }
697 
698 #if 0
699 static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer)
700 {
701     for(int32_t i = 0; i < string.length(); i += 1) {
702         UChar32 ch = string.char32At(i);
703 
704         if (ch >= 0x0020 && ch <= 0x007F) {
705             if (ch == 0x005C) {
706                 buffer.append("\\\\");
707             } else {
708                 buffer.append(ch);
709             }
710         } else {
711             char cbuffer[12];
712 
713             if (ch <= 0xFFFFL) {
714                 sprintf(cbuffer, "\\u%4.4X", ch);
715             } else {
716                 sprintf(cbuffer, "\\U%8.8X", ch);
717             }
718 
719             buffer.append(cbuffer);
720         }
721 
722         if (ch >= 0x10000L) {
723             i += 1;
724         }
725     }
726 
727     return buffer;
728 }
729 #endif
730 
sharpSTest()731 void SSearchTest::sharpSTest()
732 {
733     UErrorCode status = U_ZERO_ERROR;
734     UCollator *coll = NULL;
735     UnicodeString lp  = "fuss";
736     UnicodeString sp = "fu\\u00DF";
737     UnicodeString targets[]  = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
738                                 "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
739                                 "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
740     int32_t start = -1, end = -1;
741 
742     coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status);
743     TEST_ASSERT_SUCCESS(status);
744 
745     UnicodeString lpUnescaped = lp.unescape();
746     UnicodeString spUnescaped = sp.unescape();
747 
748     LocalUStringSearchPointer ussLong(usearch_openFromCollator(lpUnescaped.getBuffer(), lpUnescaped.length(),
749                                                            lpUnescaped.getBuffer(), lpUnescaped.length(),   // actual test data will be set later
750                                                            coll,
751                                                            NULL,     // the break iterator
752                                                            &status));
753 
754     LocalUStringSearchPointer ussShort(usearch_openFromCollator(spUnescaped.getBuffer(), spUnescaped.length(),
755                                                            spUnescaped.getBuffer(), spUnescaped.length(),   // actual test data will be set later
756                                                            coll,
757                                                            NULL,     // the break iterator
758                                                            &status));
759     TEST_ASSERT_SUCCESS(status);
760 
761     for (uint32_t t = 0; t < UPRV_LENGTHOF(targets); t += 1) {
762         UBool bFound;
763         UnicodeString target = targets[t].unescape();
764 
765         start = end = -1;
766         usearch_setText(ussLong.getAlias(), target.getBuffer(), target.length(), &status);
767         bFound = usearch_search(ussLong.getAlias(), 0, &start, &end, &status);
768         TEST_ASSERT_SUCCESS(status);
769         if (bFound) {
770             logln("Test %d: found long pattern at [%d, %d].", t, start, end);
771         } else {
772             dataerrln("Test %d: did not find long pattern.", t);
773         }
774 
775         usearch_setText(ussShort.getAlias(), target.getBuffer(), target.length(), &status);
776         bFound = usearch_search(ussShort.getAlias(), 0, &start, &end, &status);
777         TEST_ASSERT_SUCCESS(status);
778         if (bFound) {
779             logln("Test %d: found long pattern at [%d, %d].", t, start, end);
780         } else {
781             dataerrln("Test %d: did not find long pattern.", t);
782         }
783     }
784 
785     ucol_close(coll);
786 }
787 
goodSuffixTest()788 void SSearchTest::goodSuffixTest()
789 {
790     UErrorCode status = U_ZERO_ERROR;
791     UCollator *coll = NULL;
792     UnicodeString pat = /*"gcagagag"*/ "fxeld";
793     UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld";
794     int32_t start = -1, end = -1;
795     UBool bFound;
796 
797     coll = ucol_open(NULL, &status);
798     TEST_ASSERT_SUCCESS(status);
799 
800     LocalUStringSearchPointer ss(usearch_openFromCollator(pat.getBuffer(), pat.length(),
801                                                           target.getBuffer(), target.length(),
802                                                           coll,
803                                                           NULL,     // the break iterator
804                                                           &status));
805     TEST_ASSERT_SUCCESS(status);
806 
807     bFound = usearch_search(ss.getAlias(), 0, &start, &end, &status);
808     TEST_ASSERT_SUCCESS(status);
809     if (bFound) {
810         logln("Found pattern at [%d, %d].", start, end);
811     } else {
812         dataerrln("Did not find pattern.");
813     }
814 
815     ucol_close(coll);
816 }
817 
818 //
819 //  searchTime()    A quick and dirty performance test for string search.
820 //                  Probably  doesn't really belong as part of intltest, but it
821 //                  does check that the search succeeds, and gets the right result,
822 //                  so it serves as a functionality test also.
823 //
824 //                  To run as a perf test, up the loop count, select by commenting
825 //                  and uncommenting in the code the operation to be measured,
826 //                  rebuild, and measure the running time of this test alone.
827 //
828 //                     time LD_LIBRARY_PATH=whatever  ./intltest  collate/SSearchTest/searchTime
829 //
searchTime()830 void SSearchTest::searchTime() {
831     static const char *longishText =
832 "Whylom, as olde stories tellen us,\n"
833 "Ther was a duk that highte Theseus:\n"
834 "Of Athenes he was lord and governour,\n"
835 "And in his tyme swich a conquerour,\n"
836 "That gretter was ther noon under the sonne.\n"
837 "Ful many a riche contree hadde he wonne;\n"
838 "What with his wisdom and his chivalrye,\n"
839 "He conquered al the regne of Femenye,\n"
840 "That whylom was y-cleped Scithia;\n"
841 "And weddede the quene Ipolita,\n"
842 "And broghte hir hoom with him in his contree\n"
843 "With muchel glorie and greet solempnitee,\n"
844 "And eek hir yonge suster Emelye.\n"
845 "And thus with victorie and with melodye\n"
846 "Lete I this noble duk to Athenes ryde,\n"
847 "And al his hoost, in armes, him bisyde.\n"
848 "And certes, if it nere to long to here,\n"
849 "I wolde han told yow fully the manere,\n"
850 "How wonnen was the regne of Femenye\n"
851 "By Theseus, and by his chivalrye;\n"
852 "And of the grete bataille for the nones\n"
853 "Bitwixen Athen's and Amazones;\n"
854 "And how asseged was Ipolita,\n"
855 "The faire hardy quene of Scithia;\n"
856 "And of the feste that was at hir weddinge,\n"
857 "And of the tempest at hir hoom-cominge;\n"
858 "But al that thing I moot as now forbere.\n"
859 "I have, God woot, a large feeld to ere,\n"
860 "And wayke been the oxen in my plough.\n"
861 "The remenant of the tale is long y-nough.\n"
862 "I wol nat letten eek noon of this route;\n"
863 "Lat every felawe telle his tale aboute,\n"
864 "And lat see now who shal the soper winne;\n"
865 "And ther I lefte, I wol ageyn biginne.\n"
866 "This duk, of whom I make mencioun,\n"
867 "When he was come almost unto the toun,\n"
868 "In al his wele and in his moste pryde,\n"
869 "He was war, as he caste his eye asyde,\n"
870 "Wher that ther kneled in the hye weye\n"
871 "A companye of ladies, tweye and tweye,\n"
872 "Ech after other, clad in clothes blake; \n"
873 "But swich a cry and swich a wo they make,\n"
874 "That in this world nis creature livinge,\n"
875 "That herde swich another weymentinge;\n"
876 "And of this cry they nolde never stenten,\n"
877 "Til they the reynes of his brydel henten.\n"
878 "'What folk ben ye, that at myn hoomcominge\n"
879 "Perturben so my feste with cryinge'?\n"
880 "Quod Theseus, 'have ye so greet envye\n"
881 "Of myn honour, that thus compleyne and crye? \n"
882 "Or who hath yow misboden, or offended?\n"
883 "And telleth me if it may been amended;\n"
884 "And why that ye ben clothed thus in blak'?\n"
885 "The eldest lady of hem alle spak,\n"
886 "When she hadde swowned with a deedly chere,\n"
887 "That it was routhe for to seen and here,\n"
888 "And seyde: 'Lord, to whom Fortune hath yiven\n"
889 "Victorie, and as a conquerour to liven,\n"
890 "Noght greveth us your glorie and your honour;\n"
891 "But we biseken mercy and socour.\n"
892 "Have mercy on our wo and our distresse.\n"
893 "Som drope of pitee, thurgh thy gentilesse,\n"
894 "Up-on us wrecched wommen lat thou falle.\n"
895 "For certes, lord, ther nis noon of us alle,\n"
896 "That she nath been a duchesse or a quene;\n"
897 "Now be we caitifs, as it is wel sene:\n"
898 "Thanked be Fortune, and hir false wheel,\n"
899 "That noon estat assureth to be weel.\n"
900 "And certes, lord, t'abyden your presence,\n"
901 "Here in the temple of the goddesse Clemence\n"
902 "We han ben waytinge al this fourtenight;\n"
903 "Now help us, lord, sith it is in thy might.\n"
904 "I wrecche, which that wepe and waille thus,\n"
905 "Was whylom wyf to king Capaneus,\n"
906 "That starf at Thebes, cursed be that day!\n"
907 "And alle we, that been in this array,\n"
908 "And maken al this lamentacioun,\n"
909 "We losten alle our housbondes at that toun,\n"
910 "Whyl that the sege ther-aboute lay.\n"
911 "And yet now th'olde Creon, weylaway!\n"
912 "The lord is now of Thebes the citee, \n"
913 "Fulfild of ire and of iniquitee,\n"
914 "He, for despyt, and for his tirannye,\n"
915 "To do the dede bodyes vileinye,\n"
916 "Of alle our lordes, whiche that ben slawe,\n"
917 "Hath alle the bodyes on an heep y-drawe,\n"
918 "And wol nat suffren hem, by noon assent,\n"
919 "Neither to been y-buried nor y-brent,\n"
920 "But maketh houndes ete hem in despyt. zet'\n";
921 
922 const char *cPattern = "maketh houndes ete hem";
923 //const char *cPattern = "Whylom";
924 //const char *cPattern = "zet";
925     const char *testId = "searchTime()";   // for error macros.
926     UnicodeString target = longishText;
927     UErrorCode status = U_ZERO_ERROR;
928 
929 
930     LocalUCollatorPointer collator(ucol_open("en", &status));
931     //ucol_setStrength(collator.getAlias(), collatorStrength);
932     //ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
933     UnicodeString uPattern = cPattern;
934     LocalUStringSearchPointer uss(usearch_openFromCollator(uPattern.getBuffer(), uPattern.length(),
935                                                            target.getBuffer(), target.length(),
936                                                            collator.getAlias(),
937                                                            NULL,     // the break iterator
938                                                            &status));
939     TEST_ASSERT_SUCCESS(status);
940 
941 //  int32_t foundStart;
942 //  int32_t foundEnd;
943     UBool   found;
944 
945     // Find the match position usgin strstr
946     const char *pm = strstr(longishText, cPattern);
947     TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr");
948     int32_t  refMatchPos = (int32_t)(pm - longishText);
949     int32_t  icuMatchPos;
950     int32_t  icuMatchEnd;
951     usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
952     TEST_ASSERT_SUCCESS(status);
953     TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different match positions.");
954 
955     int32_t i;
956     // int32_t j=0;
957 
958     // Try loopcounts around 100000 to some millions, depending on the operation,
959     //   to get runtimes of at least several seconds.
960     for (i=0; i<10000; i++) {
961         found = usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
962         (void)found;   // Suppress set but not used warning.
963         //TEST_ASSERT_SUCCESS(status);
964         //TEST_ASSERT(found);
965 
966         // usearch_setOffset(uss.getAlias(), 0, &status);
967         // icuMatchPos = usearch_next(uss.getAlias(), &status);
968 
969          // The i+j stuff is to confuse the optimizer and get it to actually leave the
970          //   call to strstr in place.
971          //pm = strstr(longishText+j, cPattern);
972          //j = (j + i)%5;
973     }
974 
975     //printf("%ld, %d\n", pm-longishText, j);
976 }
977 
978 //----------------------------------------------------------------------------------------
979 //
980 //   Random Numbers.  Similar to standard lib rand() and srand()
981 //                    Not using library to
982 //                      1.  Get same results on all platforms.
983 //                      2.  Get access to current seed, to more easily reproduce failures.
984 //
985 //---------------------------------------------------------------------------------------
986 static uint32_t m_seed = 1;
987 
m_rand()988 static uint32_t m_rand()
989 {
990     m_seed = m_seed * 1103515245 + 12345;
991     return (uint32_t)(m_seed/65536) % 32768;
992 }
993 
994 class Monkey
995 {
996 public:
997     virtual void append(UnicodeString &test, UnicodeString &alternate) = 0;
998 
999 protected:
1000     Monkey();
1001     virtual ~Monkey();
1002 };
1003 
Monkey()1004 Monkey::Monkey()
1005 {
1006     // ook?
1007 }
1008 
~Monkey()1009 Monkey::~Monkey()
1010 {
1011     // ook?
1012 }
1013 
1014 class SetMonkey : public Monkey
1015 {
1016 public:
1017     SetMonkey(const USet *theSet);
1018     ~SetMonkey();
1019 
1020     virtual void append(UnicodeString &test, UnicodeString &alternate);
1021 
1022 private:
1023     const USet *set;
1024 };
1025 
SetMonkey(const USet * theSet)1026 SetMonkey::SetMonkey(const USet *theSet)
1027     : Monkey(), set(theSet)
1028 {
1029     // ook?
1030 }
1031 
~SetMonkey()1032 SetMonkey::~SetMonkey()
1033 {
1034     //ook...
1035 }
1036 
append(UnicodeString & test,UnicodeString & alternate)1037 void SetMonkey::append(UnicodeString &test, UnicodeString &alternate)
1038 {
1039     int32_t size = uset_size(set);
1040     int32_t index = m_rand() % size;
1041     UChar32 ch = uset_charAt(set, index);
1042     UnicodeString str(ch);
1043 
1044     test.append(str);
1045     alternate.append(str); // flip case, or some junk?
1046 }
1047 
1048 class StringSetMonkey : public Monkey
1049 {
1050 public:
1051     StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData);
1052     ~StringSetMonkey();
1053 
1054     void append(UnicodeString &testCase, UnicodeString &alternate);
1055 
1056 private:
1057     UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate);
1058 
1059     const USet *set;
1060     UCollator  *coll;
1061     CollData   *collData;
1062 };
1063 
StringSetMonkey(const USet * theSet,UCollator * theCollator,CollData * theCollData)1064 StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData)
1065 : Monkey(), set(theSet), coll(theCollator), collData(theCollData)
1066 {
1067     // ook.
1068 }
1069 
~StringSetMonkey()1070 StringSetMonkey::~StringSetMonkey()
1071 {
1072     // ook?
1073 }
1074 
append(UnicodeString & testCase,UnicodeString & alternate)1075 void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate)
1076 {
1077     int32_t itemCount = uset_getItemCount(set), len = 0;
1078     int32_t index = m_rand() % itemCount;
1079     UChar32 rangeStart = 0, rangeEnd = 0;
1080     UChar buffer[16];
1081     UErrorCode err = U_ZERO_ERROR;
1082 
1083     len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err);
1084 
1085     if (len == 0) {
1086         int32_t offset = m_rand() % (rangeEnd - rangeStart + 1);
1087         UChar32 ch = rangeStart + offset;
1088         UnicodeString str(ch);
1089 
1090         testCase.append(str);
1091         generateAlternative(str, alternate);
1092     } else if (len > 0) {
1093         // should check that len < 16...
1094         UnicodeString str(buffer, len);
1095 
1096         testCase.append(str);
1097         generateAlternative(str, alternate);
1098     } else {
1099         // shouldn't happen...
1100     }
1101 }
1102 
generateAlternative(const UnicodeString & testCase,UnicodeString & alternate)1103 UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCase, UnicodeString &alternate)
1104 {
1105     // find out shortest string for the longest sequence of ces.
1106     // needs to be refined to use dynamic programming, but will be roughly right
1107     UErrorCode status = U_ZERO_ERROR;
1108     CEList ceList(coll, testCase, status);
1109     UnicodeString alt;
1110     int32_t offset = 0;
1111 
1112     if (ceList.size() == 0) {
1113         return alternate.append(testCase);
1114     }
1115 
1116     while (offset < ceList.size()) {
1117         int32_t ce = ceList.get(offset);
1118         const StringList *strings = collData->getStringList(ce);
1119 
1120         if (strings == NULL) {
1121             return alternate.append(testCase);
1122         }
1123 
1124         int32_t stringCount = strings->size();
1125         int32_t tries = 0;
1126 
1127         // find random string that generates the same CEList
1128         const CEList *ceList2 = NULL;
1129         const UnicodeString *string = NULL;
1130               UBool matches = FALSE;
1131 
1132         do {
1133             int32_t s = m_rand() % stringCount;
1134 
1135             if (tries++ > stringCount) {
1136                 alternate.append(testCase);
1137                 return alternate;
1138             }
1139 
1140             string = strings->get(s);
1141             ceList2 = collData->getCEList(string);
1142             matches = ceList.matchesAt(offset, ceList2);
1143 
1144             if (! matches) {
1145                 collData->freeCEList((CEList *) ceList2);
1146             }
1147         } while (! matches);
1148 
1149         alt.append(*string);
1150         offset += ceList2->size();
1151         collData->freeCEList(ceList2);
1152     }
1153 
1154     const CEList altCEs(coll, alt, status);
1155 
1156     if (ceList.matchesAt(0, &altCEs)) {
1157         return alternate.append(alt);
1158     }
1159 
1160     return alternate.append(testCase);
1161 }
1162 
generateTestCase(UCollator * coll,Monkey * monkeys[],int32_t monkeyCount,UnicodeString & testCase,UnicodeString & alternate)1163 static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate)
1164 {
1165     int32_t pieces = (m_rand() % 4) + 1;
1166     UErrorCode status = U_ZERO_ERROR;
1167     UBool matches;
1168 
1169     do {
1170         testCase.remove();
1171         alternate.remove();
1172         monkeys[0]->append(testCase, alternate);
1173 
1174         for(int32_t piece = 0; piece < pieces; piece += 1) {
1175             int32_t monkey = m_rand() % monkeyCount;
1176 
1177             monkeys[monkey]->append(testCase, alternate);
1178         }
1179 
1180         const CEList ceTest(coll, testCase, status);
1181         const CEList ceAlt(coll, alternate, status);
1182 
1183         matches = ceTest.matchesAt(0, &ceAlt);
1184     } while (! matches);
1185 }
1186 
simpleSearch(UCollator * coll,const UnicodeString & target,int32_t offset,const UnicodeString & pattern,int32_t & matchStart,int32_t & matchEnd)1187 static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd)
1188 {
1189     UErrorCode      status = U_ZERO_ERROR;
1190     OrderList       targetOrders(coll, target, offset);
1191     OrderList       patternOrders(coll, pattern);
1192     int32_t         targetSize  = targetOrders.size() - 1;
1193     int32_t         patternSize = patternOrders.size() - 1;
1194     UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
1195                                                   target.getBuffer(), target.length(), &status);
1196 
1197     if (patternSize == 0) {
1198         // Searching for an empty pattern always fails
1199         matchStart = matchEnd = -1;
1200         ubrk_close(charBreakIterator);
1201         return FALSE;
1202     }
1203 
1204     matchStart = matchEnd = -1;
1205 
1206     for(int32_t i = 0; i < targetSize; i += 1) {
1207         if (targetOrders.matchesAt(i, patternOrders)) {
1208             int32_t start    = targetOrders.getLowOffset(i);
1209             int32_t maxLimit = targetOrders.getLowOffset(i + patternSize);
1210             int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1);
1211 
1212             // if the low and high offsets of the first CE in
1213             // the match are the same, it means that the match
1214             // starts in the middle of an expansion - all but
1215             // the first CE of the expansion will have the offset
1216             // of the following character.
1217             if (start == targetOrders.getHighOffset(i)) {
1218                 continue;
1219             }
1220 
1221             // Make sure match starts on a grapheme boundary
1222             if (! ubrk_isBoundary(charBreakIterator, start)) {
1223                 continue;
1224             }
1225 
1226             // If the low and high offsets of the CE after the match
1227             // are the same, it means that the match ends in the middle
1228             // of an expansion sequence.
1229             if (maxLimit == targetOrders.getHighOffset(i + patternSize) &&
1230                 targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) {
1231                 continue;
1232             }
1233 
1234             int32_t mend = maxLimit;
1235 
1236             // Find the first grapheme break after the character index
1237             // of the last CE in the match. If it's after character index
1238             // that's after the last CE in the match, use that index
1239             // as the end of the match.
1240             if (minLimit < maxLimit) {
1241                 // When the last CE's low index is same with its high index, the CE is likely
1242                 // a part of expansion. In this case, the index is located just after the
1243                 // character corresponding to the CEs compared above. If the index is right
1244                 // at the break boundary, move the position to the next boundary will result
1245                 // incorrect match length when there are ignorable characters exist between
1246                 // the position and the next character produces CE(s). See ticket#8482.
1247                 if (minLimit == targetOrders.getHighOffset(i + patternSize - 1) && ubrk_isBoundary(charBreakIterator, minLimit)) {
1248                     mend = minLimit;
1249                 } else {
1250                     int32_t nba = ubrk_following(charBreakIterator, minLimit);
1251 
1252                     if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
1253                         mend = nba;
1254                     }
1255                 }
1256             }
1257 
1258             if (mend > maxLimit) {
1259                 continue;
1260             }
1261 
1262             if (! ubrk_isBoundary(charBreakIterator, mend)) {
1263                 continue;
1264             }
1265 
1266             matchStart = start;
1267             matchEnd   = mend;
1268 
1269             ubrk_close(charBreakIterator);
1270             return TRUE;
1271         }
1272     }
1273 
1274     ubrk_close(charBreakIterator);
1275     return FALSE;
1276 }
1277 
1278 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
getIntParam(UnicodeString name,UnicodeString & params,int32_t defaultVal)1279 static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
1280     int32_t val = defaultVal;
1281 
1282     name.append(" *= *(-?\\d+)");
1283 
1284     UErrorCode status = U_ZERO_ERROR;
1285     RegexMatcher m(name, params, 0, status);
1286 
1287     if (m.find()) {
1288         // The param exists.  Convert the string to an int.
1289         char valString[100];
1290         int32_t paramLength = m.end(1, status) - m.start(1, status);
1291 
1292         if (paramLength >= (int32_t)(sizeof(valString)-1)) {
1293             paramLength = (int32_t)(sizeof(valString)-2);
1294         }
1295 
1296         params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
1297         val = uprv_strtol(valString,  NULL, 10);
1298 
1299         // Delete this parameter from the params string.
1300         m.reset();
1301         params = m.replaceFirst("", status);
1302     }
1303 
1304   //U_ASSERT(U_SUCCESS(status));
1305     if (! U_SUCCESS(status)) {
1306         val = defaultVal;
1307     }
1308 
1309     return val;
1310 }
1311 #endif
1312 
1313 #if !UCONFIG_NO_COLLATION
monkeyTestCase(UCollator * coll,const UnicodeString & testCase,const UnicodeString & pattern,const UnicodeString & altPattern,const char * name,const char * strength,uint32_t seed)1314 int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
1315                                     const char *name, const char *strength, uint32_t seed)
1316 {
1317     UErrorCode status = U_ZERO_ERROR;
1318     int32_t actualStart = -1, actualEnd = -1;
1319   //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
1320     int32_t expectedStart = -1, expectedEnd = -1;
1321     int32_t notFoundCount = 0;
1322     LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
1323                                                            testCase.getBuffer(), testCase.length(),
1324                                                            coll,
1325                                                            NULL,     // the break iterator
1326                                                            &status));
1327 
1328     // **** TODO: find *all* matches, not just first one ****
1329     simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd);
1330 
1331     usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
1332 
1333     if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
1334         errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1335               "    strength=%s seed=%d",
1336               name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1337     }
1338 
1339     if (expectedStart == -1 && actualStart == -1) {
1340         notFoundCount += 1;
1341     }
1342 
1343     // **** TODO: find *all* matches, not just first one ****
1344     simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd);
1345 
1346     usearch_setPattern(uss.getAlias(), altPattern.getBuffer(), altPattern.length(), &status);
1347 
1348     usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
1349 
1350     if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
1351         errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1352               "    strength=%s seed=%d",
1353               name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1354     }
1355 
1356     if (expectedStart == -1 && actualStart == -1) {
1357         notFoundCount += 1;
1358     }
1359 
1360     return notFoundCount;
1361 }
1362 #endif
1363 
monkeyTest(char * params)1364 void SSearchTest::monkeyTest(char *params)
1365 {
1366     // ook!
1367     UErrorCode status = U_ZERO_ERROR;
1368   //UCollator *coll = ucol_open(NULL, &status);
1369     UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
1370 
1371     if (U_FAILURE(status)) {
1372         errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
1373         return;
1374     }
1375 
1376     CollData  *monkeyData = new CollData(coll, status);
1377 
1378     USet *expansions   = uset_openEmpty();
1379     USet *contractions = uset_openEmpty();
1380 
1381     ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
1382 
1383     U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1384     U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1385     USet *letters = uset_openPattern(letter_pattern, 39, &status);
1386     SetMonkey letterMonkey(letters);
1387     StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
1388     StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
1389     UnicodeString testCase;
1390     UnicodeString alternate;
1391     UnicodeString pattern, altPattern;
1392     UnicodeString prefix, altPrefix;
1393     UnicodeString suffix, altSuffix;
1394 
1395     Monkey *monkeys[] = {
1396         &letterMonkey,
1397         &contractionMonkey,
1398         &expansionMonkey,
1399         &contractionMonkey,
1400         &expansionMonkey,
1401         &contractionMonkey,
1402         &expansionMonkey,
1403         &contractionMonkey,
1404         &expansionMonkey};
1405     int32_t monkeyCount = UPRV_LENGTHOF(monkeys);
1406     // int32_t nonMatchCount = 0;
1407 
1408     UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
1409     const char *strengthNames[] = {"primary", "secondary", "tertiary"};
1410     int32_t strengthCount = UPRV_LENGTHOF(strengths);
1411     int32_t loopCount = quick? 1000 : 10000;
1412     int32_t firstStrength = 0;
1413     int32_t lastStrength  = strengthCount - 1; //*/ 0;
1414 
1415     if (params != NULL) {
1416 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
1417         UnicodeString p(params);
1418 
1419         loopCount = getIntParam("loop", p, loopCount);
1420         m_seed    = getIntParam("seed", p, m_seed);
1421 
1422         RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
1423         if (m.find()) {
1424             UnicodeString breakType = m.group(1, status);
1425 
1426             for (int32_t s = 0; s < strengthCount; s += 1) {
1427                 if (breakType == strengthNames[s]) {
1428                     firstStrength = lastStrength = s;
1429                     break;
1430                 }
1431             }
1432 
1433             m.reset();
1434             p = m.replaceFirst("", status);
1435         }
1436 
1437         if (RegexMatcher("\\S", p, 0, status).find()) {
1438             // Each option is stripped out of the option string as it is processed.
1439             // All options have been checked.  The option string should have been completely emptied..
1440             char buf[100];
1441             p.extract(buf, sizeof(buf), NULL, status);
1442             buf[sizeof(buf)-1] = 0;
1443             errln("Unrecognized or extra parameter:  %s\n", buf);
1444             return;
1445         }
1446 #else
1447         infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
1448 #endif
1449     }
1450 
1451     for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
1452         int32_t notFoundCount = 0;
1453 
1454         logln("Setting strength to %s.", strengthNames[s]);
1455         ucol_setStrength(coll, strengths[s]);
1456 
1457         // TODO: try alternate prefix and suffix too?
1458         // TODO: alternates are only equal at primary strength. Is this OK?
1459         for(int32_t t = 0; t < loopCount; t += 1) {
1460             uint32_t seed = m_seed;
1461             // int32_t  nmc = 0;
1462 
1463             generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
1464             generateTestCase(coll, monkeys, monkeyCount, prefix,  altPrefix);
1465             generateTestCase(coll, monkeys, monkeyCount, suffix,  altSuffix);
1466 
1467             // pattern
1468             notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed);
1469 
1470             testCase.remove();
1471             testCase.append(prefix);
1472             testCase.append(/*alt*/pattern);
1473 
1474             // prefix + pattern
1475             notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed);
1476 
1477             testCase.append(suffix);
1478 
1479             // prefix + pattern + suffix
1480             notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed);
1481 
1482             testCase.remove();
1483             testCase.append(pattern);
1484             testCase.append(suffix);
1485 
1486             // pattern + suffix
1487             notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed);
1488         }
1489 
1490        logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
1491     }
1492 
1493     uset_close(contractions);
1494     uset_close(expansions);
1495     uset_close(letters);
1496     delete monkeyData;
1497 
1498     ucol_close(coll);
1499 }
1500 
1501 #endif
1502 
1503 #endif
1504