1 /********************************************************************
2  * COPYRIGHT:
3  * Copyright (c) 2002-2015, International Business Machines Corporation and
4  * others. All Rights Reserved.
5  ********************************************************************/
6 
7 //
8 //   regextst.cpp
9 //
10 //      ICU Regular Expressions test, part of intltest.
11 //
12 
13 /*
14      NOTE!!
15 
16      PLEASE be careful about ASCII assumptions in this test.
17      This test is one of the worst repeat offenders.
18      If you have questions, contact someone on the ICU PMC
19      who has access to an EBCDIC system.
20 
21  */
22 
23 #include "intltest.h"
24 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
25 
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29 
30 #include "unicode/localpointer.h"
31 #include "unicode/regex.h"
32 #include "unicode/uchar.h"
33 #include "unicode/ucnv.h"
34 #include "unicode/uniset.h"
35 #include "unicode/uregex.h"
36 #include "unicode/usetiter.h"
37 #include "unicode/ustring.h"
38 #include "unicode/utext.h"
39 
40 #include "regextst.h"
41 #include "regexcmp.h"
42 #include "uvector.h"
43 #include "util.h"
44 #include "cmemory.h"
45 #include "cstring.h"
46 #include "uinvchar.h"
47 
48 #define SUPPORT_MUTATING_INPUT_STRING   0
49 
50 //---------------------------------------------------------------------------
51 //
52 //  Test class boilerplate
53 //
54 //---------------------------------------------------------------------------
RegexTest()55 RegexTest::RegexTest()
56 {
57 }
58 
59 
~RegexTest()60 RegexTest::~RegexTest()
61 {
62 }
63 
64 
65 
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)66 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
67 {
68     if (exec) logln("TestSuite RegexTest: ");
69     switch (index) {
70 
71         case 0: name = "Basic";
72             if (exec) Basic();
73             break;
74         case 1: name = "API_Match";
75             if (exec) API_Match();
76             break;
77         case 2: name = "API_Replace";
78             if (exec) API_Replace();
79             break;
80         case 3: name = "API_Pattern";
81             if (exec) API_Pattern();
82             break;
83         case 4:
84 #if !UCONFIG_NO_FILE_IO
85             name = "Extended";
86             if (exec) Extended();
87 #else
88             name = "skip";
89 #endif
90             break;
91         case 5: name = "Errors";
92             if (exec) Errors();
93             break;
94         case 6: name = "PerlTests";
95             if (exec) PerlTests();
96             break;
97         case 7: name = "Callbacks";
98             if (exec) Callbacks();
99             break;
100         case 8: name = "FindProgressCallbacks";
101             if (exec) FindProgressCallbacks();
102             break;
103         case 9: name = "Bug 6149";
104              if (exec) Bug6149();
105              break;
106         case 10: name = "UTextBasic";
107           if (exec) UTextBasic();
108           break;
109         case 11: name = "API_Match_UTF8";
110           if (exec) API_Match_UTF8();
111           break;
112         case 12: name = "API_Replace_UTF8";
113           if (exec) API_Replace_UTF8();
114           break;
115         case 13: name = "API_Pattern_UTF8";
116           if (exec) API_Pattern_UTF8();
117           break;
118         case 14: name = "PerlTestsUTF8";
119           if (exec) PerlTestsUTF8();
120           break;
121         case 15: name = "PreAllocatedUTextCAPI";
122           if (exec) PreAllocatedUTextCAPI();
123           break;
124         case 16: name = "Bug 7651";
125              if (exec) Bug7651();
126              break;
127         case 17: name = "Bug 7740";
128             if (exec) Bug7740();
129             break;
130         case 18: name = "Bug 8479";
131             if (exec) Bug8479();
132             break;
133         case 19: name = "Bug 7029";
134             if (exec) Bug7029();
135             break;
136         case 20: name = "CheckInvBufSize";
137             if (exec) CheckInvBufSize();
138             break;
139         case 21: name = "Bug 9283";
140             if (exec) Bug9283();
141             break;
142         case 22: name = "Bug10459";
143             if (exec) Bug10459();
144             break;
145         case 23: name = "TestCaseInsensitiveStarters";
146             if (exec) TestCaseInsensitiveStarters();
147             break;
148         case 24: name = "TestBug11049";
149             if (exec) TestBug11049();
150             break;
151         case 25: name = "TestBug11371";
152             if (exec) TestBug11371();
153             break;
154         case 26: name = "TestBug11480";
155             if (exec) TestBug11480();
156             break;
157         case 27: name = "NamedCapture";
158             if (exec) NamedCapture();
159             break;
160         case 28: name = "NamedCaptureLimits";
161             if (exec) NamedCaptureLimits();
162             break;
163         default: name = "";
164             break; //needed to end loop
165     }
166 }
167 
168 
169 
170 /**
171  * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
172  * into ASCII.
173  * @see utext_openUTF8
174  */
175 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
176 
177 //---------------------------------------------------------------------------
178 //
179 //   Error Checking / Reporting macros used in all of the tests.
180 //
181 //---------------------------------------------------------------------------
182 
utextToPrintable(char * buf,int32_t bufLen,UText * text)183 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
184   int64_t oldIndex = utext_getNativeIndex(text);
185   utext_setNativeIndex(text, 0);
186   char *bufPtr = buf;
187   UChar32 c = utext_next32From(text, 0);
188   while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
189     if (0x000020<=c && c<0x00007e) {
190       *bufPtr = c;
191     } else {
192 #if 0
193       sprintf(bufPtr,"U+%04X", c);
194       bufPtr+= strlen(bufPtr)-1;
195 #else
196       *bufPtr = '%';
197 #endif
198     }
199     bufPtr++;
200     c = UTEXT_NEXT32(text);
201   }
202   *bufPtr = 0;
203 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
204   char *ebuf = (char*)malloc(bufLen);
205   uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
206   uprv_strncpy(buf, ebuf, bufLen);
207   free((void*)ebuf);
208 #endif
209   utext_setNativeIndex(text, oldIndex);
210 }
211 
212 
213 static char ASSERT_BUF[1024];
214 
extractToAssertBuf(const UnicodeString & message)215 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
216   if(message.length()==0) {
217     strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
218   } else {
219     UnicodeString buf;
220     IntlTest::prettify(message,buf);
221     if(buf.length()==0) {
222       strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
223     } else {
224       buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
225       if(ASSERT_BUF[0]==0) {
226         ASSERT_BUF[0]=0;
227         for(int32_t i=0;i<buf.length();i++) {
228           UChar ch = buf[i];
229           sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
230         }
231       }
232     }
233   }
234   ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
235   return ASSERT_BUF;
236 }
237 
// Logs the printable rendering of a UText (at most 199 characters) together
// with the file and line of the call site.
#define REGEX_VERBOSE_TEXT(text) { \
    char buf[200]; \
    utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), text); \
    logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); }

// Reports a failure and RETURNS from the enclosing function when the local
// variable `status` holds a failure code.
#define REGEX_CHECK_STATUS { \
    if (U_FAILURE(status)) { \
        dataerrln("%s:%d: RegexTest failure.  status=%s", \
                  __FILE__, __LINE__, u_errorName(status)); \
        return; \
    } }

// Reports a failure when `expr` evaluates to FALSE; execution continues.
#define REGEX_ASSERT(expr) { \
    if ((expr)==FALSE) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
    } }

// Evaluates `expr` with a fresh local `status` (which `expr` is expected to
// reference) and reports a failure unless that status ends up as `errcode`.
#define REGEX_ASSERT_FAIL(expr, errcode) { \
    UErrorCode status=U_ZERO_ERROR; \
    (expr); \
    if (status!=errcode) { \
        dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
                  __LINE__, u_errorName(errcode), u_errorName(status)); \
    } }

// Like REGEX_CHECK_STATUS, but also reports the caller-supplied line number
// and does NOT return.
#define REGEX_CHECK_STATUS_L(line) { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); \
    } }

// Like REGEX_ASSERT, but also reports the caller-supplied line number and
// RETURNS from the enclosing function on failure.
#define REGEX_ASSERT_L(expr, line) { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } }

// Compares an expected invariant-character C string with an actual UnicodeString.
// expected: const char * , restricted to invariant characters.
// actual: const UnicodeString &
#define REGEX_ASSERT_UNISTR(expected, actual) { \
    if (UnicodeString(expected, -1, US_INV) != (actual)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n",  \
              __FILE__, __LINE__, expected, extractToAssertBuf(actual)); \
    } }
261 
262 
testUTextEqual(UText * uta,UText * utb)263 static UBool testUTextEqual(UText *uta, UText *utb) {
264     UChar32 ca = 0;
265     UChar32 cb = 0;
266     utext_setNativeIndex(uta, 0);
267     utext_setNativeIndex(utb, 0);
268     do {
269         ca = utext_next32(uta);
270         cb = utext_next32(utb);
271         if (ca != cb) {
272             break;
273         }
274     } while (ca != U_SENTINEL);
275     return ca == cb;
276 }
277 
278 
279 /**
280  * @param expected expected text in UTF-8 (not platform) codepage
281  */
assertUText(const char * expected,UText * actual,const char * file,int line)282 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
283     UErrorCode status = U_ZERO_ERROR;
284     UText expectedText = UTEXT_INITIALIZER;
285     utext_openUTF8(&expectedText, expected, -1, &status);
286     if(U_FAILURE(status)) {
287       errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
288       return;
289     }
290     if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
291       errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
292       return;
293     }
294     utext_setNativeIndex(actual, 0);
295     if (!testUTextEqual(&expectedText, actual)) {
296         char buf[201 /*21*/];
297         char expectedBuf[201];
298         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
299         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
300         errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
301     }
302     utext_close(&expectedText);
303 }
304 /**
305  * @param expected invariant (platform local text) input
306  */
307 
assertUTextInvariant(const char * expected,UText * actual,const char * file,int line)308 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
309     UErrorCode status = U_ZERO_ERROR;
310     UText expectedText = UTEXT_INITIALIZER;
311     regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
312     if(U_FAILURE(status)) {
313       errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
314       return;
315     }
316     utext_setNativeIndex(actual, 0);
317     if (!testUTextEqual(&expectedText, actual)) {
318         char buf[201 /*21*/];
319         char expectedBuf[201];
320         utextToPrintable(buf, sizeof(buf)/sizeof(buf[0]), actual);
321         utextToPrintable(expectedBuf, sizeof(expectedBuf)/sizeof(expectedBuf[0]), &expectedText);
322         errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
323     }
324     utext_close(&expectedText);
325 }
326 
327 /**
328  * Assumes utf-8 input
329  */
330 #define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
331 /**
332  * Assumes Invariant input
333  */
334 #define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
335 
336 /**
337  * This buffer ( inv_buf ) is used to hold the UTF-8 strings
338  * passed into utext_openUTF8. An error will be given if
339  * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
340  */
341 
342 #define INV_BUFSIZ 2048 /* increase this if too small */
343 
344 static int64_t inv_next=0;
345 
346 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
347 static char inv_buf[INV_BUFSIZ];
348 #endif
349 
regextst_openUTF8FromInvariant(UText * ut,const char * inv,int64_t length,UErrorCode * status)350 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
351   if(length==-1) length=strlen(inv);
352 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
353   inv_next+=length;
354   return utext_openUTF8(ut, inv, length, status);
355 #else
356   if(inv_next+length+1>INV_BUFSIZ) {
357     fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
358             __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
359     *status = U_MEMORY_ALLOCATION_ERROR;
360     return NULL;
361   }
362 
363   unsigned char *buf = (unsigned char*)inv_buf+inv_next;
364   uprv_aestrncpy(buf, (const uint8_t*)inv, length);
365   inv_next+=length;
366 
367 #if 0
368   fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
369 #endif
370 
371   return utext_openUTF8(ut, (const char*)buf, length, status);
372 #endif
373 }
374 
375 
376 //---------------------------------------------------------------------------
377 //
378 //    REGEX_TESTLM       Macro + invocation function to simplify writing quick tests
379 //                       for the LookingAt() and  Match() functions.
380 //
381 //       usage:
382 //          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
383 //
384 //          The expected results are UBool - TRUE or FALSE.
385 //          The input text is unescaped.  The pattern is not.
386 //
387 //
388 //---------------------------------------------------------------------------
389 
390 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
391 
doRegexLMTest(const char * pat,const char * text,UBool looking,UBool match,int32_t line)392 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
393     const UnicodeString pattern(pat, -1, US_INV);
394     const UnicodeString inputText(text, -1, US_INV);
395     UErrorCode          status  = U_ZERO_ERROR;
396     UParseError         pe;
397     RegexPattern        *REPattern = NULL;
398     RegexMatcher        *REMatcher = NULL;
399     UBool               retVal     = TRUE;
400 
401     UnicodeString patString(pat, -1, US_INV);
402     REPattern = RegexPattern::compile(patString, 0, pe, status);
403     if (U_FAILURE(status)) {
404         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
405             line, u_errorName(status));
406         return FALSE;
407     }
408     if (line==376) { REPattern->dumpPattern();}
409 
410     UnicodeString inputString(inputText);
411     UnicodeString unEscapedInput = inputString.unescape();
412     REMatcher = REPattern->matcher(unEscapedInput, status);
413     if (U_FAILURE(status)) {
414         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
415             line, u_errorName(status));
416         return FALSE;
417     }
418 
419     UBool actualmatch;
420     actualmatch = REMatcher->lookingAt(status);
421     if (U_FAILURE(status)) {
422         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
423             line, u_errorName(status));
424         retVal =  FALSE;
425     }
426     if (actualmatch != looking) {
427         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
428         retVal = FALSE;
429     }
430 
431     status = U_ZERO_ERROR;
432     actualmatch = REMatcher->matches(status);
433     if (U_FAILURE(status)) {
434         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
435             line, u_errorName(status));
436         retVal = FALSE;
437     }
438     if (actualmatch != match) {
439         errln("RegexTest: wrong return from matches() at line %d.\n", line);
440         retVal = FALSE;
441     }
442 
443     if (retVal == FALSE) {
444         REPattern->dumpPattern();
445     }
446 
447     delete REPattern;
448     delete REMatcher;
449     return retVal;
450 }
451 
452 
doRegexLMTestUTF8(const char * pat,const char * text,UBool looking,UBool match,int32_t line)453 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
454     UText               pattern    = UTEXT_INITIALIZER;
455     int32_t             inputUTF8Length;
456     char                *textChars = NULL;
457     UText               inputText  = UTEXT_INITIALIZER;
458     UErrorCode          status     = U_ZERO_ERROR;
459     UParseError         pe;
460     RegexPattern        *REPattern = NULL;
461     RegexMatcher        *REMatcher = NULL;
462     UBool               retVal     = TRUE;
463 
464     regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
465     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
466     if (U_FAILURE(status)) {
467         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
468             line, u_errorName(status));
469         return FALSE;
470     }
471 
472     UnicodeString inputString(text, -1, US_INV);
473     UnicodeString unEscapedInput = inputString.unescape();
474     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
475     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
476 
477     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
478     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
479         // UTF-8 does not allow unpaired surrogates, so this could actually happen
480         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
481         return TRUE; // not a failure of the Regex engine
482     }
483     status = U_ZERO_ERROR; // buffer overflow
484     textChars = new char[inputUTF8Length+1];
485     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
486     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
487 
488     REMatcher = &REPattern->matcher(status)->reset(&inputText);
489     if (U_FAILURE(status)) {
490         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
491             line, u_errorName(status));
492         return FALSE;
493     }
494 
495     UBool actualmatch;
496     actualmatch = REMatcher->lookingAt(status);
497     if (U_FAILURE(status)) {
498         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
499             line, u_errorName(status));
500         retVal =  FALSE;
501     }
502     if (actualmatch != looking) {
503         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
504         retVal = FALSE;
505     }
506 
507     status = U_ZERO_ERROR;
508     actualmatch = REMatcher->matches(status);
509     if (U_FAILURE(status)) {
510         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
511             line, u_errorName(status));
512         retVal = FALSE;
513     }
514     if (actualmatch != match) {
515         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
516         retVal = FALSE;
517     }
518 
519     if (retVal == FALSE) {
520         REPattern->dumpPattern();
521     }
522 
523     delete REPattern;
524     delete REMatcher;
525     utext_close(&inputText);
526     utext_close(&pattern);
527     delete[] textChars;
528     return retVal;
529 }
530 
531 
532 
533 //---------------------------------------------------------------------------
534 //
535 //    REGEX_ERR       Macro + invocation function to simplify writing tests
536 //                       regex tests for incorrect patterns
537 //
538 //       usage:
539 //          REGEX_ERR("pattern",   expected error line, column, expected status);
540 //
541 //---------------------------------------------------------------------------
542 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
543 
regex_err(const char * pat,int32_t errLine,int32_t errCol,UErrorCode expectedStatus,int32_t line)544 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
545                           UErrorCode expectedStatus, int32_t line) {
546     UnicodeString       pattern(pat);
547 
548     UErrorCode          status         = U_ZERO_ERROR;
549     UParseError         pe;
550     RegexPattern        *callerPattern = NULL;
551 
552     //
553     //  Compile the caller's pattern
554     //
555     UnicodeString patString(pat);
556     callerPattern = RegexPattern::compile(patString, 0, pe, status);
557     if (status != expectedStatus) {
558         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
559     } else {
560         if (status != U_ZERO_ERROR) {
561             if (pe.line != errLine || pe.offset != errCol) {
562                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
563                     line, errLine, errCol, pe.line, pe.offset);
564             }
565         }
566     }
567 
568     delete callerPattern;
569 
570     //
571     //  Compile again, using a UTF-8-based UText
572     //
573     UText patternText = UTEXT_INITIALIZER;
574     regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
575     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
576     if (status != expectedStatus) {
577         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
578     } else {
579         if (status != U_ZERO_ERROR) {
580             if (pe.line != errLine || pe.offset != errCol) {
581                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
582                     line, errLine, errCol, pe.line, pe.offset);
583             }
584         }
585     }
586 
587     delete callerPattern;
588     utext_close(&patternText);
589 }
590 
591 
592 
593 //---------------------------------------------------------------------------
594 //
595 //      Basic      Check for basic functionality of regex pattern matching.
596 //                 Avoid the use of REGEX_FIND test macro, which has
597 //                 substantial dependencies on basic Regex functionality.
598 //
599 //---------------------------------------------------------------------------
Basic()600 void RegexTest::Basic() {
601 
602 
603 //
604 // Debug - slide failing test cases early
605 //
606 #if 0
607     {
608         // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
609         UParseError pe;
610         UErrorCode  status = U_ZERO_ERROR;
611         RegexPattern *pattern;
612         pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
613         pattern->dumpPattern();
614         RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
615         UBool result = m->find();
616         printf("result = %d\n", result);
617         // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
618         // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
619     }
620     exit(1);
621 #endif
622 
623 
624     //
625     // Pattern with parentheses
626     //
627     REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
628     REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
629     REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);
630 
631     //
632     // Patterns with *
633     //
634     REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
635     REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
636     REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
637     REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
638     REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
639 
640     REGEX_TESTLM("a*", "",  TRUE, TRUE);
641     REGEX_TESTLM("a*", "b", TRUE, FALSE);
642 
643 
644     //
645     //  Patterns with "."
646     //
647     REGEX_TESTLM(".", "abc", TRUE, FALSE);
648     REGEX_TESTLM("...", "abc", TRUE, TRUE);
649     REGEX_TESTLM("....", "abc", FALSE, FALSE);
650     REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
651     REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
652     REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
653     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
654     REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
655 
656     //
657     //  Patterns with * applied to chars at end of literal string
658     //
659     REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
660     REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
661 
662     //
663     //  Supplemental chars match as single chars, not a pair of surrogates.
664     //
665     REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
666     REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
667     REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
668 
669 
670     //
671     //  UnicodeSets in the pattern
672     //
673     REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
674     REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
675     REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
676     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
677     REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
678     REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
679 
680     REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
681     REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
682     REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
683     REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurences.
684     REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);
685 
686     //
687     //   OR operator in patterns
688     //
689     REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
690     REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
691     REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
692     REGEX_TESTLM("a|b", "b", TRUE, TRUE);
693 
694     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
695     REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
696     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
697     REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
698     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
699     REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
700 
701     //
702     //  +
703     //
704     REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
705     REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
706     REGEX_TESTLM("b+", "", FALSE, FALSE);
707     REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
708     REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
709     REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
710 
711     //
712     //   ?
713     //
714     REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
715     REGEX_TESTLM("ab?", "a", TRUE, TRUE);
716     REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
717     REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
718     REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
719     REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
720     REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
721     REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
722     REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
723 
724     //
725     //  Escape sequences that become single literal chars, handled internally
726     //   by ICU's Unescape.
727     //
728 
729     // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
730     REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
731     REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
732     REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
733     REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
734     REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
735     REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
736     REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
737     REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
738     REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
739 
740     REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
741     REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input
742 
743     // Escape of special chars in patterns
744     REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
745 }
746 
747 
748 //---------------------------------------------------------------------------
749 //
750 //    UTextBasic   Check for quirks that are specific to the UText
751 //                 implementation.
752 //
753 //---------------------------------------------------------------------------
UTextBasic()754 void RegexTest::UTextBasic() {
755     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
756     UErrorCode status = U_ZERO_ERROR;
757     UText pattern = UTEXT_INITIALIZER;
758     utext_openUTF8(&pattern, str_abc, -1, &status);
759     RegexMatcher matcher(&pattern, 0, status);
760     REGEX_CHECK_STATUS;
761 
762     UText input = UTEXT_INITIALIZER;
763     utext_openUTF8(&input, str_abc, -1, &status);
764     REGEX_CHECK_STATUS;
765     matcher.reset(&input);
766     REGEX_CHECK_STATUS;
767     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
768 
769     matcher.reset(matcher.inputText());
770     REGEX_CHECK_STATUS;
771     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
772 
773     utext_close(&pattern);
774     utext_close(&input);
775 }
776 
777 
778 //---------------------------------------------------------------------------
779 //
780 //      API_Match   Test that the API for class RegexMatcher
781 //                  is present and nominally working, but excluding functions
782 //                  implementing replace operations.
783 //
784 //---------------------------------------------------------------------------
API_Match()785 void RegexTest::API_Match() {
786     UParseError         pe;
787     UErrorCode          status=U_ZERO_ERROR;
788     int32_t             flags = 0;
789 
790     //
791     // Debug - slide failing test cases early
792     //
793 #if 0
794     {
795     }
796     return;
797 #endif
798 
799     //
800     // Simple pattern compilation
801     //
802     {
803         UnicodeString       re("abc");
804         RegexPattern        *pat2;
805         pat2 = RegexPattern::compile(re, flags, pe, status);
806         REGEX_CHECK_STATUS;
807 
808         UnicodeString inStr1 = "abcdef this is a test";
809         UnicodeString instr2 = "not abc";
810         UnicodeString empty  = "";
811 
812 
813         //
814         // Matcher creation and reset.
815         //
816         RegexMatcher *m1 = pat2->matcher(inStr1, status);
817         REGEX_CHECK_STATUS;
818         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
819         REGEX_ASSERT(m1->input() == inStr1);
820         m1->reset(instr2);
821         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
822         REGEX_ASSERT(m1->input() == instr2);
823         m1->reset(inStr1);
824         REGEX_ASSERT(m1->input() == inStr1);
825         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
826         m1->reset(empty);
827         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
828         REGEX_ASSERT(m1->input() == empty);
829         REGEX_ASSERT(&m1->pattern() == pat2);
830 
831         //
832         //  reset(pos, status)
833         //
834         m1->reset(inStr1);
835         m1->reset(4, status);
836         REGEX_CHECK_STATUS;
837         REGEX_ASSERT(m1->input() == inStr1);
838         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
839 
840         m1->reset(-1, status);
841         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
842         status = U_ZERO_ERROR;
843 
844         m1->reset(0, status);
845         REGEX_CHECK_STATUS;
846         status = U_ZERO_ERROR;
847 
848         int32_t len = m1->input().length();
849         m1->reset(len-1, status);
850         REGEX_CHECK_STATUS;
851         status = U_ZERO_ERROR;
852 
853         m1->reset(len, status);
854         REGEX_CHECK_STATUS;
855         status = U_ZERO_ERROR;
856 
857         m1->reset(len+1, status);
858         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
859         status = U_ZERO_ERROR;
860 
861         //
862         // match(pos, status)
863         //
864         m1->reset(instr2);
865         REGEX_ASSERT(m1->matches(4, status) == TRUE);
866         m1->reset();
867         REGEX_ASSERT(m1->matches(3, status) == FALSE);
868         m1->reset();
869         REGEX_ASSERT(m1->matches(5, status) == FALSE);
870         REGEX_ASSERT(m1->matches(4, status) == TRUE);
871         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
872         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
873 
874         // Match() at end of string should fail, but should not
875         //  be an error.
876         status = U_ZERO_ERROR;
877         len = m1->input().length();
878         REGEX_ASSERT(m1->matches(len, status) == FALSE);
879         REGEX_CHECK_STATUS;
880 
881         // Match beyond end of string should fail with an error.
882         status = U_ZERO_ERROR;
883         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
884         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
885 
886         // Successful match at end of string.
887         {
888             status = U_ZERO_ERROR;
889             RegexMatcher m("A?", 0, status);  // will match zero length string.
890             REGEX_CHECK_STATUS;
891             m.reset(inStr1);
892             len = inStr1.length();
893             REGEX_ASSERT(m.matches(len, status) == TRUE);
894             REGEX_CHECK_STATUS;
895             m.reset(empty);
896             REGEX_ASSERT(m.matches(0, status) == TRUE);
897             REGEX_CHECK_STATUS;
898         }
899 
900 
901         //
902         // lookingAt(pos, status)
903         //
904         status = U_ZERO_ERROR;
905         m1->reset(instr2);  // "not abc"
906         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
907         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
908         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
909         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
910         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
911         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
912         status = U_ZERO_ERROR;
913         len = m1->input().length();
914         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
915         REGEX_CHECK_STATUS;
916         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
917         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
918 
919         delete m1;
920         delete pat2;
921     }
922 
923 
924     //
925     // Capture Group.
926     //     RegexMatcher::start();
927     //     RegexMatcher::end();
928     //     RegexMatcher::groupCount();
929     //
930     {
931         int32_t             flags=0;
932         UParseError         pe;
933         UErrorCode          status=U_ZERO_ERROR;
934 
935         UnicodeString       re("01(23(45)67)(.*)");
936         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
937         REGEX_CHECK_STATUS;
938         UnicodeString data = "0123456789";
939 
940         RegexMatcher *matcher = pat->matcher(data, status);
941         REGEX_CHECK_STATUS;
942         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
943         static const int32_t matchStarts[] = {0,  2, 4, 8};
944         static const int32_t matchEnds[]   = {10, 8, 6, 10};
945         int32_t i;
946         for (i=0; i<4; i++) {
947             int32_t actualStart = matcher->start(i, status);
948             REGEX_CHECK_STATUS;
949             if (actualStart != matchStarts[i]) {
950                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
951                     __LINE__, i, matchStarts[i], actualStart);
952             }
953             int32_t actualEnd = matcher->end(i, status);
954             REGEX_CHECK_STATUS;
955             if (actualEnd != matchEnds[i]) {
956                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
957                     __LINE__, i, matchEnds[i], actualEnd);
958             }
959         }
960 
961         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
962         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
963 
964         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
965         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
966         matcher->reset();
967         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
968 
969         matcher->lookingAt(status);
970         REGEX_ASSERT(matcher->group(status)    == "0123456789");
971         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
972         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
973         REGEX_ASSERT(matcher->group(2, status) == "45"        );
974         REGEX_ASSERT(matcher->group(3, status) == "89"        );
975         REGEX_CHECK_STATUS;
976         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
977         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
978         matcher->reset();
979         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
980 
981         delete matcher;
982         delete pat;
983 
984     }
985 
986     //
987     //  find
988     //
989     {
990         int32_t             flags=0;
991         UParseError         pe;
992         UErrorCode          status=U_ZERO_ERROR;
993 
994         UnicodeString       re("abc");
995         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
996         REGEX_CHECK_STATUS;
997         UnicodeString data = ".abc..abc...abc..";
998         //                    012345678901234567
999 
1000         RegexMatcher *matcher = pat->matcher(data, status);
1001         REGEX_CHECK_STATUS;
1002         REGEX_ASSERT(matcher->find());
1003         REGEX_ASSERT(matcher->start(status) == 1);
1004         REGEX_ASSERT(matcher->find());
1005         REGEX_ASSERT(matcher->start(status) == 6);
1006         REGEX_ASSERT(matcher->find());
1007         REGEX_ASSERT(matcher->start(status) == 12);
1008         REGEX_ASSERT(matcher->find() == FALSE);
1009         REGEX_ASSERT(matcher->find() == FALSE);
1010 
1011         matcher->reset();
1012         REGEX_ASSERT(matcher->find());
1013         REGEX_ASSERT(matcher->start(status) == 1);
1014 
1015         REGEX_ASSERT(matcher->find(0, status));
1016         REGEX_ASSERT(matcher->start(status) == 1);
1017         REGEX_ASSERT(matcher->find(1, status));
1018         REGEX_ASSERT(matcher->start(status) == 1);
1019         REGEX_ASSERT(matcher->find(2, status));
1020         REGEX_ASSERT(matcher->start(status) == 6);
1021         REGEX_ASSERT(matcher->find(12, status));
1022         REGEX_ASSERT(matcher->start(status) == 12);
1023         REGEX_ASSERT(matcher->find(13, status) == FALSE);
1024         REGEX_ASSERT(matcher->find(16, status) == FALSE);
1025         REGEX_ASSERT(matcher->find(17, status) == FALSE);
1026         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1027 
1028         status = U_ZERO_ERROR;
1029         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1030         status = U_ZERO_ERROR;
1031         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1032 
1033         REGEX_ASSERT(matcher->groupCount() == 0);
1034 
1035         delete matcher;
1036         delete pat;
1037     }
1038 
1039 
1040     //
1041     //  find, with \G in pattern (true if at the end of a previous match).
1042     //
1043     {
1044         int32_t             flags=0;
1045         UParseError         pe;
1046         UErrorCode          status=U_ZERO_ERROR;
1047 
1048         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1049         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1050         REGEX_CHECK_STATUS;
1051         UnicodeString data = ".abcabc.abc..";
1052         //                    012345678901234567
1053 
1054         RegexMatcher *matcher = pat->matcher(data, status);
1055         REGEX_CHECK_STATUS;
1056         REGEX_ASSERT(matcher->find());
1057         REGEX_ASSERT(matcher->start(status) == 0);
1058         REGEX_ASSERT(matcher->start(1, status) == -1);
1059         REGEX_ASSERT(matcher->start(2, status) == 1);
1060 
1061         REGEX_ASSERT(matcher->find());
1062         REGEX_ASSERT(matcher->start(status) == 4);
1063         REGEX_ASSERT(matcher->start(1, status) == 4);
1064         REGEX_ASSERT(matcher->start(2, status) == -1);
1065         REGEX_CHECK_STATUS;
1066 
1067         delete matcher;
1068         delete pat;
1069     }
1070 
1071     //
1072     //   find with zero length matches, match position should bump ahead
1073     //     to prevent loops.
1074     //
1075     {
1076         int32_t                 i;
1077         UErrorCode          status=U_ZERO_ERROR;
1078         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
1079                                                       //   using an always-true look-ahead.
1080         REGEX_CHECK_STATUS;
1081         UnicodeString s("    ");
1082         m.reset(s);
1083         for (i=0; ; i++) {
1084             if (m.find() == FALSE) {
1085                 break;
1086             }
1087             REGEX_ASSERT(m.start(status) == i);
1088             REGEX_ASSERT(m.end(status) == i);
1089         }
1090         REGEX_ASSERT(i==5);
1091 
1092         // Check that the bump goes over surrogate pairs OK
1093         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1094         s = s.unescape();
1095         m.reset(s);
1096         for (i=0; ; i+=2) {
1097             if (m.find() == FALSE) {
1098                 break;
1099             }
1100             REGEX_ASSERT(m.start(status) == i);
1101             REGEX_ASSERT(m.end(status) == i);
1102         }
1103         REGEX_ASSERT(i==10);
1104     }
1105     {
1106         // find() loop breaking test.
1107         //        with pattern of /.?/, should see a series of one char matches, then a single
1108         //        match of zero length at the end of the input string.
1109         int32_t                 i;
1110         UErrorCode          status=U_ZERO_ERROR;
1111         RegexMatcher        m(".?", 0, status);
1112         REGEX_CHECK_STATUS;
1113         UnicodeString s("    ");
1114         m.reset(s);
1115         for (i=0; ; i++) {
1116             if (m.find() == FALSE) {
1117                 break;
1118             }
1119             REGEX_ASSERT(m.start(status) == i);
1120             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1121         }
1122         REGEX_ASSERT(i==5);
1123     }
1124 
1125 
1126     //
1127     // Matchers with no input string behave as if they had an empty input string.
1128     //
1129 
1130     {
1131         UErrorCode status = U_ZERO_ERROR;
1132         RegexMatcher  m(".?", 0, status);
1133         REGEX_CHECK_STATUS;
1134         REGEX_ASSERT(m.find());
1135         REGEX_ASSERT(m.start(status) == 0);
1136         REGEX_ASSERT(m.input() == "");
1137     }
1138     {
1139         UErrorCode status = U_ZERO_ERROR;
1140         RegexPattern  *p = RegexPattern::compile(".", 0, status);
1141         RegexMatcher  *m = p->matcher(status);
1142         REGEX_CHECK_STATUS;
1143 
1144         REGEX_ASSERT(m->find() == FALSE);
1145         REGEX_ASSERT(m->input() == "");
1146         delete m;
1147         delete p;
1148     }
1149 
1150     //
1151     // Regions
1152     //
1153     {
1154         UErrorCode status = U_ZERO_ERROR;
1155         UnicodeString testString("This is test data");
1156         RegexMatcher m(".*", testString,  0, status);
1157         REGEX_CHECK_STATUS;
1158         REGEX_ASSERT(m.regionStart() == 0);
1159         REGEX_ASSERT(m.regionEnd() == testString.length());
1160         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1161         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1162 
1163         m.region(2,4, status);
1164         REGEX_CHECK_STATUS;
1165         REGEX_ASSERT(m.matches(status));
1166         REGEX_ASSERT(m.start(status)==2);
1167         REGEX_ASSERT(m.end(status)==4);
1168         REGEX_CHECK_STATUS;
1169 
1170         m.reset();
1171         REGEX_ASSERT(m.regionStart() == 0);
1172         REGEX_ASSERT(m.regionEnd() == testString.length());
1173 
1174         UnicodeString shorterString("short");
1175         m.reset(shorterString);
1176         REGEX_ASSERT(m.regionStart() == 0);
1177         REGEX_ASSERT(m.regionEnd() == shorterString.length());
1178 
1179         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1180         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1181         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1182         REGEX_ASSERT(&m == &m.reset());
1183         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1184 
1185         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1186         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1187         REGEX_ASSERT(&m == &m.reset());
1188         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1189 
1190         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1191         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1192         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1193         REGEX_ASSERT(&m == &m.reset());
1194         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1195 
1196         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1197         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1198         REGEX_ASSERT(&m == &m.reset());
1199         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1200 
1201     }
1202 
1203     //
1204     // hitEnd() and requireEnd()
1205     //
1206     {
1207         UErrorCode status = U_ZERO_ERROR;
1208         UnicodeString testString("aabb");
1209         RegexMatcher m1(".*", testString,  0, status);
1210         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1211         REGEX_ASSERT(m1.hitEnd() == TRUE);
1212         REGEX_ASSERT(m1.requireEnd() == FALSE);
1213         REGEX_CHECK_STATUS;
1214 
1215         status = U_ZERO_ERROR;
1216         RegexMatcher m2("a*", testString, 0, status);
1217         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1218         REGEX_ASSERT(m2.hitEnd() == FALSE);
1219         REGEX_ASSERT(m2.requireEnd() == FALSE);
1220         REGEX_CHECK_STATUS;
1221 
1222         status = U_ZERO_ERROR;
1223         RegexMatcher m3(".*$", testString, 0, status);
1224         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1225         REGEX_ASSERT(m3.hitEnd() == TRUE);
1226         REGEX_ASSERT(m3.requireEnd() == TRUE);
1227         REGEX_CHECK_STATUS;
1228     }
1229 
1230 
1231     //
1232     // Compilation error on reset with UChar *
1233     //   These were a hazard that people were stumbling over with runtime errors.
1234     //   Changed them to compiler errors by adding private methods that more closely
1235     //   matched the incorrect use of the functions.
1236     //
1237 #if 0
1238     {
1239         UErrorCode status = U_ZERO_ERROR;
1240         UChar ucharString[20];
1241         RegexMatcher m(".", 0, status);
1242         m.reset(ucharString);  // should not compile.
1243 
1244         RegexPattern *p = RegexPattern::compile(".", 0, status);
1245         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1246 
1247         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1248     }
1249 #endif
1250 
1251     //
1252     //  Time Outs.
1253     //       Note:  These tests will need to be changed when the regexp engine is
1254     //              able to detect and cut short the exponential time behavior on
1255     //              this type of match.
1256     //
1257     {
1258         UErrorCode status = U_ZERO_ERROR;
1259         //    Enough 'a's in the string to cause the match to time out.
1260         //       (Each on additonal 'a' doubles the time)
1261         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1262         RegexMatcher matcher("(a+)+b", testString, 0, status);
1263         REGEX_CHECK_STATUS;
1264         REGEX_ASSERT(matcher.getTimeLimit() == 0);
1265         matcher.setTimeLimit(100, status);
1266         REGEX_ASSERT(matcher.getTimeLimit() == 100);
1267         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1268         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1269     }
1270     {
1271         UErrorCode status = U_ZERO_ERROR;
1272         //   Few enough 'a's to slip in under the time limit.
1273         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1274         RegexMatcher matcher("(a+)+b", testString, 0, status);
1275         REGEX_CHECK_STATUS;
1276         matcher.setTimeLimit(100, status);
1277         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1278         REGEX_CHECK_STATUS;
1279     }
1280 
1281     //
1282     //  Stack Limits
1283     //
1284     {
1285         UErrorCode status = U_ZERO_ERROR;
1286         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1287 
1288         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1289         //   of the '+', and makes the stack frames larger.
1290         RegexMatcher matcher("(A)+A$", testString, 0, status);
1291 
1292         // With the default stack, this match should fail to run
1293         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1294         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1295 
1296         // With unlimited stack, it should run
1297         status = U_ZERO_ERROR;
1298         matcher.setStackLimit(0, status);
1299         REGEX_CHECK_STATUS;
1300         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1301         REGEX_CHECK_STATUS;
1302         REGEX_ASSERT(matcher.getStackLimit() == 0);
1303 
1304         // With a limited stack, it the match should fail
1305         status = U_ZERO_ERROR;
1306         matcher.setStackLimit(10000, status);
1307         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1308         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1309         REGEX_ASSERT(matcher.getStackLimit() == 10000);
1310     }
1311 
1312         // A pattern that doesn't save state should work with
1313         //   a minimal sized stack
1314     {
1315         UErrorCode status = U_ZERO_ERROR;
1316         UnicodeString testString = "abc";
1317         RegexMatcher matcher("abc", testString, 0, status);
1318         REGEX_CHECK_STATUS;
1319         matcher.setStackLimit(30, status);
1320         REGEX_CHECK_STATUS;
1321         REGEX_ASSERT(matcher.matches(status) == TRUE);
1322         REGEX_CHECK_STATUS;
1323         REGEX_ASSERT(matcher.getStackLimit() == 30);
1324 
1325         // Negative stack sizes should fail
1326         status = U_ZERO_ERROR;
1327         matcher.setStackLimit(1000, status);
1328         REGEX_CHECK_STATUS;
1329         matcher.setStackLimit(-1, status);
1330         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1331         REGEX_ASSERT(matcher.getStackLimit() == 1000);
1332     }
1333 
1334 
1335 }
1336 
1337 
1338 
1339 
1340 
1341 
1342 //---------------------------------------------------------------------------
1343 //
1344 //      API_Replace        API test for class RegexMatcher, testing the
1345 //                         Replace family of functions.
1346 //
1347 //---------------------------------------------------------------------------
API_Replace()1348 void RegexTest::API_Replace() {
1349     //
1350     //  Replace
1351     //
1352     int32_t             flags=0;
1353     UParseError         pe;
1354     UErrorCode          status=U_ZERO_ERROR;
1355 
1356     UnicodeString       re("abc");
1357     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1358     REGEX_CHECK_STATUS;
1359     UnicodeString data = ".abc..abc...abc..";
1360     //                    012345678901234567
1361     RegexMatcher *matcher = pat->matcher(data, status);
1362 
1363     //
1364     //  Plain vanilla matches.
1365     //
1366     UnicodeString  dest;
1367     dest = matcher->replaceFirst("yz", status);
1368     REGEX_CHECK_STATUS;
1369     REGEX_ASSERT(dest == ".yz..abc...abc..");
1370 
1371     dest = matcher->replaceAll("yz", status);
1372     REGEX_CHECK_STATUS;
1373     REGEX_ASSERT(dest == ".yz..yz...yz..");
1374 
1375     //
1376     //  Plain vanilla non-matches.
1377     //
1378     UnicodeString d2 = ".abx..abx...abx..";
1379     matcher->reset(d2);
1380     dest = matcher->replaceFirst("yz", status);
1381     REGEX_CHECK_STATUS;
1382     REGEX_ASSERT(dest == ".abx..abx...abx..");
1383 
1384     dest = matcher->replaceAll("yz", status);
1385     REGEX_CHECK_STATUS;
1386     REGEX_ASSERT(dest == ".abx..abx...abx..");
1387 
1388     //
1389     // Empty source string
1390     //
1391     UnicodeString d3 = "";
1392     matcher->reset(d3);
1393     dest = matcher->replaceFirst("yz", status);
1394     REGEX_CHECK_STATUS;
1395     REGEX_ASSERT(dest == "");
1396 
1397     dest = matcher->replaceAll("yz", status);
1398     REGEX_CHECK_STATUS;
1399     REGEX_ASSERT(dest == "");
1400 
1401     //
1402     // Empty substitution string
1403     //
1404     matcher->reset(data);              // ".abc..abc...abc.."
1405     dest = matcher->replaceFirst("", status);
1406     REGEX_CHECK_STATUS;
1407     REGEX_ASSERT(dest == "...abc...abc..");
1408 
1409     dest = matcher->replaceAll("", status);
1410     REGEX_CHECK_STATUS;
1411     REGEX_ASSERT(dest == "........");
1412 
1413     //
1414     // match whole string
1415     //
1416     UnicodeString d4 = "abc";
1417     matcher->reset(d4);
1418     dest = matcher->replaceFirst("xyz", status);
1419     REGEX_CHECK_STATUS;
1420     REGEX_ASSERT(dest == "xyz");
1421 
1422     dest = matcher->replaceAll("xyz", status);
1423     REGEX_CHECK_STATUS;
1424     REGEX_ASSERT(dest == "xyz");
1425 
1426     //
1427     // Capture Group, simple case
1428     //
1429     UnicodeString       re2("a(..)");
1430     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1431     REGEX_CHECK_STATUS;
1432     UnicodeString d5 = "abcdefg";
1433     RegexMatcher *matcher2 = pat2->matcher(d5, status);
1434     REGEX_CHECK_STATUS;
1435     dest = matcher2->replaceFirst("$1$1", status);
1436     REGEX_CHECK_STATUS;
1437     REGEX_ASSERT(dest == "bcbcdefg");
1438 
1439     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1440     REGEX_CHECK_STATUS;
1441     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1442 
1443     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1444     REGEX_ASSERT(U_FAILURE(status));
1445     status = U_ZERO_ERROR;
1446 
1447     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1448     replacement = replacement.unescape();
1449     dest = matcher2->replaceFirst(replacement, status);
1450     REGEX_CHECK_STATUS;
1451     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1452 
1453     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1454 
1455 
1456     //
1457     // Replacement String with \u hex escapes
1458     //
1459     {
1460         UnicodeString  src = "abc 1 abc 2 abc 3";
1461         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1462         matcher->reset(src);
1463         UnicodeString  result = matcher->replaceAll(substitute, status);
1464         REGEX_CHECK_STATUS;
1465         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1466     }
1467     {
1468         UnicodeString  src = "abc !";
1469         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1470         matcher->reset(src);
1471         UnicodeString  result = matcher->replaceAll(substitute, status);
1472         REGEX_CHECK_STATUS;
1473         UnicodeString expected = UnicodeString("--");
1474         expected.append((UChar32)0x10000);
1475         expected.append("-- !");
1476         REGEX_ASSERT(result == expected);
1477     }
1478     // TODO:  need more through testing of capture substitutions.
1479 
1480     // Bug 4057
1481     //
1482     {
1483         status = U_ZERO_ERROR;
1484         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1485         RegexMatcher m("ss(.*?)ee", 0, status);
1486         REGEX_CHECK_STATUS;
1487         UnicodeString result;
1488 
1489         // Multiple finds do NOT bump up the previous appendReplacement postion.
1490         m.reset(s);
1491         m.find();
1492         m.find();
1493         m.appendReplacement(result, "ooh", status);
1494         REGEX_CHECK_STATUS;
1495         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1496 
1497         // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1498         status = U_ZERO_ERROR;
1499         result.truncate(0);
1500         m.reset(10, status);
1501         m.find();
1502         m.find();
1503         m.appendReplacement(result, "ooh", status);
1504         REGEX_CHECK_STATUS;
1505         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1506 
1507         // find() at interior of string, appendReplacemnt still starts at beginning.
1508         status = U_ZERO_ERROR;
1509         result.truncate(0);
1510         m.reset();
1511         m.find(10, status);
1512         m.find();
1513         m.appendReplacement(result, "ooh", status);
1514         REGEX_CHECK_STATUS;
1515         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1516 
1517         m.appendTail(result);
1518         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1519 
1520     }
1521 
1522     delete matcher2;
1523     delete pat2;
1524     delete matcher;
1525     delete pat;
1526 }
1527 
1528 
1529 //---------------------------------------------------------------------------
1530 //
1531 //      API_Pattern       Test that the API for class RegexPattern is
1532 //                        present and nominally working.
1533 //
1534 //---------------------------------------------------------------------------
API_Pattern()1535 void RegexTest::API_Pattern() {
1536     RegexPattern        pata;    // Test default constructor to not crash.
1537     RegexPattern        patb;
1538 
1539     REGEX_ASSERT(pata == patb);
1540     REGEX_ASSERT(pata == pata);
1541 
1542     UnicodeString re1("abc[a-l][m-z]");
1543     UnicodeString re2("def");
1544     UErrorCode    status = U_ZERO_ERROR;
1545     UParseError   pe;
1546 
1547     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1548     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1549     REGEX_CHECK_STATUS;
1550     REGEX_ASSERT(*pat1 == *pat1);
1551     REGEX_ASSERT(*pat1 != pata);
1552 
1553     // Assign
1554     patb = *pat1;
1555     REGEX_ASSERT(patb == *pat1);
1556 
1557     // Copy Construct
1558     RegexPattern patc(*pat1);
1559     REGEX_ASSERT(patc == *pat1);
1560     REGEX_ASSERT(patb == patc);
1561     REGEX_ASSERT(pat1 != pat2);
1562     patb = *pat2;
1563     REGEX_ASSERT(patb != patc);
1564     REGEX_ASSERT(patb == *pat2);
1565 
1566     // Compile with no flags.
1567     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1568     REGEX_ASSERT(*pat1a == *pat1);
1569 
1570     REGEX_ASSERT(pat1a->flags() == 0);
1571 
1572     // Compile with different flags should be not equal
1573     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1574     REGEX_CHECK_STATUS;
1575 
1576     REGEX_ASSERT(*pat1b != *pat1a);
1577     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1578     REGEX_ASSERT(pat1a->flags() == 0);
1579     delete pat1b;
1580 
1581     // clone
1582     RegexPattern *pat1c = pat1->clone();
1583     REGEX_ASSERT(*pat1c == *pat1);
1584     REGEX_ASSERT(*pat1c != *pat2);
1585 
1586     delete pat1c;
1587     delete pat1a;
1588     delete pat1;
1589     delete pat2;
1590 
1591 
1592     //
1593     //   Verify that a matcher created from a cloned pattern works.
1594     //     (Jitterbug 3423)
1595     //
1596     {
1597         UErrorCode     status     = U_ZERO_ERROR;
1598         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1599         RegexPattern  *pClone     = pSource->clone();
1600         delete         pSource;
1601         RegexMatcher  *mFromClone = pClone->matcher(status);
1602         REGEX_CHECK_STATUS;
1603         UnicodeString s = "Hello World";
1604         mFromClone->reset(s);
1605         REGEX_ASSERT(mFromClone->find() == TRUE);
1606         REGEX_ASSERT(mFromClone->group(status) == "Hello");
1607         REGEX_ASSERT(mFromClone->find() == TRUE);
1608         REGEX_ASSERT(mFromClone->group(status) == "World");
1609         REGEX_ASSERT(mFromClone->find() == FALSE);
1610         delete mFromClone;
1611         delete pClone;
1612     }
1613 
1614     //
1615     //   matches convenience API
1616     //
1617     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1618     REGEX_CHECK_STATUS;
1619     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1620     REGEX_CHECK_STATUS;
1621     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1622     REGEX_CHECK_STATUS;
1623     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1624     REGEX_CHECK_STATUS;
1625     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1626     REGEX_CHECK_STATUS;
1627     status = U_INDEX_OUTOFBOUNDS_ERROR;
1628     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1629     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1630 
1631 
1632     //
1633     // Split()
1634     //
1635     status = U_ZERO_ERROR;
1636     pat1 = RegexPattern::compile(" +",  pe, status);
1637     REGEX_CHECK_STATUS;
1638     UnicodeString  fields[10];
1639 
1640     int32_t n;
1641     n = pat1->split("Now is the time", fields, 10, status);
1642     REGEX_CHECK_STATUS;
1643     REGEX_ASSERT(n==4);
1644     REGEX_ASSERT(fields[0]=="Now");
1645     REGEX_ASSERT(fields[1]=="is");
1646     REGEX_ASSERT(fields[2]=="the");
1647     REGEX_ASSERT(fields[3]=="time");
1648     REGEX_ASSERT(fields[4]=="");
1649 
1650     n = pat1->split("Now is the time", fields, 2, status);
1651     REGEX_CHECK_STATUS;
1652     REGEX_ASSERT(n==2);
1653     REGEX_ASSERT(fields[0]=="Now");
1654     REGEX_ASSERT(fields[1]=="is the time");
1655     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1656 
1657     fields[1] = "*";
1658     status = U_ZERO_ERROR;
1659     n = pat1->split("Now is the time", fields, 1, status);
1660     REGEX_CHECK_STATUS;
1661     REGEX_ASSERT(n==1);
1662     REGEX_ASSERT(fields[0]=="Now is the time");
1663     REGEX_ASSERT(fields[1]=="*");
1664     status = U_ZERO_ERROR;
1665 
1666     n = pat1->split("    Now       is the time   ", fields, 10, status);
1667     REGEX_CHECK_STATUS;
1668     REGEX_ASSERT(n==6);
1669     REGEX_ASSERT(fields[0]=="");
1670     REGEX_ASSERT(fields[1]=="Now");
1671     REGEX_ASSERT(fields[2]=="is");
1672     REGEX_ASSERT(fields[3]=="the");
1673     REGEX_ASSERT(fields[4]=="time");
1674     REGEX_ASSERT(fields[5]=="");
1675 
1676     n = pat1->split("     ", fields, 10, status);
1677     REGEX_CHECK_STATUS;
1678     REGEX_ASSERT(n==2);
1679     REGEX_ASSERT(fields[0]=="");
1680     REGEX_ASSERT(fields[1]=="");
1681 
1682     fields[0] = "foo";
1683     n = pat1->split("", fields, 10, status);
1684     REGEX_CHECK_STATUS;
1685     REGEX_ASSERT(n==0);
1686     REGEX_ASSERT(fields[0]=="foo");
1687 
1688     delete pat1;
1689 
1690     //  split, with a pattern with (capture)
1691     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1692     REGEX_CHECK_STATUS;
1693 
1694     status = U_ZERO_ERROR;
1695     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1696     REGEX_CHECK_STATUS;
1697     REGEX_ASSERT(n==7);
1698     REGEX_ASSERT(fields[0]=="");
1699     REGEX_ASSERT(fields[1]=="a");
1700     REGEX_ASSERT(fields[2]=="Now is ");
1701     REGEX_ASSERT(fields[3]=="b");
1702     REGEX_ASSERT(fields[4]=="the time");
1703     REGEX_ASSERT(fields[5]=="c");
1704     REGEX_ASSERT(fields[6]=="");
1705     REGEX_ASSERT(status==U_ZERO_ERROR);
1706 
1707     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1708     REGEX_CHECK_STATUS;
1709     REGEX_ASSERT(n==7);
1710     REGEX_ASSERT(fields[0]=="  ");
1711     REGEX_ASSERT(fields[1]=="a");
1712     REGEX_ASSERT(fields[2]=="Now is ");
1713     REGEX_ASSERT(fields[3]=="b");
1714     REGEX_ASSERT(fields[4]=="the time");
1715     REGEX_ASSERT(fields[5]=="c");
1716     REGEX_ASSERT(fields[6]=="");
1717 
1718     status = U_ZERO_ERROR;
1719     fields[6] = "foo";
1720     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1721     REGEX_CHECK_STATUS;
1722     REGEX_ASSERT(n==6);
1723     REGEX_ASSERT(fields[0]=="  ");
1724     REGEX_ASSERT(fields[1]=="a");
1725     REGEX_ASSERT(fields[2]=="Now is ");
1726     REGEX_ASSERT(fields[3]=="b");
1727     REGEX_ASSERT(fields[4]=="the time");
1728     REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
1729     REGEX_ASSERT(fields[6]=="foo");
1730 
1731     status = U_ZERO_ERROR;
1732     fields[5] = "foo";
1733     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1734     REGEX_CHECK_STATUS;
1735     REGEX_ASSERT(n==5);
1736     REGEX_ASSERT(fields[0]=="  ");
1737     REGEX_ASSERT(fields[1]=="a");
1738     REGEX_ASSERT(fields[2]=="Now is ");
1739     REGEX_ASSERT(fields[3]=="b");
1740     REGEX_ASSERT(fields[4]=="the time<c>");
1741     REGEX_ASSERT(fields[5]=="foo");
1742 
1743     status = U_ZERO_ERROR;
1744     fields[5] = "foo";
1745     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1746     REGEX_CHECK_STATUS;
1747     REGEX_ASSERT(n==5);
1748     REGEX_ASSERT(fields[0]=="  ");
1749     REGEX_ASSERT(fields[1]=="a");
1750     REGEX_ASSERT(fields[2]=="Now is ");
1751     REGEX_ASSERT(fields[3]=="b");
1752     REGEX_ASSERT(fields[4]=="the time");
1753     REGEX_ASSERT(fields[5]=="foo");
1754 
1755     status = U_ZERO_ERROR;
1756     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1757     REGEX_CHECK_STATUS;
1758     REGEX_ASSERT(n==4);
1759     REGEX_ASSERT(fields[0]=="  ");
1760     REGEX_ASSERT(fields[1]=="a");
1761     REGEX_ASSERT(fields[2]=="Now is ");
1762     REGEX_ASSERT(fields[3]=="the time<c>");
1763     status = U_ZERO_ERROR;
1764     delete pat1;
1765 
1766     pat1 = RegexPattern::compile("([-,])",  pe, status);
1767     REGEX_CHECK_STATUS;
1768     n = pat1->split("1-10,20", fields, 10, status);
1769     REGEX_CHECK_STATUS;
1770     REGEX_ASSERT(n==5);
1771     REGEX_ASSERT(fields[0]=="1");
1772     REGEX_ASSERT(fields[1]=="-");
1773     REGEX_ASSERT(fields[2]=="10");
1774     REGEX_ASSERT(fields[3]==",");
1775     REGEX_ASSERT(fields[4]=="20");
1776     delete pat1;
1777 
1778     // Test split of string with empty trailing fields
1779     pat1 = RegexPattern::compile(",", pe, status);
1780     REGEX_CHECK_STATUS;
1781     n = pat1->split("a,b,c,", fields, 10, status);
1782     REGEX_CHECK_STATUS;
1783     REGEX_ASSERT(n==4);
1784     REGEX_ASSERT(fields[0]=="a");
1785     REGEX_ASSERT(fields[1]=="b");
1786     REGEX_ASSERT(fields[2]=="c");
1787     REGEX_ASSERT(fields[3]=="");
1788 
1789     n = pat1->split("a,,,", fields, 10, status);
1790     REGEX_CHECK_STATUS;
1791     REGEX_ASSERT(n==4);
1792     REGEX_ASSERT(fields[0]=="a");
1793     REGEX_ASSERT(fields[1]=="");
1794     REGEX_ASSERT(fields[2]=="");
1795     REGEX_ASSERT(fields[3]=="");
1796     delete pat1;
1797 
1798     // Split Separator with zero length match.
1799     pat1 = RegexPattern::compile(":?", pe, status);
1800     REGEX_CHECK_STATUS;
1801     n = pat1->split("abc", fields, 10, status);
1802     REGEX_CHECK_STATUS;
1803     REGEX_ASSERT(n==5);
1804     REGEX_ASSERT(fields[0]=="");
1805     REGEX_ASSERT(fields[1]=="a");
1806     REGEX_ASSERT(fields[2]=="b");
1807     REGEX_ASSERT(fields[3]=="c");
1808     REGEX_ASSERT(fields[4]=="");
1809 
1810     delete pat1;
1811 
1812     //
1813     // RegexPattern::pattern()
1814     //
1815     pat1 = new RegexPattern();
1816     REGEX_ASSERT(pat1->pattern() == "");
1817     delete pat1;
1818 
1819     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1820     REGEX_CHECK_STATUS;
1821     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1822     delete pat1;
1823 
1824 
1825     //
1826     // classID functions
1827     //
1828     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1829     REGEX_CHECK_STATUS;
1830     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1831     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1832     UnicodeString Hello("Hello, world.");
1833     RegexMatcher *m = pat1->matcher(Hello, status);
1834     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1835     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1836     REGEX_ASSERT(m->getDynamicClassID() != NULL);
1837     delete m;
1838     delete pat1;
1839 
1840 }
1841 
1842 //---------------------------------------------------------------------------
1843 //
1844 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1845 //                       is present and working, but excluding functions
1846 //                       implementing replace operations.
1847 //
1848 //---------------------------------------------------------------------------
API_Match_UTF8()1849 void RegexTest::API_Match_UTF8() {
1850     UParseError         pe;
1851     UErrorCode          status=U_ZERO_ERROR;
1852     int32_t             flags = 0;
1853 
1854     //
1855     // Debug - slide failing test cases early
1856     //
1857 #if 0
1858     {
1859     }
1860     return;
1861 #endif
1862 
1863     //
1864     // Simple pattern compilation
1865     //
1866     {
1867         UText               re = UTEXT_INITIALIZER;
1868         regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1869         REGEX_VERBOSE_TEXT(&re);
1870         RegexPattern        *pat2;
1871         pat2 = RegexPattern::compile(&re, flags, pe, status);
1872         REGEX_CHECK_STATUS;
1873 
1874         UText input1 = UTEXT_INITIALIZER;
1875         UText input2 = UTEXT_INITIALIZER;
1876         UText empty  = UTEXT_INITIALIZER;
1877         regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1878         REGEX_VERBOSE_TEXT(&input1);
1879         regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1880         REGEX_VERBOSE_TEXT(&input2);
1881         utext_openUChars(&empty, NULL, 0, &status);
1882 
1883         int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1884         int32_t input2Len = strlen("not abc");
1885 
1886 
1887         //
1888         // Matcher creation and reset.
1889         //
1890         RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1891         REGEX_CHECK_STATUS;
1892         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1893         const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1894         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1895         m1->reset(&input2);
1896         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1897         const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1898         REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1899         m1->reset(&input1);
1900         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1901         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1902         m1->reset(&empty);
1903         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1904         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1905 
1906         //
1907         //  reset(pos, status)
1908         //
1909         m1->reset(&input1);
1910         m1->reset(4, status);
1911         REGEX_CHECK_STATUS;
1912         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1913         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1914 
1915         m1->reset(-1, status);
1916         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1917         status = U_ZERO_ERROR;
1918 
1919         m1->reset(0, status);
1920         REGEX_CHECK_STATUS;
1921         status = U_ZERO_ERROR;
1922 
1923         m1->reset(input1Len-1, status);
1924         REGEX_CHECK_STATUS;
1925         status = U_ZERO_ERROR;
1926 
1927         m1->reset(input1Len, status);
1928         REGEX_CHECK_STATUS;
1929         status = U_ZERO_ERROR;
1930 
1931         m1->reset(input1Len+1, status);
1932         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1933         status = U_ZERO_ERROR;
1934 
1935         //
1936         // match(pos, status)
1937         //
1938         m1->reset(&input2);
1939         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1940         m1->reset();
1941         REGEX_ASSERT(m1->matches(3, status) == FALSE);
1942         m1->reset();
1943         REGEX_ASSERT(m1->matches(5, status) == FALSE);
1944         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1945         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1946         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1947 
1948         // Match() at end of string should fail, but should not
1949         //  be an error.
1950         status = U_ZERO_ERROR;
1951         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1952         REGEX_CHECK_STATUS;
1953 
1954         // Match beyond end of string should fail with an error.
1955         status = U_ZERO_ERROR;
1956         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1957         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1958 
1959         // Successful match at end of string.
1960         {
1961             status = U_ZERO_ERROR;
1962             RegexMatcher m("A?", 0, status);  // will match zero length string.
1963             REGEX_CHECK_STATUS;
1964             m.reset(&input1);
1965             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1966             REGEX_CHECK_STATUS;
1967             m.reset(&empty);
1968             REGEX_ASSERT(m.matches(0, status) == TRUE);
1969             REGEX_CHECK_STATUS;
1970         }
1971 
1972 
1973         //
1974         // lookingAt(pos, status)
1975         //
1976         status = U_ZERO_ERROR;
1977         m1->reset(&input2);  // "not abc"
1978         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1979         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1980         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1981         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1982         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1983         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1984         status = U_ZERO_ERROR;
1985         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1986         REGEX_CHECK_STATUS;
1987         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1988         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1989 
1990         delete m1;
1991         delete pat2;
1992 
1993         utext_close(&re);
1994         utext_close(&input1);
1995         utext_close(&input2);
1996         utext_close(&empty);
1997     }
1998 
1999 
2000     //
2001     // Capture Group.
2002     //     RegexMatcher::start();
2003     //     RegexMatcher::end();
2004     //     RegexMatcher::groupCount();
2005     //
2006     {
2007         int32_t             flags=0;
2008         UParseError         pe;
2009         UErrorCode          status=U_ZERO_ERROR;
2010         UText               re=UTEXT_INITIALIZER;
2011         const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
2012         utext_openUTF8(&re, str_01234567_pat, -1, &status);
2013 
2014         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2015         REGEX_CHECK_STATUS;
2016 
2017         UText input = UTEXT_INITIALIZER;
2018         const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2019         utext_openUTF8(&input, str_0123456789, -1, &status);
2020 
2021         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2022         REGEX_CHECK_STATUS;
2023         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
2024         static const int32_t matchStarts[] = {0,  2, 4, 8};
2025         static const int32_t matchEnds[]   = {10, 8, 6, 10};
2026         int32_t i;
2027         for (i=0; i<4; i++) {
2028             int32_t actualStart = matcher->start(i, status);
2029             REGEX_CHECK_STATUS;
2030             if (actualStart != matchStarts[i]) {
2031                 errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
2032                       __FILE__, __LINE__, i, matchStarts[i], actualStart);
2033             }
2034             int32_t actualEnd = matcher->end(i, status);
2035             REGEX_CHECK_STATUS;
2036             if (actualEnd != matchEnds[i]) {
2037                 errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
2038                       __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2039             }
2040         }
2041 
2042         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2043         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2044 
2045         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2046         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2047         matcher->reset();
2048         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
2049 
2050         matcher->lookingAt(status);
2051 
2052         UnicodeString dest;
2053         UText destText = UTEXT_INITIALIZER;
2054         utext_openUnicodeString(&destText, &dest, &status);
2055         UText *result;
2056         //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2057         //  Test shallow-clone API
2058         int64_t   group_len;
2059         result = matcher->group((UText *)NULL, group_len, status);
2060         REGEX_CHECK_STATUS;
2061         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2062         utext_close(result);
2063         result = matcher->group(0, &destText, group_len, status);
2064         REGEX_CHECK_STATUS;
2065         REGEX_ASSERT(result == &destText);
2066         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2067         //  destText is now immutable, reopen it
2068         utext_close(&destText);
2069         utext_openUnicodeString(&destText, &dest, &status);
2070 
2071         int64_t length;
2072         result = matcher->group(0, NULL, length, status);
2073         REGEX_CHECK_STATUS;
2074         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2075         utext_close(result);
2076         result = matcher->group(0, &destText, length, status);
2077         REGEX_CHECK_STATUS;
2078         REGEX_ASSERT(result == &destText);
2079         REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2080         REGEX_ASSERT(length == 10);
2081         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2082 
2083         // Capture Group 1 == "234567"
2084         result = matcher->group(1, NULL, length, status);
2085         REGEX_CHECK_STATUS;
2086         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2087         REGEX_ASSERT(length == 6);
2088         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2089         utext_close(result);
2090 
2091         result = matcher->group(1, &destText, length, status);
2092         REGEX_CHECK_STATUS;
2093         REGEX_ASSERT(result == &destText);
2094         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2095         REGEX_ASSERT(length == 6);
2096         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2097         utext_close(result);
2098 
2099         // Capture Group 2 == "45"
2100         result = matcher->group(2, NULL, length, status);
2101         REGEX_CHECK_STATUS;
2102         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2103         REGEX_ASSERT(length == 2);
2104         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2105         utext_close(result);
2106 
2107         result = matcher->group(2, &destText, length, status);
2108         REGEX_CHECK_STATUS;
2109         REGEX_ASSERT(result == &destText);
2110         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2111         REGEX_ASSERT(length == 2);
2112         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2113         utext_close(result);
2114 
2115         // Capture Group 3 == "89"
2116         result = matcher->group(3, NULL, length, status);
2117         REGEX_CHECK_STATUS;
2118         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2119         REGEX_ASSERT(length == 2);
2120         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2121         utext_close(result);
2122 
2123         result = matcher->group(3, &destText, length, status);
2124         REGEX_CHECK_STATUS;
2125         REGEX_ASSERT(result == &destText);
2126         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2127         REGEX_ASSERT(length == 2);
2128         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2129         utext_close(result);
2130 
2131         // Capture Group number out of range.
2132         status = U_ZERO_ERROR;
2133         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2134         status = U_ZERO_ERROR;
2135         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2136         status = U_ZERO_ERROR;
2137         matcher->reset();
2138         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2139 
2140         delete matcher;
2141         delete pat;
2142 
2143         utext_close(&destText);
2144         utext_close(&input);
2145         utext_close(&re);
2146     }
2147 
2148     //
2149     //  find
2150     //
2151     {
2152         int32_t             flags=0;
2153         UParseError         pe;
2154         UErrorCode          status=U_ZERO_ERROR;
2155         UText               re=UTEXT_INITIALIZER;
2156         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2157         utext_openUTF8(&re, str_abc, -1, &status);
2158 
2159         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2160         REGEX_CHECK_STATUS;
2161         UText input = UTEXT_INITIALIZER;
2162         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2163         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2164         //                      012345678901234567
2165 
2166         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2167         REGEX_CHECK_STATUS;
2168         REGEX_ASSERT(matcher->find());
2169         REGEX_ASSERT(matcher->start(status) == 1);
2170         REGEX_ASSERT(matcher->find());
2171         REGEX_ASSERT(matcher->start(status) == 6);
2172         REGEX_ASSERT(matcher->find());
2173         REGEX_ASSERT(matcher->start(status) == 12);
2174         REGEX_ASSERT(matcher->find() == FALSE);
2175         REGEX_ASSERT(matcher->find() == FALSE);
2176 
2177         matcher->reset();
2178         REGEX_ASSERT(matcher->find());
2179         REGEX_ASSERT(matcher->start(status) == 1);
2180 
2181         REGEX_ASSERT(matcher->find(0, status));
2182         REGEX_ASSERT(matcher->start(status) == 1);
2183         REGEX_ASSERT(matcher->find(1, status));
2184         REGEX_ASSERT(matcher->start(status) == 1);
2185         REGEX_ASSERT(matcher->find(2, status));
2186         REGEX_ASSERT(matcher->start(status) == 6);
2187         REGEX_ASSERT(matcher->find(12, status));
2188         REGEX_ASSERT(matcher->start(status) == 12);
2189         REGEX_ASSERT(matcher->find(13, status) == FALSE);
2190         REGEX_ASSERT(matcher->find(16, status) == FALSE);
2191         REGEX_ASSERT(matcher->find(17, status) == FALSE);
2192         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2193 
2194         status = U_ZERO_ERROR;
2195         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2196         status = U_ZERO_ERROR;
2197         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2198 
2199         REGEX_ASSERT(matcher->groupCount() == 0);
2200 
2201         delete matcher;
2202         delete pat;
2203 
2204         utext_close(&input);
2205         utext_close(&re);
2206     }
2207 
2208 
2209     //
2210     //  find, with \G in pattern (true if at the end of a previous match).
2211     //
2212     {
2213         int32_t             flags=0;
2214         UParseError         pe;
2215         UErrorCode          status=U_ZERO_ERROR;
2216         UText               re=UTEXT_INITIALIZER;
2217         const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2218         utext_openUTF8(&re, str_Gabcabc, -1, &status);
2219 
2220         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2221 
2222         REGEX_CHECK_STATUS;
2223         UText input = UTEXT_INITIALIZER;
2224         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2225         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2226         //                      012345678901234567
2227 
2228         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2229         REGEX_CHECK_STATUS;
2230         REGEX_ASSERT(matcher->find());
2231         REGEX_ASSERT(matcher->start(status) == 0);
2232         REGEX_ASSERT(matcher->start(1, status) == -1);
2233         REGEX_ASSERT(matcher->start(2, status) == 1);
2234 
2235         REGEX_ASSERT(matcher->find());
2236         REGEX_ASSERT(matcher->start(status) == 4);
2237         REGEX_ASSERT(matcher->start(1, status) == 4);
2238         REGEX_ASSERT(matcher->start(2, status) == -1);
2239         REGEX_CHECK_STATUS;
2240 
2241         delete matcher;
2242         delete pat;
2243 
2244         utext_close(&input);
2245         utext_close(&re);
2246     }
2247 
2248     //
2249     //   find with zero length matches, match position should bump ahead
2250     //     to prevent loops.
2251     //
2252     {
2253         int32_t                 i;
2254         UErrorCode          status=U_ZERO_ERROR;
2255         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
2256                                                       //   using an always-true look-ahead.
2257         REGEX_CHECK_STATUS;
2258         UText s = UTEXT_INITIALIZER;
2259         utext_openUTF8(&s, "    ", -1, &status);
2260         m.reset(&s);
2261         for (i=0; ; i++) {
2262             if (m.find() == FALSE) {
2263                 break;
2264             }
2265             REGEX_ASSERT(m.start(status) == i);
2266             REGEX_ASSERT(m.end(status) == i);
2267         }
2268         REGEX_ASSERT(i==5);
2269 
2270         // Check that the bump goes over characters outside the BMP OK
2271         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2272         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2273         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2274         m.reset(&s);
2275         for (i=0; ; i+=4) {
2276             if (m.find() == FALSE) {
2277                 break;
2278             }
2279             REGEX_ASSERT(m.start(status) == i);
2280             REGEX_ASSERT(m.end(status) == i);
2281         }
2282         REGEX_ASSERT(i==20);
2283 
2284         utext_close(&s);
2285     }
2286     {
2287         // find() loop breaking test.
2288         //        with pattern of /.?/, should see a series of one char matches, then a single
2289         //        match of zero length at the end of the input string.
2290         int32_t                 i;
2291         UErrorCode          status=U_ZERO_ERROR;
2292         RegexMatcher        m(".?", 0, status);
2293         REGEX_CHECK_STATUS;
2294         UText s = UTEXT_INITIALIZER;
2295         utext_openUTF8(&s, "    ", -1, &status);
2296         m.reset(&s);
2297         for (i=0; ; i++) {
2298             if (m.find() == FALSE) {
2299                 break;
2300             }
2301             REGEX_ASSERT(m.start(status) == i);
2302             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2303         }
2304         REGEX_ASSERT(i==5);
2305 
2306         utext_close(&s);
2307     }
2308 
2309 
2310     //
2311     // Matchers with no input string behave as if they had an empty input string.
2312     //
2313 
2314     {
2315         UErrorCode status = U_ZERO_ERROR;
2316         RegexMatcher  m(".?", 0, status);
2317         REGEX_CHECK_STATUS;
2318         REGEX_ASSERT(m.find());
2319         REGEX_ASSERT(m.start(status) == 0);
2320         REGEX_ASSERT(m.input() == "");
2321     }
2322     {
2323         UErrorCode status = U_ZERO_ERROR;
2324         RegexPattern  *p = RegexPattern::compile(".", 0, status);
2325         RegexMatcher  *m = p->matcher(status);
2326         REGEX_CHECK_STATUS;
2327 
2328         REGEX_ASSERT(m->find() == FALSE);
2329         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2330         delete m;
2331         delete p;
2332     }
2333 
2334     //
2335     // Regions
2336     //
2337     {
2338         UErrorCode status = U_ZERO_ERROR;
2339         UText testPattern = UTEXT_INITIALIZER;
2340         UText testText    = UTEXT_INITIALIZER;
2341         regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2342         REGEX_VERBOSE_TEXT(&testPattern);
2343         regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2344         REGEX_VERBOSE_TEXT(&testText);
2345 
2346         RegexMatcher m(&testPattern, &testText, 0, status);
2347         REGEX_CHECK_STATUS;
2348         REGEX_ASSERT(m.regionStart() == 0);
2349         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2350         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2351         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2352 
2353         m.region(2,4, status);
2354         REGEX_CHECK_STATUS;
2355         REGEX_ASSERT(m.matches(status));
2356         REGEX_ASSERT(m.start(status)==2);
2357         REGEX_ASSERT(m.end(status)==4);
2358         REGEX_CHECK_STATUS;
2359 
2360         m.reset();
2361         REGEX_ASSERT(m.regionStart() == 0);
2362         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2363 
2364         regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2365         REGEX_VERBOSE_TEXT(&testText);
2366         m.reset(&testText);
2367         REGEX_ASSERT(m.regionStart() == 0);
2368         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2369 
2370         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2371         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2372         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2373         REGEX_ASSERT(&m == &m.reset());
2374         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2375 
2376         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2377         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2378         REGEX_ASSERT(&m == &m.reset());
2379         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2380 
2381         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2382         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2383         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2384         REGEX_ASSERT(&m == &m.reset());
2385         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2386 
2387         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2388         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2389         REGEX_ASSERT(&m == &m.reset());
2390         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2391 
2392         utext_close(&testText);
2393         utext_close(&testPattern);
2394     }
2395 
2396     //
2397     // hitEnd() and requireEnd()
2398     //
2399     {
2400         UErrorCode status = U_ZERO_ERROR;
2401         UText testPattern = UTEXT_INITIALIZER;
2402         UText testText    = UTEXT_INITIALIZER;
2403         const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2404         const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2405         utext_openUTF8(&testPattern, str_, -1, &status);
2406         utext_openUTF8(&testText, str_aabb, -1, &status);
2407 
2408         RegexMatcher m1(&testPattern, &testText,  0, status);
2409         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2410         REGEX_ASSERT(m1.hitEnd() == TRUE);
2411         REGEX_ASSERT(m1.requireEnd() == FALSE);
2412         REGEX_CHECK_STATUS;
2413 
2414         status = U_ZERO_ERROR;
2415         const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2416         utext_openUTF8(&testPattern, str_a, -1, &status);
2417         RegexMatcher m2(&testPattern, &testText, 0, status);
2418         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2419         REGEX_ASSERT(m2.hitEnd() == FALSE);
2420         REGEX_ASSERT(m2.requireEnd() == FALSE);
2421         REGEX_CHECK_STATUS;
2422 
2423         status = U_ZERO_ERROR;
2424         const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2425         utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2426         RegexMatcher m3(&testPattern, &testText, 0, status);
2427         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2428         REGEX_ASSERT(m3.hitEnd() == TRUE);
2429         REGEX_ASSERT(m3.requireEnd() == TRUE);
2430         REGEX_CHECK_STATUS;
2431 
2432         utext_close(&testText);
2433         utext_close(&testPattern);
2434     }
2435 }
2436 
2437 
2438 //---------------------------------------------------------------------------
2439 //
2440 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
2441 //                         Replace family of functions.
2442 //
2443 //---------------------------------------------------------------------------
API_Replace_UTF8()2444 void RegexTest::API_Replace_UTF8() {
2445     //
2446     //  Replace
2447     //
         //  Exercises the UText overloads of RegexMatcher::replaceFirst() and
         //  replaceAll() on UTF-8 input, followed by appendReplacement() and
         //  appendTail() (Bug 4057 section).  Most replace calls are made twice:
         //    - with a NULL destination: the returned UText is checked and then
         //      closed here with utext_close();
         //    - with &destText as destination: the returned pointer is asserted
         //      to be &destText and its contents are checked.
         //  Test strings are spelled out as hex byte values, with the intended
         //  text in a trailing comment (see the ASCII-assumption note at the top
         //  of this file).
         //
         //  NOTE(review): the REGEX_* macros are defined outside this function and
         //  take no status argument, so they presumably refer to the local
         //  variable named `status`.  If REGEX_CHECK_STATUS returns early on
         //  failure, the cleanup at the end of this function is skipped -- confirm
         //  against the macro definitions.
         //
2448     int32_t             flags=0;
2449     UParseError         pe;
2450     UErrorCode          status=U_ZERO_ERROR;
2451
2452     UText               re=UTEXT_INITIALIZER;
2453     regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2454     REGEX_VERBOSE_TEXT(&re);
2455     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2456     REGEX_CHECK_STATUS;
2457
2458     char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2459     //             012345678901234567
2460     UText dataText = UTEXT_INITIALIZER;
2461     utext_openUTF8(&dataText, data, -1, &status);
2462     REGEX_CHECK_STATUS;
2463     REGEX_VERBOSE_TEXT(&dataText);
         // The matcher obtained from pat->matcher() is deleted at the end of this function.
2464     RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2465
2466     //
2467     //  Plain vanilla matches.
2468     //
         // destText wraps the UnicodeString dest and is the caller-supplied
         // destination used throughout this function.
2469     UnicodeString  dest;
2470     UText destText = UTEXT_INITIALIZER;
2471     utext_openUnicodeString(&destText, &dest, &status);
2472     UText *result;
2473
2474     UText replText = UTEXT_INITIALIZER;
2475
2476     const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2477     utext_openUTF8(&replText, str_yz, -1, &status);
2478     REGEX_VERBOSE_TEXT(&replText);
2479     result = matcher->replaceFirst(&replText, NULL, status);
2480     REGEX_CHECK_STATUS;
2481     const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2482     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2483     utext_close(result);
2484     result = matcher->replaceFirst(&replText, &destText, status);
2485     REGEX_CHECK_STATUS;
2486     REGEX_ASSERT(result == &destText);
2487     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2488
2489     result = matcher->replaceAll(&replText, NULL, status);
2490     REGEX_CHECK_STATUS;
2491     const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2492     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2493     utext_close(result);
2494
         // Empty destText (replace its whole native range with nothing) before reusing it.
2495     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2496     result = matcher->replaceAll(&replText, &destText, status);
2497     REGEX_CHECK_STATUS;
2498     REGEX_ASSERT(result == &destText);
2499     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2500
2501     //
2502     //  Plain vanilla non-matches.
2503     //
         //  The pattern "abc" does not occur in the input, so every result is
         //  expected to equal the input text.
2504     const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2505     utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2506     matcher->reset(&dataText);
2507
2508     result = matcher->replaceFirst(&replText, NULL, status);
2509     REGEX_CHECK_STATUS;
2510     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2511     utext_close(result);
         // NOTE(review): destText is not emptied before this call; the assertion
         // below still expects it to hold exactly the input text.
2512     result = matcher->replaceFirst(&replText, &destText, status);
2513     REGEX_CHECK_STATUS;
2514     REGEX_ASSERT(result == &destText);
2515     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2516
2517     result = matcher->replaceAll(&replText, NULL, status);
2518     REGEX_CHECK_STATUS;
2519     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2520     utext_close(result);
2521     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2522     result = matcher->replaceAll(&replText, &destText, status);
2523     REGEX_CHECK_STATUS;
2524     REGEX_ASSERT(result == &destText);
2525     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2526
2527     //
2528     // Empty source string
2529     //
         // The input is opened from a NULL pointer with length 0.
2530     utext_openUTF8(&dataText, NULL, 0, &status);
2531     matcher->reset(&dataText);
2532
2533     result = matcher->replaceFirst(&replText, NULL, status);
2534     REGEX_CHECK_STATUS;
2535     REGEX_ASSERT_UTEXT_UTF8("", result);
2536     utext_close(result);
2537     result = matcher->replaceFirst(&replText, &destText, status);
2538     REGEX_CHECK_STATUS;
2539     REGEX_ASSERT(result == &destText);
2540     REGEX_ASSERT_UTEXT_UTF8("", result);
2541
2542     result = matcher->replaceAll(&replText, NULL, status);
2543     REGEX_CHECK_STATUS;
2544     REGEX_ASSERT_UTEXT_UTF8("", result);
2545     utext_close(result);
2546     result = matcher->replaceAll(&replText, &destText, status);
2547     REGEX_CHECK_STATUS;
2548     REGEX_ASSERT(result == &destText);
2549     REGEX_ASSERT_UTEXT_UTF8("", result);
2550
2551     //
2552     // Empty substitution string
2553     //
         // Each match of "abc" is expected to be removed from the output.
2554     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2555     matcher->reset(&dataText);
2556
2557     utext_openUTF8(&replText, NULL, 0, &status);
2558     result = matcher->replaceFirst(&replText, NULL, status);
2559     REGEX_CHECK_STATUS;
2560     const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2561     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2562     utext_close(result);
2563     result = matcher->replaceFirst(&replText, &destText, status);
2564     REGEX_CHECK_STATUS;
2565     REGEX_ASSERT(result == &destText);
2566     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2567
2568     result = matcher->replaceAll(&replText, NULL, status);
2569     REGEX_CHECK_STATUS;
2570     const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2571     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2572     utext_close(result);
2573     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2574     result = matcher->replaceAll(&replText, &destText, status);
2575     REGEX_CHECK_STATUS;
2576     REGEX_ASSERT(result == &destText);
2577     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2578
2579     //
2580     // match whole string
2581     //
2582     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2583     utext_openUTF8(&dataText, str_abc, -1, &status);
2584     matcher->reset(&dataText);
2585
2586     const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2587     utext_openUTF8(&replText, str_xyz, -1, &status);
2588     result = matcher->replaceFirst(&replText, NULL, status);
2589     REGEX_CHECK_STATUS;
2590     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2591     utext_close(result);
2592     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2593     result = matcher->replaceFirst(&replText, &destText, status);
2594     REGEX_CHECK_STATUS;
2595     REGEX_ASSERT(result == &destText);
2596     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2597
2598     result = matcher->replaceAll(&replText, NULL, status);
2599     REGEX_CHECK_STATUS;
2600     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2601     utext_close(result);
2602     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2603     result = matcher->replaceAll(&replText, &destText, status);
2604     REGEX_CHECK_STATUS;
2605     REGEX_ASSERT(result == &destText);
2606     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2607
2608     //
2609     // Capture Group, simple case
2610     //
         // A second pattern/matcher pair (pat2/matcher2) with one capture group
         // is used for the $n substitution cases below.
2611     const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2612     utext_openUTF8(&re, str_add, -1, &status);
2613     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2614     REGEX_CHECK_STATUS;
2615
2616     const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2617     utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2618     RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2619     REGEX_CHECK_STATUS;
2620
2621     const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2622     utext_openUTF8(&replText, str_11, -1, &status);
2623     result = matcher2->replaceFirst(&replText, NULL, status);
2624     REGEX_CHECK_STATUS;
2625     const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2626     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2627     utext_close(result);
2628     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2629     result = matcher2->replaceFirst(&replText, &destText, status);
2630     REGEX_CHECK_STATUS;
2631     REGEX_ASSERT(result == &destText);
2632     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2633
         // Backslash-escaped "$1" is expected to come through as a literal "$1";
         // the unescaped "$1" is expected to be replaced by the text of group 1.
2634     const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2635     utext_openUTF8(&replText, str_v, -1, &status);
2636     REGEX_VERBOSE_TEXT(&replText);
2637     result = matcher2->replaceFirst(&replText, NULL, status);
2638     REGEX_CHECK_STATUS;
2639     const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2640     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2641     utext_close(result);
2642     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2643     result = matcher2->replaceFirst(&replText, &destText, status);
2644     REGEX_CHECK_STATUS;
2645     REGEX_ASSERT(result == &destText);
2646     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2647
2648     const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2649                0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2650                0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2651     utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2652     result = matcher2->replaceFirst(&replText, NULL, status);
2653     REGEX_CHECK_STATUS;
2654     const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2655     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2656     utext_close(result);
2657     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2658     result = matcher2->replaceFirst(&replText, &destText, status);
2659     REGEX_CHECK_STATUS;
2660     REGEX_ASSERT(result == &destText);
2661     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2662
2663     unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2664     //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2665     //                                 012345678901234567890123456
         // Overwrite the "xxxx" placeholder (bytes 22..25) with F0 9D 9F 8F,
         // the four-byte UTF-8 encoding of U+1D7CF.
2666     supplDigitChars[22] = 0xF0;
2667     supplDigitChars[23] = 0x9D;
2668     supplDigitChars[24] = 0x9F;
2669     supplDigitChars[25] = 0x8F;
2670     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2671
         // The expected text below shows "$" followed by that supplementary digit
         // being substituted with the contents of capture group 1.
2672     result = matcher2->replaceFirst(&replText, NULL, status);
2673     REGEX_CHECK_STATUS;
2674     const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2675     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2676     utext_close(result);
2677     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2678     result = matcher2->replaceFirst(&replText, &destText, status);
2679     REGEX_CHECK_STATUS;
2680     REGEX_ASSERT(result == &destText);
2681     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
         // The pattern a(..) has a single capture group, so a reference to $5 is
         // expected to fail with U_INDEX_OUTOFBOUNDS_ERROR.
2682     const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5... */
2683     utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2684     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2685 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2686     utext_close(result);
2687     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2688     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2689     REGEX_ASSERT(result == &destText);
2690 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2691
2692     //
2693     // Replacement String with \u hex escapes
2694     //
         // These cases go back to the first matcher (pattern "abc").
2695     {
2696       const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2697       const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2698         utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2699         utext_openUTF8(&replText, str_u0043, -1, &status);
2700         matcher->reset(&dataText);
2701
2702         result = matcher->replaceAll(&replText, NULL, status);
2703         REGEX_CHECK_STATUS;
2704         const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2705         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2706         utext_close(result);
2707         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2708         result = matcher->replaceAll(&replText, &destText, status);
2709         REGEX_CHECK_STATUS;
2710         REGEX_ASSERT(result == &destText);
2711         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2712     }
2713     {
             // NOTE(review): this str_abc ("abc !") shadows the function-scope
             // str_abc ("abc") declared in the "match whole string" section.
2714       const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2715         utext_openUTF8(&dataText, str_abc, -1, &status);
2716         const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2717         utext_openUTF8(&replText, str_U00010000, -1, &status);
2718         matcher->reset(&dataText);
2719
2720         unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2721         //                          0123456789
             // Overwrite the "xxxx" placeholder (bytes 2..5) with F0 90 80 80,
             // the four-byte UTF-8 encoding of U+10000.
2722         expected[2] = 0xF0;
2723         expected[3] = 0x90;
2724         expected[4] = 0x80;
2725         expected[5] = 0x80;
2726
2727         result = matcher->replaceAll(&replText, NULL, status);
2728         REGEX_CHECK_STATUS;
2729         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2730         utext_close(result);
2731         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2732         result = matcher->replaceAll(&replText, &destText, status);
2733         REGEX_CHECK_STATUS;
2734         REGEX_ASSERT(result == &destText);
2735         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2736     }
2737     // TODO:  need more thorough testing of capture substitutions.
2738
2739     // Bug 4057
2740     //
         // appendReplacement() is expected to copy input starting from the
         // beginning of the string (no earlier append has been done), regardless
         // of how many find() calls preceded it or where matching started.
2741     {
2742         status = U_ZERO_ERROR;
2743 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2744 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2745 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2746         utext_openUTF8(&re, str_ssee, -1, &status);
2747         utext_openUTF8(&dataText, str_blah, -1, &status);
2748         utext_openUTF8(&replText, str_ooh, -1, &status);
2749
2750         RegexMatcher m(&re, 0, status);
2751         REGEX_CHECK_STATUS;
2752
             // NOTE(review): this UnicodeString named result shadows the
             // function-scope "UText *result" for the rest of this block.
2753         UnicodeString result;
2754         UText resultText = UTEXT_INITIALIZER;
2755         utext_openUnicodeString(&resultText, &result, &status);
2756
2757         // Multiple finds do NOT bump up the previous appendReplacement position.
2758         m.reset(&dataText);
2759         m.find();
2760         m.find();
2761         m.appendReplacement(&resultText, &replText, status);
2762         REGEX_CHECK_STATUS;
2763         const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2764         REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2765
2766         // After a reset into the interior of a string, appendReplacement still starts at beginning.
2767         status = U_ZERO_ERROR;
2768         result.truncate(0);
2769         utext_openUnicodeString(&resultText, &result, &status);
2770         m.reset(10, status);
2771         m.find();
2772         m.find();
2773         m.appendReplacement(&resultText, &replText, status);
2774         REGEX_CHECK_STATUS;
2775         const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2776         REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2777
2778         // find() at interior of string, appendReplacement still starts at beginning.
2779         status = U_ZERO_ERROR;
2780         result.truncate(0);
2781         utext_openUnicodeString(&resultText, &result, &status);
2782         m.reset();
2783         m.find(10, status);
2784         m.find();
2785         m.appendReplacement(&resultText, &replText, status);
2786         REGEX_CHECK_STATUS;
2787         const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2788         REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2789
             // appendTail() adds the remaining input after the last match (" fin").
             // NOTE(review): status is not checked after this call.
2790         m.appendTail(&resultText, status);
2791         const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2792         REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2793
2794         utext_close(&resultText);
2795     }
2796
         // Cleanup: delete the heap-allocated matchers and patterns created above,
         // then close the stack-allocated UTexts.
2797     delete matcher2;
2798     delete pat2;
2799     delete matcher;
2800     delete pat;
2801
2802     utext_close(&dataText);
2803     utext_close(&replText);
2804     utext_close(&destText);
2805     utext_close(&re);
2806 }
2807 
2808 
2809 //---------------------------------------------------------------------------
2810 //
2811 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
2812 //                        present and nominally working.
2813 //
2814 //---------------------------------------------------------------------------
API_Pattern_UTF8()2815 void RegexTest::API_Pattern_UTF8() {
2816     RegexPattern        pata;    // Test default constructor to not crash.
2817     RegexPattern        patb;
2818 
2819     REGEX_ASSERT(pata == patb);
2820     REGEX_ASSERT(pata == pata);
2821 
2822     UText         re1 = UTEXT_INITIALIZER;
2823     UText         re2 = UTEXT_INITIALIZER;
2824     UErrorCode    status = U_ZERO_ERROR;
2825     UParseError   pe;
2826 
2827     const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2828     const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2829     utext_openUTF8(&re1, str_abcalmz, -1, &status);
2830     utext_openUTF8(&re2, str_def, -1, &status);
2831 
2832     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2833     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2834     REGEX_CHECK_STATUS;
2835     REGEX_ASSERT(*pat1 == *pat1);
2836     REGEX_ASSERT(*pat1 != pata);
2837 
2838     // Assign
2839     patb = *pat1;
2840     REGEX_ASSERT(patb == *pat1);
2841 
2842     // Copy Construct
2843     RegexPattern patc(*pat1);
2844     REGEX_ASSERT(patc == *pat1);
2845     REGEX_ASSERT(patb == patc);
2846     REGEX_ASSERT(pat1 != pat2);
2847     patb = *pat2;
2848     REGEX_ASSERT(patb != patc);
2849     REGEX_ASSERT(patb == *pat2);
2850 
2851     // Compile with no flags.
2852     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2853     REGEX_ASSERT(*pat1a == *pat1);
2854 
2855     REGEX_ASSERT(pat1a->flags() == 0);
2856 
2857     // Compile with different flags should be not equal
2858     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2859     REGEX_CHECK_STATUS;
2860 
2861     REGEX_ASSERT(*pat1b != *pat1a);
2862     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2863     REGEX_ASSERT(pat1a->flags() == 0);
2864     delete pat1b;
2865 
2866     // clone
2867     RegexPattern *pat1c = pat1->clone();
2868     REGEX_ASSERT(*pat1c == *pat1);
2869     REGEX_ASSERT(*pat1c != *pat2);
2870 
2871     delete pat1c;
2872     delete pat1a;
2873     delete pat1;
2874     delete pat2;
2875 
2876     utext_close(&re1);
2877     utext_close(&re2);
2878 
2879 
2880     //
2881     //   Verify that a matcher created from a cloned pattern works.
2882     //     (Jitterbug 3423)
2883     //
2884     {
2885         UErrorCode     status     = U_ZERO_ERROR;
2886         UText          pattern    = UTEXT_INITIALIZER;
2887         const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2888         utext_openUTF8(&pattern, str_pL, -1, &status);
2889 
2890         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2891         RegexPattern  *pClone     = pSource->clone();
2892         delete         pSource;
2893         RegexMatcher  *mFromClone = pClone->matcher(status);
2894         REGEX_CHECK_STATUS;
2895 
2896         UText          input      = UTEXT_INITIALIZER;
2897         const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2898         utext_openUTF8(&input, str_HelloWorld, -1, &status);
2899         mFromClone->reset(&input);
2900         REGEX_ASSERT(mFromClone->find() == TRUE);
2901         REGEX_ASSERT(mFromClone->group(status) == "Hello");
2902         REGEX_ASSERT(mFromClone->find() == TRUE);
2903         REGEX_ASSERT(mFromClone->group(status) == "World");
2904         REGEX_ASSERT(mFromClone->find() == FALSE);
2905         delete mFromClone;
2906         delete pClone;
2907 
2908         utext_close(&input);
2909         utext_close(&pattern);
2910     }
2911 
2912     //
2913     //   matches convenience API
2914     //
2915     {
2916         UErrorCode status  = U_ZERO_ERROR;
2917         UText      pattern = UTEXT_INITIALIZER;
2918         UText      input   = UTEXT_INITIALIZER;
2919 
2920         const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2921         utext_openUTF8(&input, str_randominput, -1, &status);
2922 
2923         const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2924         utext_openUTF8(&pattern, str_dotstar, -1, &status);
2925         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2926         REGEX_CHECK_STATUS;
2927 
2928         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2929         utext_openUTF8(&pattern, str_abc, -1, &status);
2930         REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2931         REGEX_CHECK_STATUS;
2932 
2933         const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2934         utext_openUTF8(&pattern, str_nput, -1, &status);
2935         REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2936         REGEX_CHECK_STATUS;
2937 
2938         utext_openUTF8(&pattern, str_randominput, -1, &status);
2939         REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2940         REGEX_CHECK_STATUS;
2941 
2942         const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2943         utext_openUTF8(&pattern, str_u, -1, &status);
2944         REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2945         REGEX_CHECK_STATUS;
2946 
2947         utext_openUTF8(&input, str_abc, -1, &status);
2948         utext_openUTF8(&pattern, str_abc, -1, &status);
2949         status = U_INDEX_OUTOFBOUNDS_ERROR;
2950         REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2951         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2952 
2953         utext_close(&input);
2954         utext_close(&pattern);
2955     }
2956 
2957 
2958     //
2959     // Split()
2960     //
2961     status = U_ZERO_ERROR;
2962     const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
2963     utext_openUTF8(&re1, str_spaceplus, -1, &status);
2964     pat1 = RegexPattern::compile(&re1, pe, status);
2965     REGEX_CHECK_STATUS;
2966     UnicodeString  fields[10];
2967 
2968     int32_t n;
2969     n = pat1->split("Now is the time", fields, 10, status);
2970     REGEX_CHECK_STATUS;
2971     REGEX_ASSERT(n==4);
2972     REGEX_ASSERT(fields[0]=="Now");
2973     REGEX_ASSERT(fields[1]=="is");
2974     REGEX_ASSERT(fields[2]=="the");
2975     REGEX_ASSERT(fields[3]=="time");
2976     REGEX_ASSERT(fields[4]=="");
2977 
2978     n = pat1->split("Now is the time", fields, 2, status);
2979     REGEX_CHECK_STATUS;
2980     REGEX_ASSERT(n==2);
2981     REGEX_ASSERT(fields[0]=="Now");
2982     REGEX_ASSERT(fields[1]=="is the time");
2983     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2984 
2985     fields[1] = "*";
2986     status = U_ZERO_ERROR;
2987     n = pat1->split("Now is the time", fields, 1, status);
2988     REGEX_CHECK_STATUS;
2989     REGEX_ASSERT(n==1);
2990     REGEX_ASSERT(fields[0]=="Now is the time");
2991     REGEX_ASSERT(fields[1]=="*");
2992     status = U_ZERO_ERROR;
2993 
2994     n = pat1->split("    Now       is the time   ", fields, 10, status);
2995     REGEX_CHECK_STATUS;
2996     REGEX_ASSERT(n==6);
2997     REGEX_ASSERT(fields[0]=="");
2998     REGEX_ASSERT(fields[1]=="Now");
2999     REGEX_ASSERT(fields[2]=="is");
3000     REGEX_ASSERT(fields[3]=="the");
3001     REGEX_ASSERT(fields[4]=="time");
3002     REGEX_ASSERT(fields[5]=="");
3003     REGEX_ASSERT(fields[6]=="");
3004 
3005     fields[2] = "*";
3006     n = pat1->split("     ", fields, 10, status);
3007     REGEX_CHECK_STATUS;
3008     REGEX_ASSERT(n==2);
3009     REGEX_ASSERT(fields[0]=="");
3010     REGEX_ASSERT(fields[1]=="");
3011     REGEX_ASSERT(fields[2]=="*");
3012 
3013     fields[0] = "foo";
3014     n = pat1->split("", fields, 10, status);
3015     REGEX_CHECK_STATUS;
3016     REGEX_ASSERT(n==0);
3017     REGEX_ASSERT(fields[0]=="foo");
3018 
3019     delete pat1;
3020 
3021     //  split, with a pattern with (capture)
3022     regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
3023     pat1 = RegexPattern::compile(&re1,  pe, status);
3024     REGEX_CHECK_STATUS;
3025 
3026     status = U_ZERO_ERROR;
3027     fields[6] = fields[7] = "*";
3028     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
3029     REGEX_CHECK_STATUS;
3030     REGEX_ASSERT(n==7);
3031     REGEX_ASSERT(fields[0]=="");
3032     REGEX_ASSERT(fields[1]=="a");
3033     REGEX_ASSERT(fields[2]=="Now is ");
3034     REGEX_ASSERT(fields[3]=="b");
3035     REGEX_ASSERT(fields[4]=="the time");
3036     REGEX_ASSERT(fields[5]=="c");
3037     REGEX_ASSERT(fields[6]=="");
3038     REGEX_ASSERT(fields[7]=="*");
3039     REGEX_ASSERT(status==U_ZERO_ERROR);
3040 
3041     fields[6] = fields[7] = "*";
3042     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
3043     REGEX_CHECK_STATUS;
3044     REGEX_ASSERT(n==7);
3045     REGEX_ASSERT(fields[0]=="  ");
3046     REGEX_ASSERT(fields[1]=="a");
3047     REGEX_ASSERT(fields[2]=="Now is ");
3048     REGEX_ASSERT(fields[3]=="b");
3049     REGEX_ASSERT(fields[4]=="the time");
3050     REGEX_ASSERT(fields[5]=="c");
3051     REGEX_ASSERT(fields[6]=="");
3052     REGEX_ASSERT(fields[7]=="*");
3053 
3054     status = U_ZERO_ERROR;
3055     fields[6] = "foo";
3056     n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
3057     REGEX_CHECK_STATUS;
3058     REGEX_ASSERT(n==6);
3059     REGEX_ASSERT(fields[0]=="  ");
3060     REGEX_ASSERT(fields[1]=="a");
3061     REGEX_ASSERT(fields[2]=="Now is ");
3062     REGEX_ASSERT(fields[3]=="b");
3063     REGEX_ASSERT(fields[4]=="the time");
3064     REGEX_ASSERT(fields[5]==" ");
3065     REGEX_ASSERT(fields[6]=="foo");
3066 
3067     status = U_ZERO_ERROR;
3068     fields[5] = "foo";
3069     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
3070     REGEX_CHECK_STATUS;
3071     REGEX_ASSERT(n==5);
3072     REGEX_ASSERT(fields[0]=="  ");
3073     REGEX_ASSERT(fields[1]=="a");
3074     REGEX_ASSERT(fields[2]=="Now is ");
3075     REGEX_ASSERT(fields[3]=="b");
3076     REGEX_ASSERT(fields[4]=="the time<c>");
3077     REGEX_ASSERT(fields[5]=="foo");
3078 
3079     status = U_ZERO_ERROR;
3080     fields[5] = "foo";
3081     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
3082     REGEX_CHECK_STATUS;
3083     REGEX_ASSERT(n==5);
3084     REGEX_ASSERT(fields[0]=="  ");
3085     REGEX_ASSERT(fields[1]=="a");
3086     REGEX_ASSERT(fields[2]=="Now is ");
3087     REGEX_ASSERT(fields[3]=="b");
3088     REGEX_ASSERT(fields[4]=="the time");
3089     REGEX_ASSERT(fields[5]=="foo");
3090 
3091     status = U_ZERO_ERROR;
3092     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
3093     REGEX_CHECK_STATUS;
3094     REGEX_ASSERT(n==4);
3095     REGEX_ASSERT(fields[0]=="  ");
3096     REGEX_ASSERT(fields[1]=="a");
3097     REGEX_ASSERT(fields[2]=="Now is ");
3098     REGEX_ASSERT(fields[3]=="the time<c>");
3099     status = U_ZERO_ERROR;
3100     delete pat1;
3101 
3102     regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3103     pat1 = RegexPattern::compile(&re1, pe, status);
3104     REGEX_CHECK_STATUS;
3105     n = pat1->split("1-10,20", fields, 10, status);
3106     REGEX_CHECK_STATUS;
3107     REGEX_ASSERT(n==5);
3108     REGEX_ASSERT(fields[0]=="1");
3109     REGEX_ASSERT(fields[1]=="-");
3110     REGEX_ASSERT(fields[2]=="10");
3111     REGEX_ASSERT(fields[3]==",");
3112     REGEX_ASSERT(fields[4]=="20");
3113     delete pat1;
3114 
3115 
3116     //
3117     // split of a UText based string, with library allocating output UTexts.
3118     //
3119     {
3120         status = U_ZERO_ERROR;
3121         RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3122         UnicodeString stringToSplit("first:second:third");
3123         UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3124         REGEX_CHECK_STATUS;
3125 
3126         UText *splits[10] = {NULL};
3127         int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3128         REGEX_CHECK_STATUS;
3129         REGEX_ASSERT(numFields == 5);
3130         REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3131         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3132         REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3133         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3134         REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3135         REGEX_ASSERT(splits[5] == NULL);
3136 
3137         for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3138             if (splits[i]) {
3139                 utext_close(splits[i]);
3140                 splits[i] = NULL;
3141             }
3142         }
3143         utext_close(textToSplit);
3144     }
3145 
3146 
3147     //
3148     // RegexPattern::pattern() and patternText()
3149     //
3150     pat1 = new RegexPattern();
3151     REGEX_ASSERT(pat1->pattern() == "");
3152     REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3153     delete pat1;
3154     const char *helloWorldInvariant = "(Hello, world)*";
3155     regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3156     pat1 = RegexPattern::compile(&re1, pe, status);
3157     REGEX_CHECK_STATUS;
3158     REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3159     REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3160     delete pat1;
3161 
3162     utext_close(&re1);
3163 }
3164 
3165 
3166 //---------------------------------------------------------------------------
3167 //
3168 //      Extended       A more thorough check for features of regex patterns
3169 //                     The test cases are in a separate data file,
//                       source/test/testdata/regextst.txt
3171 //                     A description of the test data format is included in that file.
3172 //
3173 //---------------------------------------------------------------------------
3174 
3175 const char *
getPath(char buffer[2048],const char * filename)3176 RegexTest::getPath(char buffer[2048], const char *filename) {
3177     UErrorCode status=U_ZERO_ERROR;
3178     const char *testDataDirectory = IntlTest::getSourceTestData(status);
3179     if (U_FAILURE(status)) {
3180         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3181         return NULL;
3182     }
3183 
3184     strcpy(buffer, testDataDirectory);
3185     strcat(buffer, filename);
3186     return buffer;
3187 }
3188 
Extended()3189 void RegexTest::Extended() {
3190     char tdd[2048];
3191     const char *srcPath;
3192     UErrorCode  status  = U_ZERO_ERROR;
3193     int32_t     lineNum = 0;
3194 
3195     //
3196     //  Open and read the test data file.
3197     //
3198     srcPath=getPath(tdd, "regextst.txt");
3199     if(srcPath==NULL) {
3200         return; /* something went wrong, error already output */
3201     }
3202 
3203     int32_t    len;
3204     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3205     if (U_FAILURE(status)) {
3206         return; /* something went wrong, error already output */
3207     }
3208 
3209     //
3210     //  Put the test data into a UnicodeString
3211     //
3212     UnicodeString testString(FALSE, testData, len);
3213 
3214     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3215     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3216     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3217 
3218     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3219     UnicodeString   testPattern;   // The pattern for test from the test file.
3220     UnicodeString   testFlags;     // the flags   for a test.
3221     UnicodeString   matchString;   // The marked up string to be used as input
3222 
3223     if (U_FAILURE(status)){
3224         dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3225         delete [] testData;
3226         return;
3227     }
3228 
3229     //
3230     //  Loop over the test data file, once per line.
3231     //
3232     while (lineMat.find()) {
3233         lineNum++;
3234         if (U_FAILURE(status)) {
3235           errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3236         }
3237 
3238         status = U_ZERO_ERROR;
3239         UnicodeString testLine = lineMat.group(1, status);
3240         if (testLine.length() == 0) {
3241             continue;
3242         }
3243 
3244         //
3245         // Parse the test line.  Skip blank and comment only lines.
3246         // Separate out the three main fields - pattern, flags, target.
3247         //
3248 
3249         commentMat.reset(testLine);
3250         if (commentMat.lookingAt(status)) {
3251             // This line is a comment, or blank.
3252             continue;
3253         }
3254 
3255         //
3256         //  Pull out the pattern field, remove it from the test file line.
3257         //
3258         quotedStuffMat.reset(testLine);
3259         if (quotedStuffMat.lookingAt(status)) {
3260             testPattern = quotedStuffMat.group(2, status);
3261             testLine.remove(0, quotedStuffMat.end(0, status));
3262         } else {
3263             errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3264             continue;
3265         }
3266 
3267 
3268         //
3269         //  Pull out the flags from the test file line.
3270         //
3271         flagsMat.reset(testLine);
3272         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
3273         testFlags = flagsMat.group(1, status);
3274         if (flagsMat.group(2, status).length() > 0) {
3275             errln("Bad Match flag at line %d. Scanning %c\n",
3276                 lineNum, flagsMat.group(2, status).charAt(0));
3277             continue;
3278         }
3279         testLine.remove(0, flagsMat.end(0, status));
3280 
3281         //
3282         //  Pull out the match string, as a whole.
3283         //    We'll process the <tags> later.
3284         //
3285         quotedStuffMat.reset(testLine);
3286         if (quotedStuffMat.lookingAt(status)) {
3287             matchString = quotedStuffMat.group(2, status);
3288             testLine.remove(0, quotedStuffMat.end(0, status));
3289         } else {
3290             errln("Bad match string at test file line %d", lineNum);
3291             continue;
3292         }
3293 
3294         //
3295         //  The only thing left from the input line should be an optional trailing comment.
3296         //
3297         commentMat.reset(testLine);
3298         if (commentMat.lookingAt(status) == FALSE) {
3299             errln("Line %d: unexpected characters at end of test line.", lineNum);
3300             continue;
3301         }
3302 
3303         //
3304         //  Run the test
3305         //
3306         regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3307     }
3308 
3309     delete [] testData;
3310 
3311 }
3312 
3313 
3314 
3315 //---------------------------------------------------------------------------
3316 //
//    regex_find(pattern, flags, inputString, srcPath, lineNumber)
//
//         Function to run a single test from the Extended (data driven) tests.
//         See file test/testdata/regextst.txt for a description of the
//         pattern and inputString fields, and the allowed flags.
//         srcPath is the path of the test data file, used when reporting problems.
//         lineNumber is the source line in regextst.txt of the test.
3323 //
3324 //---------------------------------------------------------------------------
3325 
3326 
3327 //  Set a value into a UVector at position specified by a decimal number in
3328 //   a UnicodeString.   This is a utility function needed by the actual test function,
3329 //   which follows.
set(UVector & vec,int32_t val,UnicodeString index)3330 static void set(UVector &vec, int32_t val, UnicodeString index) {
3331     UErrorCode  status=U_ZERO_ERROR;
3332     int32_t  idx = 0;
3333     for (int32_t i=0; i<index.length(); i++) {
3334         int32_t d=u_charDigitValue(index.charAt(i));
3335         if (d<0) {return;}
3336         idx = idx*10 + d;
3337     }
3338     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3339     vec.setElementAt(val, idx);
3340 }
3341 
setInt(UVector & vec,int32_t val,int32_t idx)3342 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3343     UErrorCode  status=U_ZERO_ERROR;
3344     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3345     vec.setElementAt(val, idx);
3346 }
3347 
utextOffsetToNative(UText * utext,int32_t unistrOffset,int32_t & nativeIndex)3348 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3349 {
3350     UBool couldFind = TRUE;
3351     UTEXT_SETNATIVEINDEX(utext, 0);
3352     int32_t i = 0;
3353     while (i < unistrOffset) {
3354         UChar32 c = UTEXT_NEXT32(utext);
3355         if (c != U_SENTINEL) {
3356             i += U16_LENGTH(c);
3357         } else {
3358             couldFind = FALSE;
3359             break;
3360         }
3361     }
3362     nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3363     return couldFind;
3364 }
3365 
3366 
regex_find(const UnicodeString & pattern,const UnicodeString & flags,const UnicodeString & inputString,const char * srcPath,int32_t line)3367 void RegexTest::regex_find(const UnicodeString &pattern,
3368                            const UnicodeString &flags,
3369                            const UnicodeString &inputString,
3370                            const char *srcPath,
3371                            int32_t line) {
3372     UnicodeString       unEscapedInput;
3373     UnicodeString       deTaggedInput;
3374 
3375     int32_t             patternUTF8Length,      inputUTF8Length;
3376     char                *patternChars  = NULL, *inputChars = NULL;
3377     UText               patternText    = UTEXT_INITIALIZER;
3378     UText               inputText      = UTEXT_INITIALIZER;
3379     UConverter          *UTF8Converter = NULL;
3380 
3381     UErrorCode          status         = U_ZERO_ERROR;
3382     UParseError         pe;
3383     RegexPattern        *parsePat      = NULL;
3384     RegexMatcher        *parseMatcher  = NULL;
3385     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
3386     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
3387     UVector             groupStarts(status);
3388     UVector             groupEnds(status);
3389     UVector             groupStartsUTF8(status);
3390     UVector             groupEndsUTF8(status);
3391     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
3392     UBool               failed         = FALSE;
3393     int32_t             numFinds;
3394     int32_t             i;
3395     UBool               useMatchesFunc   = FALSE;
3396     UBool               useLookingAtFunc = FALSE;
3397     int32_t             regionStart      = -1;
3398     int32_t             regionEnd        = -1;
3399     int32_t             regionStartUTF8  = -1;
3400     int32_t             regionEndUTF8    = -1;
3401 
3402 
3403     //
3404     //  Compile the caller's pattern
3405     //
3406     uint32_t bflags = 0;
3407     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
3408         bflags |= UREGEX_CASE_INSENSITIVE;
3409     }
3410     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
3411         bflags |= UREGEX_COMMENTS;
3412     }
3413     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
3414         bflags |= UREGEX_DOTALL;
3415     }
3416     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
3417         bflags |= UREGEX_MULTILINE;
3418     }
3419 
3420     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3421         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3422     }
3423     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3424         bflags |= UREGEX_UNIX_LINES;
3425     }
3426     if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3427         bflags |= UREGEX_LITERAL;
3428     }
3429 
3430 
3431     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3432     if (status != U_ZERO_ERROR) {
3433         #if UCONFIG_NO_BREAK_ITERATION==1
3434         // 'v' test flag means that the test pattern should not compile if ICU was configured
3435         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3436         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3437             goto cleanupAndReturn;
3438         }
3439         #endif
3440         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3441             // Expected pattern compilation error.
3442             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3443                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3444             }
3445             goto cleanupAndReturn;
3446         } else {
3447             // Unexpected pattern compilation error.
3448             dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3449             goto cleanupAndReturn;
3450         }
3451     }
3452 
3453     UTF8Converter = ucnv_open("UTF8", &status);
3454     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3455 
3456     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3457     status = U_ZERO_ERROR; // buffer overflow
3458     patternChars = new char[patternUTF8Length+1];
3459     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3460     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3461 
3462     if (status == U_ZERO_ERROR) {
3463         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3464 
3465         if (status != U_ZERO_ERROR) {
3466 #if UCONFIG_NO_BREAK_ITERATION==1
3467             // 'v' test flag means that the test pattern should not compile if ICU was configured
3468             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3469             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3470                 goto cleanupAndReturn;
3471             }
3472 #endif
3473             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3474                 // Expected pattern compilation error.
3475                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3476                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3477                 }
3478                 goto cleanupAndReturn;
3479             } else {
3480                 // Unexpected pattern compilation error.
3481                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3482                 goto cleanupAndReturn;
3483             }
3484         }
3485     }
3486 
3487     if (UTF8Pattern == NULL) {
3488         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3489         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3490         status = U_ZERO_ERROR;
3491     }
3492 
3493     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3494         callerPattern->dumpPattern();
3495     }
3496 
3497     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3498         errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3499         goto cleanupAndReturn;
3500     }
3501 
3502 
3503     //
3504     // Number of times find() should be called on the test string, default to 1
3505     //
3506     numFinds = 1;
3507     for (i=2; i<=9; i++) {
3508         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3509             if (numFinds != 1) {
3510                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3511                 goto cleanupAndReturn;
3512             }
3513             numFinds = i;
3514         }
3515     }
3516 
3517     // 'M' flag.  Use matches() instead of find()
3518     if (flags.indexOf((UChar)0x4d) >= 0) {
3519         useMatchesFunc = TRUE;
3520     }
3521     if (flags.indexOf((UChar)0x4c) >= 0) {
3522         useLookingAtFunc = TRUE;
3523     }
3524 
3525     //
3526     //  Find the tags in the input data, remove them, and record the group boundary
3527     //    positions.
3528     //
3529     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3530     REGEX_CHECK_STATUS_L(line);
3531 
3532     unEscapedInput = inputString.unescape();
3533     parseMatcher = parsePat->matcher(unEscapedInput, status);
3534     REGEX_CHECK_STATUS_L(line);
3535     while(parseMatcher->find()) {
3536         parseMatcher->appendReplacement(deTaggedInput, "", status);
3537         REGEX_CHECK_STATUS;
3538         UnicodeString groupNum = parseMatcher->group(2, status);
3539         if (groupNum == "r") {
3540             // <r> or </r>, a region specification within the string
3541             if (parseMatcher->group(1, status) == "/") {
3542                 regionEnd = deTaggedInput.length();
3543             } else {
3544                 regionStart = deTaggedInput.length();
3545             }
3546         } else {
3547             // <digits> or </digits>, a group match boundary tag.
3548             if (parseMatcher->group(1, status) == "/") {
3549                 set(groupEnds, deTaggedInput.length(), groupNum);
3550             } else {
3551                 set(groupStarts, deTaggedInput.length(), groupNum);
3552             }
3553         }
3554     }
3555     parseMatcher->appendTail(deTaggedInput);
3556     REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3557     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3558       errln("mismatched <r> tags");
3559       failed = TRUE;
3560       goto cleanupAndReturn;
3561     }
3562 
3563     //
3564     //  Configure the matcher according to the flags specified with this test.
3565     //
3566     matcher = callerPattern->matcher(deTaggedInput, status);
3567     REGEX_CHECK_STATUS_L(line);
3568     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3569         matcher->setTrace(TRUE);
3570     }
3571 
3572     if (UTF8Pattern != NULL) {
3573         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3574         status = U_ZERO_ERROR; // buffer overflow
3575         inputChars = new char[inputUTF8Length+1];
3576         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3577         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3578 
3579         if (status == U_ZERO_ERROR) {
3580             UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3581             REGEX_CHECK_STATUS_L(line);
3582         }
3583 
3584         if (UTF8Matcher == NULL) {
3585             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3586           logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3587             status = U_ZERO_ERROR;
3588         }
3589     }
3590 
3591     //
3592     //  Generate native indices for UTF8 versions of region and capture group info
3593     //
3594     if (UTF8Matcher != NULL) {
3595         if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3596         if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3597 
3598         //  Fill out the native index UVector info.
3599         //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3600         for (i=0; i<groupStarts.size(); i++) {
3601             int32_t  start = groupStarts.elementAti(i);
3602             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3603             if (start >= 0) {
3604                 int32_t  startUTF8;
3605                 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3606                     errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
3607                     failed = TRUE;
3608                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3609                 }
3610                 setInt(groupStartsUTF8, startUTF8, i);
3611             }
3612 
3613             int32_t  end = groupEnds.elementAti(i);
3614             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3615             if (end >= 0) {
3616                 int32_t  endUTF8;
3617                 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3618                     errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
3619                     failed = TRUE;
3620                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3621                 }
3622                 setInt(groupEndsUTF8, endUTF8, i);
3623             }
3624         }
3625     }
3626 
3627     if (regionStart>=0) {
3628        matcher->region(regionStart, regionEnd, status);
3629        REGEX_CHECK_STATUS_L(line);
3630        if (UTF8Matcher != NULL) {
3631            UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3632            REGEX_CHECK_STATUS_L(line);
3633        }
3634     }
3635     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3636         matcher->useAnchoringBounds(FALSE);
3637         if (UTF8Matcher != NULL) {
3638             UTF8Matcher->useAnchoringBounds(FALSE);
3639         }
3640     }
3641     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3642         matcher->useTransparentBounds(TRUE);
3643         if (UTF8Matcher != NULL) {
3644             UTF8Matcher->useTransparentBounds(TRUE);
3645         }
3646     }
3647 
3648 
3649 
3650     //
3651     // Do a find on the de-tagged input using the caller's pattern
3652     //     TODO: error on count>1 and not find().
3653     //           error on both matches() and lookingAt().
3654     //
3655     for (i=0; i<numFinds; i++) {
3656         if (useMatchesFunc) {
3657             isMatch = matcher->matches(status);
3658             if (UTF8Matcher != NULL) {
3659                isUTF8Match = UTF8Matcher->matches(status);
3660             }
3661         } else  if (useLookingAtFunc) {
3662             isMatch = matcher->lookingAt(status);
3663             if (UTF8Matcher != NULL) {
3664                 isUTF8Match = UTF8Matcher->lookingAt(status);
3665             }
3666         } else {
3667             isMatch = matcher->find();
3668             if (UTF8Matcher != NULL) {
3669                 isUTF8Match = UTF8Matcher->find();
3670             }
3671         }
3672     }
3673     matcher->setTrace(FALSE);
3674     if (U_FAILURE(status)) {
3675         errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3676     }
3677 
3678     //
3679     // Match up the groups from the find() with the groups from the tags
3680     //
3681 
3682     // number of tags should match number of groups from find operation.
3683     // matcher->groupCount does not include group 0, the entire match, hence the +1.
3684     //   G option in test means that capture group data is not available in the
3685     //     expected results, so the check needs to be suppressed.
3686     if (isMatch == FALSE && groupStarts.size() != 0) {
3687         dataerrln("Error at line %d:  Match expected, but none found.", line);
3688         failed = TRUE;
3689         goto cleanupAndReturn;
3690     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3691         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3692         failed = TRUE;
3693         goto cleanupAndReturn;
3694     }
3695 
3696     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3697         // Only check for match / no match.  Don't check capture groups.
3698         if (isMatch && groupStarts.size() == 0) {
3699             errln("Error at line %d:  No match expected, but one found.", line);
3700             failed = TRUE;
3701         } else if (UTF8Matcher != NULL && isUTF8Match && groupStarts.size() == 0) {
3702             errln("Error at line %d:  No match expected, but one found. (UTF8)", line);
3703             failed = TRUE;
3704         }
3705         goto cleanupAndReturn;
3706     }
3707 
3708     REGEX_CHECK_STATUS_L(line);
3709     for (i=0; i<=matcher->groupCount(); i++) {
3710         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3711         int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3712         if (matcher->start(i, status) != expectedStart) {
3713             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3714                 line, i, expectedStart, matcher->start(i, status));
3715             failed = TRUE;
3716             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3717         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3718             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3719                   line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3720             failed = TRUE;
3721             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3722         }
3723 
3724         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3725         int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3726         if (matcher->end(i, status) != expectedEnd) {
3727             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3728                 line, i, expectedEnd, matcher->end(i, status));
3729             failed = TRUE;
3730             // Error on end position;  keep going; real error is probably yet to come as group
3731             //   end positions work from end of the input data towards the front.
3732         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3733             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3734                   line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3735             failed = TRUE;
3736             // Error on end position;  keep going; real error is probably yet to come as group
3737             //   end positions work from end of the input data towards the front.
3738         }
3739     }
3740     if ( matcher->groupCount()+1 < groupStarts.size()) {
3741         errln("Error at line %d: Expected %d capture groups, found %d.",
3742             line, groupStarts.size()-1, matcher->groupCount());
3743         failed = TRUE;
3744         }
3745     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3746         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3747               line, groupStarts.size()-1, UTF8Matcher->groupCount());
3748         failed = TRUE;
3749     }
3750 
3751     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3752         matcher->requireEnd() == TRUE) {
3753         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3754         failed = TRUE;
3755     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3756         UTF8Matcher->requireEnd() == TRUE) {
3757         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3758         failed = TRUE;
3759     }
3760 
3761     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3762         matcher->requireEnd() == FALSE) {
3763         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3764         failed = TRUE;
3765     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3766         UTF8Matcher->requireEnd() == FALSE) {
3767         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3768         failed = TRUE;
3769     }
3770 
3771     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3772         matcher->hitEnd() == TRUE) {
3773         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3774         failed = TRUE;
3775     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3776                UTF8Matcher->hitEnd() == TRUE) {
3777         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3778         failed = TRUE;
3779     }
3780 
3781     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3782         matcher->hitEnd() == FALSE) {
3783         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3784         failed = TRUE;
3785     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3786                UTF8Matcher->hitEnd() == FALSE) {
3787         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3788         failed = TRUE;
3789     }
3790 
3791 
3792 cleanupAndReturn:
3793     if (failed) {
3794         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3795             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3796         // callerPattern->dump();
3797     }
3798     delete parseMatcher;
3799     delete parsePat;
3800     delete UTF8Matcher;
3801     delete UTF8Pattern;
3802     delete matcher;
3803     delete callerPattern;
3804 
3805     utext_close(&inputText);
3806     delete[] inputChars;
3807     utext_close(&patternText);
3808     delete[] patternChars;
3809     ucnv_close(UTF8Converter);
3810 }
3811 
3812 
3813 
3814 
3815 //---------------------------------------------------------------------------
3816 //
3817 //      Errors     Check for error handling in patterns.
3818 //
3819 //---------------------------------------------------------------------------
Errors()3820 void RegexTest::Errors() {
3821     // \escape sequences that aren't implemented yet.
3822     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3823 
3824     // Missing close parentheses
3825     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3826     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3827     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3828 
3829     // Extra close paren
3830     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3831     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3832     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3833 
3834     // Look-ahead, Look-behind
3835     //  TODO:  add tests for unbounded length look-behinds.
3836     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
3837 
3838     // Attempt to use non-default flags
3839     {
3840         UParseError   pe;
3841         UErrorCode    status = U_ZERO_ERROR;
3842         int32_t       flags  = UREGEX_CANON_EQ |
3843                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
3844                                UREGEX_MULTILINE;
3845         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3846         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3847         delete pat1;
3848     }
3849 
3850 
3851     // Quantifiers are allowed only after something that can be quantified.
3852     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3853     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3854     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3855 
3856     // Mal-formed {min,max} quantifiers
3857     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3858     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3859     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3860     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3861     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3862     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3863     REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
3864     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
3865     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3866 
3867     // Ticket 5389
3868     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3869 
3870     // Invalid Back Reference \0
3871     //    For ICU 3.8 and earlier
3872     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
3873     //
3874     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3875 
3876 }
3877 
3878 
3879 //-------------------------------------------------------------------------------
3880 //
//  Read a text data file, convert it to UChars, and return the data
//    in one big UChar * buffer, which the caller must release with delete[].
3883 //
3884 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int32_t & ulen,const char * defEncoding,UErrorCode & status)3885 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3886                                      const char *defEncoding, UErrorCode &status) {
3887     UChar       *retPtr  = NULL;
3888     char        *fileBuf = NULL;
3889     UConverter* conv     = NULL;
3890     FILE        *f       = NULL;
3891 
3892     ulen = 0;
3893     if (U_FAILURE(status)) {
3894         return retPtr;
3895     }
3896 
3897     //
3898     //  Open the file.
3899     //
3900     f = fopen(fileName, "rb");
3901     if (f == 0) {
3902         dataerrln("Error opening test data file %s\n", fileName);
3903         status = U_FILE_ACCESS_ERROR;
3904         return NULL;
3905     }
3906     //
3907     //  Read it in
3908     //
3909     int32_t            fileSize;
3910     int32_t            amt_read;
3911 
3912     fseek( f, 0, SEEK_END);
3913     fileSize = ftell(f);
3914     fileBuf = new char[fileSize];
3915     fseek(f, 0, SEEK_SET);
3916     amt_read = fread(fileBuf, 1, fileSize, f);
3917     if (amt_read != fileSize || fileSize <= 0) {
3918         errln("Error reading test data file.");
3919         goto cleanUpAndReturn;
3920     }
3921 
3922     //
3923     // Look for a Unicode Signature (BOM) on the data just read
3924     //
3925     int32_t        signatureLength;
3926     const char *   fileBufC;
3927     const char*    encoding;
3928 
3929     fileBufC = fileBuf;
3930     encoding = ucnv_detectUnicodeSignature(
3931         fileBuf, fileSize, &signatureLength, &status);
3932     if(encoding!=NULL ){
3933         fileBufC  += signatureLength;
3934         fileSize  -= signatureLength;
3935     } else {
3936         encoding = defEncoding;
3937         if (strcmp(encoding, "utf-8") == 0) {
3938             errln("file %s is missing its BOM", fileName);
3939         }
3940     }
3941 
3942     //
3943     // Open a converter to take the rule file to UTF-16
3944     //
3945     conv = ucnv_open(encoding, &status);
3946     if (U_FAILURE(status)) {
3947         goto cleanUpAndReturn;
3948     }
3949 
3950     //
3951     // Convert the rules to UChar.
3952     //  Preflight first to determine required buffer size.
3953     //
3954     ulen = ucnv_toUChars(conv,
3955         NULL,           //  dest,
3956         0,              //  destCapacity,
3957         fileBufC,
3958         fileSize,
3959         &status);
3960     if (status == U_BUFFER_OVERFLOW_ERROR) {
3961         // Buffer Overflow is expected from the preflight operation.
3962         status = U_ZERO_ERROR;
3963 
3964         retPtr = new UChar[ulen+1];
3965         ucnv_toUChars(conv,
3966             retPtr,       //  dest,
3967             ulen+1,
3968             fileBufC,
3969             fileSize,
3970             &status);
3971     }
3972 
3973 cleanUpAndReturn:
3974     fclose(f);
3975     delete[] fileBuf;
3976     ucnv_close(conv);
3977     if (U_FAILURE(status)) {
3978         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3979         delete []retPtr;
3980         retPtr = 0;
3981         ulen   = 0;
3982     };
3983     return retPtr;
3984 }
3985 
3986 
3987 //-------------------------------------------------------------------------------
3988 //
3989 //   PerlTests  - Run Perl's regular expression tests
3990 //                The input file for this test is re_tests, the standard regular
3991 //                expression test data distributed with the Perl source code.
3992 //
3993 //                Here is Perl's description of the test data file:
3994 //
3995 //        # The tests are in a separate file 't/op/re_tests'.
3996 //        # Each line in that file is a separate test.
3997 //        # There are five columns, separated by tabs.
3998 //        #
3999 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
4000 //        # Modifiers can be put after the closing C<'>.
4001 //        #
4002 //        # Column 2 contains the string to be matched.
4003 //        #
4004 //        # Column 3 contains the expected result:
4005 //        #     y   expect a match
4006 //        #     n   expect no match
4007 //        #     c   expect an error
4008 //        # B   test exposes a known bug in Perl, should be skipped
4009 //        # b   test exposes a known bug in Perl, should be skipped if noamp
4010 //        #
4011 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
4012 //        #
4013 //        # Column 4 contains a string, usually C<$&>.
4014 //        #
4015 //        # Column 5 contains the expected result of double-quote
4016 //        # interpolating that string after the match, or start of error message.
4017 //        #
4018 //        # Column 6, if present, contains a reason why the test is skipped.
4019 //        # This is printed with "skipped", for harness to pick up.
4020 //        #
4021 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
4022 //        #
4023 //        # If you want to add a regular expression test that can't be expressed
4024 //        # in this format, don't add it here: put it in op/pat.t instead.
4025 //
//        For ICU, if field 3 contains an 'i', the test will be skipped.
//        Such a test exposes some known incompatibility between ICU and Perl regexps.
//        (The i is in addition to whatever was there before.)
4029 //
4030 //-------------------------------------------------------------------------------
PerlTests()4031 void RegexTest::PerlTests() {
4032     char tdd[2048];
4033     const char *srcPath;
4034     UErrorCode  status = U_ZERO_ERROR;
4035     UParseError pe;
4036 
4037     //
4038     //  Open and read the test data file.
4039     //
4040     srcPath=getPath(tdd, "re_tests.txt");
4041     if(srcPath==NULL) {
4042         return; /* something went wrong, error already output */
4043     }
4044 
4045     int32_t    len;
4046     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4047     if (U_FAILURE(status)) {
4048         return; /* something went wrong, error already output */
4049     }
4050 
4051     //
4052     //  Put the test data into a UnicodeString
4053     //
4054     UnicodeString testDataString(FALSE, testData, len);
4055 
4056     //
4057     //  Regex to break the input file into lines, and strip the new lines.
4058     //     One line per match, capture group one is the desired data.
4059     //
4060     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4061     if (U_FAILURE(status)) {
4062         dataerrln("RegexPattern::compile() error");
4063         return;
4064     }
4065     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4066 
4067     //
4068     //  Regex to split a test file line into fields.
4069     //    There are six fields, separated by tabs.
4070     //
4071     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4072 
4073     //
4074     //  Regex to identify test patterns with flag settings, and to separate them.
4075     //    Test patterns with flags look like 'pattern'i
4076     //    Test patterns without flags are not quoted:   pattern
4077     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4078     //
4079     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4080     RegexMatcher* flagMat = flagPat->matcher(status);
4081 
4082     //
4083     // The Perl tests reference several perl-isms, which are evaluated/substituted
4084     //   in the test data.  Not being perl, this must be done explicitly.  Here
4085     //   are string constants and REs for these constructs.
4086     //
4087     UnicodeString nulnulSrc("${nulnul}");
4088     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4089     nulnul = nulnul.unescape();
4090 
4091     UnicodeString ffffSrc("${ffff}");
4092     UnicodeString ffff("\\uffff", -1, US_INV);
4093     ffff = ffff.unescape();
4094 
4095     //  regexp for $-[0], $+[2], etc.
4096     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4097     RegexMatcher *groupsMat = groupsPat->matcher(status);
4098 
4099     //  regexp for $0, $1, $2, etc.
4100     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4101     RegexMatcher *cgMat = cgPat->matcher(status);
4102 
4103 
4104     //
4105     // Main Loop for the Perl Tests, runs once per line from the
4106     //   test data file.
4107     //
4108     int32_t  lineNum = 0;
4109     int32_t  skippedUnimplementedCount = 0;
4110     while (lineMat->find()) {
4111         lineNum++;
4112 
4113         //
4114         //  Get a line, break it into its fields, do the Perl
4115         //    variable substitutions.
4116         //
4117         UnicodeString line = lineMat->group(1, status);
4118         UnicodeString fields[7];
4119         fieldPat->split(line, fields, 7, status);
4120 
4121         flagMat->reset(fields[0]);
4122         flagMat->matches(status);
4123         UnicodeString pattern  = flagMat->group(2, status);
4124         pattern.findAndReplace("${bang}", "!");
4125         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4126         pattern.findAndReplace(ffffSrc, ffff);
4127 
4128         //
4129         //  Identify patterns that include match flag settings,
4130         //    split off the flags, remove the extra quotes.
4131         //
4132         UnicodeString flagStr = flagMat->group(3, status);
4133         if (U_FAILURE(status)) {
4134             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4135             return;
4136         }
4137         int32_t flags = 0;
4138         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4139         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4140         const UChar UChar_m = 0x6d;
4141         const UChar UChar_x = 0x78;
4142         const UChar UChar_y = 0x79;
4143         if (flagStr.indexOf(UChar_i) != -1) {
4144             flags |= UREGEX_CASE_INSENSITIVE;
4145         }
4146         if (flagStr.indexOf(UChar_m) != -1) {
4147             flags |= UREGEX_MULTILINE;
4148         }
4149         if (flagStr.indexOf(UChar_x) != -1) {
4150             flags |= UREGEX_COMMENTS;
4151         }
4152 
4153         //
4154         // Compile the test pattern.
4155         //
4156         status = U_ZERO_ERROR;
4157         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4158         if (status == U_REGEX_UNIMPLEMENTED) {
4159             //
4160             // Test of a feature that is planned for ICU, but not yet implemented.
4161             //   skip the test.
4162             skippedUnimplementedCount++;
4163             delete testPat;
4164             status = U_ZERO_ERROR;
4165             continue;
4166         }
4167 
4168         if (U_FAILURE(status)) {
4169             // Some tests are supposed to generate errors.
4170             //   Only report an error for tests that are supposed to succeed.
4171             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4172                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4173             {
4174                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4175             }
4176             status = U_ZERO_ERROR;
4177             delete testPat;
4178             continue;
4179         }
4180 
4181         if (fields[2].indexOf(UChar_i) >= 0) {
4182             // ICU should skip this test.
4183             delete testPat;
4184             continue;
4185         }
4186 
4187         if (fields[2].indexOf(UChar_c) >= 0) {
4188             // This pattern should have caused a compilation error, but didn't/
4189             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4190             delete testPat;
4191             continue;
4192         }
4193 
4194         //
4195         // replace the Perl variables that appear in some of the
4196         //   match data strings.
4197         //
4198         UnicodeString matchString = fields[1];
4199         matchString.findAndReplace(nulnulSrc, nulnul);
4200         matchString.findAndReplace(ffffSrc,   ffff);
4201 
4202         // Replace any \n in the match string with an actual new-line char.
4203         //  Don't do full unescape, as this unescapes more than Perl does, which
4204         //  causes other spurious failures in the tests.
4205         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4206 
4207 
4208 
4209         //
4210         // Run the test, check for expected match/don't match result.
4211         //
4212         RegexMatcher *testMat = testPat->matcher(matchString, status);
4213         UBool found = testMat->find();
4214         UBool expected = FALSE;
4215         if (fields[2].indexOf(UChar_y) >=0) {
4216             expected = TRUE;
4217         }
4218         if (expected != found) {
4219             errln("line %d: Expected %smatch, got %smatch",
4220                 lineNum, expected?"":"no ", found?"":"no " );
4221             continue;
4222         }
4223 
4224         // Don't try to check expected results if there is no match.
4225         //   (Some have stuff in the expected fields)
4226         if (!found) {
4227             delete testMat;
4228             delete testPat;
4229             continue;
4230         }
4231 
4232         //
4233         // Interpret the Perl expression from the fourth field of the data file,
4234         // building up an ICU string from the results of the ICU match.
4235         //   The Perl expression will contain references to the results of
4236         //     a regex match, including the matched string, capture group strings,
4237         //     group starting and ending indicies, etc.
4238         //
4239         UnicodeString resultString;
4240         UnicodeString perlExpr = fields[3];
4241 #if SUPPORT_MUTATING_INPUT_STRING
4242         groupsMat->reset(perlExpr);
4243         cgMat->reset(perlExpr);
4244 #endif
4245 
4246         while (perlExpr.length() > 0) {
4247 #if !SUPPORT_MUTATING_INPUT_STRING
4248             //  Perferred usage.  Reset after any modification to input string.
4249             groupsMat->reset(perlExpr);
4250             cgMat->reset(perlExpr);
4251 #endif
4252 
4253             if (perlExpr.startsWith("$&")) {
4254                 resultString.append(testMat->group(status));
4255                 perlExpr.remove(0, 2);
4256             }
4257 
4258             else if (groupsMat->lookingAt(status)) {
4259                 // $-[0]   $+[2]  etc.
4260                 UnicodeString digitString = groupsMat->group(2, status);
4261                 int32_t t = 0;
4262                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4263                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4264                 int32_t matchPosition;
4265                 if (plusOrMinus.compare("+") == 0) {
4266                     matchPosition = testMat->end(groupNum, status);
4267                 } else {
4268                     matchPosition = testMat->start(groupNum, status);
4269                 }
4270                 if (matchPosition != -1) {
4271                     ICU_Utility::appendNumber(resultString, matchPosition);
4272                 }
4273                 perlExpr.remove(0, groupsMat->end(status));
4274             }
4275 
4276             else if (cgMat->lookingAt(status)) {
4277                 // $1, $2, $3, etc.
4278                 UnicodeString digitString = cgMat->group(1, status);
4279                 int32_t t = 0;
4280                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4281                 if (U_SUCCESS(status)) {
4282                     resultString.append(testMat->group(groupNum, status));
4283                     status = U_ZERO_ERROR;
4284                 }
4285                 perlExpr.remove(0, cgMat->end(status));
4286             }
4287 
4288             else if (perlExpr.startsWith("@-")) {
4289                 int32_t i;
4290                 for (i=0; i<=testMat->groupCount(); i++) {
4291                     if (i>0) {
4292                         resultString.append(" ");
4293                     }
4294                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4295                 }
4296                 perlExpr.remove(0, 2);
4297             }
4298 
4299             else if (perlExpr.startsWith("@+")) {
4300                 int32_t i;
4301                 for (i=0; i<=testMat->groupCount(); i++) {
4302                     if (i>0) {
4303                         resultString.append(" ");
4304                     }
4305                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4306                 }
4307                 perlExpr.remove(0, 2);
4308             }
4309 
4310             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4311                                                      //           or as an escaped sequence (e.g. \n)
4312                 if (perlExpr.length() > 1) {
4313                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4314                 }
4315                 UChar c = perlExpr.charAt(0);
4316                 switch (c) {
4317                 case 'n':   c = '\n'; break;
4318                 // add any other escape sequences that show up in the test expected results.
4319                 }
4320                 resultString.append(c);
4321                 perlExpr.remove(0, 1);
4322             }
4323 
4324             else  {
4325                 // Any characters from the perl expression that we don't explicitly
4326                 //  recognize before here are assumed to be literals and copied
4327                 //  as-is to the expected results.
4328                 resultString.append(perlExpr.charAt(0));
4329                 perlExpr.remove(0, 1);
4330             }
4331 
4332             if (U_FAILURE(status)) {
4333                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4334                 break;
4335             }
4336         }
4337 
4338         //
4339         // Expected Results Compare
4340         //
4341         UnicodeString expectedS(fields[4]);
4342         expectedS.findAndReplace(nulnulSrc, nulnul);
4343         expectedS.findAndReplace(ffffSrc,   ffff);
4344         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4345 
4346 
4347         if (expectedS.compare(resultString) != 0) {
4348             err("Line %d: Incorrect perl expression results.", lineNum);
4349             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4350         }
4351 
4352         delete testMat;
4353         delete testPat;
4354     }
4355 
4356     //
4357     // All done.  Clean up allocated stuff.
4358     //
4359     delete cgMat;
4360     delete cgPat;
4361 
4362     delete groupsMat;
4363     delete groupsPat;
4364 
4365     delete flagMat;
4366     delete flagPat;
4367 
4368     delete lineMat;
4369     delete linePat;
4370 
4371     delete fieldPat;
4372     delete [] testData;
4373 
4374 
4375     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4376 
4377 }
4378 
4379 
4380 //-------------------------------------------------------------------------------
4381 //
4382 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
4383 //                  (instead of using UnicodeStrings) to test the alternate engine.
4384 //                  The input file for this test is re_tests, the standard regular
4385 //                  expression test data distributed with the Perl source code.
4386 //                  See PerlTests() for more information.
4387 //
4388 //-------------------------------------------------------------------------------
PerlTestsUTF8()4389 void RegexTest::PerlTestsUTF8() {
4390     char tdd[2048];
4391     const char *srcPath;
4392     UErrorCode  status = U_ZERO_ERROR;
4393     UParseError pe;
4394     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4395     UText       patternText = UTEXT_INITIALIZER;
4396     char       *patternChars = NULL;
4397     int32_t     patternLength;
4398     int32_t     patternCapacity = 0;
4399     UText       inputText = UTEXT_INITIALIZER;
4400     char       *inputChars = NULL;
4401     int32_t     inputLength;
4402     int32_t     inputCapacity = 0;
4403 
4404     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4405 
4406     //
4407     //  Open and read the test data file.
4408     //
4409     srcPath=getPath(tdd, "re_tests.txt");
4410     if(srcPath==NULL) {
4411         return; /* something went wrong, error already output */
4412     }
4413 
4414     int32_t    len;
4415     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4416     if (U_FAILURE(status)) {
4417         return; /* something went wrong, error already output */
4418     }
4419 
4420     //
4421     //  Put the test data into a UnicodeString
4422     //
4423     UnicodeString testDataString(FALSE, testData, len);
4424 
4425     //
4426     //  Regex to break the input file into lines, and strip the new lines.
4427     //     One line per match, capture group one is the desired data.
4428     //
4429     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4430     if (U_FAILURE(status)) {
4431         dataerrln("RegexPattern::compile() error");
4432         return;
4433     }
4434     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4435 
4436     //
4437     //  Regex to split a test file line into fields.
4438     //    There are six fields, separated by tabs.
4439     //
4440     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4441 
4442     //
4443     //  Regex to identify test patterns with flag settings, and to separate them.
4444     //    Test patterns with flags look like 'pattern'i
4445     //    Test patterns without flags are not quoted:   pattern
4446     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4447     //
4448     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4449     RegexMatcher* flagMat = flagPat->matcher(status);
4450 
4451     //
4452     // The Perl tests reference several perl-isms, which are evaluated/substituted
4453     //   in the test data.  Not being perl, this must be done explicitly.  Here
4454     //   are string constants and REs for these constructs.
4455     //
4456     UnicodeString nulnulSrc("${nulnul}");
4457     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4458     nulnul = nulnul.unescape();
4459 
4460     UnicodeString ffffSrc("${ffff}");
4461     UnicodeString ffff("\\uffff", -1, US_INV);
4462     ffff = ffff.unescape();
4463 
4464     //  regexp for $-[0], $+[2], etc.
4465     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4466     RegexMatcher *groupsMat = groupsPat->matcher(status);
4467 
4468     //  regexp for $0, $1, $2, etc.
4469     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4470     RegexMatcher *cgMat = cgPat->matcher(status);
4471 
4472 
4473     //
4474     // Main Loop for the Perl Tests, runs once per line from the
4475     //   test data file.
4476     //
4477     int32_t  lineNum = 0;
4478     int32_t  skippedUnimplementedCount = 0;
4479     while (lineMat->find()) {
4480         lineNum++;
4481 
4482         //
4483         //  Get a line, break it into its fields, do the Perl
4484         //    variable substitutions.
4485         //
4486         UnicodeString line = lineMat->group(1, status);
4487         UnicodeString fields[7];
4488         fieldPat->split(line, fields, 7, status);
4489 
4490         flagMat->reset(fields[0]);
4491         flagMat->matches(status);
4492         UnicodeString pattern  = flagMat->group(2, status);
4493         pattern.findAndReplace("${bang}", "!");
4494         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4495         pattern.findAndReplace(ffffSrc, ffff);
4496 
4497         //
4498         //  Identify patterns that include match flag settings,
4499         //    split off the flags, remove the extra quotes.
4500         //
4501         UnicodeString flagStr = flagMat->group(3, status);
4502         if (U_FAILURE(status)) {
4503             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4504             return;
4505         }
4506         int32_t flags = 0;
4507         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4508         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4509         const UChar UChar_m = 0x6d;
4510         const UChar UChar_x = 0x78;
4511         const UChar UChar_y = 0x79;
4512         if (flagStr.indexOf(UChar_i) != -1) {
4513             flags |= UREGEX_CASE_INSENSITIVE;
4514         }
4515         if (flagStr.indexOf(UChar_m) != -1) {
4516             flags |= UREGEX_MULTILINE;
4517         }
4518         if (flagStr.indexOf(UChar_x) != -1) {
4519             flags |= UREGEX_COMMENTS;
4520         }
4521 
4522         //
4523         // Put the pattern in a UTF-8 UText
4524         //
4525         status = U_ZERO_ERROR;
4526         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4527         if (status == U_BUFFER_OVERFLOW_ERROR) {
4528             status = U_ZERO_ERROR;
4529             delete[] patternChars;
4530             patternCapacity = patternLength + 1;
4531             patternChars = new char[patternCapacity];
4532             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4533         }
4534         utext_openUTF8(&patternText, patternChars, patternLength, &status);
4535 
4536         //
4537         // Compile the test pattern.
4538         //
4539         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4540         if (status == U_REGEX_UNIMPLEMENTED) {
4541             //
4542             // Test of a feature that is planned for ICU, but not yet implemented.
4543             //   skip the test.
4544             skippedUnimplementedCount++;
4545             delete testPat;
4546             status = U_ZERO_ERROR;
4547             continue;
4548         }
4549 
4550         if (U_FAILURE(status)) {
4551             // Some tests are supposed to generate errors.
4552             //   Only report an error for tests that are supposed to succeed.
4553             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4554                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4555             {
4556                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4557             }
4558             status = U_ZERO_ERROR;
4559             delete testPat;
4560             continue;
4561         }
4562 
4563         if (fields[2].indexOf(UChar_i) >= 0) {
4564             // ICU should skip this test.
4565             delete testPat;
4566             continue;
4567         }
4568 
4569         if (fields[2].indexOf(UChar_c) >= 0) {
4570             // This pattern should have caused a compilation error, but didn't/
4571             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4572             delete testPat;
4573             continue;
4574         }
4575 
4576 
4577         //
4578         // replace the Perl variables that appear in some of the
4579         //   match data strings.
4580         //
4581         UnicodeString matchString = fields[1];
4582         matchString.findAndReplace(nulnulSrc, nulnul);
4583         matchString.findAndReplace(ffffSrc,   ffff);
4584 
4585         // Replace any \n in the match string with an actual new-line char.
4586         //  Don't do full unescape, as this unescapes more than Perl does, which
4587         //  causes other spurious failures in the tests.
4588         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4589 
4590         //
4591         // Put the input in a UTF-8 UText
4592         //
4593         status = U_ZERO_ERROR;
4594         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4595         if (status == U_BUFFER_OVERFLOW_ERROR) {
4596             status = U_ZERO_ERROR;
4597             delete[] inputChars;
4598             inputCapacity = inputLength + 1;
4599             inputChars = new char[inputCapacity];
4600             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4601         }
4602         utext_openUTF8(&inputText, inputChars, inputLength, &status);
4603 
4604         //
4605         // Run the test, check for expected match/don't match result.
4606         //
4607         RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4608         UBool found = testMat->find();
4609         UBool expected = FALSE;
4610         if (fields[2].indexOf(UChar_y) >=0) {
4611             expected = TRUE;
4612         }
4613         if (expected != found) {
4614             errln("line %d: Expected %smatch, got %smatch",
4615                 lineNum, expected?"":"no ", found?"":"no " );
4616             continue;
4617         }
4618 
4619         // Don't try to check expected results if there is no match.
4620         //   (Some have stuff in the expected fields)
4621         if (!found) {
4622             delete testMat;
4623             delete testPat;
4624             continue;
4625         }
4626 
4627         //
4628         // Interpret the Perl expression from the fourth field of the data file,
4629         // building up an ICU string from the results of the ICU match.
4630         //   The Perl expression will contain references to the results of
4631         //     a regex match, including the matched string, capture group strings,
4632         //     group starting and ending indicies, etc.
4633         //
4634         UnicodeString resultString;
4635         UnicodeString perlExpr = fields[3];
4636 
4637         while (perlExpr.length() > 0) {
4638             groupsMat->reset(perlExpr);
4639             cgMat->reset(perlExpr);
4640 
4641             if (perlExpr.startsWith("$&")) {
4642                 resultString.append(testMat->group(status));
4643                 perlExpr.remove(0, 2);
4644             }
4645 
4646             else if (groupsMat->lookingAt(status)) {
4647                 // $-[0]   $+[2]  etc.
4648                 UnicodeString digitString = groupsMat->group(2, status);
4649                 int32_t t = 0;
4650                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4651                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4652                 int32_t matchPosition;
4653                 if (plusOrMinus.compare("+") == 0) {
4654                     matchPosition = testMat->end(groupNum, status);
4655                 } else {
4656                     matchPosition = testMat->start(groupNum, status);
4657                 }
4658                 if (matchPosition != -1) {
4659                     ICU_Utility::appendNumber(resultString, matchPosition);
4660                 }
4661                 perlExpr.remove(0, groupsMat->end(status));
4662             }
4663 
4664             else if (cgMat->lookingAt(status)) {
4665                 // $1, $2, $3, etc.
4666                 UnicodeString digitString = cgMat->group(1, status);
4667                 int32_t t = 0;
4668                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4669                 if (U_SUCCESS(status)) {
4670                     resultString.append(testMat->group(groupNum, status));
4671                     status = U_ZERO_ERROR;
4672                 }
4673                 perlExpr.remove(0, cgMat->end(status));
4674             }
4675 
4676             else if (perlExpr.startsWith("@-")) {
4677                 int32_t i;
4678                 for (i=0; i<=testMat->groupCount(); i++) {
4679                     if (i>0) {
4680                         resultString.append(" ");
4681                     }
4682                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4683                 }
4684                 perlExpr.remove(0, 2);
4685             }
4686 
4687             else if (perlExpr.startsWith("@+")) {
4688                 int32_t i;
4689                 for (i=0; i<=testMat->groupCount(); i++) {
4690                     if (i>0) {
4691                         resultString.append(" ");
4692                     }
4693                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4694                 }
4695                 perlExpr.remove(0, 2);
4696             }
4697 
4698             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4699                                                      //           or as an escaped sequence (e.g. \n)
4700                 if (perlExpr.length() > 1) {
4701                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4702                 }
4703                 UChar c = perlExpr.charAt(0);
4704                 switch (c) {
4705                 case 'n':   c = '\n'; break;
4706                 // add any other escape sequences that show up in the test expected results.
4707                 }
4708                 resultString.append(c);
4709                 perlExpr.remove(0, 1);
4710             }
4711 
4712             else  {
4713                 // Any characters from the perl expression that we don't explicitly
4714                 //  recognize before here are assumed to be literals and copied
4715                 //  as-is to the expected results.
4716                 resultString.append(perlExpr.charAt(0));
4717                 perlExpr.remove(0, 1);
4718             }
4719 
4720             if (U_FAILURE(status)) {
4721                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4722                 break;
4723             }
4724         }
4725 
4726         //
4727         // Expected Results Compare
4728         //
4729         UnicodeString expectedS(fields[4]);
4730         expectedS.findAndReplace(nulnulSrc, nulnul);
4731         expectedS.findAndReplace(ffffSrc,   ffff);
4732         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4733 
4734 
4735         if (expectedS.compare(resultString) != 0) {
4736             err("Line %d: Incorrect perl expression results.", lineNum);
4737             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4738         }
4739 
4740         delete testMat;
4741         delete testPat;
4742     }
4743 
4744     //
4745     // All done.  Clean up allocated stuff.
4746     //
4747     delete cgMat;
4748     delete cgPat;
4749 
4750     delete groupsMat;
4751     delete groupsPat;
4752 
4753     delete flagMat;
4754     delete flagPat;
4755 
4756     delete lineMat;
4757     delete linePat;
4758 
4759     delete fieldPat;
4760     delete [] testData;
4761 
4762     utext_close(&patternText);
4763     utext_close(&inputText);
4764 
4765     delete [] patternChars;
4766     delete [] inputChars;
4767 
4768 
4769     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4770 
4771 }
4772 
4773 
4774 //--------------------------------------------------------------
4775 //
4776 //  Bug6149   Verify limits to heap expansion for backtrack stack.
4777 //             Use this pattern,
4778 //                 "(a?){1,8000000}"
//             Note: this was originally an unbounded upper bound, but that now has loop-breaking enabled.
4780 //                   This test is likely to be fragile, as further optimizations stop
4781 //                   more cases of pointless looping in the match engine.
4782 //
4783 //---------------------------------------------------------------
Bug6149()4784 void RegexTest::Bug6149() {
4785     UnicodeString pattern("(a?){1,8000000}");
4786     UnicodeString s("xyz");
4787     uint32_t flags = 0;
4788     UErrorCode status = U_ZERO_ERROR;
4789 
4790     RegexMatcher  matcher(pattern, s, flags, status);
4791     UBool result = false;
4792     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4793     REGEX_ASSERT(result == FALSE);
4794  }
4795 
4796 
4797 //
4798 //   Callbacks()    Test the callback function.
4799 //                  When set, callbacks occur periodically during matching operations,
4800 //                  giving the application code the ability to abort the operation
//                  before its normal completion.
4802 //
4803 
4804 struct callBackContext {
4805     RegexTest        *test;
4806     int32_t          maxCalls;
4807     int32_t          numCalls;
4808     int32_t          lastSteps;
resetcallBackContext4809     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4810 };
4811 
4812 U_CDECL_BEGIN
4813 static UBool U_CALLCONV
testCallBackFn(const void * context,int32_t steps)4814 testCallBackFn(const void *context, int32_t steps) {
4815     callBackContext  *info = (callBackContext *)context;
4816     if (info->lastSteps+1 != steps) {
4817         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4818     }
4819     info->lastSteps = steps;
4820     info->numCalls++;
4821     return (info->numCalls < info->maxCalls);
4822 }
4823 U_CDECL_END
4824 
Callbacks()4825 void RegexTest::Callbacks() {
4826    {
4827         // Getter returns NULLs if no callback has been set
4828 
4829         //   The variables that the getter will fill in.
4830         //   Init to non-null values so that the action of the getter can be seen.
4831         const void          *returnedContext = &returnedContext;
4832         URegexMatchCallback *returnedFn = &testCallBackFn;
4833 
4834         UErrorCode status = U_ZERO_ERROR;
4835         RegexMatcher matcher("x", 0, status);
4836         REGEX_CHECK_STATUS;
4837         matcher.getMatchCallback(returnedFn, returnedContext, status);
4838         REGEX_CHECK_STATUS;
4839         REGEX_ASSERT(returnedFn == NULL);
4840         REGEX_ASSERT(returnedContext == NULL);
4841     }
4842 
4843    {
4844         // Set and Get work
4845         callBackContext cbInfo = {this, 0, 0, 0};
4846         const void          *returnedContext;
4847         URegexMatchCallback *returnedFn;
4848         UErrorCode status = U_ZERO_ERROR;
4849         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4850         REGEX_CHECK_STATUS;
4851         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4852         REGEX_CHECK_STATUS;
4853         matcher.getMatchCallback(returnedFn, returnedContext, status);
4854         REGEX_CHECK_STATUS;
4855         REGEX_ASSERT(returnedFn == testCallBackFn);
4856         REGEX_ASSERT(returnedContext == &cbInfo);
4857 
4858         // A short-running match shouldn't invoke the callback
4859         status = U_ZERO_ERROR;
4860         cbInfo.reset(1);
4861         UnicodeString s = "xxx";
4862         matcher.reset(s);
4863         REGEX_ASSERT(matcher.matches(status));
4864         REGEX_CHECK_STATUS;
4865         REGEX_ASSERT(cbInfo.numCalls == 0);
4866 
4867         // A medium-length match that runs long enough to invoke the
4868         //   callback, but not so long that the callback aborts it.
4869         status = U_ZERO_ERROR;
4870         cbInfo.reset(4);
4871         s = "aaaaaaaaaaaaaaaaaaab";
4872         matcher.reset(s);
4873         REGEX_ASSERT(matcher.matches(status)==FALSE);
4874         REGEX_CHECK_STATUS;
4875         REGEX_ASSERT(cbInfo.numCalls > 0);
4876 
4877         // A longer running match that the callback function will abort.
4878         status = U_ZERO_ERROR;
4879         cbInfo.reset(4);
4880         s = "aaaaaaaaaaaaaaaaaaaaaaab";
4881         matcher.reset(s);
4882         REGEX_ASSERT(matcher.matches(status)==FALSE);
4883         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4884         REGEX_ASSERT(cbInfo.numCalls == 4);
4885 
4886         // A longer running find that the callback function will abort.
4887         status = U_ZERO_ERROR;
4888         cbInfo.reset(4);
4889         s = "aaaaaaaaaaaaaaaaaaaaaaab";
4890         matcher.reset(s);
4891         REGEX_ASSERT(matcher.find(status)==FALSE);
4892         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4893         REGEX_ASSERT(cbInfo.numCalls == 4);
4894     }
4895 
4896 
4897 }
4898 
4899 
4900 //
4901 //   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during find operations
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
4905 //
4906 
4907 struct progressCallBackContext {
4908     RegexTest        *test;
4909     int64_t          lastIndex;
4910     int32_t          maxCalls;
4911     int32_t          numCalls;
resetprogressCallBackContext4912     void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4913 };
4914 
4915 // call-back function for find().
4916 // Return TRUE to continue the find().
4917 // Return FALSE to stop the find().
4918 U_CDECL_BEGIN
4919 static UBool U_CALLCONV
testProgressCallBackFn(const void * context,int64_t matchIndex)4920 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4921     progressCallBackContext  *info = (progressCallBackContext *)context;
4922     info->numCalls++;
4923     info->lastIndex = matchIndex;
4924 //    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4925     return (info->numCalls < info->maxCalls);
4926 }
4927 U_CDECL_END
4928 
FindProgressCallbacks()4929 void RegexTest::FindProgressCallbacks() {
4930    {
4931         // Getter returns NULLs if no callback has been set
4932 
4933         //   The variables that the getter will fill in.
4934         //   Init to non-null values so that the action of the getter can be seen.
4935         const void                  *returnedContext = &returnedContext;
4936         URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
4937 
4938         UErrorCode status = U_ZERO_ERROR;
4939         RegexMatcher matcher("x", 0, status);
4940         REGEX_CHECK_STATUS;
4941         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4942         REGEX_CHECK_STATUS;
4943         REGEX_ASSERT(returnedFn == NULL);
4944         REGEX_ASSERT(returnedContext == NULL);
4945     }
4946 
4947    {
4948         // Set and Get work
4949         progressCallBackContext cbInfo = {this, 0, 0, 0};
4950         const void                  *returnedContext;
4951         URegexFindProgressCallback  *returnedFn;
4952         UErrorCode status = U_ZERO_ERROR;
4953         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
4954         REGEX_CHECK_STATUS;
4955         matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4956         REGEX_CHECK_STATUS;
4957         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4958         REGEX_CHECK_STATUS;
4959         REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4960         REGEX_ASSERT(returnedContext == &cbInfo);
4961 
4962         // A find that matches on the initial position does NOT invoke the callback.
4963         status = U_ZERO_ERROR;
4964         cbInfo.reset(100);
4965         UnicodeString s = "aaxxx";
4966         matcher.reset(s);
4967 #if 0
4968         matcher.setTrace(TRUE);
4969 #endif
4970         REGEX_ASSERT(matcher.find(0, status));
4971         REGEX_CHECK_STATUS;
4972         REGEX_ASSERT(cbInfo.numCalls == 0);
4973 
4974         // A medium running find() that causes matcher.find() to invoke our callback for each index,
4975         //   but not so many times that we interrupt the operation.
4976         status = U_ZERO_ERROR;
4977         s = "aaaaaaaaaaaaaaaaaaab";
4978         cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
4979         matcher.reset(s);
4980         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4981         REGEX_CHECK_STATUS;
4982         REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4983 
4984         // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4985         status = U_ZERO_ERROR;
4986         UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4987         cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
4988         matcher.reset(s1);
4989         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4990         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4991         REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4992 
4993         // Now a match that will succeed, but after an interruption
4994         status = U_ZERO_ERROR;
4995         UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4996         cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
4997         matcher.reset(s2);
4998         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4999         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
5000         // Now retry the match from where left off
5001         cbInfo.maxCalls = 100; //  No callback limit
5002         status = U_ZERO_ERROR;
5003         REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
5004         REGEX_CHECK_STATUS;
5005     }
5006 
5007 
5008 }
5009 
5010 
5011 //---------------------------------------------------------------------------
5012 //
5013 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
5014 //                             UTexts. The pure-C implementation of UText
5015 //                             has no mutable backing stores, but we can
5016 //                             use UnicodeString here to test the functionality.
5017 //
5018 //---------------------------------------------------------------------------
PreAllocatedUTextCAPI()5019 void RegexTest::PreAllocatedUTextCAPI () {
5020     UErrorCode           status = U_ZERO_ERROR;
5021     URegularExpression  *re;
5022     UText                patternText = UTEXT_INITIALIZER;
5023     UnicodeString        buffer;
5024     UText                bufferText = UTEXT_INITIALIZER;
5025 
5026     utext_openUnicodeString(&bufferText, &buffer, &status);
5027 
5028     /*
5029      *  getText() and getUText()
5030      */
5031     {
5032         UText  text1 = UTEXT_INITIALIZER;
5033         UText  text2 = UTEXT_INITIALIZER;
5034         UChar  text2Chars[20];
5035         UText  *resultText;
5036 
5037         status = U_ZERO_ERROR;
5038         regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
5039         regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
5040         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
5041         utext_openUChars(&text2, text2Chars, -1, &status);
5042 
5043         regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
5044         re = uregex_openUText(&patternText, 0, NULL, &status);
5045 
5046         /* First set a UText */
5047         uregex_setUText(re, &text1, &status);
5048         resultText = uregex_getUText(re, &bufferText, &status);
5049         REGEX_CHECK_STATUS;
5050         REGEX_ASSERT(resultText == &bufferText);
5051         utext_setNativeIndex(resultText, 0);
5052         utext_setNativeIndex(&text1, 0);
5053         REGEX_ASSERT(testUTextEqual(resultText, &text1));
5054 
5055         resultText = uregex_getUText(re, &bufferText, &status);
5056         REGEX_CHECK_STATUS;
5057         REGEX_ASSERT(resultText == &bufferText);
5058         utext_setNativeIndex(resultText, 0);
5059         utext_setNativeIndex(&text1, 0);
5060         REGEX_ASSERT(testUTextEqual(resultText, &text1));
5061 
5062         /* Then set a UChar * */
5063         uregex_setText(re, text2Chars, 7, &status);
5064         resultText = uregex_getUText(re, &bufferText, &status);
5065         REGEX_CHECK_STATUS;
5066         REGEX_ASSERT(resultText == &bufferText);
5067         utext_setNativeIndex(resultText, 0);
5068         utext_setNativeIndex(&text2, 0);
5069         REGEX_ASSERT(testUTextEqual(resultText, &text2));
5070 
5071         uregex_close(re);
5072         utext_close(&text1);
5073         utext_close(&text2);
5074     }
5075 
5076     /*
5077      *  group()
5078      */
5079     {
5080         UChar    text1[80];
5081         UText   *actual;
5082         UBool    result;
5083         int64_t  length = 0;
5084 
5085         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  UPRV_LENGTHOF(text1));
5086         //                  012345678901234567890123456789012345678901234567
5087         //                  0         1         2         3         4
5088 
5089         status = U_ZERO_ERROR;
5090         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5091         REGEX_CHECK_STATUS;
5092 
5093         uregex_setText(re, text1, -1, &status);
5094         result = uregex_find(re, 0, &status);
5095         REGEX_ASSERT(result==TRUE);
5096 
5097         /*  Capture Group 0, the full match.  Should succeed. "abc interior def" */
5098         status = U_ZERO_ERROR;
5099         actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5100         REGEX_CHECK_STATUS;
5101         REGEX_ASSERT(actual == &bufferText);
5102         REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5103         REGEX_ASSERT(length == 16);
5104         REGEX_ASSERT(utext_nativeLength(actual) == 47);
5105 
5106         /*  Capture group #1.  Should succeed, matching " interior ". */
5107         status = U_ZERO_ERROR;
5108         actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5109         REGEX_CHECK_STATUS;
5110         REGEX_ASSERT(actual == &bufferText);
5111         REGEX_ASSERT(utext_getNativeIndex(actual) == 9);   // position of " interior "
5112         REGEX_ASSERT(length == 10);
5113         REGEX_ASSERT(utext_nativeLength(actual) == 47);
5114 
5115         /*  Capture group out of range.  Error. */
5116         status = U_ZERO_ERROR;
5117         actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5118         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5119         REGEX_ASSERT(actual == &bufferText);
5120         uregex_close(re);
5121 
5122     }
5123 
5124     /*
5125      *  replaceFirst()
5126      */
5127     {
5128         UChar    text1[80];
5129         UChar    text2[80];
5130         UText    replText = UTEXT_INITIALIZER;
5131         UText   *result;
5132         status = U_ZERO_ERROR;
5133         utext_openUnicodeString(&bufferText, &buffer, &status);
5134 
5135         status = U_ZERO_ERROR;
5136         u_uastrncpy(text1, "Replace xaax x1x x...x.",  UPRV_LENGTHOF(text1));
5137         u_uastrncpy(text2, "No match here.",  UPRV_LENGTHOF(text2)/2);
5138         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5139 
5140         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5141         REGEX_CHECK_STATUS;
5142 
5143         /*  Normal case, with match */
5144         uregex_setText(re, text1, -1, &status);
5145         REGEX_CHECK_STATUS;
5146         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5147         REGEX_CHECK_STATUS;
5148         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5149         REGEX_CHECK_STATUS;
5150         REGEX_ASSERT(result == &bufferText);
5151         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5152 
5153         /* No match.  Text should copy to output with no changes.  */
5154         uregex_setText(re, text2, -1, &status);
5155         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5156         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5157         REGEX_CHECK_STATUS;
5158         REGEX_ASSERT(result == &bufferText);
5159         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5160 
5161         /* Unicode escapes */
5162         uregex_setText(re, text1, -1, &status);
5163         regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5164         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5165         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5166         REGEX_CHECK_STATUS;
5167         REGEX_ASSERT(result == &bufferText);
5168         REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5169 
5170         uregex_close(re);
5171         utext_close(&replText);
5172     }
5173 
5174 
5175     /*
5176      *  replaceAll()
5177      */
5178     {
5179         UChar    text1[80];
5180         UChar    text2[80];
5181         UText    replText = UTEXT_INITIALIZER;
5182         UText   *result;
5183 
5184         status = U_ZERO_ERROR;
5185         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5186         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5187         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5188 
5189         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5190         REGEX_CHECK_STATUS;
5191 
5192         /*  Normal case, with match */
5193         uregex_setText(re, text1, -1, &status);
5194         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5195         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5196         REGEX_CHECK_STATUS;
5197         REGEX_ASSERT(result == &bufferText);
5198         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5199 
5200         /* No match.  Text should copy to output with no changes.  */
5201         uregex_setText(re, text2, -1, &status);
5202         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5203         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5204         REGEX_CHECK_STATUS;
5205         REGEX_ASSERT(result == &bufferText);
5206         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5207 
5208         uregex_close(re);
5209         utext_close(&replText);
5210     }
5211 
5212 
5213     /*
5214      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5215      *   so we don't need to test it here.
5216      */
5217 
5218     utext_close(&bufferText);
5219     utext_close(&patternText);
5220 }
5221 
5222 
5223 //--------------------------------------------------------------
5224 //
5225 //  NamedCapture   Check basic named capture group functionality
5226 //
5227 //--------------------------------------------------------------
NamedCapture()5228 void RegexTest::NamedCapture() {
5229     UErrorCode status = U_ZERO_ERROR;
5230     RegexPattern *pat = RegexPattern::compile(UnicodeString(
5231             "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5232     REGEX_CHECK_STATUS;
5233     int32_t group = pat->groupNumberFromName("five", -1, status);
5234     REGEX_CHECK_STATUS;
5235     REGEX_ASSERT(5 == group);
5236     group = pat->groupNumberFromName("three", -1, status);
5237     REGEX_CHECK_STATUS;
5238     REGEX_ASSERT(3 == group);
5239 
5240     status = U_ZERO_ERROR;
5241     group = pat->groupNumberFromName(UnicodeString("six"), status);
5242     REGEX_CHECK_STATUS;
5243     REGEX_ASSERT(6 == group);
5244 
5245     status = U_ZERO_ERROR;
5246     group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5247     U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5248 
5249     status = U_ZERO_ERROR;
5250 
5251     // After copying a pattern, named capture should still work in the copy.
5252     RegexPattern *copiedPat = new RegexPattern(*pat);
5253     REGEX_ASSERT(*copiedPat == *pat);
5254     delete pat; pat = NULL;  // Delete original, copy should have no references back to it.
5255 
5256     group = copiedPat->groupNumberFromName("five", -1, status);
5257     REGEX_CHECK_STATUS;
5258     REGEX_ASSERT(5 == group);
5259     group = copiedPat->groupNumberFromName("three", -1, status);
5260     REGEX_CHECK_STATUS;
5261     REGEX_ASSERT(3 == group);
5262     delete copiedPat;
5263 
5264     // ReplaceAll with named capture group.
5265     status = U_ZERO_ERROR;
5266     UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5267     RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5268     REGEX_CHECK_STATUS;
5269     // m.pattern().dumpPattern();
5270     UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5271     REGEX_CHECK_STATUS;
5272     REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5273     delete m;
5274 
5275     // ReplaceAll, allowed capture group numbers.
5276     text = UnicodeString("abcmxyz");
5277     m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5278     REGEX_CHECK_STATUS;
5279 
5280     status = U_ZERO_ERROR;
5281     replacedText  = m->replaceAll(UnicodeString("<$0>"), status);   // group 0, full match, is allowed.
5282     REGEX_CHECK_STATUS;
5283     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5284 
5285     status = U_ZERO_ERROR;
5286     replacedText  = m->replaceAll(UnicodeString("<$1>"), status);      // group 1 by number.
5287     REGEX_CHECK_STATUS;
5288     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5289 
5290     status = U_ZERO_ERROR;
5291     replacedText  = m->replaceAll(UnicodeString("<${one}>"), status);   // group 1 by name.
5292     REGEX_CHECK_STATUS;
5293     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5294 
5295     status = U_ZERO_ERROR;
5296     replacedText  = m->replaceAll(UnicodeString("<$2>"), status);   // group 2.
5297     REGEX_CHECK_STATUS;
5298     REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5299 
5300     status = U_ZERO_ERROR;
5301     replacedText  = m->replaceAll(UnicodeString("<$3>"), status);
5302     REGEX_CHECK_STATUS;
5303     REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5304 
5305     status = U_ZERO_ERROR;
5306     replacedText  = m->replaceAll(UnicodeString("<$4>"), status);
5307     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5308 
5309     status = U_ZERO_ERROR;
5310     replacedText  = m->replaceAll(UnicodeString("<$04>"), status);      // group 0, leading 0,
5311     REGEX_CHECK_STATUS;                                                 //    trailing out-of-range 4 passes through.
5312     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5313 
5314     status = U_ZERO_ERROR;
5315     replacedText  = m->replaceAll(UnicodeString("<$000016>"), status);  // Consume leading zeroes. Don't consume digits
5316     REGEX_CHECK_STATUS;                                                 //   that push group num out of range.
5317     REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText);              //   This is group 1.
5318 
5319     status = U_ZERO_ERROR;
5320     replacedText  = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5321     REGEX_CHECK_STATUS;
5322     REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5323 
5324     status = U_ZERO_ERROR;
5325     replacedText  = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5326     REGEX_CHECK_STATUS;
5327     REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5328 
5329     status = U_ZERO_ERROR;
5330     replacedText  = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5331     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5332 
5333     status = U_ZERO_ERROR;
5334     replacedText  = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5335     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5336 
5337     status = U_ZERO_ERROR;
5338     replacedText  = m->replaceAll(UnicodeString("<${one"), status);
5339     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5340 
5341     status = U_ZERO_ERROR;
5342     replacedText  = m->replaceAll(UnicodeString("$not a capture group"), status);
5343     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5344 
5345     delete m;
5346 
5347     // Repeat the above replaceAll() tests using the plain C API, which
5348     //  has a separate implementation internally.
5349     //  TODO: factor out the test data.
5350 
5351     status = U_ZERO_ERROR;
5352     URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5353     REGEX_CHECK_STATUS;
5354     text = UnicodeString("abcmxyz");
5355     uregex_setText(re, text.getBuffer(), text.length(), &status);
5356     REGEX_CHECK_STATUS;
5357 
5358     UChar resultBuf[100];
5359     int32_t resultLength;
5360     UnicodeString repl;
5361 
5362     status = U_ZERO_ERROR;
5363     repl = UnicodeString("<$0>");
5364     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5365     REGEX_CHECK_STATUS;
5366     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5367 
5368     status = U_ZERO_ERROR;
5369     repl = UnicodeString("<$1>");
5370     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5371     REGEX_CHECK_STATUS;
5372     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5373 
5374     status = U_ZERO_ERROR;
5375     repl = UnicodeString("<${one}>");
5376     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5377     REGEX_CHECK_STATUS;
5378     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5379 
5380     status = U_ZERO_ERROR;
5381     repl = UnicodeString("<$2>");
5382     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5383     REGEX_CHECK_STATUS;
5384     REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5385 
5386     status = U_ZERO_ERROR;
5387     repl = UnicodeString("<$3>");
5388     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5389     REGEX_CHECK_STATUS;
5390     REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5391 
5392     status = U_ZERO_ERROR;
5393     repl = UnicodeString("<$4>");
5394     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5395     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5396 
5397     status = U_ZERO_ERROR;
5398     repl = UnicodeString("<$04>");
5399     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5400     REGEX_CHECK_STATUS;
5401     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5402 
5403     status = U_ZERO_ERROR;
5404     repl = UnicodeString("<$000016>");
5405     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5406     REGEX_CHECK_STATUS;
5407     REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5408 
5409     status = U_ZERO_ERROR;
5410     repl = UnicodeString("<$3$2$1${one}>");
5411     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5412     REGEX_CHECK_STATUS;
5413     REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5414 
5415     status = U_ZERO_ERROR;
5416     repl = UnicodeString("$3$2$1${one}");
5417     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5418     REGEX_CHECK_STATUS;
5419     REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5420 
5421     status = U_ZERO_ERROR;
5422     repl = UnicodeString("<${noSuchName}>");
5423     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5424     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5425 
5426     status = U_ZERO_ERROR;
5427     repl = UnicodeString("<${invalid-name}>");
5428     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5429     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5430 
5431     status = U_ZERO_ERROR;
5432     repl = UnicodeString("<${one");
5433     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5434     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5435 
5436     status = U_ZERO_ERROR;
5437     repl = UnicodeString("$not a capture group");
5438     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5439     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5440 
5441     uregex_close(re);
5442 }
5443 
5444 //--------------------------------------------------------------
5445 //
5446 //  NamedCaptureLimits   Patterns with huge numbers of named capture groups.
5447 //                       The point is not so much what the exact limit is,
5448 //                       but that a largish number doesn't hit bad non-linear performance,
5449 //                       and that exceeding the limit fails cleanly.
5450 //
5451 //--------------------------------------------------------------
NamedCaptureLimits()5452 void RegexTest::NamedCaptureLimits() {
5453     if (quick) {
5454         logln("Skipping test. Runs in exhuastive mode only.");
5455         return;
5456     }
5457     const int32_t goodLimit = 1000000;     // Pattern w this many groups builds successfully.
5458     const int32_t failLimit = 10000000;    // Pattern exceeds internal limits, fails to compile.
5459     char nnbuf[100];
5460     UnicodeString pattern;
5461     int32_t nn;
5462 
5463     for (nn=1; nn<goodLimit; nn++) {
5464         sprintf(nnbuf, "(?<nn%d>)", nn);
5465         pattern.append(UnicodeString(nnbuf, -1, US_INV));
5466     }
5467     UErrorCode status = U_ZERO_ERROR;
5468     RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5469     REGEX_CHECK_STATUS;
5470     for (nn=1; nn<goodLimit; nn++) {
5471         sprintf(nnbuf, "nn%d", nn);
5472         int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5473         REGEX_ASSERT(nn == groupNum);
5474         if (nn != groupNum) {
5475             break;
5476         }
5477     }
5478     delete pat;
5479 
5480     pattern.remove();
5481     for (nn=1; nn<failLimit; nn++) {
5482         sprintf(nnbuf, "(?<nn%d>)", nn);
5483         pattern.append(UnicodeString(nnbuf, -1, US_INV));
5484     }
5485     status = U_ZERO_ERROR;
5486     pat = RegexPattern::compile(pattern, 0, status);
5487     REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5488     delete pat;
5489 }
5490 
5491 
5492 //--------------------------------------------------------------
5493 //
5494 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
5495 //
5496 //---------------------------------------------------------------
Bug7651()5497 void RegexTest::Bug7651() {
5498     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5499     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5500     //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5501     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5502     UnicodeString s("#ff @abcd This is test");
5503     RegexPattern  *REPattern = NULL;
5504     RegexMatcher  *REMatcher = NULL;
5505     UErrorCode status = U_ZERO_ERROR;
5506     UParseError pe;
5507 
5508     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5509     REGEX_CHECK_STATUS;
5510     REMatcher = REPattern->matcher(s, status);
5511     REGEX_CHECK_STATUS;
5512     REGEX_ASSERT(REMatcher->find());
5513     REGEX_ASSERT(REMatcher->start(status) == 0);
5514     delete REPattern;
5515     delete REMatcher;
5516     status = U_ZERO_ERROR;
5517 
5518     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5519     REGEX_CHECK_STATUS;
5520     REMatcher = REPattern->matcher(s, status);
5521     REGEX_CHECK_STATUS;
5522     REGEX_ASSERT(REMatcher->find());
5523     REGEX_ASSERT(REMatcher->start(status) == 0);
5524     delete REPattern;
5525     delete REMatcher;
5526     status = U_ZERO_ERROR;
5527  }
5528 
Bug7740()5529 void RegexTest::Bug7740() {
5530     UErrorCode status = U_ZERO_ERROR;
5531     UnicodeString pattern = "(a)";
5532     UnicodeString text = "abcdef";
5533     RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5534     REGEX_CHECK_STATUS;
5535     REGEX_ASSERT(m->lookingAt(status));
5536     REGEX_CHECK_STATUS;
5537     status = U_ILLEGAL_ARGUMENT_ERROR;
5538     UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
5539     REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5540     REGEX_ASSERT(s == "");
5541     delete m;
5542 }
5543 
// Bug 8479:  was crashing with a Bogus UnicodeString as input.
5545 
Bug8479()5546 void RegexTest::Bug8479() {
5547     UErrorCode status = U_ZERO_ERROR;
5548 
5549     RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5550     REGEX_CHECK_STATUS;
5551     if (U_SUCCESS(status))
5552     {
5553         UnicodeString str;
5554         str.setToBogus();
5555         pMatcher->reset(str);
5556         status = U_ZERO_ERROR;
5557         pMatcher->matches(status);
5558         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5559         delete pMatcher;
5560     }
5561 }
5562 
5563 
5564 // Bug 7029
Bug7029()5565 void RegexTest::Bug7029() {
5566     UErrorCode status = U_ZERO_ERROR;
5567 
5568     RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5569     UnicodeString text = "abc.def";
5570     UnicodeString splits[10];
5571     REGEX_CHECK_STATUS;
5572     int32_t numFields = pMatcher->split(text, splits, 10, status);
5573     REGEX_CHECK_STATUS;
5574     REGEX_ASSERT(numFields == 8);
5575     delete pMatcher;
5576 }
5577 
5578 // Bug 9283
//   This test is checking for the existence of any supplemental characters that case-fold
5580 //   to a bmp character.
5581 //
5582 //   At the time of this writing there are none. If any should appear in a subsequent release
5583 //   of Unicode, the code in regular expressions compilation that determines the longest
//   possible match for a literal string will need to be enhanced.
5585 //
5586 //   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5587 //   for details on what to do in case of a failure of this test.
5588 //
Bug9283()5589 void RegexTest::Bug9283() {
5590 #if !UCONFIG_NO_NORMALIZATION
5591     UErrorCode status = U_ZERO_ERROR;
5592     UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5593     REGEX_CHECK_STATUS;
5594     int32_t index;
5595     UChar32 c;
5596     for (index=0; ; index++) {
5597         c = supplementalsWithCaseFolding.charAt(index);
5598         if (c == -1) {
5599             break;
5600         }
5601         UnicodeString cf = UnicodeString(c).foldCase();
5602         REGEX_ASSERT(cf.length() >= 2);
5603     }
5604 #endif /* #if !UCONFIG_NO_NORMALIZATION */
5605 }
5606 
5607 
CheckInvBufSize()5608 void RegexTest::CheckInvBufSize() {
5609   if(inv_next>=INV_BUFSIZ) {
5610     errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5611           __FILE__, INV_BUFSIZ, inv_next);
5612   } else {
5613     logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5614   }
5615 }
5616 
5617 
Bug10459()5618 void RegexTest::Bug10459() {
5619     UErrorCode status = U_ZERO_ERROR;
5620     UnicodeString patternString("(txt)");
5621     UnicodeString txtString("txt");
5622 
5623     UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5624     REGEX_CHECK_STATUS;
5625     UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5626     REGEX_CHECK_STATUS;
5627 
5628     URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5629     REGEX_CHECK_STATUS;
5630 
5631     uregex_setUText(icu_re, utext_txt, &status);
5632     REGEX_CHECK_STATUS;
5633 
5634     // The bug was that calling uregex_group() before doing a matching operation
5635     //   was causing a segfault. Only for Regular Expressions created from UText.
5636     //   It should set an U_REGEX_INVALID_STATE.
5637 
5638     UChar buf[100];
5639     int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5640     REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5641     REGEX_ASSERT(len == 0);
5642 
5643     uregex_close(icu_re);
5644     utext_close(utext_pat);
5645     utext_close(utext_txt);
5646 }
5647 
TestCaseInsensitiveStarters()5648 void RegexTest::TestCaseInsensitiveStarters() {
5649     // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5650     //  become stale because of new Unicode characters.
5651     // If it is stale, rerun the generation tool
5652     //    svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5653     // and replace the embedded data in i18n/regexcmp.cpp
5654 
5655     for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5656         if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5657             continue;
5658         }
5659         UnicodeSet s(cp, cp);
5660         s.closeOver(USET_CASE_INSENSITIVE);
5661         UnicodeSetIterator setIter(s);
5662         while (setIter.next()) {
5663             if (!setIter.isString()) {
5664                 continue;
5665             }
5666             const UnicodeString &str = setIter.getString();
5667             UChar32 firstChar = str.char32At(0);
5668             UnicodeSet starters;
5669             RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5670             if (!starters.contains(cp)) {
5671                 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5672                 return;
5673             }
5674         }
5675     }
5676 }
5677 
5678 
TestBug11049()5679 void RegexTest::TestBug11049() {
5680     // Original bug report: pattern with match start consisting of one of several individual characters,
5681     //  and the text being matched ending with a supplementary character. find() would read past the
5682     //  end of the input text when searching for potential match starting points.
5683 
5684     // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5685     // detect the bad read.
5686 
5687     TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5688     TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5689 
5690     // Test again with a pattern starting with a single character,
5691     // which takes a different code path than starting with an OR expression,
5692     // but with similar logic.
5693     TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5694     TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5695 }
5696 
5697 // Run a single test case from TestBug11049(). Internal function.
TestCase11049(const char * pattern,const char * data,UBool expectMatch,int32_t lineNumber)5698 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5699     UErrorCode status = U_ZERO_ERROR;
5700     UnicodeString patternString = UnicodeString(pattern).unescape();
5701     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5702 
5703     UnicodeString dataString = UnicodeString(data).unescape();
5704     UChar *exactBuffer = new UChar[dataString.length()];
5705     dataString.extract(exactBuffer, dataString.length(), status);
5706     UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5707 
5708     LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5709     REGEX_CHECK_STATUS;
5710     matcher->reset(ut);
5711     UBool result = matcher->find();
5712     if (result != expectMatch) {
5713         errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5714               __FILE__, lineNumber, expectMatch, result, pattern, data);
5715     }
5716 
5717     // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5718     //   off-by-one on find() with match at the last code point.
5719     //   Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5720     //   because string.unescape() will only shrink it.
5721     char * utf8Buffer = new char[uprv_strlen(data)+1];
5722     u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
5723     REGEX_CHECK_STATUS;
5724     ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5725     REGEX_CHECK_STATUS;
5726     matcher->reset(ut);
5727     result = matcher->find();
5728     if (result != expectMatch) {
5729         errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5730               __FILE__, lineNumber, expectMatch, result, pattern, data);
5731     }
5732     delete [] utf8Buffer;
5733 
5734     utext_close(ut);
5735     delete [] exactBuffer;
5736 }
5737 
5738 
TestBug11371()5739 void RegexTest::TestBug11371() {
5740     if (quick) {
5741         logln("Skipping test. Runs in exhuastive mode only.");
5742         return;
5743     }
5744     UErrorCode status = U_ZERO_ERROR;
5745     UnicodeString patternString;
5746 
5747     for (int i=0; i<8000000; i++) {
5748         patternString.append(UnicodeString("()"));
5749     }
5750     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5751     if (status != U_REGEX_PATTERN_TOO_BIG) {
5752         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5753               __FILE__, __LINE__, u_errorName(status));
5754     }
5755 
5756     status = U_ZERO_ERROR;
5757     patternString = "(";
5758     for (int i=0; i<20000000; i++) {
5759         patternString.append(UnicodeString("A++"));
5760     }
5761     patternString.append(UnicodeString("){0}B++"));
5762     LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5763     if (status != U_REGEX_PATTERN_TOO_BIG) {
5764         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5765               __FILE__, __LINE__, u_errorName(status));
5766     }
5767 
5768     // Pattern with too much string data, such that string indexes overflow operand data field size
5769     // in compiled instruction.
5770     status = U_ZERO_ERROR;
5771     patternString = "";
5772     while (patternString.length() < 0x00ffffff) {
5773         patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5774     }
5775     patternString.append(UnicodeString("X? trailing string"));
5776     LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5777     if (status != U_REGEX_PATTERN_TOO_BIG) {
5778         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5779               __FILE__, __LINE__, u_errorName(status));
5780     }
5781 }
5782 
TestBug11480()5783 void RegexTest::TestBug11480() {
5784     // C API, get capture group of a group that does not participate in the match.
5785     //        (Returns a zero length string, with nul termination,
5786     //         indistinguishable from a group with a zero length match.)
5787 
5788     UErrorCode status = U_ZERO_ERROR;
5789     URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5790     REGEX_CHECK_STATUS;
5791     UnicodeString text = UNICODE_STRING_SIMPLE("A");
5792     uregex_setText(re, text.getBuffer(), text.length(), &status);
5793     REGEX_CHECK_STATUS;
5794     REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5795     UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5796     int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5797     REGEX_ASSERT(length == 0);
5798     REGEX_ASSERT(buf[0] == 13);
5799     REGEX_ASSERT(buf[1] == 0);
5800     REGEX_ASSERT(buf[2] == 13);
5801     uregex_close(re);
5802 
5803     // UText C++ API, length of match is 0 for non-participating matches.
5804     UText ut = UTEXT_INITIALIZER;
5805     utext_openUnicodeString(&ut, &text, &status);
5806     RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5807     REGEX_CHECK_STATUS;
5808     matcher.reset(&ut);
5809     REGEX_ASSERT(matcher.lookingAt(0, status));
5810 
5811     // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5812     int64_t groupLen = -666;
5813     UText group = UTEXT_INITIALIZER;
5814     matcher.group(1, &group, groupLen, status);
5815     REGEX_CHECK_STATUS;
5816     REGEX_ASSERT(groupLen == 1);
5817     REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5818 
5819     // Capture group 2, the (B), does not participate in the match.
5820     matcher.group(2, &group, groupLen, status);
5821     REGEX_CHECK_STATUS;
5822     REGEX_ASSERT(groupLen == 0);
5823     REGEX_ASSERT(matcher.start(2, status) == -1);
5824     REGEX_CHECK_STATUS;
5825 }
5826 
5827 
5828 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
5829