1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * COPYRIGHT:
5  * Copyright (c) 2002-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  ********************************************************************/
8 
9 //
10 //   regextst.cpp
11 //
12 //      ICU Regular Expressions test, part of intltest.
13 //
14 
15 /*
16      NOTE!!
17 
18      PLEASE be careful about ASCII assumptions in this test.
19      This test is one of the worst repeat offenders.
20      If you have questions, contact someone on the ICU PMC
21      who has access to an EBCDIC system.
22 
23  */
24 
25 #include "intltest.h"
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
27 
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <string.h>
31 
32 #include "unicode/localpointer.h"
33 #include "unicode/regex.h"
34 #include "unicode/uchar.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uniset.h"
37 #include "unicode/uregex.h"
38 #include "unicode/usetiter.h"
39 #include "unicode/ustring.h"
40 #include "unicode/utext.h"
41 #include "unicode/utf16.h"
42 #include "cstr.h"
43 #include "regextst.h"
44 #include "regexcmp.h"
45 #include "uvector.h"
46 #include "util.h"
47 #include "cmemory.h"
48 #include "cstring.h"
49 #include "uinvchar.h"
50 
51 #define SUPPORT_MUTATING_INPUT_STRING   0
52 
53 //---------------------------------------------------------------------------
54 //
55 //  Test class boilerplate
56 //
57 //---------------------------------------------------------------------------
RegexTest()58 RegexTest::RegexTest()
59 {
60 }
61 
62 
~RegexTest()63 RegexTest::~RegexTest()
64 {
65 }
66 
67 
68 
// Test dispatcher for this suite.
//
// Logs a suite banner when `exec` is set, then passes control to the
// TESTCASE_AUTO_BEGIN / TESTCASE_AUTO / TESTCASE_AUTO_END macros, with one
// TESTCASE_AUTO entry per test method of RegexTest. The fourth parameter
// (par) is not used.
//
// NOTE(review): the TESTCASE_AUTO macros are defined outside this file and
// presumably map `index` to entries in listing order -- confirm in intltest.h
// before reordering or removing entries.
void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
{
    if (exec) logln("TestSuite RegexTest: ");
    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(Basic);
    TESTCASE_AUTO(API_Match);
    TESTCASE_AUTO(API_Replace);
    TESTCASE_AUTO(API_Pattern);
    // Extended is only registered when ICU is built with file I/O support.
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(Extended);
#endif
    TESTCASE_AUTO(Errors);
    TESTCASE_AUTO(PerlTests);
    TESTCASE_AUTO(Callbacks);
    TESTCASE_AUTO(FindProgressCallbacks);
    TESTCASE_AUTO(Bug6149);
    TESTCASE_AUTO(UTextBasic);
    TESTCASE_AUTO(API_Match_UTF8);
    TESTCASE_AUTO(API_Replace_UTF8);
    TESTCASE_AUTO(API_Pattern_UTF8);
    TESTCASE_AUTO(PerlTestsUTF8);
    TESTCASE_AUTO(PreAllocatedUTextCAPI);
    TESTCASE_AUTO(Bug7651);
    TESTCASE_AUTO(Bug7740);
    TESTCASE_AUTO(Bug8479);
    TESTCASE_AUTO(Bug7029);
    TESTCASE_AUTO(CheckInvBufSize);
    TESTCASE_AUTO(Bug9283);
    TESTCASE_AUTO(Bug10459);
    TESTCASE_AUTO(TestCaseInsensitiveStarters);
    TESTCASE_AUTO(TestBug11049);
    TESTCASE_AUTO(TestBug11371);
    TESTCASE_AUTO(TestBug11480);
    TESTCASE_AUTO(NamedCapture);
    TESTCASE_AUTO(NamedCaptureLimits);
    TESTCASE_AUTO(TestBug12884);
    TESTCASE_AUTO(TestBug13631);
    TESTCASE_AUTO(TestBug13632);
    TESTCASE_AUTO_END;
}
109 
110 
111 /**
112  * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
113  * into ASCII.
114  * @see utext_openUTF8
115  */
116 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
117 
118 //---------------------------------------------------------------------------
119 //
120 //   Error Checking / Reporting macros used in all of the tests.
121 //
122 //---------------------------------------------------------------------------
123 
utextToPrintable(char * buf,int32_t bufLen,UText * text)124 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
125   int64_t oldIndex = utext_getNativeIndex(text);
126   utext_setNativeIndex(text, 0);
127   char *bufPtr = buf;
128   UChar32 c = utext_next32From(text, 0);
129   while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
130     if (0x000020<=c && c<0x00007e) {
131       *bufPtr = c;
132     } else {
133 #if 0
134       sprintf(bufPtr,"U+%04X", c);
135       bufPtr+= strlen(bufPtr)-1;
136 #else
137       *bufPtr = '%';
138 #endif
139     }
140     bufPtr++;
141     c = UTEXT_NEXT32(text);
142   }
143   *bufPtr = 0;
144 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
145   char *ebuf = (char*)malloc(bufLen);
146   uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
147   uprv_strncpy(buf, ebuf, bufLen);
148   free((void*)ebuf);
149 #endif
150   utext_setNativeIndex(text, oldIndex);
151 }
152 
153 
154 static char ASSERT_BUF[1024];
155 
extractToAssertBuf(const UnicodeString & message)156 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
157   if(message.length()==0) {
158     strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
159   } else {
160     UnicodeString buf;
161     IntlTest::prettify(message,buf);
162     if(buf.length()==0) {
163       strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
164     } else {
165       buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
166       if(ASSERT_BUF[0]==0) {
167         ASSERT_BUF[0]=0;
168         for(int32_t i=0;i<buf.length();i++) {
169           UChar ch = buf[i];
170           sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
171         }
172       }
173     }
174   }
175   ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
176   return ASSERT_BUF;
177 }
178 
// Logs (via logln) a printable rendering of the UText `text`, produced by
// utextToPrintable(), together with the file, line and the argument's spelling.
#define REGEX_VERBOSE_TEXT(text) {char buf[200];utextToPrintable(buf,UPRV_LENGTHOF(buf),text);logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf);}

// Requires a UErrorCode variable named `status` in the enclosing scope. If it
// holds a failure code, reports it with dataerrln and returns from the
// enclosing function (bare `return;`, so only usable where that is valid).
#define REGEX_CHECK_STATUS {if (U_FAILURE(status)) {dataerrln("%s:%d: RegexTest failure.  status=%s", \
                                                              __FILE__, __LINE__, u_errorName(status)); return;}}

// Reports a failure with errln when `expr` compares equal to FALSE. Execution
// continues after the report.
#define REGEX_ASSERT(expr) {if ((expr)==FALSE) {errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr);};}

// Evaluates `expr` with a fresh local `status` (which shadows any outer
// variable of that name, so `expr` should refer to `status`), then reports via
// dataerrln if the resulting status differs from `errcode`.
#define REGEX_ASSERT_FAIL(expr, errcode) {UErrorCode status=U_ZERO_ERROR; (expr);\
if (status!=errcode) {dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
    __LINE__, u_errorName(errcode), u_errorName(status));};}

// Variant of REGEX_CHECK_STATUS that also prints a caller-supplied line number.
// Reports with errln and, unlike REGEX_CHECK_STATUS, does not return.
#define REGEX_CHECK_STATUS_L(line) {if (U_FAILURE(status)) {errln( \
    "RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); }}

// Variant of REGEX_ASSERT that also prints a caller-supplied line number and
// returns from the enclosing function (bare `return;`) when the check fails.
#define REGEX_ASSERT_L(expr, line) {if ((expr)==FALSE) { \
    errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); return;}}

// Compares `actual` with `expected` and reports a failure with errln when they differ.
// expected: const char * , restricted to invariant characters.
// actual: const UnicodeString &
#define REGEX_ASSERT_UNISTR(expected, actual) { \
    if (UnicodeString(expected, -1, US_INV) != (actual)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n",  \
                __FILE__, __LINE__, expected, extractToAssertBuf(actual));};}
202 
203 
testUTextEqual(UText * uta,UText * utb)204 static UBool testUTextEqual(UText *uta, UText *utb) {
205     UChar32 ca = 0;
206     UChar32 cb = 0;
207     utext_setNativeIndex(uta, 0);
208     utext_setNativeIndex(utb, 0);
209     do {
210         ca = utext_next32(uta);
211         cb = utext_next32(utb);
212         if (ca != cb) {
213             break;
214         }
215     } while (ca != U_SENTINEL);
216     return ca == cb;
217 }
218 
219 
220 /**
221  * @param expected expected text in UTF-8 (not platform) codepage
222  */
assertUText(const char * expected,UText * actual,const char * file,int line)223 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
224     UErrorCode status = U_ZERO_ERROR;
225     UText expectedText = UTEXT_INITIALIZER;
226     utext_openUTF8(&expectedText, expected, -1, &status);
227     if(U_FAILURE(status)) {
228       errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
229       return;
230     }
231     if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
232       errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
233       return;
234     }
235     utext_setNativeIndex(actual, 0);
236     if (!testUTextEqual(&expectedText, actual)) {
237         char buf[201 /*21*/];
238         char expectedBuf[201];
239         utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
240         utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
241         errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
242     }
243     utext_close(&expectedText);
244 }
245 /**
246  * @param expected invariant (platform local text) input
247  */
248 
assertUTextInvariant(const char * expected,UText * actual,const char * file,int line)249 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
250     UErrorCode status = U_ZERO_ERROR;
251     UText expectedText = UTEXT_INITIALIZER;
252     regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
253     if(U_FAILURE(status)) {
254       errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
255       return;
256     }
257     utext_setNativeIndex(actual, 0);
258     if (!testUTextEqual(&expectedText, actual)) {
259         char buf[201 /*21*/];
260         char expectedBuf[201];
261         utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
262         utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
263         errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
264     }
265     utext_close(&expectedText);
266 }
267 
/**
 * Asserts that the UText `actual` matches `expected`, by calling assertUText()
 * with the invoking file and line.
 * Assumes utf-8 input for `expected`.
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
/**
 * Asserts that the UText `actual` matches `expected`, by calling
 * assertUTextInvariant() with the invoking file and line.
 * Assumes Invariant input for `expected`.
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
276 
277 /**
278  * This buffer ( inv_buf ) is used to hold the UTF-8 strings
279  * passed into utext_openUTF8. An error will be given if
280  * INV_BUFSIZ is too small.  It's only used on EBCDIC systems.
281  */
282 
283 #define INV_BUFSIZ 2048 /* increase this if too small */
284 
285 static int64_t inv_next=0;
286 
287 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
288 static char inv_buf[INV_BUFSIZ];
289 #endif
290 
regextst_openUTF8FromInvariant(UText * ut,const char * inv,int64_t length,UErrorCode * status)291 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
292   if(length==-1) length=strlen(inv);
293 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
294   inv_next+=length;
295   return utext_openUTF8(ut, inv, length, status);
296 #else
297   if(inv_next+length+1>INV_BUFSIZ) {
298     fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
299             __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
300     *status = U_MEMORY_ALLOCATION_ERROR;
301     return NULL;
302   }
303 
304   unsigned char *buf = (unsigned char*)inv_buf+inv_next;
305   uprv_aestrncpy(buf, (const uint8_t*)inv, length);
306   inv_next+=length;
307 
308 #if 0
309   fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
310 #endif
311 
312   return utext_openUTF8(ut, (const char*)buf, length, status);
313 #endif
314 }
315 
316 
//---------------------------------------------------------------------------
//
//    REGEX_TESTLM       Macro + invocation functions to simplify writing quick
//                       tests for the lookingAt() and matches() functions.
//                       Each use runs the case twice: once through
//                       doRegexLMTest() and once through doRegexLMTestUTF8().
//
//       usage:
//          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
//
//          The expected results are UBool - TRUE or FALSE.
//          The input text is passed through UnicodeString::unescape() before
//          matching.  The pattern is not.
//
//
//---------------------------------------------------------------------------

#define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
332 
doRegexLMTest(const char * pat,const char * text,UBool looking,UBool match,int32_t line)333 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
334     const UnicodeString pattern(pat, -1, US_INV);
335     const UnicodeString inputText(text, -1, US_INV);
336     UErrorCode          status  = U_ZERO_ERROR;
337     UParseError         pe;
338     RegexPattern        *REPattern = NULL;
339     RegexMatcher        *REMatcher = NULL;
340     UBool               retVal     = TRUE;
341 
342     UnicodeString patString(pat, -1, US_INV);
343     REPattern = RegexPattern::compile(patString, 0, pe, status);
344     if (U_FAILURE(status)) {
345         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
346             line, u_errorName(status));
347         return FALSE;
348     }
349     if (line==376) { REPattern->dumpPattern();}
350 
351     UnicodeString inputString(inputText);
352     UnicodeString unEscapedInput = inputString.unescape();
353     REMatcher = REPattern->matcher(unEscapedInput, status);
354     if (U_FAILURE(status)) {
355         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
356             line, u_errorName(status));
357         return FALSE;
358     }
359 
360     UBool actualmatch;
361     actualmatch = REMatcher->lookingAt(status);
362     if (U_FAILURE(status)) {
363         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
364             line, u_errorName(status));
365         retVal =  FALSE;
366     }
367     if (actualmatch != looking) {
368         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
369         retVal = FALSE;
370     }
371 
372     status = U_ZERO_ERROR;
373     actualmatch = REMatcher->matches(status);
374     if (U_FAILURE(status)) {
375         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
376             line, u_errorName(status));
377         retVal = FALSE;
378     }
379     if (actualmatch != match) {
380         errln("RegexTest: wrong return from matches() at line %d.\n", line);
381         retVal = FALSE;
382     }
383 
384     if (retVal == FALSE) {
385         REPattern->dumpPattern();
386     }
387 
388     delete REPattern;
389     delete REMatcher;
390     return retVal;
391 }
392 
393 
doRegexLMTestUTF8(const char * pat,const char * text,UBool looking,UBool match,int32_t line)394 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
395     UText               pattern    = UTEXT_INITIALIZER;
396     int32_t             inputUTF8Length;
397     char                *textChars = NULL;
398     UText               inputText  = UTEXT_INITIALIZER;
399     UErrorCode          status     = U_ZERO_ERROR;
400     UParseError         pe;
401     RegexPattern        *REPattern = NULL;
402     RegexMatcher        *REMatcher = NULL;
403     UBool               retVal     = TRUE;
404 
405     regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
406     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
407     if (U_FAILURE(status)) {
408         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
409             line, u_errorName(status));
410         return FALSE;
411     }
412 
413     UnicodeString inputString(text, -1, US_INV);
414     UnicodeString unEscapedInput = inputString.unescape();
415     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
416     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
417 
418     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
419     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
420         // UTF-8 does not allow unpaired surrogates, so this could actually happen
421         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
422         return TRUE; // not a failure of the Regex engine
423     }
424     status = U_ZERO_ERROR; // buffer overflow
425     textChars = new char[inputUTF8Length+1];
426     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
427     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
428 
429     REMatcher = &REPattern->matcher(status)->reset(&inputText);
430     if (U_FAILURE(status)) {
431         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
432             line, u_errorName(status));
433         return FALSE;
434     }
435 
436     UBool actualmatch;
437     actualmatch = REMatcher->lookingAt(status);
438     if (U_FAILURE(status)) {
439         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
440             line, u_errorName(status));
441         retVal =  FALSE;
442     }
443     if (actualmatch != looking) {
444         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
445         retVal = FALSE;
446     }
447 
448     status = U_ZERO_ERROR;
449     actualmatch = REMatcher->matches(status);
450     if (U_FAILURE(status)) {
451         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
452             line, u_errorName(status));
453         retVal = FALSE;
454     }
455     if (actualmatch != match) {
456         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
457         retVal = FALSE;
458     }
459 
460     if (retVal == FALSE) {
461         REPattern->dumpPattern();
462     }
463 
464     delete REPattern;
465     delete REMatcher;
466     utext_close(&inputText);
467     utext_close(&pattern);
468     delete[] textChars;
469     return retVal;
470 }
471 
472 
473 
//---------------------------------------------------------------------------
//
//    REGEX_ERR       Macro + invocation function to simplify writing
//                       regex tests for incorrect patterns.
//
//       usage:
//          REGEX_ERR("pattern",   expected error line, column, expected status);
//
//       Note that the macro expansion already ends with a semicolon.
//
//---------------------------------------------------------------------------
#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
484 
regex_err(const char * pat,int32_t errLine,int32_t errCol,UErrorCode expectedStatus,int32_t line)485 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
486                           UErrorCode expectedStatus, int32_t line) {
487     UnicodeString       pattern(pat);
488 
489     UErrorCode          status         = U_ZERO_ERROR;
490     UParseError         pe;
491     RegexPattern        *callerPattern = NULL;
492 
493     //
494     //  Compile the caller's pattern
495     //
496     UnicodeString patString(pat);
497     callerPattern = RegexPattern::compile(patString, 0, pe, status);
498     if (status != expectedStatus) {
499         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
500     } else {
501         if (status != U_ZERO_ERROR) {
502             if (pe.line != errLine || pe.offset != errCol) {
503                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
504                     line, errLine, errCol, pe.line, pe.offset);
505             }
506         }
507     }
508 
509     delete callerPattern;
510 
511     //
512     //  Compile again, using a UTF-8-based UText
513     //
514     UText patternText = UTEXT_INITIALIZER;
515     regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
516     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
517     if (status != expectedStatus) {
518         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
519     } else {
520         if (status != U_ZERO_ERROR) {
521             if (pe.line != errLine || pe.offset != errCol) {
522                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
523                     line, errLine, errCol, pe.line, pe.offset);
524             }
525         }
526     }
527 
528     delete callerPattern;
529     utext_close(&patternText);
530 }
531 
532 
533 
534 //---------------------------------------------------------------------------
535 //
536 //      Basic      Check for basic functionality of regex pattern matching.
537 //                 Avoid the use of REGEX_FIND test macro, which has
538 //                 substantial dependencies on basic Regex functionality.
539 //
540 //---------------------------------------------------------------------------
void RegexTest::Basic() {
    // Every case below has the form
    //     REGEX_TESTLM(pattern, input, expected lookingAt(), expected matches());
    // and is run through both doRegexLMTest() and doRegexLMTestUTF8().


//
// Debug - slide failing test cases early
//
// (Disabled scratch area; note that it ends in exit(1) when enabled.)
#if 0
    {
        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
        UParseError pe;
        UErrorCode  status = U_ZERO_ERROR;
        RegexPattern *pattern;
        pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
        pattern->dumpPattern();
        RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
        UBool result = m->find();
        printf("result = %d\n", result);
        // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
    exit(1);
#endif


    //
    // Pattern with parentheses
    //
    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
    REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
    REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);

    //
    // Patterns with *
    //
    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);

    REGEX_TESTLM("a*", "",  TRUE, TRUE);
    REGEX_TESTLM("a*", "b", TRUE, FALSE);


    //
    //  Patterns with "."
    //
    REGEX_TESTLM(".", "abc", TRUE, FALSE);
    REGEX_TESTLM("...", "abc", TRUE, TRUE);
    REGEX_TESTLM("....", "abc", FALSE, FALSE);
    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);

    //
    //  Patterns with * applied to chars at end of literal string
    //
    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);

    //
    //  Supplemental chars match as single chars, not a pair of surrogates.
    //
    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);


    //
    //  UnicodeSets in the pattern
    //
    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    // NOTE(review): the next case is identical to the one just above.
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);

    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurrences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);

    //
    //   OR operator in patterns
    //
    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    REGEX_TESTLM("a|b", "b", TRUE, TRUE);

    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);

    //
    //  +
    //
    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    REGEX_TESTLM("b+", "", FALSE, FALSE);
    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);

    //
    //   ?
    //
    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);

    //
    //  Escape sequences that become single literal chars, handled internally
    //   by ICU's Unescape.
    //

    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);

    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input

    // Escape of special chars in patterns
    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
}
687 
688 
689 //---------------------------------------------------------------------------
690 //
691 //    UTextBasic   Check for quirks that are specific to the UText
692 //                 implementation.
693 //
694 //---------------------------------------------------------------------------
UTextBasic()695 void RegexTest::UTextBasic() {
696     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
697     UErrorCode status = U_ZERO_ERROR;
698     UText pattern = UTEXT_INITIALIZER;
699     utext_openUTF8(&pattern, str_abc, -1, &status);
700     RegexMatcher matcher(&pattern, 0, status);
701     REGEX_CHECK_STATUS;
702 
703     UText input = UTEXT_INITIALIZER;
704     utext_openUTF8(&input, str_abc, -1, &status);
705     REGEX_CHECK_STATUS;
706     matcher.reset(&input);
707     REGEX_CHECK_STATUS;
708     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
709 
710     matcher.reset(matcher.inputText());
711     REGEX_CHECK_STATUS;
712     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
713 
714     utext_close(&pattern);
715     utext_close(&input);
716 }
717 
718 
719 //---------------------------------------------------------------------------
720 //
721 //      API_Match   Test that the API for class RegexMatcher
722 //                  is present and nominally working, but excluding functions
723 //                  implementing replace operations.
724 //
725 //---------------------------------------------------------------------------
API_Match()726 void RegexTest::API_Match() {
727     UParseError         pe;
728     UErrorCode          status=U_ZERO_ERROR;
729     int32_t             flags = 0;
730 
731     //
732     // Debug - slide failing test cases early
733     //
734 #if 0
735     {
736     }
737     return;
738 #endif
739 
740     //
741     // Simple pattern compilation
742     //
743     {
744         UnicodeString       re("abc");
745         RegexPattern        *pat2;
746         pat2 = RegexPattern::compile(re, flags, pe, status);
747         REGEX_CHECK_STATUS;
748 
749         UnicodeString inStr1 = "abcdef this is a test";
750         UnicodeString instr2 = "not abc";
751         UnicodeString empty  = "";
752 
753 
754         //
755         // Matcher creation and reset.
756         //
757         RegexMatcher *m1 = pat2->matcher(inStr1, status);
758         REGEX_CHECK_STATUS;
759         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
760         REGEX_ASSERT(m1->input() == inStr1);
761         m1->reset(instr2);
762         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
763         REGEX_ASSERT(m1->input() == instr2);
764         m1->reset(inStr1);
765         REGEX_ASSERT(m1->input() == inStr1);
766         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
767         m1->reset(empty);
768         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
769         REGEX_ASSERT(m1->input() == empty);
770         REGEX_ASSERT(&m1->pattern() == pat2);
771 
772         //
773         //  reset(pos, status)
774         //
775         m1->reset(inStr1);
776         m1->reset(4, status);
777         REGEX_CHECK_STATUS;
778         REGEX_ASSERT(m1->input() == inStr1);
779         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
780 
781         m1->reset(-1, status);
782         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
783         status = U_ZERO_ERROR;
784 
785         m1->reset(0, status);
786         REGEX_CHECK_STATUS;
787         status = U_ZERO_ERROR;
788 
789         int32_t len = m1->input().length();
790         m1->reset(len-1, status);
791         REGEX_CHECK_STATUS;
792         status = U_ZERO_ERROR;
793 
794         m1->reset(len, status);
795         REGEX_CHECK_STATUS;
796         status = U_ZERO_ERROR;
797 
798         m1->reset(len+1, status);
799         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
800         status = U_ZERO_ERROR;
801 
802         //
803         // match(pos, status)
804         //
805         m1->reset(instr2);
806         REGEX_ASSERT(m1->matches(4, status) == TRUE);
807         m1->reset();
808         REGEX_ASSERT(m1->matches(3, status) == FALSE);
809         m1->reset();
810         REGEX_ASSERT(m1->matches(5, status) == FALSE);
811         REGEX_ASSERT(m1->matches(4, status) == TRUE);
812         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
813         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
814 
815         // Match() at end of string should fail, but should not
816         //  be an error.
817         status = U_ZERO_ERROR;
818         len = m1->input().length();
819         REGEX_ASSERT(m1->matches(len, status) == FALSE);
820         REGEX_CHECK_STATUS;
821 
822         // Match beyond end of string should fail with an error.
823         status = U_ZERO_ERROR;
824         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
825         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
826 
827         // Successful match at end of string.
828         {
829             status = U_ZERO_ERROR;
830             RegexMatcher m("A?", 0, status);  // will match zero length string.
831             REGEX_CHECK_STATUS;
832             m.reset(inStr1);
833             len = inStr1.length();
834             REGEX_ASSERT(m.matches(len, status) == TRUE);
835             REGEX_CHECK_STATUS;
836             m.reset(empty);
837             REGEX_ASSERT(m.matches(0, status) == TRUE);
838             REGEX_CHECK_STATUS;
839         }
840 
841 
842         //
843         // lookingAt(pos, status)
844         //
845         status = U_ZERO_ERROR;
846         m1->reset(instr2);  // "not abc"
847         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
848         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
849         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
850         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
851         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
852         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
853         status = U_ZERO_ERROR;
854         len = m1->input().length();
855         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
856         REGEX_CHECK_STATUS;
857         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
858         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
859 
860         delete m1;
861         delete pat2;
862     }
863 
864 
865     //
866     // Capture Group.
867     //     RegexMatcher::start();
868     //     RegexMatcher::end();
869     //     RegexMatcher::groupCount();
870     //
871     {
872         int32_t             flags=0;
873         UParseError         pe;
874         UErrorCode          status=U_ZERO_ERROR;
875 
876         UnicodeString       re("01(23(45)67)(.*)");
877         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
878         REGEX_CHECK_STATUS;
879         UnicodeString data = "0123456789";
880 
881         RegexMatcher *matcher = pat->matcher(data, status);
882         REGEX_CHECK_STATUS;
883         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
884         static const int32_t matchStarts[] = {0,  2, 4, 8};
885         static const int32_t matchEnds[]   = {10, 8, 6, 10};
886         int32_t i;
887         for (i=0; i<4; i++) {
888             int32_t actualStart = matcher->start(i, status);
889             REGEX_CHECK_STATUS;
890             if (actualStart != matchStarts[i]) {
891                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
892                     __LINE__, i, matchStarts[i], actualStart);
893             }
894             int32_t actualEnd = matcher->end(i, status);
895             REGEX_CHECK_STATUS;
896             if (actualEnd != matchEnds[i]) {
897                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
898                     __LINE__, i, matchEnds[i], actualEnd);
899             }
900         }
901 
902         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
903         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
904 
905         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
906         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
907         matcher->reset();
908         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
909 
910         matcher->lookingAt(status);
911         REGEX_ASSERT(matcher->group(status)    == "0123456789");
912         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
913         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
914         REGEX_ASSERT(matcher->group(2, status) == "45"        );
915         REGEX_ASSERT(matcher->group(3, status) == "89"        );
916         REGEX_CHECK_STATUS;
917         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
918         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
919         matcher->reset();
920         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
921 
922         delete matcher;
923         delete pat;
924 
925     }
926 
927     //
928     //  find
929     //
930     {
931         int32_t             flags=0;
932         UParseError         pe;
933         UErrorCode          status=U_ZERO_ERROR;
934 
935         UnicodeString       re("abc");
936         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
937         REGEX_CHECK_STATUS;
938         UnicodeString data = ".abc..abc...abc..";
939         //                    012345678901234567
940 
941         RegexMatcher *matcher = pat->matcher(data, status);
942         REGEX_CHECK_STATUS;
943         REGEX_ASSERT(matcher->find());
944         REGEX_ASSERT(matcher->start(status) == 1);
945         REGEX_ASSERT(matcher->find());
946         REGEX_ASSERT(matcher->start(status) == 6);
947         REGEX_ASSERT(matcher->find());
948         REGEX_ASSERT(matcher->start(status) == 12);
949         REGEX_ASSERT(matcher->find() == FALSE);
950         REGEX_ASSERT(matcher->find() == FALSE);
951 
952         matcher->reset();
953         REGEX_ASSERT(matcher->find());
954         REGEX_ASSERT(matcher->start(status) == 1);
955 
956         REGEX_ASSERT(matcher->find(0, status));
957         REGEX_ASSERT(matcher->start(status) == 1);
958         REGEX_ASSERT(matcher->find(1, status));
959         REGEX_ASSERT(matcher->start(status) == 1);
960         REGEX_ASSERT(matcher->find(2, status));
961         REGEX_ASSERT(matcher->start(status) == 6);
962         REGEX_ASSERT(matcher->find(12, status));
963         REGEX_ASSERT(matcher->start(status) == 12);
964         REGEX_ASSERT(matcher->find(13, status) == FALSE);
965         REGEX_ASSERT(matcher->find(16, status) == FALSE);
966         REGEX_ASSERT(matcher->find(17, status) == FALSE);
967         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
968 
969         status = U_ZERO_ERROR;
970         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
971         status = U_ZERO_ERROR;
972         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
973 
974         REGEX_ASSERT(matcher->groupCount() == 0);
975 
976         delete matcher;
977         delete pat;
978     }
979 
980 
981     //
982     //  find, with \G in pattern (true if at the end of a previous match).
983     //
984     {
985         int32_t             flags=0;
986         UParseError         pe;
987         UErrorCode          status=U_ZERO_ERROR;
988 
989         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
990         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
991         REGEX_CHECK_STATUS;
992         UnicodeString data = ".abcabc.abc..";
993         //                    012345678901234567
994 
995         RegexMatcher *matcher = pat->matcher(data, status);
996         REGEX_CHECK_STATUS;
997         REGEX_ASSERT(matcher->find());
998         REGEX_ASSERT(matcher->start(status) == 0);
999         REGEX_ASSERT(matcher->start(1, status) == -1);
1000         REGEX_ASSERT(matcher->start(2, status) == 1);
1001 
1002         REGEX_ASSERT(matcher->find());
1003         REGEX_ASSERT(matcher->start(status) == 4);
1004         REGEX_ASSERT(matcher->start(1, status) == 4);
1005         REGEX_ASSERT(matcher->start(2, status) == -1);
1006         REGEX_CHECK_STATUS;
1007 
1008         delete matcher;
1009         delete pat;
1010     }
1011 
1012     //
1013     //   find with zero length matches, match position should bump ahead
1014     //     to prevent loops.
1015     //
1016     {
1017         int32_t                 i;
1018         UErrorCode          status=U_ZERO_ERROR;
1019         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
1020                                                       //   using an always-true look-ahead.
1021         REGEX_CHECK_STATUS;
1022         UnicodeString s("    ");
1023         m.reset(s);
1024         for (i=0; ; i++) {
1025             if (m.find() == FALSE) {
1026                 break;
1027             }
1028             REGEX_ASSERT(m.start(status) == i);
1029             REGEX_ASSERT(m.end(status) == i);
1030         }
1031         REGEX_ASSERT(i==5);
1032 
1033         // Check that the bump goes over surrogate pairs OK
1034         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1035         s = s.unescape();
1036         m.reset(s);
1037         for (i=0; ; i+=2) {
1038             if (m.find() == FALSE) {
1039                 break;
1040             }
1041             REGEX_ASSERT(m.start(status) == i);
1042             REGEX_ASSERT(m.end(status) == i);
1043         }
1044         REGEX_ASSERT(i==10);
1045     }
1046     {
1047         // find() loop breaking test.
1048         //        with pattern of /.?/, should see a series of one char matches, then a single
1049         //        match of zero length at the end of the input string.
1050         int32_t                 i;
1051         UErrorCode          status=U_ZERO_ERROR;
1052         RegexMatcher        m(".?", 0, status);
1053         REGEX_CHECK_STATUS;
1054         UnicodeString s("    ");
1055         m.reset(s);
1056         for (i=0; ; i++) {
1057             if (m.find() == FALSE) {
1058                 break;
1059             }
1060             REGEX_ASSERT(m.start(status) == i);
1061             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1062         }
1063         REGEX_ASSERT(i==5);
1064     }
1065 
1066 
1067     //
1068     // Matchers with no input string behave as if they had an empty input string.
1069     //
1070 
1071     {
1072         UErrorCode status = U_ZERO_ERROR;
1073         RegexMatcher  m(".?", 0, status);
1074         REGEX_CHECK_STATUS;
1075         REGEX_ASSERT(m.find());
1076         REGEX_ASSERT(m.start(status) == 0);
1077         REGEX_ASSERT(m.input() == "");
1078     }
1079     {
1080         UErrorCode status = U_ZERO_ERROR;
1081         RegexPattern  *p = RegexPattern::compile(".", 0, status);
1082         RegexMatcher  *m = p->matcher(status);
1083         REGEX_CHECK_STATUS;
1084 
1085         REGEX_ASSERT(m->find() == FALSE);
1086         REGEX_ASSERT(m->input() == "");
1087         delete m;
1088         delete p;
1089     }
1090 
1091     //
1092     // Regions
1093     //
1094     {
1095         UErrorCode status = U_ZERO_ERROR;
1096         UnicodeString testString("This is test data");
1097         RegexMatcher m(".*", testString,  0, status);
1098         REGEX_CHECK_STATUS;
1099         REGEX_ASSERT(m.regionStart() == 0);
1100         REGEX_ASSERT(m.regionEnd() == testString.length());
1101         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1102         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1103 
1104         m.region(2,4, status);
1105         REGEX_CHECK_STATUS;
1106         REGEX_ASSERT(m.matches(status));
1107         REGEX_ASSERT(m.start(status)==2);
1108         REGEX_ASSERT(m.end(status)==4);
1109         REGEX_CHECK_STATUS;
1110 
1111         m.reset();
1112         REGEX_ASSERT(m.regionStart() == 0);
1113         REGEX_ASSERT(m.regionEnd() == testString.length());
1114 
1115         UnicodeString shorterString("short");
1116         m.reset(shorterString);
1117         REGEX_ASSERT(m.regionStart() == 0);
1118         REGEX_ASSERT(m.regionEnd() == shorterString.length());
1119 
1120         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1121         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1122         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1123         REGEX_ASSERT(&m == &m.reset());
1124         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1125 
1126         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1127         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1128         REGEX_ASSERT(&m == &m.reset());
1129         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1130 
1131         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1132         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1133         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1134         REGEX_ASSERT(&m == &m.reset());
1135         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1136 
1137         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1138         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1139         REGEX_ASSERT(&m == &m.reset());
1140         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1141 
1142     }
1143 
1144     //
1145     // hitEnd() and requireEnd()
1146     //
1147     {
1148         UErrorCode status = U_ZERO_ERROR;
1149         UnicodeString testString("aabb");
1150         RegexMatcher m1(".*", testString,  0, status);
1151         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1152         REGEX_ASSERT(m1.hitEnd() == TRUE);
1153         REGEX_ASSERT(m1.requireEnd() == FALSE);
1154         REGEX_CHECK_STATUS;
1155 
1156         status = U_ZERO_ERROR;
1157         RegexMatcher m2("a*", testString, 0, status);
1158         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1159         REGEX_ASSERT(m2.hitEnd() == FALSE);
1160         REGEX_ASSERT(m2.requireEnd() == FALSE);
1161         REGEX_CHECK_STATUS;
1162 
1163         status = U_ZERO_ERROR;
1164         RegexMatcher m3(".*$", testString, 0, status);
1165         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1166         REGEX_ASSERT(m3.hitEnd() == TRUE);
1167         REGEX_ASSERT(m3.requireEnd() == TRUE);
1168         REGEX_CHECK_STATUS;
1169     }
1170 
1171 
1172     //
1173     // Compilation error on reset with UChar *
1174     //   These were a hazard that people were stumbling over with runtime errors.
1175     //   Changed them to compiler errors by adding private methods that more closely
1176     //   matched the incorrect use of the functions.
1177     //
1178 #if 0
1179     {
1180         UErrorCode status = U_ZERO_ERROR;
1181         UChar ucharString[20];
1182         RegexMatcher m(".", 0, status);
1183         m.reset(ucharString);  // should not compile.
1184 
1185         RegexPattern *p = RegexPattern::compile(".", 0, status);
1186         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1187 
1188         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1189     }
1190 #endif
1191 
1192     //
1193     //  Time Outs.
1194     //       Note:  These tests will need to be changed when the regexp engine is
1195     //              able to detect and cut short the exponential time behavior on
1196     //              this type of match.
1197     //
1198     {
1199         UErrorCode status = U_ZERO_ERROR;
1200         //    Enough 'a's in the string to cause the match to time out.
1201         //       (Each on additonal 'a' doubles the time)
1202         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1203         RegexMatcher matcher("(a+)+b", testString, 0, status);
1204         REGEX_CHECK_STATUS;
1205         REGEX_ASSERT(matcher.getTimeLimit() == 0);
1206         matcher.setTimeLimit(100, status);
1207         REGEX_ASSERT(matcher.getTimeLimit() == 100);
1208         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1209         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1210     }
1211     {
1212         UErrorCode status = U_ZERO_ERROR;
1213         //   Few enough 'a's to slip in under the time limit.
1214         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1215         RegexMatcher matcher("(a+)+b", testString, 0, status);
1216         REGEX_CHECK_STATUS;
1217         matcher.setTimeLimit(100, status);
1218         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1219         REGEX_CHECK_STATUS;
1220     }
1221 
1222     //
1223     //  Stack Limits
1224     //
1225     {
1226         UErrorCode status = U_ZERO_ERROR;
1227         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1228 
1229         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1230         //   of the '+', and makes the stack frames larger.
1231         RegexMatcher matcher("(A)+A$", testString, 0, status);
1232 
1233         // With the default stack, this match should fail to run
1234         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1235         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1236 
1237         // With unlimited stack, it should run
1238         status = U_ZERO_ERROR;
1239         matcher.setStackLimit(0, status);
1240         REGEX_CHECK_STATUS;
1241         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1242         REGEX_CHECK_STATUS;
1243         REGEX_ASSERT(matcher.getStackLimit() == 0);
1244 
1245         // With a limited stack, it the match should fail
1246         status = U_ZERO_ERROR;
1247         matcher.setStackLimit(10000, status);
1248         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1249         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1250         REGEX_ASSERT(matcher.getStackLimit() == 10000);
1251     }
1252 
1253         // A pattern that doesn't save state should work with
1254         //   a minimal sized stack
1255     {
1256         UErrorCode status = U_ZERO_ERROR;
1257         UnicodeString testString = "abc";
1258         RegexMatcher matcher("abc", testString, 0, status);
1259         REGEX_CHECK_STATUS;
1260         matcher.setStackLimit(30, status);
1261         REGEX_CHECK_STATUS;
1262         REGEX_ASSERT(matcher.matches(status) == TRUE);
1263         REGEX_CHECK_STATUS;
1264         REGEX_ASSERT(matcher.getStackLimit() == 30);
1265 
1266         // Negative stack sizes should fail
1267         status = U_ZERO_ERROR;
1268         matcher.setStackLimit(1000, status);
1269         REGEX_CHECK_STATUS;
1270         matcher.setStackLimit(-1, status);
1271         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1272         REGEX_ASSERT(matcher.getStackLimit() == 1000);
1273     }
1274 
1275 
1276 }
1277 
1278 
1279 
1280 
1281 
1282 
1283 //---------------------------------------------------------------------------
1284 //
1285 //      API_Replace        API test for class RegexMatcher, testing the
1286 //                         Replace family of functions.
1287 //
1288 //---------------------------------------------------------------------------
API_Replace()1289 void RegexTest::API_Replace() {
1290     //
1291     //  Replace
1292     //
1293     int32_t             flags=0;
1294     UParseError         pe;
1295     UErrorCode          status=U_ZERO_ERROR;
1296 
1297     UnicodeString       re("abc");
1298     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1299     REGEX_CHECK_STATUS;
1300     UnicodeString data = ".abc..abc...abc..";
1301     //                    012345678901234567
1302     RegexMatcher *matcher = pat->matcher(data, status);
1303 
1304     //
1305     //  Plain vanilla matches.
1306     //
1307     UnicodeString  dest;
1308     dest = matcher->replaceFirst("yz", status);
1309     REGEX_CHECK_STATUS;
1310     REGEX_ASSERT(dest == ".yz..abc...abc..");
1311 
1312     dest = matcher->replaceAll("yz", status);
1313     REGEX_CHECK_STATUS;
1314     REGEX_ASSERT(dest == ".yz..yz...yz..");
1315 
1316     //
1317     //  Plain vanilla non-matches.
1318     //
1319     UnicodeString d2 = ".abx..abx...abx..";
1320     matcher->reset(d2);
1321     dest = matcher->replaceFirst("yz", status);
1322     REGEX_CHECK_STATUS;
1323     REGEX_ASSERT(dest == ".abx..abx...abx..");
1324 
1325     dest = matcher->replaceAll("yz", status);
1326     REGEX_CHECK_STATUS;
1327     REGEX_ASSERT(dest == ".abx..abx...abx..");
1328 
1329     //
1330     // Empty source string
1331     //
1332     UnicodeString d3 = "";
1333     matcher->reset(d3);
1334     dest = matcher->replaceFirst("yz", status);
1335     REGEX_CHECK_STATUS;
1336     REGEX_ASSERT(dest == "");
1337 
1338     dest = matcher->replaceAll("yz", status);
1339     REGEX_CHECK_STATUS;
1340     REGEX_ASSERT(dest == "");
1341 
1342     //
1343     // Empty substitution string
1344     //
1345     matcher->reset(data);              // ".abc..abc...abc.."
1346     dest = matcher->replaceFirst("", status);
1347     REGEX_CHECK_STATUS;
1348     REGEX_ASSERT(dest == "...abc...abc..");
1349 
1350     dest = matcher->replaceAll("", status);
1351     REGEX_CHECK_STATUS;
1352     REGEX_ASSERT(dest == "........");
1353 
1354     //
1355     // match whole string
1356     //
1357     UnicodeString d4 = "abc";
1358     matcher->reset(d4);
1359     dest = matcher->replaceFirst("xyz", status);
1360     REGEX_CHECK_STATUS;
1361     REGEX_ASSERT(dest == "xyz");
1362 
1363     dest = matcher->replaceAll("xyz", status);
1364     REGEX_CHECK_STATUS;
1365     REGEX_ASSERT(dest == "xyz");
1366 
1367     //
1368     // Capture Group, simple case
1369     //
1370     UnicodeString       re2("a(..)");
1371     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1372     REGEX_CHECK_STATUS;
1373     UnicodeString d5 = "abcdefg";
1374     RegexMatcher *matcher2 = pat2->matcher(d5, status);
1375     REGEX_CHECK_STATUS;
1376     dest = matcher2->replaceFirst("$1$1", status);
1377     REGEX_CHECK_STATUS;
1378     REGEX_ASSERT(dest == "bcbcdefg");
1379 
1380     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1381     REGEX_CHECK_STATUS;
1382     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1383 
1384     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1385     REGEX_ASSERT(U_FAILURE(status));
1386     status = U_ZERO_ERROR;
1387 
1388     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1389     replacement = replacement.unescape();
1390     dest = matcher2->replaceFirst(replacement, status);
1391     REGEX_CHECK_STATUS;
1392     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1393 
1394     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1395 
1396 
1397     //
1398     // Replacement String with \u hex escapes
1399     //
1400     {
1401         UnicodeString  src = "abc 1 abc 2 abc 3";
1402         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1403         matcher->reset(src);
1404         UnicodeString  result = matcher->replaceAll(substitute, status);
1405         REGEX_CHECK_STATUS;
1406         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1407     }
1408     {
1409         UnicodeString  src = "abc !";
1410         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1411         matcher->reset(src);
1412         UnicodeString  result = matcher->replaceAll(substitute, status);
1413         REGEX_CHECK_STATUS;
1414         UnicodeString expected = UnicodeString("--");
1415         expected.append((UChar32)0x10000);
1416         expected.append("-- !");
1417         REGEX_ASSERT(result == expected);
1418     }
1419     // TODO:  need more through testing of capture substitutions.
1420 
1421     // Bug 4057
1422     //
1423     {
1424         status = U_ZERO_ERROR;
1425         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1426         RegexMatcher m("ss(.*?)ee", 0, status);
1427         REGEX_CHECK_STATUS;
1428         UnicodeString result;
1429 
1430         // Multiple finds do NOT bump up the previous appendReplacement postion.
1431         m.reset(s);
1432         m.find();
1433         m.find();
1434         m.appendReplacement(result, "ooh", status);
1435         REGEX_CHECK_STATUS;
1436         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1437 
1438         // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1439         status = U_ZERO_ERROR;
1440         result.truncate(0);
1441         m.reset(10, status);
1442         m.find();
1443         m.find();
1444         m.appendReplacement(result, "ooh", status);
1445         REGEX_CHECK_STATUS;
1446         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1447 
1448         // find() at interior of string, appendReplacemnt still starts at beginning.
1449         status = U_ZERO_ERROR;
1450         result.truncate(0);
1451         m.reset();
1452         m.find(10, status);
1453         m.find();
1454         m.appendReplacement(result, "ooh", status);
1455         REGEX_CHECK_STATUS;
1456         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1457 
1458         m.appendTail(result);
1459         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1460 
1461     }
1462 
1463     delete matcher2;
1464     delete pat2;
1465     delete matcher;
1466     delete pat;
1467 }
1468 
1469 
1470 //---------------------------------------------------------------------------
1471 //
1472 //      API_Pattern       Test that the API for class RegexPattern is
1473 //                        present and nominally working.
1474 //
1475 //---------------------------------------------------------------------------
API_Pattern()1476 void RegexTest::API_Pattern() {
1477     RegexPattern        pata;    // Test default constructor to not crash.
1478     RegexPattern        patb;
1479 
1480     REGEX_ASSERT(pata == patb);
1481     REGEX_ASSERT(pata == pata);
1482 
1483     UnicodeString re1("abc[a-l][m-z]");
1484     UnicodeString re2("def");
1485     UErrorCode    status = U_ZERO_ERROR;
1486     UParseError   pe;
1487 
1488     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1489     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1490     REGEX_CHECK_STATUS;
1491     REGEX_ASSERT(*pat1 == *pat1);
1492     REGEX_ASSERT(*pat1 != pata);
1493 
1494     // Assign
1495     patb = *pat1;
1496     REGEX_ASSERT(patb == *pat1);
1497 
1498     // Copy Construct
1499     RegexPattern patc(*pat1);
1500     REGEX_ASSERT(patc == *pat1);
1501     REGEX_ASSERT(patb == patc);
1502     REGEX_ASSERT(pat1 != pat2);
1503     patb = *pat2;
1504     REGEX_ASSERT(patb != patc);
1505     REGEX_ASSERT(patb == *pat2);
1506 
1507     // Compile with no flags.
1508     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1509     REGEX_ASSERT(*pat1a == *pat1);
1510 
1511     REGEX_ASSERT(pat1a->flags() == 0);
1512 
1513     // Compile with different flags should be not equal
1514     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1515     REGEX_CHECK_STATUS;
1516 
1517     REGEX_ASSERT(*pat1b != *pat1a);
1518     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1519     REGEX_ASSERT(pat1a->flags() == 0);
1520     delete pat1b;
1521 
1522     // clone
1523     RegexPattern *pat1c = pat1->clone();
1524     REGEX_ASSERT(*pat1c == *pat1);
1525     REGEX_ASSERT(*pat1c != *pat2);
1526 
1527     delete pat1c;
1528     delete pat1a;
1529     delete pat1;
1530     delete pat2;
1531 
1532 
1533     //
1534     //   Verify that a matcher created from a cloned pattern works.
1535     //     (Jitterbug 3423)
1536     //
1537     {
1538         UErrorCode     status     = U_ZERO_ERROR;
1539         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1540         RegexPattern  *pClone     = pSource->clone();
1541         delete         pSource;
1542         RegexMatcher  *mFromClone = pClone->matcher(status);
1543         REGEX_CHECK_STATUS;
1544         UnicodeString s = "Hello World";
1545         mFromClone->reset(s);
1546         REGEX_ASSERT(mFromClone->find() == TRUE);
1547         REGEX_ASSERT(mFromClone->group(status) == "Hello");
1548         REGEX_ASSERT(mFromClone->find() == TRUE);
1549         REGEX_ASSERT(mFromClone->group(status) == "World");
1550         REGEX_ASSERT(mFromClone->find() == FALSE);
1551         delete mFromClone;
1552         delete pClone;
1553     }
1554 
1555     //
1556     //   matches convenience API
1557     //
1558     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1559     REGEX_CHECK_STATUS;
1560     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1561     REGEX_CHECK_STATUS;
1562     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1563     REGEX_CHECK_STATUS;
1564     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1565     REGEX_CHECK_STATUS;
1566     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1567     REGEX_CHECK_STATUS;
1568     status = U_INDEX_OUTOFBOUNDS_ERROR;
1569     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1570     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1571 
1572 
1573     //
1574     // Split()
1575     //
1576     status = U_ZERO_ERROR;
1577     pat1 = RegexPattern::compile(" +",  pe, status);
1578     REGEX_CHECK_STATUS;
1579     UnicodeString  fields[10];
1580 
1581     int32_t n;
1582     n = pat1->split("Now is the time", fields, 10, status);
1583     REGEX_CHECK_STATUS;
1584     REGEX_ASSERT(n==4);
1585     REGEX_ASSERT(fields[0]=="Now");
1586     REGEX_ASSERT(fields[1]=="is");
1587     REGEX_ASSERT(fields[2]=="the");
1588     REGEX_ASSERT(fields[3]=="time");
1589     REGEX_ASSERT(fields[4]=="");
1590 
1591     n = pat1->split("Now is the time", fields, 2, status);
1592     REGEX_CHECK_STATUS;
1593     REGEX_ASSERT(n==2);
1594     REGEX_ASSERT(fields[0]=="Now");
1595     REGEX_ASSERT(fields[1]=="is the time");
1596     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1597 
1598     fields[1] = "*";
1599     status = U_ZERO_ERROR;
1600     n = pat1->split("Now is the time", fields, 1, status);
1601     REGEX_CHECK_STATUS;
1602     REGEX_ASSERT(n==1);
1603     REGEX_ASSERT(fields[0]=="Now is the time");
1604     REGEX_ASSERT(fields[1]=="*");
1605     status = U_ZERO_ERROR;
1606 
1607     n = pat1->split("    Now       is the time   ", fields, 10, status);
1608     REGEX_CHECK_STATUS;
1609     REGEX_ASSERT(n==6);
1610     REGEX_ASSERT(fields[0]=="");
1611     REGEX_ASSERT(fields[1]=="Now");
1612     REGEX_ASSERT(fields[2]=="is");
1613     REGEX_ASSERT(fields[3]=="the");
1614     REGEX_ASSERT(fields[4]=="time");
1615     REGEX_ASSERT(fields[5]=="");
1616 
1617     n = pat1->split("     ", fields, 10, status);
1618     REGEX_CHECK_STATUS;
1619     REGEX_ASSERT(n==2);
1620     REGEX_ASSERT(fields[0]=="");
1621     REGEX_ASSERT(fields[1]=="");
1622 
1623     fields[0] = "foo";
1624     n = pat1->split("", fields, 10, status);
1625     REGEX_CHECK_STATUS;
1626     REGEX_ASSERT(n==0);
1627     REGEX_ASSERT(fields[0]=="foo");
1628 
1629     delete pat1;
1630 
1631     //  split, with a pattern with (capture)
1632     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1633     REGEX_CHECK_STATUS;
1634 
1635     status = U_ZERO_ERROR;
1636     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1637     REGEX_CHECK_STATUS;
1638     REGEX_ASSERT(n==7);
1639     REGEX_ASSERT(fields[0]=="");
1640     REGEX_ASSERT(fields[1]=="a");
1641     REGEX_ASSERT(fields[2]=="Now is ");
1642     REGEX_ASSERT(fields[3]=="b");
1643     REGEX_ASSERT(fields[4]=="the time");
1644     REGEX_ASSERT(fields[5]=="c");
1645     REGEX_ASSERT(fields[6]=="");
1646     REGEX_ASSERT(status==U_ZERO_ERROR);
1647 
1648     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1649     REGEX_CHECK_STATUS;
1650     REGEX_ASSERT(n==7);
1651     REGEX_ASSERT(fields[0]=="  ");
1652     REGEX_ASSERT(fields[1]=="a");
1653     REGEX_ASSERT(fields[2]=="Now is ");
1654     REGEX_ASSERT(fields[3]=="b");
1655     REGEX_ASSERT(fields[4]=="the time");
1656     REGEX_ASSERT(fields[5]=="c");
1657     REGEX_ASSERT(fields[6]=="");
1658 
1659     status = U_ZERO_ERROR;
1660     fields[6] = "foo";
1661     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1662     REGEX_CHECK_STATUS;
1663     REGEX_ASSERT(n==6);
1664     REGEX_ASSERT(fields[0]=="  ");
1665     REGEX_ASSERT(fields[1]=="a");
1666     REGEX_ASSERT(fields[2]=="Now is ");
1667     REGEX_ASSERT(fields[3]=="b");
1668     REGEX_ASSERT(fields[4]=="the time");
1669     REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
1670     REGEX_ASSERT(fields[6]=="foo");
1671 
1672     status = U_ZERO_ERROR;
1673     fields[5] = "foo";
1674     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1675     REGEX_CHECK_STATUS;
1676     REGEX_ASSERT(n==5);
1677     REGEX_ASSERT(fields[0]=="  ");
1678     REGEX_ASSERT(fields[1]=="a");
1679     REGEX_ASSERT(fields[2]=="Now is ");
1680     REGEX_ASSERT(fields[3]=="b");
1681     REGEX_ASSERT(fields[4]=="the time<c>");
1682     REGEX_ASSERT(fields[5]=="foo");
1683 
1684     status = U_ZERO_ERROR;
1685     fields[5] = "foo";
1686     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1687     REGEX_CHECK_STATUS;
1688     REGEX_ASSERT(n==5);
1689     REGEX_ASSERT(fields[0]=="  ");
1690     REGEX_ASSERT(fields[1]=="a");
1691     REGEX_ASSERT(fields[2]=="Now is ");
1692     REGEX_ASSERT(fields[3]=="b");
1693     REGEX_ASSERT(fields[4]=="the time");
1694     REGEX_ASSERT(fields[5]=="foo");
1695 
1696     status = U_ZERO_ERROR;
1697     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1698     REGEX_CHECK_STATUS;
1699     REGEX_ASSERT(n==4);
1700     REGEX_ASSERT(fields[0]=="  ");
1701     REGEX_ASSERT(fields[1]=="a");
1702     REGEX_ASSERT(fields[2]=="Now is ");
1703     REGEX_ASSERT(fields[3]=="the time<c>");
1704     status = U_ZERO_ERROR;
1705     delete pat1;
1706 
1707     pat1 = RegexPattern::compile("([-,])",  pe, status);
1708     REGEX_CHECK_STATUS;
1709     n = pat1->split("1-10,20", fields, 10, status);
1710     REGEX_CHECK_STATUS;
1711     REGEX_ASSERT(n==5);
1712     REGEX_ASSERT(fields[0]=="1");
1713     REGEX_ASSERT(fields[1]=="-");
1714     REGEX_ASSERT(fields[2]=="10");
1715     REGEX_ASSERT(fields[3]==",");
1716     REGEX_ASSERT(fields[4]=="20");
1717     delete pat1;
1718 
1719     // Test split of string with empty trailing fields
1720     pat1 = RegexPattern::compile(",", pe, status);
1721     REGEX_CHECK_STATUS;
1722     n = pat1->split("a,b,c,", fields, 10, status);
1723     REGEX_CHECK_STATUS;
1724     REGEX_ASSERT(n==4);
1725     REGEX_ASSERT(fields[0]=="a");
1726     REGEX_ASSERT(fields[1]=="b");
1727     REGEX_ASSERT(fields[2]=="c");
1728     REGEX_ASSERT(fields[3]=="");
1729 
1730     n = pat1->split("a,,,", fields, 10, status);
1731     REGEX_CHECK_STATUS;
1732     REGEX_ASSERT(n==4);
1733     REGEX_ASSERT(fields[0]=="a");
1734     REGEX_ASSERT(fields[1]=="");
1735     REGEX_ASSERT(fields[2]=="");
1736     REGEX_ASSERT(fields[3]=="");
1737     delete pat1;
1738 
1739     // Split Separator with zero length match.
1740     pat1 = RegexPattern::compile(":?", pe, status);
1741     REGEX_CHECK_STATUS;
1742     n = pat1->split("abc", fields, 10, status);
1743     REGEX_CHECK_STATUS;
1744     REGEX_ASSERT(n==5);
1745     REGEX_ASSERT(fields[0]=="");
1746     REGEX_ASSERT(fields[1]=="a");
1747     REGEX_ASSERT(fields[2]=="b");
1748     REGEX_ASSERT(fields[3]=="c");
1749     REGEX_ASSERT(fields[4]=="");
1750 
1751     delete pat1;
1752 
1753     //
1754     // RegexPattern::pattern()
1755     //
1756     pat1 = new RegexPattern();
1757     REGEX_ASSERT(pat1->pattern() == "");
1758     delete pat1;
1759 
1760     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1761     REGEX_CHECK_STATUS;
1762     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1763     delete pat1;
1764 
1765 
1766     //
1767     // classID functions
1768     //
1769     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1770     REGEX_CHECK_STATUS;
1771     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1772     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1773     UnicodeString Hello("Hello, world.");
1774     RegexMatcher *m = pat1->matcher(Hello, status);
1775     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1776     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1777     REGEX_ASSERT(m->getDynamicClassID() != NULL);
1778     delete m;
1779     delete pat1;
1780 
1781 }
1782 
1783 //---------------------------------------------------------------------------
1784 //
1785 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1786 //                       is present and working, but excluding functions
1787 //                       implementing replace operations.
1788 //
1789 //---------------------------------------------------------------------------
API_Match_UTF8()1790 void RegexTest::API_Match_UTF8() {
1791     UParseError         pe;
1792     UErrorCode          status=U_ZERO_ERROR;
1793     int32_t             flags = 0;
1794 
1795     //
1796     // Debug - slide failing test cases early
1797     //
1798 #if 0
1799     {
1800     }
1801     return;
1802 #endif
1803 
1804     //
1805     // Simple pattern compilation
1806     //
1807     {
1808         UText               re = UTEXT_INITIALIZER;
1809         regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1810         REGEX_VERBOSE_TEXT(&re);
1811         RegexPattern        *pat2;
1812         pat2 = RegexPattern::compile(&re, flags, pe, status);
1813         REGEX_CHECK_STATUS;
1814 
1815         UText input1 = UTEXT_INITIALIZER;
1816         UText input2 = UTEXT_INITIALIZER;
1817         UText empty  = UTEXT_INITIALIZER;
1818         regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1819         REGEX_VERBOSE_TEXT(&input1);
1820         regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1821         REGEX_VERBOSE_TEXT(&input2);
1822         utext_openUChars(&empty, NULL, 0, &status);
1823 
1824         int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1825         int32_t input2Len = strlen("not abc");
1826 
1827 
1828         //
1829         // Matcher creation and reset.
1830         //
1831         RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1832         REGEX_CHECK_STATUS;
1833         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1834         const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1835         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1836         m1->reset(&input2);
1837         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1838         const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1839         REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1840         m1->reset(&input1);
1841         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1842         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1843         m1->reset(&empty);
1844         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1845         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1846 
1847         //
1848         //  reset(pos, status)
1849         //
1850         m1->reset(&input1);
1851         m1->reset(4, status);
1852         REGEX_CHECK_STATUS;
1853         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1854         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1855 
1856         m1->reset(-1, status);
1857         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1858         status = U_ZERO_ERROR;
1859 
1860         m1->reset(0, status);
1861         REGEX_CHECK_STATUS;
1862         status = U_ZERO_ERROR;
1863 
1864         m1->reset(input1Len-1, status);
1865         REGEX_CHECK_STATUS;
1866         status = U_ZERO_ERROR;
1867 
1868         m1->reset(input1Len, status);
1869         REGEX_CHECK_STATUS;
1870         status = U_ZERO_ERROR;
1871 
1872         m1->reset(input1Len+1, status);
1873         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1874         status = U_ZERO_ERROR;
1875 
1876         //
1877         // match(pos, status)
1878         //
1879         m1->reset(&input2);
1880         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1881         m1->reset();
1882         REGEX_ASSERT(m1->matches(3, status) == FALSE);
1883         m1->reset();
1884         REGEX_ASSERT(m1->matches(5, status) == FALSE);
1885         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1886         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1887         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1888 
1889         // Match() at end of string should fail, but should not
1890         //  be an error.
1891         status = U_ZERO_ERROR;
1892         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1893         REGEX_CHECK_STATUS;
1894 
1895         // Match beyond end of string should fail with an error.
1896         status = U_ZERO_ERROR;
1897         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1898         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1899 
1900         // Successful match at end of string.
1901         {
1902             status = U_ZERO_ERROR;
1903             RegexMatcher m("A?", 0, status);  // will match zero length string.
1904             REGEX_CHECK_STATUS;
1905             m.reset(&input1);
1906             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1907             REGEX_CHECK_STATUS;
1908             m.reset(&empty);
1909             REGEX_ASSERT(m.matches(0, status) == TRUE);
1910             REGEX_CHECK_STATUS;
1911         }
1912 
1913 
1914         //
1915         // lookingAt(pos, status)
1916         //
1917         status = U_ZERO_ERROR;
1918         m1->reset(&input2);  // "not abc"
1919         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1920         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1921         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1922         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1923         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1924         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1925         status = U_ZERO_ERROR;
1926         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1927         REGEX_CHECK_STATUS;
1928         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1929         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1930 
1931         delete m1;
1932         delete pat2;
1933 
1934         utext_close(&re);
1935         utext_close(&input1);
1936         utext_close(&input2);
1937         utext_close(&empty);
1938     }
1939 
1940 
1941     //
1942     // Capture Group.
1943     //     RegexMatcher::start();
1944     //     RegexMatcher::end();
1945     //     RegexMatcher::groupCount();
1946     //
1947     {
1948         int32_t             flags=0;
1949         UParseError         pe;
1950         UErrorCode          status=U_ZERO_ERROR;
1951         UText               re=UTEXT_INITIALIZER;
1952         const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1953         utext_openUTF8(&re, str_01234567_pat, -1, &status);
1954 
1955         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1956         REGEX_CHECK_STATUS;
1957 
1958         UText input = UTEXT_INITIALIZER;
1959         const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1960         utext_openUTF8(&input, str_0123456789, -1, &status);
1961 
1962         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
1963         REGEX_CHECK_STATUS;
1964         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1965         static const int32_t matchStarts[] = {0,  2, 4, 8};
1966         static const int32_t matchEnds[]   = {10, 8, 6, 10};
1967         int32_t i;
1968         for (i=0; i<4; i++) {
1969             int32_t actualStart = matcher->start(i, status);
1970             REGEX_CHECK_STATUS;
1971             if (actualStart != matchStarts[i]) {
1972                 errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
1973                       __FILE__, __LINE__, i, matchStarts[i], actualStart);
1974             }
1975             int32_t actualEnd = matcher->end(i, status);
1976             REGEX_CHECK_STATUS;
1977             if (actualEnd != matchEnds[i]) {
1978                 errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
1979                       __FILE__, __LINE__, i, matchEnds[i], actualEnd);
1980             }
1981         }
1982 
1983         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
1984         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
1985 
1986         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1987         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
1988         matcher->reset();
1989         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
1990 
1991         matcher->lookingAt(status);
1992 
1993         UnicodeString dest;
1994         UText destText = UTEXT_INITIALIZER;
1995         utext_openUnicodeString(&destText, &dest, &status);
1996         UText *result;
1997         //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1998         //  Test shallow-clone API
1999         int64_t   group_len;
2000         result = matcher->group((UText *)NULL, group_len, status);
2001         REGEX_CHECK_STATUS;
2002         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2003         utext_close(result);
2004         result = matcher->group(0, &destText, group_len, status);
2005         REGEX_CHECK_STATUS;
2006         REGEX_ASSERT(result == &destText);
2007         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2008         //  destText is now immutable, reopen it
2009         utext_close(&destText);
2010         utext_openUnicodeString(&destText, &dest, &status);
2011 
2012         int64_t length;
2013         result = matcher->group(0, NULL, length, status);
2014         REGEX_CHECK_STATUS;
2015         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2016         utext_close(result);
2017         result = matcher->group(0, &destText, length, status);
2018         REGEX_CHECK_STATUS;
2019         REGEX_ASSERT(result == &destText);
2020         REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2021         REGEX_ASSERT(length == 10);
2022         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2023 
2024         // Capture Group 1 == "234567"
2025         result = matcher->group(1, NULL, length, status);
2026         REGEX_CHECK_STATUS;
2027         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2028         REGEX_ASSERT(length == 6);
2029         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2030         utext_close(result);
2031 
2032         result = matcher->group(1, &destText, length, status);
2033         REGEX_CHECK_STATUS;
2034         REGEX_ASSERT(result == &destText);
2035         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2036         REGEX_ASSERT(length == 6);
2037         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2038         utext_close(result);
2039 
2040         // Capture Group 2 == "45"
2041         result = matcher->group(2, NULL, length, status);
2042         REGEX_CHECK_STATUS;
2043         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2044         REGEX_ASSERT(length == 2);
2045         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2046         utext_close(result);
2047 
2048         result = matcher->group(2, &destText, length, status);
2049         REGEX_CHECK_STATUS;
2050         REGEX_ASSERT(result == &destText);
2051         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2052         REGEX_ASSERT(length == 2);
2053         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2054         utext_close(result);
2055 
2056         // Capture Group 3 == "89"
2057         result = matcher->group(3, NULL, length, status);
2058         REGEX_CHECK_STATUS;
2059         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2060         REGEX_ASSERT(length == 2);
2061         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2062         utext_close(result);
2063 
2064         result = matcher->group(3, &destText, length, status);
2065         REGEX_CHECK_STATUS;
2066         REGEX_ASSERT(result == &destText);
2067         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2068         REGEX_ASSERT(length == 2);
2069         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2070         utext_close(result);
2071 
2072         // Capture Group number out of range.
2073         status = U_ZERO_ERROR;
2074         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2075         status = U_ZERO_ERROR;
2076         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2077         status = U_ZERO_ERROR;
2078         matcher->reset();
2079         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2080 
2081         delete matcher;
2082         delete pat;
2083 
2084         utext_close(&destText);
2085         utext_close(&input);
2086         utext_close(&re);
2087     }
2088 
2089     //
2090     //  find
2091     //
2092     {
2093         int32_t             flags=0;
2094         UParseError         pe;
2095         UErrorCode          status=U_ZERO_ERROR;
2096         UText               re=UTEXT_INITIALIZER;
2097         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2098         utext_openUTF8(&re, str_abc, -1, &status);
2099 
2100         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2101         REGEX_CHECK_STATUS;
2102         UText input = UTEXT_INITIALIZER;
2103         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2104         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2105         //                      012345678901234567
2106 
2107         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2108         REGEX_CHECK_STATUS;
2109         REGEX_ASSERT(matcher->find());
2110         REGEX_ASSERT(matcher->start(status) == 1);
2111         REGEX_ASSERT(matcher->find());
2112         REGEX_ASSERT(matcher->start(status) == 6);
2113         REGEX_ASSERT(matcher->find());
2114         REGEX_ASSERT(matcher->start(status) == 12);
2115         REGEX_ASSERT(matcher->find() == FALSE);
2116         REGEX_ASSERT(matcher->find() == FALSE);
2117 
2118         matcher->reset();
2119         REGEX_ASSERT(matcher->find());
2120         REGEX_ASSERT(matcher->start(status) == 1);
2121 
2122         REGEX_ASSERT(matcher->find(0, status));
2123         REGEX_ASSERT(matcher->start(status) == 1);
2124         REGEX_ASSERT(matcher->find(1, status));
2125         REGEX_ASSERT(matcher->start(status) == 1);
2126         REGEX_ASSERT(matcher->find(2, status));
2127         REGEX_ASSERT(matcher->start(status) == 6);
2128         REGEX_ASSERT(matcher->find(12, status));
2129         REGEX_ASSERT(matcher->start(status) == 12);
2130         REGEX_ASSERT(matcher->find(13, status) == FALSE);
2131         REGEX_ASSERT(matcher->find(16, status) == FALSE);
2132         REGEX_ASSERT(matcher->find(17, status) == FALSE);
2133         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2134 
2135         status = U_ZERO_ERROR;
2136         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2137         status = U_ZERO_ERROR;
2138         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2139 
2140         REGEX_ASSERT(matcher->groupCount() == 0);
2141 
2142         delete matcher;
2143         delete pat;
2144 
2145         utext_close(&input);
2146         utext_close(&re);
2147     }
2148 
2149 
2150     //
2151     //  find, with \G in pattern (true if at the end of a previous match).
2152     //
2153     {
2154         int32_t             flags=0;
2155         UParseError         pe;
2156         UErrorCode          status=U_ZERO_ERROR;
2157         UText               re=UTEXT_INITIALIZER;
2158         const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2159         utext_openUTF8(&re, str_Gabcabc, -1, &status);
2160 
2161         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2162 
2163         REGEX_CHECK_STATUS;
2164         UText input = UTEXT_INITIALIZER;
2165         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2166         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2167         //                      012345678901234567
2168 
2169         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2170         REGEX_CHECK_STATUS;
2171         REGEX_ASSERT(matcher->find());
2172         REGEX_ASSERT(matcher->start(status) == 0);
2173         REGEX_ASSERT(matcher->start(1, status) == -1);
2174         REGEX_ASSERT(matcher->start(2, status) == 1);
2175 
2176         REGEX_ASSERT(matcher->find());
2177         REGEX_ASSERT(matcher->start(status) == 4);
2178         REGEX_ASSERT(matcher->start(1, status) == 4);
2179         REGEX_ASSERT(matcher->start(2, status) == -1);
2180         REGEX_CHECK_STATUS;
2181 
2182         delete matcher;
2183         delete pat;
2184 
2185         utext_close(&input);
2186         utext_close(&re);
2187     }
2188 
2189     //
2190     //   find with zero length matches, match position should bump ahead
2191     //     to prevent loops.
2192     //
2193     {
2194         int32_t                 i;
2195         UErrorCode          status=U_ZERO_ERROR;
2196         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
2197                                                       //   using an always-true look-ahead.
2198         REGEX_CHECK_STATUS;
2199         UText s = UTEXT_INITIALIZER;
2200         utext_openUTF8(&s, "    ", -1, &status);
2201         m.reset(&s);
2202         for (i=0; ; i++) {
2203             if (m.find() == FALSE) {
2204                 break;
2205             }
2206             REGEX_ASSERT(m.start(status) == i);
2207             REGEX_ASSERT(m.end(status) == i);
2208         }
2209         REGEX_ASSERT(i==5);
2210 
2211         // Check that the bump goes over characters outside the BMP OK
2212         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2213         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2214         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2215         m.reset(&s);
2216         for (i=0; ; i+=4) {
2217             if (m.find() == FALSE) {
2218                 break;
2219             }
2220             REGEX_ASSERT(m.start(status) == i);
2221             REGEX_ASSERT(m.end(status) == i);
2222         }
2223         REGEX_ASSERT(i==20);
2224 
2225         utext_close(&s);
2226     }
2227     {
2228         // find() loop breaking test.
2229         //        with pattern of /.?/, should see a series of one char matches, then a single
2230         //        match of zero length at the end of the input string.
2231         int32_t                 i;
2232         UErrorCode          status=U_ZERO_ERROR;
2233         RegexMatcher        m(".?", 0, status);
2234         REGEX_CHECK_STATUS;
2235         UText s = UTEXT_INITIALIZER;
2236         utext_openUTF8(&s, "    ", -1, &status);
2237         m.reset(&s);
2238         for (i=0; ; i++) {
2239             if (m.find() == FALSE) {
2240                 break;
2241             }
2242             REGEX_ASSERT(m.start(status) == i);
2243             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2244         }
2245         REGEX_ASSERT(i==5);
2246 
2247         utext_close(&s);
2248     }
2249 
2250 
2251     //
2252     // Matchers with no input string behave as if they had an empty input string.
2253     //
2254 
2255     {
2256         UErrorCode status = U_ZERO_ERROR;
2257         RegexMatcher  m(".?", 0, status);
2258         REGEX_CHECK_STATUS;
2259         REGEX_ASSERT(m.find());
2260         REGEX_ASSERT(m.start(status) == 0);
2261         REGEX_ASSERT(m.input() == "");
2262     }
2263     {
2264         UErrorCode status = U_ZERO_ERROR;
2265         RegexPattern  *p = RegexPattern::compile(".", 0, status);
2266         RegexMatcher  *m = p->matcher(status);
2267         REGEX_CHECK_STATUS;
2268 
2269         REGEX_ASSERT(m->find() == FALSE);
2270         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2271         delete m;
2272         delete p;
2273     }
2274 
2275     //
2276     // Regions
2277     //
2278     {
2279         UErrorCode status = U_ZERO_ERROR;
2280         UText testPattern = UTEXT_INITIALIZER;
2281         UText testText    = UTEXT_INITIALIZER;
2282         regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2283         REGEX_VERBOSE_TEXT(&testPattern);
2284         regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2285         REGEX_VERBOSE_TEXT(&testText);
2286 
2287         RegexMatcher m(&testPattern, &testText, 0, status);
2288         REGEX_CHECK_STATUS;
2289         REGEX_ASSERT(m.regionStart() == 0);
2290         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2291         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2292         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2293 
2294         m.region(2,4, status);
2295         REGEX_CHECK_STATUS;
2296         REGEX_ASSERT(m.matches(status));
2297         REGEX_ASSERT(m.start(status)==2);
2298         REGEX_ASSERT(m.end(status)==4);
2299         REGEX_CHECK_STATUS;
2300 
2301         m.reset();
2302         REGEX_ASSERT(m.regionStart() == 0);
2303         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2304 
2305         regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2306         REGEX_VERBOSE_TEXT(&testText);
2307         m.reset(&testText);
2308         REGEX_ASSERT(m.regionStart() == 0);
2309         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2310 
2311         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2312         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2313         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2314         REGEX_ASSERT(&m == &m.reset());
2315         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2316 
2317         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2318         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2319         REGEX_ASSERT(&m == &m.reset());
2320         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2321 
2322         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2323         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2324         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2325         REGEX_ASSERT(&m == &m.reset());
2326         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2327 
2328         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2329         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2330         REGEX_ASSERT(&m == &m.reset());
2331         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2332 
2333         utext_close(&testText);
2334         utext_close(&testPattern);
2335     }
2336 
2337     //
2338     // hitEnd() and requireEnd()
2339     //
2340     {
2341         UErrorCode status = U_ZERO_ERROR;
2342         UText testPattern = UTEXT_INITIALIZER;
2343         UText testText    = UTEXT_INITIALIZER;
2344         const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2345         const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2346         utext_openUTF8(&testPattern, str_, -1, &status);
2347         utext_openUTF8(&testText, str_aabb, -1, &status);
2348 
2349         RegexMatcher m1(&testPattern, &testText,  0, status);
2350         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2351         REGEX_ASSERT(m1.hitEnd() == TRUE);
2352         REGEX_ASSERT(m1.requireEnd() == FALSE);
2353         REGEX_CHECK_STATUS;
2354 
2355         status = U_ZERO_ERROR;
2356         const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2357         utext_openUTF8(&testPattern, str_a, -1, &status);
2358         RegexMatcher m2(&testPattern, &testText, 0, status);
2359         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2360         REGEX_ASSERT(m2.hitEnd() == FALSE);
2361         REGEX_ASSERT(m2.requireEnd() == FALSE);
2362         REGEX_CHECK_STATUS;
2363 
2364         status = U_ZERO_ERROR;
2365         const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2366         utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2367         RegexMatcher m3(&testPattern, &testText, 0, status);
2368         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2369         REGEX_ASSERT(m3.hitEnd() == TRUE);
2370         REGEX_ASSERT(m3.requireEnd() == TRUE);
2371         REGEX_CHECK_STATUS;
2372 
2373         utext_close(&testText);
2374         utext_close(&testPattern);
2375     }
2376 }
2377 
2378 
2379 //---------------------------------------------------------------------------
2380 //
2381 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
2382 //                         Replace family of functions.
2383 //
2384 //---------------------------------------------------------------------------
API_Replace_UTF8()2385 void RegexTest::API_Replace_UTF8() {
2386     //
2387     //  Replace
2388     //
2389     int32_t             flags=0;
2390     UParseError         pe;
2391     UErrorCode          status=U_ZERO_ERROR;
2392 
2393     UText               re=UTEXT_INITIALIZER;
2394     regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2395     REGEX_VERBOSE_TEXT(&re);
2396     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2397     REGEX_CHECK_STATUS;
2398 
2399     char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2400     //             012345678901234567
2401     UText dataText = UTEXT_INITIALIZER;
2402     utext_openUTF8(&dataText, data, -1, &status);
2403     REGEX_CHECK_STATUS;
2404     REGEX_VERBOSE_TEXT(&dataText);
2405     RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2406 
2407     //
2408     //  Plain vanilla matches.
2409     //
2410     UnicodeString  dest;
2411     UText destText = UTEXT_INITIALIZER;
2412     utext_openUnicodeString(&destText, &dest, &status);
2413     UText *result;
2414 
2415     UText replText = UTEXT_INITIALIZER;
2416 
2417     const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2418     utext_openUTF8(&replText, str_yz, -1, &status);
2419     REGEX_VERBOSE_TEXT(&replText);
2420     result = matcher->replaceFirst(&replText, NULL, status);
2421     REGEX_CHECK_STATUS;
2422     const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2423     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2424     utext_close(result);
2425     result = matcher->replaceFirst(&replText, &destText, status);
2426     REGEX_CHECK_STATUS;
2427     REGEX_ASSERT(result == &destText);
2428     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2429 
2430     result = matcher->replaceAll(&replText, NULL, status);
2431     REGEX_CHECK_STATUS;
2432     const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2433     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2434     utext_close(result);
2435 
2436     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2437     result = matcher->replaceAll(&replText, &destText, status);
2438     REGEX_CHECK_STATUS;
2439     REGEX_ASSERT(result == &destText);
2440     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2441 
2442     //
2443     //  Plain vanilla non-matches.
2444     //
2445     const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2446     utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2447     matcher->reset(&dataText);
2448 
2449     result = matcher->replaceFirst(&replText, NULL, status);
2450     REGEX_CHECK_STATUS;
2451     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2452     utext_close(result);
2453     result = matcher->replaceFirst(&replText, &destText, status);
2454     REGEX_CHECK_STATUS;
2455     REGEX_ASSERT(result == &destText);
2456     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2457 
2458     result = matcher->replaceAll(&replText, NULL, status);
2459     REGEX_CHECK_STATUS;
2460     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2461     utext_close(result);
2462     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2463     result = matcher->replaceAll(&replText, &destText, status);
2464     REGEX_CHECK_STATUS;
2465     REGEX_ASSERT(result == &destText);
2466     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2467 
2468     //
2469     // Empty source string
2470     //
2471     utext_openUTF8(&dataText, NULL, 0, &status);
2472     matcher->reset(&dataText);
2473 
2474     result = matcher->replaceFirst(&replText, NULL, status);
2475     REGEX_CHECK_STATUS;
2476     REGEX_ASSERT_UTEXT_UTF8("", result);
2477     utext_close(result);
2478     result = matcher->replaceFirst(&replText, &destText, status);
2479     REGEX_CHECK_STATUS;
2480     REGEX_ASSERT(result == &destText);
2481     REGEX_ASSERT_UTEXT_UTF8("", result);
2482 
2483     result = matcher->replaceAll(&replText, NULL, status);
2484     REGEX_CHECK_STATUS;
2485     REGEX_ASSERT_UTEXT_UTF8("", result);
2486     utext_close(result);
2487     result = matcher->replaceAll(&replText, &destText, status);
2488     REGEX_CHECK_STATUS;
2489     REGEX_ASSERT(result == &destText);
2490     REGEX_ASSERT_UTEXT_UTF8("", result);
2491 
2492     //
2493     // Empty substitution string
2494     //
2495     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2496     matcher->reset(&dataText);
2497 
2498     utext_openUTF8(&replText, NULL, 0, &status);
2499     result = matcher->replaceFirst(&replText, NULL, status);
2500     REGEX_CHECK_STATUS;
2501     const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2502     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2503     utext_close(result);
2504     result = matcher->replaceFirst(&replText, &destText, status);
2505     REGEX_CHECK_STATUS;
2506     REGEX_ASSERT(result == &destText);
2507     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2508 
2509     result = matcher->replaceAll(&replText, NULL, status);
2510     REGEX_CHECK_STATUS;
2511     const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2512     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2513     utext_close(result);
2514     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2515     result = matcher->replaceAll(&replText, &destText, status);
2516     REGEX_CHECK_STATUS;
2517     REGEX_ASSERT(result == &destText);
2518     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2519 
2520     //
2521     // match whole string
2522     //
2523     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2524     utext_openUTF8(&dataText, str_abc, -1, &status);
2525     matcher->reset(&dataText);
2526 
2527     const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2528     utext_openUTF8(&replText, str_xyz, -1, &status);
2529     result = matcher->replaceFirst(&replText, NULL, status);
2530     REGEX_CHECK_STATUS;
2531     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2532     utext_close(result);
2533     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2534     result = matcher->replaceFirst(&replText, &destText, status);
2535     REGEX_CHECK_STATUS;
2536     REGEX_ASSERT(result == &destText);
2537     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2538 
2539     result = matcher->replaceAll(&replText, NULL, status);
2540     REGEX_CHECK_STATUS;
2541     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2542     utext_close(result);
2543     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2544     result = matcher->replaceAll(&replText, &destText, status);
2545     REGEX_CHECK_STATUS;
2546     REGEX_ASSERT(result == &destText);
2547     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2548 
2549     //
2550     // Capture Group, simple case
2551     //
2552     const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2553     utext_openUTF8(&re, str_add, -1, &status);
2554     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2555     REGEX_CHECK_STATUS;
2556 
2557     const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2558     utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2559     RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2560     REGEX_CHECK_STATUS;
2561 
2562     const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2563     utext_openUTF8(&replText, str_11, -1, &status);
2564     result = matcher2->replaceFirst(&replText, NULL, status);
2565     REGEX_CHECK_STATUS;
2566     const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2567     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2568     utext_close(result);
2569     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2570     result = matcher2->replaceFirst(&replText, &destText, status);
2571     REGEX_CHECK_STATUS;
2572     REGEX_ASSERT(result == &destText);
2573     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2574 
2575     const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2576     utext_openUTF8(&replText, str_v, -1, &status);
2577     REGEX_VERBOSE_TEXT(&replText);
2578     result = matcher2->replaceFirst(&replText, NULL, status);
2579     REGEX_CHECK_STATUS;
2580     const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2581     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2582     utext_close(result);
2583     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2584     result = matcher2->replaceFirst(&replText, &destText, status);
2585     REGEX_CHECK_STATUS;
2586     REGEX_ASSERT(result == &destText);
2587     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2588 
2589     const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2590                0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2591                0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2592     utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2593     result = matcher2->replaceFirst(&replText, NULL, status);
2594     REGEX_CHECK_STATUS;
2595     const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2596     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2597     utext_close(result);
2598     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2599     result = matcher2->replaceFirst(&replText, &destText, status);
2600     REGEX_CHECK_STATUS;
2601     REGEX_ASSERT(result == &destText);
2602     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2603 
2604     unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2605     //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2606     //                                 012345678901234567890123456
2607     supplDigitChars[22] = 0xF0;
2608     supplDigitChars[23] = 0x9D;
2609     supplDigitChars[24] = 0x9F;
2610     supplDigitChars[25] = 0x8F;
2611     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2612 
2613     result = matcher2->replaceFirst(&replText, NULL, status);
2614     REGEX_CHECK_STATUS;
2615     const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2616     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2617     utext_close(result);
2618     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2619     result = matcher2->replaceFirst(&replText, &destText, status);
2620     REGEX_CHECK_STATUS;
2621     REGEX_ASSERT(result == &destText);
2622     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2623     const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5..." */
2624     utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2625     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2626 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2627     utext_close(result);
2628     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2629     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2630     REGEX_ASSERT(result == &destText);
2631 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2632 
2633     //
2634     // Replacement String with \u hex escapes
2635     //
2636     {
2637       const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2638       const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2639         utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2640         utext_openUTF8(&replText, str_u0043, -1, &status);
2641         matcher->reset(&dataText);
2642 
2643         result = matcher->replaceAll(&replText, NULL, status);
2644         REGEX_CHECK_STATUS;
2645         const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2646         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2647         utext_close(result);
2648         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2649         result = matcher->replaceAll(&replText, &destText, status);
2650         REGEX_CHECK_STATUS;
2651         REGEX_ASSERT(result == &destText);
2652         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2653     }
2654     {
2655       const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2656         utext_openUTF8(&dataText, str_abc, -1, &status);
2657         const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2658         utext_openUTF8(&replText, str_U00010000, -1, &status);
2659         matcher->reset(&dataText);
2660 
2661         unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2662         //                          0123456789
2663         expected[2] = 0xF0;
2664         expected[3] = 0x90;
2665         expected[4] = 0x80;
2666         expected[5] = 0x80;
2667 
2668         result = matcher->replaceAll(&replText, NULL, status);
2669         REGEX_CHECK_STATUS;
2670         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2671         utext_close(result);
2672         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2673         result = matcher->replaceAll(&replText, &destText, status);
2674         REGEX_CHECK_STATUS;
2675         REGEX_ASSERT(result == &destText);
2676         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2677     }
2678     // TODO:  need more through testing of capture substitutions.
2679 
2680     // Bug 4057
2681     //
2682     {
2683         status = U_ZERO_ERROR;
2684 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2685 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2686 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2687         utext_openUTF8(&re, str_ssee, -1, &status);
2688         utext_openUTF8(&dataText, str_blah, -1, &status);
2689         utext_openUTF8(&replText, str_ooh, -1, &status);
2690 
2691         RegexMatcher m(&re, 0, status);
2692         REGEX_CHECK_STATUS;
2693 
2694         UnicodeString result;
2695         UText resultText = UTEXT_INITIALIZER;
2696         utext_openUnicodeString(&resultText, &result, &status);
2697 
2698         // Multiple finds do NOT bump up the previous appendReplacement postion.
2699         m.reset(&dataText);
2700         m.find();
2701         m.find();
2702         m.appendReplacement(&resultText, &replText, status);
2703         REGEX_CHECK_STATUS;
2704         const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2705         REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2706 
2707         // After a reset into the interior of a string, appendReplacement still starts at beginning.
2708         status = U_ZERO_ERROR;
2709         result.truncate(0);
2710         utext_openUnicodeString(&resultText, &result, &status);
2711         m.reset(10, status);
2712         m.find();
2713         m.find();
2714         m.appendReplacement(&resultText, &replText, status);
2715         REGEX_CHECK_STATUS;
2716         const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2717         REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2718 
2719         // find() at interior of string, appendReplacement still starts at beginning.
2720         status = U_ZERO_ERROR;
2721         result.truncate(0);
2722         utext_openUnicodeString(&resultText, &result, &status);
2723         m.reset();
2724         m.find(10, status);
2725         m.find();
2726         m.appendReplacement(&resultText, &replText, status);
2727         REGEX_CHECK_STATUS;
2728         const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2729         REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2730 
2731         m.appendTail(&resultText, status);
2732         const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2733         REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2734 
2735         utext_close(&resultText);
2736     }
2737 
2738     delete matcher2;
2739     delete pat2;
2740     delete matcher;
2741     delete pat;
2742 
2743     utext_close(&dataText);
2744     utext_close(&replText);
2745     utext_close(&destText);
2746     utext_close(&re);
2747 }
2748 
2749 
2750 //---------------------------------------------------------------------------
2751 //
2752 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
2753 //                        present and nominally working.
2754 //
2755 //---------------------------------------------------------------------------
API_Pattern_UTF8()2756 void RegexTest::API_Pattern_UTF8() {
2757     RegexPattern        pata;    // Test default constructor to not crash.
2758     RegexPattern        patb;
2759 
2760     REGEX_ASSERT(pata == patb);
2761     REGEX_ASSERT(pata == pata);
2762 
2763     UText         re1 = UTEXT_INITIALIZER;
2764     UText         re2 = UTEXT_INITIALIZER;
2765     UErrorCode    status = U_ZERO_ERROR;
2766     UParseError   pe;
2767 
2768     const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2769     const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2770     utext_openUTF8(&re1, str_abcalmz, -1, &status);
2771     utext_openUTF8(&re2, str_def, -1, &status);
2772 
2773     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2774     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2775     REGEX_CHECK_STATUS;
2776     REGEX_ASSERT(*pat1 == *pat1);
2777     REGEX_ASSERT(*pat1 != pata);
2778 
2779     // Assign
2780     patb = *pat1;
2781     REGEX_ASSERT(patb == *pat1);
2782 
2783     // Copy Construct
2784     RegexPattern patc(*pat1);
2785     REGEX_ASSERT(patc == *pat1);
2786     REGEX_ASSERT(patb == patc);
2787     REGEX_ASSERT(pat1 != pat2);
2788     patb = *pat2;
2789     REGEX_ASSERT(patb != patc);
2790     REGEX_ASSERT(patb == *pat2);
2791 
2792     // Compile with no flags.
2793     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2794     REGEX_ASSERT(*pat1a == *pat1);
2795 
2796     REGEX_ASSERT(pat1a->flags() == 0);
2797 
2798     // Compile with different flags should be not equal
2799     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2800     REGEX_CHECK_STATUS;
2801 
2802     REGEX_ASSERT(*pat1b != *pat1a);
2803     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2804     REGEX_ASSERT(pat1a->flags() == 0);
2805     delete pat1b;
2806 
2807     // clone
2808     RegexPattern *pat1c = pat1->clone();
2809     REGEX_ASSERT(*pat1c == *pat1);
2810     REGEX_ASSERT(*pat1c != *pat2);
2811 
2812     delete pat1c;
2813     delete pat1a;
2814     delete pat1;
2815     delete pat2;
2816 
2817     utext_close(&re1);
2818     utext_close(&re2);
2819 
2820 
2821     //
2822     //   Verify that a matcher created from a cloned pattern works.
2823     //     (Jitterbug 3423)
2824     //
2825     {
2826         UErrorCode     status     = U_ZERO_ERROR;
2827         UText          pattern    = UTEXT_INITIALIZER;
2828         const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2829         utext_openUTF8(&pattern, str_pL, -1, &status);
2830 
2831         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2832         RegexPattern  *pClone     = pSource->clone();
2833         delete         pSource;
2834         RegexMatcher  *mFromClone = pClone->matcher(status);
2835         REGEX_CHECK_STATUS;
2836 
2837         UText          input      = UTEXT_INITIALIZER;
2838         const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2839         utext_openUTF8(&input, str_HelloWorld, -1, &status);
2840         mFromClone->reset(&input);
2841         REGEX_ASSERT(mFromClone->find() == TRUE);
2842         REGEX_ASSERT(mFromClone->group(status) == "Hello");
2843         REGEX_ASSERT(mFromClone->find() == TRUE);
2844         REGEX_ASSERT(mFromClone->group(status) == "World");
2845         REGEX_ASSERT(mFromClone->find() == FALSE);
2846         delete mFromClone;
2847         delete pClone;
2848 
2849         utext_close(&input);
2850         utext_close(&pattern);
2851     }
2852 
2853     //
2854     //   matches convenience API
2855     //
2856     {
2857         UErrorCode status  = U_ZERO_ERROR;
2858         UText      pattern = UTEXT_INITIALIZER;
2859         UText      input   = UTEXT_INITIALIZER;
2860 
2861         const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2862         utext_openUTF8(&input, str_randominput, -1, &status);
2863 
2864         const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2865         utext_openUTF8(&pattern, str_dotstar, -1, &status);
2866         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2867         REGEX_CHECK_STATUS;
2868 
2869         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2870         utext_openUTF8(&pattern, str_abc, -1, &status);
2871         REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2872         REGEX_CHECK_STATUS;
2873 
2874         const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2875         utext_openUTF8(&pattern, str_nput, -1, &status);
2876         REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2877         REGEX_CHECK_STATUS;
2878 
2879         utext_openUTF8(&pattern, str_randominput, -1, &status);
2880         REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2881         REGEX_CHECK_STATUS;
2882 
2883         const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2884         utext_openUTF8(&pattern, str_u, -1, &status);
2885         REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2886         REGEX_CHECK_STATUS;
2887 
2888         utext_openUTF8(&input, str_abc, -1, &status);
2889         utext_openUTF8(&pattern, str_abc, -1, &status);
2890         status = U_INDEX_OUTOFBOUNDS_ERROR;
2891         REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2892         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2893 
2894         utext_close(&input);
2895         utext_close(&pattern);
2896     }
2897 
2898 
2899     //
2900     // Split()
2901     //
2902     status = U_ZERO_ERROR;
2903     const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
2904     utext_openUTF8(&re1, str_spaceplus, -1, &status);
2905     pat1 = RegexPattern::compile(&re1, pe, status);
2906     REGEX_CHECK_STATUS;
2907     UnicodeString  fields[10];
2908 
2909     int32_t n;
2910     n = pat1->split("Now is the time", fields, 10, status);
2911     REGEX_CHECK_STATUS;
2912     REGEX_ASSERT(n==4);
2913     REGEX_ASSERT(fields[0]=="Now");
2914     REGEX_ASSERT(fields[1]=="is");
2915     REGEX_ASSERT(fields[2]=="the");
2916     REGEX_ASSERT(fields[3]=="time");
2917     REGEX_ASSERT(fields[4]=="");
2918 
2919     n = pat1->split("Now is the time", fields, 2, status);
2920     REGEX_CHECK_STATUS;
2921     REGEX_ASSERT(n==2);
2922     REGEX_ASSERT(fields[0]=="Now");
2923     REGEX_ASSERT(fields[1]=="is the time");
2924     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2925 
2926     fields[1] = "*";
2927     status = U_ZERO_ERROR;
2928     n = pat1->split("Now is the time", fields, 1, status);
2929     REGEX_CHECK_STATUS;
2930     REGEX_ASSERT(n==1);
2931     REGEX_ASSERT(fields[0]=="Now is the time");
2932     REGEX_ASSERT(fields[1]=="*");
2933     status = U_ZERO_ERROR;
2934 
2935     n = pat1->split("    Now       is the time   ", fields, 10, status);
2936     REGEX_CHECK_STATUS;
2937     REGEX_ASSERT(n==6);
2938     REGEX_ASSERT(fields[0]=="");
2939     REGEX_ASSERT(fields[1]=="Now");
2940     REGEX_ASSERT(fields[2]=="is");
2941     REGEX_ASSERT(fields[3]=="the");
2942     REGEX_ASSERT(fields[4]=="time");
2943     REGEX_ASSERT(fields[5]=="");
2944     REGEX_ASSERT(fields[6]=="");
2945 
2946     fields[2] = "*";
2947     n = pat1->split("     ", fields, 10, status);
2948     REGEX_CHECK_STATUS;
2949     REGEX_ASSERT(n==2);
2950     REGEX_ASSERT(fields[0]=="");
2951     REGEX_ASSERT(fields[1]=="");
2952     REGEX_ASSERT(fields[2]=="*");
2953 
2954     fields[0] = "foo";
2955     n = pat1->split("", fields, 10, status);
2956     REGEX_CHECK_STATUS;
2957     REGEX_ASSERT(n==0);
2958     REGEX_ASSERT(fields[0]=="foo");
2959 
2960     delete pat1;
2961 
2962     //  split, with a pattern with (capture)
2963     regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2964     pat1 = RegexPattern::compile(&re1,  pe, status);
2965     REGEX_CHECK_STATUS;
2966 
2967     status = U_ZERO_ERROR;
2968     fields[6] = fields[7] = "*";
2969     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
2970     REGEX_CHECK_STATUS;
2971     REGEX_ASSERT(n==7);
2972     REGEX_ASSERT(fields[0]=="");
2973     REGEX_ASSERT(fields[1]=="a");
2974     REGEX_ASSERT(fields[2]=="Now is ");
2975     REGEX_ASSERT(fields[3]=="b");
2976     REGEX_ASSERT(fields[4]=="the time");
2977     REGEX_ASSERT(fields[5]=="c");
2978     REGEX_ASSERT(fields[6]=="");
2979     REGEX_ASSERT(fields[7]=="*");
2980     REGEX_ASSERT(status==U_ZERO_ERROR);
2981 
2982     fields[6] = fields[7] = "*";
2983     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
2984     REGEX_CHECK_STATUS;
2985     REGEX_ASSERT(n==7);
2986     REGEX_ASSERT(fields[0]=="  ");
2987     REGEX_ASSERT(fields[1]=="a");
2988     REGEX_ASSERT(fields[2]=="Now is ");
2989     REGEX_ASSERT(fields[3]=="b");
2990     REGEX_ASSERT(fields[4]=="the time");
2991     REGEX_ASSERT(fields[5]=="c");
2992     REGEX_ASSERT(fields[6]=="");
2993     REGEX_ASSERT(fields[7]=="*");
2994 
2995     status = U_ZERO_ERROR;
2996     fields[6] = "foo";
2997     n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
2998     REGEX_CHECK_STATUS;
2999     REGEX_ASSERT(n==6);
3000     REGEX_ASSERT(fields[0]=="  ");
3001     REGEX_ASSERT(fields[1]=="a");
3002     REGEX_ASSERT(fields[2]=="Now is ");
3003     REGEX_ASSERT(fields[3]=="b");
3004     REGEX_ASSERT(fields[4]=="the time");
3005     REGEX_ASSERT(fields[5]==" ");
3006     REGEX_ASSERT(fields[6]=="foo");
3007 
3008     status = U_ZERO_ERROR;
3009     fields[5] = "foo";
3010     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
3011     REGEX_CHECK_STATUS;
3012     REGEX_ASSERT(n==5);
3013     REGEX_ASSERT(fields[0]=="  ");
3014     REGEX_ASSERT(fields[1]=="a");
3015     REGEX_ASSERT(fields[2]=="Now is ");
3016     REGEX_ASSERT(fields[3]=="b");
3017     REGEX_ASSERT(fields[4]=="the time<c>");
3018     REGEX_ASSERT(fields[5]=="foo");
3019 
3020     status = U_ZERO_ERROR;
3021     fields[5] = "foo";
3022     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
3023     REGEX_CHECK_STATUS;
3024     REGEX_ASSERT(n==5);
3025     REGEX_ASSERT(fields[0]=="  ");
3026     REGEX_ASSERT(fields[1]=="a");
3027     REGEX_ASSERT(fields[2]=="Now is ");
3028     REGEX_ASSERT(fields[3]=="b");
3029     REGEX_ASSERT(fields[4]=="the time");
3030     REGEX_ASSERT(fields[5]=="foo");
3031 
3032     status = U_ZERO_ERROR;
3033     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
3034     REGEX_CHECK_STATUS;
3035     REGEX_ASSERT(n==4);
3036     REGEX_ASSERT(fields[0]=="  ");
3037     REGEX_ASSERT(fields[1]=="a");
3038     REGEX_ASSERT(fields[2]=="Now is ");
3039     REGEX_ASSERT(fields[3]=="the time<c>");
3040     status = U_ZERO_ERROR;
3041     delete pat1;
3042 
3043     regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3044     pat1 = RegexPattern::compile(&re1, pe, status);
3045     REGEX_CHECK_STATUS;
3046     n = pat1->split("1-10,20", fields, 10, status);
3047     REGEX_CHECK_STATUS;
3048     REGEX_ASSERT(n==5);
3049     REGEX_ASSERT(fields[0]=="1");
3050     REGEX_ASSERT(fields[1]=="-");
3051     REGEX_ASSERT(fields[2]=="10");
3052     REGEX_ASSERT(fields[3]==",");
3053     REGEX_ASSERT(fields[4]=="20");
3054     delete pat1;
3055 
3056 
3057     //
3058     // split of a UText based string, with library allocating output UTexts.
3059     //
3060     {
3061         status = U_ZERO_ERROR;
3062         RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3063         UnicodeString stringToSplit("first:second:third");
3064         UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3065         REGEX_CHECK_STATUS;
3066 
3067         UText *splits[10] = {NULL};
3068         int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3069         REGEX_CHECK_STATUS;
3070         REGEX_ASSERT(numFields == 5);
3071         REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3072         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3073         REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3074         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3075         REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3076         REGEX_ASSERT(splits[5] == NULL);
3077 
3078         for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3079             if (splits[i]) {
3080                 utext_close(splits[i]);
3081                 splits[i] = NULL;
3082             }
3083         }
3084         utext_close(textToSplit);
3085     }
3086 
3087 
3088     //
3089     // RegexPattern::pattern() and patternText()
3090     //
3091     pat1 = new RegexPattern();
3092     REGEX_ASSERT(pat1->pattern() == "");
3093     REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3094     delete pat1;
3095     const char *helloWorldInvariant = "(Hello, world)*";
3096     regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3097     pat1 = RegexPattern::compile(&re1, pe, status);
3098     REGEX_CHECK_STATUS;
3099     REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3100     REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3101     delete pat1;
3102 
3103     utext_close(&re1);
3104 }
3105 
3106 
3107 //---------------------------------------------------------------------------
3108 //
3109 //      Extended       A more thorough check for features of regex patterns
3110 //                     The test cases are in a separate data file,
//                       source/test/testdata/regextst.txt
3112 //                     A description of the test data format is included in that file.
3113 //
3114 //---------------------------------------------------------------------------
3115 
3116 const char *
getPath(char buffer[2048],const char * filename)3117 RegexTest::getPath(char buffer[2048], const char *filename) {
3118     UErrorCode status=U_ZERO_ERROR;
3119     const char *testDataDirectory = IntlTest::getSourceTestData(status);
3120     if (U_FAILURE(status)) {
3121         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3122         return NULL;
3123     }
3124 
3125     strcpy(buffer, testDataDirectory);
3126     strcat(buffer, filename);
3127     return buffer;
3128 }
3129 
Extended()3130 void RegexTest::Extended() {
3131     char tdd[2048];
3132     const char *srcPath;
3133     UErrorCode  status  = U_ZERO_ERROR;
3134     int32_t     lineNum = 0;
3135 
3136     //
3137     //  Open and read the test data file.
3138     //
3139     srcPath=getPath(tdd, "regextst.txt");
3140     if(srcPath==NULL) {
3141         return; /* something went wrong, error already output */
3142     }
3143 
3144     int32_t    len;
3145     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3146     if (U_FAILURE(status)) {
3147         return; /* something went wrong, error already output */
3148     }
3149 
3150     //
3151     //  Put the test data into a UnicodeString
3152     //
3153     UnicodeString testString(FALSE, testData, len);
3154 
3155     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3156     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3157     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3158 
3159     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3160     UnicodeString   testPattern;   // The pattern for test from the test file.
3161     UnicodeString   testFlags;     // the flags   for a test.
3162     UnicodeString   matchString;   // The marked up string to be used as input
3163 
3164     if (U_FAILURE(status)){
3165         dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3166         delete [] testData;
3167         return;
3168     }
3169 
3170     //
3171     //  Loop over the test data file, once per line.
3172     //
3173     while (lineMat.find()) {
3174         lineNum++;
3175         if (U_FAILURE(status)) {
3176           errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3177         }
3178 
3179         status = U_ZERO_ERROR;
3180         UnicodeString testLine = lineMat.group(1, status);
3181         if (testLine.length() == 0) {
3182             continue;
3183         }
3184 
3185         //
3186         // Parse the test line.  Skip blank and comment only lines.
3187         // Separate out the three main fields - pattern, flags, target.
3188         //
3189 
3190         commentMat.reset(testLine);
3191         if (commentMat.lookingAt(status)) {
3192             // This line is a comment, or blank.
3193             continue;
3194         }
3195 
3196         //
3197         //  Pull out the pattern field, remove it from the test file line.
3198         //
3199         quotedStuffMat.reset(testLine);
3200         if (quotedStuffMat.lookingAt(status)) {
3201             testPattern = quotedStuffMat.group(2, status);
3202             testLine.remove(0, quotedStuffMat.end(0, status));
3203         } else {
3204             errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3205             continue;
3206         }
3207 
3208 
3209         //
3210         //  Pull out the flags from the test file line.
3211         //
3212         flagsMat.reset(testLine);
3213         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
3214         testFlags = flagsMat.group(1, status);
3215         if (flagsMat.group(2, status).length() > 0) {
3216             errln("Bad Match flag at line %d. Scanning %c\n",
3217                 lineNum, flagsMat.group(2, status).charAt(0));
3218             continue;
3219         }
3220         testLine.remove(0, flagsMat.end(0, status));
3221 
3222         //
3223         //  Pull out the match string, as a whole.
3224         //    We'll process the <tags> later.
3225         //
3226         quotedStuffMat.reset(testLine);
3227         if (quotedStuffMat.lookingAt(status)) {
3228             matchString = quotedStuffMat.group(2, status);
3229             testLine.remove(0, quotedStuffMat.end(0, status));
3230         } else {
3231             errln("Bad match string at test file line %d", lineNum);
3232             continue;
3233         }
3234 
3235         //
3236         //  The only thing left from the input line should be an optional trailing comment.
3237         //
3238         commentMat.reset(testLine);
3239         if (commentMat.lookingAt(status) == FALSE) {
3240             errln("Line %d: unexpected characters at end of test line.", lineNum);
3241             continue;
3242         }
3243 
3244         //
3245         //  Run the test
3246         //
3247         regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3248     }
3249 
3250     delete [] testData;
3251 
3252 }
3253 
3254 
3255 
3256 //---------------------------------------------------------------------------
3257 //
//    regex_find(pattern, flags, inputString, srcPath, lineNumber)
//
//         Function to run a single test from the Extended (data driven) tests.
//         See file test/testdata/regextst.txt for a description of the
//         pattern and inputString fields, and the allowed flags.
//         srcPath is the path of the test data file, used in diagnostic messages.
//         lineNumber is the source line in regextst.txt of the test.
3264 //
3265 //---------------------------------------------------------------------------
3266 
3267 
3268 //  Set a value into a UVector at position specified by a decimal number in
3269 //   a UnicodeString.   This is a utility function needed by the actual test function,
3270 //   which follows.
set(UVector & vec,int32_t val,UnicodeString index)3271 static void set(UVector &vec, int32_t val, UnicodeString index) {
3272     UErrorCode  status=U_ZERO_ERROR;
3273     int32_t  idx = 0;
3274     for (int32_t i=0; i<index.length(); i++) {
3275         int32_t d=u_charDigitValue(index.charAt(i));
3276         if (d<0) {return;}
3277         idx = idx*10 + d;
3278     }
3279     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3280     vec.setElementAt(val, idx);
3281 }
3282 
setInt(UVector & vec,int32_t val,int32_t idx)3283 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3284     UErrorCode  status=U_ZERO_ERROR;
3285     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3286     vec.setElementAt(val, idx);
3287 }
3288 
utextOffsetToNative(UText * utext,int32_t unistrOffset,int32_t & nativeIndex)3289 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3290 {
3291     UBool couldFind = TRUE;
3292     UTEXT_SETNATIVEINDEX(utext, 0);
3293     int32_t i = 0;
3294     while (i < unistrOffset) {
3295         UChar32 c = UTEXT_NEXT32(utext);
3296         if (c != U_SENTINEL) {
3297             i += U16_LENGTH(c);
3298         } else {
3299             couldFind = FALSE;
3300             break;
3301         }
3302     }
3303     nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3304     return couldFind;
3305 }
3306 
3307 
regex_find(const UnicodeString & pattern,const UnicodeString & flags,const UnicodeString & inputString,const char * srcPath,int32_t line)3308 void RegexTest::regex_find(const UnicodeString &pattern,
3309                            const UnicodeString &flags,
3310                            const UnicodeString &inputString,
3311                            const char *srcPath,
3312                            int32_t line) {
3313     UnicodeString       unEscapedInput;
3314     UnicodeString       deTaggedInput;
3315 
3316     int32_t             patternUTF8Length,      inputUTF8Length;
3317     char                *patternChars  = NULL, *inputChars = NULL;
3318     UText               patternText    = UTEXT_INITIALIZER;
3319     UText               inputText      = UTEXT_INITIALIZER;
3320     UConverter          *UTF8Converter = NULL;
3321 
3322     UErrorCode          status         = U_ZERO_ERROR;
3323     UParseError         pe;
3324     RegexPattern        *parsePat      = NULL;
3325     RegexMatcher        *parseMatcher  = NULL;
3326     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
3327     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
3328     UVector             groupStarts(status);
3329     UVector             groupEnds(status);
3330     UVector             groupStartsUTF8(status);
3331     UVector             groupEndsUTF8(status);
3332     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
3333     UBool               failed         = FALSE;
3334     int32_t             numFinds;
3335     int32_t             i;
3336     UBool               useMatchesFunc   = FALSE;
3337     UBool               useLookingAtFunc = FALSE;
3338     int32_t             regionStart      = -1;
3339     int32_t             regionEnd        = -1;
3340     int32_t             regionStartUTF8  = -1;
3341     int32_t             regionEndUTF8    = -1;
3342 
3343 
3344     //
3345     //  Compile the caller's pattern
3346     //
3347     uint32_t bflags = 0;
3348     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
3349         bflags |= UREGEX_CASE_INSENSITIVE;
3350     }
3351     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
3352         bflags |= UREGEX_COMMENTS;
3353     }
3354     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
3355         bflags |= UREGEX_DOTALL;
3356     }
3357     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
3358         bflags |= UREGEX_MULTILINE;
3359     }
3360 
3361     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3362         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3363     }
3364     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3365         bflags |= UREGEX_UNIX_LINES;
3366     }
3367     if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3368         bflags |= UREGEX_LITERAL;
3369     }
3370 
3371 
3372     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3373     if (status != U_ZERO_ERROR) {
3374         #if UCONFIG_NO_BREAK_ITERATION==1
3375         // 'v' test flag means that the test pattern should not compile if ICU was configured
3376         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3377         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3378             goto cleanupAndReturn;
3379         }
3380         #endif
3381         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3382             // Expected pattern compilation error.
3383             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3384                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3385             }
3386             goto cleanupAndReturn;
3387         } else {
3388             // Unexpected pattern compilation error.
3389             dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3390             goto cleanupAndReturn;
3391         }
3392     }
3393 
3394     UTF8Converter = ucnv_open("UTF8", &status);
3395     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3396 
3397     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3398     status = U_ZERO_ERROR; // buffer overflow
3399     patternChars = new char[patternUTF8Length+1];
3400     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3401     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3402 
3403     if (status == U_ZERO_ERROR) {
3404         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3405 
3406         if (status != U_ZERO_ERROR) {
3407 #if UCONFIG_NO_BREAK_ITERATION==1
3408             // 'v' test flag means that the test pattern should not compile if ICU was configured
3409             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3410             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3411                 goto cleanupAndReturn;
3412             }
3413 #endif
3414             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3415                 // Expected pattern compilation error.
3416                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3417                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3418                 }
3419                 goto cleanupAndReturn;
3420             } else {
3421                 // Unexpected pattern compilation error.
3422                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3423                 goto cleanupAndReturn;
3424             }
3425         }
3426     }
3427 
3428     if (UTF8Pattern == NULL) {
3429         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3430         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3431         status = U_ZERO_ERROR;
3432     }
3433 
3434     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3435         callerPattern->dumpPattern();
3436     }
3437 
3438     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3439         errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3440         goto cleanupAndReturn;
3441     }
3442 
3443 
3444     //
3445     // Number of times find() should be called on the test string, default to 1
3446     //
3447     numFinds = 1;
3448     for (i=2; i<=9; i++) {
3449         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3450             if (numFinds != 1) {
3451                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3452                 goto cleanupAndReturn;
3453             }
3454             numFinds = i;
3455         }
3456     }
3457 
3458     // 'M' flag.  Use matches() instead of find()
3459     if (flags.indexOf((UChar)0x4d) >= 0) {
3460         useMatchesFunc = TRUE;
3461     }
3462     if (flags.indexOf((UChar)0x4c) >= 0) {
3463         useLookingAtFunc = TRUE;
3464     }
3465 
3466     //
3467     //  Find the tags in the input data, remove them, and record the group boundary
3468     //    positions.
3469     //
3470     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3471     REGEX_CHECK_STATUS_L(line);
3472 
3473     unEscapedInput = inputString.unescape();
3474     parseMatcher = parsePat->matcher(unEscapedInput, status);
3475     REGEX_CHECK_STATUS_L(line);
3476     while(parseMatcher->find()) {
3477         parseMatcher->appendReplacement(deTaggedInput, "", status);
3478         REGEX_CHECK_STATUS;
3479         UnicodeString groupNum = parseMatcher->group(2, status);
3480         if (groupNum == "r") {
3481             // <r> or </r>, a region specification within the string
3482             if (parseMatcher->group(1, status) == "/") {
3483                 regionEnd = deTaggedInput.length();
3484             } else {
3485                 regionStart = deTaggedInput.length();
3486             }
3487         } else {
3488             // <digits> or </digits>, a group match boundary tag.
3489             if (parseMatcher->group(1, status) == "/") {
3490                 set(groupEnds, deTaggedInput.length(), groupNum);
3491             } else {
3492                 set(groupStarts, deTaggedInput.length(), groupNum);
3493             }
3494         }
3495     }
3496     parseMatcher->appendTail(deTaggedInput);
3497     REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3498     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3499       errln("mismatched <r> tags");
3500       failed = TRUE;
3501       goto cleanupAndReturn;
3502     }
3503 
3504     //
3505     //  Configure the matcher according to the flags specified with this test.
3506     //
3507     matcher = callerPattern->matcher(deTaggedInput, status);
3508     REGEX_CHECK_STATUS_L(line);
3509     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3510         matcher->setTrace(TRUE);
3511     }
3512 
3513     if (UTF8Pattern != NULL) {
3514         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3515         status = U_ZERO_ERROR; // buffer overflow
3516         inputChars = new char[inputUTF8Length+1];
3517         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3518         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3519 
3520         if (status == U_ZERO_ERROR) {
3521             UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3522             REGEX_CHECK_STATUS_L(line);
3523         }
3524 
3525         if (UTF8Matcher == NULL) {
3526             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3527             logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3528             status = U_ZERO_ERROR;
3529         }
3530     }
3531 
3532     //
3533     //  Generate native indices for UTF8 versions of region and capture group info
3534     //
3535     if (UTF8Matcher != NULL) {
3536         if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3537             UTF8Matcher->setTrace(TRUE);
3538         }
3539         if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3540         if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3541 
3542         //  Fill out the native index UVector info.
3543         //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3544         for (i=0; i<groupStarts.size(); i++) {
3545             int32_t  start = groupStarts.elementAti(i);
3546             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3547             if (start >= 0) {
3548                 int32_t  startUTF8;
3549                 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3550                     errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
3551                     failed = TRUE;
3552                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3553                 }
3554                 setInt(groupStartsUTF8, startUTF8, i);
3555             }
3556 
3557             int32_t  end = groupEnds.elementAti(i);
3558             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3559             if (end >= 0) {
3560                 int32_t  endUTF8;
3561                 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3562                     errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
3563                     failed = TRUE;
3564                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3565                 }
3566                 setInt(groupEndsUTF8, endUTF8, i);
3567             }
3568         }
3569     }
3570 
3571     if (regionStart>=0) {
3572        matcher->region(regionStart, regionEnd, status);
3573        REGEX_CHECK_STATUS_L(line);
3574        if (UTF8Matcher != NULL) {
3575            UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3576            REGEX_CHECK_STATUS_L(line);
3577        }
3578     }
3579     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3580         matcher->useAnchoringBounds(FALSE);
3581         if (UTF8Matcher != NULL) {
3582             UTF8Matcher->useAnchoringBounds(FALSE);
3583         }
3584     }
3585     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3586         matcher->useTransparentBounds(TRUE);
3587         if (UTF8Matcher != NULL) {
3588             UTF8Matcher->useTransparentBounds(TRUE);
3589         }
3590     }
3591 
3592 
3593 
3594     //
3595     // Do a find on the de-tagged input using the caller's pattern
3596     //     TODO: error on count>1 and not find().
3597     //           error on both matches() and lookingAt().
3598     //
3599     for (i=0; i<numFinds; i++) {
3600         if (useMatchesFunc) {
3601             isMatch = matcher->matches(status);
3602             if (UTF8Matcher != NULL) {
3603                isUTF8Match = UTF8Matcher->matches(status);
3604             }
3605         } else  if (useLookingAtFunc) {
3606             isMatch = matcher->lookingAt(status);
3607             if (UTF8Matcher != NULL) {
3608                 isUTF8Match = UTF8Matcher->lookingAt(status);
3609             }
3610         } else {
3611             isMatch = matcher->find();
3612             if (UTF8Matcher != NULL) {
3613                 isUTF8Match = UTF8Matcher->find();
3614             }
3615         }
3616     }
3617     matcher->setTrace(FALSE);
3618     if (UTF8Matcher) {
3619         UTF8Matcher->setTrace(FALSE);
3620     }
3621     if (U_FAILURE(status)) {
3622         errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3623     }
3624 
3625     //
3626     // Match up the groups from the find() with the groups from the tags
3627     //
3628 
3629     // number of tags should match number of groups from find operation.
3630     // matcher->groupCount does not include group 0, the entire match, hence the +1.
3631     //   G option in test means that capture group data is not available in the
3632     //     expected results, so the check needs to be suppressed.
3633     if (isMatch == FALSE && groupStarts.size() != 0) {
3634         dataerrln("Error at line %d:  Match expected, but none found.", line);
3635         failed = TRUE;
3636         goto cleanupAndReturn;
3637     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3638         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3639         failed = TRUE;
3640         goto cleanupAndReturn;
3641     }
3642     if (isMatch && groupStarts.size() == 0) {
3643         errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
3644         failed = TRUE;
3645     }
3646     if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
3647         errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
3648         failed = TRUE;
3649     }
3650 
3651     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3652         // Only check for match / no match.  Don't check capture groups.
3653         goto cleanupAndReturn;
3654     }
3655 
3656     REGEX_CHECK_STATUS_L(line);
3657     for (i=0; i<=matcher->groupCount(); i++) {
3658         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3659         int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3660         if (matcher->start(i, status) != expectedStart) {
3661             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3662                 line, i, expectedStart, matcher->start(i, status));
3663             failed = TRUE;
3664             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3665         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3666             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3667                   line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3668             failed = TRUE;
3669             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3670         }
3671 
3672         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3673         int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3674         if (matcher->end(i, status) != expectedEnd) {
3675             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3676                 line, i, expectedEnd, matcher->end(i, status));
3677             failed = TRUE;
3678             // Error on end position;  keep going; real error is probably yet to come as group
3679             //   end positions work from end of the input data towards the front.
3680         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3681             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3682                   line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3683             failed = TRUE;
3684             // Error on end position;  keep going; real error is probably yet to come as group
3685             //   end positions work from end of the input data towards the front.
3686         }
3687     }
3688     if ( matcher->groupCount()+1 < groupStarts.size()) {
3689         errln("Error at line %d: Expected %d capture groups, found %d.",
3690             line, groupStarts.size()-1, matcher->groupCount());
3691         failed = TRUE;
3692         }
3693     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3694         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3695               line, groupStarts.size()-1, UTF8Matcher->groupCount());
3696         failed = TRUE;
3697     }
3698 
3699     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3700         matcher->requireEnd() == TRUE) {
3701         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3702         failed = TRUE;
3703     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3704         UTF8Matcher->requireEnd() == TRUE) {
3705         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3706         failed = TRUE;
3707     }
3708 
3709     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3710         matcher->requireEnd() == FALSE) {
3711         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3712         failed = TRUE;
3713     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3714         UTF8Matcher->requireEnd() == FALSE) {
3715         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3716         failed = TRUE;
3717     }
3718 
3719     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3720         matcher->hitEnd() == TRUE) {
3721         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3722         failed = TRUE;
3723     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3724                UTF8Matcher->hitEnd() == TRUE) {
3725         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3726         failed = TRUE;
3727     }
3728 
3729     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3730         matcher->hitEnd() == FALSE) {
3731         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3732         failed = TRUE;
3733     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3734                UTF8Matcher->hitEnd() == FALSE) {
3735         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3736         failed = TRUE;
3737     }
3738 
3739 
3740 cleanupAndReturn:
3741     if (failed) {
3742         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3743             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3744         // callerPattern->dump();
3745     }
3746     delete parseMatcher;
3747     delete parsePat;
3748     delete UTF8Matcher;
3749     delete UTF8Pattern;
3750     delete matcher;
3751     delete callerPattern;
3752 
3753     utext_close(&inputText);
3754     delete[] inputChars;
3755     utext_close(&patternText);
3756     delete[] patternChars;
3757     ucnv_close(UTF8Converter);
3758 }
3759 
3760 
3761 
3762 
3763 //---------------------------------------------------------------------------
3764 //
3765 //      Errors     Check for error handling in patterns.
3766 //
3767 //---------------------------------------------------------------------------
//
//  Errors    Verify that malformed patterns are rejected when compiled.
//
//            Each REGEX_ERR line names a pattern followed by two integers and an
//            error code.  The integers are presumably the line and column at which
//            the error is expected to be reported, and the code the expected
//            UErrorCode - confirm against the REGEX_ERR macro definition.
//
void RegexTest::Errors() {
    // \escape sequences that aren't implemented yet.
    //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);

    // Missing close parentheses
    REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);

    // Extra close paren
    REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
    REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
    // Unclosed open parens
    REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);

    // Look-ahead, Look-behind
    //  TODO:  add tests for unbounded length look-behinds.
    REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct

    // Attempt to use non-default flags
    //   Compiling with this combination of flags is expected to fail with
    //   U_REGEX_UNIMPLEMENTED.  NOTE(review): presumably UREGEX_CANON_EQ is the
    //   unimplemented one, since the other three are used elsewhere in this file - confirm.
    {
        UParseError   pe;
        UErrorCode    status = U_ZERO_ERROR;
        int32_t       flags  = UREGEX_CANON_EQ |
                               UREGEX_COMMENTS         | UREGEX_DOTALL   |
                               UREGEX_MULTILINE;
        RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
        REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
        delete pat1;
    }


    // Quantifiers are allowed only after something that can be quantified.
    REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
    REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);   // pattern spans two lines
    REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);

    // Mal-formed {min,max} quantifiers
    REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
    REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
    REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
    REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
    REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
    REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);

    // Ticket 5389
    REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);

    // Invalid Back Reference \0
    //    For ICU 3.8 and earlier
    //    For ICU versions newer than 3.8, \0 introduces an octal escape.
    //
    REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);

}
3825 
3826 
3827 //-------------------------------------------------------------------------------
3828 //
3829 //  Read a text data file, convert it to UChars, and return the data
3830 //    in one big UChar * buffer, which the caller must delete.
3831 //
3832 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int32_t & ulen,const char * defEncoding,UErrorCode & status)3833 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3834                                      const char *defEncoding, UErrorCode &status) {
3835     UChar       *retPtr  = NULL;
3836     char        *fileBuf = NULL;
3837     UConverter* conv     = NULL;
3838     FILE        *f       = NULL;
3839 
3840     ulen = 0;
3841     if (U_FAILURE(status)) {
3842         return retPtr;
3843     }
3844 
3845     //
3846     //  Open the file.
3847     //
3848     f = fopen(fileName, "rb");
3849     if (f == 0) {
3850         dataerrln("Error opening test data file %s\n", fileName);
3851         status = U_FILE_ACCESS_ERROR;
3852         return NULL;
3853     }
3854     //
3855     //  Read it in
3856     //
3857     int32_t            fileSize;
3858     int32_t            amt_read;
3859 
3860     fseek( f, 0, SEEK_END);
3861     fileSize = ftell(f);
3862     fileBuf = new char[fileSize];
3863     fseek(f, 0, SEEK_SET);
3864     amt_read = fread(fileBuf, 1, fileSize, f);
3865     if (amt_read != fileSize || fileSize <= 0) {
3866         errln("Error reading test data file.");
3867         goto cleanUpAndReturn;
3868     }
3869 
3870     //
3871     // Look for a Unicode Signature (BOM) on the data just read
3872     //
3873     int32_t        signatureLength;
3874     const char *   fileBufC;
3875     const char*    encoding;
3876 
3877     fileBufC = fileBuf;
3878     encoding = ucnv_detectUnicodeSignature(
3879         fileBuf, fileSize, &signatureLength, &status);
3880     if(encoding!=NULL ){
3881         fileBufC  += signatureLength;
3882         fileSize  -= signatureLength;
3883     } else {
3884         encoding = defEncoding;
3885         if (strcmp(encoding, "utf-8") == 0) {
3886             errln("file %s is missing its BOM", fileName);
3887         }
3888     }
3889 
3890     //
3891     // Open a converter to take the rule file to UTF-16
3892     //
3893     conv = ucnv_open(encoding, &status);
3894     if (U_FAILURE(status)) {
3895         goto cleanUpAndReturn;
3896     }
3897 
3898     //
3899     // Convert the rules to UChar.
3900     //  Preflight first to determine required buffer size.
3901     //
3902     ulen = ucnv_toUChars(conv,
3903         NULL,           //  dest,
3904         0,              //  destCapacity,
3905         fileBufC,
3906         fileSize,
3907         &status);
3908     if (status == U_BUFFER_OVERFLOW_ERROR) {
3909         // Buffer Overflow is expected from the preflight operation.
3910         status = U_ZERO_ERROR;
3911 
3912         retPtr = new UChar[ulen+1];
3913         ucnv_toUChars(conv,
3914             retPtr,       //  dest,
3915             ulen+1,
3916             fileBufC,
3917             fileSize,
3918             &status);
3919     }
3920 
3921 cleanUpAndReturn:
3922     fclose(f);
3923     delete[] fileBuf;
3924     ucnv_close(conv);
3925     if (U_FAILURE(status)) {
3926         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3927         delete []retPtr;
3928         retPtr = 0;
3929         ulen   = 0;
3930     };
3931     return retPtr;
3932 }
3933 
3934 
3935 //-------------------------------------------------------------------------------
3936 //
3937 //   PerlTests  - Run Perl's regular expression tests
3938 //                The input file for this test is re_tests, the standard regular
3939 //                expression test data distributed with the Perl source code.
3940 //
3941 //                Here is Perl's description of the test data file:
3942 //
3943 //        # The tests are in a separate file 't/op/re_tests'.
3944 //        # Each line in that file is a separate test.
3945 //        # There are five columns, separated by tabs.
3946 //        #
3947 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
3948 //        # Modifiers can be put after the closing C<'>.
3949 //        #
3950 //        # Column 2 contains the string to be matched.
3951 //        #
3952 //        # Column 3 contains the expected result:
3953 //        #     y   expect a match
3954 //        #     n   expect no match
3955 //        #     c   expect an error
3956 //        # B   test exposes a known bug in Perl, should be skipped
3957 //        # b   test exposes a known bug in Perl, should be skipped if noamp
3958 //        #
3959 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
3960 //        #
3961 //        # Column 4 contains a string, usually C<$&>.
3962 //        #
3963 //        # Column 5 contains the expected result of double-quote
3964 //        # interpolating that string after the match, or start of error message.
3965 //        #
3966 //        # Column 6, if present, contains a reason why the test is skipped.
3967 //        # This is printed with "skipped", for harness to pick up.
3968 //        #
3969 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
3970 //        #
3971 //        # If you want to add a regular expression test that can't be expressed
3972 //        # in this format, don't add it here: put it in op/pat.t instead.
3973 //
3974 //        For ICU, if field 3 contains an 'i', the test will be skipped.
//        Such a test exposes some known incompatibility between ICU and Perl regexps.
3976 //        (The i is in addition to whatever was there before.)
3977 //
3978 //-------------------------------------------------------------------------------
PerlTests()3979 void RegexTest::PerlTests() {
3980     char tdd[2048];
3981     const char *srcPath;
3982     UErrorCode  status = U_ZERO_ERROR;
3983     UParseError pe;
3984 
3985     //
3986     //  Open and read the test data file.
3987     //
3988     srcPath=getPath(tdd, "re_tests.txt");
3989     if(srcPath==NULL) {
3990         return; /* something went wrong, error already output */
3991     }
3992 
3993     int32_t    len;
3994     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
3995     if (U_FAILURE(status)) {
3996         return; /* something went wrong, error already output */
3997     }
3998 
3999     //
4000     //  Put the test data into a UnicodeString
4001     //
4002     UnicodeString testDataString(FALSE, testData, len);
4003 
4004     //
4005     //  Regex to break the input file into lines, and strip the new lines.
4006     //     One line per match, capture group one is the desired data.
4007     //
4008     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4009     if (U_FAILURE(status)) {
4010         dataerrln("RegexPattern::compile() error");
4011         return;
4012     }
4013     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4014 
4015     //
4016     //  Regex to split a test file line into fields.
4017     //    There are six fields, separated by tabs.
4018     //
4019     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4020 
4021     //
4022     //  Regex to identify test patterns with flag settings, and to separate them.
4023     //    Test patterns with flags look like 'pattern'i
4024     //    Test patterns without flags are not quoted:   pattern
4025     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4026     //
4027     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4028     RegexMatcher* flagMat = flagPat->matcher(status);
4029 
4030     //
4031     // The Perl tests reference several perl-isms, which are evaluated/substituted
4032     //   in the test data.  Not being perl, this must be done explicitly.  Here
4033     //   are string constants and REs for these constructs.
4034     //
4035     UnicodeString nulnulSrc("${nulnul}");
4036     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4037     nulnul = nulnul.unescape();
4038 
4039     UnicodeString ffffSrc("${ffff}");
4040     UnicodeString ffff("\\uffff", -1, US_INV);
4041     ffff = ffff.unescape();
4042 
4043     //  regexp for $-[0], $+[2], etc.
4044     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4045     RegexMatcher *groupsMat = groupsPat->matcher(status);
4046 
4047     //  regexp for $0, $1, $2, etc.
4048     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4049     RegexMatcher *cgMat = cgPat->matcher(status);
4050 
4051 
4052     //
4053     // Main Loop for the Perl Tests, runs once per line from the
4054     //   test data file.
4055     //
4056     int32_t  lineNum = 0;
4057     int32_t  skippedUnimplementedCount = 0;
4058     while (lineMat->find()) {
4059         lineNum++;
4060 
4061         //
4062         //  Get a line, break it into its fields, do the Perl
4063         //    variable substitutions.
4064         //
4065         UnicodeString line = lineMat->group(1, status);
4066         UnicodeString fields[7];
4067         fieldPat->split(line, fields, 7, status);
4068 
4069         flagMat->reset(fields[0]);
4070         flagMat->matches(status);
4071         UnicodeString pattern  = flagMat->group(2, status);
4072         pattern.findAndReplace("${bang}", "!");
4073         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4074         pattern.findAndReplace(ffffSrc, ffff);
4075 
4076         //
4077         //  Identify patterns that include match flag settings,
4078         //    split off the flags, remove the extra quotes.
4079         //
4080         UnicodeString flagStr = flagMat->group(3, status);
4081         if (U_FAILURE(status)) {
4082             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4083             return;
4084         }
4085         int32_t flags = 0;
4086         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4087         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4088         const UChar UChar_m = 0x6d;
4089         const UChar UChar_x = 0x78;
4090         const UChar UChar_y = 0x79;
4091         if (flagStr.indexOf(UChar_i) != -1) {
4092             flags |= UREGEX_CASE_INSENSITIVE;
4093         }
4094         if (flagStr.indexOf(UChar_m) != -1) {
4095             flags |= UREGEX_MULTILINE;
4096         }
4097         if (flagStr.indexOf(UChar_x) != -1) {
4098             flags |= UREGEX_COMMENTS;
4099         }
4100 
4101         //
4102         // Compile the test pattern.
4103         //
4104         status = U_ZERO_ERROR;
4105         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4106         if (status == U_REGEX_UNIMPLEMENTED) {
4107             //
4108             // Test of a feature that is planned for ICU, but not yet implemented.
4109             //   skip the test.
4110             skippedUnimplementedCount++;
4111             delete testPat;
4112             status = U_ZERO_ERROR;
4113             continue;
4114         }
4115 
4116         if (U_FAILURE(status)) {
4117             // Some tests are supposed to generate errors.
4118             //   Only report an error for tests that are supposed to succeed.
4119             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4120                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4121             {
4122                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4123             }
4124             status = U_ZERO_ERROR;
4125             delete testPat;
4126             continue;
4127         }
4128 
4129         if (fields[2].indexOf(UChar_i) >= 0) {
4130             // ICU should skip this test.
4131             delete testPat;
4132             continue;
4133         }
4134 
4135         if (fields[2].indexOf(UChar_c) >= 0) {
4136             // This pattern should have caused a compilation error, but didn't/
4137             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4138             delete testPat;
4139             continue;
4140         }
4141 
4142         //
4143         // replace the Perl variables that appear in some of the
4144         //   match data strings.
4145         //
4146         UnicodeString matchString = fields[1];
4147         matchString.findAndReplace(nulnulSrc, nulnul);
4148         matchString.findAndReplace(ffffSrc,   ffff);
4149 
4150         // Replace any \n in the match string with an actual new-line char.
4151         //  Don't do full unescape, as this unescapes more than Perl does, which
4152         //  causes other spurious failures in the tests.
4153         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4154 
4155 
4156 
4157         //
4158         // Run the test, check for expected match/don't match result.
4159         //
4160         RegexMatcher *testMat = testPat->matcher(matchString, status);
4161         UBool found = testMat->find();
4162         UBool expected = FALSE;
4163         if (fields[2].indexOf(UChar_y) >=0) {
4164             expected = TRUE;
4165         }
4166         if (expected != found) {
4167             errln("line %d: Expected %smatch, got %smatch",
4168                 lineNum, expected?"":"no ", found?"":"no " );
4169             continue;
4170         }
4171 
4172         // Don't try to check expected results if there is no match.
4173         //   (Some have stuff in the expected fields)
4174         if (!found) {
4175             delete testMat;
4176             delete testPat;
4177             continue;
4178         }
4179 
4180         //
4181         // Interpret the Perl expression from the fourth field of the data file,
4182         // building up an ICU string from the results of the ICU match.
4183         //   The Perl expression will contain references to the results of
4184         //     a regex match, including the matched string, capture group strings,
4185         //     group starting and ending indicies, etc.
4186         //
4187         UnicodeString resultString;
4188         UnicodeString perlExpr = fields[3];
4189 #if SUPPORT_MUTATING_INPUT_STRING
4190         groupsMat->reset(perlExpr);
4191         cgMat->reset(perlExpr);
4192 #endif
4193 
4194         while (perlExpr.length() > 0) {
4195 #if !SUPPORT_MUTATING_INPUT_STRING
4196             //  Perferred usage.  Reset after any modification to input string.
4197             groupsMat->reset(perlExpr);
4198             cgMat->reset(perlExpr);
4199 #endif
4200 
4201             if (perlExpr.startsWith("$&")) {
4202                 resultString.append(testMat->group(status));
4203                 perlExpr.remove(0, 2);
4204             }
4205 
4206             else if (groupsMat->lookingAt(status)) {
4207                 // $-[0]   $+[2]  etc.
4208                 UnicodeString digitString = groupsMat->group(2, status);
4209                 int32_t t = 0;
4210                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4211                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4212                 int32_t matchPosition;
4213                 if (plusOrMinus.compare("+") == 0) {
4214                     matchPosition = testMat->end(groupNum, status);
4215                 } else {
4216                     matchPosition = testMat->start(groupNum, status);
4217                 }
4218                 if (matchPosition != -1) {
4219                     ICU_Utility::appendNumber(resultString, matchPosition);
4220                 }
4221                 perlExpr.remove(0, groupsMat->end(status));
4222             }
4223 
4224             else if (cgMat->lookingAt(status)) {
4225                 // $1, $2, $3, etc.
4226                 UnicodeString digitString = cgMat->group(1, status);
4227                 int32_t t = 0;
4228                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4229                 if (U_SUCCESS(status)) {
4230                     resultString.append(testMat->group(groupNum, status));
4231                     status = U_ZERO_ERROR;
4232                 }
4233                 perlExpr.remove(0, cgMat->end(status));
4234             }
4235 
4236             else if (perlExpr.startsWith("@-")) {
4237                 int32_t i;
4238                 for (i=0; i<=testMat->groupCount(); i++) {
4239                     if (i>0) {
4240                         resultString.append(" ");
4241                     }
4242                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4243                 }
4244                 perlExpr.remove(0, 2);
4245             }
4246 
4247             else if (perlExpr.startsWith("@+")) {
4248                 int32_t i;
4249                 for (i=0; i<=testMat->groupCount(); i++) {
4250                     if (i>0) {
4251                         resultString.append(" ");
4252                     }
4253                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4254                 }
4255                 perlExpr.remove(0, 2);
4256             }
4257 
4258             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4259                                                      //           or as an escaped sequence (e.g. \n)
4260                 if (perlExpr.length() > 1) {
4261                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4262                 }
4263                 UChar c = perlExpr.charAt(0);
4264                 switch (c) {
4265                 case 'n':   c = '\n'; break;
4266                 // add any other escape sequences that show up in the test expected results.
4267                 }
4268                 resultString.append(c);
4269                 perlExpr.remove(0, 1);
4270             }
4271 
4272             else  {
4273                 // Any characters from the perl expression that we don't explicitly
4274                 //  recognize before here are assumed to be literals and copied
4275                 //  as-is to the expected results.
4276                 resultString.append(perlExpr.charAt(0));
4277                 perlExpr.remove(0, 1);
4278             }
4279 
4280             if (U_FAILURE(status)) {
4281                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4282                 break;
4283             }
4284         }
4285 
4286         //
4287         // Expected Results Compare
4288         //
4289         UnicodeString expectedS(fields[4]);
4290         expectedS.findAndReplace(nulnulSrc, nulnul);
4291         expectedS.findAndReplace(ffffSrc,   ffff);
4292         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4293 
4294 
4295         if (expectedS.compare(resultString) != 0) {
4296             err("Line %d: Incorrect perl expression results.", lineNum);
4297             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4298         }
4299 
4300         delete testMat;
4301         delete testPat;
4302     }
4303 
4304     //
4305     // All done.  Clean up allocated stuff.
4306     //
4307     delete cgMat;
4308     delete cgPat;
4309 
4310     delete groupsMat;
4311     delete groupsPat;
4312 
4313     delete flagMat;
4314     delete flagPat;
4315 
4316     delete lineMat;
4317     delete linePat;
4318 
4319     delete fieldPat;
4320     delete [] testData;
4321 
4322 
4323     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4324 
4325 }
4326 
4327 
4328 //-------------------------------------------------------------------------------
4329 //
4330 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
4331 //                  (instead of using UnicodeStrings) to test the alternate engine.
4332 //                  The input file for this test is re_tests, the standard regular
4333 //                  expression test data distributed with the Perl source code.
4334 //                  See PerlTests() for more information.
4335 //
4336 //-------------------------------------------------------------------------------
PerlTestsUTF8()4337 void RegexTest::PerlTestsUTF8() {
4338     char tdd[2048];
4339     const char *srcPath;
4340     UErrorCode  status = U_ZERO_ERROR;
4341     UParseError pe;
4342     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4343     UText       patternText = UTEXT_INITIALIZER;
4344     char       *patternChars = NULL;
4345     int32_t     patternLength;
4346     int32_t     patternCapacity = 0;
4347     UText       inputText = UTEXT_INITIALIZER;
4348     char       *inputChars = NULL;
4349     int32_t     inputLength;
4350     int32_t     inputCapacity = 0;
4351 
4352     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4353 
4354     //
4355     //  Open and read the test data file.
4356     //
4357     srcPath=getPath(tdd, "re_tests.txt");
4358     if(srcPath==NULL) {
4359         return; /* something went wrong, error already output */
4360     }
4361 
4362     int32_t    len;
4363     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4364     if (U_FAILURE(status)) {
4365         return; /* something went wrong, error already output */
4366     }
4367 
4368     //
4369     //  Put the test data into a UnicodeString
4370     //
4371     UnicodeString testDataString(FALSE, testData, len);
4372 
4373     //
4374     //  Regex to break the input file into lines, and strip the new lines.
4375     //     One line per match, capture group one is the desired data.
4376     //
4377     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4378     if (U_FAILURE(status)) {
4379         dataerrln("RegexPattern::compile() error");
4380         return;
4381     }
4382     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4383 
4384     //
4385     //  Regex to split a test file line into fields.
4386     //    There are six fields, separated by tabs.
4387     //
4388     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4389 
4390     //
4391     //  Regex to identify test patterns with flag settings, and to separate them.
4392     //    Test patterns with flags look like 'pattern'i
4393     //    Test patterns without flags are not quoted:   pattern
4394     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4395     //
4396     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4397     RegexMatcher* flagMat = flagPat->matcher(status);
4398 
4399     //
4400     // The Perl tests reference several perl-isms, which are evaluated/substituted
4401     //   in the test data.  Not being perl, this must be done explicitly.  Here
4402     //   are string constants and REs for these constructs.
4403     //
4404     UnicodeString nulnulSrc("${nulnul}");
4405     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4406     nulnul = nulnul.unescape();
4407 
4408     UnicodeString ffffSrc("${ffff}");
4409     UnicodeString ffff("\\uffff", -1, US_INV);
4410     ffff = ffff.unescape();
4411 
4412     //  regexp for $-[0], $+[2], etc.
4413     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4414     RegexMatcher *groupsMat = groupsPat->matcher(status);
4415 
4416     //  regexp for $0, $1, $2, etc.
4417     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4418     RegexMatcher *cgMat = cgPat->matcher(status);
4419 
4420 
4421     //
4422     // Main Loop for the Perl Tests, runs once per line from the
4423     //   test data file.
4424     //
4425     int32_t  lineNum = 0;
4426     int32_t  skippedUnimplementedCount = 0;
4427     while (lineMat->find()) {
4428         lineNum++;
4429 
4430         //
4431         //  Get a line, break it into its fields, do the Perl
4432         //    variable substitutions.
4433         //
4434         UnicodeString line = lineMat->group(1, status);
4435         UnicodeString fields[7];
4436         fieldPat->split(line, fields, 7, status);
4437 
4438         flagMat->reset(fields[0]);
4439         flagMat->matches(status);
4440         UnicodeString pattern  = flagMat->group(2, status);
4441         pattern.findAndReplace("${bang}", "!");
4442         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4443         pattern.findAndReplace(ffffSrc, ffff);
4444 
4445         //
4446         //  Identify patterns that include match flag settings,
4447         //    split off the flags, remove the extra quotes.
4448         //
4449         UnicodeString flagStr = flagMat->group(3, status);
4450         if (U_FAILURE(status)) {
4451             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4452             return;
4453         }
4454         int32_t flags = 0;
4455         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4456         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4457         const UChar UChar_m = 0x6d;
4458         const UChar UChar_x = 0x78;
4459         const UChar UChar_y = 0x79;
4460         if (flagStr.indexOf(UChar_i) != -1) {
4461             flags |= UREGEX_CASE_INSENSITIVE;
4462         }
4463         if (flagStr.indexOf(UChar_m) != -1) {
4464             flags |= UREGEX_MULTILINE;
4465         }
4466         if (flagStr.indexOf(UChar_x) != -1) {
4467             flags |= UREGEX_COMMENTS;
4468         }
4469 
4470         //
4471         // Put the pattern in a UTF-8 UText
4472         //
4473         status = U_ZERO_ERROR;
4474         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4475         if (status == U_BUFFER_OVERFLOW_ERROR) {
4476             status = U_ZERO_ERROR;
4477             delete[] patternChars;
4478             patternCapacity = patternLength + 1;
4479             patternChars = new char[patternCapacity];
4480             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4481         }
4482         utext_openUTF8(&patternText, patternChars, patternLength, &status);
4483 
4484         //
4485         // Compile the test pattern.
4486         //
4487         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4488         if (status == U_REGEX_UNIMPLEMENTED) {
4489             //
4490             // Test of a feature that is planned for ICU, but not yet implemented.
4491             //   skip the test.
4492             skippedUnimplementedCount++;
4493             delete testPat;
4494             status = U_ZERO_ERROR;
4495             continue;
4496         }
4497 
4498         if (U_FAILURE(status)) {
4499             // Some tests are supposed to generate errors.
4500             //   Only report an error for tests that are supposed to succeed.
4501             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4502                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4503             {
4504                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4505             }
4506             status = U_ZERO_ERROR;
4507             delete testPat;
4508             continue;
4509         }
4510 
4511         if (fields[2].indexOf(UChar_i) >= 0) {
4512             // ICU should skip this test.
4513             delete testPat;
4514             continue;
4515         }
4516 
4517         if (fields[2].indexOf(UChar_c) >= 0) {
4518             // This pattern should have caused a compilation error, but didn't/
4519             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4520             delete testPat;
4521             continue;
4522         }
4523 
4524 
4525         //
4526         // replace the Perl variables that appear in some of the
4527         //   match data strings.
4528         //
4529         UnicodeString matchString = fields[1];
4530         matchString.findAndReplace(nulnulSrc, nulnul);
4531         matchString.findAndReplace(ffffSrc,   ffff);
4532 
4533         // Replace any \n in the match string with an actual new-line char.
4534         //  Don't do full unescape, as this unescapes more than Perl does, which
4535         //  causes other spurious failures in the tests.
4536         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4537 
4538         //
4539         // Put the input in a UTF-8 UText
4540         //
4541         status = U_ZERO_ERROR;
4542         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4543         if (status == U_BUFFER_OVERFLOW_ERROR) {
4544             status = U_ZERO_ERROR;
4545             delete[] inputChars;
4546             inputCapacity = inputLength + 1;
4547             inputChars = new char[inputCapacity];
4548             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4549         }
4550         utext_openUTF8(&inputText, inputChars, inputLength, &status);
4551 
4552         //
4553         // Run the test, check for expected match/don't match result.
4554         //
4555         RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4556         UBool found = testMat->find();
4557         UBool expected = FALSE;
4558         if (fields[2].indexOf(UChar_y) >=0) {
4559             expected = TRUE;
4560         }
4561         if (expected != found) {
4562             errln("line %d: Expected %smatch, got %smatch",
4563                 lineNum, expected?"":"no ", found?"":"no " );
4564             continue;
4565         }
4566 
4567         // Don't try to check expected results if there is no match.
4568         //   (Some have stuff in the expected fields)
4569         if (!found) {
4570             delete testMat;
4571             delete testPat;
4572             continue;
4573         }
4574 
4575         //
4576         // Interpret the Perl expression from the fourth field of the data file,
4577         // building up an ICU string from the results of the ICU match.
4578         //   The Perl expression will contain references to the results of
4579         //     a regex match, including the matched string, capture group strings,
4580         //     group starting and ending indicies, etc.
4581         //
4582         UnicodeString resultString;
4583         UnicodeString perlExpr = fields[3];
4584 
4585         while (perlExpr.length() > 0) {
4586             groupsMat->reset(perlExpr);
4587             cgMat->reset(perlExpr);
4588 
4589             if (perlExpr.startsWith("$&")) {
4590                 resultString.append(testMat->group(status));
4591                 perlExpr.remove(0, 2);
4592             }
4593 
4594             else if (groupsMat->lookingAt(status)) {
4595                 // $-[0]   $+[2]  etc.
4596                 UnicodeString digitString = groupsMat->group(2, status);
4597                 int32_t t = 0;
4598                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4599                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4600                 int32_t matchPosition;
4601                 if (plusOrMinus.compare("+") == 0) {
4602                     matchPosition = testMat->end(groupNum, status);
4603                 } else {
4604                     matchPosition = testMat->start(groupNum, status);
4605                 }
4606                 if (matchPosition != -1) {
4607                     ICU_Utility::appendNumber(resultString, matchPosition);
4608                 }
4609                 perlExpr.remove(0, groupsMat->end(status));
4610             }
4611 
4612             else if (cgMat->lookingAt(status)) {
4613                 // $1, $2, $3, etc.
4614                 UnicodeString digitString = cgMat->group(1, status);
4615                 int32_t t = 0;
4616                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4617                 if (U_SUCCESS(status)) {
4618                     resultString.append(testMat->group(groupNum, status));
4619                     status = U_ZERO_ERROR;
4620                 }
4621                 perlExpr.remove(0, cgMat->end(status));
4622             }
4623 
4624             else if (perlExpr.startsWith("@-")) {
4625                 int32_t i;
4626                 for (i=0; i<=testMat->groupCount(); i++) {
4627                     if (i>0) {
4628                         resultString.append(" ");
4629                     }
4630                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4631                 }
4632                 perlExpr.remove(0, 2);
4633             }
4634 
4635             else if (perlExpr.startsWith("@+")) {
4636                 int32_t i;
4637                 for (i=0; i<=testMat->groupCount(); i++) {
4638                     if (i>0) {
4639                         resultString.append(" ");
4640                     }
4641                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4642                 }
4643                 perlExpr.remove(0, 2);
4644             }
4645 
4646             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4647                                                      //           or as an escaped sequence (e.g. \n)
4648                 if (perlExpr.length() > 1) {
4649                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4650                 }
4651                 UChar c = perlExpr.charAt(0);
4652                 switch (c) {
4653                 case 'n':   c = '\n'; break;
4654                 // add any other escape sequences that show up in the test expected results.
4655                 }
4656                 resultString.append(c);
4657                 perlExpr.remove(0, 1);
4658             }
4659 
4660             else  {
4661                 // Any characters from the perl expression that we don't explicitly
4662                 //  recognize before here are assumed to be literals and copied
4663                 //  as-is to the expected results.
4664                 resultString.append(perlExpr.charAt(0));
4665                 perlExpr.remove(0, 1);
4666             }
4667 
4668             if (U_FAILURE(status)) {
4669                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4670                 break;
4671             }
4672         }
4673 
4674         //
4675         // Expected Results Compare
4676         //
4677         UnicodeString expectedS(fields[4]);
4678         expectedS.findAndReplace(nulnulSrc, nulnul);
4679         expectedS.findAndReplace(ffffSrc,   ffff);
4680         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4681 
4682 
4683         if (expectedS.compare(resultString) != 0) {
4684             err("Line %d: Incorrect perl expression results.", lineNum);
4685             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4686         }
4687 
4688         delete testMat;
4689         delete testPat;
4690     }
4691 
4692     //
4693     // All done.  Clean up allocated stuff.
4694     //
4695     delete cgMat;
4696     delete cgPat;
4697 
4698     delete groupsMat;
4699     delete groupsPat;
4700 
4701     delete flagMat;
4702     delete flagPat;
4703 
4704     delete lineMat;
4705     delete linePat;
4706 
4707     delete fieldPat;
4708     delete [] testData;
4709 
4710     utext_close(&patternText);
4711     utext_close(&inputText);
4712 
4713     delete [] patternChars;
4714     delete [] inputChars;
4715 
4716 
4717     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4718 
4719 }
4720 
4721 
4722 //--------------------------------------------------------------
4723 //
4724 //  Bug6149   Verify limits to heap expansion for backtrack stack.
4725 //             Use this pattern,
4726 //                 "(a?){1,8000000}"
4727 //             Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
4728 //                   This test is likely to be fragile, as further optimizations stop
4729 //                   more cases of pointless looping in the match engine.
4730 //
4731 //---------------------------------------------------------------
Bug6149()4732 void RegexTest::Bug6149() {
4733     UnicodeString pattern("(a?){1,8000000}");
4734     UnicodeString s("xyz");
4735     uint32_t flags = 0;
4736     UErrorCode status = U_ZERO_ERROR;
4737 
4738     RegexMatcher  matcher(pattern, s, flags, status);
4739     UBool result = false;
4740     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4741     REGEX_ASSERT(result == FALSE);
4742  }
4743 
4744 
//
//   Callbacks()    Test the callback function.
//                  When set, callbacks occur periodically during matching operations,
//                  giving the application code the ability to abort the operation
//                  before its normal completion.
//
4751 
4752 struct callBackContext {
4753     RegexTest        *test;
4754     int32_t          maxCalls;
4755     int32_t          numCalls;
4756     int32_t          lastSteps;
resetcallBackContext4757     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4758 };
4759 
4760 U_CDECL_BEGIN
4761 static UBool U_CALLCONV
testCallBackFn(const void * context,int32_t steps)4762 testCallBackFn(const void *context, int32_t steps) {
4763     callBackContext  *info = (callBackContext *)context;
4764     if (info->lastSteps+1 != steps) {
4765         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4766     }
4767     info->lastSteps = steps;
4768     info->numCalls++;
4769     return (info->numCalls < info->maxCalls);
4770 }
4771 U_CDECL_END
4772 
// Exercise RegexMatcher::setMatchCallback() / getMatchCallback() and the
// behavior of matches() and find() when the installed callback lets the
// operation continue or asks for it to stop.
// Each scope below declares its own local "status"; REGEX_CHECK_STATUS takes
// no argument, so it presumably inspects that local (macro defined elsewhere
// in this file).
void RegexTest::Callbacks() {
   {
        // Getter returns NULLs if no callback has been set

        //   The variables that the getter will fill in.
        //   Init to non-null values so that the action of the getter can be seen.
        const void          *returnedContext = &returnedContext;
        URegexMatchCallback *returnedFn = &testCallBackFn;

        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher("x", 0, status);
        REGEX_CHECK_STATUS;
        matcher.getMatchCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == NULL);
        REGEX_ASSERT(returnedContext == NULL);
    }

   {
        // Set and Get work
        // cbInfo is filled positionally: {test, maxCalls, numCalls, lastSteps}.
        callBackContext cbInfo = {this, 0, 0, 0};
        // Left uninitialized on purpose; both are written by getMatchCallback() below.
        const void          *returnedContext;
        URegexMatchCallback *returnedFn;
        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
        REGEX_CHECK_STATUS;
        matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
        REGEX_CHECK_STATUS;
        matcher.getMatchCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == testCallBackFn);
        REGEX_ASSERT(returnedContext == &cbInfo);

        // A short-running match shouldn't invoke the callback
        //   (with a limit of 1, even a single invocation would return FALSE,
        //    so the assertion on numCalls == 0 is the meaningful check here).
        status = U_ZERO_ERROR;
        cbInfo.reset(1);
        UnicodeString s = "xxx";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status));
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls == 0);

        // A medium-length match that runs long enough to invoke the
        //   callback, but not so long that the callback aborts it.
        //   With a limit of 4 the callback returns FALSE on its 4th call, so
        //   a clean status here implies it was called between 1 and 3 times.
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status)==FALSE);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls > 0);

        // A longer running match that the callback function will abort.
        //   The 4th callback invocation returns FALSE; the matcher is expected
        //   to report that as U_REGEX_STOPPED_BY_CALLER.
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.matches(status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        REGEX_ASSERT(cbInfo.numCalls == 4);

        // A longer running find that the callback function will abort.
        //   Same input and limit as above, but driven through find().
        status = U_ZERO_ERROR;
        cbInfo.reset(4);
        s = "aaaaaaaaaaaaaaaaaaaaaaab";
        matcher.reset(s);
        REGEX_ASSERT(matcher.find(status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        REGEX_ASSERT(cbInfo.numCalls == 4);
    }


}
4846 
4847 
//
//   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during a find operation
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
//
4854 
4855 struct progressCallBackContext {
4856     RegexTest        *test;
4857     int64_t          lastIndex;
4858     int32_t          maxCalls;
4859     int32_t          numCalls;
resetprogressCallBackContext4860     void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4861 };
4862 
4863 // call-back function for find().
4864 // Return TRUE to continue the find().
4865 // Return FALSE to stop the find().
4866 U_CDECL_BEGIN
4867 static UBool U_CALLCONV
testProgressCallBackFn(const void * context,int64_t matchIndex)4868 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4869     progressCallBackContext  *info = (progressCallBackContext *)context;
4870     info->numCalls++;
4871     info->lastIndex = matchIndex;
4872 //    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4873     return (info->numCalls < info->maxCalls);
4874 }
4875 U_CDECL_END
4876 
// Exercise RegexMatcher::setFindProgressCallback() / getFindProgressCallback()
// and the behavior of find() when the installed callback lets the search
// continue or asks for it to stop.
// Each scope below declares its own local "status"; REGEX_CHECK_STATUS takes
// no argument, so it presumably inspects that local (macro defined elsewhere
// in this file).
void RegexTest::FindProgressCallbacks() {
   {
        // Getter returns NULLs if no callback has been set

        //   The variables that the getter will fill in.
        //   Init to non-null values so that the action of the getter can be seen.
        const void                  *returnedContext = &returnedContext;
        URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;

        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher("x", 0, status);
        REGEX_CHECK_STATUS;
        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == NULL);
        REGEX_ASSERT(returnedContext == NULL);
    }

   {
        // Set and Get work
        // cbInfo is filled positionally: {test, lastIndex, maxCalls, numCalls}.
        progressCallBackContext cbInfo = {this, 0, 0, 0};
        // Left uninitialized on purpose; both are written by getFindProgressCallback() below.
        const void                  *returnedContext;
        URegexFindProgressCallback  *returnedFn;
        UErrorCode status = U_ZERO_ERROR;
        RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
        REGEX_CHECK_STATUS;
        matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
        REGEX_CHECK_STATUS;
        matcher.getFindProgressCallback(returnedFn, returnedContext, status);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(returnedFn == testProgressCallBackFn);
        REGEX_ASSERT(returnedContext == &cbInfo);

        // A find that matches on the initial position does NOT invoke the callback.
        status = U_ZERO_ERROR;
        cbInfo.reset(100);
        UnicodeString s = "aaxxx";
        matcher.reset(s);
        // Flip to "#if 1" to trace the match engine while debugging this test.
#if 0
        matcher.setTrace(TRUE);
#endif
        REGEX_ASSERT(matcher.find(0, status));
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls == 0);

        // A medium running find() that causes matcher.find() to invoke our callback for each index,
        //   but not so many times that we interrupt the operation.
        status = U_ZERO_ERROR;
        s = "aaaaaaaaaaaaaaaaaaab";
        cbInfo.reset(s.length()); //  Call limit equal to the input length; the clean status asserted below implies find() finishes before reaching it.
        matcher.reset(s);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_CHECK_STATUS;
        REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);

        // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
        status = U_ZERO_ERROR;
        UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
        cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
        matcher.reset(s1);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);

        // Now a match that will succeed, but after an interruption
        status = U_ZERO_ERROR;
        UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
        cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
        matcher.reset(s2);
        REGEX_ASSERT(matcher.find(0, status)==FALSE);
        REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
        // Now retry the match from where left off
        //   (numCalls is not cleared here; only the limit is raised, and
        //    lastIndex still holds the index from the interrupting callback).
        cbInfo.maxCalls = 100; //  Limit high enough that the callback will not interrupt the retry
        status = U_ZERO_ERROR;
        REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
        REGEX_CHECK_STATUS;
    }


}
4957 
4958 
4959 //---------------------------------------------------------------------------
4960 //
4961 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
4962 //                             UTexts. The pure-C implementation of UText
4963 //                             has no mutable backing stores, but we can
4964 //                             use UnicodeString here to test the functionality.
4965 //
4966 //---------------------------------------------------------------------------
PreAllocatedUTextCAPI()4967 void RegexTest::PreAllocatedUTextCAPI () {
4968     UErrorCode           status = U_ZERO_ERROR;
4969     URegularExpression  *re;
4970     UText                patternText = UTEXT_INITIALIZER;
4971     UnicodeString        buffer;
4972     UText                bufferText = UTEXT_INITIALIZER;
4973 
4974     utext_openUnicodeString(&bufferText, &buffer, &status);
4975 
4976     /*
4977      *  getText() and getUText()
4978      */
4979     {
4980         UText  text1 = UTEXT_INITIALIZER;
4981         UText  text2 = UTEXT_INITIALIZER;
4982         UChar  text2Chars[20];
4983         UText  *resultText;
4984 
4985         status = U_ZERO_ERROR;
4986         regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
4987         regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
4988         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
4989         utext_openUChars(&text2, text2Chars, -1, &status);
4990 
4991         regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
4992         re = uregex_openUText(&patternText, 0, NULL, &status);
4993 
4994         /* First set a UText */
4995         uregex_setUText(re, &text1, &status);
4996         resultText = uregex_getUText(re, &bufferText, &status);
4997         REGEX_CHECK_STATUS;
4998         REGEX_ASSERT(resultText == &bufferText);
4999         utext_setNativeIndex(resultText, 0);
5000         utext_setNativeIndex(&text1, 0);
5001         REGEX_ASSERT(testUTextEqual(resultText, &text1));
5002 
5003         resultText = uregex_getUText(re, &bufferText, &status);
5004         REGEX_CHECK_STATUS;
5005         REGEX_ASSERT(resultText == &bufferText);
5006         utext_setNativeIndex(resultText, 0);
5007         utext_setNativeIndex(&text1, 0);
5008         REGEX_ASSERT(testUTextEqual(resultText, &text1));
5009 
5010         /* Then set a UChar * */
5011         uregex_setText(re, text2Chars, 7, &status);
5012         resultText = uregex_getUText(re, &bufferText, &status);
5013         REGEX_CHECK_STATUS;
5014         REGEX_ASSERT(resultText == &bufferText);
5015         utext_setNativeIndex(resultText, 0);
5016         utext_setNativeIndex(&text2, 0);
5017         REGEX_ASSERT(testUTextEqual(resultText, &text2));
5018 
5019         uregex_close(re);
5020         utext_close(&text1);
5021         utext_close(&text2);
5022     }
5023 
5024     /*
5025      *  group()
5026      */
5027     {
5028         UChar    text1[80];
5029         UText   *actual;
5030         UBool    result;
5031         int64_t  length = 0;
5032 
5033         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  UPRV_LENGTHOF(text1));
5034         //                  012345678901234567890123456789012345678901234567
5035         //                  0         1         2         3         4
5036 
5037         status = U_ZERO_ERROR;
5038         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5039         REGEX_CHECK_STATUS;
5040 
5041         uregex_setText(re, text1, -1, &status);
5042         result = uregex_find(re, 0, &status);
5043         REGEX_ASSERT(result==TRUE);
5044 
5045         /*  Capture Group 0, the full match.  Should succeed. "abc interior def" */
5046         status = U_ZERO_ERROR;
5047         actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5048         REGEX_CHECK_STATUS;
5049         REGEX_ASSERT(actual == &bufferText);
5050         REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5051         REGEX_ASSERT(length == 16);
5052         REGEX_ASSERT(utext_nativeLength(actual) == 47);
5053 
5054         /*  Capture group #1.  Should succeed, matching " interior ". */
5055         status = U_ZERO_ERROR;
5056         actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5057         REGEX_CHECK_STATUS;
5058         REGEX_ASSERT(actual == &bufferText);
5059         REGEX_ASSERT(utext_getNativeIndex(actual) == 9);   // position of " interior "
5060         REGEX_ASSERT(length == 10);
5061         REGEX_ASSERT(utext_nativeLength(actual) == 47);
5062 
5063         /*  Capture group out of range.  Error. */
5064         status = U_ZERO_ERROR;
5065         actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5066         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5067         REGEX_ASSERT(actual == &bufferText);
5068         uregex_close(re);
5069 
5070     }
5071 
5072     /*
5073      *  replaceFirst()
5074      */
5075     {
5076         UChar    text1[80];
5077         UChar    text2[80];
5078         UText    replText = UTEXT_INITIALIZER;
5079         UText   *result;
5080         status = U_ZERO_ERROR;
5081         utext_openUnicodeString(&bufferText, &buffer, &status);
5082 
5083         status = U_ZERO_ERROR;
5084         u_uastrncpy(text1, "Replace xaax x1x x...x.",  UPRV_LENGTHOF(text1));
5085         u_uastrncpy(text2, "No match here.",  UPRV_LENGTHOF(text2)/2);
5086         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5087 
5088         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5089         REGEX_CHECK_STATUS;
5090 
5091         /*  Normal case, with match */
5092         uregex_setText(re, text1, -1, &status);
5093         REGEX_CHECK_STATUS;
5094         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5095         REGEX_CHECK_STATUS;
5096         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5097         REGEX_CHECK_STATUS;
5098         REGEX_ASSERT(result == &bufferText);
5099         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5100 
5101         /* No match.  Text should copy to output with no changes.  */
5102         uregex_setText(re, text2, -1, &status);
5103         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5104         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5105         REGEX_CHECK_STATUS;
5106         REGEX_ASSERT(result == &bufferText);
5107         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5108 
5109         /* Unicode escapes */
5110         uregex_setText(re, text1, -1, &status);
5111         regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5112         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5113         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5114         REGEX_CHECK_STATUS;
5115         REGEX_ASSERT(result == &bufferText);
5116         REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5117 
5118         uregex_close(re);
5119         utext_close(&replText);
5120     }
5121 
5122 
5123     /*
5124      *  replaceAll()
5125      */
5126     {
5127         UChar    text1[80];
5128         UChar    text2[80];
5129         UText    replText = UTEXT_INITIALIZER;
5130         UText   *result;
5131 
5132         status = U_ZERO_ERROR;
5133         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5134         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5135         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5136 
5137         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5138         REGEX_CHECK_STATUS;
5139 
5140         /*  Normal case, with match */
5141         uregex_setText(re, text1, -1, &status);
5142         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5143         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5144         REGEX_CHECK_STATUS;
5145         REGEX_ASSERT(result == &bufferText);
5146         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5147 
5148         /* No match.  Text should copy to output with no changes.  */
5149         uregex_setText(re, text2, -1, &status);
5150         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5151         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5152         REGEX_CHECK_STATUS;
5153         REGEX_ASSERT(result == &bufferText);
5154         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5155 
5156         uregex_close(re);
5157         utext_close(&replText);
5158     }
5159 
5160 
5161     /*
5162      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5163      *   so we don't need to test it here.
5164      */
5165 
5166     utext_close(&bufferText);
5167     utext_close(&patternText);
5168 }
5169 
5170 
5171 //--------------------------------------------------------------
5172 //
5173 //  NamedCapture   Check basic named capture group functionality
5174 //
5175 //--------------------------------------------------------------
NamedCapture()5176 void RegexTest::NamedCapture() {
5177     UErrorCode status = U_ZERO_ERROR;
5178     RegexPattern *pat = RegexPattern::compile(UnicodeString(
5179             "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5180     REGEX_CHECK_STATUS;
5181     int32_t group = pat->groupNumberFromName("five", -1, status);
5182     REGEX_CHECK_STATUS;
5183     REGEX_ASSERT(5 == group);
5184     group = pat->groupNumberFromName("three", -1, status);
5185     REGEX_CHECK_STATUS;
5186     REGEX_ASSERT(3 == group);
5187 
5188     status = U_ZERO_ERROR;
5189     group = pat->groupNumberFromName(UnicodeString("six"), status);
5190     REGEX_CHECK_STATUS;
5191     REGEX_ASSERT(6 == group);
5192 
5193     status = U_ZERO_ERROR;
5194     group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5195     U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5196 
5197     status = U_ZERO_ERROR;
5198 
5199     // After copying a pattern, named capture should still work in the copy.
5200     RegexPattern *copiedPat = new RegexPattern(*pat);
5201     REGEX_ASSERT(*copiedPat == *pat);
5202     delete pat; pat = NULL;  // Delete original, copy should have no references back to it.
5203 
5204     group = copiedPat->groupNumberFromName("five", -1, status);
5205     REGEX_CHECK_STATUS;
5206     REGEX_ASSERT(5 == group);
5207     group = copiedPat->groupNumberFromName("three", -1, status);
5208     REGEX_CHECK_STATUS;
5209     REGEX_ASSERT(3 == group);
5210     delete copiedPat;
5211 
5212     // ReplaceAll with named capture group.
5213     status = U_ZERO_ERROR;
5214     UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5215     RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5216     REGEX_CHECK_STATUS;
5217     // m.pattern().dumpPattern();
5218     UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5219     REGEX_CHECK_STATUS;
5220     REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5221     delete m;
5222 
5223     // ReplaceAll, allowed capture group numbers.
5224     text = UnicodeString("abcmxyz");
5225     m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5226     REGEX_CHECK_STATUS;
5227 
5228     status = U_ZERO_ERROR;
5229     replacedText  = m->replaceAll(UnicodeString("<$0>"), status);   // group 0, full match, is allowed.
5230     REGEX_CHECK_STATUS;
5231     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5232 
5233     status = U_ZERO_ERROR;
5234     replacedText  = m->replaceAll(UnicodeString("<$1>"), status);      // group 1 by number.
5235     REGEX_CHECK_STATUS;
5236     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5237 
5238     status = U_ZERO_ERROR;
5239     replacedText  = m->replaceAll(UnicodeString("<${one}>"), status);   // group 1 by name.
5240     REGEX_CHECK_STATUS;
5241     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5242 
5243     status = U_ZERO_ERROR;
5244     replacedText  = m->replaceAll(UnicodeString("<$2>"), status);   // group 2.
5245     REGEX_CHECK_STATUS;
5246     REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5247 
5248     status = U_ZERO_ERROR;
5249     replacedText  = m->replaceAll(UnicodeString("<$3>"), status);
5250     REGEX_CHECK_STATUS;
5251     REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5252 
5253     status = U_ZERO_ERROR;
5254     replacedText  = m->replaceAll(UnicodeString("<$4>"), status);
5255     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5256 
5257     status = U_ZERO_ERROR;
5258     replacedText  = m->replaceAll(UnicodeString("<$04>"), status);      // group 0, leading 0,
5259     REGEX_CHECK_STATUS;                                                 //    trailing out-of-range 4 passes through.
5260     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5261 
5262     status = U_ZERO_ERROR;
5263     replacedText  = m->replaceAll(UnicodeString("<$000016>"), status);  // Consume leading zeroes. Don't consume digits
5264     REGEX_CHECK_STATUS;                                                 //   that push group num out of range.
5265     REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText);              //   This is group 1.
5266 
5267     status = U_ZERO_ERROR;
5268     replacedText  = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5269     REGEX_CHECK_STATUS;
5270     REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5271 
5272     status = U_ZERO_ERROR;
5273     replacedText  = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5274     REGEX_CHECK_STATUS;
5275     REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5276 
5277     status = U_ZERO_ERROR;
5278     replacedText  = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5279     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5280 
5281     status = U_ZERO_ERROR;
5282     replacedText  = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5283     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5284 
5285     status = U_ZERO_ERROR;
5286     replacedText  = m->replaceAll(UnicodeString("<${one"), status);
5287     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5288 
5289     status = U_ZERO_ERROR;
5290     replacedText  = m->replaceAll(UnicodeString("$not a capture group"), status);
5291     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5292 
5293     delete m;
5294 
5295     // Repeat the above replaceAll() tests using the plain C API, which
5296     //  has a separate implementation internally.
5297     //  TODO: factor out the test data.
5298 
5299     status = U_ZERO_ERROR;
5300     URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5301     REGEX_CHECK_STATUS;
5302     text = UnicodeString("abcmxyz");
5303     uregex_setText(re, text.getBuffer(), text.length(), &status);
5304     REGEX_CHECK_STATUS;
5305 
5306     UChar resultBuf[100];
5307     int32_t resultLength;
5308     UnicodeString repl;
5309 
5310     status = U_ZERO_ERROR;
5311     repl = UnicodeString("<$0>");
5312     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5313     REGEX_CHECK_STATUS;
5314     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5315 
5316     status = U_ZERO_ERROR;
5317     repl = UnicodeString("<$1>");
5318     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5319     REGEX_CHECK_STATUS;
5320     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5321 
5322     status = U_ZERO_ERROR;
5323     repl = UnicodeString("<${one}>");
5324     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5325     REGEX_CHECK_STATUS;
5326     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5327 
5328     status = U_ZERO_ERROR;
5329     repl = UnicodeString("<$2>");
5330     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5331     REGEX_CHECK_STATUS;
5332     REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5333 
5334     status = U_ZERO_ERROR;
5335     repl = UnicodeString("<$3>");
5336     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5337     REGEX_CHECK_STATUS;
5338     REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5339 
5340     status = U_ZERO_ERROR;
5341     repl = UnicodeString("<$4>");
5342     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5343     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5344 
5345     status = U_ZERO_ERROR;
5346     repl = UnicodeString("<$04>");
5347     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5348     REGEX_CHECK_STATUS;
5349     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5350 
5351     status = U_ZERO_ERROR;
5352     repl = UnicodeString("<$000016>");
5353     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5354     REGEX_CHECK_STATUS;
5355     REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5356 
5357     status = U_ZERO_ERROR;
5358     repl = UnicodeString("<$3$2$1${one}>");
5359     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5360     REGEX_CHECK_STATUS;
5361     REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5362 
5363     status = U_ZERO_ERROR;
5364     repl = UnicodeString("$3$2$1${one}");
5365     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5366     REGEX_CHECK_STATUS;
5367     REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5368 
5369     status = U_ZERO_ERROR;
5370     repl = UnicodeString("<${noSuchName}>");
5371     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5372     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5373 
5374     status = U_ZERO_ERROR;
5375     repl = UnicodeString("<${invalid-name}>");
5376     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5377     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5378 
5379     status = U_ZERO_ERROR;
5380     repl = UnicodeString("<${one");
5381     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5382     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5383 
5384     status = U_ZERO_ERROR;
5385     repl = UnicodeString("$not a capture group");
5386     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5387     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5388 
5389     uregex_close(re);
5390 }
5391 
5392 //--------------------------------------------------------------
5393 //
5394 //  NamedCaptureLimits   Patterns with huge numbers of named capture groups.
5395 //                       The point is not so much what the exact limit is,
5396 //                       but that a largish number doesn't hit bad non-linear performance,
5397 //                       and that exceeding the limit fails cleanly.
5398 //
5399 //--------------------------------------------------------------
NamedCaptureLimits()5400 void RegexTest::NamedCaptureLimits() {
5401     if (quick) {
5402         logln("Skipping test. Runs in exhuastive mode only.");
5403         return;
5404     }
5405     const int32_t goodLimit = 1000000;     // Pattern w this many groups builds successfully.
5406     const int32_t failLimit = 10000000;    // Pattern exceeds internal limits, fails to compile.
5407     char nnbuf[100];
5408     UnicodeString pattern;
5409     int32_t nn;
5410 
5411     for (nn=1; nn<goodLimit; nn++) {
5412         sprintf(nnbuf, "(?<nn%d>)", nn);
5413         pattern.append(UnicodeString(nnbuf, -1, US_INV));
5414     }
5415     UErrorCode status = U_ZERO_ERROR;
5416     RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5417     REGEX_CHECK_STATUS;
5418     for (nn=1; nn<goodLimit; nn++) {
5419         sprintf(nnbuf, "nn%d", nn);
5420         int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5421         REGEX_ASSERT(nn == groupNum);
5422         if (nn != groupNum) {
5423             break;
5424         }
5425     }
5426     delete pat;
5427 
5428     pattern.remove();
5429     for (nn=1; nn<failLimit; nn++) {
5430         sprintf(nnbuf, "(?<nn%d>)", nn);
5431         pattern.append(UnicodeString(nnbuf, -1, US_INV));
5432     }
5433     status = U_ZERO_ERROR;
5434     pat = RegexPattern::compile(pattern, 0, status);
5435     REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5436     delete pat;
5437 }
5438 
5439 
5440 //--------------------------------------------------------------
5441 //
5442 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
5443 //
5444 //---------------------------------------------------------------
Bug7651()5445 void RegexTest::Bug7651() {
5446     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5447     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5448     //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5449     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5450     UnicodeString s("#ff @abcd This is test");
5451     RegexPattern  *REPattern = NULL;
5452     RegexMatcher  *REMatcher = NULL;
5453     UErrorCode status = U_ZERO_ERROR;
5454     UParseError pe;
5455 
5456     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5457     REGEX_CHECK_STATUS;
5458     REMatcher = REPattern->matcher(s, status);
5459     REGEX_CHECK_STATUS;
5460     REGEX_ASSERT(REMatcher->find());
5461     REGEX_ASSERT(REMatcher->start(status) == 0);
5462     delete REPattern;
5463     delete REMatcher;
5464     status = U_ZERO_ERROR;
5465 
5466     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5467     REGEX_CHECK_STATUS;
5468     REMatcher = REPattern->matcher(s, status);
5469     REGEX_CHECK_STATUS;
5470     REGEX_ASSERT(REMatcher->find());
5471     REGEX_ASSERT(REMatcher->start(status) == 0);
5472     delete REPattern;
5473     delete REMatcher;
5474     status = U_ZERO_ERROR;
5475  }
5476 
Bug7740()5477 void RegexTest::Bug7740() {
5478     UErrorCode status = U_ZERO_ERROR;
5479     UnicodeString pattern = "(a)";
5480     UnicodeString text = "abcdef";
5481     RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5482     REGEX_CHECK_STATUS;
5483     REGEX_ASSERT(m->lookingAt(status));
5484     REGEX_CHECK_STATUS;
5485     status = U_ILLEGAL_ARGUMENT_ERROR;
5486     UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
5487     REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5488     REGEX_ASSERT(s == "");
5489     delete m;
5490 }
5491 
// Bug 8479:  was crashing with a Bogus UnicodeString as input.
5493 
Bug8479()5494 void RegexTest::Bug8479() {
5495     UErrorCode status = U_ZERO_ERROR;
5496 
5497     RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5498     REGEX_CHECK_STATUS;
5499     if (U_SUCCESS(status))
5500     {
5501         UnicodeString str;
5502         str.setToBogus();
5503         pMatcher->reset(str);
5504         status = U_ZERO_ERROR;
5505         pMatcher->matches(status);
5506         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5507         delete pMatcher;
5508     }
5509 }
5510 
5511 
5512 // Bug 7029
Bug7029()5513 void RegexTest::Bug7029() {
5514     UErrorCode status = U_ZERO_ERROR;
5515 
5516     RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5517     UnicodeString text = "abc.def";
5518     UnicodeString splits[10];
5519     REGEX_CHECK_STATUS;
5520     int32_t numFields = pMatcher->split(text, splits, 10, status);
5521     REGEX_CHECK_STATUS;
5522     REGEX_ASSERT(numFields == 8);
5523     delete pMatcher;
5524 }
5525 
5526 // Bug 9283
//   This test is checking for the existence of any supplemental characters that case-fold
//   to a bmp character.
//
//   At the time of this writing there are none. If any should appear in a subsequent release
//   of Unicode, the code in regular expressions compilation that determines the longest
//   possible match for a literal string  will need to be enhanced.
5533 //
5534 //   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5535 //   for details on what to do in case of a failure of this test.
5536 //
Bug9283()5537 void RegexTest::Bug9283() {
5538 #if !UCONFIG_NO_NORMALIZATION
5539     UErrorCode status = U_ZERO_ERROR;
5540     UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5541     REGEX_CHECK_STATUS;
5542     int32_t index;
5543     UChar32 c;
5544     for (index=0; ; index++) {
5545         c = supplementalsWithCaseFolding.charAt(index);
5546         if (c == -1) {
5547             break;
5548         }
5549         UnicodeString cf = UnicodeString(c).foldCase();
5550         REGEX_ASSERT(cf.length() >= 2);
5551     }
5552 #endif /* #if !UCONFIG_NO_NORMALIZATION */
5553 }
5554 
5555 
CheckInvBufSize()5556 void RegexTest::CheckInvBufSize() {
5557   if(inv_next>=INV_BUFSIZ) {
5558     errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5559           __FILE__, INV_BUFSIZ, inv_next);
5560   } else {
5561     logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5562   }
5563 }
5564 
5565 
Bug10459()5566 void RegexTest::Bug10459() {
5567     UErrorCode status = U_ZERO_ERROR;
5568     UnicodeString patternString("(txt)");
5569     UnicodeString txtString("txt");
5570 
5571     UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5572     REGEX_CHECK_STATUS;
5573     UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5574     REGEX_CHECK_STATUS;
5575 
5576     URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5577     REGEX_CHECK_STATUS;
5578 
5579     uregex_setUText(icu_re, utext_txt, &status);
5580     REGEX_CHECK_STATUS;
5581 
5582     // The bug was that calling uregex_group() before doing a matching operation
5583     //   was causing a segfault. Only for Regular Expressions created from UText.
5584     //   It should set an U_REGEX_INVALID_STATE.
5585 
5586     UChar buf[100];
5587     int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5588     REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5589     REGEX_ASSERT(len == 0);
5590 
5591     uregex_close(icu_re);
5592     utext_close(utext_pat);
5593     utext_close(utext_txt);
5594 }
5595 
TestCaseInsensitiveStarters()5596 void RegexTest::TestCaseInsensitiveStarters() {
5597     // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5598     //  become stale because of new Unicode characters.
5599     // If it is stale, rerun the generation tool
5600     //    svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5601     // and replace the embedded data in i18n/regexcmp.cpp
5602 
5603     for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5604         if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5605             continue;
5606         }
5607         UnicodeSet s(cp, cp);
5608         s.closeOver(USET_CASE_INSENSITIVE);
5609         UnicodeSetIterator setIter(s);
5610         while (setIter.next()) {
5611             if (!setIter.isString()) {
5612                 continue;
5613             }
5614             const UnicodeString &str = setIter.getString();
5615             UChar32 firstChar = str.char32At(0);
5616             UnicodeSet starters;
5617             RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5618             if (!starters.contains(cp)) {
5619                 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5620                 return;
5621             }
5622         }
5623     }
5624 }
5625 
5626 
TestBug11049()5627 void RegexTest::TestBug11049() {
5628     // Original bug report: pattern with match start consisting of one of several individual characters,
5629     //  and the text being matched ending with a supplementary character. find() would read past the
5630     //  end of the input text when searching for potential match starting points.
5631 
5632     // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5633     // detect the bad read.
5634 
5635     TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5636     TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5637 
5638     // Test again with a pattern starting with a single character,
5639     // which takes a different code path than starting with an OR expression,
5640     // but with similar logic.
5641     TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5642     TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5643 }
5644 
5645 // Run a single test case from TestBug11049(). Internal function.
TestCase11049(const char * pattern,const char * data,UBool expectMatch,int32_t lineNumber)5646 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5647     UErrorCode status = U_ZERO_ERROR;
5648     UnicodeString patternString = UnicodeString(pattern).unescape();
5649     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5650 
5651     UnicodeString dataString = UnicodeString(data).unescape();
5652     UChar *exactBuffer = new UChar[dataString.length()];
5653     dataString.extract(exactBuffer, dataString.length(), status);
5654     UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5655 
5656     LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5657     REGEX_CHECK_STATUS;
5658     matcher->reset(ut);
5659     UBool result = matcher->find();
5660     if (result != expectMatch) {
5661         errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5662               __FILE__, lineNumber, expectMatch, result, pattern, data);
5663     }
5664 
5665     // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5666     //   off-by-one on find() with match at the last code point.
5667     //   Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5668     //   because string.unescape() will only shrink it.
5669     char * utf8Buffer = new char[uprv_strlen(data)+1];
5670     u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
5671     REGEX_CHECK_STATUS;
5672     ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5673     REGEX_CHECK_STATUS;
5674     matcher->reset(ut);
5675     result = matcher->find();
5676     if (result != expectMatch) {
5677         errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5678               __FILE__, lineNumber, expectMatch, result, pattern, data);
5679     }
5680     delete [] utf8Buffer;
5681 
5682     utext_close(ut);
5683     delete [] exactBuffer;
5684 }
5685 
5686 
TestBug11371()5687 void RegexTest::TestBug11371() {
5688     if (quick) {
5689         logln("Skipping test. Runs in exhuastive mode only.");
5690         return;
5691     }
5692     UErrorCode status = U_ZERO_ERROR;
5693     UnicodeString patternString;
5694 
5695     for (int i=0; i<8000000; i++) {
5696         patternString.append(UnicodeString("()"));
5697     }
5698     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5699     if (status != U_REGEX_PATTERN_TOO_BIG) {
5700         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5701               __FILE__, __LINE__, u_errorName(status));
5702     }
5703 
5704     status = U_ZERO_ERROR;
5705     patternString = "(";
5706     for (int i=0; i<20000000; i++) {
5707         patternString.append(UnicodeString("A++"));
5708     }
5709     patternString.append(UnicodeString("){0}B++"));
5710     LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5711     if (status != U_REGEX_PATTERN_TOO_BIG) {
5712         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5713               __FILE__, __LINE__, u_errorName(status));
5714     }
5715 
5716     // Pattern with too much string data, such that string indexes overflow operand data field size
5717     // in compiled instruction.
5718     status = U_ZERO_ERROR;
5719     patternString = "";
5720     while (patternString.length() < 0x00ffffff) {
5721         patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5722     }
5723     patternString.append(UnicodeString("X? trailing string"));
5724     LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5725     if (status != U_REGEX_PATTERN_TOO_BIG) {
5726         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5727               __FILE__, __LINE__, u_errorName(status));
5728     }
5729 }
5730 
TestBug11480()5731 void RegexTest::TestBug11480() {
5732     // C API, get capture group of a group that does not participate in the match.
5733     //        (Returns a zero length string, with nul termination,
5734     //         indistinguishable from a group with a zero length match.)
5735 
5736     UErrorCode status = U_ZERO_ERROR;
5737     URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5738     REGEX_CHECK_STATUS;
5739     UnicodeString text = UNICODE_STRING_SIMPLE("A");
5740     uregex_setText(re, text.getBuffer(), text.length(), &status);
5741     REGEX_CHECK_STATUS;
5742     REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5743     UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5744     int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5745     REGEX_ASSERT(length == 0);
5746     REGEX_ASSERT(buf[0] == 13);
5747     REGEX_ASSERT(buf[1] == 0);
5748     REGEX_ASSERT(buf[2] == 13);
5749     uregex_close(re);
5750 
5751     // UText C++ API, length of match is 0 for non-participating matches.
5752     UText ut = UTEXT_INITIALIZER;
5753     utext_openUnicodeString(&ut, &text, &status);
5754     RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5755     REGEX_CHECK_STATUS;
5756     matcher.reset(&ut);
5757     REGEX_ASSERT(matcher.lookingAt(0, status));
5758 
5759     // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5760     int64_t groupLen = -666;
5761     UText group = UTEXT_INITIALIZER;
5762     matcher.group(1, &group, groupLen, status);
5763     REGEX_CHECK_STATUS;
5764     REGEX_ASSERT(groupLen == 1);
5765     REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5766 
5767     // Capture group 2, the (B), does not participate in the match.
5768     matcher.group(2, &group, groupLen, status);
5769     REGEX_CHECK_STATUS;
5770     REGEX_ASSERT(groupLen == 0);
5771     REGEX_ASSERT(matcher.start(2, status) == -1);
5772     REGEX_CHECK_STATUS;
5773 }
5774 
TestBug12884()5775 void RegexTest::TestBug12884() {
5776     // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts}
5777     UnicodeString pattern(u"(((((((){120}){11}){11}){11}){80}){11}){4}");
5778     UnicodeString text(u"hello");
5779     UErrorCode status = U_ZERO_ERROR;
5780     RegexMatcher m(pattern, text, 0, status);
5781     REGEX_CHECK_STATUS;
5782     m.setTimeLimit(5, status);
5783     m.find(status);
5784     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5785 
5786     // Non-greedy loops. They take a different code path during matching.
5787     UnicodeString ngPattern(u"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
5788     status = U_ZERO_ERROR;
5789     RegexMatcher ngM(ngPattern, text, 0, status);
5790     REGEX_CHECK_STATUS;
5791     ngM.setTimeLimit(5, status);
5792     ngM.find(status);
5793     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5794 
5795     // UText, wrapping non-UTF-16 text, also takes a different execution path.
5796     const char *text8 = u8"¿Qué es Unicode?  Unicode proporciona un número único para cada"
5797                           "carácter, sin importar la plataforma, sin importar el programa,"
5798                           "sin importar el idioma.";
5799     status = U_ZERO_ERROR;
5800     LocalUTextPointer ut(utext_openUTF8(NULL, text8, -1, &status));
5801     REGEX_CHECK_STATUS;
5802     m.reset(ut.getAlias());
5803     m.find(status);
5804     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5805 
5806     status = U_ZERO_ERROR;
5807     ngM.reset(ut.getAlias());
5808     ngM.find(status);
5809     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5810 }
5811 
// Bug 13631. A find() of a pattern with a zero-length look-behind assertion
//            can cause a read past the end of the input text.
//            The failure is seen when running this test with Clang's Address Sanitizer.
5815 
TestBug13631()5816 void RegexTest::TestBug13631() {
5817     const UChar *pats[] = { u"(?<!^)",
5818                             u"(?<=^)",
5819                             nullptr
5820                           };
5821     for (const UChar **pat=pats; *pat; ++pat) {
5822         UErrorCode status = U_ZERO_ERROR;
5823         UnicodeString upat(*pat);
5824         RegexMatcher matcher(upat, 0, status);
5825         const UChar s =u'a';
5826         UText *ut = utext_openUChars(nullptr, &s, 1, &status);
5827         REGEX_CHECK_STATUS;
5828         matcher.reset(ut);
5829         while (matcher.find()) {
5830         }
5831         utext_close(ut);
5832     }
5833 }
5834 
5835 // Bug 13632 Out of bounds memory reference if a replacement string ends with a '$',
5836 //           where a following group specification would be expected.
5837 //           Failure shows when running the test under Clang's Address Sanitizer.
5838 
TestBug13632()5839 void RegexTest::TestBug13632() {
5840     UErrorCode status = U_ZERO_ERROR;
5841     URegularExpression *re = uregex_openC(" ", 0, nullptr, &status);
5842     const char16_t *sourceString = u"Hello, world.";
5843     uregex_setText(re, sourceString, u_strlen(sourceString), &status);
5844 
5845     const int32_t destCap = 20;
5846     char16_t dest[destCap] = {};
5847     const char16_t replacement[] = {u'x', u'$'};    // Not nul terminated string.
5848     uregex_replaceAll(re, replacement, 2, dest, destCap, &status);
5849 
5850     assertEquals("", U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5851     uregex_close(re);
5852 }
5853 
5854 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
5855