1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4  * COPYRIGHT:
5  * Copyright (c) 2002-2016, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  ********************************************************************/
8 
9 //
10 //   regextst.cpp
11 //
12 //      ICU Regular Expressions test, part of intltest.
13 //
14 
15 /*
16      NOTE!!
17 
18      PLEASE be careful about ASCII assumptions in this test.
19      This test is one of the worst repeat offenders.
20      If you have questions, contact someone on the ICU PMC
21      who has access to an EBCDIC system.
22 
23  */
24 
25 #include "intltest.h"
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
27 
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <string.h>
31 
32 #include "unicode/localpointer.h"
33 #include "unicode/regex.h"
34 #include "unicode/stringpiece.h"
35 #include "unicode/uchar.h"
36 #include "unicode/ucnv.h"
37 #include "unicode/uniset.h"
38 #include "unicode/uregex.h"
39 #include "unicode/usetiter.h"
40 #include "unicode/ustring.h"
41 #include "unicode/utext.h"
42 #include "unicode/utf16.h"
43 #include "cstr.h"
44 #include "regextst.h"
45 #include "regexcmp.h"
46 #include "uvector.h"
47 #include "util.h"
48 #include "cmemory.h"
49 #include "cstring.h"
50 #include "uinvchar.h"
51 
52 #define SUPPORT_MUTATING_INPUT_STRING   0
53 
54 //---------------------------------------------------------------------------
55 //
56 //  Test class boilerplate
57 //
58 //---------------------------------------------------------------------------
RegexTest()59 RegexTest::RegexTest()
60 {
61 }
62 
63 
~RegexTest()64 RegexTest::~RegexTest()
65 {
66 }
67 
68 
69 
/**
 * Test-dispatch entry point for this suite.
 *
 * The TESTCASE_AUTO* macros are supplied by intltest.h (not visible in this file);
 * presumably each TESTCASE_AUTO(X) registers member function X under the next
 * consecutive index, so the order of the list below is significant -- TODO confirm.
 *
 * @param index  index of the test case being queried or run
 * @param exec   when true, the suite banner is logged (and, presumably, the selected test is run)
 * @param name   in/out reference handed to the TESTCASE_AUTO machinery
 * @param par    unused
 */
void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
{
    if (exec) logln("TestSuite RegexTest: ");
    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(Basic);
    TESTCASE_AUTO(API_Match);
    TESTCASE_AUTO(API_Replace);
    TESTCASE_AUTO(API_Pattern);
    // Extended is only registered when file I/O is available in this build.
#if !UCONFIG_NO_FILE_IO
    TESTCASE_AUTO(Extended);
#endif
    TESTCASE_AUTO(Errors);
    TESTCASE_AUTO(PerlTests);
    TESTCASE_AUTO(Callbacks);
    TESTCASE_AUTO(FindProgressCallbacks);
    TESTCASE_AUTO(Bug6149);
    TESTCASE_AUTO(UTextBasic);
    TESTCASE_AUTO(API_Match_UTF8);
    TESTCASE_AUTO(API_Replace_UTF8);
    TESTCASE_AUTO(API_Pattern_UTF8);
    TESTCASE_AUTO(PerlTestsUTF8);
    TESTCASE_AUTO(PreAllocatedUTextCAPI);
    TESTCASE_AUTO(Bug7651);
    TESTCASE_AUTO(Bug7740);
    TESTCASE_AUTO(Bug8479);
    TESTCASE_AUTO(Bug7029);
    TESTCASE_AUTO(CheckInvBufSize);
    TESTCASE_AUTO(Bug9283);
    TESTCASE_AUTO(Bug10459);
    TESTCASE_AUTO(TestCaseInsensitiveStarters);
    TESTCASE_AUTO(TestBug11049);
    TESTCASE_AUTO(TestBug11371);
    TESTCASE_AUTO(TestBug11480);
    TESTCASE_AUTO(NamedCapture);
    TESTCASE_AUTO(NamedCaptureLimits);
    TESTCASE_AUTO(TestBug12884);
    TESTCASE_AUTO(TestBug13631);
    TESTCASE_AUTO(TestBug13632);
    TESTCASE_AUTO(TestBug20359);
    TESTCASE_AUTO(TestBug20863);
    TESTCASE_AUTO_END;
}
112 
113 
114 /**
115  * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
116  * into ASCII.
117  * @see utext_openUTF8
118  */
119 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
120 
121 //---------------------------------------------------------------------------
122 //
123 //   Error Checking / Reporting macros used in all of the tests.
124 //
125 //---------------------------------------------------------------------------
126 
utextToPrintable(char * buf,int32_t bufLen,UText * text)127 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
128   int64_t oldIndex = utext_getNativeIndex(text);
129   utext_setNativeIndex(text, 0);
130   char *bufPtr = buf;
131   UChar32 c = utext_next32From(text, 0);
132   while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
133     if (0x000020<=c && c<0x00007e) {
134       *bufPtr = c;
135     } else {
136 #if 0
137       sprintf(bufPtr,"U+%04X", c);
138       bufPtr+= strlen(bufPtr)-1;
139 #else
140       *bufPtr = '%';
141 #endif
142     }
143     bufPtr++;
144     c = UTEXT_NEXT32(text);
145   }
146   *bufPtr = 0;
147 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
148   char *ebuf = (char*)malloc(bufLen);
149   uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
150   uprv_strncpy(buf, ebuf, bufLen);
151   free((void*)ebuf);
152 #endif
153   utext_setNativeIndex(text, oldIndex);
154 }
155 
156 
157 static char ASSERT_BUF[1024];
158 
extractToAssertBuf(const UnicodeString & message)159 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
160   if(message.length()==0) {
161     strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
162   } else {
163     UnicodeString buf;
164     IntlTest::prettify(message,buf);
165     if(buf.length()==0) {
166       strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
167     } else {
168       buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
169       if(ASSERT_BUF[0]==0) {
170         ASSERT_BUF[0]=0;
171         for(int32_t i=0;i<buf.length();i++) {
172           UChar ch = buf[i];
173           sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
174         }
175       }
176     }
177   }
178   ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
179   return ASSERT_BUF;
180 }
181 
// Logs, via logln(), a printable rendering of a UText together with the source
// location and the spelling of the argument expression.  The rendering is produced
// by utextToPrintable() into a 200-byte local buffer.
#define REGEX_VERBOSE_TEXT(text) UPRV_BLOCK_MACRO_BEGIN { \
    char buf[200]; \
    utextToPrintable(buf,UPRV_LENGTHOF(buf),text); \
    logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
} UPRV_BLOCK_MACRO_END

// Requires a UErrorCode variable named `status` in scope.  If it holds a failure,
// reports it with dataerrln() and RETURNS from the enclosing function (which must
// therefore return void).
#define REGEX_CHECK_STATUS UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(status)) { \
        dataerrln("%s:%d: RegexTest failure.  status=%s", \
                  __FILE__, __LINE__, u_errorName(status)); \
        return; \
    } \
} UPRV_BLOCK_MACRO_END

// Reports a test error with errln() when expr evaluates to FALSE.  Unlike
// REGEX_CHECK_STATUS it does not return; execution continues after the report.
#define REGEX_ASSERT(expr) UPRV_BLOCK_MACRO_BEGIN { \
    if ((expr)==FALSE) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
    } \
} UPRV_BLOCK_MACRO_END

// Evaluates expr with a fresh, block-local `status` (which shadows any outer
// variable of that name, so expr is expected to refer to `status`) and reports
// via dataerrln() unless the resulting status equals errcode.
#define REGEX_ASSERT_FAIL(expr, errcode) UPRV_BLOCK_MACRO_BEGIN { \
    UErrorCode status=U_ZERO_ERROR; \
    (expr); \
    if (status!=errcode) { \
        dataerrln("RegexTest failure at line %d.  Expected status=%s, got %s", \
                  __LINE__, u_errorName(errcode), u_errorName(status)); \
    } \
} UPRV_BLOCK_MACRO_END

// Variant of the status check for helper functions: `line` is the caller's line
// number, reported next to the helper's own __LINE__.  Reports with errln() but
// does NOT return.
#define REGEX_CHECK_STATUS_L(line) UPRV_BLOCK_MACRO_BEGIN { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d.  status=%d\n",__LINE__, (line), status); \
    } \
} UPRV_BLOCK_MACRO_END

// Variant of REGEX_ASSERT for helper functions: `line` is the caller's line
// number.  On failure it reports with errln() and RETURNS from the enclosing
// (void) function.
#define REGEX_ASSERT_L(expr, line) UPRV_BLOCK_MACRO_BEGIN { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } \
} UPRV_BLOCK_MACRO_END

// Compares an expected invariant-character C string against a UnicodeString and
// reports with errln() on mismatch; execution continues afterwards.
// expected: const char * , restricted to invariant characters.
// actual: const UnicodeString &
// Note: `expected` and `actual` each appear twice in the expansion (comparison
// and message), so arguments should be free of side effects.
#define REGEX_ASSERT_UNISTR(expected, actual) UPRV_BLOCK_MACRO_BEGIN { \
    if (UnicodeString(expected, -1, US_INV) != (actual)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n",  \
              __FILE__, __LINE__, expected, extractToAssertBuf(actual)); \
    } \
} UPRV_BLOCK_MACRO_END
232 
233 
testUTextEqual(UText * uta,UText * utb)234 static UBool testUTextEqual(UText *uta, UText *utb) {
235     UChar32 ca = 0;
236     UChar32 cb = 0;
237     utext_setNativeIndex(uta, 0);
238     utext_setNativeIndex(utb, 0);
239     do {
240         ca = utext_next32(uta);
241         cb = utext_next32(utb);
242         if (ca != cb) {
243             break;
244         }
245     } while (ca != U_SENTINEL);
246     return ca == cb;
247 }
248 
249 
250 /**
251  * @param expected expected text in UTF-8 (not platform) codepage
252  */
assertUText(const char * expected,UText * actual,const char * file,int line)253 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
254     UErrorCode status = U_ZERO_ERROR;
255     UText expectedText = UTEXT_INITIALIZER;
256     utext_openUTF8(&expectedText, expected, -1, &status);
257     if(U_FAILURE(status)) {
258       errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
259       return;
260     }
261     if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
262       errln("%s:%d: assertUText:  expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
263       return;
264     }
265     utext_setNativeIndex(actual, 0);
266     if (!testUTextEqual(&expectedText, actual)) {
267         char buf[201 /*21*/];
268         char expectedBuf[201];
269         utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
270         utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
271         errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
272     }
273     utext_close(&expectedText);
274 }
275 /**
276  * @param expected invariant (platform local text) input
277  */
278 
assertUTextInvariant(const char * expected,UText * actual,const char * file,int line)279 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
280     UErrorCode status = U_ZERO_ERROR;
281     UText expectedText = UTEXT_INITIALIZER;
282     regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
283     if(U_FAILURE(status)) {
284       errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
285       return;
286     }
287     utext_setNativeIndex(actual, 0);
288     if (!testUTextEqual(&expectedText, actual)) {
289         char buf[201 /*21*/];
290         char expectedBuf[201];
291         utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
292         utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
293         errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
294     }
295     utext_close(&expectedText);
296 }
297 
/**
 * Asserts that the UText `actual` equals `expected`, where `expected` is a
 * NUL-terminated UTF-8 string.  Forwards to assertUText() with the call site's
 * file and line.
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) assertUText((expected), (actual), __FILE__, __LINE__)
/**
 * Asserts that the UText `actual` equals `expected`, where `expected` is a
 * NUL-terminated string of invariant characters in the platform codepage.
 * Forwards to assertUTextInvariant() with the call site's file and line.
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
306 
/**
 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
 * passed into utext_openUTF8. An error will be given if
 * INV_BUFSIZ is too small.  It's only used on EBCDIC systems
 * (more precisely: it is only compiled in when the charset family is not ASCII).
 */

#define INV_BUFSIZ 2048 /* increase this if too small */

/*
 * Running total of the bytes requested through regextst_openUTF8FromInvariant().
 * It is advanced on every platform; on non-ASCII platforms it is also the offset
 * of the next free byte in inv_buf.
 */
static int64_t inv_next=0;

#if U_CHARSET_FAMILY!=U_ASCII_FAMILY
/* Backing store for the converted strings; space is handed out sequentially and never reused. */
static char inv_buf[INV_BUFSIZ];
#endif
320 
regextst_openUTF8FromInvariant(UText * ut,const char * inv,int64_t length,UErrorCode * status)321 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
322   if(length==-1) length=strlen(inv);
323 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
324   inv_next+=length;
325   return utext_openUTF8(ut, inv, length, status);
326 #else
327   if(inv_next+length+1>INV_BUFSIZ) {
328     fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
329             __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
330     *status = U_MEMORY_ALLOCATION_ERROR;
331     return NULL;
332   }
333 
334   unsigned char *buf = (unsigned char*)inv_buf+inv_next;
335   uprv_aestrncpy(buf, (const uint8_t*)inv, length);
336   inv_next+=length;
337 
338 #if 0
339   fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
340 #endif
341 
342   return utext_openUTF8(ut, (const char*)buf, length, status);
343 #endif
344 }
345 
346 
//---------------------------------------------------------------------------
//
//    REGEX_TESTLM       Macro + invocation functions to simplify writing quick tests
//                       for the lookingAt() and matches() functions.
//
//       usage:
//          REGEX_TESTLM("pattern",  "input text",  lookingAt expected, matches expected);
//
//          The expected results are UBool - TRUE or FALSE.
//          Backslash escapes in the input text are processed with unescape()
//          before matching.  The pattern is passed to the regex compiler as-is.
//
//          Each use runs the check twice: once through doRegexLMTest()
//          (UnicodeString based) and once through doRegexLMTestUTF8()
//          (UTF-8 UText based).  The arguments therefore appear twice in the
//          expansion.
//
//---------------------------------------------------------------------------

#define REGEX_TESTLM(pat, text, looking, match) UPRV_BLOCK_MACRO_BEGIN { \
    doRegexLMTest(pat, text, looking, match, __LINE__); \
    doRegexLMTestUTF8(pat, text, looking, match, __LINE__); \
} UPRV_BLOCK_MACRO_END
365 
doRegexLMTest(const char * pat,const char * text,UBool looking,UBool match,int32_t line)366 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
367     const UnicodeString pattern(pat, -1, US_INV);
368     const UnicodeString inputText(text, -1, US_INV);
369     UErrorCode          status  = U_ZERO_ERROR;
370     UParseError         pe;
371     RegexPattern        *REPattern = NULL;
372     RegexMatcher        *REMatcher = NULL;
373     UBool               retVal     = TRUE;
374 
375     UnicodeString patString(pat, -1, US_INV);
376     REPattern = RegexPattern::compile(patString, 0, pe, status);
377     if (U_FAILURE(status)) {
378         dataerrln("RegexTest failure in RegexPattern::compile() at line %d.  Status = %s",
379             line, u_errorName(status));
380         return FALSE;
381     }
382     if (line==376) { REPattern->dumpPattern();}
383 
384     UnicodeString inputString(inputText);
385     UnicodeString unEscapedInput = inputString.unescape();
386     REMatcher = REPattern->matcher(unEscapedInput, status);
387     if (U_FAILURE(status)) {
388         errln("RegexTest failure in REPattern::matcher() at line %d.  Status = %s\n",
389             line, u_errorName(status));
390         return FALSE;
391     }
392 
393     UBool actualmatch;
394     actualmatch = REMatcher->lookingAt(status);
395     if (U_FAILURE(status)) {
396         errln("RegexTest failure in lookingAt() at line %d.  Status = %s\n",
397             line, u_errorName(status));
398         retVal =  FALSE;
399     }
400     if (actualmatch != looking) {
401         errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
402         retVal = FALSE;
403     }
404 
405     status = U_ZERO_ERROR;
406     actualmatch = REMatcher->matches(status);
407     if (U_FAILURE(status)) {
408         errln("RegexTest failure in matches() at line %d.  Status = %s\n",
409             line, u_errorName(status));
410         retVal = FALSE;
411     }
412     if (actualmatch != match) {
413         errln("RegexTest: wrong return from matches() at line %d.\n", line);
414         retVal = FALSE;
415     }
416 
417     if (retVal == FALSE) {
418         REPattern->dumpPattern();
419     }
420 
421     delete REPattern;
422     delete REMatcher;
423     return retVal;
424 }
425 
426 
doRegexLMTestUTF8(const char * pat,const char * text,UBool looking,UBool match,int32_t line)427 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
428     UText               pattern    = UTEXT_INITIALIZER;
429     int32_t             inputUTF8Length;
430     char                *textChars = NULL;
431     UText               inputText  = UTEXT_INITIALIZER;
432     UErrorCode          status     = U_ZERO_ERROR;
433     UParseError         pe;
434     RegexPattern        *REPattern = NULL;
435     RegexMatcher        *REMatcher = NULL;
436     UBool               retVal     = TRUE;
437 
438     regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
439     REPattern = RegexPattern::compile(&pattern, 0, pe, status);
440     if (U_FAILURE(status)) {
441         dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8).  Status = %s\n",
442             line, u_errorName(status));
443         return FALSE;
444     }
445 
446     UnicodeString inputString(text, -1, US_INV);
447     UnicodeString unEscapedInput = inputString.unescape();
448     LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
449     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
450 
451     inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
452     if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
453         // UTF-8 does not allow unpaired surrogates, so this could actually happen
454         logln("RegexTest unable to convert input to UTF8 at line %d.  Status = %s\n", line, u_errorName(status));
455         return TRUE; // not a failure of the Regex engine
456     }
457     status = U_ZERO_ERROR; // buffer overflow
458     textChars = new char[inputUTF8Length+1];
459     unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
460     utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
461 
462     REMatcher = &REPattern->matcher(status)->reset(&inputText);
463     if (U_FAILURE(status)) {
464         errln("RegexTest failure in REPattern::matcher() at line %d (UTF8).  Status = %s\n",
465             line, u_errorName(status));
466         return FALSE;
467     }
468 
469     UBool actualmatch;
470     actualmatch = REMatcher->lookingAt(status);
471     if (U_FAILURE(status)) {
472         errln("RegexTest failure in lookingAt() at line %d (UTF8).  Status = %s\n",
473             line, u_errorName(status));
474         retVal =  FALSE;
475     }
476     if (actualmatch != looking) {
477         errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
478         retVal = FALSE;
479     }
480 
481     status = U_ZERO_ERROR;
482     actualmatch = REMatcher->matches(status);
483     if (U_FAILURE(status)) {
484         errln("RegexTest failure in matches() at line %d (UTF8).  Status = %s\n",
485             line, u_errorName(status));
486         retVal = FALSE;
487     }
488     if (actualmatch != match) {
489         errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
490         retVal = FALSE;
491     }
492 
493     if (retVal == FALSE) {
494         REPattern->dumpPattern();
495     }
496 
497     delete REPattern;
498     delete REMatcher;
499     utext_close(&inputText);
500     utext_close(&pattern);
501     delete[] textChars;
502     return retVal;
503 }
504 
505 
506 
//---------------------------------------------------------------------------
//
//    REGEX_ERR       Macro + invocation function to simplify writing
//                    regex tests for incorrect patterns
//
//       usage:
//          REGEX_ERR("pattern",   expected error line, column, expected status);
//
//       Forwards to regex_err(), adding the line number of the call site so
//       that failure messages can point at the test that failed.
//
//---------------------------------------------------------------------------
#define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__)
517 
regex_err(const char * pat,int32_t errLine,int32_t errCol,UErrorCode expectedStatus,int32_t line)518 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
519                           UErrorCode expectedStatus, int32_t line) {
520     UnicodeString       pattern(pat);
521 
522     UErrorCode          status         = U_ZERO_ERROR;
523     UParseError         pe;
524     RegexPattern        *callerPattern = NULL;
525 
526     //
527     //  Compile the caller's pattern
528     //
529     UnicodeString patString(pat);
530     callerPattern = RegexPattern::compile(patString, 0, pe, status);
531     if (status != expectedStatus) {
532         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
533     } else {
534         if (status != U_ZERO_ERROR) {
535             if (pe.line != errLine || pe.offset != errCol) {
536                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
537                     line, errLine, errCol, pe.line, pe.offset);
538             }
539         }
540     }
541 
542     delete callerPattern;
543 
544     //
545     //  Compile again, using a UTF-8-based UText
546     //
547     UText patternText = UTEXT_INITIALIZER;
548     regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
549     callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
550     if (status != expectedStatus) {
551         dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
552     } else {
553         if (status != U_ZERO_ERROR) {
554             if (pe.line != errLine || pe.offset != errCol) {
555                 errln("Line %d: incorrect line/offset from UParseError.  Expected %d/%d; got %d/%d.\n",
556                     line, errLine, errCol, pe.line, pe.offset);
557             }
558         }
559     }
560 
561     delete callerPattern;
562     utext_close(&patternText);
563 }
564 
565 
566 
567 //---------------------------------------------------------------------------
568 //
569 //      Basic      Check for basic functionality of regex pattern matching.
570 //                 Avoid the use of REGEX_FIND test macro, which has
571 //                 substantial dependencies on basic Regex functionality.
572 //
573 //---------------------------------------------------------------------------
void RegexTest::Basic() {
    // Every REGEX_TESTLM(pattern, input, lookingAt-expected, matches-expected)
    // line below is checked through both doRegexLMTest() and doRegexLMTestUTF8();
    // failures are reported with the line number of the REGEX_TESTLM use.


//
// Debug - slide failing test cases early
//
// (Disabled scratch area: enable temporarily to run a single case and exit.)
#if 0
    {
        // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
        UParseError pe;
        UErrorCode  status = U_ZERO_ERROR;
        RegexPattern *pattern;
        pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
        pattern->dumpPattern();
        RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
        UBool result = m->find();
        printf("result = %d\n", result);
        // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
        // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
    }
    exit(1);
#endif


    //
    // Pattern with parentheses
    //
    REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE,  FALSE);
    REGEX_TESTLM("st(abc)ring", "stabcring",       TRUE,  TRUE);
    REGEX_TESTLM("st(abc)ring", "stabcrung",       FALSE, FALSE);

    //
    // Patterns with *
    //
    REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
    REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);

    REGEX_TESTLM("a*", "",  TRUE, TRUE);
    REGEX_TESTLM("a*", "b", TRUE, FALSE);


    //
    //  Patterns with "."
    //
    REGEX_TESTLM(".", "abc", TRUE, FALSE);
    REGEX_TESTLM("...", "abc", TRUE, TRUE);
    REGEX_TESTLM("....", "abc", FALSE, FALSE);
    REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
    REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
    REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);

    //
    //  Patterns with * applied to chars at end of literal string
    //
    REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
    REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);

    //
    //  Supplemental chars match as single chars, not a pair of surrogates.
    //
    REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
    REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);


    //
    //  UnicodeSets in the pattern
    //
    REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
    REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
    REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);

    REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
    REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
    REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
    REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE);   // note that * matches 0 occurrences.
    REGEX_TESTLM("[a][b][[:Zs:]]*", "ab   ", TRUE, TRUE);

    //
    //   OR operator in patterns
    //
    REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
    REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
    REGEX_TESTLM("a|b", "b", TRUE, TRUE);

    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
    REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
    REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
    REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);

    //
    //  +
    //
    REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
    REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
    REGEX_TESTLM("b+", "", FALSE, FALSE);
    REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
    REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
    REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);

    //
    //   ?
    //
    REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
    REGEX_TESTLM("ab?", "a", TRUE, TRUE);
    REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
    REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
    REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
    REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);

    //
    //  Escape sequences that become single literal chars, handled internally
    //   by ICU's Unescape.
    //

    // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE);      // Octal     TODO: not implemented yet.
    REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE);        // BEL
    REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE);       // Control-L
    REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE);        // Escape
    REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE);        // Form Feed
    REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE);        // new line
    REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE);        //  CR
    REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE);        // Tab
    REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
    REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);

    REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE);  //  \A matches only at the beginning of input
    REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE);  //  \A matches only at the beginning of input

    // Escape of special chars in patterns
    REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
}
720 
721 
722 //---------------------------------------------------------------------------
723 //
724 //    UTextBasic   Check for quirks that are specific to the UText
725 //                 implementation.
726 //
727 //---------------------------------------------------------------------------
UTextBasic()728 void RegexTest::UTextBasic() {
729     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
730     UErrorCode status = U_ZERO_ERROR;
731     UText pattern = UTEXT_INITIALIZER;
732     utext_openUTF8(&pattern, str_abc, -1, &status);
733     RegexMatcher matcher(&pattern, 0, status);
734     REGEX_CHECK_STATUS;
735 
736     UText input = UTEXT_INITIALIZER;
737     utext_openUTF8(&input, str_abc, -1, &status);
738     REGEX_CHECK_STATUS;
739     matcher.reset(&input);
740     REGEX_CHECK_STATUS;
741     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
742 
743     matcher.reset(matcher.inputText());
744     REGEX_CHECK_STATUS;
745     REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
746 
747     utext_close(&pattern);
748     utext_close(&input);
749 }
750 
751 
752 //---------------------------------------------------------------------------
753 //
754 //      API_Match   Test that the API for class RegexMatcher
755 //                  is present and nominally working, but excluding functions
756 //                  implementing replace operations.
757 //
758 //---------------------------------------------------------------------------
API_Match()759 void RegexTest::API_Match() {
760     UParseError         pe;
761     UErrorCode          status=U_ZERO_ERROR;
762     int32_t             flags = 0;
763 
764     //
765     // Debug - slide failing test cases early
766     //
767 #if 0
768     {
769     }
770     return;
771 #endif
772 
773     //
774     // Simple pattern compilation
775     //
776     {
777         UnicodeString       re("abc");
778         RegexPattern        *pat2;
779         pat2 = RegexPattern::compile(re, flags, pe, status);
780         REGEX_CHECK_STATUS;
781 
782         UnicodeString inStr1 = "abcdef this is a test";
783         UnicodeString instr2 = "not abc";
784         UnicodeString empty  = "";
785 
786 
787         //
788         // Matcher creation and reset.
789         //
790         RegexMatcher *m1 = pat2->matcher(inStr1, status);
791         REGEX_CHECK_STATUS;
792         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
793         REGEX_ASSERT(m1->input() == inStr1);
794         m1->reset(instr2);
795         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
796         REGEX_ASSERT(m1->input() == instr2);
797         m1->reset(inStr1);
798         REGEX_ASSERT(m1->input() == inStr1);
799         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
800         m1->reset(empty);
801         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
802         REGEX_ASSERT(m1->input() == empty);
803         REGEX_ASSERT(&m1->pattern() == pat2);
804 
805         //
806         //  reset(pos, status)
807         //
808         m1->reset(inStr1);
809         m1->reset(4, status);
810         REGEX_CHECK_STATUS;
811         REGEX_ASSERT(m1->input() == inStr1);
812         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
813 
814         m1->reset(-1, status);
815         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
816         status = U_ZERO_ERROR;
817 
818         m1->reset(0, status);
819         REGEX_CHECK_STATUS;
820         status = U_ZERO_ERROR;
821 
822         int32_t len = m1->input().length();
823         m1->reset(len-1, status);
824         REGEX_CHECK_STATUS;
825         status = U_ZERO_ERROR;
826 
827         m1->reset(len, status);
828         REGEX_CHECK_STATUS;
829         status = U_ZERO_ERROR;
830 
831         m1->reset(len+1, status);
832         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
833         status = U_ZERO_ERROR;
834 
835         //
836         // match(pos, status)
837         //
838         m1->reset(instr2);
839         REGEX_ASSERT(m1->matches(4, status) == TRUE);
840         m1->reset();
841         REGEX_ASSERT(m1->matches(3, status) == FALSE);
842         m1->reset();
843         REGEX_ASSERT(m1->matches(5, status) == FALSE);
844         REGEX_ASSERT(m1->matches(4, status) == TRUE);
845         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
846         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
847 
848         // Match() at end of string should fail, but should not
849         //  be an error.
850         status = U_ZERO_ERROR;
851         len = m1->input().length();
852         REGEX_ASSERT(m1->matches(len, status) == FALSE);
853         REGEX_CHECK_STATUS;
854 
855         // Match beyond end of string should fail with an error.
856         status = U_ZERO_ERROR;
857         REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
858         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
859 
860         // Successful match at end of string.
861         {
862             status = U_ZERO_ERROR;
863             RegexMatcher m("A?", 0, status);  // will match zero length string.
864             REGEX_CHECK_STATUS;
865             m.reset(inStr1);
866             len = inStr1.length();
867             REGEX_ASSERT(m.matches(len, status) == TRUE);
868             REGEX_CHECK_STATUS;
869             m.reset(empty);
870             REGEX_ASSERT(m.matches(0, status) == TRUE);
871             REGEX_CHECK_STATUS;
872         }
873 
874 
875         //
876         // lookingAt(pos, status)
877         //
878         status = U_ZERO_ERROR;
879         m1->reset(instr2);  // "not abc"
880         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
881         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
882         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
883         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
884         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
885         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
886         status = U_ZERO_ERROR;
887         len = m1->input().length();
888         REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
889         REGEX_CHECK_STATUS;
890         REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
891         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
892 
893         delete m1;
894         delete pat2;
895     }
896 
897 
898     //
899     // Capture Group.
900     //     RegexMatcher::start();
901     //     RegexMatcher::end();
902     //     RegexMatcher::groupCount();
903     //
904     {
905         int32_t             flags=0;
906         UParseError         pe;
907         UErrorCode          status=U_ZERO_ERROR;
908 
909         UnicodeString       re("01(23(45)67)(.*)");
910         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
911         REGEX_CHECK_STATUS;
912         UnicodeString data = "0123456789";
913 
914         RegexMatcher *matcher = pat->matcher(data, status);
915         REGEX_CHECK_STATUS;
916         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
917         static const int32_t matchStarts[] = {0,  2, 4, 8};
918         static const int32_t matchEnds[]   = {10, 8, 6, 10};
919         int32_t i;
920         for (i=0; i<4; i++) {
921             int32_t actualStart = matcher->start(i, status);
922             REGEX_CHECK_STATUS;
923             if (actualStart != matchStarts[i]) {
924                 errln("RegexTest failure at line %d, index %d.  Expected %d, got %d\n",
925                     __LINE__, i, matchStarts[i], actualStart);
926             }
927             int32_t actualEnd = matcher->end(i, status);
928             REGEX_CHECK_STATUS;
929             if (actualEnd != matchEnds[i]) {
930                 errln("RegexTest failure at line %d index %d.  Expected %d, got %d\n",
931                     __LINE__, i, matchEnds[i], actualEnd);
932             }
933         }
934 
935         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
936         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
937 
938         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
939         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
940         matcher->reset();
941         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
942 
943         matcher->lookingAt(status);
944         REGEX_ASSERT(matcher->group(status)    == "0123456789");
945         REGEX_ASSERT(matcher->group(0, status) == "0123456789");
946         REGEX_ASSERT(matcher->group(1, status) == "234567"    );
947         REGEX_ASSERT(matcher->group(2, status) == "45"        );
948         REGEX_ASSERT(matcher->group(3, status) == "89"        );
949         REGEX_CHECK_STATUS;
950         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
951         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
952         matcher->reset();
953         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
954 
955         delete matcher;
956         delete pat;
957 
958     }
959 
960     //
961     //  find
962     //
963     {
964         int32_t             flags=0;
965         UParseError         pe;
966         UErrorCode          status=U_ZERO_ERROR;
967 
968         UnicodeString       re("abc");
969         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
970         REGEX_CHECK_STATUS;
971         UnicodeString data = ".abc..abc...abc..";
972         //                    012345678901234567
973 
974         RegexMatcher *matcher = pat->matcher(data, status);
975         REGEX_CHECK_STATUS;
976         REGEX_ASSERT(matcher->find());
977         REGEX_ASSERT(matcher->start(status) == 1);
978         REGEX_ASSERT(matcher->find());
979         REGEX_ASSERT(matcher->start(status) == 6);
980         REGEX_ASSERT(matcher->find());
981         REGEX_ASSERT(matcher->start(status) == 12);
982         REGEX_ASSERT(matcher->find() == FALSE);
983         REGEX_ASSERT(matcher->find() == FALSE);
984 
985         matcher->reset();
986         REGEX_ASSERT(matcher->find());
987         REGEX_ASSERT(matcher->start(status) == 1);
988 
989         REGEX_ASSERT(matcher->find(0, status));
990         REGEX_ASSERT(matcher->start(status) == 1);
991         REGEX_ASSERT(matcher->find(1, status));
992         REGEX_ASSERT(matcher->start(status) == 1);
993         REGEX_ASSERT(matcher->find(2, status));
994         REGEX_ASSERT(matcher->start(status) == 6);
995         REGEX_ASSERT(matcher->find(12, status));
996         REGEX_ASSERT(matcher->start(status) == 12);
997         REGEX_ASSERT(matcher->find(13, status) == FALSE);
998         REGEX_ASSERT(matcher->find(16, status) == FALSE);
999         REGEX_ASSERT(matcher->find(17, status) == FALSE);
1000         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1001 
1002         status = U_ZERO_ERROR;
1003         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1004         status = U_ZERO_ERROR;
1005         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1006 
1007         REGEX_ASSERT(matcher->groupCount() == 0);
1008 
1009         delete matcher;
1010         delete pat;
1011     }
1012 
1013 
1014     //
1015     //  find, with \G in pattern (true if at the end of a previous match).
1016     //
1017     {
1018         int32_t             flags=0;
1019         UParseError         pe;
1020         UErrorCode          status=U_ZERO_ERROR;
1021 
1022         UnicodeString       re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1023         RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1024         REGEX_CHECK_STATUS;
1025         UnicodeString data = ".abcabc.abc..";
1026         //                    012345678901234567
1027 
1028         RegexMatcher *matcher = pat->matcher(data, status);
1029         REGEX_CHECK_STATUS;
1030         REGEX_ASSERT(matcher->find());
1031         REGEX_ASSERT(matcher->start(status) == 0);
1032         REGEX_ASSERT(matcher->start(1, status) == -1);
1033         REGEX_ASSERT(matcher->start(2, status) == 1);
1034 
1035         REGEX_ASSERT(matcher->find());
1036         REGEX_ASSERT(matcher->start(status) == 4);
1037         REGEX_ASSERT(matcher->start(1, status) == 4);
1038         REGEX_ASSERT(matcher->start(2, status) == -1);
1039         REGEX_CHECK_STATUS;
1040 
1041         delete matcher;
1042         delete pat;
1043     }
1044 
1045     //
1046     //   find with zero length matches, match position should bump ahead
1047     //     to prevent loops.
1048     //
1049     {
1050         int32_t                 i;
1051         UErrorCode          status=U_ZERO_ERROR;
1052         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
1053                                                       //   using an always-true look-ahead.
1054         REGEX_CHECK_STATUS;
1055         UnicodeString s("    ");
1056         m.reset(s);
1057         for (i=0; ; i++) {
1058             if (m.find() == FALSE) {
1059                 break;
1060             }
1061             REGEX_ASSERT(m.start(status) == i);
1062             REGEX_ASSERT(m.end(status) == i);
1063         }
1064         REGEX_ASSERT(i==5);
1065 
1066         // Check that the bump goes over surrogate pairs OK
1067         s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1068         s = s.unescape();
1069         m.reset(s);
1070         for (i=0; ; i+=2) {
1071             if (m.find() == FALSE) {
1072                 break;
1073             }
1074             REGEX_ASSERT(m.start(status) == i);
1075             REGEX_ASSERT(m.end(status) == i);
1076         }
1077         REGEX_ASSERT(i==10);
1078     }
1079     {
1080         // find() loop breaking test.
1081         //        with pattern of /.?/, should see a series of one char matches, then a single
1082         //        match of zero length at the end of the input string.
1083         int32_t                 i;
1084         UErrorCode          status=U_ZERO_ERROR;
1085         RegexMatcher        m(".?", 0, status);
1086         REGEX_CHECK_STATUS;
1087         UnicodeString s("    ");
1088         m.reset(s);
1089         for (i=0; ; i++) {
1090             if (m.find() == FALSE) {
1091                 break;
1092             }
1093             REGEX_ASSERT(m.start(status) == i);
1094             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1095         }
1096         REGEX_ASSERT(i==5);
1097     }
1098 
1099 
1100     //
1101     // Matchers with no input string behave as if they had an empty input string.
1102     //
1103 
1104     {
1105         UErrorCode status = U_ZERO_ERROR;
1106         RegexMatcher  m(".?", 0, status);
1107         REGEX_CHECK_STATUS;
1108         REGEX_ASSERT(m.find());
1109         REGEX_ASSERT(m.start(status) == 0);
1110         REGEX_ASSERT(m.input() == "");
1111     }
1112     {
1113         UErrorCode status = U_ZERO_ERROR;
1114         RegexPattern  *p = RegexPattern::compile(".", 0, status);
1115         RegexMatcher  *m = p->matcher(status);
1116         REGEX_CHECK_STATUS;
1117 
1118         REGEX_ASSERT(m->find() == FALSE);
1119         REGEX_ASSERT(m->input() == "");
1120         delete m;
1121         delete p;
1122     }
1123 
1124     //
1125     // Regions
1126     //
1127     {
1128         UErrorCode status = U_ZERO_ERROR;
1129         UnicodeString testString("This is test data");
1130         RegexMatcher m(".*", testString,  0, status);
1131         REGEX_CHECK_STATUS;
1132         REGEX_ASSERT(m.regionStart() == 0);
1133         REGEX_ASSERT(m.regionEnd() == testString.length());
1134         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1135         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1136 
1137         m.region(2,4, status);
1138         REGEX_CHECK_STATUS;
1139         REGEX_ASSERT(m.matches(status));
1140         REGEX_ASSERT(m.start(status)==2);
1141         REGEX_ASSERT(m.end(status)==4);
1142         REGEX_CHECK_STATUS;
1143 
1144         m.reset();
1145         REGEX_ASSERT(m.regionStart() == 0);
1146         REGEX_ASSERT(m.regionEnd() == testString.length());
1147 
1148         UnicodeString shorterString("short");
1149         m.reset(shorterString);
1150         REGEX_ASSERT(m.regionStart() == 0);
1151         REGEX_ASSERT(m.regionEnd() == shorterString.length());
1152 
1153         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1154         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1155         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1156         REGEX_ASSERT(&m == &m.reset());
1157         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1158 
1159         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1160         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1161         REGEX_ASSERT(&m == &m.reset());
1162         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1163 
1164         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1165         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1166         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1167         REGEX_ASSERT(&m == &m.reset());
1168         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1169 
1170         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1171         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1172         REGEX_ASSERT(&m == &m.reset());
1173         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1174 
1175     }
1176 
1177     //
1178     // hitEnd() and requireEnd()
1179     //
1180     {
1181         UErrorCode status = U_ZERO_ERROR;
1182         UnicodeString testString("aabb");
1183         RegexMatcher m1(".*", testString,  0, status);
1184         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1185         REGEX_ASSERT(m1.hitEnd() == TRUE);
1186         REGEX_ASSERT(m1.requireEnd() == FALSE);
1187         REGEX_CHECK_STATUS;
1188 
1189         status = U_ZERO_ERROR;
1190         RegexMatcher m2("a*", testString, 0, status);
1191         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1192         REGEX_ASSERT(m2.hitEnd() == FALSE);
1193         REGEX_ASSERT(m2.requireEnd() == FALSE);
1194         REGEX_CHECK_STATUS;
1195 
1196         status = U_ZERO_ERROR;
1197         RegexMatcher m3(".*$", testString, 0, status);
1198         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1199         REGEX_ASSERT(m3.hitEnd() == TRUE);
1200         REGEX_ASSERT(m3.requireEnd() == TRUE);
1201         REGEX_CHECK_STATUS;
1202     }
1203 
1204 
1205     //
1206     // Compilation error on reset with UChar *
1207     //   These were a hazard that people were stumbling over with runtime errors.
1208     //   Changed them to compiler errors by adding private methods that more closely
1209     //   matched the incorrect use of the functions.
1210     //
1211 #if 0
1212     {
1213         UErrorCode status = U_ZERO_ERROR;
1214         UChar ucharString[20];
1215         RegexMatcher m(".", 0, status);
1216         m.reset(ucharString);  // should not compile.
1217 
1218         RegexPattern *p = RegexPattern::compile(".", 0, status);
1219         RegexMatcher *m2 = p->matcher(ucharString, status);    //  should not compile.
1220 
1221         RegexMatcher m3(".", ucharString, 0, status);  //  Should not compile
1222     }
1223 #endif
1224 
1225     //
1226     //  Time Outs.
1227     //       Note:  These tests will need to be changed when the regexp engine is
1228     //              able to detect and cut short the exponential time behavior on
1229     //              this type of match.
1230     //
1231     {
1232         UErrorCode status = U_ZERO_ERROR;
1233         //    Enough 'a's in the string to cause the match to time out.
        //       (Each additional 'a' doubles the time)
1235         UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1236         RegexMatcher matcher("(a+)+b", testString, 0, status);
1237         REGEX_CHECK_STATUS;
1238         REGEX_ASSERT(matcher.getTimeLimit() == 0);
1239         matcher.setTimeLimit(100, status);
1240         REGEX_ASSERT(matcher.getTimeLimit() == 100);
1241         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1242         REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1243     }
1244     {
1245         UErrorCode status = U_ZERO_ERROR;
1246         //   Few enough 'a's to slip in under the time limit.
1247         UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1248         RegexMatcher matcher("(a+)+b", testString, 0, status);
1249         REGEX_CHECK_STATUS;
1250         matcher.setTimeLimit(100, status);
1251         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1252         REGEX_CHECK_STATUS;
1253     }
1254 
1255     //
1256     //  Stack Limits
1257     //
1258     {
1259         UErrorCode status = U_ZERO_ERROR;
1260         UnicodeString testString(1000000, 0x41, 1000000);  // Length 1,000,000, filled with 'A'
1261 
1262         // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1263         //   of the '+', and makes the stack frames larger.
1264         RegexMatcher matcher("(A)+A$", testString, 0, status);
1265 
1266         // With the default stack, this match should fail to run
1267         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1268         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1269 
1270         // With unlimited stack, it should run
1271         status = U_ZERO_ERROR;
1272         matcher.setStackLimit(0, status);
1273         REGEX_CHECK_STATUS;
1274         REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1275         REGEX_CHECK_STATUS;
1276         REGEX_ASSERT(matcher.getStackLimit() == 0);
1277 
        // With a limited stack, the match should fail
1279         status = U_ZERO_ERROR;
1280         matcher.setStackLimit(10000, status);
1281         REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1282         REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1283         REGEX_ASSERT(matcher.getStackLimit() == 10000);
1284     }
1285 
1286         // A pattern that doesn't save state should work with
1287         //   a minimal sized stack
1288     {
1289         UErrorCode status = U_ZERO_ERROR;
1290         UnicodeString testString = "abc";
1291         RegexMatcher matcher("abc", testString, 0, status);
1292         REGEX_CHECK_STATUS;
1293         matcher.setStackLimit(30, status);
1294         REGEX_CHECK_STATUS;
1295         REGEX_ASSERT(matcher.matches(status) == TRUE);
1296         REGEX_CHECK_STATUS;
1297         REGEX_ASSERT(matcher.getStackLimit() == 30);
1298 
1299         // Negative stack sizes should fail
1300         status = U_ZERO_ERROR;
1301         matcher.setStackLimit(1000, status);
1302         REGEX_CHECK_STATUS;
1303         matcher.setStackLimit(-1, status);
1304         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1305         REGEX_ASSERT(matcher.getStackLimit() == 1000);
1306     }
1307 
1308 
1309 }
1310 
1311 
1312 
1313 
1314 
1315 
1316 //---------------------------------------------------------------------------
1317 //
1318 //      API_Replace        API test for class RegexMatcher, testing the
1319 //                         Replace family of functions.
1320 //
1321 //---------------------------------------------------------------------------
API_Replace()1322 void RegexTest::API_Replace() {
1323     //
1324     //  Replace
1325     //
1326     int32_t             flags=0;
1327     UParseError         pe;
1328     UErrorCode          status=U_ZERO_ERROR;
1329 
1330     UnicodeString       re("abc");
1331     RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1332     REGEX_CHECK_STATUS;
1333     UnicodeString data = ".abc..abc...abc..";
1334     //                    012345678901234567
1335     RegexMatcher *matcher = pat->matcher(data, status);
1336 
1337     //
1338     //  Plain vanilla matches.
1339     //
1340     UnicodeString  dest;
1341     dest = matcher->replaceFirst("yz", status);
1342     REGEX_CHECK_STATUS;
1343     REGEX_ASSERT(dest == ".yz..abc...abc..");
1344 
1345     dest = matcher->replaceAll("yz", status);
1346     REGEX_CHECK_STATUS;
1347     REGEX_ASSERT(dest == ".yz..yz...yz..");
1348 
1349     //
1350     //  Plain vanilla non-matches.
1351     //
1352     UnicodeString d2 = ".abx..abx...abx..";
1353     matcher->reset(d2);
1354     dest = matcher->replaceFirst("yz", status);
1355     REGEX_CHECK_STATUS;
1356     REGEX_ASSERT(dest == ".abx..abx...abx..");
1357 
1358     dest = matcher->replaceAll("yz", status);
1359     REGEX_CHECK_STATUS;
1360     REGEX_ASSERT(dest == ".abx..abx...abx..");
1361 
1362     //
1363     // Empty source string
1364     //
1365     UnicodeString d3 = "";
1366     matcher->reset(d3);
1367     dest = matcher->replaceFirst("yz", status);
1368     REGEX_CHECK_STATUS;
1369     REGEX_ASSERT(dest == "");
1370 
1371     dest = matcher->replaceAll("yz", status);
1372     REGEX_CHECK_STATUS;
1373     REGEX_ASSERT(dest == "");
1374 
1375     //
1376     // Empty substitution string
1377     //
1378     matcher->reset(data);              // ".abc..abc...abc.."
1379     dest = matcher->replaceFirst("", status);
1380     REGEX_CHECK_STATUS;
1381     REGEX_ASSERT(dest == "...abc...abc..");
1382 
1383     dest = matcher->replaceAll("", status);
1384     REGEX_CHECK_STATUS;
1385     REGEX_ASSERT(dest == "........");
1386 
1387     //
1388     // match whole string
1389     //
1390     UnicodeString d4 = "abc";
1391     matcher->reset(d4);
1392     dest = matcher->replaceFirst("xyz", status);
1393     REGEX_CHECK_STATUS;
1394     REGEX_ASSERT(dest == "xyz");
1395 
1396     dest = matcher->replaceAll("xyz", status);
1397     REGEX_CHECK_STATUS;
1398     REGEX_ASSERT(dest == "xyz");
1399 
1400     //
1401     // Capture Group, simple case
1402     //
1403     UnicodeString       re2("a(..)");
1404     RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1405     REGEX_CHECK_STATUS;
1406     UnicodeString d5 = "abcdefg";
1407     RegexMatcher *matcher2 = pat2->matcher(d5, status);
1408     REGEX_CHECK_STATUS;
1409     dest = matcher2->replaceFirst("$1$1", status);
1410     REGEX_CHECK_STATUS;
1411     REGEX_ASSERT(dest == "bcbcdefg");
1412 
1413     dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1414     REGEX_CHECK_STATUS;
1415     REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1416 
1417     dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1418     REGEX_ASSERT(U_FAILURE(status));
1419     status = U_ZERO_ERROR;
1420 
1421     UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1422     replacement = replacement.unescape();
1423     dest = matcher2->replaceFirst(replacement, status);
1424     REGEX_CHECK_STATUS;
1425     REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1426 
1427     REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1428 
1429 
1430     //
1431     // Replacement String with \u hex escapes
1432     //
1433     {
1434         UnicodeString  src = "abc 1 abc 2 abc 3";
1435         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1436         matcher->reset(src);
1437         UnicodeString  result = matcher->replaceAll(substitute, status);
1438         REGEX_CHECK_STATUS;
1439         REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1440     }
1441     {
1442         UnicodeString  src = "abc !";
1443         UnicodeString  substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1444         matcher->reset(src);
1445         UnicodeString  result = matcher->replaceAll(substitute, status);
1446         REGEX_CHECK_STATUS;
1447         UnicodeString expected = UnicodeString("--");
1448         expected.append((UChar32)0x10000);
1449         expected.append("-- !");
1450         REGEX_ASSERT(result == expected);
1451     }
1452     // TODO:  need more through testing of capture substitutions.
1453 
1454     // Bug 4057
1455     //
1456     {
1457         status = U_ZERO_ERROR;
1458         UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1459         RegexMatcher m("ss(.*?)ee", 0, status);
1460         REGEX_CHECK_STATUS;
1461         UnicodeString result;
1462 
1463         // Multiple finds do NOT bump up the previous appendReplacement postion.
1464         m.reset(s);
1465         m.find();
1466         m.find();
1467         m.appendReplacement(result, "ooh", status);
1468         REGEX_CHECK_STATUS;
1469         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1470 
1471         // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1472         status = U_ZERO_ERROR;
1473         result.truncate(0);
1474         m.reset(10, status);
1475         m.find();
1476         m.find();
1477         m.appendReplacement(result, "ooh", status);
1478         REGEX_CHECK_STATUS;
1479         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1480 
1481         // find() at interior of string, appendReplacemnt still starts at beginning.
1482         status = U_ZERO_ERROR;
1483         result.truncate(0);
1484         m.reset();
1485         m.find(10, status);
1486         m.find();
1487         m.appendReplacement(result, "ooh", status);
1488         REGEX_CHECK_STATUS;
1489         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1490 
1491         m.appendTail(result);
1492         REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1493 
1494     }
1495 
1496     delete matcher2;
1497     delete pat2;
1498     delete matcher;
1499     delete pat;
1500 }
1501 
1502 
1503 //---------------------------------------------------------------------------
1504 //
1505 //      API_Pattern       Test that the API for class RegexPattern is
1506 //                        present and nominally working.
1507 //
1508 //---------------------------------------------------------------------------
API_Pattern()1509 void RegexTest::API_Pattern() {
1510     RegexPattern        pata;    // Test default constructor to not crash.
1511     RegexPattern        patb;
1512 
1513     REGEX_ASSERT(pata == patb);
1514     REGEX_ASSERT(pata == pata);
1515 
1516     UnicodeString re1("abc[a-l][m-z]");
1517     UnicodeString re2("def");
1518     UErrorCode    status = U_ZERO_ERROR;
1519     UParseError   pe;
1520 
1521     RegexPattern        *pat1 = RegexPattern::compile(re1, 0, pe, status);
1522     RegexPattern        *pat2 = RegexPattern::compile(re2, 0, pe, status);
1523     REGEX_CHECK_STATUS;
1524     REGEX_ASSERT(*pat1 == *pat1);
1525     REGEX_ASSERT(*pat1 != pata);
1526 
1527     // Assign
1528     patb = *pat1;
1529     REGEX_ASSERT(patb == *pat1);
1530 
1531     // Copy Construct
1532     RegexPattern patc(*pat1);
1533     REGEX_ASSERT(patc == *pat1);
1534     REGEX_ASSERT(patb == patc);
1535     REGEX_ASSERT(pat1 != pat2);
1536     patb = *pat2;
1537     REGEX_ASSERT(patb != patc);
1538     REGEX_ASSERT(patb == *pat2);
1539 
1540     // Compile with no flags.
1541     RegexPattern         *pat1a = RegexPattern::compile(re1, pe, status);
1542     REGEX_ASSERT(*pat1a == *pat1);
1543 
1544     REGEX_ASSERT(pat1a->flags() == 0);
1545 
1546     // Compile with different flags should be not equal
1547     RegexPattern        *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1548     REGEX_CHECK_STATUS;
1549 
1550     REGEX_ASSERT(*pat1b != *pat1a);
1551     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1552     REGEX_ASSERT(pat1a->flags() == 0);
1553     delete pat1b;
1554 
1555     // clone
1556     RegexPattern *pat1c = pat1->clone();
1557     REGEX_ASSERT(*pat1c == *pat1);
1558     REGEX_ASSERT(*pat1c != *pat2);
1559 
1560     delete pat1c;
1561     delete pat1a;
1562     delete pat1;
1563     delete pat2;
1564 
1565 
1566     //
1567     //   Verify that a matcher created from a cloned pattern works.
1568     //     (Jitterbug 3423)
1569     //
1570     {
1571         UErrorCode     status     = U_ZERO_ERROR;
1572         RegexPattern  *pSource    = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1573         RegexPattern  *pClone     = pSource->clone();
1574         delete         pSource;
1575         RegexMatcher  *mFromClone = pClone->matcher(status);
1576         REGEX_CHECK_STATUS;
1577         UnicodeString s = "Hello World";
1578         mFromClone->reset(s);
1579         REGEX_ASSERT(mFromClone->find() == TRUE);
1580         REGEX_ASSERT(mFromClone->group(status) == "Hello");
1581         REGEX_ASSERT(mFromClone->find() == TRUE);
1582         REGEX_ASSERT(mFromClone->group(status) == "World");
1583         REGEX_ASSERT(mFromClone->find() == FALSE);
1584         delete mFromClone;
1585         delete pClone;
1586     }
1587 
1588     //
1589     //   matches convenience API
1590     //
1591     REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1592     REGEX_CHECK_STATUS;
1593     REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1594     REGEX_CHECK_STATUS;
1595     REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1596     REGEX_CHECK_STATUS;
1597     REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1598     REGEX_CHECK_STATUS;
1599     REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1600     REGEX_CHECK_STATUS;
1601     status = U_INDEX_OUTOFBOUNDS_ERROR;
1602     REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1603     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1604 
1605 
1606     //
1607     // Split()
1608     //
1609     status = U_ZERO_ERROR;
1610     pat1 = RegexPattern::compile(" +",  pe, status);
1611     REGEX_CHECK_STATUS;
1612     UnicodeString  fields[10];
1613 
1614     int32_t n;
1615     n = pat1->split("Now is the time", fields, 10, status);
1616     REGEX_CHECK_STATUS;
1617     REGEX_ASSERT(n==4);
1618     REGEX_ASSERT(fields[0]=="Now");
1619     REGEX_ASSERT(fields[1]=="is");
1620     REGEX_ASSERT(fields[2]=="the");
1621     REGEX_ASSERT(fields[3]=="time");
1622     REGEX_ASSERT(fields[4]=="");
1623 
1624     n = pat1->split("Now is the time", fields, 2, status);
1625     REGEX_CHECK_STATUS;
1626     REGEX_ASSERT(n==2);
1627     REGEX_ASSERT(fields[0]=="Now");
1628     REGEX_ASSERT(fields[1]=="is the time");
1629     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
1630 
1631     fields[1] = "*";
1632     status = U_ZERO_ERROR;
1633     n = pat1->split("Now is the time", fields, 1, status);
1634     REGEX_CHECK_STATUS;
1635     REGEX_ASSERT(n==1);
1636     REGEX_ASSERT(fields[0]=="Now is the time");
1637     REGEX_ASSERT(fields[1]=="*");
1638     status = U_ZERO_ERROR;
1639 
1640     n = pat1->split("    Now       is the time   ", fields, 10, status);
1641     REGEX_CHECK_STATUS;
1642     REGEX_ASSERT(n==6);
1643     REGEX_ASSERT(fields[0]=="");
1644     REGEX_ASSERT(fields[1]=="Now");
1645     REGEX_ASSERT(fields[2]=="is");
1646     REGEX_ASSERT(fields[3]=="the");
1647     REGEX_ASSERT(fields[4]=="time");
1648     REGEX_ASSERT(fields[5]=="");
1649 
1650     n = pat1->split("     ", fields, 10, status);
1651     REGEX_CHECK_STATUS;
1652     REGEX_ASSERT(n==2);
1653     REGEX_ASSERT(fields[0]=="");
1654     REGEX_ASSERT(fields[1]=="");
1655 
1656     fields[0] = "foo";
1657     n = pat1->split("", fields, 10, status);
1658     REGEX_CHECK_STATUS;
1659     REGEX_ASSERT(n==0);
1660     REGEX_ASSERT(fields[0]=="foo");
1661 
1662     delete pat1;
1663 
1664     //  split, with a pattern with (capture)
1665     pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"),  pe, status);
1666     REGEX_CHECK_STATUS;
1667 
1668     status = U_ZERO_ERROR;
1669     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1670     REGEX_CHECK_STATUS;
1671     REGEX_ASSERT(n==7);
1672     REGEX_ASSERT(fields[0]=="");
1673     REGEX_ASSERT(fields[1]=="a");
1674     REGEX_ASSERT(fields[2]=="Now is ");
1675     REGEX_ASSERT(fields[3]=="b");
1676     REGEX_ASSERT(fields[4]=="the time");
1677     REGEX_ASSERT(fields[5]=="c");
1678     REGEX_ASSERT(fields[6]=="");
1679     REGEX_ASSERT(status==U_ZERO_ERROR);
1680 
1681     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
1682     REGEX_CHECK_STATUS;
1683     REGEX_ASSERT(n==7);
1684     REGEX_ASSERT(fields[0]=="  ");
1685     REGEX_ASSERT(fields[1]=="a");
1686     REGEX_ASSERT(fields[2]=="Now is ");
1687     REGEX_ASSERT(fields[3]=="b");
1688     REGEX_ASSERT(fields[4]=="the time");
1689     REGEX_ASSERT(fields[5]=="c");
1690     REGEX_ASSERT(fields[6]=="");
1691 
1692     status = U_ZERO_ERROR;
1693     fields[6] = "foo";
1694     n = pat1->split("  <a>Now is <b>the time<c>", fields, 6, status);
1695     REGEX_CHECK_STATUS;
1696     REGEX_ASSERT(n==6);
1697     REGEX_ASSERT(fields[0]=="  ");
1698     REGEX_ASSERT(fields[1]=="a");
1699     REGEX_ASSERT(fields[2]=="Now is ");
1700     REGEX_ASSERT(fields[3]=="b");
1701     REGEX_ASSERT(fields[4]=="the time");
1702     REGEX_ASSERT(fields[5]=="");  // All text following "<c>" field delimiter.
1703     REGEX_ASSERT(fields[6]=="foo");
1704 
1705     status = U_ZERO_ERROR;
1706     fields[5] = "foo";
1707     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
1708     REGEX_CHECK_STATUS;
1709     REGEX_ASSERT(n==5);
1710     REGEX_ASSERT(fields[0]=="  ");
1711     REGEX_ASSERT(fields[1]=="a");
1712     REGEX_ASSERT(fields[2]=="Now is ");
1713     REGEX_ASSERT(fields[3]=="b");
1714     REGEX_ASSERT(fields[4]=="the time<c>");
1715     REGEX_ASSERT(fields[5]=="foo");
1716 
1717     status = U_ZERO_ERROR;
1718     fields[5] = "foo";
1719     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
1720     REGEX_CHECK_STATUS;
1721     REGEX_ASSERT(n==5);
1722     REGEX_ASSERT(fields[0]=="  ");
1723     REGEX_ASSERT(fields[1]=="a");
1724     REGEX_ASSERT(fields[2]=="Now is ");
1725     REGEX_ASSERT(fields[3]=="b");
1726     REGEX_ASSERT(fields[4]=="the time");
1727     REGEX_ASSERT(fields[5]=="foo");
1728 
1729     status = U_ZERO_ERROR;
1730     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
1731     REGEX_CHECK_STATUS;
1732     REGEX_ASSERT(n==4);
1733     REGEX_ASSERT(fields[0]=="  ");
1734     REGEX_ASSERT(fields[1]=="a");
1735     REGEX_ASSERT(fields[2]=="Now is ");
1736     REGEX_ASSERT(fields[3]=="the time<c>");
1737     status = U_ZERO_ERROR;
1738     delete pat1;
1739 
1740     pat1 = RegexPattern::compile("([-,])",  pe, status);
1741     REGEX_CHECK_STATUS;
1742     n = pat1->split("1-10,20", fields, 10, status);
1743     REGEX_CHECK_STATUS;
1744     REGEX_ASSERT(n==5);
1745     REGEX_ASSERT(fields[0]=="1");
1746     REGEX_ASSERT(fields[1]=="-");
1747     REGEX_ASSERT(fields[2]=="10");
1748     REGEX_ASSERT(fields[3]==",");
1749     REGEX_ASSERT(fields[4]=="20");
1750     delete pat1;
1751 
1752     // Test split of string with empty trailing fields
1753     pat1 = RegexPattern::compile(",", pe, status);
1754     REGEX_CHECK_STATUS;
1755     n = pat1->split("a,b,c,", fields, 10, status);
1756     REGEX_CHECK_STATUS;
1757     REGEX_ASSERT(n==4);
1758     REGEX_ASSERT(fields[0]=="a");
1759     REGEX_ASSERT(fields[1]=="b");
1760     REGEX_ASSERT(fields[2]=="c");
1761     REGEX_ASSERT(fields[3]=="");
1762 
1763     n = pat1->split("a,,,", fields, 10, status);
1764     REGEX_CHECK_STATUS;
1765     REGEX_ASSERT(n==4);
1766     REGEX_ASSERT(fields[0]=="a");
1767     REGEX_ASSERT(fields[1]=="");
1768     REGEX_ASSERT(fields[2]=="");
1769     REGEX_ASSERT(fields[3]=="");
1770     delete pat1;
1771 
1772     // Split Separator with zero length match.
1773     pat1 = RegexPattern::compile(":?", pe, status);
1774     REGEX_CHECK_STATUS;
1775     n = pat1->split("abc", fields, 10, status);
1776     REGEX_CHECK_STATUS;
1777     REGEX_ASSERT(n==5);
1778     REGEX_ASSERT(fields[0]=="");
1779     REGEX_ASSERT(fields[1]=="a");
1780     REGEX_ASSERT(fields[2]=="b");
1781     REGEX_ASSERT(fields[3]=="c");
1782     REGEX_ASSERT(fields[4]=="");
1783 
1784     delete pat1;
1785 
1786     //
1787     // RegexPattern::pattern()
1788     //
1789     pat1 = new RegexPattern();
1790     REGEX_ASSERT(pat1->pattern() == "");
1791     delete pat1;
1792 
1793     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1794     REGEX_CHECK_STATUS;
1795     REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1796     delete pat1;
1797 
1798 
1799     //
1800     // classID functions
1801     //
1802     pat1 = RegexPattern::compile("(Hello, world)*",  pe, status);
1803     REGEX_CHECK_STATUS;
1804     REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1805     REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1806     UnicodeString Hello("Hello, world.");
1807     RegexMatcher *m = pat1->matcher(Hello, status);
1808     REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1809     REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1810     REGEX_ASSERT(m->getDynamicClassID() != NULL);
1811     delete m;
1812     delete pat1;
1813 
1814 }
1815 
1816 //---------------------------------------------------------------------------
1817 //
1818 //      API_Match_UTF8   Test that the alternate engine for class RegexMatcher
1819 //                       is present and working, but excluding functions
1820 //                       implementing replace operations.
1821 //
1822 //---------------------------------------------------------------------------
API_Match_UTF8()1823 void RegexTest::API_Match_UTF8() {
1824     UParseError         pe;
1825     UErrorCode          status=U_ZERO_ERROR;
1826     int32_t             flags = 0;
1827 
1828     //
1829     // Debug - slide failing test cases early
1830     //
1831 #if 0
1832     {
1833     }
1834     return;
1835 #endif
1836 
1837     //
1838     // Simple pattern compilation
1839     //
1840     {
1841         UText               re = UTEXT_INITIALIZER;
1842         regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1843         REGEX_VERBOSE_TEXT(&re);
1844         RegexPattern        *pat2;
1845         pat2 = RegexPattern::compile(&re, flags, pe, status);
1846         REGEX_CHECK_STATUS;
1847 
1848         UText input1 = UTEXT_INITIALIZER;
1849         UText input2 = UTEXT_INITIALIZER;
1850         UText empty  = UTEXT_INITIALIZER;
1851         regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1852         REGEX_VERBOSE_TEXT(&input1);
1853         regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1854         REGEX_VERBOSE_TEXT(&input2);
1855         utext_openUChars(&empty, NULL, 0, &status);
1856 
1857         int32_t input1Len = static_cast<int32_t>(strlen("abcdef this is a test")); /* TODO: why not nativelen (input1) ? */
1858         int32_t input2Len = static_cast<int32_t>(strlen("not abc"));
1859 
1860 
1861         //
1862         // Matcher creation and reset.
1863         //
1864         RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1865         REGEX_CHECK_STATUS;
1866         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1867         const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1868         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1869         m1->reset(&input2);
1870         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1871         const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1872         REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1873         m1->reset(&input1);
1874         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1875         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1876         m1->reset(&empty);
1877         REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1878         REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1879 
1880         //
1881         //  reset(pos, status)
1882         //
1883         m1->reset(&input1);
1884         m1->reset(4, status);
1885         REGEX_CHECK_STATUS;
1886         REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1887         REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1888 
1889         m1->reset(-1, status);
1890         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1891         status = U_ZERO_ERROR;
1892 
1893         m1->reset(0, status);
1894         REGEX_CHECK_STATUS;
1895         status = U_ZERO_ERROR;
1896 
1897         m1->reset(input1Len-1, status);
1898         REGEX_CHECK_STATUS;
1899         status = U_ZERO_ERROR;
1900 
1901         m1->reset(input1Len, status);
1902         REGEX_CHECK_STATUS;
1903         status = U_ZERO_ERROR;
1904 
1905         m1->reset(input1Len+1, status);
1906         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1907         status = U_ZERO_ERROR;
1908 
1909         //
1910         // match(pos, status)
1911         //
1912         m1->reset(&input2);
1913         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1914         m1->reset();
1915         REGEX_ASSERT(m1->matches(3, status) == FALSE);
1916         m1->reset();
1917         REGEX_ASSERT(m1->matches(5, status) == FALSE);
1918         REGEX_ASSERT(m1->matches(4, status) == TRUE);
1919         REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1920         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1921 
1922         // Match() at end of string should fail, but should not
1923         //  be an error.
1924         status = U_ZERO_ERROR;
1925         REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1926         REGEX_CHECK_STATUS;
1927 
1928         // Match beyond end of string should fail with an error.
1929         status = U_ZERO_ERROR;
1930         REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1931         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1932 
1933         // Successful match at end of string.
1934         {
1935             status = U_ZERO_ERROR;
1936             RegexMatcher m("A?", 0, status);  // will match zero length string.
1937             REGEX_CHECK_STATUS;
1938             m.reset(&input1);
1939             REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1940             REGEX_CHECK_STATUS;
1941             m.reset(&empty);
1942             REGEX_ASSERT(m.matches(0, status) == TRUE);
1943             REGEX_CHECK_STATUS;
1944         }
1945 
1946 
1947         //
1948         // lookingAt(pos, status)
1949         //
1950         status = U_ZERO_ERROR;
1951         m1->reset(&input2);  // "not abc"
1952         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1953         REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1954         REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1955         REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1956         REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1957         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1958         status = U_ZERO_ERROR;
1959         REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1960         REGEX_CHECK_STATUS;
1961         REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1962         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1963 
1964         delete m1;
1965         delete pat2;
1966 
1967         utext_close(&re);
1968         utext_close(&input1);
1969         utext_close(&input2);
1970         utext_close(&empty);
1971     }
1972 
1973 
1974     //
1975     // Capture Group.
1976     //     RegexMatcher::start();
1977     //     RegexMatcher::end();
1978     //     RegexMatcher::groupCount();
1979     //
1980     {
1981         int32_t             flags=0;
1982         UParseError         pe;
1983         UErrorCode          status=U_ZERO_ERROR;
1984         UText               re=UTEXT_INITIALIZER;
1985         const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
1986         utext_openUTF8(&re, str_01234567_pat, -1, &status);
1987 
1988         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
1989         REGEX_CHECK_STATUS;
1990 
1991         UText input = UTEXT_INITIALIZER;
1992         const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
1993         utext_openUTF8(&input, str_0123456789, -1, &status);
1994 
1995         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
1996         REGEX_CHECK_STATUS;
1997         REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
1998         static const int32_t matchStarts[] = {0,  2, 4, 8};
1999         static const int32_t matchEnds[]   = {10, 8, 6, 10};
2000         int32_t i;
2001         for (i=0; i<4; i++) {
2002             int32_t actualStart = matcher->start(i, status);
2003             REGEX_CHECK_STATUS;
2004             if (actualStart != matchStarts[i]) {
2005                 errln("RegexTest failure at %s:%d, index %d.  Expected %d, got %d\n",
2006                       __FILE__, __LINE__, i, matchStarts[i], actualStart);
2007             }
2008             int32_t actualEnd = matcher->end(i, status);
2009             REGEX_CHECK_STATUS;
2010             if (actualEnd != matchEnds[i]) {
2011                 errln("RegexTest failure at %s:%d index %d.  Expected %d, got %d\n",
2012                       __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2013             }
2014         }
2015 
2016         REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2017         REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2018 
2019         REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2020         REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2021         matcher->reset();
2022         REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
2023 
2024         matcher->lookingAt(status);
2025 
2026         UnicodeString dest;
2027         UText destText = UTEXT_INITIALIZER;
2028         utext_openUnicodeString(&destText, &dest, &status);
2029         UText *result;
2030         //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2031         //  Test shallow-clone API
2032         int64_t   group_len;
2033         result = matcher->group((UText *)NULL, group_len, status);
2034         REGEX_CHECK_STATUS;
2035         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2036         utext_close(result);
2037         result = matcher->group(0, &destText, group_len, status);
2038         REGEX_CHECK_STATUS;
2039         REGEX_ASSERT(result == &destText);
2040         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2041         //  destText is now immutable, reopen it
2042         utext_close(&destText);
2043         utext_openUnicodeString(&destText, &dest, &status);
2044 
2045         int64_t length;
2046         result = matcher->group(0, NULL, length, status);
2047         REGEX_CHECK_STATUS;
2048         REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2049         utext_close(result);
2050         result = matcher->group(0, &destText, length, status);
2051         REGEX_CHECK_STATUS;
2052         REGEX_ASSERT(result == &destText);
2053         REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2054         REGEX_ASSERT(length == 10);
2055         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2056 
2057         // Capture Group 1 == "234567"
2058         result = matcher->group(1, NULL, length, status);
2059         REGEX_CHECK_STATUS;
2060         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2061         REGEX_ASSERT(length == 6);
2062         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2063         utext_close(result);
2064 
2065         result = matcher->group(1, &destText, length, status);
2066         REGEX_CHECK_STATUS;
2067         REGEX_ASSERT(result == &destText);
2068         REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2069         REGEX_ASSERT(length == 6);
2070         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2071         utext_close(result);
2072 
2073         // Capture Group 2 == "45"
2074         result = matcher->group(2, NULL, length, status);
2075         REGEX_CHECK_STATUS;
2076         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2077         REGEX_ASSERT(length == 2);
2078         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2079         utext_close(result);
2080 
2081         result = matcher->group(2, &destText, length, status);
2082         REGEX_CHECK_STATUS;
2083         REGEX_ASSERT(result == &destText);
2084         REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2085         REGEX_ASSERT(length == 2);
2086         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2087         utext_close(result);
2088 
2089         // Capture Group 3 == "89"
2090         result = matcher->group(3, NULL, length, status);
2091         REGEX_CHECK_STATUS;
2092         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2093         REGEX_ASSERT(length == 2);
2094         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2095         utext_close(result);
2096 
2097         result = matcher->group(3, &destText, length, status);
2098         REGEX_CHECK_STATUS;
2099         REGEX_ASSERT(result == &destText);
2100         REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2101         REGEX_ASSERT(length == 2);
2102         REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2103         utext_close(result);
2104 
2105         // Capture Group number out of range.
2106         status = U_ZERO_ERROR;
2107         REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2108         status = U_ZERO_ERROR;
2109         REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2110         status = U_ZERO_ERROR;
2111         matcher->reset();
2112         REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2113 
2114         delete matcher;
2115         delete pat;
2116 
2117         utext_close(&destText);
2118         utext_close(&input);
2119         utext_close(&re);
2120     }
2121 
2122     //
2123     //  find
2124     //
2125     {
2126         int32_t             flags=0;
2127         UParseError         pe;
2128         UErrorCode          status=U_ZERO_ERROR;
2129         UText               re=UTEXT_INITIALIZER;
2130         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2131         utext_openUTF8(&re, str_abc, -1, &status);
2132 
2133         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2134         REGEX_CHECK_STATUS;
2135         UText input = UTEXT_INITIALIZER;
2136         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2137         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2138         //                      012345678901234567
2139 
2140         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2141         REGEX_CHECK_STATUS;
2142         REGEX_ASSERT(matcher->find());
2143         REGEX_ASSERT(matcher->start(status) == 1);
2144         REGEX_ASSERT(matcher->find());
2145         REGEX_ASSERT(matcher->start(status) == 6);
2146         REGEX_ASSERT(matcher->find());
2147         REGEX_ASSERT(matcher->start(status) == 12);
2148         REGEX_ASSERT(matcher->find() == FALSE);
2149         REGEX_ASSERT(matcher->find() == FALSE);
2150 
2151         matcher->reset();
2152         REGEX_ASSERT(matcher->find());
2153         REGEX_ASSERT(matcher->start(status) == 1);
2154 
2155         REGEX_ASSERT(matcher->find(0, status));
2156         REGEX_ASSERT(matcher->start(status) == 1);
2157         REGEX_ASSERT(matcher->find(1, status));
2158         REGEX_ASSERT(matcher->start(status) == 1);
2159         REGEX_ASSERT(matcher->find(2, status));
2160         REGEX_ASSERT(matcher->start(status) == 6);
2161         REGEX_ASSERT(matcher->find(12, status));
2162         REGEX_ASSERT(matcher->start(status) == 12);
2163         REGEX_ASSERT(matcher->find(13, status) == FALSE);
2164         REGEX_ASSERT(matcher->find(16, status) == FALSE);
2165         REGEX_ASSERT(matcher->find(17, status) == FALSE);
2166         REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2167 
2168         status = U_ZERO_ERROR;
2169         REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2170         status = U_ZERO_ERROR;
2171         REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2172 
2173         REGEX_ASSERT(matcher->groupCount() == 0);
2174 
2175         delete matcher;
2176         delete pat;
2177 
2178         utext_close(&input);
2179         utext_close(&re);
2180     }
2181 
2182 
2183     //
2184     //  find, with \G in pattern (true if at the end of a previous match).
2185     //
2186     {
2187         int32_t             flags=0;
2188         UParseError         pe;
2189         UErrorCode          status=U_ZERO_ERROR;
2190         UText               re=UTEXT_INITIALIZER;
2191         const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2192         utext_openUTF8(&re, str_Gabcabc, -1, &status);
2193 
2194         RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2195 
2196         REGEX_CHECK_STATUS;
2197         UText input = UTEXT_INITIALIZER;
2198         const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2199         utext_openUTF8(&input, str_abcabcabc, -1, &status);
2200         //                      012345678901234567
2201 
2202         RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2203         REGEX_CHECK_STATUS;
2204         REGEX_ASSERT(matcher->find());
2205         REGEX_ASSERT(matcher->start(status) == 0);
2206         REGEX_ASSERT(matcher->start(1, status) == -1);
2207         REGEX_ASSERT(matcher->start(2, status) == 1);
2208 
2209         REGEX_ASSERT(matcher->find());
2210         REGEX_ASSERT(matcher->start(status) == 4);
2211         REGEX_ASSERT(matcher->start(1, status) == 4);
2212         REGEX_ASSERT(matcher->start(2, status) == -1);
2213         REGEX_CHECK_STATUS;
2214 
2215         delete matcher;
2216         delete pat;
2217 
2218         utext_close(&input);
2219         utext_close(&re);
2220     }
2221 
2222     //
2223     //   find with zero length matches, match position should bump ahead
2224     //     to prevent loops.
2225     //
2226     {
2227         int32_t                 i;
2228         UErrorCode          status=U_ZERO_ERROR;
2229         RegexMatcher        m("(?= ?)", 0, status);   // This pattern will zero-length matches anywhere,
2230                                                       //   using an always-true look-ahead.
2231         REGEX_CHECK_STATUS;
2232         UText s = UTEXT_INITIALIZER;
2233         utext_openUTF8(&s, "    ", -1, &status);
2234         m.reset(&s);
2235         for (i=0; ; i++) {
2236             if (m.find() == FALSE) {
2237                 break;
2238             }
2239             REGEX_ASSERT(m.start(status) == i);
2240             REGEX_ASSERT(m.end(status) == i);
2241         }
2242         REGEX_ASSERT(i==5);
2243 
2244         // Check that the bump goes over characters outside the BMP OK
2245         // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2246         unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2247         utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2248         m.reset(&s);
2249         for (i=0; ; i+=4) {
2250             if (m.find() == FALSE) {
2251                 break;
2252             }
2253             REGEX_ASSERT(m.start(status) == i);
2254             REGEX_ASSERT(m.end(status) == i);
2255         }
2256         REGEX_ASSERT(i==20);
2257 
2258         utext_close(&s);
2259     }
2260     {
2261         // find() loop breaking test.
2262         //        with pattern of /.?/, should see a series of one char matches, then a single
2263         //        match of zero length at the end of the input string.
2264         int32_t                 i;
2265         UErrorCode          status=U_ZERO_ERROR;
2266         RegexMatcher        m(".?", 0, status);
2267         REGEX_CHECK_STATUS;
2268         UText s = UTEXT_INITIALIZER;
2269         utext_openUTF8(&s, "    ", -1, &status);
2270         m.reset(&s);
2271         for (i=0; ; i++) {
2272             if (m.find() == FALSE) {
2273                 break;
2274             }
2275             REGEX_ASSERT(m.start(status) == i);
2276             REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2277         }
2278         REGEX_ASSERT(i==5);
2279 
2280         utext_close(&s);
2281     }
2282 
2283 
2284     //
2285     // Matchers with no input string behave as if they had an empty input string.
2286     //
2287 
2288     {
2289         UErrorCode status = U_ZERO_ERROR;
2290         RegexMatcher  m(".?", 0, status);
2291         REGEX_CHECK_STATUS;
2292         REGEX_ASSERT(m.find());
2293         REGEX_ASSERT(m.start(status) == 0);
2294         REGEX_ASSERT(m.input() == "");
2295     }
2296     {
2297         UErrorCode status = U_ZERO_ERROR;
2298         RegexPattern  *p = RegexPattern::compile(".", 0, status);
2299         RegexMatcher  *m = p->matcher(status);
2300         REGEX_CHECK_STATUS;
2301 
2302         REGEX_ASSERT(m->find() == FALSE);
2303         REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2304         delete m;
2305         delete p;
2306     }
2307 
2308     //
2309     // Regions
2310     //
2311     {
2312         UErrorCode status = U_ZERO_ERROR;
2313         UText testPattern = UTEXT_INITIALIZER;
2314         UText testText    = UTEXT_INITIALIZER;
2315         regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2316         REGEX_VERBOSE_TEXT(&testPattern);
2317         regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2318         REGEX_VERBOSE_TEXT(&testText);
2319 
2320         RegexMatcher m(&testPattern, &testText, 0, status);
2321         REGEX_CHECK_STATUS;
2322         REGEX_ASSERT(m.regionStart() == 0);
2323         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2324         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2325         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2326 
2327         m.region(2,4, status);
2328         REGEX_CHECK_STATUS;
2329         REGEX_ASSERT(m.matches(status));
2330         REGEX_ASSERT(m.start(status)==2);
2331         REGEX_ASSERT(m.end(status)==4);
2332         REGEX_CHECK_STATUS;
2333 
2334         m.reset();
2335         REGEX_ASSERT(m.regionStart() == 0);
2336         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2337 
2338         regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2339         REGEX_VERBOSE_TEXT(&testText);
2340         m.reset(&testText);
2341         REGEX_ASSERT(m.regionStart() == 0);
2342         REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2343 
2344         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2345         REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2346         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2347         REGEX_ASSERT(&m == &m.reset());
2348         REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2349 
2350         REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2351         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2352         REGEX_ASSERT(&m == &m.reset());
2353         REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2354 
2355         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2356         REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2357         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2358         REGEX_ASSERT(&m == &m.reset());
2359         REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2360 
2361         REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2362         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2363         REGEX_ASSERT(&m == &m.reset());
2364         REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2365 
2366         utext_close(&testText);
2367         utext_close(&testPattern);
2368     }
2369 
2370     //
2371     // hitEnd() and requireEnd()
2372     //
2373     {
2374         UErrorCode status = U_ZERO_ERROR;
2375         UText testPattern = UTEXT_INITIALIZER;
2376         UText testText    = UTEXT_INITIALIZER;
2377         const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2378         const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2379         utext_openUTF8(&testPattern, str_, -1, &status);
2380         utext_openUTF8(&testText, str_aabb, -1, &status);
2381 
2382         RegexMatcher m1(&testPattern, &testText,  0, status);
2383         REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2384         REGEX_ASSERT(m1.hitEnd() == TRUE);
2385         REGEX_ASSERT(m1.requireEnd() == FALSE);
2386         REGEX_CHECK_STATUS;
2387 
2388         status = U_ZERO_ERROR;
2389         const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2390         utext_openUTF8(&testPattern, str_a, -1, &status);
2391         RegexMatcher m2(&testPattern, &testText, 0, status);
2392         REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2393         REGEX_ASSERT(m2.hitEnd() == FALSE);
2394         REGEX_ASSERT(m2.requireEnd() == FALSE);
2395         REGEX_CHECK_STATUS;
2396 
2397         status = U_ZERO_ERROR;
2398         const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2399         utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2400         RegexMatcher m3(&testPattern, &testText, 0, status);
2401         REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2402         REGEX_ASSERT(m3.hitEnd() == TRUE);
2403         REGEX_ASSERT(m3.requireEnd() == TRUE);
2404         REGEX_CHECK_STATUS;
2405 
2406         utext_close(&testText);
2407         utext_close(&testPattern);
2408     }
2409 }
2410 
2411 
2412 //---------------------------------------------------------------------------
2413 //
2414 //      API_Replace_UTF8   API test for class RegexMatcher, testing the
2415 //                         Replace family of functions.
2416 //
2417 //---------------------------------------------------------------------------
API_Replace_UTF8()2418 void RegexTest::API_Replace_UTF8() {
2419     //
2420     //  Replace
2421     //
2422     int32_t             flags=0;
2423     UParseError         pe;
2424     UErrorCode          status=U_ZERO_ERROR;
2425 
2426     UText               re=UTEXT_INITIALIZER;
2427     regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2428     REGEX_VERBOSE_TEXT(&re);
2429     RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2430     REGEX_CHECK_STATUS;
2431 
2432     char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2433     //             012345678901234567
2434     UText dataText = UTEXT_INITIALIZER;
2435     utext_openUTF8(&dataText, data, -1, &status);
2436     REGEX_CHECK_STATUS;
2437     REGEX_VERBOSE_TEXT(&dataText);
2438     RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2439 
2440     //
2441     //  Plain vanilla matches.
2442     //
2443     UnicodeString  dest;
2444     UText destText = UTEXT_INITIALIZER;
2445     utext_openUnicodeString(&destText, &dest, &status);
2446     UText *result;
2447 
2448     UText replText = UTEXT_INITIALIZER;
2449 
2450     const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2451     utext_openUTF8(&replText, str_yz, -1, &status);
2452     REGEX_VERBOSE_TEXT(&replText);
2453     result = matcher->replaceFirst(&replText, NULL, status);
2454     REGEX_CHECK_STATUS;
2455     const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2456     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2457     utext_close(result);
2458     result = matcher->replaceFirst(&replText, &destText, status);
2459     REGEX_CHECK_STATUS;
2460     REGEX_ASSERT(result == &destText);
2461     REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2462 
2463     result = matcher->replaceAll(&replText, NULL, status);
2464     REGEX_CHECK_STATUS;
2465     const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2466     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2467     utext_close(result);
2468 
2469     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2470     result = matcher->replaceAll(&replText, &destText, status);
2471     REGEX_CHECK_STATUS;
2472     REGEX_ASSERT(result == &destText);
2473     REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2474 
2475     //
2476     //  Plain vanilla non-matches.
2477     //
2478     const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2479     utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2480     matcher->reset(&dataText);
2481 
2482     result = matcher->replaceFirst(&replText, NULL, status);
2483     REGEX_CHECK_STATUS;
2484     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2485     utext_close(result);
2486     result = matcher->replaceFirst(&replText, &destText, status);
2487     REGEX_CHECK_STATUS;
2488     REGEX_ASSERT(result == &destText);
2489     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2490 
2491     result = matcher->replaceAll(&replText, NULL, status);
2492     REGEX_CHECK_STATUS;
2493     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2494     utext_close(result);
2495     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2496     result = matcher->replaceAll(&replText, &destText, status);
2497     REGEX_CHECK_STATUS;
2498     REGEX_ASSERT(result == &destText);
2499     REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2500 
2501     //
2502     // Empty source string
2503     //
2504     utext_openUTF8(&dataText, NULL, 0, &status);
2505     matcher->reset(&dataText);
2506 
2507     result = matcher->replaceFirst(&replText, NULL, status);
2508     REGEX_CHECK_STATUS;
2509     REGEX_ASSERT_UTEXT_UTF8("", result);
2510     utext_close(result);
2511     result = matcher->replaceFirst(&replText, &destText, status);
2512     REGEX_CHECK_STATUS;
2513     REGEX_ASSERT(result == &destText);
2514     REGEX_ASSERT_UTEXT_UTF8("", result);
2515 
2516     result = matcher->replaceAll(&replText, NULL, status);
2517     REGEX_CHECK_STATUS;
2518     REGEX_ASSERT_UTEXT_UTF8("", result);
2519     utext_close(result);
2520     result = matcher->replaceAll(&replText, &destText, status);
2521     REGEX_CHECK_STATUS;
2522     REGEX_ASSERT(result == &destText);
2523     REGEX_ASSERT_UTEXT_UTF8("", result);
2524 
2525     //
2526     // Empty substitution string
2527     //
2528     utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2529     matcher->reset(&dataText);
2530 
2531     utext_openUTF8(&replText, NULL, 0, &status);
2532     result = matcher->replaceFirst(&replText, NULL, status);
2533     REGEX_CHECK_STATUS;
2534     const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2535     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2536     utext_close(result);
2537     result = matcher->replaceFirst(&replText, &destText, status);
2538     REGEX_CHECK_STATUS;
2539     REGEX_ASSERT(result == &destText);
2540     REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2541 
2542     result = matcher->replaceAll(&replText, NULL, status);
2543     REGEX_CHECK_STATUS;
2544     const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2545     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2546     utext_close(result);
2547     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2548     result = matcher->replaceAll(&replText, &destText, status);
2549     REGEX_CHECK_STATUS;
2550     REGEX_ASSERT(result == &destText);
2551     REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2552 
2553     //
2554     // match whole string
2555     //
2556     const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2557     utext_openUTF8(&dataText, str_abc, -1, &status);
2558     matcher->reset(&dataText);
2559 
2560     const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2561     utext_openUTF8(&replText, str_xyz, -1, &status);
2562     result = matcher->replaceFirst(&replText, NULL, status);
2563     REGEX_CHECK_STATUS;
2564     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2565     utext_close(result);
2566     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2567     result = matcher->replaceFirst(&replText, &destText, status);
2568     REGEX_CHECK_STATUS;
2569     REGEX_ASSERT(result == &destText);
2570     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2571 
2572     result = matcher->replaceAll(&replText, NULL, status);
2573     REGEX_CHECK_STATUS;
2574     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2575     utext_close(result);
2576     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2577     result = matcher->replaceAll(&replText, &destText, status);
2578     REGEX_CHECK_STATUS;
2579     REGEX_ASSERT(result == &destText);
2580     REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2581 
2582     //
2583     // Capture Group, simple case
2584     //
2585     const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2586     utext_openUTF8(&re, str_add, -1, &status);
2587     RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2588     REGEX_CHECK_STATUS;
2589 
2590     const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2591     utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2592     RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2593     REGEX_CHECK_STATUS;
2594 
2595     const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2596     utext_openUTF8(&replText, str_11, -1, &status);
2597     result = matcher2->replaceFirst(&replText, NULL, status);
2598     REGEX_CHECK_STATUS;
2599     const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2600     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2601     utext_close(result);
2602     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2603     result = matcher2->replaceFirst(&replText, &destText, status);
2604     REGEX_CHECK_STATUS;
2605     REGEX_ASSERT(result == &destText);
2606     REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2607 
2608     const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2609     utext_openUTF8(&replText, str_v, -1, &status);
2610     REGEX_VERBOSE_TEXT(&replText);
2611     result = matcher2->replaceFirst(&replText, NULL, status);
2612     REGEX_CHECK_STATUS;
2613     const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2614     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2615     utext_close(result);
2616     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2617     result = matcher2->replaceFirst(&replText, &destText, status);
2618     REGEX_CHECK_STATUS;
2619     REGEX_ASSERT(result == &destText);
2620     REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2621 
2622     const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2623                0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2624                0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2625     utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2626     result = matcher2->replaceFirst(&replText, NULL, status);
2627     REGEX_CHECK_STATUS;
2628     const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2629     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2630     utext_close(result);
2631     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2632     result = matcher2->replaceFirst(&replText, &destText, status);
2633     REGEX_CHECK_STATUS;
2634     REGEX_ASSERT(result == &destText);
2635     REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2636 
2637     unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2638     //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2639     //                                 012345678901234567890123456
2640     supplDigitChars[22] = 0xF0;
2641     supplDigitChars[23] = 0x9D;
2642     supplDigitChars[24] = 0x9F;
2643     supplDigitChars[25] = 0x8F;
2644     utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2645 
2646     result = matcher2->replaceFirst(&replText, NULL, status);
2647     REGEX_CHECK_STATUS;
2648     const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2649     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2650     utext_close(result);
2651     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2652     result = matcher2->replaceFirst(&replText, &destText, status);
2653     REGEX_CHECK_STATUS;
2654     REGEX_ASSERT(result == &destText);
2655     REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2656     const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e,  0x00 }; /* bad capture group number $5..." */
2657     utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2658     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2659 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2660     utext_close(result);
2661     utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2662     REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2663     REGEX_ASSERT(result == &destText);
2664 //    REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2665 
2666     //
2667     // Replacement String with \u hex escapes
2668     //
2669     {
2670       const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2671       const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2672         utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2673         utext_openUTF8(&replText, str_u0043, -1, &status);
2674         matcher->reset(&dataText);
2675 
2676         result = matcher->replaceAll(&replText, NULL, status);
2677         REGEX_CHECK_STATUS;
2678         const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2679         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2680         utext_close(result);
2681         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2682         result = matcher->replaceAll(&replText, &destText, status);
2683         REGEX_CHECK_STATUS;
2684         REGEX_ASSERT(result == &destText);
2685         REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2686     }
2687     {
2688       const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2689         utext_openUTF8(&dataText, str_abc, -1, &status);
2690         const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2691         utext_openUTF8(&replText, str_U00010000, -1, &status);
2692         matcher->reset(&dataText);
2693 
2694         unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2695         //                          0123456789
2696         expected[2] = 0xF0;
2697         expected[3] = 0x90;
2698         expected[4] = 0x80;
2699         expected[5] = 0x80;
2700 
2701         result = matcher->replaceAll(&replText, NULL, status);
2702         REGEX_CHECK_STATUS;
2703         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2704         utext_close(result);
2705         utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2706         result = matcher->replaceAll(&replText, &destText, status);
2707         REGEX_CHECK_STATUS;
2708         REGEX_ASSERT(result == &destText);
2709         REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2710     }
2711     // TODO:  need more through testing of capture substitutions.
2712 
2713     // Bug 4057
2714     //
2715     {
2716         status = U_ZERO_ERROR;
2717 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2718 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2719 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2720         utext_openUTF8(&re, str_ssee, -1, &status);
2721         utext_openUTF8(&dataText, str_blah, -1, &status);
2722         utext_openUTF8(&replText, str_ooh, -1, &status);
2723 
2724         RegexMatcher m(&re, 0, status);
2725         REGEX_CHECK_STATUS;
2726 
2727         UnicodeString result;
2728         UText resultText = UTEXT_INITIALIZER;
2729         utext_openUnicodeString(&resultText, &result, &status);
2730 
2731         // Multiple finds do NOT bump up the previous appendReplacement postion.
2732         m.reset(&dataText);
2733         m.find();
2734         m.find();
2735         m.appendReplacement(&resultText, &replText, status);
2736         REGEX_CHECK_STATUS;
2737         const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2738         REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2739 
2740         // After a reset into the interior of a string, appendReplacement still starts at beginning.
2741         status = U_ZERO_ERROR;
2742         result.truncate(0);
2743         utext_openUnicodeString(&resultText, &result, &status);
2744         m.reset(10, status);
2745         m.find();
2746         m.find();
2747         m.appendReplacement(&resultText, &replText, status);
2748         REGEX_CHECK_STATUS;
2749         const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2750         REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2751 
2752         // find() at interior of string, appendReplacement still starts at beginning.
2753         status = U_ZERO_ERROR;
2754         result.truncate(0);
2755         utext_openUnicodeString(&resultText, &result, &status);
2756         m.reset();
2757         m.find(10, status);
2758         m.find();
2759         m.appendReplacement(&resultText, &replText, status);
2760         REGEX_CHECK_STATUS;
2761         const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2762         REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2763 
2764         m.appendTail(&resultText, status);
2765         const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2766         REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2767 
2768         utext_close(&resultText);
2769     }
2770 
2771     delete matcher2;
2772     delete pat2;
2773     delete matcher;
2774     delete pat;
2775 
2776     utext_close(&dataText);
2777     utext_close(&replText);
2778     utext_close(&destText);
2779     utext_close(&re);
2780 }
2781 
2782 
2783 //---------------------------------------------------------------------------
2784 //
2785 //      API_Pattern_UTF8  Test that the API for class RegexPattern is
2786 //                        present and nominally working.
2787 //
2788 //---------------------------------------------------------------------------
API_Pattern_UTF8()2789 void RegexTest::API_Pattern_UTF8() {
2790     RegexPattern        pata;    // Test default constructor to not crash.
2791     RegexPattern        patb;
2792 
2793     REGEX_ASSERT(pata == patb);
2794     REGEX_ASSERT(pata == pata);
2795 
2796     UText         re1 = UTEXT_INITIALIZER;
2797     UText         re2 = UTEXT_INITIALIZER;
2798     UErrorCode    status = U_ZERO_ERROR;
2799     UParseError   pe;
2800 
2801     const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2802     const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2803     utext_openUTF8(&re1, str_abcalmz, -1, &status);
2804     utext_openUTF8(&re2, str_def, -1, &status);
2805 
2806     RegexPattern        *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2807     RegexPattern        *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2808     REGEX_CHECK_STATUS;
2809     REGEX_ASSERT(*pat1 == *pat1);
2810     REGEX_ASSERT(*pat1 != pata);
2811 
2812     // Assign
2813     patb = *pat1;
2814     REGEX_ASSERT(patb == *pat1);
2815 
2816     // Copy Construct
2817     RegexPattern patc(*pat1);
2818     REGEX_ASSERT(patc == *pat1);
2819     REGEX_ASSERT(patb == patc);
2820     REGEX_ASSERT(pat1 != pat2);
2821     patb = *pat2;
2822     REGEX_ASSERT(patb != patc);
2823     REGEX_ASSERT(patb == *pat2);
2824 
2825     // Compile with no flags.
2826     RegexPattern         *pat1a = RegexPattern::compile(&re1, pe, status);
2827     REGEX_ASSERT(*pat1a == *pat1);
2828 
2829     REGEX_ASSERT(pat1a->flags() == 0);
2830 
2831     // Compile with different flags should be not equal
2832     RegexPattern        *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2833     REGEX_CHECK_STATUS;
2834 
2835     REGEX_ASSERT(*pat1b != *pat1a);
2836     REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2837     REGEX_ASSERT(pat1a->flags() == 0);
2838     delete pat1b;
2839 
2840     // clone
2841     RegexPattern *pat1c = pat1->clone();
2842     REGEX_ASSERT(*pat1c == *pat1);
2843     REGEX_ASSERT(*pat1c != *pat2);
2844 
2845     delete pat1c;
2846     delete pat1a;
2847     delete pat1;
2848     delete pat2;
2849 
2850     utext_close(&re1);
2851     utext_close(&re2);
2852 
2853 
2854     //
2855     //   Verify that a matcher created from a cloned pattern works.
2856     //     (Jitterbug 3423)
2857     //
2858     {
2859         UErrorCode     status     = U_ZERO_ERROR;
2860         UText          pattern    = UTEXT_INITIALIZER;
2861         const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2862         utext_openUTF8(&pattern, str_pL, -1, &status);
2863 
2864         RegexPattern  *pSource    = RegexPattern::compile(&pattern, 0, status);
2865         RegexPattern  *pClone     = pSource->clone();
2866         delete         pSource;
2867         RegexMatcher  *mFromClone = pClone->matcher(status);
2868         REGEX_CHECK_STATUS;
2869 
2870         UText          input      = UTEXT_INITIALIZER;
2871         const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2872         utext_openUTF8(&input, str_HelloWorld, -1, &status);
2873         mFromClone->reset(&input);
2874         REGEX_ASSERT(mFromClone->find() == TRUE);
2875         REGEX_ASSERT(mFromClone->group(status) == "Hello");
2876         REGEX_ASSERT(mFromClone->find() == TRUE);
2877         REGEX_ASSERT(mFromClone->group(status) == "World");
2878         REGEX_ASSERT(mFromClone->find() == FALSE);
2879         delete mFromClone;
2880         delete pClone;
2881 
2882         utext_close(&input);
2883         utext_close(&pattern);
2884     }
2885 
2886     //
2887     //   matches convenience API
2888     //
2889     {
2890         UErrorCode status  = U_ZERO_ERROR;
2891         UText      pattern = UTEXT_INITIALIZER;
2892         UText      input   = UTEXT_INITIALIZER;
2893 
2894         const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2895         utext_openUTF8(&input, str_randominput, -1, &status);
2896 
2897         const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2898         utext_openUTF8(&pattern, str_dotstar, -1, &status);
2899         REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2900         REGEX_CHECK_STATUS;
2901 
2902         const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2903         utext_openUTF8(&pattern, str_abc, -1, &status);
2904         REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2905         REGEX_CHECK_STATUS;
2906 
2907         const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2908         utext_openUTF8(&pattern, str_nput, -1, &status);
2909         REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2910         REGEX_CHECK_STATUS;
2911 
2912         utext_openUTF8(&pattern, str_randominput, -1, &status);
2913         REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2914         REGEX_CHECK_STATUS;
2915 
2916         const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2917         utext_openUTF8(&pattern, str_u, -1, &status);
2918         REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2919         REGEX_CHECK_STATUS;
2920 
2921         utext_openUTF8(&input, str_abc, -1, &status);
2922         utext_openUTF8(&pattern, str_abc, -1, &status);
2923         status = U_INDEX_OUTOFBOUNDS_ERROR;
2924         REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2925         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2926 
2927         utext_close(&input);
2928         utext_close(&pattern);
2929     }
2930 
2931 
2932     //
2933     // Split()
2934     //
2935     status = U_ZERO_ERROR;
2936     const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /*  + */
2937     utext_openUTF8(&re1, str_spaceplus, -1, &status);
2938     pat1 = RegexPattern::compile(&re1, pe, status);
2939     REGEX_CHECK_STATUS;
2940     UnicodeString  fields[10];
2941 
2942     int32_t n;
2943     n = pat1->split("Now is the time", fields, 10, status);
2944     REGEX_CHECK_STATUS;
2945     REGEX_ASSERT(n==4);
2946     REGEX_ASSERT(fields[0]=="Now");
2947     REGEX_ASSERT(fields[1]=="is");
2948     REGEX_ASSERT(fields[2]=="the");
2949     REGEX_ASSERT(fields[3]=="time");
2950     REGEX_ASSERT(fields[4]=="");
2951 
2952     n = pat1->split("Now is the time", fields, 2, status);
2953     REGEX_CHECK_STATUS;
2954     REGEX_ASSERT(n==2);
2955     REGEX_ASSERT(fields[0]=="Now");
2956     REGEX_ASSERT(fields[1]=="is the time");
2957     REGEX_ASSERT(fields[2]=="the");   // left over from previous test
2958 
2959     fields[1] = "*";
2960     status = U_ZERO_ERROR;
2961     n = pat1->split("Now is the time", fields, 1, status);
2962     REGEX_CHECK_STATUS;
2963     REGEX_ASSERT(n==1);
2964     REGEX_ASSERT(fields[0]=="Now is the time");
2965     REGEX_ASSERT(fields[1]=="*");
2966     status = U_ZERO_ERROR;
2967 
2968     n = pat1->split("    Now       is the time   ", fields, 10, status);
2969     REGEX_CHECK_STATUS;
2970     REGEX_ASSERT(n==6);
2971     REGEX_ASSERT(fields[0]=="");
2972     REGEX_ASSERT(fields[1]=="Now");
2973     REGEX_ASSERT(fields[2]=="is");
2974     REGEX_ASSERT(fields[3]=="the");
2975     REGEX_ASSERT(fields[4]=="time");
2976     REGEX_ASSERT(fields[5]=="");
2977     REGEX_ASSERT(fields[6]=="");
2978 
2979     fields[2] = "*";
2980     n = pat1->split("     ", fields, 10, status);
2981     REGEX_CHECK_STATUS;
2982     REGEX_ASSERT(n==2);
2983     REGEX_ASSERT(fields[0]=="");
2984     REGEX_ASSERT(fields[1]=="");
2985     REGEX_ASSERT(fields[2]=="*");
2986 
2987     fields[0] = "foo";
2988     n = pat1->split("", fields, 10, status);
2989     REGEX_CHECK_STATUS;
2990     REGEX_ASSERT(n==0);
2991     REGEX_ASSERT(fields[0]=="foo");
2992 
2993     delete pat1;
2994 
2995     //  split, with a pattern with (capture)
2996     regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
2997     pat1 = RegexPattern::compile(&re1,  pe, status);
2998     REGEX_CHECK_STATUS;
2999 
3000     status = U_ZERO_ERROR;
3001     fields[6] = fields[7] = "*";
3002     n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
3003     REGEX_CHECK_STATUS;
3004     REGEX_ASSERT(n==7);
3005     REGEX_ASSERT(fields[0]=="");
3006     REGEX_ASSERT(fields[1]=="a");
3007     REGEX_ASSERT(fields[2]=="Now is ");
3008     REGEX_ASSERT(fields[3]=="b");
3009     REGEX_ASSERT(fields[4]=="the time");
3010     REGEX_ASSERT(fields[5]=="c");
3011     REGEX_ASSERT(fields[6]=="");
3012     REGEX_ASSERT(fields[7]=="*");
3013     REGEX_ASSERT(status==U_ZERO_ERROR);
3014 
3015     fields[6] = fields[7] = "*";
3016     n = pat1->split("  <a>Now is <b>the time<c>", fields, 10, status);
3017     REGEX_CHECK_STATUS;
3018     REGEX_ASSERT(n==7);
3019     REGEX_ASSERT(fields[0]=="  ");
3020     REGEX_ASSERT(fields[1]=="a");
3021     REGEX_ASSERT(fields[2]=="Now is ");
3022     REGEX_ASSERT(fields[3]=="b");
3023     REGEX_ASSERT(fields[4]=="the time");
3024     REGEX_ASSERT(fields[5]=="c");
3025     REGEX_ASSERT(fields[6]=="");
3026     REGEX_ASSERT(fields[7]=="*");
3027 
3028     status = U_ZERO_ERROR;
3029     fields[6] = "foo";
3030     n = pat1->split("  <a>Now is <b>the time<c> ", fields, 6, status);
3031     REGEX_CHECK_STATUS;
3032     REGEX_ASSERT(n==6);
3033     REGEX_ASSERT(fields[0]=="  ");
3034     REGEX_ASSERT(fields[1]=="a");
3035     REGEX_ASSERT(fields[2]=="Now is ");
3036     REGEX_ASSERT(fields[3]=="b");
3037     REGEX_ASSERT(fields[4]=="the time");
3038     REGEX_ASSERT(fields[5]==" ");
3039     REGEX_ASSERT(fields[6]=="foo");
3040 
3041     status = U_ZERO_ERROR;
3042     fields[5] = "foo";
3043     n = pat1->split("  <a>Now is <b>the time<c>", fields, 5, status);
3044     REGEX_CHECK_STATUS;
3045     REGEX_ASSERT(n==5);
3046     REGEX_ASSERT(fields[0]=="  ");
3047     REGEX_ASSERT(fields[1]=="a");
3048     REGEX_ASSERT(fields[2]=="Now is ");
3049     REGEX_ASSERT(fields[3]=="b");
3050     REGEX_ASSERT(fields[4]=="the time<c>");
3051     REGEX_ASSERT(fields[5]=="foo");
3052 
3053     status = U_ZERO_ERROR;
3054     fields[5] = "foo";
3055     n = pat1->split("  <a>Now is <b>the time", fields, 5, status);
3056     REGEX_CHECK_STATUS;
3057     REGEX_ASSERT(n==5);
3058     REGEX_ASSERT(fields[0]=="  ");
3059     REGEX_ASSERT(fields[1]=="a");
3060     REGEX_ASSERT(fields[2]=="Now is ");
3061     REGEX_ASSERT(fields[3]=="b");
3062     REGEX_ASSERT(fields[4]=="the time");
3063     REGEX_ASSERT(fields[5]=="foo");
3064 
3065     status = U_ZERO_ERROR;
3066     n = pat1->split("  <a>Now is <b>the time<c>", fields, 4, status);
3067     REGEX_CHECK_STATUS;
3068     REGEX_ASSERT(n==4);
3069     REGEX_ASSERT(fields[0]=="  ");
3070     REGEX_ASSERT(fields[1]=="a");
3071     REGEX_ASSERT(fields[2]=="Now is ");
3072     REGEX_ASSERT(fields[3]=="the time<c>");
3073     status = U_ZERO_ERROR;
3074     delete pat1;
3075 
3076     regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3077     pat1 = RegexPattern::compile(&re1, pe, status);
3078     REGEX_CHECK_STATUS;
3079     n = pat1->split("1-10,20", fields, 10, status);
3080     REGEX_CHECK_STATUS;
3081     REGEX_ASSERT(n==5);
3082     REGEX_ASSERT(fields[0]=="1");
3083     REGEX_ASSERT(fields[1]=="-");
3084     REGEX_ASSERT(fields[2]=="10");
3085     REGEX_ASSERT(fields[3]==",");
3086     REGEX_ASSERT(fields[4]=="20");
3087     delete pat1;
3088 
3089 
3090     //
3091     // split of a UText based string, with library allocating output UTexts.
3092     //
3093     {
3094         status = U_ZERO_ERROR;
3095         RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3096         UnicodeString stringToSplit("first:second:third");
3097         UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3098         REGEX_CHECK_STATUS;
3099 
3100         UText *splits[10] = {NULL};
3101         int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3102         REGEX_CHECK_STATUS;
3103         REGEX_ASSERT(numFields == 5);
3104         REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3105         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3106         REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3107         REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3108         REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3109         REGEX_ASSERT(splits[5] == NULL);
3110 
3111         for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3112             if (splits[i]) {
3113                 utext_close(splits[i]);
3114                 splits[i] = NULL;
3115             }
3116         }
3117         utext_close(textToSplit);
3118     }
3119 
3120 
3121     //
3122     // RegexPattern::pattern() and patternText()
3123     //
3124     pat1 = new RegexPattern();
3125     REGEX_ASSERT(pat1->pattern() == "");
3126     REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3127     delete pat1;
3128     const char *helloWorldInvariant = "(Hello, world)*";
3129     regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3130     pat1 = RegexPattern::compile(&re1, pe, status);
3131     REGEX_CHECK_STATUS;
3132     REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3133     REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3134     delete pat1;
3135 
3136     utext_close(&re1);
3137 }
3138 
3139 
3140 //---------------------------------------------------------------------------
3141 //
3142 //      Extended       A more thorough check for features of regex patterns
3143 //                     The test cases are in a separate data file,
//                       source/test/testdata/regextst.txt
3145 //                     A description of the test data format is included in that file.
3146 //
3147 //---------------------------------------------------------------------------
3148 
3149 const char *
getPath(char buffer[2048],const char * filename)3150 RegexTest::getPath(char buffer[2048], const char *filename) {
3151     UErrorCode status=U_ZERO_ERROR;
3152     const char *testDataDirectory = IntlTest::getSourceTestData(status);
3153     if (U_FAILURE(status)) {
3154         errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3155         return NULL;
3156     }
3157 
3158     strcpy(buffer, testDataDirectory);
3159     strcat(buffer, filename);
3160     return buffer;
3161 }
3162 
Extended()3163 void RegexTest::Extended() {
3164     char tdd[2048];
3165     const char *srcPath;
3166     UErrorCode  status  = U_ZERO_ERROR;
3167     int32_t     lineNum = 0;
3168 
3169     //
3170     //  Open and read the test data file.
3171     //
3172     srcPath=getPath(tdd, "regextst.txt");
3173     if(srcPath==NULL) {
3174         return; /* something went wrong, error already output */
3175     }
3176 
3177     int32_t    len;
3178     UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3179     if (U_FAILURE(status)) {
3180         return; /* something went wrong, error already output */
3181     }
3182 
3183     //
3184     //  Put the test data into a UnicodeString
3185     //
3186     UnicodeString testString(FALSE, testData, len);
3187 
3188     RegexMatcher    quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3189     RegexMatcher    commentMat    (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3190     RegexMatcher    flagsMat      (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3191 
3192     RegexMatcher    lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3193     UnicodeString   testPattern;   // The pattern for test from the test file.
3194     UnicodeString   testFlags;     // the flags   for a test.
3195     UnicodeString   matchString;   // The marked up string to be used as input
3196 
3197     if (U_FAILURE(status)){
3198         dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3199         delete [] testData;
3200         return;
3201     }
3202 
3203     //
3204     //  Loop over the test data file, once per line.
3205     //
3206     while (lineMat.find()) {
3207         lineNum++;
3208         if (U_FAILURE(status)) {
3209           errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3210         }
3211 
3212         status = U_ZERO_ERROR;
3213         UnicodeString testLine = lineMat.group(1, status);
3214         if (testLine.length() == 0) {
3215             continue;
3216         }
3217 
3218         //
3219         // Parse the test line.  Skip blank and comment only lines.
3220         // Separate out the three main fields - pattern, flags, target.
3221         //
3222 
3223         commentMat.reset(testLine);
3224         if (commentMat.lookingAt(status)) {
3225             // This line is a comment, or blank.
3226             continue;
3227         }
3228 
3229         //
3230         //  Pull out the pattern field, remove it from the test file line.
3231         //
3232         quotedStuffMat.reset(testLine);
3233         if (quotedStuffMat.lookingAt(status)) {
3234             testPattern = quotedStuffMat.group(2, status);
3235             testLine.remove(0, quotedStuffMat.end(0, status));
3236         } else {
3237             errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3238             continue;
3239         }
3240 
3241 
3242         //
3243         //  Pull out the flags from the test file line.
3244         //
3245         flagsMat.reset(testLine);
3246         flagsMat.lookingAt(status);                  // Will always match, possibly an empty string.
3247         testFlags = flagsMat.group(1, status);
3248         if (flagsMat.group(2, status).length() > 0) {
3249             errln("Bad Match flag at line %d. Scanning %c\n",
3250                 lineNum, flagsMat.group(2, status).charAt(0));
3251             continue;
3252         }
3253         testLine.remove(0, flagsMat.end(0, status));
3254 
3255         //
3256         //  Pull out the match string, as a whole.
3257         //    We'll process the <tags> later.
3258         //
3259         quotedStuffMat.reset(testLine);
3260         if (quotedStuffMat.lookingAt(status)) {
3261             matchString = quotedStuffMat.group(2, status);
3262             testLine.remove(0, quotedStuffMat.end(0, status));
3263         } else {
3264             errln("Bad match string at test file line %d", lineNum);
3265             continue;
3266         }
3267 
3268         //
3269         //  The only thing left from the input line should be an optional trailing comment.
3270         //
3271         commentMat.reset(testLine);
3272         if (commentMat.lookingAt(status) == FALSE) {
3273             errln("Line %d: unexpected characters at end of test line.", lineNum);
3274             continue;
3275         }
3276 
3277         //
3278         //  Run the test
3279         //
3280         regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3281     }
3282 
3283     delete [] testData;
3284 
3285 }
3286 
3287 
3288 
3289 //---------------------------------------------------------------------------
3290 //
//    regex_find(pattern, flags, inputString, srcPath, line)
//
//         Function to run a single test from the Extended (data driven) tests.
//         See file test/testdata/regextst.txt for a description of the
//         pattern and inputString fields, and the allowed flags.
//         srcPath is the path of the test data file, and line is the
//         source line in regextst.txt of the test.
3297 //
3298 //---------------------------------------------------------------------------
3299 
3300 
3301 //  Set a value into a UVector at position specified by a decimal number in
3302 //   a UnicodeString.   This is a utility function needed by the actual test function,
3303 //   which follows.
set(UVector & vec,int32_t val,UnicodeString index)3304 static void set(UVector &vec, int32_t val, UnicodeString index) {
3305     UErrorCode  status=U_ZERO_ERROR;
3306     int32_t  idx = 0;
3307     for (int32_t i=0; i<index.length(); i++) {
3308         int32_t d=u_charDigitValue(index.charAt(i));
3309         if (d<0) {return;}
3310         idx = idx*10 + d;
3311     }
3312     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3313     vec.setElementAt(val, idx);
3314 }
3315 
setInt(UVector & vec,int32_t val,int32_t idx)3316 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3317     UErrorCode  status=U_ZERO_ERROR;
3318     while (vec.size()<idx+1) {vec.addElement(-1, status);}
3319     vec.setElementAt(val, idx);
3320 }
3321 
utextOffsetToNative(UText * utext,int32_t unistrOffset,int32_t & nativeIndex)3322 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3323 {
3324     UBool couldFind = TRUE;
3325     UTEXT_SETNATIVEINDEX(utext, 0);
3326     int32_t i = 0;
3327     while (i < unistrOffset) {
3328         UChar32 c = UTEXT_NEXT32(utext);
3329         if (c != U_SENTINEL) {
3330             i += U16_LENGTH(c);
3331         } else {
3332             couldFind = FALSE;
3333             break;
3334         }
3335     }
3336     nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3337     return couldFind;
3338 }
3339 
3340 
regex_find(const UnicodeString & pattern,const UnicodeString & flags,const UnicodeString & inputString,const char * srcPath,int32_t line)3341 void RegexTest::regex_find(const UnicodeString &pattern,
3342                            const UnicodeString &flags,
3343                            const UnicodeString &inputString,
3344                            const char *srcPath,
3345                            int32_t line) {
3346     UnicodeString       unEscapedInput;
3347     UnicodeString       deTaggedInput;
3348 
3349     int32_t             patternUTF8Length,      inputUTF8Length;
3350     char                *patternChars  = NULL, *inputChars = NULL;
3351     UText               patternText    = UTEXT_INITIALIZER;
3352     UText               inputText      = UTEXT_INITIALIZER;
3353     UConverter          *UTF8Converter = NULL;
3354 
3355     UErrorCode          status         = U_ZERO_ERROR;
3356     UParseError         pe;
3357     RegexPattern        *parsePat      = NULL;
3358     RegexMatcher        *parseMatcher  = NULL;
3359     RegexPattern        *callerPattern = NULL, *UTF8Pattern = NULL;
3360     RegexMatcher        *matcher       = NULL, *UTF8Matcher = NULL;
3361     UVector             groupStarts(status);
3362     UVector             groupEnds(status);
3363     UVector             groupStartsUTF8(status);
3364     UVector             groupEndsUTF8(status);
3365     UBool               isMatch        = FALSE, isUTF8Match = FALSE;
3366     UBool               failed         = FALSE;
3367     int32_t             numFinds;
3368     int32_t             i;
3369     UBool               useMatchesFunc   = FALSE;
3370     UBool               useLookingAtFunc = FALSE;
3371     int32_t             regionStart      = -1;
3372     int32_t             regionEnd        = -1;
3373     int32_t             regionStartUTF8  = -1;
3374     int32_t             regionEndUTF8    = -1;
3375 
3376 
3377     //
3378     //  Compile the caller's pattern
3379     //
3380     uint32_t bflags = 0;
3381     if (flags.indexOf((UChar)0x69) >= 0)  { // 'i' flag
3382         bflags |= UREGEX_CASE_INSENSITIVE;
3383     }
3384     if (flags.indexOf((UChar)0x78) >= 0)  { // 'x' flag
3385         bflags |= UREGEX_COMMENTS;
3386     }
3387     if (flags.indexOf((UChar)0x73) >= 0)  { // 's' flag
3388         bflags |= UREGEX_DOTALL;
3389     }
3390     if (flags.indexOf((UChar)0x6d) >= 0)  { // 'm' flag
3391         bflags |= UREGEX_MULTILINE;
3392     }
3393 
3394     if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3395         bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3396     }
3397     if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3398         bflags |= UREGEX_UNIX_LINES;
3399     }
3400     if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3401         bflags |= UREGEX_LITERAL;
3402     }
3403 
3404 
3405     callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3406     if (status != U_ZERO_ERROR) {
3407         #if UCONFIG_NO_BREAK_ITERATION==1
3408         // 'v' test flag means that the test pattern should not compile if ICU was configured
3409         //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3410         if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3411             goto cleanupAndReturn;
3412         }
3413         #endif
3414         if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3415             // Expected pattern compilation error.
3416             if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3417                 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3418             }
3419             goto cleanupAndReturn;
3420         } else {
3421             // Unexpected pattern compilation error.
3422             dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3423             goto cleanupAndReturn;
3424         }
3425     }
3426 
3427     UTF8Converter = ucnv_open("UTF8", &status);
3428     ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3429 
3430     patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3431     status = U_ZERO_ERROR; // buffer overflow
3432     patternChars = new char[patternUTF8Length+1];
3433     pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3434     utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3435 
3436     if (status == U_ZERO_ERROR) {
3437         UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3438 
3439         if (status != U_ZERO_ERROR) {
3440 #if UCONFIG_NO_BREAK_ITERATION==1
3441             // 'v' test flag means that the test pattern should not compile if ICU was configured
3442             //     to not include break iteration.  RBBI is needed for Unicode word boundaries.
3443             if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3444                 goto cleanupAndReturn;
3445             }
3446 #endif
3447             if (flags.indexOf((UChar)0x45) >= 0) {  //  flags contain 'E'
3448                 // Expected pattern compilation error.
3449                 if (flags.indexOf((UChar)0x64) >= 0) {   // flags contain 'd'
3450                     logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3451                 }
3452                 goto cleanupAndReturn;
3453             } else {
3454                 // Unexpected pattern compilation error.
3455                 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3456                 goto cleanupAndReturn;
3457             }
3458         }
3459     }
3460 
3461     if (UTF8Pattern == NULL) {
3462         // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3463         logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3464         status = U_ZERO_ERROR;
3465     }
3466 
3467     if (flags.indexOf((UChar)0x64) >= 0) {  // 'd' flag
3468         callerPattern->dumpPattern();
3469     }
3470 
3471     if (flags.indexOf((UChar)0x45) >= 0) {  // 'E' flag
3472         errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3473         goto cleanupAndReturn;
3474     }
3475 
3476 
3477     //
3478     // Number of times find() should be called on the test string, default to 1
3479     //
3480     numFinds = 1;
3481     for (i=2; i<=9; i++) {
3482         if (flags.indexOf((UChar)(0x30 + i)) >= 0) {   // digit flag
3483             if (numFinds != 1) {
3484                 errln("Line %d: more than one digit flag.  Scanning %d.", line, i);
3485                 goto cleanupAndReturn;
3486             }
3487             numFinds = i;
3488         }
3489     }
3490 
3491     // 'M' flag.  Use matches() instead of find()
3492     if (flags.indexOf((UChar)0x4d) >= 0) {
3493         useMatchesFunc = TRUE;
3494     }
3495     if (flags.indexOf((UChar)0x4c) >= 0) {
3496         useLookingAtFunc = TRUE;
3497     }
3498 
3499     //
3500     //  Find the tags in the input data, remove them, and record the group boundary
3501     //    positions.
3502     //
3503     parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3504     if (!assertSuccess(WHERE, status) ) {
3505         goto cleanupAndReturn;
3506     }
3507 
3508     unEscapedInput = inputString.unescape();
3509     parseMatcher = parsePat->matcher(unEscapedInput, status);
3510     if (!assertSuccess(WHERE, status) ) {
3511         goto cleanupAndReturn;
3512     }
3513     while(parseMatcher->find()) {
3514         parseMatcher->appendReplacement(deTaggedInput, "", status);
3515         REGEX_CHECK_STATUS;
3516         UnicodeString groupNum = parseMatcher->group(2, status);
3517         if (groupNum == "r") {
3518             // <r> or </r>, a region specification within the string
3519             if (parseMatcher->group(1, status) == "/") {
3520                 regionEnd = deTaggedInput.length();
3521             } else {
3522                 regionStart = deTaggedInput.length();
3523             }
3524         } else {
3525             // <digits> or </digits>, a group match boundary tag.
3526             if (parseMatcher->group(1, status) == "/") {
3527                 set(groupEnds, deTaggedInput.length(), groupNum);
3528             } else {
3529                 set(groupStarts, deTaggedInput.length(), groupNum);
3530             }
3531         }
3532     }
3533     parseMatcher->appendTail(deTaggedInput);
3534 
3535     if (groupStarts.size() != groupEnds.size()) {
3536         errln("Error at line %d: mismatched <n> group tags in expected results.", line);
3537         failed = true;
3538         goto cleanupAndReturn;
3539     }
3540     if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3541         errln("mismatched <r> tags");
3542         failed = TRUE;
3543         goto cleanupAndReturn;
3544     }
3545 
3546     //
3547     //  Configure the matcher according to the flags specified with this test.
3548     //
3549     matcher = callerPattern->matcher(deTaggedInput, status);
3550     REGEX_CHECK_STATUS_L(line);
3551     if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3552         matcher->setTrace(TRUE);
3553     }
3554 
3555     if (UTF8Pattern != NULL) {
3556         inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3557         status = U_ZERO_ERROR; // buffer overflow
3558         inputChars = new char[inputUTF8Length+1];
3559         deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3560         utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3561 
3562         if (status == U_ZERO_ERROR) {
3563             UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3564             REGEX_CHECK_STATUS_L(line);
3565         }
3566 
3567         if (UTF8Matcher == NULL) {
3568             // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3569             logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3570             status = U_ZERO_ERROR;
3571         }
3572     }
3573 
3574     //
3575     //  Generate native indices for UTF8 versions of region and capture group info
3576     //
3577     if (UTF8Matcher != NULL) {
3578         if (flags.indexOf((UChar)0x74) >= 0) {   //  't' trace flag
3579             UTF8Matcher->setTrace(TRUE);
3580         }
3581         if (regionStart>=0)    (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3582         if (regionEnd>=0)      (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3583 
3584         //  Fill out the native index UVector info.
3585         //  Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3586         for (i=0; i<groupStarts.size(); i++) {
3587             int32_t  start = groupStarts.elementAti(i);
3588             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3589             if (start >= 0) {
3590                 int32_t  startUTF8;
3591                 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3592                     errln("Error at line %d: could not find native index for group start %d.  UTF16 index %d", line, i, start);
3593                     failed = TRUE;
3594                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3595                 }
3596                 setInt(groupStartsUTF8, startUTF8, i);
3597             }
3598 
3599             int32_t  end = groupEnds.elementAti(i);
3600             //  -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3601             if (end >= 0) {
3602                 int32_t  endUTF8;
3603                 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3604                     errln("Error at line %d: could not find native index for group end %d.  UTF16 index %d", line, i, end);
3605                     failed = TRUE;
3606                     goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3607                 }
3608                 setInt(groupEndsUTF8, endUTF8, i);
3609             }
3610         }
3611     }
3612 
3613     if (regionStart>=0) {
3614        matcher->region(regionStart, regionEnd, status);
3615        REGEX_CHECK_STATUS_L(line);
3616        if (UTF8Matcher != NULL) {
3617            UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3618            REGEX_CHECK_STATUS_L(line);
3619        }
3620     }
3621     if (flags.indexOf((UChar)0x61) >= 0) {   //  'a' anchoring bounds flag
3622         matcher->useAnchoringBounds(FALSE);
3623         if (UTF8Matcher != NULL) {
3624             UTF8Matcher->useAnchoringBounds(FALSE);
3625         }
3626     }
3627     if (flags.indexOf((UChar)0x62) >= 0) {   //  'b' transparent bounds flag
3628         matcher->useTransparentBounds(TRUE);
3629         if (UTF8Matcher != NULL) {
3630             UTF8Matcher->useTransparentBounds(TRUE);
3631         }
3632     }
3633 
3634 
3635 
3636     //
3637     // Do a find on the de-tagged input using the caller's pattern
3638     //     TODO: error on count>1 and not find().
3639     //           error on both matches() and lookingAt().
3640     //
3641     for (i=0; i<numFinds; i++) {
3642         if (useMatchesFunc) {
3643             isMatch = matcher->matches(status);
3644             if (UTF8Matcher != NULL) {
3645                isUTF8Match = UTF8Matcher->matches(status);
3646             }
3647         } else  if (useLookingAtFunc) {
3648             isMatch = matcher->lookingAt(status);
3649             if (UTF8Matcher != NULL) {
3650                 isUTF8Match = UTF8Matcher->lookingAt(status);
3651             }
3652         } else {
3653             isMatch = matcher->find();
3654             if (UTF8Matcher != NULL) {
3655                 isUTF8Match = UTF8Matcher->find();
3656             }
3657         }
3658     }
3659     matcher->setTrace(FALSE);
3660     if (UTF8Matcher) {
3661         UTF8Matcher->setTrace(FALSE);
3662     }
3663     if (U_FAILURE(status)) {
3664         errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3665     }
3666 
3667     //
3668     // Match up the groups from the find() with the groups from the tags
3669     //
3670 
3671     // number of tags should match number of groups from find operation.
3672     // matcher->groupCount does not include group 0, the entire match, hence the +1.
3673     //   G option in test means that capture group data is not available in the
3674     //     expected results, so the check needs to be suppressed.
3675     if (isMatch == FALSE && groupStarts.size() != 0) {
3676         dataerrln("Error at line %d:  Match expected, but none found.", line);
3677         failed = TRUE;
3678         goto cleanupAndReturn;
3679     } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3680         errln("Error at line %d:  Match expected, but none found. (UTF8)", line);
3681         failed = TRUE;
3682         goto cleanupAndReturn;
3683     }
3684     if (isMatch && groupStarts.size() == 0) {
3685         errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
3686         failed = TRUE;
3687     }
3688     if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
3689         errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
3690         failed = TRUE;
3691     }
3692 
3693     if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3694         // Only check for match / no match.  Don't check capture groups.
3695         goto cleanupAndReturn;
3696     }
3697 
3698     REGEX_CHECK_STATUS_L(line);
3699     for (i=0; i<=matcher->groupCount(); i++) {
3700         int32_t  expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3701         int32_t  expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3702         if (matcher->start(i, status) != expectedStart) {
3703             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d",
3704                 line, i, expectedStart, matcher->start(i, status));
3705             failed = TRUE;
3706             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3707         } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3708             errln("Error at line %d: incorrect start position for group %d.  Expected %d, got %d (UTF8)",
3709                   line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3710             failed = TRUE;
3711             goto cleanupAndReturn;  // Good chance of subsequent bogus errors.  Stop now.
3712         }
3713 
3714         int32_t  expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3715         int32_t  expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3716         if (matcher->end(i, status) != expectedEnd) {
3717             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d",
3718                 line, i, expectedEnd, matcher->end(i, status));
3719             failed = TRUE;
3720             // Error on end position;  keep going; real error is probably yet to come as group
3721             //   end positions work from end of the input data towards the front.
3722         } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3723             errln("Error at line %d: incorrect end position for group %d.  Expected %d, got %d (UTF8)",
3724                   line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3725             failed = TRUE;
3726             // Error on end position;  keep going; real error is probably yet to come as group
3727             //   end positions work from end of the input data towards the front.
3728         }
3729     }
3730     if ( matcher->groupCount()+1 < groupStarts.size()) {
3731         errln("Error at line %d: Expected %d capture groups, found %d.",
3732             line, groupStarts.size()-1, matcher->groupCount());
3733         failed = TRUE;
3734         }
3735     else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3736         errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3737               line, groupStarts.size()-1, UTF8Matcher->groupCount());
3738         failed = TRUE;
3739     }
3740 
3741     if ((flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3742         matcher->requireEnd() == TRUE) {
3743         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE", line);
3744         failed = TRUE;
3745     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3746         UTF8Matcher->requireEnd() == TRUE) {
3747         errln("Error at line %d: requireEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3748         failed = TRUE;
3749     }
3750 
3751     if ((flags.indexOf((UChar)0x79) >= 0) &&   //  'y' flag:  RequireEnd() == true
3752         matcher->requireEnd() == FALSE) {
3753         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE", line);
3754         failed = TRUE;
3755     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) &&   //  'Y' flag:  RequireEnd() == false
3756         UTF8Matcher->requireEnd() == FALSE) {
3757         errln("Error at line %d: requireEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3758         failed = TRUE;
3759     }
3760 
3761     if ((flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3762         matcher->hitEnd() == TRUE) {
3763         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE", line);
3764         failed = TRUE;
3765     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) &&   //  'Z' flag:  hitEnd() == false
3766                UTF8Matcher->hitEnd() == TRUE) {
3767         errln("Error at line %d: hitEnd() returned TRUE.  Expected FALSE (UTF8)", line);
3768         failed = TRUE;
3769     }
3770 
3771     if ((flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3772         matcher->hitEnd() == FALSE) {
3773         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE", line);
3774         failed = TRUE;
3775     } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) &&   //  'z' flag:  hitEnd() == true
3776                UTF8Matcher->hitEnd() == FALSE) {
3777         errln("Error at line %d: hitEnd() returned FALSE.  Expected TRUE (UTF8)", line);
3778         failed = TRUE;
3779     }
3780 
3781 
3782 cleanupAndReturn:
3783     if (failed) {
3784         infoln((UnicodeString)"\""+pattern+(UnicodeString)"\"  "
3785             +flags+(UnicodeString)"  \""+inputString+(UnicodeString)"\"");
3786         // callerPattern->dump();
3787     }
3788     delete parseMatcher;
3789     delete parsePat;
3790     delete UTF8Matcher;
3791     delete UTF8Pattern;
3792     delete matcher;
3793     delete callerPattern;
3794 
3795     utext_close(&inputText);
3796     delete[] inputChars;
3797     utext_close(&patternText);
3798     delete[] patternChars;
3799     ucnv_close(UTF8Converter);
3800 }
3801 
3802 
3803 
3804 
3805 //---------------------------------------------------------------------------
3806 //
3807 //      Errors     Check for error handling in patterns.
3808 //
3809 //---------------------------------------------------------------------------
Errors()3810 void RegexTest::Errors() {
3811     // \escape sequences that aren't implemented yet.
3812     //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3813 
3814     // Missing close parentheses
3815     REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3816     REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3817     REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3818 
3819     // Extra close paren
3820     REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3821     REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3822     REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3823 
3824     // Look-ahead, Look-behind
3825     //  TODO:  add tests for unbounded length look-behinds.
3826     REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX);       // illegal construct
3827 
3828     // Attempt to use non-default flags
3829     {
3830         UParseError   pe;
3831         UErrorCode    status = U_ZERO_ERROR;
3832         int32_t       flags  = UREGEX_CANON_EQ |
3833                                UREGEX_COMMENTS         | UREGEX_DOTALL   |
3834                                UREGEX_MULTILINE;
3835         RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3836         REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3837         delete pat1;
3838     }
3839 
3840 
3841     // Quantifiers are allowed only after something that can be quantified.
3842     REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3843     REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3844     REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3845 
3846     // Mal-formed {min,max} quantifiers
3847     REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3848     REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3849     REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3850     REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3851     REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3852     REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3853     REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG);        // Overflows int during scan
3854     REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG);          // Overflows regex binary format
3855     REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3856 
3857     // Ticket 5389
3858     REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3859 
3860     // Invalid Back Reference \0
3861     //    For ICU 3.8 and earlier
3862     //    For ICU versions newer than 3.8, \0 introduces an octal escape.
3863     //
3864     REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3865 
3866 }
3867 
3868 
3869 //-------------------------------------------------------------------------------
3870 //
3871 //  Read a text data file, convert it to UChars, and return the data
3872 //    in one big UChar * buffer, which the caller must delete.
3873 //
3874 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int32_t & ulen,const char * defEncoding,UErrorCode & status)3875 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3876                                      const char *defEncoding, UErrorCode &status) {
3877     UChar       *retPtr  = NULL;
3878     char        *fileBuf = NULL;
3879     UConverter* conv     = NULL;
3880     FILE        *f       = NULL;
3881 
3882     ulen = 0;
3883     if (U_FAILURE(status)) {
3884         return retPtr;
3885     }
3886 
3887     //
3888     //  Open the file.
3889     //
3890     f = fopen(fileName, "rb");
3891     if (f == 0) {
3892         dataerrln("Error opening test data file %s\n", fileName);
3893         status = U_FILE_ACCESS_ERROR;
3894         return NULL;
3895     }
3896     //
3897     //  Read it in
3898     //
3899     int32_t            fileSize;
3900     int32_t            amt_read;
3901 
3902     fseek( f, 0, SEEK_END);
3903     fileSize = ftell(f);
3904     fileBuf = new char[fileSize];
3905     fseek(f, 0, SEEK_SET);
3906     amt_read = static_cast<int32_t>(fread(fileBuf, 1, fileSize, f));
3907     if (amt_read != fileSize || fileSize <= 0) {
3908         errln("Error reading test data file.");
3909         goto cleanUpAndReturn;
3910     }
3911 
3912     //
3913     // Look for a Unicode Signature (BOM) on the data just read
3914     //
3915     int32_t        signatureLength;
3916     const char *   fileBufC;
3917     const char*    encoding;
3918 
3919     fileBufC = fileBuf;
3920     encoding = ucnv_detectUnicodeSignature(
3921         fileBuf, fileSize, &signatureLength, &status);
3922     if(encoding!=NULL ){
3923         fileBufC  += signatureLength;
3924         fileSize  -= signatureLength;
3925     } else {
3926         encoding = defEncoding;
3927         if (strcmp(encoding, "utf-8") == 0) {
3928             errln("file %s is missing its BOM", fileName);
3929         }
3930     }
3931 
3932     //
3933     // Open a converter to take the rule file to UTF-16
3934     //
3935     conv = ucnv_open(encoding, &status);
3936     if (U_FAILURE(status)) {
3937         goto cleanUpAndReturn;
3938     }
3939 
3940     //
3941     // Convert the rules to UChar.
3942     //  Preflight first to determine required buffer size.
3943     //
3944     ulen = ucnv_toUChars(conv,
3945         NULL,           //  dest,
3946         0,              //  destCapacity,
3947         fileBufC,
3948         fileSize,
3949         &status);
3950     if (status == U_BUFFER_OVERFLOW_ERROR) {
3951         // Buffer Overflow is expected from the preflight operation.
3952         status = U_ZERO_ERROR;
3953 
3954         retPtr = new UChar[ulen+1];
3955         ucnv_toUChars(conv,
3956             retPtr,       //  dest,
3957             ulen+1,
3958             fileBufC,
3959             fileSize,
3960             &status);
3961     }
3962 
3963 cleanUpAndReturn:
3964     fclose(f);
3965     delete[] fileBuf;
3966     ucnv_close(conv);
3967     if (U_FAILURE(status)) {
3968         errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3969         delete []retPtr;
3970         retPtr = 0;
3971         ulen   = 0;
3972     }
3973     return retPtr;
3974 }
3975 
3976 
3977 //-------------------------------------------------------------------------------
3978 //
3979 //   PerlTests  - Run Perl's regular expression tests
3980 //                The input file for this test is re_tests, the standard regular
3981 //                expression test data distributed with the Perl source code.
3982 //
3983 //                Here is Perl's description of the test data file:
3984 //
3985 //        # The tests are in a separate file 't/op/re_tests'.
3986 //        # Each line in that file is a separate test.
3987 //        # There are five columns, separated by tabs.
3988 //        #
3989 //        # Column 1 contains the pattern, optionally enclosed in C<''>.
3990 //        # Modifiers can be put after the closing C<'>.
3991 //        #
3992 //        # Column 2 contains the string to be matched.
3993 //        #
3994 //        # Column 3 contains the expected result:
3995 //        #     y   expect a match
3996 //        #     n   expect no match
3997 //        #     c   expect an error
3998 //        # B   test exposes a known bug in Perl, should be skipped
3999 //        # b   test exposes a known bug in Perl, should be skipped if noamp
4000 //        #
4001 //        # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
4002 //        #
4003 //        # Column 4 contains a string, usually C<$&>.
4004 //        #
4005 //        # Column 5 contains the expected result of double-quote
4006 //        # interpolating that string after the match, or start of error message.
4007 //        #
4008 //        # Column 6, if present, contains a reason why the test is skipped.
4009 //        # This is printed with "skipped", for harness to pick up.
4010 //        #
4011 //        # \n in the tests are interpolated, as are variables of the form ${\w+}.
4012 //        #
4013 //        # If you want to add a regular expression test that can't be expressed
4014 //        # in this format, don't add it here: put it in op/pat.t instead.
4015 //
4016 //        For ICU, if field 3 contains an 'i', the test will be skipped.
//        The test exposes some known incompatibility between ICU and Perl regexps.
4018 //        (The i is in addition to whatever was there before.)
4019 //
4020 //-------------------------------------------------------------------------------
PerlTests()4021 void RegexTest::PerlTests() {
4022     char tdd[2048];
4023     const char *srcPath;
4024     UErrorCode  status = U_ZERO_ERROR;
4025     UParseError pe;
4026 
4027     //
4028     //  Open and read the test data file.
4029     //
4030     srcPath=getPath(tdd, "re_tests.txt");
4031     if(srcPath==NULL) {
4032         return; /* something went wrong, error already output */
4033     }
4034 
4035     int32_t    len;
4036     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4037     if (U_FAILURE(status)) {
4038         return; /* something went wrong, error already output */
4039     }
4040 
4041     //
4042     //  Put the test data into a UnicodeString
4043     //
4044     UnicodeString testDataString(FALSE, testData, len);
4045 
4046     //
4047     //  Regex to break the input file into lines, and strip the new lines.
4048     //     One line per match, capture group one is the desired data.
4049     //
4050     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4051     if (U_FAILURE(status)) {
4052         dataerrln("RegexPattern::compile() error");
4053         return;
4054     }
4055     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4056 
4057     //
4058     //  Regex to split a test file line into fields.
4059     //    There are six fields, separated by tabs.
4060     //
4061     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4062 
4063     //
4064     //  Regex to identify test patterns with flag settings, and to separate them.
4065     //    Test patterns with flags look like 'pattern'i
4066     //    Test patterns without flags are not quoted:   pattern
4067     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4068     //
4069     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4070     RegexMatcher* flagMat = flagPat->matcher(status);
4071 
4072     //
4073     // The Perl tests reference several perl-isms, which are evaluated/substituted
4074     //   in the test data.  Not being perl, this must be done explicitly.  Here
4075     //   are string constants and REs for these constructs.
4076     //
4077     UnicodeString nulnulSrc("${nulnul}");
4078     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4079     nulnul = nulnul.unescape();
4080 
4081     UnicodeString ffffSrc("${ffff}");
4082     UnicodeString ffff("\\uffff", -1, US_INV);
4083     ffff = ffff.unescape();
4084 
4085     //  regexp for $-[0], $+[2], etc.
4086     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4087     RegexMatcher *groupsMat = groupsPat->matcher(status);
4088 
4089     //  regexp for $0, $1, $2, etc.
4090     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4091     RegexMatcher *cgMat = cgPat->matcher(status);
4092 
4093 
4094     //
4095     // Main Loop for the Perl Tests, runs once per line from the
4096     //   test data file.
4097     //
4098     int32_t  lineNum = 0;
4099     int32_t  skippedUnimplementedCount = 0;
4100     while (lineMat->find()) {
4101         lineNum++;
4102 
4103         //
4104         //  Get a line, break it into its fields, do the Perl
4105         //    variable substitutions.
4106         //
4107         UnicodeString line = lineMat->group(1, status);
4108         UnicodeString fields[7];
4109         fieldPat->split(line, fields, 7, status);
4110 
4111         flagMat->reset(fields[0]);
4112         flagMat->matches(status);
4113         UnicodeString pattern  = flagMat->group(2, status);
4114         pattern.findAndReplace("${bang}", "!");
4115         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4116         pattern.findAndReplace(ffffSrc, ffff);
4117 
4118         //
4119         //  Identify patterns that include match flag settings,
4120         //    split off the flags, remove the extra quotes.
4121         //
4122         UnicodeString flagStr = flagMat->group(3, status);
4123         if (U_FAILURE(status)) {
4124             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4125             return;
4126         }
4127         int32_t flags = 0;
4128         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4129         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4130         const UChar UChar_m = 0x6d;
4131         const UChar UChar_x = 0x78;
4132         const UChar UChar_y = 0x79;
4133         if (flagStr.indexOf(UChar_i) != -1) {
4134             flags |= UREGEX_CASE_INSENSITIVE;
4135         }
4136         if (flagStr.indexOf(UChar_m) != -1) {
4137             flags |= UREGEX_MULTILINE;
4138         }
4139         if (flagStr.indexOf(UChar_x) != -1) {
4140             flags |= UREGEX_COMMENTS;
4141         }
4142 
4143         //
4144         // Compile the test pattern.
4145         //
4146         status = U_ZERO_ERROR;
4147         RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4148         if (status == U_REGEX_UNIMPLEMENTED) {
4149             //
4150             // Test of a feature that is planned for ICU, but not yet implemented.
4151             //   skip the test.
4152             skippedUnimplementedCount++;
4153             delete testPat;
4154             status = U_ZERO_ERROR;
4155             continue;
4156         }
4157 
4158         if (U_FAILURE(status)) {
4159             // Some tests are supposed to generate errors.
4160             //   Only report an error for tests that are supposed to succeed.
4161             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4162                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4163             {
4164                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4165             }
4166             status = U_ZERO_ERROR;
4167             delete testPat;
4168             continue;
4169         }
4170 
4171         if (fields[2].indexOf(UChar_i) >= 0) {
4172             // ICU should skip this test.
4173             delete testPat;
4174             continue;
4175         }
4176 
4177         if (fields[2].indexOf(UChar_c) >= 0) {
4178             // This pattern should have caused a compilation error, but didn't/
4179             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4180             delete testPat;
4181             continue;
4182         }
4183 
4184         //
4185         // replace the Perl variables that appear in some of the
4186         //   match data strings.
4187         //
4188         UnicodeString matchString = fields[1];
4189         matchString.findAndReplace(nulnulSrc, nulnul);
4190         matchString.findAndReplace(ffffSrc,   ffff);
4191 
4192         // Replace any \n in the match string with an actual new-line char.
4193         //  Don't do full unescape, as this unescapes more than Perl does, which
4194         //  causes other spurious failures in the tests.
4195         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4196 
4197 
4198 
4199         //
4200         // Run the test, check for expected match/don't match result.
4201         //
4202         RegexMatcher *testMat = testPat->matcher(matchString, status);
4203         UBool found = testMat->find();
4204         UBool expected = FALSE;
4205         if (fields[2].indexOf(UChar_y) >=0) {
4206             expected = TRUE;
4207         }
4208         if (expected != found) {
4209             errln("line %d: Expected %smatch, got %smatch",
4210                 lineNum, expected?"":"no ", found?"":"no " );
4211             delete testMat;
4212             delete testPat;
4213             continue;
4214         }
4215 
4216         // Don't try to check expected results if there is no match.
4217         //   (Some have stuff in the expected fields)
4218         if (!found) {
4219             delete testMat;
4220             delete testPat;
4221             continue;
4222         }
4223 
4224         //
4225         // Interpret the Perl expression from the fourth field of the data file,
4226         // building up an ICU string from the results of the ICU match.
4227         //   The Perl expression will contain references to the results of
4228         //     a regex match, including the matched string, capture group strings,
4229         //     group starting and ending indicies, etc.
4230         //
4231         UnicodeString resultString;
4232         UnicodeString perlExpr = fields[3];
4233 #if SUPPORT_MUTATING_INPUT_STRING
4234         groupsMat->reset(perlExpr);
4235         cgMat->reset(perlExpr);
4236 #endif
4237 
4238         while (perlExpr.length() > 0) {
4239 #if !SUPPORT_MUTATING_INPUT_STRING
4240             //  Perferred usage.  Reset after any modification to input string.
4241             groupsMat->reset(perlExpr);
4242             cgMat->reset(perlExpr);
4243 #endif
4244 
4245             if (perlExpr.startsWith("$&")) {
4246                 resultString.append(testMat->group(status));
4247                 perlExpr.remove(0, 2);
4248             }
4249 
4250             else if (groupsMat->lookingAt(status)) {
4251                 // $-[0]   $+[2]  etc.
4252                 UnicodeString digitString = groupsMat->group(2, status);
4253                 int32_t t = 0;
4254                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4255                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4256                 int32_t matchPosition;
4257                 if (plusOrMinus.compare("+") == 0) {
4258                     matchPosition = testMat->end(groupNum, status);
4259                 } else {
4260                     matchPosition = testMat->start(groupNum, status);
4261                 }
4262                 if (matchPosition != -1) {
4263                     ICU_Utility::appendNumber(resultString, matchPosition);
4264                 }
4265                 perlExpr.remove(0, groupsMat->end(status));
4266             }
4267 
4268             else if (cgMat->lookingAt(status)) {
4269                 // $1, $2, $3, etc.
4270                 UnicodeString digitString = cgMat->group(1, status);
4271                 int32_t t = 0;
4272                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4273                 if (U_SUCCESS(status)) {
4274                     resultString.append(testMat->group(groupNum, status));
4275                     status = U_ZERO_ERROR;
4276                 }
4277                 perlExpr.remove(0, cgMat->end(status));
4278             }
4279 
4280             else if (perlExpr.startsWith("@-")) {
4281                 int32_t i;
4282                 for (i=0; i<=testMat->groupCount(); i++) {
4283                     if (i>0) {
4284                         resultString.append(" ");
4285                     }
4286                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4287                 }
4288                 perlExpr.remove(0, 2);
4289             }
4290 
4291             else if (perlExpr.startsWith("@+")) {
4292                 int32_t i;
4293                 for (i=0; i<=testMat->groupCount(); i++) {
4294                     if (i>0) {
4295                         resultString.append(" ");
4296                     }
4297                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4298                 }
4299                 perlExpr.remove(0, 2);
4300             }
4301 
4302             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4303                                                      //           or as an escaped sequence (e.g. \n)
4304                 if (perlExpr.length() > 1) {
4305                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4306                 }
4307                 UChar c = perlExpr.charAt(0);
4308                 switch (c) {
4309                 case 'n':   c = '\n'; break;
4310                 // add any other escape sequences that show up in the test expected results.
4311                 }
4312                 resultString.append(c);
4313                 perlExpr.remove(0, 1);
4314             }
4315 
4316             else  {
4317                 // Any characters from the perl expression that we don't explicitly
4318                 //  recognize before here are assumed to be literals and copied
4319                 //  as-is to the expected results.
4320                 resultString.append(perlExpr.charAt(0));
4321                 perlExpr.remove(0, 1);
4322             }
4323 
4324             if (U_FAILURE(status)) {
4325                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4326                 break;
4327             }
4328         }
4329 
4330         //
4331         // Expected Results Compare
4332         //
4333         UnicodeString expectedS(fields[4]);
4334         expectedS.findAndReplace(nulnulSrc, nulnul);
4335         expectedS.findAndReplace(ffffSrc,   ffff);
4336         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4337 
4338 
4339         if (expectedS.compare(resultString) != 0) {
4340             err("Line %d: Incorrect perl expression results.", lineNum);
4341             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4342         }
4343 
4344         delete testMat;
4345         delete testPat;
4346     }
4347 
4348     //
4349     // All done.  Clean up allocated stuff.
4350     //
4351     delete cgMat;
4352     delete cgPat;
4353 
4354     delete groupsMat;
4355     delete groupsPat;
4356 
4357     delete flagMat;
4358     delete flagPat;
4359 
4360     delete lineMat;
4361     delete linePat;
4362 
4363     delete fieldPat;
4364     delete [] testData;
4365 
4366 
4367     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4368 
4369 }
4370 
4371 
4372 //-------------------------------------------------------------------------------
4373 //
4374 //   PerlTestsUTF8  Run Perl's regular expression tests on UTF-8-based UTexts
4375 //                  (instead of using UnicodeStrings) to test the alternate engine.
4376 //                  The input file for this test is re_tests, the standard regular
4377 //                  expression test data distributed with the Perl source code.
4378 //                  See PerlTests() for more information.
4379 //
4380 //-------------------------------------------------------------------------------
PerlTestsUTF8()4381 void RegexTest::PerlTestsUTF8() {
4382     char tdd[2048];
4383     const char *srcPath;
4384     UErrorCode  status = U_ZERO_ERROR;
4385     UParseError pe;
4386     LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4387     UText       patternText = UTEXT_INITIALIZER;
4388     char       *patternChars = NULL;
4389     int32_t     patternLength;
4390     int32_t     patternCapacity = 0;
4391     UText       inputText = UTEXT_INITIALIZER;
4392     char       *inputChars = NULL;
4393     int32_t     inputLength;
4394     int32_t     inputCapacity = 0;
4395 
4396     ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4397 
4398     //
4399     //  Open and read the test data file.
4400     //
4401     srcPath=getPath(tdd, "re_tests.txt");
4402     if(srcPath==NULL) {
4403         return; /* something went wrong, error already output */
4404     }
4405 
4406     int32_t    len;
4407     UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4408     if (U_FAILURE(status)) {
4409         return; /* something went wrong, error already output */
4410     }
4411 
4412     //
4413     //  Put the test data into a UnicodeString
4414     //
4415     UnicodeString testDataString(FALSE, testData, len);
4416 
4417     //
4418     //  Regex to break the input file into lines, and strip the new lines.
4419     //     One line per match, capture group one is the desired data.
4420     //
4421     RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4422     if (U_FAILURE(status)) {
4423         dataerrln("RegexPattern::compile() error");
4424         return;
4425     }
4426     RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4427 
4428     //
4429     //  Regex to split a test file line into fields.
4430     //    There are six fields, separated by tabs.
4431     //
4432     RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4433 
4434     //
4435     //  Regex to identify test patterns with flag settings, and to separate them.
4436     //    Test patterns with flags look like 'pattern'i
4437     //    Test patterns without flags are not quoted:   pattern
4438     //   Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4439     //
4440     RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4441     RegexMatcher* flagMat = flagPat->matcher(status);
4442 
4443     //
4444     // The Perl tests reference several perl-isms, which are evaluated/substituted
4445     //   in the test data.  Not being perl, this must be done explicitly.  Here
4446     //   are string constants and REs for these constructs.
4447     //
4448     UnicodeString nulnulSrc("${nulnul}");
4449     UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4450     nulnul = nulnul.unescape();
4451 
4452     UnicodeString ffffSrc("${ffff}");
4453     UnicodeString ffff("\\uffff", -1, US_INV);
4454     ffff = ffff.unescape();
4455 
4456     //  regexp for $-[0], $+[2], etc.
4457     RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4458     RegexMatcher *groupsMat = groupsPat->matcher(status);
4459 
4460     //  regexp for $0, $1, $2, etc.
4461     RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4462     RegexMatcher *cgMat = cgPat->matcher(status);
4463 
4464 
4465     //
4466     // Main Loop for the Perl Tests, runs once per line from the
4467     //   test data file.
4468     //
4469     int32_t  lineNum = 0;
4470     int32_t  skippedUnimplementedCount = 0;
4471     while (lineMat->find()) {
4472         lineNum++;
4473 
4474         //
4475         //  Get a line, break it into its fields, do the Perl
4476         //    variable substitutions.
4477         //
4478         UnicodeString line = lineMat->group(1, status);
4479         UnicodeString fields[7];
4480         fieldPat->split(line, fields, 7, status);
4481 
4482         flagMat->reset(fields[0]);
4483         flagMat->matches(status);
4484         UnicodeString pattern  = flagMat->group(2, status);
4485         pattern.findAndReplace("${bang}", "!");
4486         pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4487         pattern.findAndReplace(ffffSrc, ffff);
4488 
4489         //
4490         //  Identify patterns that include match flag settings,
4491         //    split off the flags, remove the extra quotes.
4492         //
4493         UnicodeString flagStr = flagMat->group(3, status);
4494         if (U_FAILURE(status)) {
4495             errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4496             return;
4497         }
4498         int32_t flags = 0;
4499         const UChar UChar_c = 0x63;  // Char constants for the flag letters.
4500         const UChar UChar_i = 0x69;  //   (Damn the lack of Unicode support in C)
4501         const UChar UChar_m = 0x6d;
4502         const UChar UChar_x = 0x78;
4503         const UChar UChar_y = 0x79;
4504         if (flagStr.indexOf(UChar_i) != -1) {
4505             flags |= UREGEX_CASE_INSENSITIVE;
4506         }
4507         if (flagStr.indexOf(UChar_m) != -1) {
4508             flags |= UREGEX_MULTILINE;
4509         }
4510         if (flagStr.indexOf(UChar_x) != -1) {
4511             flags |= UREGEX_COMMENTS;
4512         }
4513 
4514         //
4515         // Put the pattern in a UTF-8 UText
4516         //
4517         status = U_ZERO_ERROR;
4518         patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4519         if (status == U_BUFFER_OVERFLOW_ERROR) {
4520             status = U_ZERO_ERROR;
4521             delete[] patternChars;
4522             patternCapacity = patternLength + 1;
4523             patternChars = new char[patternCapacity];
4524             pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4525         }
4526         utext_openUTF8(&patternText, patternChars, patternLength, &status);
4527 
4528         //
4529         // Compile the test pattern.
4530         //
4531         RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4532         if (status == U_REGEX_UNIMPLEMENTED) {
4533             //
4534             // Test of a feature that is planned for ICU, but not yet implemented.
4535             //   skip the test.
4536             skippedUnimplementedCount++;
4537             delete testPat;
4538             status = U_ZERO_ERROR;
4539             continue;
4540         }
4541 
4542         if (U_FAILURE(status)) {
4543             // Some tests are supposed to generate errors.
4544             //   Only report an error for tests that are supposed to succeed.
4545             if (fields[2].indexOf(UChar_c) == -1  &&  // Compilation is not supposed to fail AND
4546                 fields[2].indexOf(UChar_i) == -1)     //   it's not an accepted ICU incompatibility
4547             {
4548                 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4549             }
4550             status = U_ZERO_ERROR;
4551             delete testPat;
4552             continue;
4553         }
4554 
4555         if (fields[2].indexOf(UChar_i) >= 0) {
4556             // ICU should skip this test.
4557             delete testPat;
4558             continue;
4559         }
4560 
4561         if (fields[2].indexOf(UChar_c) >= 0) {
4562             // This pattern should have caused a compilation error, but didn't/
4563             errln("line %d: Expected a pattern compile error, got success.", lineNum);
4564             delete testPat;
4565             continue;
4566         }
4567 
4568 
4569         //
4570         // replace the Perl variables that appear in some of the
4571         //   match data strings.
4572         //
4573         UnicodeString matchString = fields[1];
4574         matchString.findAndReplace(nulnulSrc, nulnul);
4575         matchString.findAndReplace(ffffSrc,   ffff);
4576 
4577         // Replace any \n in the match string with an actual new-line char.
4578         //  Don't do full unescape, as this unescapes more than Perl does, which
4579         //  causes other spurious failures in the tests.
4580         matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4581 
4582         //
4583         // Put the input in a UTF-8 UText
4584         //
4585         status = U_ZERO_ERROR;
4586         inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4587         if (status == U_BUFFER_OVERFLOW_ERROR) {
4588             status = U_ZERO_ERROR;
4589             delete[] inputChars;
4590             inputCapacity = inputLength + 1;
4591             inputChars = new char[inputCapacity];
4592             matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4593         }
4594         utext_openUTF8(&inputText, inputChars, inputLength, &status);
4595 
4596         //
4597         // Run the test, check for expected match/don't match result.
4598         //
4599         RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4600         UBool found = testMat->find();
4601         UBool expected = FALSE;
4602         if (fields[2].indexOf(UChar_y) >=0) {
4603             expected = TRUE;
4604         }
4605         if (expected != found) {
4606             errln("line %d: Expected %smatch, got %smatch",
4607                 lineNum, expected?"":"no ", found?"":"no " );
4608             delete testMat;
4609             delete testPat;
4610             continue;
4611         }
4612 
4613         // Don't try to check expected results if there is no match.
4614         //   (Some have stuff in the expected fields)
4615         if (!found) {
4616             delete testMat;
4617             delete testPat;
4618             continue;
4619         }
4620 
4621         //
4622         // Interpret the Perl expression from the fourth field of the data file,
4623         // building up an ICU string from the results of the ICU match.
4624         //   The Perl expression will contain references to the results of
4625         //     a regex match, including the matched string, capture group strings,
4626         //     group starting and ending indicies, etc.
4627         //
4628         UnicodeString resultString;
4629         UnicodeString perlExpr = fields[3];
4630 
4631         while (perlExpr.length() > 0) {
4632             groupsMat->reset(perlExpr);
4633             cgMat->reset(perlExpr);
4634 
4635             if (perlExpr.startsWith("$&")) {
4636                 resultString.append(testMat->group(status));
4637                 perlExpr.remove(0, 2);
4638             }
4639 
4640             else if (groupsMat->lookingAt(status)) {
4641                 // $-[0]   $+[2]  etc.
4642                 UnicodeString digitString = groupsMat->group(2, status);
4643                 int32_t t = 0;
4644                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4645                 UnicodeString plusOrMinus = groupsMat->group(1, status);
4646                 int32_t matchPosition;
4647                 if (plusOrMinus.compare("+") == 0) {
4648                     matchPosition = testMat->end(groupNum, status);
4649                 } else {
4650                     matchPosition = testMat->start(groupNum, status);
4651                 }
4652                 if (matchPosition != -1) {
4653                     ICU_Utility::appendNumber(resultString, matchPosition);
4654                 }
4655                 perlExpr.remove(0, groupsMat->end(status));
4656             }
4657 
4658             else if (cgMat->lookingAt(status)) {
4659                 // $1, $2, $3, etc.
4660                 UnicodeString digitString = cgMat->group(1, status);
4661                 int32_t t = 0;
4662                 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4663                 if (U_SUCCESS(status)) {
4664                     resultString.append(testMat->group(groupNum, status));
4665                     status = U_ZERO_ERROR;
4666                 }
4667                 perlExpr.remove(0, cgMat->end(status));
4668             }
4669 
4670             else if (perlExpr.startsWith("@-")) {
4671                 int32_t i;
4672                 for (i=0; i<=testMat->groupCount(); i++) {
4673                     if (i>0) {
4674                         resultString.append(" ");
4675                     }
4676                     ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4677                 }
4678                 perlExpr.remove(0, 2);
4679             }
4680 
4681             else if (perlExpr.startsWith("@+")) {
4682                 int32_t i;
4683                 for (i=0; i<=testMat->groupCount(); i++) {
4684                     if (i>0) {
4685                         resultString.append(" ");
4686                     }
4687                     ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4688                 }
4689                 perlExpr.remove(0, 2);
4690             }
4691 
4692             else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) {    // \Escape.  Take following char as a literal.
4693                                                      //           or as an escaped sequence (e.g. \n)
4694                 if (perlExpr.length() > 1) {
4695                     perlExpr.remove(0, 1);  // Remove the '\', but only if not last char.
4696                 }
4697                 UChar c = perlExpr.charAt(0);
4698                 switch (c) {
4699                 case 'n':   c = '\n'; break;
4700                 // add any other escape sequences that show up in the test expected results.
4701                 }
4702                 resultString.append(c);
4703                 perlExpr.remove(0, 1);
4704             }
4705 
4706             else  {
4707                 // Any characters from the perl expression that we don't explicitly
4708                 //  recognize before here are assumed to be literals and copied
4709                 //  as-is to the expected results.
4710                 resultString.append(perlExpr.charAt(0));
4711                 perlExpr.remove(0, 1);
4712             }
4713 
4714             if (U_FAILURE(status)) {
4715                 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4716                 break;
4717             }
4718         }
4719 
4720         //
4721         // Expected Results Compare
4722         //
4723         UnicodeString expectedS(fields[4]);
4724         expectedS.findAndReplace(nulnulSrc, nulnul);
4725         expectedS.findAndReplace(ffffSrc,   ffff);
4726         expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4727 
4728 
4729         if (expectedS.compare(resultString) != 0) {
4730             err("Line %d: Incorrect perl expression results.", lineNum);
4731             infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4732         }
4733 
4734         delete testMat;
4735         delete testPat;
4736     }
4737 
4738     //
4739     // All done.  Clean up allocated stuff.
4740     //
4741     delete cgMat;
4742     delete cgPat;
4743 
4744     delete groupsMat;
4745     delete groupsPat;
4746 
4747     delete flagMat;
4748     delete flagPat;
4749 
4750     delete lineMat;
4751     delete linePat;
4752 
4753     delete fieldPat;
4754     delete [] testData;
4755 
4756     utext_close(&patternText);
4757     utext_close(&inputText);
4758 
4759     delete [] patternChars;
4760     delete [] inputChars;
4761 
4762 
4763     logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4764 
4765 }
4766 
4767 
//--------------------------------------------------------------
//
//  Bug6149   Verify limits to heap expansion for backtrack stack.
//             Use this pattern,
//                 "(a?){1,8000000}"
//             Note: the pattern originally had an unbounded upper bound, but that form now
//                   has loop-breaking enabled.
//                   This test is likely to be fragile, as further optimizations stop
//                   more cases of pointless looping in the match engine.
//
//---------------------------------------------------------------
Bug6149()4778 void RegexTest::Bug6149() {
4779     UnicodeString pattern("(a?){1,8000000}");
4780     UnicodeString s("xyz");
4781     uint32_t flags = 0;
4782     UErrorCode status = U_ZERO_ERROR;
4783 
4784     RegexMatcher  matcher(pattern, s, flags, status);
4785     UBool result = false;
4786     REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4787     REGEX_ASSERT(result == FALSE);
4788  }
4789 
4790 
//
//   Callbacks()    Test the callback function.
//                  When set, callbacks occur periodically during matching operations,
//                  giving the application code the ability to abort the operation
//                  before its normal completion.
//
4797 
4798 struct callBackContext {
4799     RegexTest        *test;
4800     int32_t          maxCalls;
4801     int32_t          numCalls;
4802     int32_t          lastSteps;
resetcallBackContext4803     void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;}
4804 };
4805 
4806 U_CDECL_BEGIN
4807 static UBool U_CALLCONV
testCallBackFn(const void * context,int32_t steps)4808 testCallBackFn(const void *context, int32_t steps) {
4809     callBackContext  *info = (callBackContext *)context;
4810     if (info->lastSteps+1 != steps) {
4811         info->test->errln("incorrect steps in callback.  Expected %d, got %d\n", info->lastSteps+1, steps);
4812     }
4813     info->lastSteps = steps;
4814     info->numCalls++;
4815     return (info->numCalls < info->maxCalls);
4816 }
4817 U_CDECL_END
4818 
Callbacks()4819 void RegexTest::Callbacks() {
4820    {
4821         // Getter returns NULLs if no callback has been set
4822 
4823         //   The variables that the getter will fill in.
4824         //   Init to non-null values so that the action of the getter can be seen.
4825         const void          *returnedContext = &returnedContext;
4826         URegexMatchCallback *returnedFn = &testCallBackFn;
4827 
4828         UErrorCode status = U_ZERO_ERROR;
4829         RegexMatcher matcher("x", 0, status);
4830         REGEX_CHECK_STATUS;
4831         matcher.getMatchCallback(returnedFn, returnedContext, status);
4832         REGEX_CHECK_STATUS;
4833         REGEX_ASSERT(returnedFn == NULL);
4834         REGEX_ASSERT(returnedContext == NULL);
4835     }
4836 
4837    {
4838         // Set and Get work
4839         callBackContext cbInfo = {this, 0, 0, 0};
4840         const void          *returnedContext;
4841         URegexMatchCallback *returnedFn;
4842         UErrorCode status = U_ZERO_ERROR;
4843         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status);  // A pattern that can run long.
4844         REGEX_CHECK_STATUS;
4845         matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4846         REGEX_CHECK_STATUS;
4847         matcher.getMatchCallback(returnedFn, returnedContext, status);
4848         REGEX_CHECK_STATUS;
4849         REGEX_ASSERT(returnedFn == testCallBackFn);
4850         REGEX_ASSERT(returnedContext == &cbInfo);
4851 
4852         // A short-running match shouldn't invoke the callback
4853         status = U_ZERO_ERROR;
4854         cbInfo.reset(1);
4855         UnicodeString s = "xxx";
4856         matcher.reset(s);
4857         REGEX_ASSERT(matcher.matches(status));
4858         REGEX_CHECK_STATUS;
4859         REGEX_ASSERT(cbInfo.numCalls == 0);
4860 
4861         // A medium-length match that runs long enough to invoke the
4862         //   callback, but not so long that the callback aborts it.
4863         status = U_ZERO_ERROR;
4864         cbInfo.reset(4);
4865         s = "aaaaaaaaaaaaaaaaaaab";
4866         matcher.reset(s);
4867         REGEX_ASSERT(matcher.matches(status)==FALSE);
4868         REGEX_CHECK_STATUS;
4869         REGEX_ASSERT(cbInfo.numCalls > 0);
4870 
4871         // A longer running match that the callback function will abort.
4872         status = U_ZERO_ERROR;
4873         cbInfo.reset(4);
4874         s = "aaaaaaaaaaaaaaaaaaaaaaab";
4875         matcher.reset(s);
4876         REGEX_ASSERT(matcher.matches(status)==FALSE);
4877         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4878         REGEX_ASSERT(cbInfo.numCalls == 4);
4879 
4880         // A longer running find that the callback function will abort.
4881         status = U_ZERO_ERROR;
4882         cbInfo.reset(4);
4883         s = "aaaaaaaaaaaaaaaaaaaaaaab";
4884         matcher.reset(s);
4885         REGEX_ASSERT(matcher.find(status)==FALSE);
4886         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4887         REGEX_ASSERT(cbInfo.numCalls == 4);
4888     }
4889 
4890 
4891 }
4892 
4893 
//
//   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during a find operation
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
//
4900 
4901 struct progressCallBackContext {
4902     RegexTest        *test;
4903     int64_t          lastIndex;
4904     int32_t          maxCalls;
4905     int32_t          numCalls;
resetprogressCallBackContext4906     void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;}
4907 };
4908 
4909 // call-back function for find().
4910 // Return TRUE to continue the find().
4911 // Return FALSE to stop the find().
4912 U_CDECL_BEGIN
4913 static UBool U_CALLCONV
testProgressCallBackFn(const void * context,int64_t matchIndex)4914 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4915     progressCallBackContext  *info = (progressCallBackContext *)context;
4916     info->numCalls++;
4917     info->lastIndex = matchIndex;
4918 //    info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4919     return (info->numCalls < info->maxCalls);
4920 }
4921 U_CDECL_END
4922 
FindProgressCallbacks()4923 void RegexTest::FindProgressCallbacks() {
4924    {
4925         // Getter returns NULLs if no callback has been set
4926 
4927         //   The variables that the getter will fill in.
4928         //   Init to non-null values so that the action of the getter can be seen.
4929         const void                  *returnedContext = &returnedContext;
4930         URegexFindProgressCallback  *returnedFn = &testProgressCallBackFn;
4931 
4932         UErrorCode status = U_ZERO_ERROR;
4933         RegexMatcher matcher("x", 0, status);
4934         REGEX_CHECK_STATUS;
4935         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4936         REGEX_CHECK_STATUS;
4937         REGEX_ASSERT(returnedFn == NULL);
4938         REGEX_ASSERT(returnedContext == NULL);
4939     }
4940 
4941    {
4942         // Set and Get work
4943         progressCallBackContext cbInfo = {this, 0, 0, 0};
4944         const void                  *returnedContext;
4945         URegexFindProgressCallback  *returnedFn;
4946         UErrorCode status = U_ZERO_ERROR;
4947         RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
4948         REGEX_CHECK_STATUS;
4949         matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4950         REGEX_CHECK_STATUS;
4951         matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4952         REGEX_CHECK_STATUS;
4953         REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4954         REGEX_ASSERT(returnedContext == &cbInfo);
4955 
4956         // A find that matches on the initial position does NOT invoke the callback.
4957         status = U_ZERO_ERROR;
4958         cbInfo.reset(100);
4959         UnicodeString s = "aaxxx";
4960         matcher.reset(s);
4961 #if 0
4962         matcher.setTrace(TRUE);
4963 #endif
4964         REGEX_ASSERT(matcher.find(0, status));
4965         REGEX_CHECK_STATUS;
4966         REGEX_ASSERT(cbInfo.numCalls == 0);
4967 
4968         // A medium running find() that causes matcher.find() to invoke our callback for each index,
4969         //   but not so many times that we interrupt the operation.
4970         status = U_ZERO_ERROR;
4971         s = "aaaaaaaaaaaaaaaaaaab";
4972         cbInfo.reset(s.length()); //  Some upper limit for number of calls that is greater than size of our input string
4973         matcher.reset(s);
4974         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4975         REGEX_CHECK_STATUS;
4976         REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4977 
4978         // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4979         status = U_ZERO_ERROR;
4980         UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4981         cbInfo.reset(s1.length() - 5); //  Bail early somewhere near the end of input string
4982         matcher.reset(s1);
4983         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4984         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4985         REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
4986 
4987         // Now a match that will succeed, but after an interruption
4988         status = U_ZERO_ERROR;
4989         UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
4990         cbInfo.reset(s2.length() - 10); //  Bail early somewhere near the end of input string
4991         matcher.reset(s2);
4992         REGEX_ASSERT(matcher.find(0, status)==FALSE);
4993         REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4994         // Now retry the match from where left off
4995         cbInfo.maxCalls = 100; //  No callback limit
4996         status = U_ZERO_ERROR;
4997         REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
4998         REGEX_CHECK_STATUS;
4999     }
5000 
5001 
5002 }
5003 
5004 
5005 //---------------------------------------------------------------------------
5006 //
5007 //    PreAllocatedUTextCAPI    Check the C API with pre-allocated mutable
5008 //                             UTexts. The pure-C implementation of UText
5009 //                             has no mutable backing stores, but we can
5010 //                             use UnicodeString here to test the functionality.
5011 //
5012 //---------------------------------------------------------------------------
PreAllocatedUTextCAPI()5013 void RegexTest::PreAllocatedUTextCAPI () {
5014     UErrorCode           status = U_ZERO_ERROR;
5015     URegularExpression  *re;
5016     UText                patternText = UTEXT_INITIALIZER;
5017     UnicodeString        buffer;
5018     UText                bufferText = UTEXT_INITIALIZER;
5019 
5020     utext_openUnicodeString(&bufferText, &buffer, &status);
5021 
5022     /*
5023      *  getText() and getUText()
5024      */
5025     {
5026         UText  text1 = UTEXT_INITIALIZER;
5027         UText  text2 = UTEXT_INITIALIZER;
5028         UChar  text2Chars[20];
5029         UText  *resultText;
5030 
5031         status = U_ZERO_ERROR;
5032         regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
5033         regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
5034         u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
5035         utext_openUChars(&text2, text2Chars, -1, &status);
5036 
5037         regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
5038         re = uregex_openUText(&patternText, 0, NULL, &status);
5039 
5040         /* First set a UText */
5041         uregex_setUText(re, &text1, &status);
5042         resultText = uregex_getUText(re, &bufferText, &status);
5043         REGEX_CHECK_STATUS;
5044         REGEX_ASSERT(resultText == &bufferText);
5045         utext_setNativeIndex(resultText, 0);
5046         utext_setNativeIndex(&text1, 0);
5047         REGEX_ASSERT(testUTextEqual(resultText, &text1));
5048 
5049         resultText = uregex_getUText(re, &bufferText, &status);
5050         REGEX_CHECK_STATUS;
5051         REGEX_ASSERT(resultText == &bufferText);
5052         utext_setNativeIndex(resultText, 0);
5053         utext_setNativeIndex(&text1, 0);
5054         REGEX_ASSERT(testUTextEqual(resultText, &text1));
5055 
5056         /* Then set a UChar * */
5057         uregex_setText(re, text2Chars, 7, &status);
5058         resultText = uregex_getUText(re, &bufferText, &status);
5059         REGEX_CHECK_STATUS;
5060         REGEX_ASSERT(resultText == &bufferText);
5061         utext_setNativeIndex(resultText, 0);
5062         utext_setNativeIndex(&text2, 0);
5063         REGEX_ASSERT(testUTextEqual(resultText, &text2));
5064 
5065         uregex_close(re);
5066         utext_close(&text1);
5067         utext_close(&text2);
5068     }
5069 
5070     /*
5071      *  group()
5072      */
5073     {
5074         UChar    text1[80];
5075         UText   *actual;
5076         UBool    result;
5077         int64_t  length = 0;
5078 
5079         u_uastrncpy(text1, "noise abc interior def, and this is off the end",  UPRV_LENGTHOF(text1));
5080         //                  012345678901234567890123456789012345678901234567
5081         //                  0         1         2         3         4
5082 
5083         status = U_ZERO_ERROR;
5084         re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5085         REGEX_CHECK_STATUS;
5086 
5087         uregex_setText(re, text1, -1, &status);
5088         result = uregex_find(re, 0, &status);
5089         REGEX_ASSERT(result==TRUE);
5090 
5091         /*  Capture Group 0, the full match.  Should succeed. "abc interior def" */
5092         status = U_ZERO_ERROR;
5093         actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5094         REGEX_CHECK_STATUS;
5095         REGEX_ASSERT(actual == &bufferText);
5096         REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5097         REGEX_ASSERT(length == 16);
5098         REGEX_ASSERT(utext_nativeLength(actual) == 47);
5099 
5100         /*  Capture group #1.  Should succeed, matching " interior ". */
5101         status = U_ZERO_ERROR;
5102         actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5103         REGEX_CHECK_STATUS;
5104         REGEX_ASSERT(actual == &bufferText);
5105         REGEX_ASSERT(utext_getNativeIndex(actual) == 9);   // position of " interior "
5106         REGEX_ASSERT(length == 10);
5107         REGEX_ASSERT(utext_nativeLength(actual) == 47);
5108 
5109         /*  Capture group out of range.  Error. */
5110         status = U_ZERO_ERROR;
5111         actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5112         REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5113         REGEX_ASSERT(actual == &bufferText);
5114         uregex_close(re);
5115 
5116     }
5117 
5118     /*
5119      *  replaceFirst()
5120      */
5121     {
5122         UChar    text1[80];
5123         UChar    text2[80];
5124         UText    replText = UTEXT_INITIALIZER;
5125         UText   *result;
5126         status = U_ZERO_ERROR;
5127         utext_openUnicodeString(&bufferText, &buffer, &status);
5128 
5129         status = U_ZERO_ERROR;
5130         u_uastrncpy(text1, "Replace xaax x1x x...x.",  UPRV_LENGTHOF(text1));
5131         u_uastrncpy(text2, "No match here.",  UPRV_LENGTHOF(text2)/2);
5132         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5133 
5134         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5135         REGEX_CHECK_STATUS;
5136 
5137         /*  Normal case, with match */
5138         uregex_setText(re, text1, -1, &status);
5139         REGEX_CHECK_STATUS;
5140         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5141         REGEX_CHECK_STATUS;
5142         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5143         REGEX_CHECK_STATUS;
5144         REGEX_ASSERT(result == &bufferText);
5145         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5146 
5147         /* No match.  Text should copy to output with no changes.  */
5148         uregex_setText(re, text2, -1, &status);
5149         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5150         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5151         REGEX_CHECK_STATUS;
5152         REGEX_ASSERT(result == &bufferText);
5153         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5154 
5155         /* Unicode escapes */
5156         uregex_setText(re, text1, -1, &status);
5157         regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5158         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5159         result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5160         REGEX_CHECK_STATUS;
5161         REGEX_ASSERT(result == &bufferText);
5162         REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5163 
5164         uregex_close(re);
5165         utext_close(&replText);
5166     }
5167 
5168 
5169     /*
5170      *  replaceAll()
5171      */
5172     {
5173         UChar    text1[80];
5174         UChar    text2[80];
5175         UText    replText = UTEXT_INITIALIZER;
5176         UText   *result;
5177 
5178         status = U_ZERO_ERROR;
5179         u_uastrncpy(text1, "Replace xaax x1x x...x.",  sizeof(text1)/2);
5180         u_uastrncpy(text2, "No match here.",  sizeof(text2)/2);
5181         regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5182 
5183         re = uregex_openC("x(.*?)x", 0, NULL, &status);
5184         REGEX_CHECK_STATUS;
5185 
5186         /*  Normal case, with match */
5187         uregex_setText(re, text1, -1, &status);
5188         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5189         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5190         REGEX_CHECK_STATUS;
5191         REGEX_ASSERT(result == &bufferText);
5192         REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5193 
5194         /* No match.  Text should copy to output with no changes.  */
5195         uregex_setText(re, text2, -1, &status);
5196         utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5197         result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5198         REGEX_CHECK_STATUS;
5199         REGEX_ASSERT(result == &bufferText);
5200         REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5201 
5202         uregex_close(re);
5203         utext_close(&replText);
5204     }
5205 
5206 
5207     /*
5208      *  splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5209      *   so we don't need to test it here.
5210      */
5211 
5212     utext_close(&bufferText);
5213     utext_close(&patternText);
5214 }
5215 
5216 
5217 //--------------------------------------------------------------
5218 //
5219 //  NamedCapture   Check basic named capture group functionality
5220 //
5221 //--------------------------------------------------------------
NamedCapture()5222 void RegexTest::NamedCapture() {
5223     UErrorCode status = U_ZERO_ERROR;
5224     RegexPattern *pat = RegexPattern::compile(UnicodeString(
5225             "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5226     REGEX_CHECK_STATUS;
5227     int32_t group = pat->groupNumberFromName("five", -1, status);
5228     REGEX_CHECK_STATUS;
5229     REGEX_ASSERT(5 == group);
5230     group = pat->groupNumberFromName("three", -1, status);
5231     REGEX_CHECK_STATUS;
5232     REGEX_ASSERT(3 == group);
5233 
5234     status = U_ZERO_ERROR;
5235     group = pat->groupNumberFromName(UnicodeString("six"), status);
5236     REGEX_CHECK_STATUS;
5237     REGEX_ASSERT(6 == group);
5238 
5239     status = U_ZERO_ERROR;
5240     group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5241     U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5242 
5243     status = U_ZERO_ERROR;
5244 
5245     // After copying a pattern, named capture should still work in the copy.
5246     RegexPattern *copiedPat = new RegexPattern(*pat);
5247     REGEX_ASSERT(*copiedPat == *pat);
5248     delete pat; pat = NULL;  // Delete original, copy should have no references back to it.
5249 
5250     group = copiedPat->groupNumberFromName("five", -1, status);
5251     REGEX_CHECK_STATUS;
5252     REGEX_ASSERT(5 == group);
5253     group = copiedPat->groupNumberFromName("three", -1, status);
5254     REGEX_CHECK_STATUS;
5255     REGEX_ASSERT(3 == group);
5256     delete copiedPat;
5257 
5258     // ReplaceAll with named capture group.
5259     status = U_ZERO_ERROR;
5260     UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5261     RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5262     REGEX_CHECK_STATUS;
5263     // m.pattern().dumpPattern();
5264     UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5265     REGEX_CHECK_STATUS;
5266     REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5267     delete m;
5268 
5269     // ReplaceAll, allowed capture group numbers.
5270     text = UnicodeString("abcmxyz");
5271     m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5272     REGEX_CHECK_STATUS;
5273 
5274     status = U_ZERO_ERROR;
5275     replacedText  = m->replaceAll(UnicodeString("<$0>"), status);   // group 0, full match, is allowed.
5276     REGEX_CHECK_STATUS;
5277     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5278 
5279     status = U_ZERO_ERROR;
5280     replacedText  = m->replaceAll(UnicodeString("<$1>"), status);      // group 1 by number.
5281     REGEX_CHECK_STATUS;
5282     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5283 
5284     status = U_ZERO_ERROR;
5285     replacedText  = m->replaceAll(UnicodeString("<${one}>"), status);   // group 1 by name.
5286     REGEX_CHECK_STATUS;
5287     REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5288 
5289     status = U_ZERO_ERROR;
5290     replacedText  = m->replaceAll(UnicodeString("<$2>"), status);   // group 2.
5291     REGEX_CHECK_STATUS;
5292     REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5293 
5294     status = U_ZERO_ERROR;
5295     replacedText  = m->replaceAll(UnicodeString("<$3>"), status);
5296     REGEX_CHECK_STATUS;
5297     REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5298 
5299     status = U_ZERO_ERROR;
5300     replacedText  = m->replaceAll(UnicodeString("<$4>"), status);
5301     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5302 
5303     status = U_ZERO_ERROR;
5304     replacedText  = m->replaceAll(UnicodeString("<$04>"), status);      // group 0, leading 0,
5305     REGEX_CHECK_STATUS;                                                 //    trailing out-of-range 4 passes through.
5306     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5307 
5308     status = U_ZERO_ERROR;
5309     replacedText  = m->replaceAll(UnicodeString("<$000016>"), status);  // Consume leading zeroes. Don't consume digits
5310     REGEX_CHECK_STATUS;                                                 //   that push group num out of range.
5311     REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText);              //   This is group 1.
5312 
5313     status = U_ZERO_ERROR;
5314     replacedText  = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5315     REGEX_CHECK_STATUS;
5316     REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5317 
5318     status = U_ZERO_ERROR;
5319     replacedText  = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5320     REGEX_CHECK_STATUS;
5321     REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5322 
5323     status = U_ZERO_ERROR;
5324     replacedText  = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5325     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5326 
5327     status = U_ZERO_ERROR;
5328     replacedText  = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5329     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5330 
5331     status = U_ZERO_ERROR;
5332     replacedText  = m->replaceAll(UnicodeString("<${one"), status);
5333     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5334 
5335     status = U_ZERO_ERROR;
5336     replacedText  = m->replaceAll(UnicodeString("$not a capture group"), status);
5337     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5338 
5339     delete m;
5340 
5341     // Repeat the above replaceAll() tests using the plain C API, which
5342     //  has a separate implementation internally.
5343     //  TODO: factor out the test data.
5344 
5345     status = U_ZERO_ERROR;
5346     URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5347     REGEX_CHECK_STATUS;
5348     text = UnicodeString("abcmxyz");
5349     uregex_setText(re, text.getBuffer(), text.length(), &status);
5350     REGEX_CHECK_STATUS;
5351 
5352     UChar resultBuf[100];
5353     int32_t resultLength;
5354     UnicodeString repl;
5355 
5356     status = U_ZERO_ERROR;
5357     repl = UnicodeString("<$0>");
5358     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5359     REGEX_CHECK_STATUS;
5360     REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5361 
5362     status = U_ZERO_ERROR;
5363     repl = UnicodeString("<$1>");
5364     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5365     REGEX_CHECK_STATUS;
5366     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5367 
5368     status = U_ZERO_ERROR;
5369     repl = UnicodeString("<${one}>");
5370     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5371     REGEX_CHECK_STATUS;
5372     REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5373 
5374     status = U_ZERO_ERROR;
5375     repl = UnicodeString("<$2>");
5376     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5377     REGEX_CHECK_STATUS;
5378     REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5379 
5380     status = U_ZERO_ERROR;
5381     repl = UnicodeString("<$3>");
5382     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5383     REGEX_CHECK_STATUS;
5384     REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5385 
5386     status = U_ZERO_ERROR;
5387     repl = UnicodeString("<$4>");
5388     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5389     REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5390 
5391     status = U_ZERO_ERROR;
5392     repl = UnicodeString("<$04>");
5393     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5394     REGEX_CHECK_STATUS;
5395     REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5396 
5397     status = U_ZERO_ERROR;
5398     repl = UnicodeString("<$000016>");
5399     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5400     REGEX_CHECK_STATUS;
5401     REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5402 
5403     status = U_ZERO_ERROR;
5404     repl = UnicodeString("<$3$2$1${one}>");
5405     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5406     REGEX_CHECK_STATUS;
5407     REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5408 
5409     status = U_ZERO_ERROR;
5410     repl = UnicodeString("$3$2$1${one}");
5411     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5412     REGEX_CHECK_STATUS;
5413     REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5414 
5415     status = U_ZERO_ERROR;
5416     repl = UnicodeString("<${noSuchName}>");
5417     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5418     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5419 
5420     status = U_ZERO_ERROR;
5421     repl = UnicodeString("<${invalid-name}>");
5422     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5423     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5424 
5425     status = U_ZERO_ERROR;
5426     repl = UnicodeString("<${one");
5427     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5428     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5429 
5430     status = U_ZERO_ERROR;
5431     repl = UnicodeString("$not a capture group");
5432     resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5433     REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5434 
5435     uregex_close(re);
5436 }
5437 
5438 //--------------------------------------------------------------
5439 //
5440 //  NamedCaptureLimits   Patterns with huge numbers of named capture groups.
5441 //                       The point is not so much what the exact limit is,
5442 //                       but that a largish number doesn't hit bad non-linear performance,
5443 //                       and that exceeding the limit fails cleanly.
5444 //
5445 //--------------------------------------------------------------
NamedCaptureLimits()5446 void RegexTest::NamedCaptureLimits() {
5447     if (quick) {
5448         logln("Skipping test. Runs in exhuastive mode only.");
5449         return;
5450     }
5451     const int32_t goodLimit = 1000000;     // Pattern w this many groups builds successfully.
5452     const int32_t failLimit = 10000000;    // Pattern exceeds internal limits, fails to compile.
5453     char nnbuf[100];
5454     UnicodeString pattern;
5455     int32_t nn;
5456 
5457     for (nn=1; nn<goodLimit; nn++) {
5458         sprintf(nnbuf, "(?<nn%d>)", nn);
5459         pattern.append(UnicodeString(nnbuf, -1, US_INV));
5460     }
5461     UErrorCode status = U_ZERO_ERROR;
5462     RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5463     REGEX_CHECK_STATUS;
5464     for (nn=1; nn<goodLimit; nn++) {
5465         sprintf(nnbuf, "nn%d", nn);
5466         int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5467         REGEX_ASSERT(nn == groupNum);
5468         if (nn != groupNum) {
5469             break;
5470         }
5471     }
5472     delete pat;
5473 
5474     pattern.remove();
5475     for (nn=1; nn<failLimit; nn++) {
5476         sprintf(nnbuf, "(?<nn%d>)", nn);
5477         pattern.append(UnicodeString(nnbuf, -1, US_INV));
5478     }
5479     status = U_ZERO_ERROR;
5480     pat = RegexPattern::compile(pattern, 0, status);
5481     REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5482     delete pat;
5483 }
5484 
5485 
5486 //--------------------------------------------------------------
5487 //
5488 //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
5489 //
5490 //---------------------------------------------------------------
Bug7651()5491 void RegexTest::Bug7651() {
5492     UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5493     //  The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5494     //  It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5495     UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5496     UnicodeString s("#ff @abcd This is test");
5497     RegexPattern  *REPattern = NULL;
5498     RegexMatcher  *REMatcher = NULL;
5499     UErrorCode status = U_ZERO_ERROR;
5500     UParseError pe;
5501 
5502     REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5503     REGEX_CHECK_STATUS;
5504     REMatcher = REPattern->matcher(s, status);
5505     REGEX_CHECK_STATUS;
5506     REGEX_ASSERT(REMatcher->find());
5507     REGEX_ASSERT(REMatcher->start(status) == 0);
5508     delete REPattern;
5509     delete REMatcher;
5510     status = U_ZERO_ERROR;
5511 
5512     REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5513     REGEX_CHECK_STATUS;
5514     REMatcher = REPattern->matcher(s, status);
5515     REGEX_CHECK_STATUS;
5516     REGEX_ASSERT(REMatcher->find());
5517     REGEX_ASSERT(REMatcher->start(status) == 0);
5518     delete REPattern;
5519     delete REMatcher;
5520     status = U_ZERO_ERROR;
5521  }
5522 
Bug7740()5523 void RegexTest::Bug7740() {
5524     UErrorCode status = U_ZERO_ERROR;
5525     UnicodeString pattern = "(a)";
5526     UnicodeString text = "abcdef";
5527     RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5528     REGEX_CHECK_STATUS;
5529     REGEX_ASSERT(m->lookingAt(status));
5530     REGEX_CHECK_STATUS;
5531     status = U_ILLEGAL_ARGUMENT_ERROR;
5532     UnicodeString s = m->group(1, status);    // Bug 7740: segfault here.
5533     REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5534     REGEX_ASSERT(s == "");
5535     delete m;
5536 }
5537 
// Bug 8479:  was crashing with a Bogus UnicodeString as input.
5539 
Bug8479()5540 void RegexTest::Bug8479() {
5541     UErrorCode status = U_ZERO_ERROR;
5542 
5543     RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5544     REGEX_CHECK_STATUS;
5545     if (U_SUCCESS(status))
5546     {
5547         UnicodeString str;
5548         str.setToBogus();
5549         pMatcher->reset(str);
5550         status = U_ZERO_ERROR;
5551         pMatcher->matches(status);
5552         REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5553         delete pMatcher;
5554     }
5555 }
5556 
5557 
5558 // Bug 7029
Bug7029()5559 void RegexTest::Bug7029() {
5560     UErrorCode status = U_ZERO_ERROR;
5561 
5562     RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5563     UnicodeString text = "abc.def";
5564     UnicodeString splits[10];
5565     REGEX_CHECK_STATUS;
5566     int32_t numFields = pMatcher->split(text, splits, 10, status);
5567     REGEX_CHECK_STATUS;
5568     REGEX_ASSERT(numFields == 8);
5569     delete pMatcher;
5570 }
5571 
5572 // Bug 9283
//   This test is checking for the existence of any supplemental characters that case-fold
//   to a bmp character.
//
//   At the time of this writing there are none. If any should appear in a subsequent release
//   of Unicode, the code in regular expressions compilation that determines the longest
//   possible match for a literal string  will need to be enhanced.
5579 //
5580 //   See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5581 //   for details on what to do in case of a failure of this test.
5582 //
Bug9283()5583 void RegexTest::Bug9283() {
5584 #if !UCONFIG_NO_NORMALIZATION
5585     UErrorCode status = U_ZERO_ERROR;
5586     UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5587     REGEX_CHECK_STATUS;
5588     int32_t index;
5589     UChar32 c;
5590     for (index=0; ; index++) {
5591         c = supplementalsWithCaseFolding.charAt(index);
5592         if (c == -1) {
5593             break;
5594         }
5595         UnicodeString cf = UnicodeString(c).foldCase();
5596         REGEX_ASSERT(cf.length() >= 2);
5597     }
5598 #endif /* #if !UCONFIG_NO_NORMALIZATION */
5599 }
5600 
5601 
CheckInvBufSize()5602 void RegexTest::CheckInvBufSize() {
5603   if(inv_next>=INV_BUFSIZ) {
5604     errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5605           __FILE__, INV_BUFSIZ, inv_next);
5606   } else {
5607     logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5608   }
5609 }
5610 
5611 
Bug10459()5612 void RegexTest::Bug10459() {
5613     UErrorCode status = U_ZERO_ERROR;
5614     UnicodeString patternString("(txt)");
5615     UnicodeString txtString("txt");
5616 
5617     UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5618     REGEX_CHECK_STATUS;
5619     UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5620     REGEX_CHECK_STATUS;
5621 
5622     URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5623     REGEX_CHECK_STATUS;
5624 
5625     uregex_setUText(icu_re, utext_txt, &status);
5626     REGEX_CHECK_STATUS;
5627 
5628     // The bug was that calling uregex_group() before doing a matching operation
5629     //   was causing a segfault. Only for Regular Expressions created from UText.
5630     //   It should set an U_REGEX_INVALID_STATE.
5631 
5632     UChar buf[100];
5633     int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5634     REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5635     REGEX_ASSERT(len == 0);
5636 
5637     uregex_close(icu_re);
5638     utext_close(utext_pat);
5639     utext_close(utext_txt);
5640 }
5641 
TestCaseInsensitiveStarters()5642 void RegexTest::TestCaseInsensitiveStarters() {
5643     // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5644     //  become stale because of new Unicode characters.
5645     // If it is stale, rerun the generation tool
5646     //    svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5647     // and replace the embedded data in i18n/regexcmp.cpp
5648 
5649     for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5650         if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5651             continue;
5652         }
5653         UnicodeSet s(cp, cp);
5654         s.closeOver(USET_CASE_INSENSITIVE);
5655         UnicodeSetIterator setIter(s);
5656         while (setIter.next()) {
5657             if (!setIter.isString()) {
5658                 continue;
5659             }
5660             const UnicodeString &str = setIter.getString();
5661             UChar32 firstChar = str.char32At(0);
5662             UnicodeSet starters;
5663             RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5664             if (!starters.contains(cp)) {
5665                 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5666                 return;
5667             }
5668         }
5669     }
5670 }
5671 
5672 
TestBug11049()5673 void RegexTest::TestBug11049() {
5674     // Original bug report: pattern with match start consisting of one of several individual characters,
5675     //  and the text being matched ending with a supplementary character. find() would read past the
5676     //  end of the input text when searching for potential match starting points.
5677 
5678     // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5679     // detect the bad read.
5680 
5681     TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5682     TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5683 
5684     // Test again with a pattern starting with a single character,
5685     // which takes a different code path than starting with an OR expression,
5686     // but with similar logic.
5687     TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5688     TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5689 }
5690 
5691 // Run a single test case from TestBug11049(). Internal function.
TestCase11049(const char * pattern,const char * data,UBool expectMatch,int32_t lineNumber)5692 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5693     UErrorCode status = U_ZERO_ERROR;
5694     UnicodeString patternString = UnicodeString(pattern).unescape();
5695     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5696 
5697     UnicodeString dataString = UnicodeString(data).unescape();
5698     UChar *exactBuffer = new UChar[dataString.length()];
5699     dataString.extract(exactBuffer, dataString.length(), status);
5700     UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5701 
5702     LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5703     REGEX_CHECK_STATUS;
5704     matcher->reset(ut);
5705     UBool result = matcher->find();
5706     if (result != expectMatch) {
5707         errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5708               __FILE__, lineNumber, expectMatch, result, pattern, data);
5709     }
5710 
5711     // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5712     //   off-by-one on find() with match at the last code point.
5713     //   Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5714     //   because string.unescape() will only shrink it.
5715     char * utf8Buffer = new char[uprv_strlen(data)+1];
5716     u_strToUTF8(utf8Buffer, static_cast<int32_t>(uprv_strlen(data)+1), NULL, dataString.getBuffer(), dataString.length(), &status);
5717     REGEX_CHECK_STATUS;
5718     ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5719     REGEX_CHECK_STATUS;
5720     matcher->reset(ut);
5721     result = matcher->find();
5722     if (result != expectMatch) {
5723         errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5724               __FILE__, lineNumber, expectMatch, result, pattern, data);
5725     }
5726     delete [] utf8Buffer;
5727 
5728     utext_close(ut);
5729     delete [] exactBuffer;
5730 }
5731 
5732 
TestBug11371()5733 void RegexTest::TestBug11371() {
5734     if (quick) {
5735         logln("Skipping test. Runs in exhuastive mode only.");
5736         return;
5737     }
5738     UErrorCode status = U_ZERO_ERROR;
5739     UnicodeString patternString;
5740 
5741     for (int i=0; i<8000000; i++) {
5742         patternString.append(UnicodeString("()"));
5743     }
5744     LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5745     if (status != U_REGEX_PATTERN_TOO_BIG) {
5746         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5747               __FILE__, __LINE__, u_errorName(status));
5748     }
5749 
5750     status = U_ZERO_ERROR;
5751     patternString = "(";
5752     for (int i=0; i<20000000; i++) {
5753         patternString.append(UnicodeString("A++"));
5754     }
5755     patternString.append(UnicodeString("){0}B++"));
5756     LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5757     if (status != U_REGEX_PATTERN_TOO_BIG) {
5758         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5759               __FILE__, __LINE__, u_errorName(status));
5760     }
5761 
5762     // Pattern with too much string data, such that string indexes overflow operand data field size
5763     // in compiled instruction.
5764     status = U_ZERO_ERROR;
5765     patternString = "";
5766     while (patternString.length() < 0x00ffffff) {
5767         patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5768     }
5769     patternString.append(UnicodeString("X? trailing string"));
5770     LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5771     if (status != U_REGEX_PATTERN_TOO_BIG) {
5772         errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5773               __FILE__, __LINE__, u_errorName(status));
5774     }
5775 }
5776 
TestBug11480()5777 void RegexTest::TestBug11480() {
5778     // C API, get capture group of a group that does not participate in the match.
5779     //        (Returns a zero length string, with nul termination,
5780     //         indistinguishable from a group with a zero length match.)
5781 
5782     UErrorCode status = U_ZERO_ERROR;
5783     URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5784     REGEX_CHECK_STATUS;
5785     UnicodeString text = UNICODE_STRING_SIMPLE("A");
5786     uregex_setText(re, text.getBuffer(), text.length(), &status);
5787     REGEX_CHECK_STATUS;
5788     REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5789     UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5790     int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5791     REGEX_ASSERT(length == 0);
5792     REGEX_ASSERT(buf[0] == 13);
5793     REGEX_ASSERT(buf[1] == 0);
5794     REGEX_ASSERT(buf[2] == 13);
5795     uregex_close(re);
5796 
5797     // UText C++ API, length of match is 0 for non-participating matches.
5798     UText ut = UTEXT_INITIALIZER;
5799     utext_openUnicodeString(&ut, &text, &status);
5800     RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5801     REGEX_CHECK_STATUS;
5802     matcher.reset(&ut);
5803     REGEX_ASSERT(matcher.lookingAt(0, status));
5804 
5805     // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5806     int64_t groupLen = -666;
5807     UText group = UTEXT_INITIALIZER;
5808     matcher.group(1, &group, groupLen, status);
5809     REGEX_CHECK_STATUS;
5810     REGEX_ASSERT(groupLen == 1);
5811     REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5812 
5813     // Capture group 2, the (B), does not participate in the match.
5814     matcher.group(2, &group, groupLen, status);
5815     REGEX_CHECK_STATUS;
5816     REGEX_ASSERT(groupLen == 0);
5817     REGEX_ASSERT(matcher.start(2, status) == -1);
5818     REGEX_CHECK_STATUS;
5819 }
5820 
TestBug12884()5821 void RegexTest::TestBug12884() {
5822     // setTimeLimit() was not effective for empty sub-patterns with large {minimum counts}
5823     UnicodeString pattern(u"(((((((){120}){11}){11}){11}){80}){11}){4}");
5824     UnicodeString text(u"hello");
5825     UErrorCode status = U_ZERO_ERROR;
5826     RegexMatcher m(pattern, text, 0, status);
5827     REGEX_CHECK_STATUS;
5828     m.setTimeLimit(5, status);
5829     m.find(status);
5830     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5831 
5832     // Non-greedy loops. They take a different code path during matching.
5833     UnicodeString ngPattern(u"(((((((){120}?){11}?){11}?){11}?){80}?){11}?){4}?");
5834     status = U_ZERO_ERROR;
5835     RegexMatcher ngM(ngPattern, text, 0, status);
5836     REGEX_CHECK_STATUS;
5837     ngM.setTimeLimit(5, status);
5838     ngM.find(status);
5839     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5840 
5841     // UText, wrapping non-UTF-16 text, also takes a different execution path.
5842     StringPiece text8(u8"¿Qué es Unicode?  Unicode proporciona un número único para cada"
5843                           "carácter, sin importar la plataforma, sin importar el programa,"
5844                           "sin importar el idioma.");
5845     status = U_ZERO_ERROR;
5846     LocalUTextPointer ut(utext_openUTF8(NULL, text8.data(), text8.length(), &status));
5847     REGEX_CHECK_STATUS;
5848     m.reset(ut.getAlias());
5849     m.find(status);
5850     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5851 
5852     status = U_ZERO_ERROR;
5853     ngM.reset(ut.getAlias());
5854     ngM.find(status);
5855     REGEX_ASSERT(status == U_REGEX_TIME_OUT);
5856 }
5857 
// Bug 13631. A find() of a pattern with a zero length look-behind assertion
//            can cause a read past the end of the input text.
//            The failure is seen when running this test with Clang's Address Sanitizer.
5861 
TestBug13631()5862 void RegexTest::TestBug13631() {
5863     const UChar *pats[] = { u"(?<!^)",
5864                             u"(?<=^)",
5865                             nullptr
5866                           };
5867     for (const UChar **pat=pats; *pat; ++pat) {
5868         UErrorCode status = U_ZERO_ERROR;
5869         UnicodeString upat(*pat);
5870         RegexMatcher matcher(upat, 0, status);
5871         const UChar s =u'a';
5872         UText *ut = utext_openUChars(nullptr, &s, 1, &status);
5873         REGEX_CHECK_STATUS;
5874         matcher.reset(ut);
5875         while (matcher.find()) {
5876         }
5877         utext_close(ut);
5878     }
5879 }
5880 
5881 // Bug 13632 Out of bounds memory reference if a replacement string ends with a '$',
5882 //           where a following group specification would be expected.
5883 //           Failure shows when running the test under Clang's Address Sanitizer.
5884 
TestBug13632()5885 void RegexTest::TestBug13632() {
5886     UErrorCode status = U_ZERO_ERROR;
5887     URegularExpression *re = uregex_openC(" ", 0, nullptr, &status);
5888     const char16_t *sourceString = u"Hello, world.";
5889     uregex_setText(re, sourceString, u_strlen(sourceString), &status);
5890 
5891     const int32_t destCap = 20;
5892     char16_t dest[destCap] = {};
5893     const char16_t replacement[] = {u'x', u'$'};    // Not nul terminated string.
5894     uregex_replaceAll(re, replacement, 2, dest, destCap, &status);
5895 
5896     assertEquals("", U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5897     uregex_close(re);
5898 }
5899 
TestBug20359()5900 void RegexTest::TestBug20359() {
5901     // The bug was stack overflow while parsing a pattern with a huge number of adjacent \Q\E
5902     // pairs. (Enter and exit pattern literal quote mode). Logic was correct.
5903     // Changed implementation to loop instead of recursing.
5904 
5905     UnicodeString pattern;
5906     for (int i=0; i<50000; ++i) {
5907         pattern += u"\\Q\\E";
5908     }
5909     pattern += u"x";
5910 
5911     UErrorCode status = U_ZERO_ERROR;
5912     LocalURegularExpressionPointer re(uregex_open(pattern.getBuffer(), pattern.length(),
5913                                        0, nullptr, &status));
5914     assertSuccess(WHERE, status);
5915 
5916     // We have passed the point where the bug crashed. The following is a small sanity
5917     // check that the pattern works, that all the \Q\E\Q\E... didn't cause other problems.
5918 
5919     uregex_setText(re.getAlias(), u"abcxyz", -1, &status);
5920     assertSuccess(WHERE, status);
5921     assertTrue(WHERE, uregex_find(re.getAlias(), 0, &status));
5922     assertEquals(WHERE, 3, uregex_start(re.getAlias(), 0, &status));
5923     assertSuccess(WHERE, status);
5924 }
5925 
5926 
TestBug20863()5927 void RegexTest::TestBug20863() {
5928     // Test that patterns with a large number of named capture groups work correctly.
5929     //
5930     // The ticket was not for a bug per se, but to reduce memory usage by using lazy
5931     // construction of the map from capture names to numbers, and decreasing the
5932     // default size of the map.
5933 
5934     constexpr int GROUP_COUNT = 2000;
5935     std::vector<UnicodeString> groupNames;
5936     for (int32_t i=0; i<GROUP_COUNT; ++i) {
5937         UnicodeString name;
5938         name.append(u"name");
5939         name.append(Int64ToUnicodeString(i));
5940         groupNames.push_back(name);
5941     }
5942 
5943     UnicodeString patternString;
5944     for (UnicodeString name: groupNames) {
5945         patternString.append(u"(?<");
5946         patternString.append(name);
5947         patternString.append(u">.)");
5948     }
5949 
5950     UErrorCode status = U_ZERO_ERROR;
5951     UParseError pe;
5952     LocalPointer<RegexPattern> pattern(RegexPattern::compile(patternString, pe, status), status);
5953     if (!assertSuccess(WHERE, status)) {
5954         return;
5955     }
5956 
5957     for (int32_t i=0; i<GROUP_COUNT; ++i) {
5958         int32_t group = pattern->groupNumberFromName(groupNames[i], status);
5959         if (!assertSuccess(WHERE, status)) {
5960             return;
5961         }
5962         assertEquals(WHERE, i+1, group);
5963         // Note: group 0 is the overall match; group 1 is the first separate capture group.
5964     }
5965 
5966     // Verify that assignment of patterns with various combinations of named capture work.
5967     // Lazy creation of the internal named capture map changed the implementation logic here.
5968     {
5969         LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
5970         LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
5971         assertSuccess(WHERE, status);
5972         assertFalse(WHERE, *pat1 == *pat2);
5973         *pat1 = *pat2;
5974         assertTrue(WHERE, *pat1 == *pat2);
5975         assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name", status));
5976         assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name", status));
5977         assertSuccess(WHERE, status);
5978     }
5979 
5980     {
5981         LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"abc", pe, status), status);
5982         LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name>b)c", pe, status), status);
5983         assertSuccess(WHERE, status);
5984         assertFalse(WHERE, *pat1 == *pat2);
5985         *pat2 = *pat1;
5986         assertTrue(WHERE, *pat1 == *pat2);
5987         assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name", status));
5988         assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5989         status = U_ZERO_ERROR;
5990         assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name", status));
5991         assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
5992         status = U_ZERO_ERROR;
5993     }
5994 
5995     {
5996         LocalPointer<RegexPattern> pat1(RegexPattern::compile(u"a(?<name1>b)c", pe, status), status);
5997         LocalPointer<RegexPattern> pat2(RegexPattern::compile(u"a(?<name2>b)c", pe, status), status);
5998         assertSuccess(WHERE, status);
5999         assertFalse(WHERE, *pat1 == *pat2);
6000         *pat2 = *pat1;
6001         assertTrue(WHERE, *pat1 == *pat2);
6002         assertEquals(WHERE, 1, pat1->groupNumberFromName(u"name1", status));
6003         assertSuccess(WHERE, status);
6004         assertEquals(WHERE, 1, pat2->groupNumberFromName(u"name1", status));
6005         assertSuccess(WHERE, status);
6006         assertEquals(WHERE, 0, pat1->groupNumberFromName(u"name2", status));
6007         assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
6008         status = U_ZERO_ERROR;
6009         assertEquals(WHERE, 0, pat2->groupNumberFromName(u"name2", status));
6010         assertEquals(WHERE, U_REGEX_INVALID_CAPTURE_GROUP_NAME, status);
6011         status = U_ZERO_ERROR;
6012     }
6013 
6014 }
6015 
6016 
6017 #endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
6018