1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /********************************************************************
4 * COPYRIGHT:
5 * Copyright (c) 2002-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 ********************************************************************/
8
9 //
10 // regextst.cpp
11 //
12 // ICU Regular Expressions test, part of intltest.
13 //
14
15 /*
16 NOTE!!
17
18 PLEASE be careful about ASCII assumptions in this test.
19 This test is one of the worst repeat offenders.
20 If you have questions, contact someone on the ICU PMC
21 who has access to an EBCDIC system.
22
23 */
24
25 #include "intltest.h"
26 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
27
28 #include <stdlib.h>
29 #include <stdio.h>
30 #include <string.h>
31
32 #include "unicode/localpointer.h"
33 #include "unicode/regex.h"
34 #include "unicode/uchar.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uniset.h"
37 #include "unicode/uregex.h"
38 #include "unicode/usetiter.h"
39 #include "unicode/ustring.h"
40 #include "unicode/utext.h"
41
42 #include "regextst.h"
43 #include "regexcmp.h"
44 #include "uvector.h"
45 #include "util.h"
46 #include "cmemory.h"
47 #include "cstring.h"
48 #include "uinvchar.h"
49
50 #define SUPPORT_MUTATING_INPUT_STRING 0
51
52 //---------------------------------------------------------------------------
53 //
54 // Test class boilerplate
55 //
56 //---------------------------------------------------------------------------
RegexTest()57 RegexTest::RegexTest()
58 {
59 }
60
61
~RegexTest()62 RegexTest::~RegexTest()
63 {
64 }
65
66
67
runIndexedTest(int32_t index,UBool exec,const char * & name,char *)68 void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char* /*par*/ )
69 {
70 if (exec) logln("TestSuite RegexTest: ");
71 switch (index) {
72
73 case 0: name = "Basic";
74 if (exec) Basic();
75 break;
76 case 1: name = "API_Match";
77 if (exec) API_Match();
78 break;
79 case 2: name = "API_Replace";
80 if (exec) API_Replace();
81 break;
82 case 3: name = "API_Pattern";
83 if (exec) API_Pattern();
84 break;
85 case 4:
86 #if !UCONFIG_NO_FILE_IO
87 name = "Extended";
88 if (exec) Extended();
89 #else
90 name = "skip";
91 #endif
92 break;
93 case 5: name = "Errors";
94 if (exec) Errors();
95 break;
96 case 6: name = "PerlTests";
97 if (exec) PerlTests();
98 break;
99 case 7: name = "Callbacks";
100 if (exec) Callbacks();
101 break;
102 case 8: name = "FindProgressCallbacks";
103 if (exec) FindProgressCallbacks();
104 break;
105 case 9: name = "Bug 6149";
106 if (exec) Bug6149();
107 break;
108 case 10: name = "UTextBasic";
109 if (exec) UTextBasic();
110 break;
111 case 11: name = "API_Match_UTF8";
112 if (exec) API_Match_UTF8();
113 break;
114 case 12: name = "API_Replace_UTF8";
115 if (exec) API_Replace_UTF8();
116 break;
117 case 13: name = "API_Pattern_UTF8";
118 if (exec) API_Pattern_UTF8();
119 break;
120 case 14: name = "PerlTestsUTF8";
121 if (exec) PerlTestsUTF8();
122 break;
123 case 15: name = "PreAllocatedUTextCAPI";
124 if (exec) PreAllocatedUTextCAPI();
125 break;
126 case 16: name = "Bug 7651";
127 if (exec) Bug7651();
128 break;
129 case 17: name = "Bug 7740";
130 if (exec) Bug7740();
131 break;
132 case 18: name = "Bug 8479";
133 if (exec) Bug8479();
134 break;
135 case 19: name = "Bug 7029";
136 if (exec) Bug7029();
137 break;
138 case 20: name = "CheckInvBufSize";
139 if (exec) CheckInvBufSize();
140 break;
141 case 21: name = "Bug 9283";
142 if (exec) Bug9283();
143 break;
144 case 22: name = "Bug10459";
145 if (exec) Bug10459();
146 break;
147 case 23: name = "TestCaseInsensitiveStarters";
148 if (exec) TestCaseInsensitiveStarters();
149 break;
150 case 24: name = "TestBug11049";
151 if (exec) TestBug11049();
152 break;
153 case 25: name = "TestBug11371";
154 if (exec) TestBug11371();
155 break;
156 case 26: name = "TestBug11480";
157 if (exec) TestBug11480();
158 break;
159 case 27: name = "NamedCapture";
160 if (exec) NamedCapture();
161 break;
162 case 28: name = "NamedCaptureLimits";
163 if (exec) NamedCaptureLimits();
164 break;
165 default: name = "";
166 break; //needed to end loop
167 }
168 }
169
170
171
172 /**
173 * Calls utext_openUTF8 after, potentially, converting invariant text from the compilation codepage
174 * into ASCII.
175 * @see utext_openUTF8
176 */
177 static UText* regextst_openUTF8FromInvariant(UText* ut, const char *inv, int64_t length, UErrorCode *status);
178
179 //---------------------------------------------------------------------------
180 //
181 // Error Checking / Reporting macros used in all of the tests.
182 //
183 //---------------------------------------------------------------------------
184
utextToPrintable(char * buf,int32_t bufLen,UText * text)185 static void utextToPrintable(char *buf, int32_t bufLen, UText *text) {
186 int64_t oldIndex = utext_getNativeIndex(text);
187 utext_setNativeIndex(text, 0);
188 char *bufPtr = buf;
189 UChar32 c = utext_next32From(text, 0);
190 while ((c != U_SENTINEL) && (bufPtr < buf+bufLen)) {
191 if (0x000020<=c && c<0x00007e) {
192 *bufPtr = c;
193 } else {
194 #if 0
195 sprintf(bufPtr,"U+%04X", c);
196 bufPtr+= strlen(bufPtr)-1;
197 #else
198 *bufPtr = '%';
199 #endif
200 }
201 bufPtr++;
202 c = UTEXT_NEXT32(text);
203 }
204 *bufPtr = 0;
205 #if (U_CHARSET_FAMILY==U_EBCDIC_FAMILY)
206 char *ebuf = (char*)malloc(bufLen);
207 uprv_eastrncpy((unsigned char*)ebuf, (const unsigned char*)buf, bufLen);
208 uprv_strncpy(buf, ebuf, bufLen);
209 free((void*)ebuf);
210 #endif
211 utext_setNativeIndex(text, oldIndex);
212 }
213
214
215 static char ASSERT_BUF[1024];
216
extractToAssertBuf(const UnicodeString & message)217 const char* RegexTest::extractToAssertBuf(const UnicodeString& message) {
218 if(message.length()==0) {
219 strcpy(ASSERT_BUF, "[[empty UnicodeString]]");
220 } else {
221 UnicodeString buf;
222 IntlTest::prettify(message,buf);
223 if(buf.length()==0) {
224 strcpy(ASSERT_BUF, "[[escape() returned 0 chars]]");
225 } else {
226 buf.extract(0, 0x7FFFFFFF, ASSERT_BUF, sizeof(ASSERT_BUF)-1);
227 if(ASSERT_BUF[0]==0) {
228 ASSERT_BUF[0]=0;
229 for(int32_t i=0;i<buf.length();i++) {
230 UChar ch = buf[i];
231 sprintf(ASSERT_BUF+strlen(ASSERT_BUF),"\\u%02x",ch);
232 }
233 }
234 }
235 }
236 ASSERT_BUF[sizeof(ASSERT_BUF)-1] = 0;
237 return ASSERT_BUF;
238 }
239
// Logs the content of a UText (rendered with utextToPrintable) together with
// the source location and the spelling of the argument.
#define REGEX_VERBOSE_TEXT(text) { \
    char buf[200]; \
    utextToPrintable(buf,UPRV_LENGTHOF(buf),text); \
    logln("%s:%d: UText %s=\"%s\"", __FILE__, __LINE__, #text, buf); \
}

// Reports a data error and returns from the enclosing function when the local
// variable `status` holds a failure code.
#define REGEX_CHECK_STATUS { \
    if (U_FAILURE(status)) { \
        dataerrln("%s:%d: RegexTest failure. status=%s", \
                  __FILE__, __LINE__, u_errorName(status)); \
        return; \
    } \
}

// Reports an error when expr evaluates to FALSE; execution continues.
#define REGEX_ASSERT(expr) { \
    if ((expr)==FALSE) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT(%s) failed \n", __FILE__, __LINE__, #expr); \
    } \
}

// Evaluates expr with a fresh local `status` (shadowing any outer one) and
// reports a data error unless that status ends up equal to errcode.
#define REGEX_ASSERT_FAIL(expr, errcode) { \
    UErrorCode status=U_ZERO_ERROR; \
    (expr); \
    if (status!=errcode) { \
        dataerrln("RegexTest failure at line %d. Expected status=%s, got %s", \
                  __LINE__, u_errorName(errcode), u_errorName(status)); \
    } \
}

// Like REGEX_CHECK_STATUS, but also reports the caller-supplied line and does
// not return from the enclosing function.
#define REGEX_CHECK_STATUS_L(line) { \
    if (U_FAILURE(status)) { \
        errln("RegexTest failure at line %d, from %d. status=%d\n",__LINE__, (line), status); \
    } \
}

// Reports an error naming the caller-supplied line and returns from the
// enclosing function when expr evaluates to FALSE.
#define REGEX_ASSERT_L(expr, line) { \
    if ((expr)==FALSE) { \
        errln("RegexTest failure at line %d, from %d.", __LINE__, (line)); \
        return; \
    } \
}

// expected: const char * , restricted to invariant characters.
// actual: const UnicodeString &
// Reports an error when the two strings differ.
#define REGEX_ASSERT_UNISTR(expected, actual) { \
    if (UnicodeString(expected, -1, US_INV) != (actual)) { \
        errln("%s:%d: RegexTest failure: REGEX_ASSERT_UNISTR(%s, %s) failed \n", \
              __FILE__, __LINE__, expected, extractToAssertBuf(actual)); \
    } \
}
263
264
testUTextEqual(UText * uta,UText * utb)265 static UBool testUTextEqual(UText *uta, UText *utb) {
266 UChar32 ca = 0;
267 UChar32 cb = 0;
268 utext_setNativeIndex(uta, 0);
269 utext_setNativeIndex(utb, 0);
270 do {
271 ca = utext_next32(uta);
272 cb = utext_next32(utb);
273 if (ca != cb) {
274 break;
275 }
276 } while (ca != U_SENTINEL);
277 return ca == cb;
278 }
279
280
281 /**
282 * @param expected expected text in UTF-8 (not platform) codepage
283 */
assertUText(const char * expected,UText * actual,const char * file,int line)284 void RegexTest::assertUText(const char *expected, UText *actual, const char *file, int line) {
285 UErrorCode status = U_ZERO_ERROR;
286 UText expectedText = UTEXT_INITIALIZER;
287 utext_openUTF8(&expectedText, expected, -1, &status);
288 if(U_FAILURE(status)) {
289 errln("%s:%d: assertUText: error %s calling utext_openUTF8(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
290 return;
291 }
292 if(utext_nativeLength(&expectedText)==0 && (strlen(expected)!=0)) {
293 errln("%s:%d: assertUText: expected is %d utf-8 bytes, but utext_nativeLength(expectedText) returned 0.", file, line, strlen(expected));
294 return;
295 }
296 utext_setNativeIndex(actual, 0);
297 if (!testUTextEqual(&expectedText, actual)) {
298 char buf[201 /*21*/];
299 char expectedBuf[201];
300 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
301 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
302 errln("%s:%d: assertUText: Failure: expected \"%s\" (%d chars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
303 }
304 utext_close(&expectedText);
305 }
306 /**
307 * @param expected invariant (platform local text) input
308 */
309
assertUTextInvariant(const char * expected,UText * actual,const char * file,int line)310 void RegexTest::assertUTextInvariant(const char *expected, UText *actual, const char *file, int line) {
311 UErrorCode status = U_ZERO_ERROR;
312 UText expectedText = UTEXT_INITIALIZER;
313 regextst_openUTF8FromInvariant(&expectedText, expected, -1, &status);
314 if(U_FAILURE(status)) {
315 errln("%s:%d: assertUTextInvariant: error %s calling regextst_openUTF8FromInvariant(expected: %d chars)\n", file, line, u_errorName(status), strlen(expected));
316 return;
317 }
318 utext_setNativeIndex(actual, 0);
319 if (!testUTextEqual(&expectedText, actual)) {
320 char buf[201 /*21*/];
321 char expectedBuf[201];
322 utextToPrintable(buf, UPRV_LENGTHOF(buf), actual);
323 utextToPrintable(expectedBuf, UPRV_LENGTHOF(expectedBuf), &expectedText);
324 errln("%s:%d: assertUTextInvariant: Failure: expected \"%s\" (%d uchars), got \"%s\" (%d chars)", file, line, expectedBuf, (int)utext_nativeLength(&expectedText), buf, (int)utext_nativeLength(actual));
325 }
326 utext_close(&expectedText);
327 }
328
/**
 * Checks a UText against expected text via assertUText(), passing the
 * location of the macro invocation.
 * Assumes utf-8 input
 */
#define REGEX_ASSERT_UTEXT_UTF8(expected, actual) \
    assertUText((expected), (actual), __FILE__, __LINE__)

/**
 * Checks a UText against expected text via assertUTextInvariant(), passing
 * the location of the macro invocation.
 * Assumes Invariant input
 */
#define REGEX_ASSERT_UTEXT_INVARIANT(expected, actual) \
    assertUTextInvariant((expected), (actual), __FILE__, __LINE__)
337
338 /**
339 * This buffer ( inv_buf ) is used to hold the UTF-8 strings
340 * passed into utext_openUTF8. An error will be given if
341 * INV_BUFSIZ is too small. It's only used on EBCDIC systems.
342 */
343
344 #define INV_BUFSIZ 2048 /* increase this if too small */
345
346 static int64_t inv_next=0;
347
348 #if U_CHARSET_FAMILY!=U_ASCII_FAMILY
349 static char inv_buf[INV_BUFSIZ];
350 #endif
351
regextst_openUTF8FromInvariant(UText * ut,const char * inv,int64_t length,UErrorCode * status)352 static UText* regextst_openUTF8FromInvariant(UText *ut, const char *inv, int64_t length, UErrorCode *status) {
353 if(length==-1) length=strlen(inv);
354 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
355 inv_next+=length;
356 return utext_openUTF8(ut, inv, length, status);
357 #else
358 if(inv_next+length+1>INV_BUFSIZ) {
359 fprintf(stderr, "%s:%d Error: INV_BUFSIZ #defined to be %d but needs to be at least %d.\n",
360 __FILE__, __LINE__, INV_BUFSIZ, (inv_next+length+1));
361 *status = U_MEMORY_ALLOCATION_ERROR;
362 return NULL;
363 }
364
365 unsigned char *buf = (unsigned char*)inv_buf+inv_next;
366 uprv_aestrncpy(buf, (const uint8_t*)inv, length);
367 inv_next+=length;
368
369 #if 0
370 fprintf(stderr, " Note: INV_BUFSIZ at %d, used=%d\n", INV_BUFSIZ, inv_next);
371 #endif
372
373 return utext_openUTF8(ut, (const char*)buf, length, status);
374 #endif
375 }
376
377
378 //---------------------------------------------------------------------------
379 //
380 // REGEX_TESTLM Macro + invocation function to simplify writing quick tests
381 // for the LookingAt() and Match() functions.
382 //
383 // usage:
384 // REGEX_TESTLM("pattern", "input text", lookingAt expected, matches expected);
385 //
386 // The expected results are UBool - TRUE or FALSE.
387 // The input text is unescaped. The pattern is not.
388 //
389 //
390 //---------------------------------------------------------------------------
391
392 #define REGEX_TESTLM(pat, text, looking, match) {doRegexLMTest(pat, text, looking, match, __LINE__);doRegexLMTestUTF8(pat, text, looking, match, __LINE__);}
393
doRegexLMTest(const char * pat,const char * text,UBool looking,UBool match,int32_t line)394 UBool RegexTest::doRegexLMTest(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
395 const UnicodeString pattern(pat, -1, US_INV);
396 const UnicodeString inputText(text, -1, US_INV);
397 UErrorCode status = U_ZERO_ERROR;
398 UParseError pe;
399 RegexPattern *REPattern = NULL;
400 RegexMatcher *REMatcher = NULL;
401 UBool retVal = TRUE;
402
403 UnicodeString patString(pat, -1, US_INV);
404 REPattern = RegexPattern::compile(patString, 0, pe, status);
405 if (U_FAILURE(status)) {
406 dataerrln("RegexTest failure in RegexPattern::compile() at line %d. Status = %s",
407 line, u_errorName(status));
408 return FALSE;
409 }
410 if (line==376) { REPattern->dumpPattern();}
411
412 UnicodeString inputString(inputText);
413 UnicodeString unEscapedInput = inputString.unescape();
414 REMatcher = REPattern->matcher(unEscapedInput, status);
415 if (U_FAILURE(status)) {
416 errln("RegexTest failure in REPattern::matcher() at line %d. Status = %s\n",
417 line, u_errorName(status));
418 return FALSE;
419 }
420
421 UBool actualmatch;
422 actualmatch = REMatcher->lookingAt(status);
423 if (U_FAILURE(status)) {
424 errln("RegexTest failure in lookingAt() at line %d. Status = %s\n",
425 line, u_errorName(status));
426 retVal = FALSE;
427 }
428 if (actualmatch != looking) {
429 errln("RegexTest: wrong return from lookingAt() at line %d.\n", line);
430 retVal = FALSE;
431 }
432
433 status = U_ZERO_ERROR;
434 actualmatch = REMatcher->matches(status);
435 if (U_FAILURE(status)) {
436 errln("RegexTest failure in matches() at line %d. Status = %s\n",
437 line, u_errorName(status));
438 retVal = FALSE;
439 }
440 if (actualmatch != match) {
441 errln("RegexTest: wrong return from matches() at line %d.\n", line);
442 retVal = FALSE;
443 }
444
445 if (retVal == FALSE) {
446 REPattern->dumpPattern();
447 }
448
449 delete REPattern;
450 delete REMatcher;
451 return retVal;
452 }
453
454
doRegexLMTestUTF8(const char * pat,const char * text,UBool looking,UBool match,int32_t line)455 UBool RegexTest::doRegexLMTestUTF8(const char *pat, const char *text, UBool looking, UBool match, int32_t line) {
456 UText pattern = UTEXT_INITIALIZER;
457 int32_t inputUTF8Length;
458 char *textChars = NULL;
459 UText inputText = UTEXT_INITIALIZER;
460 UErrorCode status = U_ZERO_ERROR;
461 UParseError pe;
462 RegexPattern *REPattern = NULL;
463 RegexMatcher *REMatcher = NULL;
464 UBool retVal = TRUE;
465
466 regextst_openUTF8FromInvariant(&pattern, pat, -1, &status);
467 REPattern = RegexPattern::compile(&pattern, 0, pe, status);
468 if (U_FAILURE(status)) {
469 dataerrln("RegexTest failure in RegexPattern::compile() at line %d (UTF8). Status = %s\n",
470 line, u_errorName(status));
471 return FALSE;
472 }
473
474 UnicodeString inputString(text, -1, US_INV);
475 UnicodeString unEscapedInput = inputString.unescape();
476 LocalUConverterPointer UTF8Converter(ucnv_open("UTF8", &status));
477 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
478
479 inputUTF8Length = unEscapedInput.extract(NULL, 0, UTF8Converter.getAlias(), status);
480 if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
481 // UTF-8 does not allow unpaired surrogates, so this could actually happen
482 logln("RegexTest unable to convert input to UTF8 at line %d. Status = %s\n", line, u_errorName(status));
483 return TRUE; // not a failure of the Regex engine
484 }
485 status = U_ZERO_ERROR; // buffer overflow
486 textChars = new char[inputUTF8Length+1];
487 unEscapedInput.extract(textChars, inputUTF8Length+1, UTF8Converter.getAlias(), status);
488 utext_openUTF8(&inputText, textChars, inputUTF8Length, &status);
489
490 REMatcher = &REPattern->matcher(status)->reset(&inputText);
491 if (U_FAILURE(status)) {
492 errln("RegexTest failure in REPattern::matcher() at line %d (UTF8). Status = %s\n",
493 line, u_errorName(status));
494 return FALSE;
495 }
496
497 UBool actualmatch;
498 actualmatch = REMatcher->lookingAt(status);
499 if (U_FAILURE(status)) {
500 errln("RegexTest failure in lookingAt() at line %d (UTF8). Status = %s\n",
501 line, u_errorName(status));
502 retVal = FALSE;
503 }
504 if (actualmatch != looking) {
505 errln("RegexTest: wrong return from lookingAt() at line %d (UTF8).\n", line);
506 retVal = FALSE;
507 }
508
509 status = U_ZERO_ERROR;
510 actualmatch = REMatcher->matches(status);
511 if (U_FAILURE(status)) {
512 errln("RegexTest failure in matches() at line %d (UTF8). Status = %s\n",
513 line, u_errorName(status));
514 retVal = FALSE;
515 }
516 if (actualmatch != match) {
517 errln("RegexTest: wrong return from matches() at line %d (UTF8).\n", line);
518 retVal = FALSE;
519 }
520
521 if (retVal == FALSE) {
522 REPattern->dumpPattern();
523 }
524
525 delete REPattern;
526 delete REMatcher;
527 utext_close(&inputText);
528 utext_close(&pattern);
529 delete[] textChars;
530 return retVal;
531 }
532
533
534
535 //---------------------------------------------------------------------------
536 //
//  REGEX_ERR   Macro + invocation function to simplify writing
//              regex tests for incorrect patterns
539 //
540 // usage:
541 // REGEX_ERR("pattern", expected error line, column, expected status);
542 //
543 //---------------------------------------------------------------------------
544 #define REGEX_ERR(pat, line, col, status) regex_err(pat, line, col, status, __LINE__);
545
regex_err(const char * pat,int32_t errLine,int32_t errCol,UErrorCode expectedStatus,int32_t line)546 void RegexTest::regex_err(const char *pat, int32_t errLine, int32_t errCol,
547 UErrorCode expectedStatus, int32_t line) {
548 UnicodeString pattern(pat);
549
550 UErrorCode status = U_ZERO_ERROR;
551 UParseError pe;
552 RegexPattern *callerPattern = NULL;
553
554 //
555 // Compile the caller's pattern
556 //
557 UnicodeString patString(pat);
558 callerPattern = RegexPattern::compile(patString, 0, pe, status);
559 if (status != expectedStatus) {
560 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
561 } else {
562 if (status != U_ZERO_ERROR) {
563 if (pe.line != errLine || pe.offset != errCol) {
564 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
565 line, errLine, errCol, pe.line, pe.offset);
566 }
567 }
568 }
569
570 delete callerPattern;
571
572 //
573 // Compile again, using a UTF-8-based UText
574 //
575 UText patternText = UTEXT_INITIALIZER;
576 regextst_openUTF8FromInvariant(&patternText, pat, -1, &status);
577 callerPattern = RegexPattern::compile(&patternText, 0, pe, status);
578 if (status != expectedStatus) {
579 dataerrln("Line %d: unexpected error %s compiling pattern.", line, u_errorName(status));
580 } else {
581 if (status != U_ZERO_ERROR) {
582 if (pe.line != errLine || pe.offset != errCol) {
583 errln("Line %d: incorrect line/offset from UParseError. Expected %d/%d; got %d/%d.\n",
584 line, errLine, errCol, pe.line, pe.offset);
585 }
586 }
587 }
588
589 delete callerPattern;
590 utext_close(&patternText);
591 }
592
593
594
595 //---------------------------------------------------------------------------
596 //
597 // Basic Check for basic functionality of regex pattern matching.
598 // Avoid the use of REGEX_FIND test macro, which has
599 // substantial dependencies on basic Regex functionality.
600 //
601 //---------------------------------------------------------------------------
Basic()602 void RegexTest::Basic() {
603
604
605 //
606 // Debug - slide failing test cases early
607 //
608 #if 0
609 {
610 // REGEX_TESTLM("a\N{LATIN SMALL LETTER B}c", "abc", FALSE, FALSE);
611 UParseError pe;
612 UErrorCode status = U_ZERO_ERROR;
613 RegexPattern *pattern;
614 pattern = RegexPattern::compile(UNICODE_STRING_SIMPLE("a\\u00dfx").unescape(), UREGEX_CASE_INSENSITIVE, pe, status);
615 pattern->dumpPattern();
616 RegexMatcher *m = pattern->matcher(UNICODE_STRING_SIMPLE("a\\u00dfxzzz").unescape(), status);
617 UBool result = m->find();
618 printf("result = %d\n", result);
619 // REGEX_FIND("", "<0>ab<1>cc</1><2>ccc</2></0>ddd");
620 // REGEX_FIND("(X([abc=X]+)+X)|(y[abc=]+)", "=XX====================");
621 }
622 exit(1);
623 #endif
624
625
626 //
627 // Pattern with parentheses
628 //
629 REGEX_TESTLM("st(abc)ring", "stabcring thing", TRUE, FALSE);
630 REGEX_TESTLM("st(abc)ring", "stabcring", TRUE, TRUE);
631 REGEX_TESTLM("st(abc)ring", "stabcrung", FALSE, FALSE);
632
633 //
634 // Patterns with *
635 //
636 REGEX_TESTLM("st(abc)*ring", "string", TRUE, TRUE);
637 REGEX_TESTLM("st(abc)*ring", "stabcring", TRUE, TRUE);
638 REGEX_TESTLM("st(abc)*ring", "stabcabcring", TRUE, TRUE);
639 REGEX_TESTLM("st(abc)*ring", "stabcabcdring", FALSE, FALSE);
640 REGEX_TESTLM("st(abc)*ring", "stabcabcabcring etc.", TRUE, FALSE);
641
642 REGEX_TESTLM("a*", "", TRUE, TRUE);
643 REGEX_TESTLM("a*", "b", TRUE, FALSE);
644
645
646 //
647 // Patterns with "."
648 //
649 REGEX_TESTLM(".", "abc", TRUE, FALSE);
650 REGEX_TESTLM("...", "abc", TRUE, TRUE);
651 REGEX_TESTLM("....", "abc", FALSE, FALSE);
652 REGEX_TESTLM(".*", "abcxyz123", TRUE, TRUE);
653 REGEX_TESTLM("ab.*xyz", "abcdefghij", FALSE, FALSE);
654 REGEX_TESTLM("ab.*xyz", "abcdefg...wxyz", TRUE, TRUE);
655 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz", TRUE, TRUE);
656 REGEX_TESTLM("ab.*xyz", "abcde...wxyz...abc..xyz...", TRUE, FALSE);
657
658 //
659 // Patterns with * applied to chars at end of literal string
660 //
661 REGEX_TESTLM("abc*", "ab", TRUE, TRUE);
662 REGEX_TESTLM("abc*", "abccccc", TRUE, TRUE);
663
664 //
665 // Supplemental chars match as single chars, not a pair of surrogates.
666 //
667 REGEX_TESTLM(".", "\\U00011000", TRUE, TRUE);
668 REGEX_TESTLM("...", "\\U00011000x\\U00012002", TRUE, TRUE);
669 REGEX_TESTLM("...", "\\U00011000x\\U00012002y", TRUE, FALSE);
670
671
672 //
673 // UnicodeSets in the pattern
674 //
675 REGEX_TESTLM("[1-6]", "1", TRUE, TRUE);
676 REGEX_TESTLM("[1-6]", "3", TRUE, TRUE);
677 REGEX_TESTLM("[1-6]", "7", FALSE, FALSE);
678 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
679 REGEX_TESTLM("a[1-6]", "a3", TRUE, TRUE);
680 REGEX_TESTLM("a[1-6]b", "a3b", TRUE, TRUE);
681
682 REGEX_TESTLM("a[0-9]*b", "a123b", TRUE, TRUE);
683 REGEX_TESTLM("a[0-9]*b", "abc", TRUE, FALSE);
684 REGEX_TESTLM("[\\p{Nd}]*", "123456", TRUE, TRUE);
685 REGEX_TESTLM("[\\p{Nd}]*", "a123456", TRUE, FALSE); // note that * matches 0 occurences.
686 REGEX_TESTLM("[a][b][[:Zs:]]*", "ab ", TRUE, TRUE);
687
688 //
689 // OR operator in patterns
690 //
691 REGEX_TESTLM("(a|b)", "a", TRUE, TRUE);
692 REGEX_TESTLM("(a|b)", "b", TRUE, TRUE);
693 REGEX_TESTLM("(a|b)", "c", FALSE, FALSE);
694 REGEX_TESTLM("a|b", "b", TRUE, TRUE);
695
696 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabc", TRUE, TRUE);
697 REGEX_TESTLM("(a|b|c)*", "aabcaaccbcabdc", TRUE, FALSE);
698 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "ac", TRUE, TRUE);
699 REGEX_TESTLM("(a(b|c|d)(x|y|z)*|123)", "123", TRUE, TRUE);
700 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "123", TRUE, TRUE);
701 REGEX_TESTLM("(a|(1|2)*)(b|c|d)(x|y|z)*|123", "222211111czzzzw", TRUE, FALSE);
702
703 //
704 // +
705 //
706 REGEX_TESTLM("ab+", "abbc", TRUE, FALSE);
707 REGEX_TESTLM("ab+c", "ac", FALSE, FALSE);
708 REGEX_TESTLM("b+", "", FALSE, FALSE);
709 REGEX_TESTLM("(abc|def)+", "defabc", TRUE, TRUE);
710 REGEX_TESTLM(".+y", "zippity dooy dah ", TRUE, FALSE);
711 REGEX_TESTLM(".+y", "zippity dooy", TRUE, TRUE);
712
713 //
714 // ?
715 //
716 REGEX_TESTLM("ab?", "ab", TRUE, TRUE);
717 REGEX_TESTLM("ab?", "a", TRUE, TRUE);
718 REGEX_TESTLM("ab?", "ac", TRUE, FALSE);
719 REGEX_TESTLM("ab?", "abb", TRUE, FALSE);
720 REGEX_TESTLM("a(b|c)?d", "abd", TRUE, TRUE);
721 REGEX_TESTLM("a(b|c)?d", "acd", TRUE, TRUE);
722 REGEX_TESTLM("a(b|c)?d", "ad", TRUE, TRUE);
723 REGEX_TESTLM("a(b|c)?d", "abcd", FALSE, FALSE);
724 REGEX_TESTLM("a(b|c)?d", "ab", FALSE, FALSE);
725
726 //
727 // Escape sequences that become single literal chars, handled internally
728 // by ICU's Unescape.
729 //
730
731 // REGEX_TESTLM("\101\142", "Ab", TRUE, TRUE); // Octal TODO: not implemented yet.
732 REGEX_TESTLM("\\a", "\\u0007", TRUE, TRUE); // BEL
733 REGEX_TESTLM("\\cL", "\\u000c", TRUE, TRUE); // Control-L
734 REGEX_TESTLM("\\e", "\\u001b", TRUE, TRUE); // Escape
735 REGEX_TESTLM("\\f", "\\u000c", TRUE, TRUE); // Form Feed
736 REGEX_TESTLM("\\n", "\\u000a", TRUE, TRUE); // new line
737 REGEX_TESTLM("\\r", "\\u000d", TRUE, TRUE); // CR
738 REGEX_TESTLM("\\t", "\\u0009", TRUE, TRUE); // Tab
739 REGEX_TESTLM("\\u1234", "\\u1234", TRUE, TRUE);
740 REGEX_TESTLM("\\U00001234", "\\u1234", TRUE, TRUE);
741
742 REGEX_TESTLM(".*\\Ax", "xyz", TRUE, FALSE); // \A matches only at the beginning of input
743 REGEX_TESTLM(".*\\Ax", " xyz", FALSE, FALSE); // \A matches only at the beginning of input
744
745 // Escape of special chars in patterns
746 REGEX_TESTLM("\\\\\\|\\(\\)\\[\\{\\~\\$\\*\\+\\?\\.", "\\\\|()[{~$*+?.", TRUE, TRUE);
747 }
748
749
750 //---------------------------------------------------------------------------
751 //
752 // UTextBasic Check for quirks that are specific to the UText
753 // implementation.
754 //
755 //---------------------------------------------------------------------------
UTextBasic()756 void RegexTest::UTextBasic() {
757 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
758 UErrorCode status = U_ZERO_ERROR;
759 UText pattern = UTEXT_INITIALIZER;
760 utext_openUTF8(&pattern, str_abc, -1, &status);
761 RegexMatcher matcher(&pattern, 0, status);
762 REGEX_CHECK_STATUS;
763
764 UText input = UTEXT_INITIALIZER;
765 utext_openUTF8(&input, str_abc, -1, &status);
766 REGEX_CHECK_STATUS;
767 matcher.reset(&input);
768 REGEX_CHECK_STATUS;
769 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
770
771 matcher.reset(matcher.inputText());
772 REGEX_CHECK_STATUS;
773 REGEX_ASSERT_UTEXT_UTF8(str_abc, matcher.inputText());
774
775 utext_close(&pattern);
776 utext_close(&input);
777 }
778
779
780 //---------------------------------------------------------------------------
781 //
782 // API_Match Test that the API for class RegexMatcher
783 // is present and nominally working, but excluding functions
784 // implementing replace operations.
785 //
786 //---------------------------------------------------------------------------
API_Match()787 void RegexTest::API_Match() {
788 UParseError pe;
789 UErrorCode status=U_ZERO_ERROR;
790 int32_t flags = 0;
791
792 //
793 // Debug - slide failing test cases early
794 //
795 #if 0
796 {
797 }
798 return;
799 #endif
800
801 //
802 // Simple pattern compilation
803 //
804 {
805 UnicodeString re("abc");
806 RegexPattern *pat2;
807 pat2 = RegexPattern::compile(re, flags, pe, status);
808 REGEX_CHECK_STATUS;
809
810 UnicodeString inStr1 = "abcdef this is a test";
811 UnicodeString instr2 = "not abc";
812 UnicodeString empty = "";
813
814
815 //
816 // Matcher creation and reset.
817 //
818 RegexMatcher *m1 = pat2->matcher(inStr1, status);
819 REGEX_CHECK_STATUS;
820 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
821 REGEX_ASSERT(m1->input() == inStr1);
822 m1->reset(instr2);
823 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
824 REGEX_ASSERT(m1->input() == instr2);
825 m1->reset(inStr1);
826 REGEX_ASSERT(m1->input() == inStr1);
827 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
828 m1->reset(empty);
829 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
830 REGEX_ASSERT(m1->input() == empty);
831 REGEX_ASSERT(&m1->pattern() == pat2);
832
833 //
834 // reset(pos, status)
835 //
836 m1->reset(inStr1);
837 m1->reset(4, status);
838 REGEX_CHECK_STATUS;
839 REGEX_ASSERT(m1->input() == inStr1);
840 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
841
842 m1->reset(-1, status);
843 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
844 status = U_ZERO_ERROR;
845
846 m1->reset(0, status);
847 REGEX_CHECK_STATUS;
848 status = U_ZERO_ERROR;
849
850 int32_t len = m1->input().length();
851 m1->reset(len-1, status);
852 REGEX_CHECK_STATUS;
853 status = U_ZERO_ERROR;
854
855 m1->reset(len, status);
856 REGEX_CHECK_STATUS;
857 status = U_ZERO_ERROR;
858
859 m1->reset(len+1, status);
860 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
861 status = U_ZERO_ERROR;
862
863 //
864 // match(pos, status)
865 //
866 m1->reset(instr2);
867 REGEX_ASSERT(m1->matches(4, status) == TRUE);
868 m1->reset();
869 REGEX_ASSERT(m1->matches(3, status) == FALSE);
870 m1->reset();
871 REGEX_ASSERT(m1->matches(5, status) == FALSE);
872 REGEX_ASSERT(m1->matches(4, status) == TRUE);
873 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
874 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
875
876 // Match() at end of string should fail, but should not
877 // be an error.
878 status = U_ZERO_ERROR;
879 len = m1->input().length();
880 REGEX_ASSERT(m1->matches(len, status) == FALSE);
881 REGEX_CHECK_STATUS;
882
883 // Match beyond end of string should fail with an error.
884 status = U_ZERO_ERROR;
885 REGEX_ASSERT(m1->matches(len+1, status) == FALSE);
886 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
887
888 // Successful match at end of string.
889 {
890 status = U_ZERO_ERROR;
891 RegexMatcher m("A?", 0, status); // will match zero length string.
892 REGEX_CHECK_STATUS;
893 m.reset(inStr1);
894 len = inStr1.length();
895 REGEX_ASSERT(m.matches(len, status) == TRUE);
896 REGEX_CHECK_STATUS;
897 m.reset(empty);
898 REGEX_ASSERT(m.matches(0, status) == TRUE);
899 REGEX_CHECK_STATUS;
900 }
901
902
903 //
904 // lookingAt(pos, status)
905 //
906 status = U_ZERO_ERROR;
907 m1->reset(instr2); // "not abc"
908 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
909 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
910 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
911 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
912 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
913 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
914 status = U_ZERO_ERROR;
915 len = m1->input().length();
916 REGEX_ASSERT(m1->lookingAt(len, status) == FALSE);
917 REGEX_CHECK_STATUS;
918 REGEX_ASSERT(m1->lookingAt(len+1, status) == FALSE);
919 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
920
921 delete m1;
922 delete pat2;
923 }
924
925
926 //
927 // Capture Group.
928 // RegexMatcher::start();
929 // RegexMatcher::end();
930 // RegexMatcher::groupCount();
931 //
932 {
933 int32_t flags=0;
934 UParseError pe;
935 UErrorCode status=U_ZERO_ERROR;
936
937 UnicodeString re("01(23(45)67)(.*)");
938 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
939 REGEX_CHECK_STATUS;
940 UnicodeString data = "0123456789";
941
942 RegexMatcher *matcher = pat->matcher(data, status);
943 REGEX_CHECK_STATUS;
944 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
945 static const int32_t matchStarts[] = {0, 2, 4, 8};
946 static const int32_t matchEnds[] = {10, 8, 6, 10};
947 int32_t i;
948 for (i=0; i<4; i++) {
949 int32_t actualStart = matcher->start(i, status);
950 REGEX_CHECK_STATUS;
951 if (actualStart != matchStarts[i]) {
952 errln("RegexTest failure at line %d, index %d. Expected %d, got %d\n",
953 __LINE__, i, matchStarts[i], actualStart);
954 }
955 int32_t actualEnd = matcher->end(i, status);
956 REGEX_CHECK_STATUS;
957 if (actualEnd != matchEnds[i]) {
958 errln("RegexTest failure at line %d index %d. Expected %d, got %d\n",
959 __LINE__, i, matchEnds[i], actualEnd);
960 }
961 }
962
963 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
964 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
965
966 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
967 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
968 matcher->reset();
969 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
970
971 matcher->lookingAt(status);
972 REGEX_ASSERT(matcher->group(status) == "0123456789");
973 REGEX_ASSERT(matcher->group(0, status) == "0123456789");
974 REGEX_ASSERT(matcher->group(1, status) == "234567" );
975 REGEX_ASSERT(matcher->group(2, status) == "45" );
976 REGEX_ASSERT(matcher->group(3, status) == "89" );
977 REGEX_CHECK_STATUS;
978 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
979 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
980 matcher->reset();
981 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
982
983 delete matcher;
984 delete pat;
985
986 }
987
988 //
989 // find
990 //
991 {
992 int32_t flags=0;
993 UParseError pe;
994 UErrorCode status=U_ZERO_ERROR;
995
996 UnicodeString re("abc");
997 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
998 REGEX_CHECK_STATUS;
999 UnicodeString data = ".abc..abc...abc..";
1000 // 012345678901234567
1001
1002 RegexMatcher *matcher = pat->matcher(data, status);
1003 REGEX_CHECK_STATUS;
1004 REGEX_ASSERT(matcher->find());
1005 REGEX_ASSERT(matcher->start(status) == 1);
1006 REGEX_ASSERT(matcher->find());
1007 REGEX_ASSERT(matcher->start(status) == 6);
1008 REGEX_ASSERT(matcher->find());
1009 REGEX_ASSERT(matcher->start(status) == 12);
1010 REGEX_ASSERT(matcher->find() == FALSE);
1011 REGEX_ASSERT(matcher->find() == FALSE);
1012
1013 matcher->reset();
1014 REGEX_ASSERT(matcher->find());
1015 REGEX_ASSERT(matcher->start(status) == 1);
1016
1017 REGEX_ASSERT(matcher->find(0, status));
1018 REGEX_ASSERT(matcher->start(status) == 1);
1019 REGEX_ASSERT(matcher->find(1, status));
1020 REGEX_ASSERT(matcher->start(status) == 1);
1021 REGEX_ASSERT(matcher->find(2, status));
1022 REGEX_ASSERT(matcher->start(status) == 6);
1023 REGEX_ASSERT(matcher->find(12, status));
1024 REGEX_ASSERT(matcher->start(status) == 12);
1025 REGEX_ASSERT(matcher->find(13, status) == FALSE);
1026 REGEX_ASSERT(matcher->find(16, status) == FALSE);
1027 REGEX_ASSERT(matcher->find(17, status) == FALSE);
1028 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
1029
1030 status = U_ZERO_ERROR;
1031 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
1032 status = U_ZERO_ERROR;
1033 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
1034
1035 REGEX_ASSERT(matcher->groupCount() == 0);
1036
1037 delete matcher;
1038 delete pat;
1039 }
1040
1041
1042 //
1043 // find, with \G in pattern (true if at the end of a previous match).
1044 //
1045 {
1046 int32_t flags=0;
1047 UParseError pe;
1048 UErrorCode status=U_ZERO_ERROR;
1049
1050 UnicodeString re(".*?(?:(\\Gabc)|(abc))", -1, US_INV);
1051 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1052 REGEX_CHECK_STATUS;
1053 UnicodeString data = ".abcabc.abc..";
1054 // 012345678901234567
1055
1056 RegexMatcher *matcher = pat->matcher(data, status);
1057 REGEX_CHECK_STATUS;
1058 REGEX_ASSERT(matcher->find());
1059 REGEX_ASSERT(matcher->start(status) == 0);
1060 REGEX_ASSERT(matcher->start(1, status) == -1);
1061 REGEX_ASSERT(matcher->start(2, status) == 1);
1062
1063 REGEX_ASSERT(matcher->find());
1064 REGEX_ASSERT(matcher->start(status) == 4);
1065 REGEX_ASSERT(matcher->start(1, status) == 4);
1066 REGEX_ASSERT(matcher->start(2, status) == -1);
1067 REGEX_CHECK_STATUS;
1068
1069 delete matcher;
1070 delete pat;
1071 }
1072
1073 //
1074 // find with zero length matches, match position should bump ahead
1075 // to prevent loops.
1076 //
1077 {
1078 int32_t i;
1079 UErrorCode status=U_ZERO_ERROR;
1080 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
1081 // using an always-true look-ahead.
1082 REGEX_CHECK_STATUS;
1083 UnicodeString s(" ");
1084 m.reset(s);
1085 for (i=0; ; i++) {
1086 if (m.find() == FALSE) {
1087 break;
1088 }
1089 REGEX_ASSERT(m.start(status) == i);
1090 REGEX_ASSERT(m.end(status) == i);
1091 }
1092 REGEX_ASSERT(i==5);
1093
1094 // Check that the bump goes over surrogate pairs OK
1095 s = UNICODE_STRING_SIMPLE("\\U00010001\\U00010002\\U00010003\\U00010004");
1096 s = s.unescape();
1097 m.reset(s);
1098 for (i=0; ; i+=2) {
1099 if (m.find() == FALSE) {
1100 break;
1101 }
1102 REGEX_ASSERT(m.start(status) == i);
1103 REGEX_ASSERT(m.end(status) == i);
1104 }
1105 REGEX_ASSERT(i==10);
1106 }
1107 {
1108 // find() loop breaking test.
1109 // with pattern of /.?/, should see a series of one char matches, then a single
1110 // match of zero length at the end of the input string.
1111 int32_t i;
1112 UErrorCode status=U_ZERO_ERROR;
1113 RegexMatcher m(".?", 0, status);
1114 REGEX_CHECK_STATUS;
1115 UnicodeString s(" ");
1116 m.reset(s);
1117 for (i=0; ; i++) {
1118 if (m.find() == FALSE) {
1119 break;
1120 }
1121 REGEX_ASSERT(m.start(status) == i);
1122 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
1123 }
1124 REGEX_ASSERT(i==5);
1125 }
1126
1127
1128 //
1129 // Matchers with no input string behave as if they had an empty input string.
1130 //
1131
1132 {
1133 UErrorCode status = U_ZERO_ERROR;
1134 RegexMatcher m(".?", 0, status);
1135 REGEX_CHECK_STATUS;
1136 REGEX_ASSERT(m.find());
1137 REGEX_ASSERT(m.start(status) == 0);
1138 REGEX_ASSERT(m.input() == "");
1139 }
1140 {
1141 UErrorCode status = U_ZERO_ERROR;
1142 RegexPattern *p = RegexPattern::compile(".", 0, status);
1143 RegexMatcher *m = p->matcher(status);
1144 REGEX_CHECK_STATUS;
1145
1146 REGEX_ASSERT(m->find() == FALSE);
1147 REGEX_ASSERT(m->input() == "");
1148 delete m;
1149 delete p;
1150 }
1151
1152 //
1153 // Regions
1154 //
1155 {
1156 UErrorCode status = U_ZERO_ERROR;
1157 UnicodeString testString("This is test data");
1158 RegexMatcher m(".*", testString, 0, status);
1159 REGEX_CHECK_STATUS;
1160 REGEX_ASSERT(m.regionStart() == 0);
1161 REGEX_ASSERT(m.regionEnd() == testString.length());
1162 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1163 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1164
1165 m.region(2,4, status);
1166 REGEX_CHECK_STATUS;
1167 REGEX_ASSERT(m.matches(status));
1168 REGEX_ASSERT(m.start(status)==2);
1169 REGEX_ASSERT(m.end(status)==4);
1170 REGEX_CHECK_STATUS;
1171
1172 m.reset();
1173 REGEX_ASSERT(m.regionStart() == 0);
1174 REGEX_ASSERT(m.regionEnd() == testString.length());
1175
1176 UnicodeString shorterString("short");
1177 m.reset(shorterString);
1178 REGEX_ASSERT(m.regionStart() == 0);
1179 REGEX_ASSERT(m.regionEnd() == shorterString.length());
1180
1181 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1182 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
1183 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1184 REGEX_ASSERT(&m == &m.reset());
1185 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
1186
1187 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
1188 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1189 REGEX_ASSERT(&m == &m.reset());
1190 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
1191
1192 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1193 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
1194 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1195 REGEX_ASSERT(&m == &m.reset());
1196 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
1197
1198 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
1199 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1200 REGEX_ASSERT(&m == &m.reset());
1201 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
1202
1203 }
1204
1205 //
1206 // hitEnd() and requireEnd()
1207 //
1208 {
1209 UErrorCode status = U_ZERO_ERROR;
1210 UnicodeString testString("aabb");
1211 RegexMatcher m1(".*", testString, 0, status);
1212 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
1213 REGEX_ASSERT(m1.hitEnd() == TRUE);
1214 REGEX_ASSERT(m1.requireEnd() == FALSE);
1215 REGEX_CHECK_STATUS;
1216
1217 status = U_ZERO_ERROR;
1218 RegexMatcher m2("a*", testString, 0, status);
1219 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
1220 REGEX_ASSERT(m2.hitEnd() == FALSE);
1221 REGEX_ASSERT(m2.requireEnd() == FALSE);
1222 REGEX_CHECK_STATUS;
1223
1224 status = U_ZERO_ERROR;
1225 RegexMatcher m3(".*$", testString, 0, status);
1226 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
1227 REGEX_ASSERT(m3.hitEnd() == TRUE);
1228 REGEX_ASSERT(m3.requireEnd() == TRUE);
1229 REGEX_CHECK_STATUS;
1230 }
1231
1232
1233 //
1234 // Compilation error on reset with UChar *
1235 // These were a hazard that people were stumbling over with runtime errors.
1236 // Changed them to compiler errors by adding private methods that more closely
1237 // matched the incorrect use of the functions.
1238 //
1239 #if 0
1240 {
1241 UErrorCode status = U_ZERO_ERROR;
1242 UChar ucharString[20];
1243 RegexMatcher m(".", 0, status);
1244 m.reset(ucharString); // should not compile.
1245
1246 RegexPattern *p = RegexPattern::compile(".", 0, status);
1247 RegexMatcher *m2 = p->matcher(ucharString, status); // should not compile.
1248
1249 RegexMatcher m3(".", ucharString, 0, status); // Should not compile
1250 }
1251 #endif
1252
1253 //
1254 // Time Outs.
1255 // Note: These tests will need to be changed when the regexp engine is
1256 // able to detect and cut short the exponential time behavior on
1257 // this type of match.
1258 //
1259 {
1260 UErrorCode status = U_ZERO_ERROR;
1261 // Enough 'a's in the string to cause the match to time out.
        //       (Each additional 'a' doubles the time)
1263 UnicodeString testString("aaaaaaaaaaaaaaaaaaaaa");
1264 RegexMatcher matcher("(a+)+b", testString, 0, status);
1265 REGEX_CHECK_STATUS;
1266 REGEX_ASSERT(matcher.getTimeLimit() == 0);
1267 matcher.setTimeLimit(100, status);
1268 REGEX_ASSERT(matcher.getTimeLimit() == 100);
1269 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1270 REGEX_ASSERT(status == U_REGEX_TIME_OUT);
1271 }
1272 {
1273 UErrorCode status = U_ZERO_ERROR;
1274 // Few enough 'a's to slip in under the time limit.
1275 UnicodeString testString("aaaaaaaaaaaaaaaaaa");
1276 RegexMatcher matcher("(a+)+b", testString, 0, status);
1277 REGEX_CHECK_STATUS;
1278 matcher.setTimeLimit(100, status);
1279 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1280 REGEX_CHECK_STATUS;
1281 }
1282
1283 //
1284 // Stack Limits
1285 //
1286 {
1287 UErrorCode status = U_ZERO_ERROR;
1288 UnicodeString testString(1000000, 0x41, 1000000); // Length 1,000,000, filled with 'A'
1289
1290 // Adding the capturing parentheses to the pattern "(A)+A$" inhibits optimizations
1291 // of the '+', and makes the stack frames larger.
1292 RegexMatcher matcher("(A)+A$", testString, 0, status);
1293
1294 // With the default stack, this match should fail to run
1295 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1296 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1297
1298 // With unlimited stack, it should run
1299 status = U_ZERO_ERROR;
1300 matcher.setStackLimit(0, status);
1301 REGEX_CHECK_STATUS;
1302 REGEX_ASSERT(matcher.lookingAt(status) == TRUE);
1303 REGEX_CHECK_STATUS;
1304 REGEX_ASSERT(matcher.getStackLimit() == 0);
1305
        //  With a limited stack, the match should fail
1307 status = U_ZERO_ERROR;
1308 matcher.setStackLimit(10000, status);
1309 REGEX_ASSERT(matcher.lookingAt(status) == FALSE);
1310 REGEX_ASSERT(status == U_REGEX_STACK_OVERFLOW);
1311 REGEX_ASSERT(matcher.getStackLimit() == 10000);
1312 }
1313
1314 // A pattern that doesn't save state should work with
1315 // a minimal sized stack
1316 {
1317 UErrorCode status = U_ZERO_ERROR;
1318 UnicodeString testString = "abc";
1319 RegexMatcher matcher("abc", testString, 0, status);
1320 REGEX_CHECK_STATUS;
1321 matcher.setStackLimit(30, status);
1322 REGEX_CHECK_STATUS;
1323 REGEX_ASSERT(matcher.matches(status) == TRUE);
1324 REGEX_CHECK_STATUS;
1325 REGEX_ASSERT(matcher.getStackLimit() == 30);
1326
1327 // Negative stack sizes should fail
1328 status = U_ZERO_ERROR;
1329 matcher.setStackLimit(1000, status);
1330 REGEX_CHECK_STATUS;
1331 matcher.setStackLimit(-1, status);
1332 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
1333 REGEX_ASSERT(matcher.getStackLimit() == 1000);
1334 }
1335
1336
1337 }
1338
1339
1340
1341
1342
1343
1344 //---------------------------------------------------------------------------
1345 //
1346 // API_Replace API test for class RegexMatcher, testing the
1347 // Replace family of functions.
1348 //
1349 //---------------------------------------------------------------------------
API_Replace()1350 void RegexTest::API_Replace() {
1351 //
1352 // Replace
1353 //
1354 int32_t flags=0;
1355 UParseError pe;
1356 UErrorCode status=U_ZERO_ERROR;
1357
1358 UnicodeString re("abc");
1359 RegexPattern *pat = RegexPattern::compile(re, flags, pe, status);
1360 REGEX_CHECK_STATUS;
1361 UnicodeString data = ".abc..abc...abc..";
1362 // 012345678901234567
1363 RegexMatcher *matcher = pat->matcher(data, status);
1364
1365 //
1366 // Plain vanilla matches.
1367 //
1368 UnicodeString dest;
1369 dest = matcher->replaceFirst("yz", status);
1370 REGEX_CHECK_STATUS;
1371 REGEX_ASSERT(dest == ".yz..abc...abc..");
1372
1373 dest = matcher->replaceAll("yz", status);
1374 REGEX_CHECK_STATUS;
1375 REGEX_ASSERT(dest == ".yz..yz...yz..");
1376
1377 //
1378 // Plain vanilla non-matches.
1379 //
1380 UnicodeString d2 = ".abx..abx...abx..";
1381 matcher->reset(d2);
1382 dest = matcher->replaceFirst("yz", status);
1383 REGEX_CHECK_STATUS;
1384 REGEX_ASSERT(dest == ".abx..abx...abx..");
1385
1386 dest = matcher->replaceAll("yz", status);
1387 REGEX_CHECK_STATUS;
1388 REGEX_ASSERT(dest == ".abx..abx...abx..");
1389
1390 //
1391 // Empty source string
1392 //
1393 UnicodeString d3 = "";
1394 matcher->reset(d3);
1395 dest = matcher->replaceFirst("yz", status);
1396 REGEX_CHECK_STATUS;
1397 REGEX_ASSERT(dest == "");
1398
1399 dest = matcher->replaceAll("yz", status);
1400 REGEX_CHECK_STATUS;
1401 REGEX_ASSERT(dest == "");
1402
1403 //
1404 // Empty substitution string
1405 //
1406 matcher->reset(data); // ".abc..abc...abc.."
1407 dest = matcher->replaceFirst("", status);
1408 REGEX_CHECK_STATUS;
1409 REGEX_ASSERT(dest == "...abc...abc..");
1410
1411 dest = matcher->replaceAll("", status);
1412 REGEX_CHECK_STATUS;
1413 REGEX_ASSERT(dest == "........");
1414
1415 //
1416 // match whole string
1417 //
1418 UnicodeString d4 = "abc";
1419 matcher->reset(d4);
1420 dest = matcher->replaceFirst("xyz", status);
1421 REGEX_CHECK_STATUS;
1422 REGEX_ASSERT(dest == "xyz");
1423
1424 dest = matcher->replaceAll("xyz", status);
1425 REGEX_CHECK_STATUS;
1426 REGEX_ASSERT(dest == "xyz");
1427
1428 //
1429 // Capture Group, simple case
1430 //
1431 UnicodeString re2("a(..)");
1432 RegexPattern *pat2 = RegexPattern::compile(re2, flags, pe, status);
1433 REGEX_CHECK_STATUS;
1434 UnicodeString d5 = "abcdefg";
1435 RegexMatcher *matcher2 = pat2->matcher(d5, status);
1436 REGEX_CHECK_STATUS;
1437 dest = matcher2->replaceFirst("$1$1", status);
1438 REGEX_CHECK_STATUS;
1439 REGEX_ASSERT(dest == "bcbcdefg");
1440
1441 dest = matcher2->replaceFirst(UNICODE_STRING_SIMPLE("The value of \\$1 is $1."), status);
1442 REGEX_CHECK_STATUS;
1443 REGEX_ASSERT(dest == "The value of $1 is bc.defg");
1444
1445 dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
1446 REGEX_ASSERT(U_FAILURE(status));
1447 status = U_ZERO_ERROR;
1448
1449 UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
1450 replacement = replacement.unescape();
1451 dest = matcher2->replaceFirst(replacement, status);
1452 REGEX_CHECK_STATUS;
1453 REGEX_ASSERT(dest == "Supplemental Digit 1 bc.defg");
1454
1455 REGEX_ASSERT_FAIL(matcher2->replaceFirst("bad capture group number $5...",status), U_INDEX_OUTOFBOUNDS_ERROR);
1456
1457
1458 //
1459 // Replacement String with \u hex escapes
1460 //
1461 {
1462 UnicodeString src = "abc 1 abc 2 abc 3";
1463 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\u0043--");
1464 matcher->reset(src);
1465 UnicodeString result = matcher->replaceAll(substitute, status);
1466 REGEX_CHECK_STATUS;
1467 REGEX_ASSERT(result == "--C-- 1 --C-- 2 --C-- 3");
1468 }
1469 {
1470 UnicodeString src = "abc !";
1471 UnicodeString substitute = UNICODE_STRING_SIMPLE("--\\U00010000--");
1472 matcher->reset(src);
1473 UnicodeString result = matcher->replaceAll(substitute, status);
1474 REGEX_CHECK_STATUS;
1475 UnicodeString expected = UnicodeString("--");
1476 expected.append((UChar32)0x10000);
1477 expected.append("-- !");
1478 REGEX_ASSERT(result == expected);
1479 }
1480 // TODO: need more through testing of capture substitutions.
1481
1482 // Bug 4057
1483 //
1484 {
1485 status = U_ZERO_ERROR;
1486 UnicodeString s = "The matches start with ss and end with ee ss stuff ee fin";
1487 RegexMatcher m("ss(.*?)ee", 0, status);
1488 REGEX_CHECK_STATUS;
1489 UnicodeString result;
1490
1491 // Multiple finds do NOT bump up the previous appendReplacement postion.
1492 m.reset(s);
1493 m.find();
1494 m.find();
1495 m.appendReplacement(result, "ooh", status);
1496 REGEX_CHECK_STATUS;
1497 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1498
1499 // After a reset into the interior of a string, appendReplacemnt still starts at beginning.
1500 status = U_ZERO_ERROR;
1501 result.truncate(0);
1502 m.reset(10, status);
1503 m.find();
1504 m.find();
1505 m.appendReplacement(result, "ooh", status);
1506 REGEX_CHECK_STATUS;
1507 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1508
1509 // find() at interior of string, appendReplacemnt still starts at beginning.
1510 status = U_ZERO_ERROR;
1511 result.truncate(0);
1512 m.reset();
1513 m.find(10, status);
1514 m.find();
1515 m.appendReplacement(result, "ooh", status);
1516 REGEX_CHECK_STATUS;
1517 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh");
1518
1519 m.appendTail(result);
1520 REGEX_ASSERT(result == "The matches start with ss and end with ee ooh fin");
1521
1522 }
1523
1524 delete matcher2;
1525 delete pat2;
1526 delete matcher;
1527 delete pat;
1528 }
1529
1530
1531 //---------------------------------------------------------------------------
1532 //
1533 // API_Pattern Test that the API for class RegexPattern is
1534 // present and nominally working.
1535 //
1536 //---------------------------------------------------------------------------
API_Pattern()1537 void RegexTest::API_Pattern() {
1538 RegexPattern pata; // Test default constructor to not crash.
1539 RegexPattern patb;
1540
1541 REGEX_ASSERT(pata == patb);
1542 REGEX_ASSERT(pata == pata);
1543
1544 UnicodeString re1("abc[a-l][m-z]");
1545 UnicodeString re2("def");
1546 UErrorCode status = U_ZERO_ERROR;
1547 UParseError pe;
1548
1549 RegexPattern *pat1 = RegexPattern::compile(re1, 0, pe, status);
1550 RegexPattern *pat2 = RegexPattern::compile(re2, 0, pe, status);
1551 REGEX_CHECK_STATUS;
1552 REGEX_ASSERT(*pat1 == *pat1);
1553 REGEX_ASSERT(*pat1 != pata);
1554
1555 // Assign
1556 patb = *pat1;
1557 REGEX_ASSERT(patb == *pat1);
1558
1559 // Copy Construct
1560 RegexPattern patc(*pat1);
1561 REGEX_ASSERT(patc == *pat1);
1562 REGEX_ASSERT(patb == patc);
1563 REGEX_ASSERT(pat1 != pat2);
1564 patb = *pat2;
1565 REGEX_ASSERT(patb != patc);
1566 REGEX_ASSERT(patb == *pat2);
1567
1568 // Compile with no flags.
1569 RegexPattern *pat1a = RegexPattern::compile(re1, pe, status);
1570 REGEX_ASSERT(*pat1a == *pat1);
1571
1572 REGEX_ASSERT(pat1a->flags() == 0);
1573
1574 // Compile with different flags should be not equal
1575 RegexPattern *pat1b = RegexPattern::compile(re1, UREGEX_CASE_INSENSITIVE, pe, status);
1576 REGEX_CHECK_STATUS;
1577
1578 REGEX_ASSERT(*pat1b != *pat1a);
1579 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
1580 REGEX_ASSERT(pat1a->flags() == 0);
1581 delete pat1b;
1582
1583 // clone
1584 RegexPattern *pat1c = pat1->clone();
1585 REGEX_ASSERT(*pat1c == *pat1);
1586 REGEX_ASSERT(*pat1c != *pat2);
1587
1588 delete pat1c;
1589 delete pat1a;
1590 delete pat1;
1591 delete pat2;
1592
1593
1594 //
1595 // Verify that a matcher created from a cloned pattern works.
1596 // (Jitterbug 3423)
1597 //
1598 {
1599 UErrorCode status = U_ZERO_ERROR;
1600 RegexPattern *pSource = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\p{L}+"), 0, status);
1601 RegexPattern *pClone = pSource->clone();
1602 delete pSource;
1603 RegexMatcher *mFromClone = pClone->matcher(status);
1604 REGEX_CHECK_STATUS;
1605 UnicodeString s = "Hello World";
1606 mFromClone->reset(s);
1607 REGEX_ASSERT(mFromClone->find() == TRUE);
1608 REGEX_ASSERT(mFromClone->group(status) == "Hello");
1609 REGEX_ASSERT(mFromClone->find() == TRUE);
1610 REGEX_ASSERT(mFromClone->group(status) == "World");
1611 REGEX_ASSERT(mFromClone->find() == FALSE);
1612 delete mFromClone;
1613 delete pClone;
1614 }
1615
1616 //
1617 // matches convenience API
1618 //
1619 REGEX_ASSERT(RegexPattern::matches(".*", "random input", pe, status) == TRUE);
1620 REGEX_CHECK_STATUS;
1621 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
1622 REGEX_CHECK_STATUS;
1623 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
1624 REGEX_CHECK_STATUS;
1625 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
1626 REGEX_CHECK_STATUS;
1627 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
1628 REGEX_CHECK_STATUS;
1629 status = U_INDEX_OUTOFBOUNDS_ERROR;
1630 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
1631 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1632
1633
1634 //
1635 // Split()
1636 //
1637 status = U_ZERO_ERROR;
1638 pat1 = RegexPattern::compile(" +", pe, status);
1639 REGEX_CHECK_STATUS;
1640 UnicodeString fields[10];
1641
1642 int32_t n;
1643 n = pat1->split("Now is the time", fields, 10, status);
1644 REGEX_CHECK_STATUS;
1645 REGEX_ASSERT(n==4);
1646 REGEX_ASSERT(fields[0]=="Now");
1647 REGEX_ASSERT(fields[1]=="is");
1648 REGEX_ASSERT(fields[2]=="the");
1649 REGEX_ASSERT(fields[3]=="time");
1650 REGEX_ASSERT(fields[4]=="");
1651
1652 n = pat1->split("Now is the time", fields, 2, status);
1653 REGEX_CHECK_STATUS;
1654 REGEX_ASSERT(n==2);
1655 REGEX_ASSERT(fields[0]=="Now");
1656 REGEX_ASSERT(fields[1]=="is the time");
1657 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
1658
1659 fields[1] = "*";
1660 status = U_ZERO_ERROR;
1661 n = pat1->split("Now is the time", fields, 1, status);
1662 REGEX_CHECK_STATUS;
1663 REGEX_ASSERT(n==1);
1664 REGEX_ASSERT(fields[0]=="Now is the time");
1665 REGEX_ASSERT(fields[1]=="*");
1666 status = U_ZERO_ERROR;
1667
1668 n = pat1->split(" Now is the time ", fields, 10, status);
1669 REGEX_CHECK_STATUS;
1670 REGEX_ASSERT(n==6);
1671 REGEX_ASSERT(fields[0]=="");
1672 REGEX_ASSERT(fields[1]=="Now");
1673 REGEX_ASSERT(fields[2]=="is");
1674 REGEX_ASSERT(fields[3]=="the");
1675 REGEX_ASSERT(fields[4]=="time");
1676 REGEX_ASSERT(fields[5]=="");
1677
1678 n = pat1->split(" ", fields, 10, status);
1679 REGEX_CHECK_STATUS;
1680 REGEX_ASSERT(n==2);
1681 REGEX_ASSERT(fields[0]=="");
1682 REGEX_ASSERT(fields[1]=="");
1683
1684 fields[0] = "foo";
1685 n = pat1->split("", fields, 10, status);
1686 REGEX_CHECK_STATUS;
1687 REGEX_ASSERT(n==0);
1688 REGEX_ASSERT(fields[0]=="foo");
1689
1690 delete pat1;
1691
1692 // split, with a pattern with (capture)
1693 pat1 = RegexPattern::compile(UNICODE_STRING_SIMPLE("<(\\w*)>"), pe, status);
1694 REGEX_CHECK_STATUS;
1695
1696 status = U_ZERO_ERROR;
1697 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
1698 REGEX_CHECK_STATUS;
1699 REGEX_ASSERT(n==7);
1700 REGEX_ASSERT(fields[0]=="");
1701 REGEX_ASSERT(fields[1]=="a");
1702 REGEX_ASSERT(fields[2]=="Now is ");
1703 REGEX_ASSERT(fields[3]=="b");
1704 REGEX_ASSERT(fields[4]=="the time");
1705 REGEX_ASSERT(fields[5]=="c");
1706 REGEX_ASSERT(fields[6]=="");
1707 REGEX_ASSERT(status==U_ZERO_ERROR);
1708
1709 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
1710 REGEX_CHECK_STATUS;
1711 REGEX_ASSERT(n==7);
1712 REGEX_ASSERT(fields[0]==" ");
1713 REGEX_ASSERT(fields[1]=="a");
1714 REGEX_ASSERT(fields[2]=="Now is ");
1715 REGEX_ASSERT(fields[3]=="b");
1716 REGEX_ASSERT(fields[4]=="the time");
1717 REGEX_ASSERT(fields[5]=="c");
1718 REGEX_ASSERT(fields[6]=="");
1719
1720 status = U_ZERO_ERROR;
1721 fields[6] = "foo";
1722 n = pat1->split(" <a>Now is <b>the time<c>", fields, 6, status);
1723 REGEX_CHECK_STATUS;
1724 REGEX_ASSERT(n==6);
1725 REGEX_ASSERT(fields[0]==" ");
1726 REGEX_ASSERT(fields[1]=="a");
1727 REGEX_ASSERT(fields[2]=="Now is ");
1728 REGEX_ASSERT(fields[3]=="b");
1729 REGEX_ASSERT(fields[4]=="the time");
1730 REGEX_ASSERT(fields[5]==""); // All text following "<c>" field delimiter.
1731 REGEX_ASSERT(fields[6]=="foo");
1732
1733 status = U_ZERO_ERROR;
1734 fields[5] = "foo";
1735 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
1736 REGEX_CHECK_STATUS;
1737 REGEX_ASSERT(n==5);
1738 REGEX_ASSERT(fields[0]==" ");
1739 REGEX_ASSERT(fields[1]=="a");
1740 REGEX_ASSERT(fields[2]=="Now is ");
1741 REGEX_ASSERT(fields[3]=="b");
1742 REGEX_ASSERT(fields[4]=="the time<c>");
1743 REGEX_ASSERT(fields[5]=="foo");
1744
1745 status = U_ZERO_ERROR;
1746 fields[5] = "foo";
1747 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
1748 REGEX_CHECK_STATUS;
1749 REGEX_ASSERT(n==5);
1750 REGEX_ASSERT(fields[0]==" ");
1751 REGEX_ASSERT(fields[1]=="a");
1752 REGEX_ASSERT(fields[2]=="Now is ");
1753 REGEX_ASSERT(fields[3]=="b");
1754 REGEX_ASSERT(fields[4]=="the time");
1755 REGEX_ASSERT(fields[5]=="foo");
1756
1757 status = U_ZERO_ERROR;
1758 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
1759 REGEX_CHECK_STATUS;
1760 REGEX_ASSERT(n==4);
1761 REGEX_ASSERT(fields[0]==" ");
1762 REGEX_ASSERT(fields[1]=="a");
1763 REGEX_ASSERT(fields[2]=="Now is ");
1764 REGEX_ASSERT(fields[3]=="the time<c>");
1765 status = U_ZERO_ERROR;
1766 delete pat1;
1767
1768 pat1 = RegexPattern::compile("([-,])", pe, status);
1769 REGEX_CHECK_STATUS;
1770 n = pat1->split("1-10,20", fields, 10, status);
1771 REGEX_CHECK_STATUS;
1772 REGEX_ASSERT(n==5);
1773 REGEX_ASSERT(fields[0]=="1");
1774 REGEX_ASSERT(fields[1]=="-");
1775 REGEX_ASSERT(fields[2]=="10");
1776 REGEX_ASSERT(fields[3]==",");
1777 REGEX_ASSERT(fields[4]=="20");
1778 delete pat1;
1779
1780 // Test split of string with empty trailing fields
1781 pat1 = RegexPattern::compile(",", pe, status);
1782 REGEX_CHECK_STATUS;
1783 n = pat1->split("a,b,c,", fields, 10, status);
1784 REGEX_CHECK_STATUS;
1785 REGEX_ASSERT(n==4);
1786 REGEX_ASSERT(fields[0]=="a");
1787 REGEX_ASSERT(fields[1]=="b");
1788 REGEX_ASSERT(fields[2]=="c");
1789 REGEX_ASSERT(fields[3]=="");
1790
1791 n = pat1->split("a,,,", fields, 10, status);
1792 REGEX_CHECK_STATUS;
1793 REGEX_ASSERT(n==4);
1794 REGEX_ASSERT(fields[0]=="a");
1795 REGEX_ASSERT(fields[1]=="");
1796 REGEX_ASSERT(fields[2]=="");
1797 REGEX_ASSERT(fields[3]=="");
1798 delete pat1;
1799
1800 // Split Separator with zero length match.
1801 pat1 = RegexPattern::compile(":?", pe, status);
1802 REGEX_CHECK_STATUS;
1803 n = pat1->split("abc", fields, 10, status);
1804 REGEX_CHECK_STATUS;
1805 REGEX_ASSERT(n==5);
1806 REGEX_ASSERT(fields[0]=="");
1807 REGEX_ASSERT(fields[1]=="a");
1808 REGEX_ASSERT(fields[2]=="b");
1809 REGEX_ASSERT(fields[3]=="c");
1810 REGEX_ASSERT(fields[4]=="");
1811
1812 delete pat1;
1813
1814 //
1815 // RegexPattern::pattern()
1816 //
1817 pat1 = new RegexPattern();
1818 REGEX_ASSERT(pat1->pattern() == "");
1819 delete pat1;
1820
1821 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1822 REGEX_CHECK_STATUS;
1823 REGEX_ASSERT(pat1->pattern() == "(Hello, world)*");
1824 delete pat1;
1825
1826
1827 //
1828 // classID functions
1829 //
1830 pat1 = RegexPattern::compile("(Hello, world)*", pe, status);
1831 REGEX_CHECK_STATUS;
1832 REGEX_ASSERT(pat1->getDynamicClassID() == RegexPattern::getStaticClassID());
1833 REGEX_ASSERT(pat1->getDynamicClassID() != NULL);
1834 UnicodeString Hello("Hello, world.");
1835 RegexMatcher *m = pat1->matcher(Hello, status);
1836 REGEX_ASSERT(pat1->getDynamicClassID() != m->getDynamicClassID());
1837 REGEX_ASSERT(m->getDynamicClassID() == RegexMatcher::getStaticClassID());
1838 REGEX_ASSERT(m->getDynamicClassID() != NULL);
1839 delete m;
1840 delete pat1;
1841
1842 }
1843
1844 //---------------------------------------------------------------------------
1845 //
1846 // API_Match_UTF8 Test that the alternate engine for class RegexMatcher
1847 // is present and working, but excluding functions
1848 // implementing replace operations.
1849 //
1850 //---------------------------------------------------------------------------
API_Match_UTF8()1851 void RegexTest::API_Match_UTF8() {
1852 UParseError pe;
1853 UErrorCode status=U_ZERO_ERROR;
1854 int32_t flags = 0;
1855
1856 //
1857 // Debug - slide failing test cases early
1858 //
1859 #if 0
1860 {
1861 }
1862 return;
1863 #endif
1864
1865 //
1866 // Simple pattern compilation
1867 //
1868 {
1869 UText re = UTEXT_INITIALIZER;
1870 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
1871 REGEX_VERBOSE_TEXT(&re);
1872 RegexPattern *pat2;
1873 pat2 = RegexPattern::compile(&re, flags, pe, status);
1874 REGEX_CHECK_STATUS;
1875
1876 UText input1 = UTEXT_INITIALIZER;
1877 UText input2 = UTEXT_INITIALIZER;
1878 UText empty = UTEXT_INITIALIZER;
1879 regextst_openUTF8FromInvariant(&input1, "abcdef this is a test", -1, &status);
1880 REGEX_VERBOSE_TEXT(&input1);
1881 regextst_openUTF8FromInvariant(&input2, "not abc", -1, &status);
1882 REGEX_VERBOSE_TEXT(&input2);
1883 utext_openUChars(&empty, NULL, 0, &status);
1884
1885 int32_t input1Len = strlen("abcdef this is a test"); /* TODO: why not nativelen (input1) ? */
1886 int32_t input2Len = strlen("not abc");
1887
1888
1889 //
1890 // Matcher creation and reset.
1891 //
1892 RegexMatcher *m1 = &pat2->matcher(status)->reset(&input1);
1893 REGEX_CHECK_STATUS;
1894 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1895 const char str_abcdefthisisatest[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x74, 0x65, 0x73, 0x74, 0x00 }; /* abcdef this is a test */
1896 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1897 m1->reset(&input2);
1898 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1899 const char str_notabc[] = { 0x6e, 0x6f, 0x74, 0x20, 0x61, 0x62, 0x63, 0x00 }; /* not abc */
1900 REGEX_ASSERT_UTEXT_UTF8(str_notabc, m1->inputText());
1901 m1->reset(&input1);
1902 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1903 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1904 m1->reset(&empty);
1905 REGEX_ASSERT(m1->lookingAt(status) == FALSE);
1906 REGEX_ASSERT(utext_nativeLength(&empty) == 0);
1907
1908 //
1909 // reset(pos, status)
1910 //
1911 m1->reset(&input1);
1912 m1->reset(4, status);
1913 REGEX_CHECK_STATUS;
1914 REGEX_ASSERT_UTEXT_UTF8(str_abcdefthisisatest, m1->inputText());
1915 REGEX_ASSERT(m1->lookingAt(status) == TRUE);
1916
1917 m1->reset(-1, status);
1918 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1919 status = U_ZERO_ERROR;
1920
1921 m1->reset(0, status);
1922 REGEX_CHECK_STATUS;
1923 status = U_ZERO_ERROR;
1924
1925 m1->reset(input1Len-1, status);
1926 REGEX_CHECK_STATUS;
1927 status = U_ZERO_ERROR;
1928
1929 m1->reset(input1Len, status);
1930 REGEX_CHECK_STATUS;
1931 status = U_ZERO_ERROR;
1932
1933 m1->reset(input1Len+1, status);
1934 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1935 status = U_ZERO_ERROR;
1936
1937 //
1938 // match(pos, status)
1939 //
1940 m1->reset(&input2);
1941 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1942 m1->reset();
1943 REGEX_ASSERT(m1->matches(3, status) == FALSE);
1944 m1->reset();
1945 REGEX_ASSERT(m1->matches(5, status) == FALSE);
1946 REGEX_ASSERT(m1->matches(4, status) == TRUE);
1947 REGEX_ASSERT(m1->matches(-1, status) == FALSE);
1948 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1949
1950 // Match() at end of string should fail, but should not
1951 // be an error.
1952 status = U_ZERO_ERROR;
1953 REGEX_ASSERT(m1->matches(input2Len, status) == FALSE);
1954 REGEX_CHECK_STATUS;
1955
1956 // Match beyond end of string should fail with an error.
1957 status = U_ZERO_ERROR;
1958 REGEX_ASSERT(m1->matches(input2Len+1, status) == FALSE);
1959 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1960
1961 // Successful match at end of string.
1962 {
1963 status = U_ZERO_ERROR;
1964 RegexMatcher m("A?", 0, status); // will match zero length string.
1965 REGEX_CHECK_STATUS;
1966 m.reset(&input1);
1967 REGEX_ASSERT(m.matches(input1Len, status) == TRUE);
1968 REGEX_CHECK_STATUS;
1969 m.reset(&empty);
1970 REGEX_ASSERT(m.matches(0, status) == TRUE);
1971 REGEX_CHECK_STATUS;
1972 }
1973
1974
1975 //
1976 // lookingAt(pos, status)
1977 //
1978 status = U_ZERO_ERROR;
1979 m1->reset(&input2); // "not abc"
1980 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1981 REGEX_ASSERT(m1->lookingAt(5, status) == FALSE);
1982 REGEX_ASSERT(m1->lookingAt(3, status) == FALSE);
1983 REGEX_ASSERT(m1->lookingAt(4, status) == TRUE);
1984 REGEX_ASSERT(m1->lookingAt(-1, status) == FALSE);
1985 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1986 status = U_ZERO_ERROR;
1987 REGEX_ASSERT(m1->lookingAt(input2Len, status) == FALSE);
1988 REGEX_CHECK_STATUS;
1989 REGEX_ASSERT(m1->lookingAt(input2Len+1, status) == FALSE);
1990 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
1991
1992 delete m1;
1993 delete pat2;
1994
1995 utext_close(&re);
1996 utext_close(&input1);
1997 utext_close(&input2);
1998 utext_close(&empty);
1999 }
2000
2001
2002 //
2003 // Capture Group.
2004 // RegexMatcher::start();
2005 // RegexMatcher::end();
2006 // RegexMatcher::groupCount();
2007 //
2008 {
2009 int32_t flags=0;
2010 UParseError pe;
2011 UErrorCode status=U_ZERO_ERROR;
2012 UText re=UTEXT_INITIALIZER;
2013 const char str_01234567_pat[] = { 0x30, 0x31, 0x28, 0x32, 0x33, 0x28, 0x34, 0x35, 0x29, 0x36, 0x37, 0x29, 0x28, 0x2e, 0x2a, 0x29, 0x00 }; /* 01(23(45)67)(.*) */
2014 utext_openUTF8(&re, str_01234567_pat, -1, &status);
2015
2016 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2017 REGEX_CHECK_STATUS;
2018
2019 UText input = UTEXT_INITIALIZER;
2020 const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2021 utext_openUTF8(&input, str_0123456789, -1, &status);
2022
2023 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2024 REGEX_CHECK_STATUS;
2025 REGEX_ASSERT(matcher->lookingAt(status) == TRUE);
2026 static const int32_t matchStarts[] = {0, 2, 4, 8};
2027 static const int32_t matchEnds[] = {10, 8, 6, 10};
2028 int32_t i;
2029 for (i=0; i<4; i++) {
2030 int32_t actualStart = matcher->start(i, status);
2031 REGEX_CHECK_STATUS;
2032 if (actualStart != matchStarts[i]) {
2033 errln("RegexTest failure at %s:%d, index %d. Expected %d, got %d\n",
2034 __FILE__, __LINE__, i, matchStarts[i], actualStart);
2035 }
2036 int32_t actualEnd = matcher->end(i, status);
2037 REGEX_CHECK_STATUS;
2038 if (actualEnd != matchEnds[i]) {
2039 errln("RegexTest failure at %s:%d index %d. Expected %d, got %d\n",
2040 __FILE__, __LINE__, i, matchEnds[i], actualEnd);
2041 }
2042 }
2043
2044 REGEX_ASSERT(matcher->start(0, status) == matcher->start(status));
2045 REGEX_ASSERT(matcher->end(0, status) == matcher->end(status));
2046
2047 REGEX_ASSERT_FAIL(matcher->start(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2048 REGEX_ASSERT_FAIL(matcher->start( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2049 matcher->reset();
2050 REGEX_ASSERT_FAIL(matcher->start( 0, status), U_REGEX_INVALID_STATE);
2051
2052 matcher->lookingAt(status);
2053
2054 UnicodeString dest;
2055 UText destText = UTEXT_INITIALIZER;
2056 utext_openUnicodeString(&destText, &dest, &status);
2057 UText *result;
2058 //const char str_0123456789[] = { 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x00 }; /* 0123456789 */
2059 // Test shallow-clone API
2060 int64_t group_len;
2061 result = matcher->group((UText *)NULL, group_len, status);
2062 REGEX_CHECK_STATUS;
2063 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2064 utext_close(result);
2065 result = matcher->group(0, &destText, group_len, status);
2066 REGEX_CHECK_STATUS;
2067 REGEX_ASSERT(result == &destText);
2068 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2069 // destText is now immutable, reopen it
2070 utext_close(&destText);
2071 utext_openUnicodeString(&destText, &dest, &status);
2072
2073 int64_t length;
2074 result = matcher->group(0, NULL, length, status);
2075 REGEX_CHECK_STATUS;
2076 REGEX_ASSERT_UTEXT_UTF8(str_0123456789, result);
2077 utext_close(result);
2078 result = matcher->group(0, &destText, length, status);
2079 REGEX_CHECK_STATUS;
2080 REGEX_ASSERT(result == &destText);
2081 REGEX_ASSERT(utext_getNativeIndex(result) == 0);
2082 REGEX_ASSERT(length == 10);
2083 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2084
2085 // Capture Group 1 == "234567"
2086 result = matcher->group(1, NULL, length, status);
2087 REGEX_CHECK_STATUS;
2088 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2089 REGEX_ASSERT(length == 6);
2090 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2091 utext_close(result);
2092
2093 result = matcher->group(1, &destText, length, status);
2094 REGEX_CHECK_STATUS;
2095 REGEX_ASSERT(result == &destText);
2096 REGEX_ASSERT(utext_getNativeIndex(result) == 2);
2097 REGEX_ASSERT(length == 6);
2098 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2099 utext_close(result);
2100
2101 // Capture Group 2 == "45"
2102 result = matcher->group(2, NULL, length, status);
2103 REGEX_CHECK_STATUS;
2104 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2105 REGEX_ASSERT(length == 2);
2106 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2107 utext_close(result);
2108
2109 result = matcher->group(2, &destText, length, status);
2110 REGEX_CHECK_STATUS;
2111 REGEX_ASSERT(result == &destText);
2112 REGEX_ASSERT(utext_getNativeIndex(result) == 4);
2113 REGEX_ASSERT(length == 2);
2114 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2115 utext_close(result);
2116
2117 // Capture Group 3 == "89"
2118 result = matcher->group(3, NULL, length, status);
2119 REGEX_CHECK_STATUS;
2120 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2121 REGEX_ASSERT(length == 2);
2122 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2123 utext_close(result);
2124
2125 result = matcher->group(3, &destText, length, status);
2126 REGEX_CHECK_STATUS;
2127 REGEX_ASSERT(result == &destText);
2128 REGEX_ASSERT(utext_getNativeIndex(result) == 8);
2129 REGEX_ASSERT(length == 2);
2130 REGEX_ASSERT_UTEXT_INVARIANT("0123456789", result);
2131 utext_close(result);
2132
2133 // Capture Group number out of range.
2134 status = U_ZERO_ERROR;
2135 REGEX_ASSERT_FAIL(matcher->group(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2136 status = U_ZERO_ERROR;
2137 REGEX_ASSERT_FAIL(matcher->group( 4, status), U_INDEX_OUTOFBOUNDS_ERROR);
2138 status = U_ZERO_ERROR;
2139 matcher->reset();
2140 REGEX_ASSERT_FAIL(matcher->group( 0, status), U_REGEX_INVALID_STATE);
2141
2142 delete matcher;
2143 delete pat;
2144
2145 utext_close(&destText);
2146 utext_close(&input);
2147 utext_close(&re);
2148 }
2149
2150 //
2151 // find
2152 //
2153 {
2154 int32_t flags=0;
2155 UParseError pe;
2156 UErrorCode status=U_ZERO_ERROR;
2157 UText re=UTEXT_INITIALIZER;
2158 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2159 utext_openUTF8(&re, str_abc, -1, &status);
2160
2161 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2162 REGEX_CHECK_STATUS;
2163 UText input = UTEXT_INITIALIZER;
2164 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2165 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2166 // 012345678901234567
2167
2168 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2169 REGEX_CHECK_STATUS;
2170 REGEX_ASSERT(matcher->find());
2171 REGEX_ASSERT(matcher->start(status) == 1);
2172 REGEX_ASSERT(matcher->find());
2173 REGEX_ASSERT(matcher->start(status) == 6);
2174 REGEX_ASSERT(matcher->find());
2175 REGEX_ASSERT(matcher->start(status) == 12);
2176 REGEX_ASSERT(matcher->find() == FALSE);
2177 REGEX_ASSERT(matcher->find() == FALSE);
2178
2179 matcher->reset();
2180 REGEX_ASSERT(matcher->find());
2181 REGEX_ASSERT(matcher->start(status) == 1);
2182
2183 REGEX_ASSERT(matcher->find(0, status));
2184 REGEX_ASSERT(matcher->start(status) == 1);
2185 REGEX_ASSERT(matcher->find(1, status));
2186 REGEX_ASSERT(matcher->start(status) == 1);
2187 REGEX_ASSERT(matcher->find(2, status));
2188 REGEX_ASSERT(matcher->start(status) == 6);
2189 REGEX_ASSERT(matcher->find(12, status));
2190 REGEX_ASSERT(matcher->start(status) == 12);
2191 REGEX_ASSERT(matcher->find(13, status) == FALSE);
2192 REGEX_ASSERT(matcher->find(16, status) == FALSE);
2193 REGEX_ASSERT(matcher->find(17, status) == FALSE);
2194 REGEX_ASSERT_FAIL(matcher->start(status), U_REGEX_INVALID_STATE);
2195
2196 status = U_ZERO_ERROR;
2197 REGEX_ASSERT_FAIL(matcher->find(-1, status), U_INDEX_OUTOFBOUNDS_ERROR);
2198 status = U_ZERO_ERROR;
2199 REGEX_ASSERT_FAIL(matcher->find(18, status), U_INDEX_OUTOFBOUNDS_ERROR);
2200
2201 REGEX_ASSERT(matcher->groupCount() == 0);
2202
2203 delete matcher;
2204 delete pat;
2205
2206 utext_close(&input);
2207 utext_close(&re);
2208 }
2209
2210
2211 //
2212 // find, with \G in pattern (true if at the end of a previous match).
2213 //
2214 {
2215 int32_t flags=0;
2216 UParseError pe;
2217 UErrorCode status=U_ZERO_ERROR;
2218 UText re=UTEXT_INITIALIZER;
2219 const char str_Gabcabc[] = { 0x2e, 0x2a, 0x3f, 0x28, 0x3f, 0x3a, 0x28, 0x5c, 0x47, 0x61, 0x62, 0x63, 0x29, 0x7c, 0x28, 0x61, 0x62, 0x63, 0x29, 0x29, 0x00 }; /* .*?(?:(\\Gabc)|(abc)) */
2220 utext_openUTF8(&re, str_Gabcabc, -1, &status);
2221
2222 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2223
2224 REGEX_CHECK_STATUS;
2225 UText input = UTEXT_INITIALIZER;
2226 const char str_abcabcabc[] = { 0x2e, 0x61, 0x62, 0x63, 0x61, 0x62, 0x63, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abcabc.abc.. */
2227 utext_openUTF8(&input, str_abcabcabc, -1, &status);
2228 // 012345678901234567
2229
2230 RegexMatcher *matcher = &pat->matcher(status)->reset(&input);
2231 REGEX_CHECK_STATUS;
2232 REGEX_ASSERT(matcher->find());
2233 REGEX_ASSERT(matcher->start(status) == 0);
2234 REGEX_ASSERT(matcher->start(1, status) == -1);
2235 REGEX_ASSERT(matcher->start(2, status) == 1);
2236
2237 REGEX_ASSERT(matcher->find());
2238 REGEX_ASSERT(matcher->start(status) == 4);
2239 REGEX_ASSERT(matcher->start(1, status) == 4);
2240 REGEX_ASSERT(matcher->start(2, status) == -1);
2241 REGEX_CHECK_STATUS;
2242
2243 delete matcher;
2244 delete pat;
2245
2246 utext_close(&input);
2247 utext_close(&re);
2248 }
2249
2250 //
2251 // find with zero length matches, match position should bump ahead
2252 // to prevent loops.
2253 //
2254 {
2255 int32_t i;
2256 UErrorCode status=U_ZERO_ERROR;
2257 RegexMatcher m("(?= ?)", 0, status); // This pattern will zero-length matches anywhere,
2258 // using an always-true look-ahead.
2259 REGEX_CHECK_STATUS;
2260 UText s = UTEXT_INITIALIZER;
2261 utext_openUTF8(&s, " ", -1, &status);
2262 m.reset(&s);
2263 for (i=0; ; i++) {
2264 if (m.find() == FALSE) {
2265 break;
2266 }
2267 REGEX_ASSERT(m.start(status) == i);
2268 REGEX_ASSERT(m.end(status) == i);
2269 }
2270 REGEX_ASSERT(i==5);
2271
2272 // Check that the bump goes over characters outside the BMP OK
2273 // "\\U00010001\\U00010002\\U00010003\\U00010004".unescape()...in UTF-8
2274 unsigned char aboveBMP[] = {0xF0, 0x90, 0x80, 0x81, 0xF0, 0x90, 0x80, 0x82, 0xF0, 0x90, 0x80, 0x83, 0xF0, 0x90, 0x80, 0x84, 0x00};
2275 utext_openUTF8(&s, (char *)aboveBMP, -1, &status);
2276 m.reset(&s);
2277 for (i=0; ; i+=4) {
2278 if (m.find() == FALSE) {
2279 break;
2280 }
2281 REGEX_ASSERT(m.start(status) == i);
2282 REGEX_ASSERT(m.end(status) == i);
2283 }
2284 REGEX_ASSERT(i==20);
2285
2286 utext_close(&s);
2287 }
2288 {
2289 // find() loop breaking test.
2290 // with pattern of /.?/, should see a series of one char matches, then a single
2291 // match of zero length at the end of the input string.
2292 int32_t i;
2293 UErrorCode status=U_ZERO_ERROR;
2294 RegexMatcher m(".?", 0, status);
2295 REGEX_CHECK_STATUS;
2296 UText s = UTEXT_INITIALIZER;
2297 utext_openUTF8(&s, " ", -1, &status);
2298 m.reset(&s);
2299 for (i=0; ; i++) {
2300 if (m.find() == FALSE) {
2301 break;
2302 }
2303 REGEX_ASSERT(m.start(status) == i);
2304 REGEX_ASSERT(m.end(status) == (i<4 ? i+1 : i));
2305 }
2306 REGEX_ASSERT(i==5);
2307
2308 utext_close(&s);
2309 }
2310
2311
2312 //
2313 // Matchers with no input string behave as if they had an empty input string.
2314 //
2315
2316 {
2317 UErrorCode status = U_ZERO_ERROR;
2318 RegexMatcher m(".?", 0, status);
2319 REGEX_CHECK_STATUS;
2320 REGEX_ASSERT(m.find());
2321 REGEX_ASSERT(m.start(status) == 0);
2322 REGEX_ASSERT(m.input() == "");
2323 }
2324 {
2325 UErrorCode status = U_ZERO_ERROR;
2326 RegexPattern *p = RegexPattern::compile(".", 0, status);
2327 RegexMatcher *m = p->matcher(status);
2328 REGEX_CHECK_STATUS;
2329
2330 REGEX_ASSERT(m->find() == FALSE);
2331 REGEX_ASSERT(utext_nativeLength(m->inputText()) == 0);
2332 delete m;
2333 delete p;
2334 }
2335
2336 //
2337 // Regions
2338 //
2339 {
2340 UErrorCode status = U_ZERO_ERROR;
2341 UText testPattern = UTEXT_INITIALIZER;
2342 UText testText = UTEXT_INITIALIZER;
2343 regextst_openUTF8FromInvariant(&testPattern, ".*", -1, &status);
2344 REGEX_VERBOSE_TEXT(&testPattern);
2345 regextst_openUTF8FromInvariant(&testText, "This is test data", -1, &status);
2346 REGEX_VERBOSE_TEXT(&testText);
2347
2348 RegexMatcher m(&testPattern, &testText, 0, status);
2349 REGEX_CHECK_STATUS;
2350 REGEX_ASSERT(m.regionStart() == 0);
2351 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2352 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2353 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2354
2355 m.region(2,4, status);
2356 REGEX_CHECK_STATUS;
2357 REGEX_ASSERT(m.matches(status));
2358 REGEX_ASSERT(m.start(status)==2);
2359 REGEX_ASSERT(m.end(status)==4);
2360 REGEX_CHECK_STATUS;
2361
2362 m.reset();
2363 REGEX_ASSERT(m.regionStart() == 0);
2364 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("This is test data"));
2365
2366 regextst_openUTF8FromInvariant(&testText, "short", -1, &status);
2367 REGEX_VERBOSE_TEXT(&testText);
2368 m.reset(&testText);
2369 REGEX_ASSERT(m.regionStart() == 0);
2370 REGEX_ASSERT(m.regionEnd() == (int32_t)strlen("short"));
2371
2372 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2373 REGEX_ASSERT(&m == &m.useAnchoringBounds(FALSE));
2374 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2375 REGEX_ASSERT(&m == &m.reset());
2376 REGEX_ASSERT(m.hasAnchoringBounds() == FALSE);
2377
2378 REGEX_ASSERT(&m == &m.useAnchoringBounds(TRUE));
2379 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2380 REGEX_ASSERT(&m == &m.reset());
2381 REGEX_ASSERT(m.hasAnchoringBounds() == TRUE);
2382
2383 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2384 REGEX_ASSERT(&m == &m.useTransparentBounds(TRUE));
2385 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2386 REGEX_ASSERT(&m == &m.reset());
2387 REGEX_ASSERT(m.hasTransparentBounds() == TRUE);
2388
2389 REGEX_ASSERT(&m == &m.useTransparentBounds(FALSE));
2390 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2391 REGEX_ASSERT(&m == &m.reset());
2392 REGEX_ASSERT(m.hasTransparentBounds() == FALSE);
2393
2394 utext_close(&testText);
2395 utext_close(&testPattern);
2396 }
2397
2398 //
2399 // hitEnd() and requireEnd()
2400 //
2401 {
2402 UErrorCode status = U_ZERO_ERROR;
2403 UText testPattern = UTEXT_INITIALIZER;
2404 UText testText = UTEXT_INITIALIZER;
2405 const char str_[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2406 const char str_aabb[] = { 0x61, 0x61, 0x62, 0x62, 0x00 }; /* aabb */
2407 utext_openUTF8(&testPattern, str_, -1, &status);
2408 utext_openUTF8(&testText, str_aabb, -1, &status);
2409
2410 RegexMatcher m1(&testPattern, &testText, 0, status);
2411 REGEX_ASSERT(m1.lookingAt(status) == TRUE);
2412 REGEX_ASSERT(m1.hitEnd() == TRUE);
2413 REGEX_ASSERT(m1.requireEnd() == FALSE);
2414 REGEX_CHECK_STATUS;
2415
2416 status = U_ZERO_ERROR;
2417 const char str_a[] = { 0x61, 0x2a, 0x00 }; /* a* */
2418 utext_openUTF8(&testPattern, str_a, -1, &status);
2419 RegexMatcher m2(&testPattern, &testText, 0, status);
2420 REGEX_ASSERT(m2.lookingAt(status) == TRUE);
2421 REGEX_ASSERT(m2.hitEnd() == FALSE);
2422 REGEX_ASSERT(m2.requireEnd() == FALSE);
2423 REGEX_CHECK_STATUS;
2424
2425 status = U_ZERO_ERROR;
2426 const char str_dotstardollar[] = { 0x2e, 0x2a, 0x24, 0x00 }; /* .*$ */
2427 utext_openUTF8(&testPattern, str_dotstardollar, -1, &status);
2428 RegexMatcher m3(&testPattern, &testText, 0, status);
2429 REGEX_ASSERT(m3.lookingAt(status) == TRUE);
2430 REGEX_ASSERT(m3.hitEnd() == TRUE);
2431 REGEX_ASSERT(m3.requireEnd() == TRUE);
2432 REGEX_CHECK_STATUS;
2433
2434 utext_close(&testText);
2435 utext_close(&testPattern);
2436 }
2437 }
2438
2439
2440 //---------------------------------------------------------------------------
2441 //
2442 // API_Replace_UTF8 API test for class RegexMatcher, testing the
2443 // Replace family of functions.
2444 //
2445 //---------------------------------------------------------------------------
API_Replace_UTF8()2446 void RegexTest::API_Replace_UTF8() {
2447 //
2448 // Replace
2449 //
2450 int32_t flags=0;
2451 UParseError pe;
2452 UErrorCode status=U_ZERO_ERROR;
2453
2454 UText re=UTEXT_INITIALIZER;
2455 regextst_openUTF8FromInvariant(&re, "abc", -1, &status);
2456 REGEX_VERBOSE_TEXT(&re);
2457 RegexPattern *pat = RegexPattern::compile(&re, flags, pe, status);
2458 REGEX_CHECK_STATUS;
2459
2460 char data[] = { 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .abc..abc...abc.. */
2461 // 012345678901234567
2462 UText dataText = UTEXT_INITIALIZER;
2463 utext_openUTF8(&dataText, data, -1, &status);
2464 REGEX_CHECK_STATUS;
2465 REGEX_VERBOSE_TEXT(&dataText);
2466 RegexMatcher *matcher = &pat->matcher(status)->reset(&dataText);
2467
2468 //
2469 // Plain vanilla matches.
2470 //
2471 UnicodeString dest;
2472 UText destText = UTEXT_INITIALIZER;
2473 utext_openUnicodeString(&destText, &dest, &status);
2474 UText *result;
2475
2476 UText replText = UTEXT_INITIALIZER;
2477
2478 const char str_yz[] = { 0x79, 0x7a, 0x00 }; /* yz */
2479 utext_openUTF8(&replText, str_yz, -1, &status);
2480 REGEX_VERBOSE_TEXT(&replText);
2481 result = matcher->replaceFirst(&replText, NULL, status);
2482 REGEX_CHECK_STATUS;
2483 const char str_yzabcabc[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* .yz..abc...abc.. */
2484 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2485 utext_close(result);
2486 result = matcher->replaceFirst(&replText, &destText, status);
2487 REGEX_CHECK_STATUS;
2488 REGEX_ASSERT(result == &destText);
2489 REGEX_ASSERT_UTEXT_UTF8(str_yzabcabc, result);
2490
2491 result = matcher->replaceAll(&replText, NULL, status);
2492 REGEX_CHECK_STATUS;
2493 const char str_yzyzyz[] = { 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x2e, 0x79, 0x7a, 0x2e, 0x2e, 0x00 }; /* .yz..yz...yz.. */
2494 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2495 utext_close(result);
2496
2497 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2498 result = matcher->replaceAll(&replText, &destText, status);
2499 REGEX_CHECK_STATUS;
2500 REGEX_ASSERT(result == &destText);
2501 REGEX_ASSERT_UTEXT_UTF8(str_yzyzyz, result);
2502
2503 //
2504 // Plain vanilla non-matches.
2505 //
2506 const char str_abxabxabx[] = { 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x78, 0x2e, 0x2e, 0x00 }; /* .abx..abx...abx.. */
2507 utext_openUTF8(&dataText, str_abxabxabx, -1, &status);
2508 matcher->reset(&dataText);
2509
2510 result = matcher->replaceFirst(&replText, NULL, status);
2511 REGEX_CHECK_STATUS;
2512 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2513 utext_close(result);
2514 result = matcher->replaceFirst(&replText, &destText, status);
2515 REGEX_CHECK_STATUS;
2516 REGEX_ASSERT(result == &destText);
2517 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2518
2519 result = matcher->replaceAll(&replText, NULL, status);
2520 REGEX_CHECK_STATUS;
2521 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2522 utext_close(result);
2523 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2524 result = matcher->replaceAll(&replText, &destText, status);
2525 REGEX_CHECK_STATUS;
2526 REGEX_ASSERT(result == &destText);
2527 REGEX_ASSERT_UTEXT_UTF8(str_abxabxabx, result);
2528
2529 //
2530 // Empty source string
2531 //
2532 utext_openUTF8(&dataText, NULL, 0, &status);
2533 matcher->reset(&dataText);
2534
2535 result = matcher->replaceFirst(&replText, NULL, status);
2536 REGEX_CHECK_STATUS;
2537 REGEX_ASSERT_UTEXT_UTF8("", result);
2538 utext_close(result);
2539 result = matcher->replaceFirst(&replText, &destText, status);
2540 REGEX_CHECK_STATUS;
2541 REGEX_ASSERT(result == &destText);
2542 REGEX_ASSERT_UTEXT_UTF8("", result);
2543
2544 result = matcher->replaceAll(&replText, NULL, status);
2545 REGEX_CHECK_STATUS;
2546 REGEX_ASSERT_UTEXT_UTF8("", result);
2547 utext_close(result);
2548 result = matcher->replaceAll(&replText, &destText, status);
2549 REGEX_CHECK_STATUS;
2550 REGEX_ASSERT(result == &destText);
2551 REGEX_ASSERT_UTEXT_UTF8("", result);
2552
2553 //
2554 // Empty substitution string
2555 //
2556 utext_openUTF8(&dataText, data, -1, &status); // ".abc..abc...abc.."
2557 matcher->reset(&dataText);
2558
2559 utext_openUTF8(&replText, NULL, 0, &status);
2560 result = matcher->replaceFirst(&replText, NULL, status);
2561 REGEX_CHECK_STATUS;
2562 const char str_abcabc[] = { 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x2e, 0x61, 0x62, 0x63, 0x2e, 0x2e, 0x00 }; /* ...abc...abc.. */
2563 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2564 utext_close(result);
2565 result = matcher->replaceFirst(&replText, &destText, status);
2566 REGEX_CHECK_STATUS;
2567 REGEX_ASSERT(result == &destText);
2568 REGEX_ASSERT_UTEXT_UTF8(str_abcabc, result);
2569
2570 result = matcher->replaceAll(&replText, NULL, status);
2571 REGEX_CHECK_STATUS;
2572 const char str_dots[] = { 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x2e, 0x00 }; /* ........ */
2573 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2574 utext_close(result);
2575 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2576 result = matcher->replaceAll(&replText, &destText, status);
2577 REGEX_CHECK_STATUS;
2578 REGEX_ASSERT(result == &destText);
2579 REGEX_ASSERT_UTEXT_UTF8(str_dots, result);
2580
2581 //
2582 // match whole string
2583 //
2584 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2585 utext_openUTF8(&dataText, str_abc, -1, &status);
2586 matcher->reset(&dataText);
2587
2588 const char str_xyz[] = { 0x78, 0x79, 0x7a, 0x00 }; /* xyz */
2589 utext_openUTF8(&replText, str_xyz, -1, &status);
2590 result = matcher->replaceFirst(&replText, NULL, status);
2591 REGEX_CHECK_STATUS;
2592 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2593 utext_close(result);
2594 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2595 result = matcher->replaceFirst(&replText, &destText, status);
2596 REGEX_CHECK_STATUS;
2597 REGEX_ASSERT(result == &destText);
2598 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2599
2600 result = matcher->replaceAll(&replText, NULL, status);
2601 REGEX_CHECK_STATUS;
2602 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2603 utext_close(result);
2604 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2605 result = matcher->replaceAll(&replText, &destText, status);
2606 REGEX_CHECK_STATUS;
2607 REGEX_ASSERT(result == &destText);
2608 REGEX_ASSERT_UTEXT_UTF8(str_xyz, result);
2609
2610 //
2611 // Capture Group, simple case
2612 //
2613 const char str_add[] = { 0x61, 0x28, 0x2e, 0x2e, 0x29, 0x00 }; /* a(..) */
2614 utext_openUTF8(&re, str_add, -1, &status);
2615 RegexPattern *pat2 = RegexPattern::compile(&re, flags, pe, status);
2616 REGEX_CHECK_STATUS;
2617
2618 const char str_abcdefg[] = { 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* abcdefg */
2619 utext_openUTF8(&dataText, str_abcdefg, -1, &status);
2620 RegexMatcher *matcher2 = &pat2->matcher(status)->reset(&dataText);
2621 REGEX_CHECK_STATUS;
2622
2623 const char str_11[] = { 0x24, 0x31, 0x24, 0x31, 0x00 }; /* $1$1 */
2624 utext_openUTF8(&replText, str_11, -1, &status);
2625 result = matcher2->replaceFirst(&replText, NULL, status);
2626 REGEX_CHECK_STATUS;
2627 const char str_bcbcdefg[] = { 0x62, 0x63, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* bcbcdefg */
2628 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2629 utext_close(result);
2630 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2631 result = matcher2->replaceFirst(&replText, &destText, status);
2632 REGEX_CHECK_STATUS;
2633 REGEX_ASSERT(result == &destText);
2634 REGEX_ASSERT_UTEXT_UTF8(str_bcbcdefg, result);
2635
2636 const char str_v[24] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x5c, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x24, 0x31, 0x2e, 0x00 }; /* The value of \$1 is $1. */
2637 utext_openUTF8(&replText, str_v, -1, &status);
2638 REGEX_VERBOSE_TEXT(&replText);
2639 result = matcher2->replaceFirst(&replText, NULL, status);
2640 REGEX_CHECK_STATUS;
2641 const char str_Thevalueof1isbcdefg[] = { 0x54, 0x68, 0x65, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x6f, 0x66, 0x20, 0x24, 0x31, 0x20, 0x69, 0x73, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* The value of $1 is bc.defg */
2642 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2643 utext_close(result);
2644 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2645 result = matcher2->replaceFirst(&replText, &destText, status);
2646 REGEX_CHECK_STATUS;
2647 REGEX_ASSERT(result == &destText);
2648 REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
2649
2650 const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
2651 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
2652 0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
2653 utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
2654 result = matcher2->replaceFirst(&replText, NULL, status);
2655 REGEX_CHECK_STATUS;
2656 const char str_byitselfnogroupnumberdefg[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* $ by itself, no group number $$$defg */
2657 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2658 utext_close(result);
2659 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2660 result = matcher2->replaceFirst(&replText, &destText, status);
2661 REGEX_CHECK_STATUS;
2662 REGEX_ASSERT(result == &destText);
2663 REGEX_ASSERT_UTEXT_UTF8(str_byitselfnogroupnumberdefg, result);
2664
2665 unsigned char supplDigitChars[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x24, 0x78, 0x78, 0x78, 0x78, 0x2e, 0x00 }; /* Supplemental Digit 1 $xxxx. */
2666 //unsigned char supplDigitChars[] = "Supplemental Digit 1 $xxxx."; // \U0001D7CF, MATHEMATICAL BOLD DIGIT ONE
2667 // 012345678901234567890123456
2668 supplDigitChars[22] = 0xF0;
2669 supplDigitChars[23] = 0x9D;
2670 supplDigitChars[24] = 0x9F;
2671 supplDigitChars[25] = 0x8F;
2672 utext_openUTF8(&replText, (char *)supplDigitChars, -1, &status);
2673
2674 result = matcher2->replaceFirst(&replText, NULL, status);
2675 REGEX_CHECK_STATUS;
2676 const char str_SupplementalDigit1bcdefg[] = { 0x53, 0x75, 0x70, 0x70, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x61, 0x6c, 0x20, 0x44, 0x69, 0x67, 0x69, 0x74, 0x20, 0x31, 0x20, 0x62, 0x63, 0x2e, 0x64, 0x65, 0x66, 0x67, 0x00 }; /* Supplemental Digit 1 bc.defg */
2677 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2678 utext_close(result);
2679 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2680 result = matcher2->replaceFirst(&replText, &destText, status);
2681 REGEX_CHECK_STATUS;
2682 REGEX_ASSERT(result == &destText);
2683 REGEX_ASSERT_UTEXT_UTF8(str_SupplementalDigit1bcdefg, result);
2684 const char str_badcapturegroupnumber5[] = { 0x62, 0x61, 0x64, 0x20, 0x63, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x35, 0x2e, 0x2e, 0x2e, 0x00 }; /* bad capture group number $5..." */
2685 utext_openUTF8(&replText, str_badcapturegroupnumber5, -1, &status);
2686 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, NULL, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2687 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2688 utext_close(result);
2689 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2690 REGEX_ASSERT_FAIL((result = matcher2->replaceFirst(&replText, &destText, status)), U_INDEX_OUTOFBOUNDS_ERROR);
2691 REGEX_ASSERT(result == &destText);
2692 // REGEX_ASSERT_UTEXT_UTF8("abcdefg", result);
2693
2694 //
2695 // Replacement String with \u hex escapes
2696 //
2697 {
2698 const char str_abc1abc2abc3[] = { 0x61, 0x62, 0x63, 0x20, 0x31, 0x20, 0x61, 0x62, 0x63, 0x20, 0x32, 0x20, 0x61, 0x62, 0x63, 0x20, 0x33, 0x00 }; /* abc 1 abc 2 abc 3 */
2699 const char str_u0043[] = { 0x2d, 0x2d, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x33, 0x2d, 0x2d, 0x00 }; /* --\u0043-- */
2700 utext_openUTF8(&dataText, str_abc1abc2abc3, -1, &status);
2701 utext_openUTF8(&replText, str_u0043, -1, &status);
2702 matcher->reset(&dataText);
2703
2704 result = matcher->replaceAll(&replText, NULL, status);
2705 REGEX_CHECK_STATUS;
2706 const char str_C1C2C3[] = { 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x31, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x32, 0x20, 0x2d, 0x2d, 0x43, 0x2d, 0x2d, 0x20, 0x33, 0x00 }; /* --C-- 1 --C-- 2 --C-- 3 */
2707 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2708 utext_close(result);
2709 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2710 result = matcher->replaceAll(&replText, &destText, status);
2711 REGEX_CHECK_STATUS;
2712 REGEX_ASSERT(result == &destText);
2713 REGEX_ASSERT_UTEXT_UTF8(str_C1C2C3, result);
2714 }
2715 {
2716 const char str_abc[] = { 0x61, 0x62, 0x63, 0x20, 0x21, 0x00 }; /* abc ! */
2717 utext_openUTF8(&dataText, str_abc, -1, &status);
2718 const char str_U00010000[] = { 0x2d, 0x2d, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x31, 0x30, 0x30, 0x30, 0x30, 0x2d, 0x2d, 0x00 }; /* --\U00010000-- */
2719 utext_openUTF8(&replText, str_U00010000, -1, &status);
2720 matcher->reset(&dataText);
2721
2722 unsigned char expected[] = { 0x2d, 0x2d, 0x78, 0x78, 0x78, 0x78, 0x2d, 0x2d, 0x20, 0x21, 0x00 }; /* --xxxx-- ! */ // \U00010000, "LINEAR B SYLLABLE B008 A"
2723 // 0123456789
2724 expected[2] = 0xF0;
2725 expected[3] = 0x90;
2726 expected[4] = 0x80;
2727 expected[5] = 0x80;
2728
2729 result = matcher->replaceAll(&replText, NULL, status);
2730 REGEX_CHECK_STATUS;
2731 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2732 utext_close(result);
2733 utext_replace(&destText, 0, utext_nativeLength(&destText), NULL, 0, &status);
2734 result = matcher->replaceAll(&replText, &destText, status);
2735 REGEX_CHECK_STATUS;
2736 REGEX_ASSERT(result == &destText);
2737 REGEX_ASSERT_UTEXT_UTF8((char *)expected, result);
2738 }
    // TODO: need more thorough testing of capture substitutions.
2740
2741 // Bug 4057
2742 //
2743 {
2744 status = U_ZERO_ERROR;
2745 const char str_ssee[] = { 0x73, 0x73, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x65, 0x65, 0x00 }; /* ss(.*?)ee */
2746 const char str_blah[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x73, 0x73, 0x20, 0x73, 0x74, 0x75, 0x66, 0x66, 0x20, 0x65, 0x65, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ss stuff ee fin */
2747 const char str_ooh[] = { 0x6f, 0x6f, 0x68, 0x00 }; /* ooh */
2748 utext_openUTF8(&re, str_ssee, -1, &status);
2749 utext_openUTF8(&dataText, str_blah, -1, &status);
2750 utext_openUTF8(&replText, str_ooh, -1, &status);
2751
2752 RegexMatcher m(&re, 0, status);
2753 REGEX_CHECK_STATUS;
2754
2755 UnicodeString result;
2756 UText resultText = UTEXT_INITIALIZER;
2757 utext_openUnicodeString(&resultText, &result, &status);
2758
        // Multiple finds do NOT bump up the previous appendReplacement position.
2760 m.reset(&dataText);
2761 m.find();
2762 m.find();
2763 m.appendReplacement(&resultText, &replText, status);
2764 REGEX_CHECK_STATUS;
2765 const char str_blah2[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2766 REGEX_ASSERT_UTEXT_UTF8(str_blah2, &resultText);
2767
2768 // After a reset into the interior of a string, appendReplacement still starts at beginning.
2769 status = U_ZERO_ERROR;
2770 result.truncate(0);
2771 utext_openUnicodeString(&resultText, &result, &status);
2772 m.reset(10, status);
2773 m.find();
2774 m.find();
2775 m.appendReplacement(&resultText, &replText, status);
2776 REGEX_CHECK_STATUS;
2777 const char str_blah3[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2778 REGEX_ASSERT_UTEXT_UTF8(str_blah3, &resultText);
2779
2780 // find() at interior of string, appendReplacement still starts at beginning.
2781 status = U_ZERO_ERROR;
2782 result.truncate(0);
2783 utext_openUnicodeString(&resultText, &result, &status);
2784 m.reset();
2785 m.find(10, status);
2786 m.find();
2787 m.appendReplacement(&resultText, &replText, status);
2788 REGEX_CHECK_STATUS;
2789 const char str_blah8[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x00 }; /* The matches start with ss and end with ee ooh */
2790 REGEX_ASSERT_UTEXT_UTF8(str_blah8, &resultText);
2791
2792 m.appendTail(&resultText, status);
2793 const char str_blah9[] = { 0x54, 0x68, 0x65, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x73, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x65, 0x65, 0x20, 0x6f, 0x6f, 0x68, 0x20, 0x66, 0x69, 0x6e, 0x00 }; /* The matches start with ss and end with ee ooh fin */
2794 REGEX_ASSERT_UTEXT_UTF8(str_blah9, &resultText);
2795
2796 utext_close(&resultText);
2797 }
2798
2799 delete matcher2;
2800 delete pat2;
2801 delete matcher;
2802 delete pat;
2803
2804 utext_close(&dataText);
2805 utext_close(&replText);
2806 utext_close(&destText);
2807 utext_close(&re);
2808 }
2809
2810
2811 //---------------------------------------------------------------------------
2812 //
2813 // API_Pattern_UTF8 Test that the API for class RegexPattern is
2814 // present and nominally working.
2815 //
2816 //---------------------------------------------------------------------------
API_Pattern_UTF8()2817 void RegexTest::API_Pattern_UTF8() {
2818 RegexPattern pata; // Test default constructor to not crash.
2819 RegexPattern patb;
2820
2821 REGEX_ASSERT(pata == patb);
2822 REGEX_ASSERT(pata == pata);
2823
2824 UText re1 = UTEXT_INITIALIZER;
2825 UText re2 = UTEXT_INITIALIZER;
2826 UErrorCode status = U_ZERO_ERROR;
2827 UParseError pe;
2828
2829 const char str_abcalmz[] = { 0x61, 0x62, 0x63, 0x5b, 0x61, 0x2d, 0x6c, 0x5d, 0x5b, 0x6d, 0x2d, 0x7a, 0x5d, 0x00 }; /* abc[a-l][m-z] */
2830 const char str_def[] = { 0x64, 0x65, 0x66, 0x00 }; /* def */
2831 utext_openUTF8(&re1, str_abcalmz, -1, &status);
2832 utext_openUTF8(&re2, str_def, -1, &status);
2833
2834 RegexPattern *pat1 = RegexPattern::compile(&re1, 0, pe, status);
2835 RegexPattern *pat2 = RegexPattern::compile(&re2, 0, pe, status);
2836 REGEX_CHECK_STATUS;
2837 REGEX_ASSERT(*pat1 == *pat1);
2838 REGEX_ASSERT(*pat1 != pata);
2839
2840 // Assign
2841 patb = *pat1;
2842 REGEX_ASSERT(patb == *pat1);
2843
2844 // Copy Construct
2845 RegexPattern patc(*pat1);
2846 REGEX_ASSERT(patc == *pat1);
2847 REGEX_ASSERT(patb == patc);
2848 REGEX_ASSERT(pat1 != pat2);
2849 patb = *pat2;
2850 REGEX_ASSERT(patb != patc);
2851 REGEX_ASSERT(patb == *pat2);
2852
2853 // Compile with no flags.
2854 RegexPattern *pat1a = RegexPattern::compile(&re1, pe, status);
2855 REGEX_ASSERT(*pat1a == *pat1);
2856
2857 REGEX_ASSERT(pat1a->flags() == 0);
2858
2859 // Compile with different flags should be not equal
2860 RegexPattern *pat1b = RegexPattern::compile(&re1, UREGEX_CASE_INSENSITIVE, pe, status);
2861 REGEX_CHECK_STATUS;
2862
2863 REGEX_ASSERT(*pat1b != *pat1a);
2864 REGEX_ASSERT(pat1b->flags() == UREGEX_CASE_INSENSITIVE);
2865 REGEX_ASSERT(pat1a->flags() == 0);
2866 delete pat1b;
2867
2868 // clone
2869 RegexPattern *pat1c = pat1->clone();
2870 REGEX_ASSERT(*pat1c == *pat1);
2871 REGEX_ASSERT(*pat1c != *pat2);
2872
2873 delete pat1c;
2874 delete pat1a;
2875 delete pat1;
2876 delete pat2;
2877
2878 utext_close(&re1);
2879 utext_close(&re2);
2880
2881
2882 //
2883 // Verify that a matcher created from a cloned pattern works.
2884 // (Jitterbug 3423)
2885 //
2886 {
2887 UErrorCode status = U_ZERO_ERROR;
2888 UText pattern = UTEXT_INITIALIZER;
2889 const char str_pL[] = { 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x2b, 0x00 }; /* \p{L}+ */
2890 utext_openUTF8(&pattern, str_pL, -1, &status);
2891
2892 RegexPattern *pSource = RegexPattern::compile(&pattern, 0, status);
2893 RegexPattern *pClone = pSource->clone();
2894 delete pSource;
2895 RegexMatcher *mFromClone = pClone->matcher(status);
2896 REGEX_CHECK_STATUS;
2897
2898 UText input = UTEXT_INITIALIZER;
2899 const char str_HelloWorld[] = { 0x48, 0x65, 0x6c, 0x6c, 0x6f, 0x20, 0x57, 0x6f, 0x72, 0x6c, 0x64, 0x00 }; /* Hello World */
2900 utext_openUTF8(&input, str_HelloWorld, -1, &status);
2901 mFromClone->reset(&input);
2902 REGEX_ASSERT(mFromClone->find() == TRUE);
2903 REGEX_ASSERT(mFromClone->group(status) == "Hello");
2904 REGEX_ASSERT(mFromClone->find() == TRUE);
2905 REGEX_ASSERT(mFromClone->group(status) == "World");
2906 REGEX_ASSERT(mFromClone->find() == FALSE);
2907 delete mFromClone;
2908 delete pClone;
2909
2910 utext_close(&input);
2911 utext_close(&pattern);
2912 }
2913
2914 //
2915 // matches convenience API
2916 //
2917 {
2918 UErrorCode status = U_ZERO_ERROR;
2919 UText pattern = UTEXT_INITIALIZER;
2920 UText input = UTEXT_INITIALIZER;
2921
2922 const char str_randominput[] = { 0x72, 0x61, 0x6e, 0x64, 0x6f, 0x6d, 0x20, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* random input */
2923 utext_openUTF8(&input, str_randominput, -1, &status);
2924
2925 const char str_dotstar[] = { 0x2e, 0x2a, 0x00 }; /* .* */
2926 utext_openUTF8(&pattern, str_dotstar, -1, &status);
2927 REGEX_ASSERT(RegexPattern::matches(&pattern, &input, pe, status) == TRUE);
2928 REGEX_CHECK_STATUS;
2929
2930 const char str_abc[] = { 0x61, 0x62, 0x63, 0x00 }; /* abc */
2931 utext_openUTF8(&pattern, str_abc, -1, &status);
2932 REGEX_ASSERT(RegexPattern::matches("abc", "random input", pe, status) == FALSE);
2933 REGEX_CHECK_STATUS;
2934
2935 const char str_nput[] = { 0x2e, 0x2a, 0x6e, 0x70, 0x75, 0x74, 0x00 }; /* .*nput */
2936 utext_openUTF8(&pattern, str_nput, -1, &status);
2937 REGEX_ASSERT(RegexPattern::matches(".*nput", "random input", pe, status) == TRUE);
2938 REGEX_CHECK_STATUS;
2939
2940 utext_openUTF8(&pattern, str_randominput, -1, &status);
2941 REGEX_ASSERT(RegexPattern::matches("random input", "random input", pe, status) == TRUE);
2942 REGEX_CHECK_STATUS;
2943
2944 const char str_u[] = { 0x2e, 0x2a, 0x75, 0x00 }; /* .*u */
2945 utext_openUTF8(&pattern, str_u, -1, &status);
2946 REGEX_ASSERT(RegexPattern::matches(".*u", "random input", pe, status) == FALSE);
2947 REGEX_CHECK_STATUS;
2948
2949 utext_openUTF8(&input, str_abc, -1, &status);
2950 utext_openUTF8(&pattern, str_abc, -1, &status);
2951 status = U_INDEX_OUTOFBOUNDS_ERROR;
2952 REGEX_ASSERT(RegexPattern::matches("abc", "abc", pe, status) == FALSE);
2953 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
2954
2955 utext_close(&input);
2956 utext_close(&pattern);
2957 }
2958
2959
2960 //
2961 // Split()
2962 //
2963 status = U_ZERO_ERROR;
2964 const char str_spaceplus[] = { 0x20, 0x2b, 0x00 }; /* + */
2965 utext_openUTF8(&re1, str_spaceplus, -1, &status);
2966 pat1 = RegexPattern::compile(&re1, pe, status);
2967 REGEX_CHECK_STATUS;
2968 UnicodeString fields[10];
2969
2970 int32_t n;
2971 n = pat1->split("Now is the time", fields, 10, status);
2972 REGEX_CHECK_STATUS;
2973 REGEX_ASSERT(n==4);
2974 REGEX_ASSERT(fields[0]=="Now");
2975 REGEX_ASSERT(fields[1]=="is");
2976 REGEX_ASSERT(fields[2]=="the");
2977 REGEX_ASSERT(fields[3]=="time");
2978 REGEX_ASSERT(fields[4]=="");
2979
2980 n = pat1->split("Now is the time", fields, 2, status);
2981 REGEX_CHECK_STATUS;
2982 REGEX_ASSERT(n==2);
2983 REGEX_ASSERT(fields[0]=="Now");
2984 REGEX_ASSERT(fields[1]=="is the time");
2985 REGEX_ASSERT(fields[2]=="the"); // left over from previous test
2986
2987 fields[1] = "*";
2988 status = U_ZERO_ERROR;
2989 n = pat1->split("Now is the time", fields, 1, status);
2990 REGEX_CHECK_STATUS;
2991 REGEX_ASSERT(n==1);
2992 REGEX_ASSERT(fields[0]=="Now is the time");
2993 REGEX_ASSERT(fields[1]=="*");
2994 status = U_ZERO_ERROR;
2995
2996 n = pat1->split(" Now is the time ", fields, 10, status);
2997 REGEX_CHECK_STATUS;
2998 REGEX_ASSERT(n==6);
2999 REGEX_ASSERT(fields[0]=="");
3000 REGEX_ASSERT(fields[1]=="Now");
3001 REGEX_ASSERT(fields[2]=="is");
3002 REGEX_ASSERT(fields[3]=="the");
3003 REGEX_ASSERT(fields[4]=="time");
3004 REGEX_ASSERT(fields[5]=="");
3005 REGEX_ASSERT(fields[6]=="");
3006
3007 fields[2] = "*";
3008 n = pat1->split(" ", fields, 10, status);
3009 REGEX_CHECK_STATUS;
3010 REGEX_ASSERT(n==2);
3011 REGEX_ASSERT(fields[0]=="");
3012 REGEX_ASSERT(fields[1]=="");
3013 REGEX_ASSERT(fields[2]=="*");
3014
3015 fields[0] = "foo";
3016 n = pat1->split("", fields, 10, status);
3017 REGEX_CHECK_STATUS;
3018 REGEX_ASSERT(n==0);
3019 REGEX_ASSERT(fields[0]=="foo");
3020
3021 delete pat1;
3022
3023 // split, with a pattern with (capture)
3024 regextst_openUTF8FromInvariant(&re1, "<(\\w*)>", -1, &status);
3025 pat1 = RegexPattern::compile(&re1, pe, status);
3026 REGEX_CHECK_STATUS;
3027
3028 status = U_ZERO_ERROR;
3029 fields[6] = fields[7] = "*";
3030 n = pat1->split("<a>Now is <b>the time<c>", fields, 10, status);
3031 REGEX_CHECK_STATUS;
3032 REGEX_ASSERT(n==7);
3033 REGEX_ASSERT(fields[0]=="");
3034 REGEX_ASSERT(fields[1]=="a");
3035 REGEX_ASSERT(fields[2]=="Now is ");
3036 REGEX_ASSERT(fields[3]=="b");
3037 REGEX_ASSERT(fields[4]=="the time");
3038 REGEX_ASSERT(fields[5]=="c");
3039 REGEX_ASSERT(fields[6]=="");
3040 REGEX_ASSERT(fields[7]=="*");
3041 REGEX_ASSERT(status==U_ZERO_ERROR);
3042
3043 fields[6] = fields[7] = "*";
3044 n = pat1->split(" <a>Now is <b>the time<c>", fields, 10, status);
3045 REGEX_CHECK_STATUS;
3046 REGEX_ASSERT(n==7);
3047 REGEX_ASSERT(fields[0]==" ");
3048 REGEX_ASSERT(fields[1]=="a");
3049 REGEX_ASSERT(fields[2]=="Now is ");
3050 REGEX_ASSERT(fields[3]=="b");
3051 REGEX_ASSERT(fields[4]=="the time");
3052 REGEX_ASSERT(fields[5]=="c");
3053 REGEX_ASSERT(fields[6]=="");
3054 REGEX_ASSERT(fields[7]=="*");
3055
3056 status = U_ZERO_ERROR;
3057 fields[6] = "foo";
3058 n = pat1->split(" <a>Now is <b>the time<c> ", fields, 6, status);
3059 REGEX_CHECK_STATUS;
3060 REGEX_ASSERT(n==6);
3061 REGEX_ASSERT(fields[0]==" ");
3062 REGEX_ASSERT(fields[1]=="a");
3063 REGEX_ASSERT(fields[2]=="Now is ");
3064 REGEX_ASSERT(fields[3]=="b");
3065 REGEX_ASSERT(fields[4]=="the time");
3066 REGEX_ASSERT(fields[5]==" ");
3067 REGEX_ASSERT(fields[6]=="foo");
3068
3069 status = U_ZERO_ERROR;
3070 fields[5] = "foo";
3071 n = pat1->split(" <a>Now is <b>the time<c>", fields, 5, status);
3072 REGEX_CHECK_STATUS;
3073 REGEX_ASSERT(n==5);
3074 REGEX_ASSERT(fields[0]==" ");
3075 REGEX_ASSERT(fields[1]=="a");
3076 REGEX_ASSERT(fields[2]=="Now is ");
3077 REGEX_ASSERT(fields[3]=="b");
3078 REGEX_ASSERT(fields[4]=="the time<c>");
3079 REGEX_ASSERT(fields[5]=="foo");
3080
3081 status = U_ZERO_ERROR;
3082 fields[5] = "foo";
3083 n = pat1->split(" <a>Now is <b>the time", fields, 5, status);
3084 REGEX_CHECK_STATUS;
3085 REGEX_ASSERT(n==5);
3086 REGEX_ASSERT(fields[0]==" ");
3087 REGEX_ASSERT(fields[1]=="a");
3088 REGEX_ASSERT(fields[2]=="Now is ");
3089 REGEX_ASSERT(fields[3]=="b");
3090 REGEX_ASSERT(fields[4]=="the time");
3091 REGEX_ASSERT(fields[5]=="foo");
3092
3093 status = U_ZERO_ERROR;
3094 n = pat1->split(" <a>Now is <b>the time<c>", fields, 4, status);
3095 REGEX_CHECK_STATUS;
3096 REGEX_ASSERT(n==4);
3097 REGEX_ASSERT(fields[0]==" ");
3098 REGEX_ASSERT(fields[1]=="a");
3099 REGEX_ASSERT(fields[2]=="Now is ");
3100 REGEX_ASSERT(fields[3]=="the time<c>");
3101 status = U_ZERO_ERROR;
3102 delete pat1;
3103
3104 regextst_openUTF8FromInvariant(&re1, "([-,])", -1, &status);
3105 pat1 = RegexPattern::compile(&re1, pe, status);
3106 REGEX_CHECK_STATUS;
3107 n = pat1->split("1-10,20", fields, 10, status);
3108 REGEX_CHECK_STATUS;
3109 REGEX_ASSERT(n==5);
3110 REGEX_ASSERT(fields[0]=="1");
3111 REGEX_ASSERT(fields[1]=="-");
3112 REGEX_ASSERT(fields[2]=="10");
3113 REGEX_ASSERT(fields[3]==",");
3114 REGEX_ASSERT(fields[4]=="20");
3115 delete pat1;
3116
3117
3118 //
3119 // split of a UText based string, with library allocating output UTexts.
3120 //
3121 {
3122 status = U_ZERO_ERROR;
3123 RegexMatcher matcher(UnicodeString("(:)"), 0, status);
3124 UnicodeString stringToSplit("first:second:third");
3125 UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
3126 REGEX_CHECK_STATUS;
3127
3128 UText *splits[10] = {NULL};
3129 int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
3130 REGEX_CHECK_STATUS;
3131 REGEX_ASSERT(numFields == 5);
3132 REGEX_ASSERT_UTEXT_INVARIANT("first", splits[0]);
3133 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[1]);
3134 REGEX_ASSERT_UTEXT_INVARIANT("second", splits[2]);
3135 REGEX_ASSERT_UTEXT_INVARIANT(":", splits[3]);
3136 REGEX_ASSERT_UTEXT_INVARIANT("third", splits[4]);
3137 REGEX_ASSERT(splits[5] == NULL);
3138
3139 for (int i=0; i<UPRV_LENGTHOF(splits); i++) {
3140 if (splits[i]) {
3141 utext_close(splits[i]);
3142 splits[i] = NULL;
3143 }
3144 }
3145 utext_close(textToSplit);
3146 }
3147
3148
3149 //
3150 // RegexPattern::pattern() and patternText()
3151 //
3152 pat1 = new RegexPattern();
3153 REGEX_ASSERT(pat1->pattern() == "");
3154 REGEX_ASSERT_UTEXT_UTF8("", pat1->patternText(status));
3155 delete pat1;
3156 const char *helloWorldInvariant = "(Hello, world)*";
3157 regextst_openUTF8FromInvariant(&re1, helloWorldInvariant, -1, &status);
3158 pat1 = RegexPattern::compile(&re1, pe, status);
3159 REGEX_CHECK_STATUS;
3160 REGEX_ASSERT_UNISTR("(Hello, world)*", pat1->pattern());
3161 REGEX_ASSERT_UTEXT_INVARIANT("(Hello, world)*", pat1->patternText(status));
3162 delete pat1;
3163
3164 utext_close(&re1);
3165 }
3166
3167
3168 //---------------------------------------------------------------------------
3169 //
//  Extended       A more thorough check for features of regex patterns
//                 The test cases are in a separate data file,
//                   source/test/testdata/regextst.txt
//                 A description of the test data format is included in that file.
3174 //
3175 //---------------------------------------------------------------------------
3176
3177 const char *
getPath(char buffer[2048],const char * filename)3178 RegexTest::getPath(char buffer[2048], const char *filename) {
3179 UErrorCode status=U_ZERO_ERROR;
3180 const char *testDataDirectory = IntlTest::getSourceTestData(status);
3181 if (U_FAILURE(status)) {
3182 errln("ERROR: loadTestData() failed - %s", u_errorName(status));
3183 return NULL;
3184 }
3185
3186 strcpy(buffer, testDataDirectory);
3187 strcat(buffer, filename);
3188 return buffer;
3189 }
3190
Extended()3191 void RegexTest::Extended() {
3192 char tdd[2048];
3193 const char *srcPath;
3194 UErrorCode status = U_ZERO_ERROR;
3195 int32_t lineNum = 0;
3196
3197 //
3198 // Open and read the test data file.
3199 //
3200 srcPath=getPath(tdd, "regextst.txt");
3201 if(srcPath==NULL) {
3202 return; /* something went wrong, error already output */
3203 }
3204
3205 int32_t len;
3206 UChar *testData = ReadAndConvertFile(srcPath, len, "utf-8", status);
3207 if (U_FAILURE(status)) {
3208 return; /* something went wrong, error already output */
3209 }
3210
3211 //
3212 // Put the test data into a UnicodeString
3213 //
3214 UnicodeString testString(FALSE, testData, len);
3215
3216 RegexMatcher quotedStuffMat(UNICODE_STRING_SIMPLE("\\s*([\\'\\\"/])(.*?)\\1"), 0, status);
3217 RegexMatcher commentMat (UNICODE_STRING_SIMPLE("\\s*(#.*)?$"), 0, status);
3218 RegexMatcher flagsMat (UNICODE_STRING_SIMPLE("\\s*([ixsmdteDEGLMQvabtyYzZ2-9]*)([:letter:]*)"), 0, status);
3219
3220 RegexMatcher lineMat(UNICODE_STRING_SIMPLE("(.*?)\\r?\\n"), testString, 0, status);
3221 UnicodeString testPattern; // The pattern for test from the test file.
3222 UnicodeString testFlags; // the flags for a test.
3223 UnicodeString matchString; // The marked up string to be used as input
3224
3225 if (U_FAILURE(status)){
3226 dataerrln("Construct RegexMatcher() error - %s", u_errorName(status));
3227 delete [] testData;
3228 return;
3229 }
3230
3231 //
3232 // Loop over the test data file, once per line.
3233 //
3234 while (lineMat.find()) {
3235 lineNum++;
3236 if (U_FAILURE(status)) {
3237 errln("%s:%d: ICU Error \"%s\"", srcPath, lineNum, u_errorName(status));
3238 }
3239
3240 status = U_ZERO_ERROR;
3241 UnicodeString testLine = lineMat.group(1, status);
3242 if (testLine.length() == 0) {
3243 continue;
3244 }
3245
3246 //
3247 // Parse the test line. Skip blank and comment only lines.
3248 // Separate out the three main fields - pattern, flags, target.
3249 //
3250
3251 commentMat.reset(testLine);
3252 if (commentMat.lookingAt(status)) {
3253 // This line is a comment, or blank.
3254 continue;
3255 }
3256
3257 //
3258 // Pull out the pattern field, remove it from the test file line.
3259 //
3260 quotedStuffMat.reset(testLine);
3261 if (quotedStuffMat.lookingAt(status)) {
3262 testPattern = quotedStuffMat.group(2, status);
3263 testLine.remove(0, quotedStuffMat.end(0, status));
3264 } else {
3265 errln("Bad pattern (missing quotes?) at %s:%d", srcPath, lineNum);
3266 continue;
3267 }
3268
3269
3270 //
3271 // Pull out the flags from the test file line.
3272 //
3273 flagsMat.reset(testLine);
3274 flagsMat.lookingAt(status); // Will always match, possibly an empty string.
3275 testFlags = flagsMat.group(1, status);
3276 if (flagsMat.group(2, status).length() > 0) {
3277 errln("Bad Match flag at line %d. Scanning %c\n",
3278 lineNum, flagsMat.group(2, status).charAt(0));
3279 continue;
3280 }
3281 testLine.remove(0, flagsMat.end(0, status));
3282
3283 //
3284 // Pull out the match string, as a whole.
3285 // We'll process the <tags> later.
3286 //
3287 quotedStuffMat.reset(testLine);
3288 if (quotedStuffMat.lookingAt(status)) {
3289 matchString = quotedStuffMat.group(2, status);
3290 testLine.remove(0, quotedStuffMat.end(0, status));
3291 } else {
3292 errln("Bad match string at test file line %d", lineNum);
3293 continue;
3294 }
3295
3296 //
3297 // The only thing left from the input line should be an optional trailing comment.
3298 //
3299 commentMat.reset(testLine);
3300 if (commentMat.lookingAt(status) == FALSE) {
3301 errln("Line %d: unexpected characters at end of test line.", lineNum);
3302 continue;
3303 }
3304
3305 //
3306 // Run the test
3307 //
3308 regex_find(testPattern, testFlags, matchString, srcPath, lineNum);
3309 }
3310
3311 delete [] testData;
3312
3313 }
3314
3315
3316
3317 //---------------------------------------------------------------------------
3318 //
3319 // regex_find(pattern, flags, inputString, lineNumber)
3320 //
3321 // Function to run a single test from the Extended (data driven) tests.
3322 // See file test/testdata/regextst.txt for a description of the
3323 // pattern and inputString fields, and the allowed flags.
3324 // lineNumber is the source line in regextst.txt of the test.
3325 //
3326 //---------------------------------------------------------------------------
3327
3328
3329 // Set a value into a UVector at position specified by a decimal number in
3330 // a UnicodeString. This is a utility function needed by the actual test function,
3331 // which follows.
set(UVector & vec,int32_t val,UnicodeString index)3332 static void set(UVector &vec, int32_t val, UnicodeString index) {
3333 UErrorCode status=U_ZERO_ERROR;
3334 int32_t idx = 0;
3335 for (int32_t i=0; i<index.length(); i++) {
3336 int32_t d=u_charDigitValue(index.charAt(i));
3337 if (d<0) {return;}
3338 idx = idx*10 + d;
3339 }
3340 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3341 vec.setElementAt(val, idx);
3342 }
3343
setInt(UVector & vec,int32_t val,int32_t idx)3344 static void setInt(UVector &vec, int32_t val, int32_t idx) {
3345 UErrorCode status=U_ZERO_ERROR;
3346 while (vec.size()<idx+1) {vec.addElement(-1, status);}
3347 vec.setElementAt(val, idx);
3348 }
3349
utextOffsetToNative(UText * utext,int32_t unistrOffset,int32_t & nativeIndex)3350 static UBool utextOffsetToNative(UText *utext, int32_t unistrOffset, int32_t& nativeIndex)
3351 {
3352 UBool couldFind = TRUE;
3353 UTEXT_SETNATIVEINDEX(utext, 0);
3354 int32_t i = 0;
3355 while (i < unistrOffset) {
3356 UChar32 c = UTEXT_NEXT32(utext);
3357 if (c != U_SENTINEL) {
3358 i += U16_LENGTH(c);
3359 } else {
3360 couldFind = FALSE;
3361 break;
3362 }
3363 }
3364 nativeIndex = (int32_t)UTEXT_GETNATIVEINDEX(utext);
3365 return couldFind;
3366 }
3367
3368
regex_find(const UnicodeString & pattern,const UnicodeString & flags,const UnicodeString & inputString,const char * srcPath,int32_t line)3369 void RegexTest::regex_find(const UnicodeString &pattern,
3370 const UnicodeString &flags,
3371 const UnicodeString &inputString,
3372 const char *srcPath,
3373 int32_t line) {
3374 UnicodeString unEscapedInput;
3375 UnicodeString deTaggedInput;
3376
3377 int32_t patternUTF8Length, inputUTF8Length;
3378 char *patternChars = NULL, *inputChars = NULL;
3379 UText patternText = UTEXT_INITIALIZER;
3380 UText inputText = UTEXT_INITIALIZER;
3381 UConverter *UTF8Converter = NULL;
3382
3383 UErrorCode status = U_ZERO_ERROR;
3384 UParseError pe;
3385 RegexPattern *parsePat = NULL;
3386 RegexMatcher *parseMatcher = NULL;
3387 RegexPattern *callerPattern = NULL, *UTF8Pattern = NULL;
3388 RegexMatcher *matcher = NULL, *UTF8Matcher = NULL;
3389 UVector groupStarts(status);
3390 UVector groupEnds(status);
3391 UVector groupStartsUTF8(status);
3392 UVector groupEndsUTF8(status);
3393 UBool isMatch = FALSE, isUTF8Match = FALSE;
3394 UBool failed = FALSE;
3395 int32_t numFinds;
3396 int32_t i;
3397 UBool useMatchesFunc = FALSE;
3398 UBool useLookingAtFunc = FALSE;
3399 int32_t regionStart = -1;
3400 int32_t regionEnd = -1;
3401 int32_t regionStartUTF8 = -1;
3402 int32_t regionEndUTF8 = -1;
3403
3404
3405 //
3406 // Compile the caller's pattern
3407 //
3408 uint32_t bflags = 0;
3409 if (flags.indexOf((UChar)0x69) >= 0) { // 'i' flag
3410 bflags |= UREGEX_CASE_INSENSITIVE;
3411 }
3412 if (flags.indexOf((UChar)0x78) >= 0) { // 'x' flag
3413 bflags |= UREGEX_COMMENTS;
3414 }
3415 if (flags.indexOf((UChar)0x73) >= 0) { // 's' flag
3416 bflags |= UREGEX_DOTALL;
3417 }
3418 if (flags.indexOf((UChar)0x6d) >= 0) { // 'm' flag
3419 bflags |= UREGEX_MULTILINE;
3420 }
3421
3422 if (flags.indexOf((UChar)0x65) >= 0) { // 'e' flag
3423 bflags |= UREGEX_ERROR_ON_UNKNOWN_ESCAPES;
3424 }
3425 if (flags.indexOf((UChar)0x44) >= 0) { // 'D' flag
3426 bflags |= UREGEX_UNIX_LINES;
3427 }
3428 if (flags.indexOf((UChar)0x51) >= 0) { // 'Q' flag
3429 bflags |= UREGEX_LITERAL;
3430 }
3431
3432
3433 callerPattern = RegexPattern::compile(pattern, bflags, pe, status);
3434 if (status != U_ZERO_ERROR) {
3435 #if UCONFIG_NO_BREAK_ITERATION==1
3436 // 'v' test flag means that the test pattern should not compile if ICU was configured
3437 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3438 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3439 goto cleanupAndReturn;
3440 }
3441 #endif
3442 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3443 // Expected pattern compilation error.
3444 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3445 logln("Pattern Compile returns \"%s\"", u_errorName(status));
3446 }
3447 goto cleanupAndReturn;
3448 } else {
3449 // Unexpected pattern compilation error.
3450 dataerrln("Line %d: error %s compiling pattern.", line, u_errorName(status));
3451 goto cleanupAndReturn;
3452 }
3453 }
3454
3455 UTF8Converter = ucnv_open("UTF8", &status);
3456 ucnv_setFromUCallBack(UTF8Converter, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
3457
3458 patternUTF8Length = pattern.extract(NULL, 0, UTF8Converter, status);
3459 status = U_ZERO_ERROR; // buffer overflow
3460 patternChars = new char[patternUTF8Length+1];
3461 pattern.extract(patternChars, patternUTF8Length+1, UTF8Converter, status);
3462 utext_openUTF8(&patternText, patternChars, patternUTF8Length, &status);
3463
3464 if (status == U_ZERO_ERROR) {
3465 UTF8Pattern = RegexPattern::compile(&patternText, bflags, pe, status);
3466
3467 if (status != U_ZERO_ERROR) {
3468 #if UCONFIG_NO_BREAK_ITERATION==1
3469 // 'v' test flag means that the test pattern should not compile if ICU was configured
3470 // to not include break iteration. RBBI is needed for Unicode word boundaries.
3471 if (flags.indexOf((UChar)0x76) >= 0 /*'v'*/ && status == U_UNSUPPORTED_ERROR) {
3472 goto cleanupAndReturn;
3473 }
3474 #endif
3475 if (flags.indexOf((UChar)0x45) >= 0) { // flags contain 'E'
3476 // Expected pattern compilation error.
3477 if (flags.indexOf((UChar)0x64) >= 0) { // flags contain 'd'
3478 logln("Pattern Compile returns \"%s\" (UTF8)", u_errorName(status));
3479 }
3480 goto cleanupAndReturn;
3481 } else {
3482 // Unexpected pattern compilation error.
3483 errln("Line %d: error %s compiling pattern. (UTF8)", line, u_errorName(status));
3484 goto cleanupAndReturn;
3485 }
3486 }
3487 }
3488
3489 if (UTF8Pattern == NULL) {
3490 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3491 logln("Unable to create UTF-8 pattern, skipping UTF-8 tests for %s:%d", srcPath, line);
3492 status = U_ZERO_ERROR;
3493 }
3494
3495 if (flags.indexOf((UChar)0x64) >= 0) { // 'd' flag
3496 callerPattern->dumpPattern();
3497 }
3498
3499 if (flags.indexOf((UChar)0x45) >= 0) { // 'E' flag
3500 errln("%s, Line %d: Expected, but did not get, a pattern compilation error.", srcPath, line);
3501 goto cleanupAndReturn;
3502 }
3503
3504
3505 //
3506 // Number of times find() should be called on the test string, default to 1
3507 //
3508 numFinds = 1;
3509 for (i=2; i<=9; i++) {
3510 if (flags.indexOf((UChar)(0x30 + i)) >= 0) { // digit flag
3511 if (numFinds != 1) {
3512 errln("Line %d: more than one digit flag. Scanning %d.", line, i);
3513 goto cleanupAndReturn;
3514 }
3515 numFinds = i;
3516 }
3517 }
3518
3519 // 'M' flag. Use matches() instead of find()
3520 if (flags.indexOf((UChar)0x4d) >= 0) {
3521 useMatchesFunc = TRUE;
3522 }
3523 if (flags.indexOf((UChar)0x4c) >= 0) {
3524 useLookingAtFunc = TRUE;
3525 }
3526
3527 //
3528 // Find the tags in the input data, remove them, and record the group boundary
3529 // positions.
3530 //
3531 parsePat = RegexPattern::compile("<(/?)(r|[0-9]+)>", 0, pe, status);
3532 REGEX_CHECK_STATUS_L(line);
3533
3534 unEscapedInput = inputString.unescape();
3535 parseMatcher = parsePat->matcher(unEscapedInput, status);
3536 REGEX_CHECK_STATUS_L(line);
3537 while(parseMatcher->find()) {
3538 parseMatcher->appendReplacement(deTaggedInput, "", status);
3539 REGEX_CHECK_STATUS;
3540 UnicodeString groupNum = parseMatcher->group(2, status);
3541 if (groupNum == "r") {
3542 // <r> or </r>, a region specification within the string
3543 if (parseMatcher->group(1, status) == "/") {
3544 regionEnd = deTaggedInput.length();
3545 } else {
3546 regionStart = deTaggedInput.length();
3547 }
3548 } else {
3549 // <digits> or </digits>, a group match boundary tag.
3550 if (parseMatcher->group(1, status) == "/") {
3551 set(groupEnds, deTaggedInput.length(), groupNum);
3552 } else {
3553 set(groupStarts, deTaggedInput.length(), groupNum);
3554 }
3555 }
3556 }
3557 parseMatcher->appendTail(deTaggedInput);
3558 REGEX_ASSERT_L(groupStarts.size() == groupEnds.size(), line);
3559 if ((regionStart>=0 || regionEnd>=0) && (regionStart<0 || regionStart>regionEnd)) {
3560 errln("mismatched <r> tags");
3561 failed = TRUE;
3562 goto cleanupAndReturn;
3563 }
3564
3565 //
3566 // Configure the matcher according to the flags specified with this test.
3567 //
3568 matcher = callerPattern->matcher(deTaggedInput, status);
3569 REGEX_CHECK_STATUS_L(line);
3570 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3571 matcher->setTrace(TRUE);
3572 }
3573
3574 if (UTF8Pattern != NULL) {
3575 inputUTF8Length = deTaggedInput.extract(NULL, 0, UTF8Converter, status);
3576 status = U_ZERO_ERROR; // buffer overflow
3577 inputChars = new char[inputUTF8Length+1];
3578 deTaggedInput.extract(inputChars, inputUTF8Length+1, UTF8Converter, status);
3579 utext_openUTF8(&inputText, inputChars, inputUTF8Length, &status);
3580
3581 if (status == U_ZERO_ERROR) {
3582 UTF8Matcher = &UTF8Pattern->matcher(status)->reset(&inputText);
3583 REGEX_CHECK_STATUS_L(line);
3584 }
3585
3586 if (UTF8Matcher == NULL) {
3587 // UTF-8 does not allow unpaired surrogates, so this could actually happen without being a failure of the engine
3588 logln("Unable to create UTF-8 matcher, skipping UTF-8 tests for %s:%d", srcPath, line);
3589 status = U_ZERO_ERROR;
3590 }
3591 }
3592
3593 //
3594 // Generate native indices for UTF8 versions of region and capture group info
3595 //
3596 if (UTF8Matcher != NULL) {
3597 if (flags.indexOf((UChar)0x74) >= 0) { // 't' trace flag
3598 UTF8Matcher->setTrace(TRUE);
3599 }
3600 if (regionStart>=0) (void) utextOffsetToNative(&inputText, regionStart, regionStartUTF8);
3601 if (regionEnd>=0) (void) utextOffsetToNative(&inputText, regionEnd, regionEndUTF8);
3602
3603 // Fill out the native index UVector info.
3604 // Only need 1 loop, from above we know groupStarts.size() = groupEnds.size()
3605 for (i=0; i<groupStarts.size(); i++) {
3606 int32_t start = groupStarts.elementAti(i);
3607 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3608 if (start >= 0) {
3609 int32_t startUTF8;
3610 if (!utextOffsetToNative(&inputText, start, startUTF8)) {
3611 errln("Error at line %d: could not find native index for group start %d. UTF16 index %d", line, i, start);
3612 failed = TRUE;
3613 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3614 }
3615 setInt(groupStartsUTF8, startUTF8, i);
3616 }
3617
3618 int32_t end = groupEnds.elementAti(i);
3619 // -1 means there was no UVector slot and we won't be requesting that capture group for this test, don't bother inserting
3620 if (end >= 0) {
3621 int32_t endUTF8;
3622 if (!utextOffsetToNative(&inputText, end, endUTF8)) {
3623 errln("Error at line %d: could not find native index for group end %d. UTF16 index %d", line, i, end);
3624 failed = TRUE;
3625 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3626 }
3627 setInt(groupEndsUTF8, endUTF8, i);
3628 }
3629 }
3630 }
3631
3632 if (regionStart>=0) {
3633 matcher->region(regionStart, regionEnd, status);
3634 REGEX_CHECK_STATUS_L(line);
3635 if (UTF8Matcher != NULL) {
3636 UTF8Matcher->region(regionStartUTF8, regionEndUTF8, status);
3637 REGEX_CHECK_STATUS_L(line);
3638 }
3639 }
3640 if (flags.indexOf((UChar)0x61) >= 0) { // 'a' anchoring bounds flag
3641 matcher->useAnchoringBounds(FALSE);
3642 if (UTF8Matcher != NULL) {
3643 UTF8Matcher->useAnchoringBounds(FALSE);
3644 }
3645 }
3646 if (flags.indexOf((UChar)0x62) >= 0) { // 'b' transparent bounds flag
3647 matcher->useTransparentBounds(TRUE);
3648 if (UTF8Matcher != NULL) {
3649 UTF8Matcher->useTransparentBounds(TRUE);
3650 }
3651 }
3652
3653
3654
3655 //
3656 // Do a find on the de-tagged input using the caller's pattern
3657 // TODO: error on count>1 and not find().
3658 // error on both matches() and lookingAt().
3659 //
3660 for (i=0; i<numFinds; i++) {
3661 if (useMatchesFunc) {
3662 isMatch = matcher->matches(status);
3663 if (UTF8Matcher != NULL) {
3664 isUTF8Match = UTF8Matcher->matches(status);
3665 }
3666 } else if (useLookingAtFunc) {
3667 isMatch = matcher->lookingAt(status);
3668 if (UTF8Matcher != NULL) {
3669 isUTF8Match = UTF8Matcher->lookingAt(status);
3670 }
3671 } else {
3672 isMatch = matcher->find();
3673 if (UTF8Matcher != NULL) {
3674 isUTF8Match = UTF8Matcher->find();
3675 }
3676 }
3677 }
3678 matcher->setTrace(FALSE);
3679 if (UTF8Matcher) {
3680 UTF8Matcher->setTrace(FALSE);
3681 }
3682 if (U_FAILURE(status)) {
3683 errln("Error at line %d. ICU ErrorCode is %s", u_errorName(status));
3684 }
3685
3686 //
3687 // Match up the groups from the find() with the groups from the tags
3688 //
3689
3690 // number of tags should match number of groups from find operation.
3691 // matcher->groupCount does not include group 0, the entire match, hence the +1.
3692 // G option in test means that capture group data is not available in the
3693 // expected results, so the check needs to be suppressed.
3694 if (isMatch == FALSE && groupStarts.size() != 0) {
3695 dataerrln("Error at line %d: Match expected, but none found.", line);
3696 failed = TRUE;
3697 goto cleanupAndReturn;
3698 } else if (UTF8Matcher != NULL && isUTF8Match == FALSE && groupStarts.size() != 0) {
3699 errln("Error at line %d: Match expected, but none found. (UTF8)", line);
3700 failed = TRUE;
3701 goto cleanupAndReturn;
3702 }
3703 if (isMatch && groupStarts.size() == 0) {
3704 errln("Error at line %d: No match expected, but one found at position %d.", line, matcher->start(status));
3705 failed = TRUE;
3706 }
3707 if (UTF8Matcher && isUTF8Match && groupStarts.size() == 0) {
3708 errln("Error at line %d: No match expected, but one found at position %d (UTF-8).", line, UTF8Matcher->start(status));
3709 failed = TRUE;
3710 }
3711
3712 if (flags.indexOf((UChar)0x47 /*G*/) >= 0) {
3713 // Only check for match / no match. Don't check capture groups.
3714 goto cleanupAndReturn;
3715 }
3716
3717 REGEX_CHECK_STATUS_L(line);
3718 for (i=0; i<=matcher->groupCount(); i++) {
3719 int32_t expectedStart = (i >= groupStarts.size()? -1 : groupStarts.elementAti(i));
3720 int32_t expectedStartUTF8 = (i >= groupStartsUTF8.size()? -1 : groupStartsUTF8.elementAti(i));
3721 if (matcher->start(i, status) != expectedStart) {
3722 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d",
3723 line, i, expectedStart, matcher->start(i, status));
3724 failed = TRUE;
3725 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3726 } else if (UTF8Matcher != NULL && UTF8Matcher->start(i, status) != expectedStartUTF8) {
3727 errln("Error at line %d: incorrect start position for group %d. Expected %d, got %d (UTF8)",
3728 line, i, expectedStartUTF8, UTF8Matcher->start(i, status));
3729 failed = TRUE;
3730 goto cleanupAndReturn; // Good chance of subsequent bogus errors. Stop now.
3731 }
3732
3733 int32_t expectedEnd = (i >= groupEnds.size()? -1 : groupEnds.elementAti(i));
3734 int32_t expectedEndUTF8 = (i >= groupEndsUTF8.size()? -1 : groupEndsUTF8.elementAti(i));
3735 if (matcher->end(i, status) != expectedEnd) {
3736 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d",
3737 line, i, expectedEnd, matcher->end(i, status));
3738 failed = TRUE;
3739 // Error on end position; keep going; real error is probably yet to come as group
3740 // end positions work from end of the input data towards the front.
3741 } else if (UTF8Matcher != NULL && UTF8Matcher->end(i, status) != expectedEndUTF8) {
3742 errln("Error at line %d: incorrect end position for group %d. Expected %d, got %d (UTF8)",
3743 line, i, expectedEndUTF8, UTF8Matcher->end(i, status));
3744 failed = TRUE;
3745 // Error on end position; keep going; real error is probably yet to come as group
3746 // end positions work from end of the input data towards the front.
3747 }
3748 }
3749 if ( matcher->groupCount()+1 < groupStarts.size()) {
3750 errln("Error at line %d: Expected %d capture groups, found %d.",
3751 line, groupStarts.size()-1, matcher->groupCount());
3752 failed = TRUE;
3753 }
3754 else if (UTF8Matcher != NULL && UTF8Matcher->groupCount()+1 < groupStarts.size()) {
3755 errln("Error at line %d: Expected %d capture groups, found %d. (UTF8)",
3756 line, groupStarts.size()-1, UTF8Matcher->groupCount());
3757 failed = TRUE;
3758 }
3759
3760 if ((flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3761 matcher->requireEnd() == TRUE) {
3762 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE", line);
3763 failed = TRUE;
3764 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x59) >= 0) && // 'Y' flag: RequireEnd() == false
3765 UTF8Matcher->requireEnd() == TRUE) {
3766 errln("Error at line %d: requireEnd() returned TRUE. Expected FALSE (UTF8)", line);
3767 failed = TRUE;
3768 }
3769
3770 if ((flags.indexOf((UChar)0x79) >= 0) && // 'y' flag: RequireEnd() == true
3771 matcher->requireEnd() == FALSE) {
3772 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE", line);
3773 failed = TRUE;
3774 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x79) >= 0) && // 'Y' flag: RequireEnd() == false
3775 UTF8Matcher->requireEnd() == FALSE) {
3776 errln("Error at line %d: requireEnd() returned FALSE. Expected TRUE (UTF8)", line);
3777 failed = TRUE;
3778 }
3779
3780 if ((flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3781 matcher->hitEnd() == TRUE) {
3782 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE", line);
3783 failed = TRUE;
3784 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x5A) >= 0) && // 'Z' flag: hitEnd() == false
3785 UTF8Matcher->hitEnd() == TRUE) {
3786 errln("Error at line %d: hitEnd() returned TRUE. Expected FALSE (UTF8)", line);
3787 failed = TRUE;
3788 }
3789
3790 if ((flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3791 matcher->hitEnd() == FALSE) {
3792 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE", line);
3793 failed = TRUE;
3794 } else if (UTF8Matcher != NULL && (flags.indexOf((UChar)0x7A) >= 0) && // 'z' flag: hitEnd() == true
3795 UTF8Matcher->hitEnd() == FALSE) {
3796 errln("Error at line %d: hitEnd() returned FALSE. Expected TRUE (UTF8)", line);
3797 failed = TRUE;
3798 }
3799
3800
3801 cleanupAndReturn:
3802 if (failed) {
3803 infoln((UnicodeString)"\""+pattern+(UnicodeString)"\" "
3804 +flags+(UnicodeString)" \""+inputString+(UnicodeString)"\"");
3805 // callerPattern->dump();
3806 }
3807 delete parseMatcher;
3808 delete parsePat;
3809 delete UTF8Matcher;
3810 delete UTF8Pattern;
3811 delete matcher;
3812 delete callerPattern;
3813
3814 utext_close(&inputText);
3815 delete[] inputChars;
3816 utext_close(&patternText);
3817 delete[] patternChars;
3818 ucnv_close(UTF8Converter);
3819 }
3820
3821
3822
3823
3824 //---------------------------------------------------------------------------
3825 //
3826 // Errors Check for error handling in patterns.
3827 //
3828 //---------------------------------------------------------------------------
Errors()3829 void RegexTest::Errors() {
3830 // \escape sequences that aren't implemented yet.
3831 //REGEX_ERR("hex format \\x{abcd} not implemented", 1, 13, U_REGEX_UNIMPLEMENTED);
3832
3833 // Missing close parentheses
3834 REGEX_ERR("Comment (?# with no close", 1, 25, U_REGEX_MISMATCHED_PAREN);
3835 REGEX_ERR("Capturing Parenthesis(...", 1, 25, U_REGEX_MISMATCHED_PAREN);
3836 REGEX_ERR("Grouping only parens (?: blah blah", 1, 34, U_REGEX_MISMATCHED_PAREN);
3837
3838 // Extra close paren
3839 REGEX_ERR("Grouping only parens (?: blah)) blah", 1, 31, U_REGEX_MISMATCHED_PAREN);
3840 REGEX_ERR(")))))))", 1, 1, U_REGEX_MISMATCHED_PAREN);
3841 REGEX_ERR("(((((((", 1, 7, U_REGEX_MISMATCHED_PAREN);
3842
3843 // Look-ahead, Look-behind
3844 // TODO: add tests for unbounded length look-behinds.
3845 REGEX_ERR("abc(?<@xyz).*", 1, 7, U_REGEX_RULE_SYNTAX); // illegal construct
3846
3847 // Attempt to use non-default flags
3848 {
3849 UParseError pe;
3850 UErrorCode status = U_ZERO_ERROR;
3851 int32_t flags = UREGEX_CANON_EQ |
3852 UREGEX_COMMENTS | UREGEX_DOTALL |
3853 UREGEX_MULTILINE;
3854 RegexPattern *pat1= RegexPattern::compile(".*", flags, pe, status);
3855 REGEX_ASSERT(status == U_REGEX_UNIMPLEMENTED);
3856 delete pat1;
3857 }
3858
3859
3860 // Quantifiers are allowed only after something that can be quantified.
3861 REGEX_ERR("+", 1, 1, U_REGEX_RULE_SYNTAX);
3862 REGEX_ERR("abc\ndef(*2)", 2, 5, U_REGEX_RULE_SYNTAX);
3863 REGEX_ERR("abc**", 1, 5, U_REGEX_RULE_SYNTAX);
3864
3865 // Mal-formed {min,max} quantifiers
3866 REGEX_ERR("abc{a,2}",1,5, U_REGEX_BAD_INTERVAL);
3867 REGEX_ERR("abc{4,2}",1,8, U_REGEX_MAX_LT_MIN);
3868 REGEX_ERR("abc{1,b}",1,7, U_REGEX_BAD_INTERVAL);
3869 REGEX_ERR("abc{1,,2}",1,7, U_REGEX_BAD_INTERVAL);
3870 REGEX_ERR("abc{1,2a}",1,8, U_REGEX_BAD_INTERVAL);
3871 REGEX_ERR("abc{222222222222222222222}",1,14, U_REGEX_NUMBER_TOO_BIG);
3872 REGEX_ERR("abc{5,50000000000}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows int during scan
3873 REGEX_ERR("abc{5,687865858}", 1, 16, U_REGEX_NUMBER_TOO_BIG); // Overflows regex binary format
3874 REGEX_ERR("abc{687865858,687865859}", 1, 24, U_REGEX_NUMBER_TOO_BIG);
3875
3876 // Ticket 5389
3877 REGEX_ERR("*c", 1, 1, U_REGEX_RULE_SYNTAX);
3878
3879 // Invalid Back Reference \0
3880 // For ICU 3.8 and earlier
3881 // For ICU versions newer than 3.8, \0 introduces an octal escape.
3882 //
3883 REGEX_ERR("(ab)\\0", 1, 6, U_REGEX_BAD_ESCAPE_SEQUENCE);
3884
3885 }
3886
3887
3888 //-------------------------------------------------------------------------------
3889 //
3890 // Read a text data file, convert it to UChars, and return the data
3891 // in one big UChar * buffer, which the caller must delete.
3892 //
3893 //--------------------------------------------------------------------------------
ReadAndConvertFile(const char * fileName,int32_t & ulen,const char * defEncoding,UErrorCode & status)3894 UChar *RegexTest::ReadAndConvertFile(const char *fileName, int32_t &ulen,
3895 const char *defEncoding, UErrorCode &status) {
3896 UChar *retPtr = NULL;
3897 char *fileBuf = NULL;
3898 UConverter* conv = NULL;
3899 FILE *f = NULL;
3900
3901 ulen = 0;
3902 if (U_FAILURE(status)) {
3903 return retPtr;
3904 }
3905
3906 //
3907 // Open the file.
3908 //
3909 f = fopen(fileName, "rb");
3910 if (f == 0) {
3911 dataerrln("Error opening test data file %s\n", fileName);
3912 status = U_FILE_ACCESS_ERROR;
3913 return NULL;
3914 }
3915 //
3916 // Read it in
3917 //
3918 int32_t fileSize;
3919 int32_t amt_read;
3920
3921 fseek( f, 0, SEEK_END);
3922 fileSize = ftell(f);
3923 fileBuf = new char[fileSize];
3924 fseek(f, 0, SEEK_SET);
3925 amt_read = fread(fileBuf, 1, fileSize, f);
3926 if (amt_read != fileSize || fileSize <= 0) {
3927 errln("Error reading test data file.");
3928 goto cleanUpAndReturn;
3929 }
3930
3931 //
3932 // Look for a Unicode Signature (BOM) on the data just read
3933 //
3934 int32_t signatureLength;
3935 const char * fileBufC;
3936 const char* encoding;
3937
3938 fileBufC = fileBuf;
3939 encoding = ucnv_detectUnicodeSignature(
3940 fileBuf, fileSize, &signatureLength, &status);
3941 if(encoding!=NULL ){
3942 fileBufC += signatureLength;
3943 fileSize -= signatureLength;
3944 } else {
3945 encoding = defEncoding;
3946 if (strcmp(encoding, "utf-8") == 0) {
3947 errln("file %s is missing its BOM", fileName);
3948 }
3949 }
3950
3951 //
3952 // Open a converter to take the rule file to UTF-16
3953 //
3954 conv = ucnv_open(encoding, &status);
3955 if (U_FAILURE(status)) {
3956 goto cleanUpAndReturn;
3957 }
3958
3959 //
3960 // Convert the rules to UChar.
3961 // Preflight first to determine required buffer size.
3962 //
3963 ulen = ucnv_toUChars(conv,
3964 NULL, // dest,
3965 0, // destCapacity,
3966 fileBufC,
3967 fileSize,
3968 &status);
3969 if (status == U_BUFFER_OVERFLOW_ERROR) {
3970 // Buffer Overflow is expected from the preflight operation.
3971 status = U_ZERO_ERROR;
3972
3973 retPtr = new UChar[ulen+1];
3974 ucnv_toUChars(conv,
3975 retPtr, // dest,
3976 ulen+1,
3977 fileBufC,
3978 fileSize,
3979 &status);
3980 }
3981
3982 cleanUpAndReturn:
3983 fclose(f);
3984 delete[] fileBuf;
3985 ucnv_close(conv);
3986 if (U_FAILURE(status)) {
3987 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
3988 delete []retPtr;
3989 retPtr = 0;
3990 ulen = 0;
3991 };
3992 return retPtr;
3993 }
3994
3995
3996 //-------------------------------------------------------------------------------
3997 //
3998 // PerlTests - Run Perl's regular expression tests
3999 // The input file for this test is re_tests, the standard regular
4000 // expression test data distributed with the Perl source code.
4001 //
4002 // Here is Perl's description of the test data file:
4003 //
4004 // # The tests are in a separate file 't/op/re_tests'.
4005 // # Each line in that file is a separate test.
4006 // # There are five columns, separated by tabs.
4007 // #
4008 // # Column 1 contains the pattern, optionally enclosed in C<''>.
4009 // # Modifiers can be put after the closing C<'>.
4010 // #
4011 // # Column 2 contains the string to be matched.
4012 // #
4013 // # Column 3 contains the expected result:
4014 // # y expect a match
4015 // # n expect no match
4016 // # c expect an error
4017 // # B test exposes a known bug in Perl, should be skipped
4018 // # b test exposes a known bug in Perl, should be skipped if noamp
4019 // #
4020 // # Columns 4 and 5 are used only if column 3 contains C<y> or C<c>.
4021 // #
4022 // # Column 4 contains a string, usually C<$&>.
4023 // #
4024 // # Column 5 contains the expected result of double-quote
4025 // # interpolating that string after the match, or start of error message.
4026 // #
4027 // # Column 6, if present, contains a reason why the test is skipped.
4028 // # This is printed with "skipped", for harness to pick up.
4029 // #
4030 // # \n in the tests are interpolated, as are variables of the form ${\w+}.
4031 // #
4032 // # If you want to add a regular expression test that can't be expressed
4033 // # in this format, don't add it here: put it in op/pat.t instead.
4034 //
4035 // For ICU, if field 3 contains an 'i', the test will be skipped.
4036 // Such a test exposes some known incompatibility between ICU and Perl regexps.
4037 // (The i is in addition to whatever was there before.)
4038 //
4039 //-------------------------------------------------------------------------------
PerlTests()4040 void RegexTest::PerlTests() {
4041 char tdd[2048];
4042 const char *srcPath;
4043 UErrorCode status = U_ZERO_ERROR;
4044 UParseError pe;
4045
4046 //
4047 // Open and read the test data file.
4048 //
4049 srcPath=getPath(tdd, "re_tests.txt");
4050 if(srcPath==NULL) {
4051 return; /* something went wrong, error already output */
4052 }
4053
4054 int32_t len;
4055 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4056 if (U_FAILURE(status)) {
4057 return; /* something went wrong, error already output */
4058 }
4059
4060 //
4061 // Put the test data into a UnicodeString
4062 //
4063 UnicodeString testDataString(FALSE, testData, len);
4064
4065 //
4066 // Regex to break the input file into lines, and strip the new lines.
4067 // One line per match, capture group one is the desired data.
4068 //
4069 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4070 if (U_FAILURE(status)) {
4071 dataerrln("RegexPattern::compile() error");
4072 return;
4073 }
4074 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4075
4076 //
4077 // Regex to split a test file line into fields.
4078 // There are six fields, separated by tabs.
4079 //
4080 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4081
4082 //
4083 // Regex to identify test patterns with flag settings, and to separate them.
4084 // Test patterns with flags look like 'pattern'i
4085 // Test patterns without flags are not quoted: pattern
4086 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4087 //
4088 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4089 RegexMatcher* flagMat = flagPat->matcher(status);
4090
4091 //
4092 // The Perl tests reference several perl-isms, which are evaluated/substituted
4093 // in the test data. Not being perl, this must be done explicitly. Here
4094 // are string constants and REs for these constructs.
4095 //
4096 UnicodeString nulnulSrc("${nulnul}");
4097 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4098 nulnul = nulnul.unescape();
4099
4100 UnicodeString ffffSrc("${ffff}");
4101 UnicodeString ffff("\\uffff", -1, US_INV);
4102 ffff = ffff.unescape();
4103
4104 // regexp for $-[0], $+[2], etc.
4105 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4106 RegexMatcher *groupsMat = groupsPat->matcher(status);
4107
4108 // regexp for $0, $1, $2, etc.
4109 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4110 RegexMatcher *cgMat = cgPat->matcher(status);
4111
4112
4113 //
4114 // Main Loop for the Perl Tests, runs once per line from the
4115 // test data file.
4116 //
4117 int32_t lineNum = 0;
4118 int32_t skippedUnimplementedCount = 0;
4119 while (lineMat->find()) {
4120 lineNum++;
4121
4122 //
4123 // Get a line, break it into its fields, do the Perl
4124 // variable substitutions.
4125 //
4126 UnicodeString line = lineMat->group(1, status);
4127 UnicodeString fields[7];
4128 fieldPat->split(line, fields, 7, status);
4129
4130 flagMat->reset(fields[0]);
4131 flagMat->matches(status);
4132 UnicodeString pattern = flagMat->group(2, status);
4133 pattern.findAndReplace("${bang}", "!");
4134 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4135 pattern.findAndReplace(ffffSrc, ffff);
4136
4137 //
4138 // Identify patterns that include match flag settings,
4139 // split off the flags, remove the extra quotes.
4140 //
4141 UnicodeString flagStr = flagMat->group(3, status);
4142 if (U_FAILURE(status)) {
4143 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4144 return;
4145 }
4146 int32_t flags = 0;
4147 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4148 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4149 const UChar UChar_m = 0x6d;
4150 const UChar UChar_x = 0x78;
4151 const UChar UChar_y = 0x79;
4152 if (flagStr.indexOf(UChar_i) != -1) {
4153 flags |= UREGEX_CASE_INSENSITIVE;
4154 }
4155 if (flagStr.indexOf(UChar_m) != -1) {
4156 flags |= UREGEX_MULTILINE;
4157 }
4158 if (flagStr.indexOf(UChar_x) != -1) {
4159 flags |= UREGEX_COMMENTS;
4160 }
4161
4162 //
4163 // Compile the test pattern.
4164 //
4165 status = U_ZERO_ERROR;
4166 RegexPattern *testPat = RegexPattern::compile(pattern, flags, pe, status);
4167 if (status == U_REGEX_UNIMPLEMENTED) {
4168 //
4169 // Test of a feature that is planned for ICU, but not yet implemented.
4170 // skip the test.
4171 skippedUnimplementedCount++;
4172 delete testPat;
4173 status = U_ZERO_ERROR;
4174 continue;
4175 }
4176
4177 if (U_FAILURE(status)) {
4178 // Some tests are supposed to generate errors.
4179 // Only report an error for tests that are supposed to succeed.
4180 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4181 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4182 {
4183 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4184 }
4185 status = U_ZERO_ERROR;
4186 delete testPat;
4187 continue;
4188 }
4189
4190 if (fields[2].indexOf(UChar_i) >= 0) {
4191 // ICU should skip this test.
4192 delete testPat;
4193 continue;
4194 }
4195
4196 if (fields[2].indexOf(UChar_c) >= 0) {
4197 // This pattern should have caused a compilation error, but didn't/
4198 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4199 delete testPat;
4200 continue;
4201 }
4202
4203 //
4204 // replace the Perl variables that appear in some of the
4205 // match data strings.
4206 //
4207 UnicodeString matchString = fields[1];
4208 matchString.findAndReplace(nulnulSrc, nulnul);
4209 matchString.findAndReplace(ffffSrc, ffff);
4210
4211 // Replace any \n in the match string with an actual new-line char.
4212 // Don't do full unescape, as this unescapes more than Perl does, which
4213 // causes other spurious failures in the tests.
4214 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4215
4216
4217
4218 //
4219 // Run the test, check for expected match/don't match result.
4220 //
4221 RegexMatcher *testMat = testPat->matcher(matchString, status);
4222 UBool found = testMat->find();
4223 UBool expected = FALSE;
4224 if (fields[2].indexOf(UChar_y) >=0) {
4225 expected = TRUE;
4226 }
4227 if (expected != found) {
4228 errln("line %d: Expected %smatch, got %smatch",
4229 lineNum, expected?"":"no ", found?"":"no " );
4230 continue;
4231 }
4232
4233 // Don't try to check expected results if there is no match.
4234 // (Some have stuff in the expected fields)
4235 if (!found) {
4236 delete testMat;
4237 delete testPat;
4238 continue;
4239 }
4240
4241 //
4242 // Interpret the Perl expression from the fourth field of the data file,
4243 // building up an ICU string from the results of the ICU match.
4244 // The Perl expression will contain references to the results of
4245 // a regex match, including the matched string, capture group strings,
4246 // group starting and ending indicies, etc.
4247 //
4248 UnicodeString resultString;
4249 UnicodeString perlExpr = fields[3];
4250 #if SUPPORT_MUTATING_INPUT_STRING
4251 groupsMat->reset(perlExpr);
4252 cgMat->reset(perlExpr);
4253 #endif
4254
4255 while (perlExpr.length() > 0) {
4256 #if !SUPPORT_MUTATING_INPUT_STRING
4257 // Perferred usage. Reset after any modification to input string.
4258 groupsMat->reset(perlExpr);
4259 cgMat->reset(perlExpr);
4260 #endif
4261
4262 if (perlExpr.startsWith("$&")) {
4263 resultString.append(testMat->group(status));
4264 perlExpr.remove(0, 2);
4265 }
4266
4267 else if (groupsMat->lookingAt(status)) {
4268 // $-[0] $+[2] etc.
4269 UnicodeString digitString = groupsMat->group(2, status);
4270 int32_t t = 0;
4271 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4272 UnicodeString plusOrMinus = groupsMat->group(1, status);
4273 int32_t matchPosition;
4274 if (plusOrMinus.compare("+") == 0) {
4275 matchPosition = testMat->end(groupNum, status);
4276 } else {
4277 matchPosition = testMat->start(groupNum, status);
4278 }
4279 if (matchPosition != -1) {
4280 ICU_Utility::appendNumber(resultString, matchPosition);
4281 }
4282 perlExpr.remove(0, groupsMat->end(status));
4283 }
4284
4285 else if (cgMat->lookingAt(status)) {
4286 // $1, $2, $3, etc.
4287 UnicodeString digitString = cgMat->group(1, status);
4288 int32_t t = 0;
4289 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4290 if (U_SUCCESS(status)) {
4291 resultString.append(testMat->group(groupNum, status));
4292 status = U_ZERO_ERROR;
4293 }
4294 perlExpr.remove(0, cgMat->end(status));
4295 }
4296
4297 else if (perlExpr.startsWith("@-")) {
4298 int32_t i;
4299 for (i=0; i<=testMat->groupCount(); i++) {
4300 if (i>0) {
4301 resultString.append(" ");
4302 }
4303 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4304 }
4305 perlExpr.remove(0, 2);
4306 }
4307
4308 else if (perlExpr.startsWith("@+")) {
4309 int32_t i;
4310 for (i=0; i<=testMat->groupCount(); i++) {
4311 if (i>0) {
4312 resultString.append(" ");
4313 }
4314 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4315 }
4316 perlExpr.remove(0, 2);
4317 }
4318
4319 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4320 // or as an escaped sequence (e.g. \n)
4321 if (perlExpr.length() > 1) {
4322 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4323 }
4324 UChar c = perlExpr.charAt(0);
4325 switch (c) {
4326 case 'n': c = '\n'; break;
4327 // add any other escape sequences that show up in the test expected results.
4328 }
4329 resultString.append(c);
4330 perlExpr.remove(0, 1);
4331 }
4332
4333 else {
4334 // Any characters from the perl expression that we don't explicitly
4335 // recognize before here are assumed to be literals and copied
4336 // as-is to the expected results.
4337 resultString.append(perlExpr.charAt(0));
4338 perlExpr.remove(0, 1);
4339 }
4340
4341 if (U_FAILURE(status)) {
4342 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4343 break;
4344 }
4345 }
4346
4347 //
4348 // Expected Results Compare
4349 //
4350 UnicodeString expectedS(fields[4]);
4351 expectedS.findAndReplace(nulnulSrc, nulnul);
4352 expectedS.findAndReplace(ffffSrc, ffff);
4353 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4354
4355
4356 if (expectedS.compare(resultString) != 0) {
4357 err("Line %d: Incorrect perl expression results.", lineNum);
4358 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4359 }
4360
4361 delete testMat;
4362 delete testPat;
4363 }
4364
4365 //
4366 // All done. Clean up allocated stuff.
4367 //
4368 delete cgMat;
4369 delete cgPat;
4370
4371 delete groupsMat;
4372 delete groupsPat;
4373
4374 delete flagMat;
4375 delete flagPat;
4376
4377 delete lineMat;
4378 delete linePat;
4379
4380 delete fieldPat;
4381 delete [] testData;
4382
4383
4384 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4385
4386 }
4387
4388
4389 //-------------------------------------------------------------------------------
4390 //
4391 // PerlTestsUTF8 Run Perl's regular expression tests on UTF-8-based UTexts
4392 // (instead of using UnicodeStrings) to test the alternate engine.
4393 // The input file for this test is re_tests, the standard regular
4394 // expression test data distributed with the Perl source code.
4395 // See PerlTests() for more information.
4396 //
4397 //-------------------------------------------------------------------------------
PerlTestsUTF8()4398 void RegexTest::PerlTestsUTF8() {
4399 char tdd[2048];
4400 const char *srcPath;
4401 UErrorCode status = U_ZERO_ERROR;
4402 UParseError pe;
4403 LocalUConverterPointer UTF8Converter(ucnv_open("UTF-8", &status));
4404 UText patternText = UTEXT_INITIALIZER;
4405 char *patternChars = NULL;
4406 int32_t patternLength;
4407 int32_t patternCapacity = 0;
4408 UText inputText = UTEXT_INITIALIZER;
4409 char *inputChars = NULL;
4410 int32_t inputLength;
4411 int32_t inputCapacity = 0;
4412
4413 ucnv_setFromUCallBack(UTF8Converter.getAlias(), UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status);
4414
4415 //
4416 // Open and read the test data file.
4417 //
4418 srcPath=getPath(tdd, "re_tests.txt");
4419 if(srcPath==NULL) {
4420 return; /* something went wrong, error already output */
4421 }
4422
4423 int32_t len;
4424 UChar *testData = ReadAndConvertFile(srcPath, len, "iso-8859-1", status);
4425 if (U_FAILURE(status)) {
4426 return; /* something went wrong, error already output */
4427 }
4428
4429 //
4430 // Put the test data into a UnicodeString
4431 //
4432 UnicodeString testDataString(FALSE, testData, len);
4433
4434 //
4435 // Regex to break the input file into lines, and strip the new lines.
4436 // One line per match, capture group one is the desired data.
4437 //
4438 RegexPattern* linePat = RegexPattern::compile(UNICODE_STRING_SIMPLE("(.+?)[\\r\\n]+"), 0, pe, status);
4439 if (U_FAILURE(status)) {
4440 dataerrln("RegexPattern::compile() error");
4441 return;
4442 }
4443 RegexMatcher* lineMat = linePat->matcher(testDataString, status);
4444
4445 //
4446 // Regex to split a test file line into fields.
4447 // There are six fields, separated by tabs.
4448 //
4449 RegexPattern* fieldPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\t"), 0, pe, status);
4450
4451 //
4452 // Regex to identify test patterns with flag settings, and to separate them.
4453 // Test patterns with flags look like 'pattern'i
4454 // Test patterns without flags are not quoted: pattern
4455 // Coming out, capture group 2 is the pattern, capture group 3 is the flags.
4456 //
4457 RegexPattern *flagPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("('?)(.*)\\1(.*)"), 0, pe, status);
4458 RegexMatcher* flagMat = flagPat->matcher(status);
4459
4460 //
4461 // The Perl tests reference several perl-isms, which are evaluated/substituted
4462 // in the test data. Not being perl, this must be done explicitly. Here
4463 // are string constants and REs for these constructs.
4464 //
4465 UnicodeString nulnulSrc("${nulnul}");
4466 UnicodeString nulnul("\\u0000\\u0000", -1, US_INV);
4467 nulnul = nulnul.unescape();
4468
4469 UnicodeString ffffSrc("${ffff}");
4470 UnicodeString ffff("\\uffff", -1, US_INV);
4471 ffff = ffff.unescape();
4472
4473 // regexp for $-[0], $+[2], etc.
4474 RegexPattern *groupsPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$([+\\-])\\[(\\d+)\\]"), 0, pe, status);
4475 RegexMatcher *groupsMat = groupsPat->matcher(status);
4476
4477 // regexp for $0, $1, $2, etc.
4478 RegexPattern *cgPat = RegexPattern::compile(UNICODE_STRING_SIMPLE("\\$(\\d+)"), 0, pe, status);
4479 RegexMatcher *cgMat = cgPat->matcher(status);
4480
4481
4482 //
4483 // Main Loop for the Perl Tests, runs once per line from the
4484 // test data file.
4485 //
4486 int32_t lineNum = 0;
4487 int32_t skippedUnimplementedCount = 0;
4488 while (lineMat->find()) {
4489 lineNum++;
4490
4491 //
4492 // Get a line, break it into its fields, do the Perl
4493 // variable substitutions.
4494 //
4495 UnicodeString line = lineMat->group(1, status);
4496 UnicodeString fields[7];
4497 fieldPat->split(line, fields, 7, status);
4498
4499 flagMat->reset(fields[0]);
4500 flagMat->matches(status);
4501 UnicodeString pattern = flagMat->group(2, status);
4502 pattern.findAndReplace("${bang}", "!");
4503 pattern.findAndReplace(nulnulSrc, UNICODE_STRING_SIMPLE("\\u0000\\u0000"));
4504 pattern.findAndReplace(ffffSrc, ffff);
4505
4506 //
4507 // Identify patterns that include match flag settings,
4508 // split off the flags, remove the extra quotes.
4509 //
4510 UnicodeString flagStr = flagMat->group(3, status);
4511 if (U_FAILURE(status)) {
4512 errln("ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
4513 return;
4514 }
4515 int32_t flags = 0;
4516 const UChar UChar_c = 0x63; // Char constants for the flag letters.
4517 const UChar UChar_i = 0x69; // (Damn the lack of Unicode support in C)
4518 const UChar UChar_m = 0x6d;
4519 const UChar UChar_x = 0x78;
4520 const UChar UChar_y = 0x79;
4521 if (flagStr.indexOf(UChar_i) != -1) {
4522 flags |= UREGEX_CASE_INSENSITIVE;
4523 }
4524 if (flagStr.indexOf(UChar_m) != -1) {
4525 flags |= UREGEX_MULTILINE;
4526 }
4527 if (flagStr.indexOf(UChar_x) != -1) {
4528 flags |= UREGEX_COMMENTS;
4529 }
4530
4531 //
4532 // Put the pattern in a UTF-8 UText
4533 //
4534 status = U_ZERO_ERROR;
4535 patternLength = pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4536 if (status == U_BUFFER_OVERFLOW_ERROR) {
4537 status = U_ZERO_ERROR;
4538 delete[] patternChars;
4539 patternCapacity = patternLength + 1;
4540 patternChars = new char[patternCapacity];
4541 pattern.extract(patternChars, patternCapacity, UTF8Converter.getAlias(), status);
4542 }
4543 utext_openUTF8(&patternText, patternChars, patternLength, &status);
4544
4545 //
4546 // Compile the test pattern.
4547 //
4548 RegexPattern *testPat = RegexPattern::compile(&patternText, flags, pe, status);
4549 if (status == U_REGEX_UNIMPLEMENTED) {
4550 //
4551 // Test of a feature that is planned for ICU, but not yet implemented.
4552 // skip the test.
4553 skippedUnimplementedCount++;
4554 delete testPat;
4555 status = U_ZERO_ERROR;
4556 continue;
4557 }
4558
4559 if (U_FAILURE(status)) {
4560 // Some tests are supposed to generate errors.
4561 // Only report an error for tests that are supposed to succeed.
4562 if (fields[2].indexOf(UChar_c) == -1 && // Compilation is not supposed to fail AND
4563 fields[2].indexOf(UChar_i) == -1) // it's not an accepted ICU incompatibility
4564 {
4565 errln("line %d: ICU Error \"%s\"\n", lineNum, u_errorName(status));
4566 }
4567 status = U_ZERO_ERROR;
4568 delete testPat;
4569 continue;
4570 }
4571
4572 if (fields[2].indexOf(UChar_i) >= 0) {
4573 // ICU should skip this test.
4574 delete testPat;
4575 continue;
4576 }
4577
4578 if (fields[2].indexOf(UChar_c) >= 0) {
4579 // This pattern should have caused a compilation error, but didn't/
4580 errln("line %d: Expected a pattern compile error, got success.", lineNum);
4581 delete testPat;
4582 continue;
4583 }
4584
4585
4586 //
4587 // replace the Perl variables that appear in some of the
4588 // match data strings.
4589 //
4590 UnicodeString matchString = fields[1];
4591 matchString.findAndReplace(nulnulSrc, nulnul);
4592 matchString.findAndReplace(ffffSrc, ffff);
4593
4594 // Replace any \n in the match string with an actual new-line char.
4595 // Don't do full unescape, as this unescapes more than Perl does, which
4596 // causes other spurious failures in the tests.
4597 matchString.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4598
4599 //
4600 // Put the input in a UTF-8 UText
4601 //
4602 status = U_ZERO_ERROR;
4603 inputLength = matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4604 if (status == U_BUFFER_OVERFLOW_ERROR) {
4605 status = U_ZERO_ERROR;
4606 delete[] inputChars;
4607 inputCapacity = inputLength + 1;
4608 inputChars = new char[inputCapacity];
4609 matchString.extract(inputChars, inputCapacity, UTF8Converter.getAlias(), status);
4610 }
4611 utext_openUTF8(&inputText, inputChars, inputLength, &status);
4612
4613 //
4614 // Run the test, check for expected match/don't match result.
4615 //
4616 RegexMatcher *testMat = &testPat->matcher(status)->reset(&inputText);
4617 UBool found = testMat->find();
4618 UBool expected = FALSE;
4619 if (fields[2].indexOf(UChar_y) >=0) {
4620 expected = TRUE;
4621 }
4622 if (expected != found) {
4623 errln("line %d: Expected %smatch, got %smatch",
4624 lineNum, expected?"":"no ", found?"":"no " );
4625 continue;
4626 }
4627
4628 // Don't try to check expected results if there is no match.
4629 // (Some have stuff in the expected fields)
4630 if (!found) {
4631 delete testMat;
4632 delete testPat;
4633 continue;
4634 }
4635
4636 //
4637 // Interpret the Perl expression from the fourth field of the data file,
4638 // building up an ICU string from the results of the ICU match.
4639 // The Perl expression will contain references to the results of
4640 // a regex match, including the matched string, capture group strings,
4641 // group starting and ending indicies, etc.
4642 //
4643 UnicodeString resultString;
4644 UnicodeString perlExpr = fields[3];
4645
4646 while (perlExpr.length() > 0) {
4647 groupsMat->reset(perlExpr);
4648 cgMat->reset(perlExpr);
4649
4650 if (perlExpr.startsWith("$&")) {
4651 resultString.append(testMat->group(status));
4652 perlExpr.remove(0, 2);
4653 }
4654
4655 else if (groupsMat->lookingAt(status)) {
4656 // $-[0] $+[2] etc.
4657 UnicodeString digitString = groupsMat->group(2, status);
4658 int32_t t = 0;
4659 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4660 UnicodeString plusOrMinus = groupsMat->group(1, status);
4661 int32_t matchPosition;
4662 if (plusOrMinus.compare("+") == 0) {
4663 matchPosition = testMat->end(groupNum, status);
4664 } else {
4665 matchPosition = testMat->start(groupNum, status);
4666 }
4667 if (matchPosition != -1) {
4668 ICU_Utility::appendNumber(resultString, matchPosition);
4669 }
4670 perlExpr.remove(0, groupsMat->end(status));
4671 }
4672
4673 else if (cgMat->lookingAt(status)) {
4674 // $1, $2, $3, etc.
4675 UnicodeString digitString = cgMat->group(1, status);
4676 int32_t t = 0;
4677 int32_t groupNum = ICU_Utility::parseNumber(digitString, t, 10);
4678 if (U_SUCCESS(status)) {
4679 resultString.append(testMat->group(groupNum, status));
4680 status = U_ZERO_ERROR;
4681 }
4682 perlExpr.remove(0, cgMat->end(status));
4683 }
4684
4685 else if (perlExpr.startsWith("@-")) {
4686 int32_t i;
4687 for (i=0; i<=testMat->groupCount(); i++) {
4688 if (i>0) {
4689 resultString.append(" ");
4690 }
4691 ICU_Utility::appendNumber(resultString, testMat->start(i, status));
4692 }
4693 perlExpr.remove(0, 2);
4694 }
4695
4696 else if (perlExpr.startsWith("@+")) {
4697 int32_t i;
4698 for (i=0; i<=testMat->groupCount(); i++) {
4699 if (i>0) {
4700 resultString.append(" ");
4701 }
4702 ICU_Utility::appendNumber(resultString, testMat->end(i, status));
4703 }
4704 perlExpr.remove(0, 2);
4705 }
4706
4707 else if (perlExpr.startsWith(UNICODE_STRING_SIMPLE("\\"))) { // \Escape. Take following char as a literal.
4708 // or as an escaped sequence (e.g. \n)
4709 if (perlExpr.length() > 1) {
4710 perlExpr.remove(0, 1); // Remove the '\', but only if not last char.
4711 }
4712 UChar c = perlExpr.charAt(0);
4713 switch (c) {
4714 case 'n': c = '\n'; break;
4715 // add any other escape sequences that show up in the test expected results.
4716 }
4717 resultString.append(c);
4718 perlExpr.remove(0, 1);
4719 }
4720
4721 else {
4722 // Any characters from the perl expression that we don't explicitly
4723 // recognize before here are assumed to be literals and copied
4724 // as-is to the expected results.
4725 resultString.append(perlExpr.charAt(0));
4726 perlExpr.remove(0, 1);
4727 }
4728
4729 if (U_FAILURE(status)) {
4730 errln("Line %d: ICU Error \"%s\"", lineNum, u_errorName(status));
4731 break;
4732 }
4733 }
4734
4735 //
4736 // Expected Results Compare
4737 //
4738 UnicodeString expectedS(fields[4]);
4739 expectedS.findAndReplace(nulnulSrc, nulnul);
4740 expectedS.findAndReplace(ffffSrc, ffff);
4741 expectedS.findAndReplace(UNICODE_STRING_SIMPLE("\\n"), "\n");
4742
4743
4744 if (expectedS.compare(resultString) != 0) {
4745 err("Line %d: Incorrect perl expression results.", lineNum);
4746 infoln((UnicodeString)"Expected \""+expectedS+(UnicodeString)"\"; got \""+resultString+(UnicodeString)"\"");
4747 }
4748
4749 delete testMat;
4750 delete testPat;
4751 }
4752
4753 //
4754 // All done. Clean up allocated stuff.
4755 //
4756 delete cgMat;
4757 delete cgPat;
4758
4759 delete groupsMat;
4760 delete groupsPat;
4761
4762 delete flagMat;
4763 delete flagPat;
4764
4765 delete lineMat;
4766 delete linePat;
4767
4768 delete fieldPat;
4769 delete [] testData;
4770
4771 utext_close(&patternText);
4772 utext_close(&inputText);
4773
4774 delete [] patternChars;
4775 delete [] inputChars;
4776
4777
4778 logln("%d tests skipped because of unimplemented regexp features.", skippedUnimplementedCount);
4779
4780 }
4781
4782
4783 //--------------------------------------------------------------
4784 //
4785 // Bug6149 Verify limits to heap expansion for backtrack stack.
4786 // Use this pattern,
4787 // "(a?){1,8000000}"
4788 // Note: was an unbounded upperbounds, but that now has loop-breaking enabled.
4789 // This test is likely to be fragile, as further optimizations stop
4790 // more cases of pointless looping in the match engine.
4791 //
4792 //---------------------------------------------------------------
Bug6149()4793 void RegexTest::Bug6149() {
4794 UnicodeString pattern("(a?){1,8000000}");
4795 UnicodeString s("xyz");
4796 uint32_t flags = 0;
4797 UErrorCode status = U_ZERO_ERROR;
4798
4799 RegexMatcher matcher(pattern, s, flags, status);
4800 UBool result = false;
4801 REGEX_ASSERT_FAIL(result=matcher.matches(status), U_REGEX_STACK_OVERFLOW);
4802 REGEX_ASSERT(result == FALSE);
4803 }
4804
4805
//
//   Callbacks()    Test the callback function.
//                  When set, callbacks occur periodically during matching operations,
//                  giving the application code the ability to abort the operation
//                  before its normal completion.
//
4812
4813 struct callBackContext {
4814 RegexTest *test;
4815 int32_t maxCalls;
4816 int32_t numCalls;
4817 int32_t lastSteps;
resetcallBackContext4818 void reset(int32_t max) {maxCalls=max; numCalls=0; lastSteps=0;};
4819 };
4820
4821 U_CDECL_BEGIN
4822 static UBool U_CALLCONV
testCallBackFn(const void * context,int32_t steps)4823 testCallBackFn(const void *context, int32_t steps) {
4824 callBackContext *info = (callBackContext *)context;
4825 if (info->lastSteps+1 != steps) {
4826 info->test->errln("incorrect steps in callback. Expected %d, got %d\n", info->lastSteps+1, steps);
4827 }
4828 info->lastSteps = steps;
4829 info->numCalls++;
4830 return (info->numCalls < info->maxCalls);
4831 }
4832 U_CDECL_END
4833
Callbacks()4834 void RegexTest::Callbacks() {
4835 {
4836 // Getter returns NULLs if no callback has been set
4837
4838 // The variables that the getter will fill in.
4839 // Init to non-null values so that the action of the getter can be seen.
4840 const void *returnedContext = &returnedContext;
4841 URegexMatchCallback *returnedFn = &testCallBackFn;
4842
4843 UErrorCode status = U_ZERO_ERROR;
4844 RegexMatcher matcher("x", 0, status);
4845 REGEX_CHECK_STATUS;
4846 matcher.getMatchCallback(returnedFn, returnedContext, status);
4847 REGEX_CHECK_STATUS;
4848 REGEX_ASSERT(returnedFn == NULL);
4849 REGEX_ASSERT(returnedContext == NULL);
4850 }
4851
4852 {
4853 // Set and Get work
4854 callBackContext cbInfo = {this, 0, 0, 0};
4855 const void *returnedContext;
4856 URegexMatchCallback *returnedFn;
4857 UErrorCode status = U_ZERO_ERROR;
4858 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)+\\2)+x"), 0, status); // A pattern that can run long.
4859 REGEX_CHECK_STATUS;
4860 matcher.setMatchCallback(testCallBackFn, &cbInfo, status);
4861 REGEX_CHECK_STATUS;
4862 matcher.getMatchCallback(returnedFn, returnedContext, status);
4863 REGEX_CHECK_STATUS;
4864 REGEX_ASSERT(returnedFn == testCallBackFn);
4865 REGEX_ASSERT(returnedContext == &cbInfo);
4866
4867 // A short-running match shouldn't invoke the callback
4868 status = U_ZERO_ERROR;
4869 cbInfo.reset(1);
4870 UnicodeString s = "xxx";
4871 matcher.reset(s);
4872 REGEX_ASSERT(matcher.matches(status));
4873 REGEX_CHECK_STATUS;
4874 REGEX_ASSERT(cbInfo.numCalls == 0);
4875
4876 // A medium-length match that runs long enough to invoke the
4877 // callback, but not so long that the callback aborts it.
4878 status = U_ZERO_ERROR;
4879 cbInfo.reset(4);
4880 s = "aaaaaaaaaaaaaaaaaaab";
4881 matcher.reset(s);
4882 REGEX_ASSERT(matcher.matches(status)==FALSE);
4883 REGEX_CHECK_STATUS;
4884 REGEX_ASSERT(cbInfo.numCalls > 0);
4885
4886 // A longer running match that the callback function will abort.
4887 status = U_ZERO_ERROR;
4888 cbInfo.reset(4);
4889 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4890 matcher.reset(s);
4891 REGEX_ASSERT(matcher.matches(status)==FALSE);
4892 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4893 REGEX_ASSERT(cbInfo.numCalls == 4);
4894
4895 // A longer running find that the callback function will abort.
4896 status = U_ZERO_ERROR;
4897 cbInfo.reset(4);
4898 s = "aaaaaaaaaaaaaaaaaaaaaaab";
4899 matcher.reset(s);
4900 REGEX_ASSERT(matcher.find(status)==FALSE);
4901 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
4902 REGEX_ASSERT(cbInfo.numCalls == 4);
4903 }
4904
4905
4906 }
4907
4908
//
//   FindProgressCallbacks()    Test the find "progress" callback function.
//                  When set, the find progress callback will be invoked during find operations
//                  after each return from a match attempt, giving the application the opportunity
//                  to terminate a long-running find operation before its normal completion.
//
4915
4916 struct progressCallBackContext {
4917 RegexTest *test;
4918 int64_t lastIndex;
4919 int32_t maxCalls;
4920 int32_t numCalls;
resetprogressCallBackContext4921 void reset(int32_t max) {maxCalls=max; numCalls=0;lastIndex=0;};
4922 };
4923
4924 // call-back function for find().
4925 // Return TRUE to continue the find().
4926 // Return FALSE to stop the find().
4927 U_CDECL_BEGIN
4928 static UBool U_CALLCONV
testProgressCallBackFn(const void * context,int64_t matchIndex)4929 testProgressCallBackFn(const void *context, int64_t matchIndex) {
4930 progressCallBackContext *info = (progressCallBackContext *)context;
4931 info->numCalls++;
4932 info->lastIndex = matchIndex;
4933 // info->test->infoln("ProgressCallback - matchIndex = %d, numCalls = %d\n", matchIndex, info->numCalls);
4934 return (info->numCalls < info->maxCalls);
4935 }
4936 U_CDECL_END
4937
FindProgressCallbacks()4938 void RegexTest::FindProgressCallbacks() {
4939 {
4940 // Getter returns NULLs if no callback has been set
4941
4942 // The variables that the getter will fill in.
4943 // Init to non-null values so that the action of the getter can be seen.
4944 const void *returnedContext = &returnedContext;
4945 URegexFindProgressCallback *returnedFn = &testProgressCallBackFn;
4946
4947 UErrorCode status = U_ZERO_ERROR;
4948 RegexMatcher matcher("x", 0, status);
4949 REGEX_CHECK_STATUS;
4950 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4951 REGEX_CHECK_STATUS;
4952 REGEX_ASSERT(returnedFn == NULL);
4953 REGEX_ASSERT(returnedContext == NULL);
4954 }
4955
4956 {
4957 // Set and Get work
4958 progressCallBackContext cbInfo = {this, 0, 0, 0};
4959 const void *returnedContext;
4960 URegexFindProgressCallback *returnedFn;
4961 UErrorCode status = U_ZERO_ERROR;
4962 RegexMatcher matcher(UNICODE_STRING_SIMPLE("((.)\\2)x"), 0, status);
4963 REGEX_CHECK_STATUS;
4964 matcher.setFindProgressCallback(testProgressCallBackFn, &cbInfo, status);
4965 REGEX_CHECK_STATUS;
4966 matcher.getFindProgressCallback(returnedFn, returnedContext, status);
4967 REGEX_CHECK_STATUS;
4968 REGEX_ASSERT(returnedFn == testProgressCallBackFn);
4969 REGEX_ASSERT(returnedContext == &cbInfo);
4970
4971 // A find that matches on the initial position does NOT invoke the callback.
4972 status = U_ZERO_ERROR;
4973 cbInfo.reset(100);
4974 UnicodeString s = "aaxxx";
4975 matcher.reset(s);
4976 #if 0
4977 matcher.setTrace(TRUE);
4978 #endif
4979 REGEX_ASSERT(matcher.find(0, status));
4980 REGEX_CHECK_STATUS;
4981 REGEX_ASSERT(cbInfo.numCalls == 0);
4982
4983 // A medium running find() that causes matcher.find() to invoke our callback for each index,
4984 // but not so many times that we interrupt the operation.
4985 status = U_ZERO_ERROR;
4986 s = "aaaaaaaaaaaaaaaaaaab";
4987 cbInfo.reset(s.length()); // Some upper limit for number of calls that is greater than size of our input string
4988 matcher.reset(s);
4989 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4990 REGEX_CHECK_STATUS;
4991 REGEX_ASSERT(cbInfo.numCalls > 0 && cbInfo.numCalls < 25);
4992
4993 // A longer running match that causes matcher.find() to invoke our callback which we cancel/interrupt at some point.
4994 status = U_ZERO_ERROR;
4995 UnicodeString s1 = "aaaaaaaaaaaaaaaaaaaaaaab";
4996 cbInfo.reset(s1.length() - 5); // Bail early somewhere near the end of input string
4997 matcher.reset(s1);
4998 REGEX_ASSERT(matcher.find(0, status)==FALSE);
4999 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
5000 REGEX_ASSERT(cbInfo.numCalls == s1.length() - 5);
5001
5002 // Now a match that will succeed, but after an interruption
5003 status = U_ZERO_ERROR;
5004 UnicodeString s2 = "aaaaaaaaaaaaaa aaaaaaaaab xxx";
5005 cbInfo.reset(s2.length() - 10); // Bail early somewhere near the end of input string
5006 matcher.reset(s2);
5007 REGEX_ASSERT(matcher.find(0, status)==FALSE);
5008 REGEX_ASSERT(status == U_REGEX_STOPPED_BY_CALLER);
5009 // Now retry the match from where left off
5010 cbInfo.maxCalls = 100; // No callback limit
5011 status = U_ZERO_ERROR;
5012 REGEX_ASSERT(matcher.find(cbInfo.lastIndex, status));
5013 REGEX_CHECK_STATUS;
5014 }
5015
5016
5017 }
5018
5019
5020 //---------------------------------------------------------------------------
5021 //
5022 // PreAllocatedUTextCAPI Check the C API with pre-allocated mutable
5023 // UTexts. The pure-C implementation of UText
5024 // has no mutable backing stores, but we can
5025 // use UnicodeString here to test the functionality.
5026 //
5027 //---------------------------------------------------------------------------
PreAllocatedUTextCAPI()5028 void RegexTest::PreAllocatedUTextCAPI () {
5029 UErrorCode status = U_ZERO_ERROR;
5030 URegularExpression *re;
5031 UText patternText = UTEXT_INITIALIZER;
5032 UnicodeString buffer;
5033 UText bufferText = UTEXT_INITIALIZER;
5034
5035 utext_openUnicodeString(&bufferText, &buffer, &status);
5036
5037 /*
5038 * getText() and getUText()
5039 */
5040 {
5041 UText text1 = UTEXT_INITIALIZER;
5042 UText text2 = UTEXT_INITIALIZER;
5043 UChar text2Chars[20];
5044 UText *resultText;
5045
5046 status = U_ZERO_ERROR;
5047 regextst_openUTF8FromInvariant(&text1, "abcccd", -1, &status);
5048 regextst_openUTF8FromInvariant(&text2, "abcccxd", -1, &status);
5049 u_uastrncpy(text2Chars, "abcccxd", sizeof(text2)/2);
5050 utext_openUChars(&text2, text2Chars, -1, &status);
5051
5052 regextst_openUTF8FromInvariant(&patternText, "abc*d", -1, &status);
5053 re = uregex_openUText(&patternText, 0, NULL, &status);
5054
5055 /* First set a UText */
5056 uregex_setUText(re, &text1, &status);
5057 resultText = uregex_getUText(re, &bufferText, &status);
5058 REGEX_CHECK_STATUS;
5059 REGEX_ASSERT(resultText == &bufferText);
5060 utext_setNativeIndex(resultText, 0);
5061 utext_setNativeIndex(&text1, 0);
5062 REGEX_ASSERT(testUTextEqual(resultText, &text1));
5063
5064 resultText = uregex_getUText(re, &bufferText, &status);
5065 REGEX_CHECK_STATUS;
5066 REGEX_ASSERT(resultText == &bufferText);
5067 utext_setNativeIndex(resultText, 0);
5068 utext_setNativeIndex(&text1, 0);
5069 REGEX_ASSERT(testUTextEqual(resultText, &text1));
5070
5071 /* Then set a UChar * */
5072 uregex_setText(re, text2Chars, 7, &status);
5073 resultText = uregex_getUText(re, &bufferText, &status);
5074 REGEX_CHECK_STATUS;
5075 REGEX_ASSERT(resultText == &bufferText);
5076 utext_setNativeIndex(resultText, 0);
5077 utext_setNativeIndex(&text2, 0);
5078 REGEX_ASSERT(testUTextEqual(resultText, &text2));
5079
5080 uregex_close(re);
5081 utext_close(&text1);
5082 utext_close(&text2);
5083 }
5084
5085 /*
5086 * group()
5087 */
5088 {
5089 UChar text1[80];
5090 UText *actual;
5091 UBool result;
5092 int64_t length = 0;
5093
5094 u_uastrncpy(text1, "noise abc interior def, and this is off the end", UPRV_LENGTHOF(text1));
5095 // 012345678901234567890123456789012345678901234567
5096 // 0 1 2 3 4
5097
5098 status = U_ZERO_ERROR;
5099 re = uregex_openC("abc(.*?)def", 0, NULL, &status);
5100 REGEX_CHECK_STATUS;
5101
5102 uregex_setText(re, text1, -1, &status);
5103 result = uregex_find(re, 0, &status);
5104 REGEX_ASSERT(result==TRUE);
5105
5106 /* Capture Group 0, the full match. Should succeed. "abc interior def" */
5107 status = U_ZERO_ERROR;
5108 actual = uregex_groupUText(re, 0, &bufferText, &length, &status);
5109 REGEX_CHECK_STATUS;
5110 REGEX_ASSERT(actual == &bufferText);
5111 REGEX_ASSERT(utext_getNativeIndex(actual) == 6);
5112 REGEX_ASSERT(length == 16);
5113 REGEX_ASSERT(utext_nativeLength(actual) == 47);
5114
5115 /* Capture group #1. Should succeed, matching " interior ". */
5116 status = U_ZERO_ERROR;
5117 actual = uregex_groupUText(re, 1, &bufferText, &length, &status);
5118 REGEX_CHECK_STATUS;
5119 REGEX_ASSERT(actual == &bufferText);
5120 REGEX_ASSERT(utext_getNativeIndex(actual) == 9); // position of " interior "
5121 REGEX_ASSERT(length == 10);
5122 REGEX_ASSERT(utext_nativeLength(actual) == 47);
5123
5124 /* Capture group out of range. Error. */
5125 status = U_ZERO_ERROR;
5126 actual = uregex_groupUText(re, 2, &bufferText, &length, &status);
5127 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5128 REGEX_ASSERT(actual == &bufferText);
5129 uregex_close(re);
5130
5131 }
5132
5133 /*
5134 * replaceFirst()
5135 */
5136 {
5137 UChar text1[80];
5138 UChar text2[80];
5139 UText replText = UTEXT_INITIALIZER;
5140 UText *result;
5141 status = U_ZERO_ERROR;
5142 utext_openUnicodeString(&bufferText, &buffer, &status);
5143
5144 status = U_ZERO_ERROR;
5145 u_uastrncpy(text1, "Replace xaax x1x x...x.", UPRV_LENGTHOF(text1));
5146 u_uastrncpy(text2, "No match here.", UPRV_LENGTHOF(text2)/2);
5147 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5148
5149 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5150 REGEX_CHECK_STATUS;
5151
5152 /* Normal case, with match */
5153 uregex_setText(re, text1, -1, &status);
5154 REGEX_CHECK_STATUS;
5155 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5156 REGEX_CHECK_STATUS;
5157 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5158 REGEX_CHECK_STATUS;
5159 REGEX_ASSERT(result == &bufferText);
5160 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> x1x x...x.", result);
5161
5162 /* No match. Text should copy to output with no changes. */
5163 uregex_setText(re, text2, -1, &status);
5164 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5165 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5166 REGEX_CHECK_STATUS;
5167 REGEX_ASSERT(result == &bufferText);
5168 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5169
5170 /* Unicode escapes */
5171 uregex_setText(re, text1, -1, &status);
5172 regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
5173 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5174 result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
5175 REGEX_CHECK_STATUS;
5176 REGEX_ASSERT(result == &bufferText);
5177 REGEX_ASSERT_UTEXT_INVARIANT("Replace \\AaaB$a x1x x...x.", result);
5178
5179 uregex_close(re);
5180 utext_close(&replText);
5181 }
5182
5183
5184 /*
5185 * replaceAll()
5186 */
5187 {
5188 UChar text1[80];
5189 UChar text2[80];
5190 UText replText = UTEXT_INITIALIZER;
5191 UText *result;
5192
5193 status = U_ZERO_ERROR;
5194 u_uastrncpy(text1, "Replace xaax x1x x...x.", sizeof(text1)/2);
5195 u_uastrncpy(text2, "No match here.", sizeof(text2)/2);
5196 regextst_openUTF8FromInvariant(&replText, "<$1>", -1, &status);
5197
5198 re = uregex_openC("x(.*?)x", 0, NULL, &status);
5199 REGEX_CHECK_STATUS;
5200
5201 /* Normal case, with match */
5202 uregex_setText(re, text1, -1, &status);
5203 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5204 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5205 REGEX_CHECK_STATUS;
5206 REGEX_ASSERT(result == &bufferText);
5207 REGEX_ASSERT_UTEXT_INVARIANT("Replace <aa> <1> <...>.", result);
5208
5209 /* No match. Text should copy to output with no changes. */
5210 uregex_setText(re, text2, -1, &status);
5211 utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
5212 result = uregex_replaceAllUText(re, &replText, &bufferText, &status);
5213 REGEX_CHECK_STATUS;
5214 REGEX_ASSERT(result == &bufferText);
5215 REGEX_ASSERT_UTEXT_INVARIANT("No match here.", result);
5216
5217 uregex_close(re);
5218 utext_close(&replText);
5219 }
5220
5221
5222 /*
5223 * splitUText() uses the C++ API directly, and the UnicodeString version uses mutable UTexts,
5224 * so we don't need to test it here.
5225 */
5226
5227 utext_close(&bufferText);
5228 utext_close(&patternText);
5229 }
5230
5231
5232 //--------------------------------------------------------------
5233 //
5234 // NamedCapture Check basic named capture group functionality
5235 //
5236 //--------------------------------------------------------------
NamedCapture()5237 void RegexTest::NamedCapture() {
5238 UErrorCode status = U_ZERO_ERROR;
5239 RegexPattern *pat = RegexPattern::compile(UnicodeString(
5240 "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
5241 REGEX_CHECK_STATUS;
5242 int32_t group = pat->groupNumberFromName("five", -1, status);
5243 REGEX_CHECK_STATUS;
5244 REGEX_ASSERT(5 == group);
5245 group = pat->groupNumberFromName("three", -1, status);
5246 REGEX_CHECK_STATUS;
5247 REGEX_ASSERT(3 == group);
5248
5249 status = U_ZERO_ERROR;
5250 group = pat->groupNumberFromName(UnicodeString("six"), status);
5251 REGEX_CHECK_STATUS;
5252 REGEX_ASSERT(6 == group);
5253
5254 status = U_ZERO_ERROR;
5255 group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
5256 U_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5257
5258 status = U_ZERO_ERROR;
5259
5260 // After copying a pattern, named capture should still work in the copy.
5261 RegexPattern *copiedPat = new RegexPattern(*pat);
5262 REGEX_ASSERT(*copiedPat == *pat);
5263 delete pat; pat = NULL; // Delete original, copy should have no references back to it.
5264
5265 group = copiedPat->groupNumberFromName("five", -1, status);
5266 REGEX_CHECK_STATUS;
5267 REGEX_ASSERT(5 == group);
5268 group = copiedPat->groupNumberFromName("three", -1, status);
5269 REGEX_CHECK_STATUS;
5270 REGEX_ASSERT(3 == group);
5271 delete copiedPat;
5272
5273 // ReplaceAll with named capture group.
5274 status = U_ZERO_ERROR;
5275 UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
5276 RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
5277 REGEX_CHECK_STATUS;
5278 // m.pattern().dumpPattern();
5279 UnicodeString replacedText = m->replaceAll("'${mid}'", status);
5280 REGEX_CHECK_STATUS;
5281 REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
5282 delete m;
5283
5284 // ReplaceAll, allowed capture group numbers.
5285 text = UnicodeString("abcmxyz");
5286 m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
5287 REGEX_CHECK_STATUS;
5288
5289 status = U_ZERO_ERROR;
5290 replacedText = m->replaceAll(UnicodeString("<$0>"), status); // group 0, full match, is allowed.
5291 REGEX_CHECK_STATUS;
5292 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
5293
5294 status = U_ZERO_ERROR;
5295 replacedText = m->replaceAll(UnicodeString("<$1>"), status); // group 1 by number.
5296 REGEX_CHECK_STATUS;
5297 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5298
5299 status = U_ZERO_ERROR;
5300 replacedText = m->replaceAll(UnicodeString("<${one}>"), status); // group 1 by name.
5301 REGEX_CHECK_STATUS;
5302 REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
5303
5304 status = U_ZERO_ERROR;
5305 replacedText = m->replaceAll(UnicodeString("<$2>"), status); // group 2.
5306 REGEX_CHECK_STATUS;
5307 REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
5308
5309 status = U_ZERO_ERROR;
5310 replacedText = m->replaceAll(UnicodeString("<$3>"), status);
5311 REGEX_CHECK_STATUS;
5312 REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
5313
5314 status = U_ZERO_ERROR;
5315 replacedText = m->replaceAll(UnicodeString("<$4>"), status);
5316 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5317
5318 status = U_ZERO_ERROR;
5319 replacedText = m->replaceAll(UnicodeString("<$04>"), status); // group 0, leading 0,
5320 REGEX_CHECK_STATUS; // trailing out-of-range 4 passes through.
5321 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
5322
5323 status = U_ZERO_ERROR;
5324 replacedText = m->replaceAll(UnicodeString("<$000016>"), status); // Consume leading zeroes. Don't consume digits
5325 REGEX_CHECK_STATUS; // that push group num out of range.
5326 REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText); // This is group 1.
5327
5328 status = U_ZERO_ERROR;
5329 replacedText = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
5330 REGEX_CHECK_STATUS;
5331 REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
5332
5333 status = U_ZERO_ERROR;
5334 replacedText = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
5335 REGEX_CHECK_STATUS;
5336 REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
5337
5338 status = U_ZERO_ERROR;
5339 replacedText = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
5340 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5341
5342 status = U_ZERO_ERROR;
5343 replacedText = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
5344 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5345
5346 status = U_ZERO_ERROR;
5347 replacedText = m->replaceAll(UnicodeString("<${one"), status);
5348 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5349
5350 status = U_ZERO_ERROR;
5351 replacedText = m->replaceAll(UnicodeString("$not a capture group"), status);
5352 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5353
5354 delete m;
5355
5356 // Repeat the above replaceAll() tests using the plain C API, which
5357 // has a separate implementation internally.
5358 // TODO: factor out the test data.
5359
5360 status = U_ZERO_ERROR;
5361 URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
5362 REGEX_CHECK_STATUS;
5363 text = UnicodeString("abcmxyz");
5364 uregex_setText(re, text.getBuffer(), text.length(), &status);
5365 REGEX_CHECK_STATUS;
5366
5367 UChar resultBuf[100];
5368 int32_t resultLength;
5369 UnicodeString repl;
5370
5371 status = U_ZERO_ERROR;
5372 repl = UnicodeString("<$0>");
5373 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5374 REGEX_CHECK_STATUS;
5375 REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
5376
5377 status = U_ZERO_ERROR;
5378 repl = UnicodeString("<$1>");
5379 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5380 REGEX_CHECK_STATUS;
5381 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5382
5383 status = U_ZERO_ERROR;
5384 repl = UnicodeString("<${one}>");
5385 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5386 REGEX_CHECK_STATUS;
5387 REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
5388
5389 status = U_ZERO_ERROR;
5390 repl = UnicodeString("<$2>");
5391 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5392 REGEX_CHECK_STATUS;
5393 REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
5394
5395 status = U_ZERO_ERROR;
5396 repl = UnicodeString("<$3>");
5397 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5398 REGEX_CHECK_STATUS;
5399 REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
5400
5401 status = U_ZERO_ERROR;
5402 repl = UnicodeString("<$4>");
5403 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5404 REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
5405
5406 status = U_ZERO_ERROR;
5407 repl = UnicodeString("<$04>");
5408 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5409 REGEX_CHECK_STATUS;
5410 REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
5411
5412 status = U_ZERO_ERROR;
5413 repl = UnicodeString("<$000016>");
5414 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5415 REGEX_CHECK_STATUS;
5416 REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
5417
5418 status = U_ZERO_ERROR;
5419 repl = UnicodeString("<$3$2$1${one}>");
5420 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5421 REGEX_CHECK_STATUS;
5422 REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
5423
5424 status = U_ZERO_ERROR;
5425 repl = UnicodeString("$3$2$1${one}");
5426 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5427 REGEX_CHECK_STATUS;
5428 REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
5429
5430 status = U_ZERO_ERROR;
5431 repl = UnicodeString("<${noSuchName}>");
5432 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5433 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5434
5435 status = U_ZERO_ERROR;
5436 repl = UnicodeString("<${invalid-name}>");
5437 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5438 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5439
5440 status = U_ZERO_ERROR;
5441 repl = UnicodeString("<${one");
5442 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5443 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5444
5445 status = U_ZERO_ERROR;
5446 repl = UnicodeString("$not a capture group");
5447 resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
5448 REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
5449
5450 uregex_close(re);
5451 }
5452
5453 //--------------------------------------------------------------
5454 //
5455 // NamedCaptureLimits Patterns with huge numbers of named capture groups.
5456 // The point is not so much what the exact limit is,
5457 // but that a largish number doesn't hit bad non-linear performance,
5458 // and that exceeding the limit fails cleanly.
5459 //
5460 //--------------------------------------------------------------
NamedCaptureLimits()5461 void RegexTest::NamedCaptureLimits() {
5462 if (quick) {
5463 logln("Skipping test. Runs in exhuastive mode only.");
5464 return;
5465 }
5466 const int32_t goodLimit = 1000000; // Pattern w this many groups builds successfully.
5467 const int32_t failLimit = 10000000; // Pattern exceeds internal limits, fails to compile.
5468 char nnbuf[100];
5469 UnicodeString pattern;
5470 int32_t nn;
5471
5472 for (nn=1; nn<goodLimit; nn++) {
5473 sprintf(nnbuf, "(?<nn%d>)", nn);
5474 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5475 }
5476 UErrorCode status = U_ZERO_ERROR;
5477 RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
5478 REGEX_CHECK_STATUS;
5479 for (nn=1; nn<goodLimit; nn++) {
5480 sprintf(nnbuf, "nn%d", nn);
5481 int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
5482 REGEX_ASSERT(nn == groupNum);
5483 if (nn != groupNum) {
5484 break;
5485 }
5486 }
5487 delete pat;
5488
5489 pattern.remove();
5490 for (nn=1; nn<failLimit; nn++) {
5491 sprintf(nnbuf, "(?<nn%d>)", nn);
5492 pattern.append(UnicodeString(nnbuf, -1, US_INV));
5493 }
5494 status = U_ZERO_ERROR;
5495 pat = RegexPattern::compile(pattern, 0, status);
5496 REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
5497 delete pat;
5498 }
5499
5500
5501 //--------------------------------------------------------------
5502 //
5503 // Bug7651 Regex pattern that exceeds default operator stack depth in matcher.
5504 //
5505 //---------------------------------------------------------------
Bug7651()5506 void RegexTest::Bug7651() {
5507 UnicodeString pattern1("((?<![A-Za-z0-9])[#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|(?<![A-Za-z0-9_])[@\\uff20][A-Za-z0-9_]+(?:\\/[\\w-]+)?|(https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|\\$[A-Za-z]+)");
5508 // The following should exceed the default operator stack depth in the matcher, i.e. force the matcher to malloc instead of using fSmallData.
5509 // It will cause a segfault if RegexMatcher tries to use fSmallData instead of malloc'ing the memory needed (see init2) for the pattern operator stack allocation.
5510 UnicodeString pattern2("((https?\\:\\/\\/|www\\.)\\S+(?<![\\!\\),\\.:;\\]\\u0080-\\uFFFF])|(?<![A-Za-z0-9_])[\\@\\uff20][A-Za-z0-9_]+(?:\\/[\\w\\-]+)?|(?<![A-Za-z0-9])[\\#\\uff03][A-Za-z0-9_][A-Za-z0-9_\\u00c0-\\u00d6\\u00c8-\\u00f6\\u00f8-\\u00ff]*|\\$[A-Za-z]+)");
5511 UnicodeString s("#ff @abcd This is test");
5512 RegexPattern *REPattern = NULL;
5513 RegexMatcher *REMatcher = NULL;
5514 UErrorCode status = U_ZERO_ERROR;
5515 UParseError pe;
5516
5517 REPattern = RegexPattern::compile(pattern1, 0, pe, status);
5518 REGEX_CHECK_STATUS;
5519 REMatcher = REPattern->matcher(s, status);
5520 REGEX_CHECK_STATUS;
5521 REGEX_ASSERT(REMatcher->find());
5522 REGEX_ASSERT(REMatcher->start(status) == 0);
5523 delete REPattern;
5524 delete REMatcher;
5525 status = U_ZERO_ERROR;
5526
5527 REPattern = RegexPattern::compile(pattern2, 0, pe, status);
5528 REGEX_CHECK_STATUS;
5529 REMatcher = REPattern->matcher(s, status);
5530 REGEX_CHECK_STATUS;
5531 REGEX_ASSERT(REMatcher->find());
5532 REGEX_ASSERT(REMatcher->start(status) == 0);
5533 delete REPattern;
5534 delete REMatcher;
5535 status = U_ZERO_ERROR;
5536 }
5537
Bug7740()5538 void RegexTest::Bug7740() {
5539 UErrorCode status = U_ZERO_ERROR;
5540 UnicodeString pattern = "(a)";
5541 UnicodeString text = "abcdef";
5542 RegexMatcher *m = new RegexMatcher(pattern, text, 0, status);
5543 REGEX_CHECK_STATUS;
5544 REGEX_ASSERT(m->lookingAt(status));
5545 REGEX_CHECK_STATUS;
5546 status = U_ILLEGAL_ARGUMENT_ERROR;
5547 UnicodeString s = m->group(1, status); // Bug 7740: segfault here.
5548 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5549 REGEX_ASSERT(s == "");
5550 delete m;
5551 }
5552
5553 // Bug 8479: was crashing whith a Bogus UnicodeString as input.
5554
Bug8479()5555 void RegexTest::Bug8479() {
5556 UErrorCode status = U_ZERO_ERROR;
5557
5558 RegexMatcher* const pMatcher = new RegexMatcher("\\Aboo\\z", UREGEX_DOTALL|UREGEX_CASE_INSENSITIVE, status);
5559 REGEX_CHECK_STATUS;
5560 if (U_SUCCESS(status))
5561 {
5562 UnicodeString str;
5563 str.setToBogus();
5564 pMatcher->reset(str);
5565 status = U_ZERO_ERROR;
5566 pMatcher->matches(status);
5567 REGEX_ASSERT(status == U_ILLEGAL_ARGUMENT_ERROR);
5568 delete pMatcher;
5569 }
5570 }
5571
5572
5573 // Bug 7029
Bug7029()5574 void RegexTest::Bug7029() {
5575 UErrorCode status = U_ZERO_ERROR;
5576
5577 RegexMatcher* const pMatcher = new RegexMatcher(".", 0, status);
5578 UnicodeString text = "abc.def";
5579 UnicodeString splits[10];
5580 REGEX_CHECK_STATUS;
5581 int32_t numFields = pMatcher->split(text, splits, 10, status);
5582 REGEX_CHECK_STATUS;
5583 REGEX_ASSERT(numFields == 8);
5584 delete pMatcher;
5585 }
5586
5587 // Bug 9283
5588 // This test is checking for the existance of any supplemental characters that case-fold
5589 // to a bmp character.
5590 //
5591 // At the time of this writing there are none. If any should appear in a subsequent release
5592 // of Unicode, the code in regular expressions compilation that determines the longest
5593 // posssible match for a literal string will need to be enhanced.
5594 //
5595 // See file regexcmp.cpp, case URX_STRING_I in RegexCompile::maxMatchLength()
5596 // for details on what to do in case of a failure of this test.
5597 //
Bug9283()5598 void RegexTest::Bug9283() {
5599 #if !UCONFIG_NO_NORMALIZATION
5600 UErrorCode status = U_ZERO_ERROR;
5601 UnicodeSet supplementalsWithCaseFolding("[[:CWCF:]&[\\U00010000-\\U0010FFFF]]", status);
5602 REGEX_CHECK_STATUS;
5603 int32_t index;
5604 UChar32 c;
5605 for (index=0; ; index++) {
5606 c = supplementalsWithCaseFolding.charAt(index);
5607 if (c == -1) {
5608 break;
5609 }
5610 UnicodeString cf = UnicodeString(c).foldCase();
5611 REGEX_ASSERT(cf.length() >= 2);
5612 }
5613 #endif /* #if !UCONFIG_NO_NORMALIZATION */
5614 }
5615
5616
CheckInvBufSize()5617 void RegexTest::CheckInvBufSize() {
5618 if(inv_next>=INV_BUFSIZ) {
5619 errln("%s: increase #define of INV_BUFSIZ ( is %d but needs to be at least %d )\n",
5620 __FILE__, INV_BUFSIZ, inv_next);
5621 } else {
5622 logln("%s: INV_BUFSIZ is %d, usage %d\n", __FILE__, INV_BUFSIZ, inv_next);
5623 }
5624 }
5625
5626
Bug10459()5627 void RegexTest::Bug10459() {
5628 UErrorCode status = U_ZERO_ERROR;
5629 UnicodeString patternString("(txt)");
5630 UnicodeString txtString("txt");
5631
5632 UText *utext_pat = utext_openUnicodeString(NULL, &patternString, &status);
5633 REGEX_CHECK_STATUS;
5634 UText *utext_txt = utext_openUnicodeString(NULL, &txtString, &status);
5635 REGEX_CHECK_STATUS;
5636
5637 URegularExpression *icu_re = uregex_openUText(utext_pat, 0, NULL, &status);
5638 REGEX_CHECK_STATUS;
5639
5640 uregex_setUText(icu_re, utext_txt, &status);
5641 REGEX_CHECK_STATUS;
5642
5643 // The bug was that calling uregex_group() before doing a matching operation
5644 // was causing a segfault. Only for Regular Expressions created from UText.
5645 // It should set an U_REGEX_INVALID_STATE.
5646
5647 UChar buf[100];
5648 int32_t len = uregex_group(icu_re, 0, buf, UPRV_LENGTHOF(buf), &status);
5649 REGEX_ASSERT(status == U_REGEX_INVALID_STATE);
5650 REGEX_ASSERT(len == 0);
5651
5652 uregex_close(icu_re);
5653 utext_close(utext_pat);
5654 utext_close(utext_txt);
5655 }
5656
TestCaseInsensitiveStarters()5657 void RegexTest::TestCaseInsensitiveStarters() {
5658 // Test that the data used by RegexCompile::findCaseInsensitiveStarters() hasn't
5659 // become stale because of new Unicode characters.
5660 // If it is stale, rerun the generation tool
5661 // svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
5662 // and replace the embedded data in i18n/regexcmp.cpp
5663
5664 for (UChar32 cp=0; cp<=0x10ffff; cp++) {
5665 if (!u_hasBinaryProperty(cp, UCHAR_CASE_SENSITIVE)) {
5666 continue;
5667 }
5668 UnicodeSet s(cp, cp);
5669 s.closeOver(USET_CASE_INSENSITIVE);
5670 UnicodeSetIterator setIter(s);
5671 while (setIter.next()) {
5672 if (!setIter.isString()) {
5673 continue;
5674 }
5675 const UnicodeString &str = setIter.getString();
5676 UChar32 firstChar = str.char32At(0);
5677 UnicodeSet starters;
5678 RegexCompile::findCaseInsensitiveStarters(firstChar, &starters);
5679 if (!starters.contains(cp)) {
5680 errln("CaseInsensitiveStarters for \\u%x is missing character \\u%x.", cp, firstChar);
5681 return;
5682 }
5683 }
5684 }
5685 }
5686
5687
TestBug11049()5688 void RegexTest::TestBug11049() {
5689 // Original bug report: pattern with match start consisting of one of several individual characters,
5690 // and the text being matched ending with a supplementary character. find() would read past the
5691 // end of the input text when searching for potential match starting points.
5692
5693 // To see the problem, the text must exactly fill an allocated buffer, so that valgrind will
5694 // detect the bad read.
5695
5696 TestCase11049("A|B|C", "a string \\ud800\\udc00", FALSE, __LINE__);
5697 TestCase11049("A|B|C", "string matches at end C", TRUE, __LINE__);
5698
5699 // Test again with a pattern starting with a single character,
5700 // which takes a different code path than starting with an OR expression,
5701 // but with similar logic.
5702 TestCase11049("C", "a string \\ud800\\udc00", FALSE, __LINE__);
5703 TestCase11049("C", "string matches at end C", TRUE, __LINE__);
5704 }
5705
5706 // Run a single test case from TestBug11049(). Internal function.
TestCase11049(const char * pattern,const char * data,UBool expectMatch,int32_t lineNumber)5707 void RegexTest::TestCase11049(const char *pattern, const char *data, UBool expectMatch, int32_t lineNumber) {
5708 UErrorCode status = U_ZERO_ERROR;
5709 UnicodeString patternString = UnicodeString(pattern).unescape();
5710 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5711
5712 UnicodeString dataString = UnicodeString(data).unescape();
5713 UChar *exactBuffer = new UChar[dataString.length()];
5714 dataString.extract(exactBuffer, dataString.length(), status);
5715 UText *ut = utext_openUChars(NULL, exactBuffer, dataString.length(), &status);
5716
5717 LocalPointer<RegexMatcher> matcher(compiledPat->matcher(status));
5718 REGEX_CHECK_STATUS;
5719 matcher->reset(ut);
5720 UBool result = matcher->find();
5721 if (result != expectMatch) {
5722 errln("File %s, line %d: expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5723 __FILE__, lineNumber, expectMatch, result, pattern, data);
5724 }
5725
5726 // Rerun test with UTF-8 input text. Won't see buffer overreads, but could see
5727 // off-by-one on find() with match at the last code point.
5728 // Size of the original char * data (invariant charset) will be <= than the equivalent UTF-8
5729 // because string.unescape() will only shrink it.
5730 char * utf8Buffer = new char[uprv_strlen(data)+1];
5731 u_strToUTF8(utf8Buffer, uprv_strlen(data)+1, NULL, dataString.getBuffer(), dataString.length(), &status);
5732 REGEX_CHECK_STATUS;
5733 ut = utext_openUTF8(ut, utf8Buffer, -1, &status);
5734 REGEX_CHECK_STATUS;
5735 matcher->reset(ut);
5736 result = matcher->find();
5737 if (result != expectMatch) {
5738 errln("File %s, line %d (UTF-8 check): expected %d, got %d. Pattern = \"%s\", text = \"%s\"",
5739 __FILE__, lineNumber, expectMatch, result, pattern, data);
5740 }
5741 delete [] utf8Buffer;
5742
5743 utext_close(ut);
5744 delete [] exactBuffer;
5745 }
5746
5747
TestBug11371()5748 void RegexTest::TestBug11371() {
5749 if (quick) {
5750 logln("Skipping test. Runs in exhuastive mode only.");
5751 return;
5752 }
5753 UErrorCode status = U_ZERO_ERROR;
5754 UnicodeString patternString;
5755
5756 for (int i=0; i<8000000; i++) {
5757 patternString.append(UnicodeString("()"));
5758 }
5759 LocalPointer<RegexPattern> compiledPat(RegexPattern::compile(patternString, 0, status));
5760 if (status != U_REGEX_PATTERN_TOO_BIG) {
5761 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5762 __FILE__, __LINE__, u_errorName(status));
5763 }
5764
5765 status = U_ZERO_ERROR;
5766 patternString = "(";
5767 for (int i=0; i<20000000; i++) {
5768 patternString.append(UnicodeString("A++"));
5769 }
5770 patternString.append(UnicodeString("){0}B++"));
5771 LocalPointer<RegexPattern> compiledPat2(RegexPattern::compile(patternString, 0, status));
5772 if (status != U_REGEX_PATTERN_TOO_BIG) {
5773 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5774 __FILE__, __LINE__, u_errorName(status));
5775 }
5776
5777 // Pattern with too much string data, such that string indexes overflow operand data field size
5778 // in compiled instruction.
5779 status = U_ZERO_ERROR;
5780 patternString = "";
5781 while (patternString.length() < 0x00ffffff) {
5782 patternString.append(UnicodeString("stuff and things dont you know, these are a few of my favorite strings\n"));
5783 }
5784 patternString.append(UnicodeString("X? trailing string"));
5785 LocalPointer<RegexPattern> compiledPat3(RegexPattern::compile(patternString, 0, status));
5786 if (status != U_REGEX_PATTERN_TOO_BIG) {
5787 errln("File %s, line %d expected status=U_REGEX_PATTERN_TOO_BIG; got %s.",
5788 __FILE__, __LINE__, u_errorName(status));
5789 }
5790 }
5791
TestBug11480()5792 void RegexTest::TestBug11480() {
5793 // C API, get capture group of a group that does not participate in the match.
5794 // (Returns a zero length string, with nul termination,
5795 // indistinguishable from a group with a zero length match.)
5796
5797 UErrorCode status = U_ZERO_ERROR;
5798 URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
5799 REGEX_CHECK_STATUS;
5800 UnicodeString text = UNICODE_STRING_SIMPLE("A");
5801 uregex_setText(re, text.getBuffer(), text.length(), &status);
5802 REGEX_CHECK_STATUS;
5803 REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
5804 UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
5805 int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
5806 REGEX_ASSERT(length == 0);
5807 REGEX_ASSERT(buf[0] == 13);
5808 REGEX_ASSERT(buf[1] == 0);
5809 REGEX_ASSERT(buf[2] == 13);
5810 uregex_close(re);
5811
5812 // UText C++ API, length of match is 0 for non-participating matches.
5813 UText ut = UTEXT_INITIALIZER;
5814 utext_openUnicodeString(&ut, &text, &status);
5815 RegexMatcher matcher(UnicodeString("(A)|(B)"), 0, status);
5816 REGEX_CHECK_STATUS;
5817 matcher.reset(&ut);
5818 REGEX_ASSERT(matcher.lookingAt(0, status));
5819
5820 // UText C++ API, Capture group 1 matches "A", position 0, length 1.
5821 int64_t groupLen = -666;
5822 UText group = UTEXT_INITIALIZER;
5823 matcher.group(1, &group, groupLen, status);
5824 REGEX_CHECK_STATUS;
5825 REGEX_ASSERT(groupLen == 1);
5826 REGEX_ASSERT(utext_getNativeIndex(&group) == 0);
5827
5828 // Capture group 2, the (B), does not participate in the match.
5829 matcher.group(2, &group, groupLen, status);
5830 REGEX_CHECK_STATUS;
5831 REGEX_ASSERT(groupLen == 0);
5832 REGEX_ASSERT(matcher.start(2, status) == -1);
5833 REGEX_CHECK_STATUS;
5834 }
5835
5836
5837 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */
5838