1 /*
2 **************************************************************************
3 * Copyright (C) 2002-2015 International Business Machines Corporation *
4 * and others. All rights reserved. *
5 **************************************************************************
6 */
7 //
8 // file: rematch.cpp
9 //
10 // Contains the implementation of class RegexMatcher,
11 // which is one of the main API classes for the ICU regular expression package.
12 //
13
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
22 #include "unicode/utf.h"
23 #include "unicode/utf16.h"
24 #include "uassert.h"
25 #include "cmemory.h"
26 #include "uvector.h"
27 #include "uvectr32.h"
28 #include "uvectr64.h"
29 #include "regeximp.h"
30 #include "regexst.h"
31 #include "regextxt.h"
32 #include "ucase.h"
33
34 // #include <malloc.h> // Needed for heapcheck testing
35
36 U_NAMESPACE_BEGIN
37
38 // Default limit for the size of the back track stack, to avoid system
39 // failures causedby heap exhaustion. Units are in 32 bit words, not bytes.
40 // This value puts ICU's limits higher than most other regexp implementations,
41 // which use recursion rather than the heap, and take more storage per
42 // backtrack point.
43 //
44 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
45
46 // Time limit counter constant.
47 // Time limits for expression evaluation are in terms of quanta of work by
48 // the engine, each of which is 10,000 state saves.
49 // This constant determines that state saves per tick number.
50 static const int32_t TIMER_INITIAL_VALUE = 10000;
51
52
53 // Test for any of the Unicode line terminating characters.
isLineTerminator(UChar32 c)54 static inline UBool isLineTerminator(UChar32 c) {
55 if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
56 return false;
57 }
58 return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
59 }
60
61 //-----------------------------------------------------------------------------
62 //
63 // Constructor and Destructor
64 //
65 //-----------------------------------------------------------------------------
RegexMatcher(const RegexPattern * pat)66 RegexMatcher::RegexMatcher(const RegexPattern *pat) {
67 fDeferredStatus = U_ZERO_ERROR;
68 init(fDeferredStatus);
69 if (U_FAILURE(fDeferredStatus)) {
70 return;
71 }
72 if (pat==NULL) {
73 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
74 return;
75 }
76 fPattern = pat;
77 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
78 }
79
80
81
RegexMatcher(const UnicodeString & regexp,const UnicodeString & input,uint32_t flags,UErrorCode & status)82 RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input,
83 uint32_t flags, UErrorCode &status) {
84 init(status);
85 if (U_FAILURE(status)) {
86 return;
87 }
88 UParseError pe;
89 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
90 fPattern = fPatternOwned;
91
92 UText inputText = UTEXT_INITIALIZER;
93 utext_openConstUnicodeString(&inputText, &input, &status);
94 init2(&inputText, status);
95 utext_close(&inputText);
96
97 fInputUniStrMaybeMutable = TRUE;
98 }
99
100
RegexMatcher(UText * regexp,UText * input,uint32_t flags,UErrorCode & status)101 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
102 uint32_t flags, UErrorCode &status) {
103 init(status);
104 if (U_FAILURE(status)) {
105 return;
106 }
107 UParseError pe;
108 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
109 if (U_FAILURE(status)) {
110 return;
111 }
112
113 fPattern = fPatternOwned;
114 init2(input, status);
115 }
116
117
RegexMatcher(const UnicodeString & regexp,uint32_t flags,UErrorCode & status)118 RegexMatcher::RegexMatcher(const UnicodeString ®exp,
119 uint32_t flags, UErrorCode &status) {
120 init(status);
121 if (U_FAILURE(status)) {
122 return;
123 }
124 UParseError pe;
125 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
126 if (U_FAILURE(status)) {
127 return;
128 }
129 fPattern = fPatternOwned;
130 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
131 }
132
RegexMatcher(UText * regexp,uint32_t flags,UErrorCode & status)133 RegexMatcher::RegexMatcher(UText *regexp,
134 uint32_t flags, UErrorCode &status) {
135 init(status);
136 if (U_FAILURE(status)) {
137 return;
138 }
139 UParseError pe;
140 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
141 if (U_FAILURE(status)) {
142 return;
143 }
144
145 fPattern = fPatternOwned;
146 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
147 }
148
149
150
151
~RegexMatcher()152 RegexMatcher::~RegexMatcher() {
153 delete fStack;
154 if (fData != fSmallData) {
155 uprv_free(fData);
156 fData = NULL;
157 }
158 if (fPatternOwned) {
159 delete fPatternOwned;
160 fPatternOwned = NULL;
161 fPattern = NULL;
162 }
163
164 if (fInput) {
165 delete fInput;
166 }
167 if (fInputText) {
168 utext_close(fInputText);
169 }
170 if (fAltInputText) {
171 utext_close(fAltInputText);
172 }
173
174 #if UCONFIG_NO_BREAK_ITERATION==0
175 delete fWordBreakItr;
176 #endif
177 }
178
179 //
180 // init() common initialization for use by all constructors.
181 // Initialize all fields, get the object into a consistent state.
182 // This must be done even when the initial status shows an error,
183 // so that the object is initialized sufficiently well for the destructor
184 // to run safely.
185 //
init(UErrorCode & status)186 void RegexMatcher::init(UErrorCode &status) {
187 fPattern = NULL;
188 fPatternOwned = NULL;
189 fFrameSize = 0;
190 fRegionStart = 0;
191 fRegionLimit = 0;
192 fAnchorStart = 0;
193 fAnchorLimit = 0;
194 fLookStart = 0;
195 fLookLimit = 0;
196 fActiveStart = 0;
197 fActiveLimit = 0;
198 fTransparentBounds = FALSE;
199 fAnchoringBounds = TRUE;
200 fMatch = FALSE;
201 fMatchStart = 0;
202 fMatchEnd = 0;
203 fLastMatchEnd = -1;
204 fAppendPosition = 0;
205 fHitEnd = FALSE;
206 fRequireEnd = FALSE;
207 fStack = NULL;
208 fFrame = NULL;
209 fTimeLimit = 0;
210 fTime = 0;
211 fTickCounter = 0;
212 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
213 fCallbackFn = NULL;
214 fCallbackContext = NULL;
215 fFindProgressCallbackFn = NULL;
216 fFindProgressCallbackContext = NULL;
217 fTraceDebug = FALSE;
218 fDeferredStatus = status;
219 fData = fSmallData;
220 fWordBreakItr = NULL;
221
222 fStack = NULL;
223 fInputText = NULL;
224 fAltInputText = NULL;
225 fInput = NULL;
226 fInputLength = 0;
227 fInputUniStrMaybeMutable = FALSE;
228 }
229
230 //
231 // init2() Common initialization for use by RegexMatcher constructors, part 2.
232 // This handles the common setup to be done after the Pattern is available.
233 //
init2(UText * input,UErrorCode & status)234 void RegexMatcher::init2(UText *input, UErrorCode &status) {
235 if (U_FAILURE(status)) {
236 fDeferredStatus = status;
237 return;
238 }
239
240 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
241 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
242 if (fData == NULL) {
243 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
244 return;
245 }
246 }
247
248 fStack = new UVector64(status);
249 if (fStack == NULL) {
250 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
251 return;
252 }
253
254 reset(input);
255 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
256 if (U_FAILURE(status)) {
257 fDeferredStatus = status;
258 return;
259 }
260 }
261
262
263 static const UChar BACKSLASH = 0x5c;
264 static const UChar DOLLARSIGN = 0x24;
265 static const UChar LEFTBRACKET = 0x7b;
266 static const UChar RIGHTBRACKET = 0x7d;
267
268 //--------------------------------------------------------------------------------
269 //
270 // appendReplacement
271 //
272 //--------------------------------------------------------------------------------
appendReplacement(UnicodeString & dest,const UnicodeString & replacement,UErrorCode & status)273 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
274 const UnicodeString &replacement,
275 UErrorCode &status) {
276 UText replacementText = UTEXT_INITIALIZER;
277
278 utext_openConstUnicodeString(&replacementText, &replacement, &status);
279 if (U_SUCCESS(status)) {
280 UText resultText = UTEXT_INITIALIZER;
281 utext_openUnicodeString(&resultText, &dest, &status);
282
283 if (U_SUCCESS(status)) {
284 appendReplacement(&resultText, &replacementText, status);
285 utext_close(&resultText);
286 }
287 utext_close(&replacementText);
288 }
289
290 return *this;
291 }
292
293 //
294 // appendReplacement, UText mode
295 //
appendReplacement(UText * dest,UText * replacement,UErrorCode & status)296 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
297 UText *replacement,
298 UErrorCode &status) {
299 if (U_FAILURE(status)) {
300 return *this;
301 }
302 if (U_FAILURE(fDeferredStatus)) {
303 status = fDeferredStatus;
304 return *this;
305 }
306 if (fMatch == FALSE) {
307 status = U_REGEX_INVALID_STATE;
308 return *this;
309 }
310
311 // Copy input string from the end of previous match to start of current match
312 int64_t destLen = utext_nativeLength(dest);
313 if (fMatchStart > fAppendPosition) {
314 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
315 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
316 (int32_t)(fMatchStart-fAppendPosition), &status);
317 } else {
318 int32_t len16;
319 if (UTEXT_USES_U16(fInputText)) {
320 len16 = (int32_t)(fMatchStart-fAppendPosition);
321 } else {
322 UErrorCode lengthStatus = U_ZERO_ERROR;
323 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
324 }
325 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
326 if (inputChars == NULL) {
327 status = U_MEMORY_ALLOCATION_ERROR;
328 return *this;
329 }
330 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
331 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
332 uprv_free(inputChars);
333 }
334 }
335 fAppendPosition = fMatchEnd;
336
337
338 // scan the replacement text, looking for substitutions ($n) and \escapes.
339 // TODO: optimize this loop by efficiently scanning for '$' or '\',
340 // move entire ranges not containing substitutions.
341 UTEXT_SETNATIVEINDEX(replacement, 0);
342 for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) {
343 if (c == BACKSLASH) {
344 // Backslash Escape. Copy the following char out without further checks.
345 // Note: Surrogate pairs don't need any special handling
346 // The second half wont be a '$' or a '\', and
347 // will move to the dest normally on the next
348 // loop iteration.
349 c = UTEXT_CURRENT32(replacement);
350 if (c == U_SENTINEL) {
351 break;
352 }
353
354 if (c==0x55/*U*/ || c==0x75/*u*/) {
355 // We have a \udddd or \Udddddddd escape sequence.
356 int32_t offset = 0;
357 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
358 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
359 if (escapedChar != (UChar32)0xFFFFFFFF) {
360 if (U_IS_BMP(escapedChar)) {
361 UChar c16 = (UChar)escapedChar;
362 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
363 } else {
364 UChar surrogate[2];
365 surrogate[0] = U16_LEAD(escapedChar);
366 surrogate[1] = U16_TRAIL(escapedChar);
367 if (U_SUCCESS(status)) {
368 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
369 }
370 }
371 // TODO: Report errors for mal-formed \u escapes?
372 // As this is, the original sequence is output, which may be OK.
373 if (context.lastOffset == offset) {
374 (void)UTEXT_PREVIOUS32(replacement);
375 } else if (context.lastOffset != offset-1) {
376 utext_moveIndex32(replacement, offset - context.lastOffset - 1);
377 }
378 }
379 } else {
380 (void)UTEXT_NEXT32(replacement);
381 // Plain backslash escape. Just put out the escaped character.
382 if (U_IS_BMP(c)) {
383 UChar c16 = (UChar)c;
384 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
385 } else {
386 UChar surrogate[2];
387 surrogate[0] = U16_LEAD(c);
388 surrogate[1] = U16_TRAIL(c);
389 if (U_SUCCESS(status)) {
390 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
391 }
392 }
393 }
394 } else if (c != DOLLARSIGN) {
395 // Normal char, not a $. Copy it out without further checks.
396 if (U_IS_BMP(c)) {
397 UChar c16 = (UChar)c;
398 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
399 } else {
400 UChar surrogate[2];
401 surrogate[0] = U16_LEAD(c);
402 surrogate[1] = U16_TRAIL(c);
403 if (U_SUCCESS(status)) {
404 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
405 }
406 }
407 } else {
408 // We've got a $. Pick up a capture group name or number if one follows.
409 // Consume digits so long as the resulting group number <= the number of
410 // number of capture groups in the pattern.
411
412 int32_t groupNum = 0;
413 int32_t numDigits = 0;
414 UChar32 nextChar = utext_current32(replacement);
415 if (nextChar == LEFTBRACKET) {
416 // Scan for a Named Capture Group, ${name}.
417 UnicodeString groupName;
418 utext_next32(replacement);
419 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
420 nextChar = utext_next32(replacement);
421 if (nextChar == U_SENTINEL) {
422 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
423 } else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z
424 (nextChar >= 0x61 && nextChar <= 0x7a) || // a..z
425 (nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9
426 groupName.append(nextChar);
427 } else if (nextChar == RIGHTBRACKET) {
428 groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName);
429 if (groupNum == 0) {
430 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
431 }
432 } else {
433 // Character was something other than a name char or a closing '}'
434 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
435 }
436 }
437
438 } else if (u_isdigit(nextChar)) {
439 // $n Scan for a capture group number
440 int32_t numCaptureGroups = fPattern->fGroupMap->size();
441 for (;;) {
442 nextChar = UTEXT_CURRENT32(replacement);
443 if (nextChar == U_SENTINEL) {
444 break;
445 }
446 if (u_isdigit(nextChar) == FALSE) {
447 break;
448 }
449 int32_t nextDigitVal = u_charDigitValue(nextChar);
450 if (groupNum*10 + nextDigitVal > numCaptureGroups) {
451 // Don't consume the next digit if it makes the capture group number too big.
452 if (numDigits == 0) {
453 status = U_INDEX_OUTOFBOUNDS_ERROR;
454 }
455 break;
456 }
457 (void)UTEXT_NEXT32(replacement);
458 groupNum=groupNum*10 + nextDigitVal;
459 ++numDigits;
460 }
461 } else {
462 // $ not followed by capture group name or number.
463 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
464 }
465
466 if (U_SUCCESS(status)) {
467 destLen += appendGroup(groupNum, dest, status);
468 }
469 } // End of $ capture group handling
470 } // End of per-character loop through the replacement string.
471
472 return *this;
473 }
474
475
476
477 //--------------------------------------------------------------------------------
478 //
479 // appendTail Intended to be used in conjunction with appendReplacement()
480 // To the destination string, append everything following
481 // the last match position from the input string.
482 //
483 // Note: Match ranges do not affect appendTail or appendReplacement
484 //
485 //--------------------------------------------------------------------------------
appendTail(UnicodeString & dest)486 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
487 UErrorCode status = U_ZERO_ERROR;
488 UText resultText = UTEXT_INITIALIZER;
489 utext_openUnicodeString(&resultText, &dest, &status);
490
491 if (U_SUCCESS(status)) {
492 appendTail(&resultText, status);
493 utext_close(&resultText);
494 }
495
496 return dest;
497 }
498
499 //
500 // appendTail, UText mode
501 //
appendTail(UText * dest,UErrorCode & status)502 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
503 if (U_FAILURE(status)) {
504 return dest;
505 }
506 if (U_FAILURE(fDeferredStatus)) {
507 status = fDeferredStatus;
508 return dest;
509 }
510
511 if (fInputLength > fAppendPosition) {
512 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
513 int64_t destLen = utext_nativeLength(dest);
514 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
515 (int32_t)(fInputLength-fAppendPosition), &status);
516 } else {
517 int32_t len16;
518 if (UTEXT_USES_U16(fInputText)) {
519 len16 = (int32_t)(fInputLength-fAppendPosition);
520 } else {
521 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
522 status = U_ZERO_ERROR; // buffer overflow
523 }
524
525 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
526 if (inputChars == NULL) {
527 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
528 } else {
529 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
530 int64_t destLen = utext_nativeLength(dest);
531 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
532 uprv_free(inputChars);
533 }
534 }
535 }
536 return dest;
537 }
538
539
540
541 //--------------------------------------------------------------------------------
542 //
543 // end
544 //
545 //--------------------------------------------------------------------------------
end(UErrorCode & err) const546 int32_t RegexMatcher::end(UErrorCode &err) const {
547 return end(0, err);
548 }
549
end64(UErrorCode & err) const550 int64_t RegexMatcher::end64(UErrorCode &err) const {
551 return end64(0, err);
552 }
553
end64(int32_t group,UErrorCode & err) const554 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
555 if (U_FAILURE(err)) {
556 return -1;
557 }
558 if (fMatch == FALSE) {
559 err = U_REGEX_INVALID_STATE;
560 return -1;
561 }
562 if (group < 0 || group > fPattern->fGroupMap->size()) {
563 err = U_INDEX_OUTOFBOUNDS_ERROR;
564 return -1;
565 }
566 int64_t e = -1;
567 if (group == 0) {
568 e = fMatchEnd;
569 } else {
570 // Get the position within the stack frame of the variables for
571 // this capture group.
572 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
573 U_ASSERT(groupOffset < fPattern->fFrameSize);
574 U_ASSERT(groupOffset >= 0);
575 e = fFrame->fExtra[groupOffset + 1];
576 }
577
578 return e;
579 }
580
end(int32_t group,UErrorCode & err) const581 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
582 return (int32_t)end64(group, err);
583 }
584
585 //--------------------------------------------------------------------------------
586 //
587 // findProgressInterrupt This function is called once for each advance in the target
588 // string from the find() function, and calls the user progress callback
589 // function if there is one installed.
590 //
591 // Return: TRUE if the find operation is to be terminated.
592 // FALSE if the find operation is to continue running.
593 //
594 //--------------------------------------------------------------------------------
findProgressInterrupt(int64_t pos,UErrorCode & status)595 UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
596 if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
597 status = U_REGEX_STOPPED_BY_CALLER;
598 return TRUE;
599 }
600 return FALSE;
601 }
602
603 //--------------------------------------------------------------------------------
604 //
605 // find()
606 //
607 //--------------------------------------------------------------------------------
find()608 UBool RegexMatcher::find() {
609 if (U_FAILURE(fDeferredStatus)) {
610 return FALSE;
611 }
612 UErrorCode status = U_ZERO_ERROR;
613 UBool result = find(status);
614 return result;
615 }
616
617 //--------------------------------------------------------------------------------
618 //
619 // find()
620 //
621 //--------------------------------------------------------------------------------
find(UErrorCode & status)622 UBool RegexMatcher::find(UErrorCode &status) {
623 // Start at the position of the last match end. (Will be zero if the
624 // matcher has been reset.)
625 //
626 if (U_FAILURE(status)) {
627 return FALSE;
628 }
629 if (U_FAILURE(fDeferredStatus)) {
630 status = fDeferredStatus;
631 return FALSE;
632 }
633
634 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
635 return findUsingChunk(status);
636 }
637
638 int64_t startPos = fMatchEnd;
639 if (startPos==0) {
640 startPos = fActiveStart;
641 }
642
643 if (fMatch) {
644 // Save the position of any previous successful match.
645 fLastMatchEnd = fMatchEnd;
646
647 if (fMatchStart == fMatchEnd) {
648 // Previous match had zero length. Move start position up one position
649 // to avoid sending find() into a loop on zero-length matches.
650 if (startPos >= fActiveLimit) {
651 fMatch = FALSE;
652 fHitEnd = TRUE;
653 return FALSE;
654 }
655 UTEXT_SETNATIVEINDEX(fInputText, startPos);
656 (void)UTEXT_NEXT32(fInputText);
657 startPos = UTEXT_GETNATIVEINDEX(fInputText);
658 }
659 } else {
660 if (fLastMatchEnd >= 0) {
661 // A previous find() failed to match. Don't try again.
662 // (without this test, a pattern with a zero-length match
663 // could match again at the end of an input string.)
664 fHitEnd = TRUE;
665 return FALSE;
666 }
667 }
668
669
670 // Compute the position in the input string beyond which a match can not begin, because
671 // the minimum length match would extend past the end of the input.
672 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
673 // Be aware of possible overflows if making changes here.
674 int64_t testStartLimit;
675 if (UTEXT_USES_U16(fInputText)) {
676 testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
677 if (startPos > testStartLimit) {
678 fMatch = FALSE;
679 fHitEnd = TRUE;
680 return FALSE;
681 }
682 } else {
683 // We don't know exactly how long the minimum match length is in native characters.
684 // Treat anything > 0 as 1.
685 testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
686 }
687
688 UChar32 c;
689 U_ASSERT(startPos >= 0);
690
691 switch (fPattern->fStartType) {
692 case START_NO_INFO:
693 // No optimization was found.
694 // Try a match at each input position.
695 for (;;) {
696 MatchAt(startPos, FALSE, status);
697 if (U_FAILURE(status)) {
698 return FALSE;
699 }
700 if (fMatch) {
701 return TRUE;
702 }
703 if (startPos >= testStartLimit) {
704 fHitEnd = TRUE;
705 return FALSE;
706 }
707 UTEXT_SETNATIVEINDEX(fInputText, startPos);
708 (void)UTEXT_NEXT32(fInputText);
709 startPos = UTEXT_GETNATIVEINDEX(fInputText);
710 // Note that it's perfectly OK for a pattern to have a zero-length
711 // match at the end of a string, so we must make sure that the loop
712 // runs with startPos == testStartLimit the last time through.
713 if (findProgressInterrupt(startPos, status))
714 return FALSE;
715 }
716 U_ASSERT(FALSE);
717
718 case START_START:
719 // Matches are only possible at the start of the input string
720 // (pattern begins with ^ or \A)
721 if (startPos > fActiveStart) {
722 fMatch = FALSE;
723 return FALSE;
724 }
725 MatchAt(startPos, FALSE, status);
726 if (U_FAILURE(status)) {
727 return FALSE;
728 }
729 return fMatch;
730
731
732 case START_SET:
733 {
734 // Match may start on any char from a pre-computed set.
735 U_ASSERT(fPattern->fMinMatchLen > 0);
736 UTEXT_SETNATIVEINDEX(fInputText, startPos);
737 for (;;) {
738 int64_t pos = startPos;
739 c = UTEXT_NEXT32(fInputText);
740 startPos = UTEXT_GETNATIVEINDEX(fInputText);
741 // c will be -1 (U_SENTINEL) at end of text, in which case we
742 // skip this next block (so we don't have a negative array index)
743 // and handle end of text in the following block.
744 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
745 (c>=256 && fPattern->fInitialChars->contains(c)))) {
746 MatchAt(pos, FALSE, status);
747 if (U_FAILURE(status)) {
748 return FALSE;
749 }
750 if (fMatch) {
751 return TRUE;
752 }
753 UTEXT_SETNATIVEINDEX(fInputText, pos);
754 }
755 if (startPos > testStartLimit) {
756 fMatch = FALSE;
757 fHitEnd = TRUE;
758 return FALSE;
759 }
760 if (findProgressInterrupt(startPos, status))
761 return FALSE;
762 }
763 }
764 U_ASSERT(FALSE);
765
766 case START_STRING:
767 case START_CHAR:
768 {
769 // Match starts on exactly one char.
770 U_ASSERT(fPattern->fMinMatchLen > 0);
771 UChar32 theChar = fPattern->fInitialChar;
772 UTEXT_SETNATIVEINDEX(fInputText, startPos);
773 for (;;) {
774 int64_t pos = startPos;
775 c = UTEXT_NEXT32(fInputText);
776 startPos = UTEXT_GETNATIVEINDEX(fInputText);
777 if (c == theChar) {
778 MatchAt(pos, FALSE, status);
779 if (U_FAILURE(status)) {
780 return FALSE;
781 }
782 if (fMatch) {
783 return TRUE;
784 }
785 UTEXT_SETNATIVEINDEX(fInputText, pos);
786 }
787 if (startPos > testStartLimit) {
788 fMatch = FALSE;
789 fHitEnd = TRUE;
790 return FALSE;
791 }
792 if (findProgressInterrupt(startPos, status))
793 return FALSE;
794 }
795 }
796 U_ASSERT(FALSE);
797
798 case START_LINE:
799 {
800 UChar32 c;
801 if (startPos == fAnchorStart) {
802 MatchAt(startPos, FALSE, status);
803 if (U_FAILURE(status)) {
804 return FALSE;
805 }
806 if (fMatch) {
807 return TRUE;
808 }
809 UTEXT_SETNATIVEINDEX(fInputText, startPos);
810 c = UTEXT_NEXT32(fInputText);
811 startPos = UTEXT_GETNATIVEINDEX(fInputText);
812 } else {
813 UTEXT_SETNATIVEINDEX(fInputText, startPos);
814 c = UTEXT_PREVIOUS32(fInputText);
815 UTEXT_SETNATIVEINDEX(fInputText, startPos);
816 }
817
818 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
819 for (;;) {
820 if (c == 0x0a) {
821 MatchAt(startPos, FALSE, status);
822 if (U_FAILURE(status)) {
823 return FALSE;
824 }
825 if (fMatch) {
826 return TRUE;
827 }
828 UTEXT_SETNATIVEINDEX(fInputText, startPos);
829 }
830 if (startPos >= testStartLimit) {
831 fMatch = FALSE;
832 fHitEnd = TRUE;
833 return FALSE;
834 }
835 c = UTEXT_NEXT32(fInputText);
836 startPos = UTEXT_GETNATIVEINDEX(fInputText);
837 // Note that it's perfectly OK for a pattern to have a zero-length
838 // match at the end of a string, so we must make sure that the loop
839 // runs with startPos == testStartLimit the last time through.
840 if (findProgressInterrupt(startPos, status))
841 return FALSE;
842 }
843 } else {
844 for (;;) {
845 if (isLineTerminator(c)) {
846 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
847 (void)UTEXT_NEXT32(fInputText);
848 startPos = UTEXT_GETNATIVEINDEX(fInputText);
849 }
850 MatchAt(startPos, FALSE, status);
851 if (U_FAILURE(status)) {
852 return FALSE;
853 }
854 if (fMatch) {
855 return TRUE;
856 }
857 UTEXT_SETNATIVEINDEX(fInputText, startPos);
858 }
859 if (startPos >= testStartLimit) {
860 fMatch = FALSE;
861 fHitEnd = TRUE;
862 return FALSE;
863 }
864 c = UTEXT_NEXT32(fInputText);
865 startPos = UTEXT_GETNATIVEINDEX(fInputText);
866 // Note that it's perfectly OK for a pattern to have a zero-length
867 // match at the end of a string, so we must make sure that the loop
868 // runs with startPos == testStartLimit the last time through.
869 if (findProgressInterrupt(startPos, status))
870 return FALSE;
871 }
872 }
873 }
874
875 default:
876 U_ASSERT(FALSE);
877 }
878
879 U_ASSERT(FALSE);
880 return FALSE;
881 }
882
883
884
find(int64_t start,UErrorCode & status)885 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
886 if (U_FAILURE(status)) {
887 return FALSE;
888 }
889 if (U_FAILURE(fDeferredStatus)) {
890 status = fDeferredStatus;
891 return FALSE;
892 }
893 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
894 // This will reset the region to be the full input length.
895 if (start < 0) {
896 status = U_INDEX_OUTOFBOUNDS_ERROR;
897 return FALSE;
898 }
899
900 int64_t nativeStart = start;
901 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
902 status = U_INDEX_OUTOFBOUNDS_ERROR;
903 return FALSE;
904 }
905 fMatchEnd = nativeStart;
906 return find(status);
907 }
908
909
910 //--------------------------------------------------------------------------------
911 //
912 // findUsingChunk() -- like find(), but with the advance knowledge that the
913 // entire string is available in the UText's chunk buffer.
914 //
915 //--------------------------------------------------------------------------------
findUsingChunk(UErrorCode & status)916 UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
917 // Start at the position of the last match end. (Will be zero if the
918 // matcher has been reset.
919 //
920
921 int32_t startPos = (int32_t)fMatchEnd;
922 if (startPos==0) {
923 startPos = (int32_t)fActiveStart;
924 }
925
926 const UChar *inputBuf = fInputText->chunkContents;
927
928 if (fMatch) {
929 // Save the position of any previous successful match.
930 fLastMatchEnd = fMatchEnd;
931
932 if (fMatchStart == fMatchEnd) {
933 // Previous match had zero length. Move start position up one position
934 // to avoid sending find() into a loop on zero-length matches.
935 if (startPos >= fActiveLimit) {
936 fMatch = FALSE;
937 fHitEnd = TRUE;
938 return FALSE;
939 }
940 U16_FWD_1(inputBuf, startPos, fInputLength);
941 }
942 } else {
943 if (fLastMatchEnd >= 0) {
944 // A previous find() failed to match. Don't try again.
945 // (without this test, a pattern with a zero-length match
946 // could match again at the end of an input string.)
947 fHitEnd = TRUE;
948 return FALSE;
949 }
950 }
951
952
953 // Compute the position in the input string beyond which a match can not begin, because
954 // the minimum length match would extend past the end of the input.
955 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
956 // Be aware of possible overflows if making changes here.
957 // Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
958 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
959 if (startPos > testLen) {
960 fMatch = FALSE;
961 fHitEnd = TRUE;
962 return FALSE;
963 }
964
965 UChar32 c;
966 U_ASSERT(startPos >= 0);
967
968 switch (fPattern->fStartType) {
969 case START_NO_INFO:
970 // No optimization was found.
971 // Try a match at each input position.
972 for (;;) {
973 MatchChunkAt(startPos, FALSE, status);
974 if (U_FAILURE(status)) {
975 return FALSE;
976 }
977 if (fMatch) {
978 return TRUE;
979 }
980 if (startPos >= testLen) {
981 fHitEnd = TRUE;
982 return FALSE;
983 }
984 U16_FWD_1(inputBuf, startPos, fActiveLimit);
985 // Note that it's perfectly OK for a pattern to have a zero-length
986 // match at the end of a string, so we must make sure that the loop
987 // runs with startPos == testLen the last time through.
988 if (findProgressInterrupt(startPos, status))
989 return FALSE;
990 }
991 U_ASSERT(FALSE);
992
993 case START_START:
994 // Matches are only possible at the start of the input string
995 // (pattern begins with ^ or \A)
996 if (startPos > fActiveStart) {
997 fMatch = FALSE;
998 return FALSE;
999 }
1000 MatchChunkAt(startPos, FALSE, status);
1001 if (U_FAILURE(status)) {
1002 return FALSE;
1003 }
1004 return fMatch;
1005
1006
1007 case START_SET:
1008 {
1009 // Match may start on any char from a pre-computed set.
1010 U_ASSERT(fPattern->fMinMatchLen > 0);
1011 for (;;) {
1012 int32_t pos = startPos;
1013 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1014 if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
1015 (c>=256 && fPattern->fInitialChars->contains(c))) {
1016 MatchChunkAt(pos, FALSE, status);
1017 if (U_FAILURE(status)) {
1018 return FALSE;
1019 }
1020 if (fMatch) {
1021 return TRUE;
1022 }
1023 }
1024 if (startPos > testLen) {
1025 fMatch = FALSE;
1026 fHitEnd = TRUE;
1027 return FALSE;
1028 }
1029 if (findProgressInterrupt(startPos, status))
1030 return FALSE;
1031 }
1032 }
1033 U_ASSERT(FALSE);
1034
1035 case START_STRING:
1036 case START_CHAR:
1037 {
1038 // Match starts on exactly one char.
1039 U_ASSERT(fPattern->fMinMatchLen > 0);
1040 UChar32 theChar = fPattern->fInitialChar;
1041 for (;;) {
1042 int32_t pos = startPos;
1043 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1044 if (c == theChar) {
1045 MatchChunkAt(pos, FALSE, status);
1046 if (U_FAILURE(status)) {
1047 return FALSE;
1048 }
1049 if (fMatch) {
1050 return TRUE;
1051 }
1052 }
1053 if (startPos > testLen) {
1054 fMatch = FALSE;
1055 fHitEnd = TRUE;
1056 return FALSE;
1057 }
1058 if (findProgressInterrupt(startPos, status))
1059 return FALSE;
1060 }
1061 }
1062 U_ASSERT(FALSE);
1063
1064 case START_LINE:
1065 {
1066 UChar32 c;
1067 if (startPos == fAnchorStart) {
1068 MatchChunkAt(startPos, FALSE, status);
1069 if (U_FAILURE(status)) {
1070 return FALSE;
1071 }
1072 if (fMatch) {
1073 return TRUE;
1074 }
1075 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1076 }
1077
1078 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1079 for (;;) {
1080 c = inputBuf[startPos-1];
1081 if (c == 0x0a) {
1082 MatchChunkAt(startPos, FALSE, status);
1083 if (U_FAILURE(status)) {
1084 return FALSE;
1085 }
1086 if (fMatch) {
1087 return TRUE;
1088 }
1089 }
1090 if (startPos >= testLen) {
1091 fMatch = FALSE;
1092 fHitEnd = TRUE;
1093 return FALSE;
1094 }
1095 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1096 // Note that it's perfectly OK for a pattern to have a zero-length
1097 // match at the end of a string, so we must make sure that the loop
1098 // runs with startPos == testLen the last time through.
1099 if (findProgressInterrupt(startPos, status))
1100 return FALSE;
1101 }
1102 } else {
1103 for (;;) {
1104 c = inputBuf[startPos-1];
1105 if (isLineTerminator(c)) {
1106 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1107 startPos++;
1108 }
1109 MatchChunkAt(startPos, FALSE, status);
1110 if (U_FAILURE(status)) {
1111 return FALSE;
1112 }
1113 if (fMatch) {
1114 return TRUE;
1115 }
1116 }
1117 if (startPos >= testLen) {
1118 fMatch = FALSE;
1119 fHitEnd = TRUE;
1120 return FALSE;
1121 }
1122 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1123 // Note that it's perfectly OK for a pattern to have a zero-length
1124 // match at the end of a string, so we must make sure that the loop
1125 // runs with startPos == testLen the last time through.
1126 if (findProgressInterrupt(startPos, status))
1127 return FALSE;
1128 }
1129 }
1130 }
1131
1132 default:
1133 U_ASSERT(FALSE);
1134 }
1135
1136 U_ASSERT(FALSE);
1137 return FALSE;
1138 }
1139
1140
1141
1142 //--------------------------------------------------------------------------------
1143 //
1144 // group()
1145 //
1146 //--------------------------------------------------------------------------------
group(UErrorCode & status) const1147 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1148 return group(0, status);
1149 }
1150
1151 // Return immutable shallow clone
group(UText * dest,int64_t & group_len,UErrorCode & status) const1152 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1153 return group(0, dest, group_len, status);
1154 }
1155
1156 // Return immutable shallow clone
group(int32_t groupNum,UText * dest,int64_t & group_len,UErrorCode & status) const1157 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1158 group_len = 0;
1159 if (U_FAILURE(status)) {
1160 return dest;
1161 }
1162 if (U_FAILURE(fDeferredStatus)) {
1163 status = fDeferredStatus;
1164 } else if (fMatch == FALSE) {
1165 status = U_REGEX_INVALID_STATE;
1166 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1167 status = U_INDEX_OUTOFBOUNDS_ERROR;
1168 }
1169
1170 if (U_FAILURE(status)) {
1171 return dest;
1172 }
1173
1174 int64_t s, e;
1175 if (groupNum == 0) {
1176 s = fMatchStart;
1177 e = fMatchEnd;
1178 } else {
1179 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1180 U_ASSERT(groupOffset < fPattern->fFrameSize);
1181 U_ASSERT(groupOffset >= 0);
1182 s = fFrame->fExtra[groupOffset];
1183 e = fFrame->fExtra[groupOffset+1];
1184 }
1185
1186 if (s < 0) {
1187 // A capture group wasn't part of the match
1188 return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1189 }
1190 U_ASSERT(s <= e);
1191 group_len = e - s;
1192
1193 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1194 if (dest)
1195 UTEXT_SETNATIVEINDEX(dest, s);
1196 return dest;
1197 }
1198
group(int32_t groupNum,UErrorCode & status) const1199 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1200 UnicodeString result;
1201 int64_t groupStart = start64(groupNum, status);
1202 int64_t groupEnd = end64(groupNum, status);
1203 if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
1204 return result;
1205 }
1206
1207 // Get the group length using a utext_extract preflight.
1208 // UText is actually pretty efficient at this when underlying encoding is UTF-16.
1209 int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status);
1210 if (status != U_BUFFER_OVERFLOW_ERROR) {
1211 return result;
1212 }
1213
1214 status = U_ZERO_ERROR;
1215 UChar *buf = result.getBuffer(length);
1216 if (buf == NULL) {
1217 status = U_MEMORY_ALLOCATION_ERROR;
1218 } else {
1219 int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
1220 result.releaseBuffer(extractLength);
1221 U_ASSERT(length == extractLength);
1222 }
1223 return result;
1224 }
1225
1226
1227 //--------------------------------------------------------------------------------
1228 //
1229 // appendGroup() -- currently internal only, appends a group to a UText rather
1230 // than replacing its contents
1231 //
1232 //--------------------------------------------------------------------------------
1233
appendGroup(int32_t groupNum,UText * dest,UErrorCode & status) const1234 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1235 if (U_FAILURE(status)) {
1236 return 0;
1237 }
1238 if (U_FAILURE(fDeferredStatus)) {
1239 status = fDeferredStatus;
1240 return 0;
1241 }
1242 int64_t destLen = utext_nativeLength(dest);
1243
1244 if (fMatch == FALSE) {
1245 status = U_REGEX_INVALID_STATE;
1246 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1247 }
1248 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1249 status = U_INDEX_OUTOFBOUNDS_ERROR;
1250 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1251 }
1252
1253 int64_t s, e;
1254 if (groupNum == 0) {
1255 s = fMatchStart;
1256 e = fMatchEnd;
1257 } else {
1258 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1259 U_ASSERT(groupOffset < fPattern->fFrameSize);
1260 U_ASSERT(groupOffset >= 0);
1261 s = fFrame->fExtra[groupOffset];
1262 e = fFrame->fExtra[groupOffset+1];
1263 }
1264
1265 if (s < 0) {
1266 // A capture group wasn't part of the match
1267 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1268 }
1269 U_ASSERT(s <= e);
1270
1271 int64_t deltaLen;
1272 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1273 U_ASSERT(e <= fInputLength);
1274 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1275 } else {
1276 int32_t len16;
1277 if (UTEXT_USES_U16(fInputText)) {
1278 len16 = (int32_t)(e-s);
1279 } else {
1280 UErrorCode lengthStatus = U_ZERO_ERROR;
1281 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1282 }
1283 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1284 if (groupChars == NULL) {
1285 status = U_MEMORY_ALLOCATION_ERROR;
1286 return 0;
1287 }
1288 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1289
1290 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1291 uprv_free(groupChars);
1292 }
1293 return deltaLen;
1294 }
1295
1296
1297
1298 //--------------------------------------------------------------------------------
1299 //
1300 // groupCount()
1301 //
1302 //--------------------------------------------------------------------------------
groupCount() const1303 int32_t RegexMatcher::groupCount() const {
1304 return fPattern->fGroupMap->size();
1305 }
1306
1307 //--------------------------------------------------------------------------------
1308 //
1309 // hasAnchoringBounds()
1310 //
1311 //--------------------------------------------------------------------------------
hasAnchoringBounds() const1312 UBool RegexMatcher::hasAnchoringBounds() const {
1313 return fAnchoringBounds;
1314 }
1315
1316
1317 //--------------------------------------------------------------------------------
1318 //
1319 // hasTransparentBounds()
1320 //
1321 //--------------------------------------------------------------------------------
hasTransparentBounds() const1322 UBool RegexMatcher::hasTransparentBounds() const {
1323 return fTransparentBounds;
1324 }
1325
1326
1327
1328 //--------------------------------------------------------------------------------
1329 //
1330 // hitEnd()
1331 //
1332 //--------------------------------------------------------------------------------
hitEnd() const1333 UBool RegexMatcher::hitEnd() const {
1334 return fHitEnd;
1335 }
1336
1337
1338 //--------------------------------------------------------------------------------
1339 //
1340 // input()
1341 //
1342 //--------------------------------------------------------------------------------
input() const1343 const UnicodeString &RegexMatcher::input() const {
1344 if (!fInput) {
1345 UErrorCode status = U_ZERO_ERROR;
1346 int32_t len16;
1347 if (UTEXT_USES_U16(fInputText)) {
1348 len16 = (int32_t)fInputLength;
1349 } else {
1350 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1351 status = U_ZERO_ERROR; // overflow, length status
1352 }
1353 UnicodeString *result = new UnicodeString(len16, 0, 0);
1354
1355 UChar *inputChars = result->getBuffer(len16);
1356 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1357 result->releaseBuffer(len16);
1358
1359 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1360 }
1361
1362 return *fInput;
1363 }
1364
1365 //--------------------------------------------------------------------------------
1366 //
1367 // inputText()
1368 //
1369 //--------------------------------------------------------------------------------
inputText() const1370 UText *RegexMatcher::inputText() const {
1371 return fInputText;
1372 }
1373
1374
1375 //--------------------------------------------------------------------------------
1376 //
1377 // getInput() -- like inputText(), but makes a clone or copies into another UText
1378 //
1379 //--------------------------------------------------------------------------------
getInput(UText * dest,UErrorCode & status) const1380 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1381 if (U_FAILURE(status)) {
1382 return dest;
1383 }
1384 if (U_FAILURE(fDeferredStatus)) {
1385 status = fDeferredStatus;
1386 return dest;
1387 }
1388
1389 if (dest) {
1390 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1391 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1392 } else {
1393 int32_t input16Len;
1394 if (UTEXT_USES_U16(fInputText)) {
1395 input16Len = (int32_t)fInputLength;
1396 } else {
1397 UErrorCode lengthStatus = U_ZERO_ERROR;
1398 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1399 }
1400 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1401 if (inputChars == NULL) {
1402 return dest;
1403 }
1404
1405 status = U_ZERO_ERROR;
1406 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1407 status = U_ZERO_ERROR;
1408 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1409
1410 uprv_free(inputChars);
1411 }
1412 return dest;
1413 } else {
1414 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1415 }
1416 }
1417
1418
1419 static UBool compat_SyncMutableUTextContents(UText *ut);
compat_SyncMutableUTextContents(UText * ut)1420 static UBool compat_SyncMutableUTextContents(UText *ut) {
1421 UBool retVal = FALSE;
1422
1423 // In the following test, we're really only interested in whether the UText should switch
1424 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1425 // will still point to the correct data.
1426 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1427 UnicodeString *us=(UnicodeString *)ut->context;
1428
1429 // Update to the latest length.
1430 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1431 int32_t newLength = us->length();
1432
1433 // Update the chunk description.
1434 // The buffer may have switched between stack- and heap-based.
1435 ut->chunkContents = us->getBuffer();
1436 ut->chunkLength = newLength;
1437 ut->chunkNativeLimit = newLength;
1438 ut->nativeIndexingLimit = newLength;
1439 retVal = TRUE;
1440 }
1441
1442 return retVal;
1443 }
1444
1445 //--------------------------------------------------------------------------------
1446 //
1447 // lookingAt()
1448 //
1449 //--------------------------------------------------------------------------------
lookingAt(UErrorCode & status)1450 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1451 if (U_FAILURE(status)) {
1452 return FALSE;
1453 }
1454 if (U_FAILURE(fDeferredStatus)) {
1455 status = fDeferredStatus;
1456 return FALSE;
1457 }
1458
1459 if (fInputUniStrMaybeMutable) {
1460 if (compat_SyncMutableUTextContents(fInputText)) {
1461 fInputLength = utext_nativeLength(fInputText);
1462 reset();
1463 }
1464 }
1465 else {
1466 resetPreserveRegion();
1467 }
1468 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1469 MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1470 } else {
1471 MatchAt(fActiveStart, FALSE, status);
1472 }
1473 return fMatch;
1474 }
1475
1476
lookingAt(int64_t start,UErrorCode & status)1477 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1478 if (U_FAILURE(status)) {
1479 return FALSE;
1480 }
1481 if (U_FAILURE(fDeferredStatus)) {
1482 status = fDeferredStatus;
1483 return FALSE;
1484 }
1485 reset();
1486
1487 if (start < 0) {
1488 status = U_INDEX_OUTOFBOUNDS_ERROR;
1489 return FALSE;
1490 }
1491
1492 if (fInputUniStrMaybeMutable) {
1493 if (compat_SyncMutableUTextContents(fInputText)) {
1494 fInputLength = utext_nativeLength(fInputText);
1495 reset();
1496 }
1497 }
1498
1499 int64_t nativeStart;
1500 nativeStart = start;
1501 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1502 status = U_INDEX_OUTOFBOUNDS_ERROR;
1503 return FALSE;
1504 }
1505
1506 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1507 MatchChunkAt((int32_t)nativeStart, FALSE, status);
1508 } else {
1509 MatchAt(nativeStart, FALSE, status);
1510 }
1511 return fMatch;
1512 }
1513
1514
1515
1516 //--------------------------------------------------------------------------------
1517 //
1518 // matches()
1519 //
1520 //--------------------------------------------------------------------------------
matches(UErrorCode & status)1521 UBool RegexMatcher::matches(UErrorCode &status) {
1522 if (U_FAILURE(status)) {
1523 return FALSE;
1524 }
1525 if (U_FAILURE(fDeferredStatus)) {
1526 status = fDeferredStatus;
1527 return FALSE;
1528 }
1529
1530 if (fInputUniStrMaybeMutable) {
1531 if (compat_SyncMutableUTextContents(fInputText)) {
1532 fInputLength = utext_nativeLength(fInputText);
1533 reset();
1534 }
1535 }
1536 else {
1537 resetPreserveRegion();
1538 }
1539
1540 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1541 MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1542 } else {
1543 MatchAt(fActiveStart, TRUE, status);
1544 }
1545 return fMatch;
1546 }
1547
1548
matches(int64_t start,UErrorCode & status)1549 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1550 if (U_FAILURE(status)) {
1551 return FALSE;
1552 }
1553 if (U_FAILURE(fDeferredStatus)) {
1554 status = fDeferredStatus;
1555 return FALSE;
1556 }
1557 reset();
1558
1559 if (start < 0) {
1560 status = U_INDEX_OUTOFBOUNDS_ERROR;
1561 return FALSE;
1562 }
1563
1564 if (fInputUniStrMaybeMutable) {
1565 if (compat_SyncMutableUTextContents(fInputText)) {
1566 fInputLength = utext_nativeLength(fInputText);
1567 reset();
1568 }
1569 }
1570
1571 int64_t nativeStart;
1572 nativeStart = start;
1573 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1574 status = U_INDEX_OUTOFBOUNDS_ERROR;
1575 return FALSE;
1576 }
1577
1578 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1579 MatchChunkAt((int32_t)nativeStart, TRUE, status);
1580 } else {
1581 MatchAt(nativeStart, TRUE, status);
1582 }
1583 return fMatch;
1584 }
1585
1586
1587
1588 //--------------------------------------------------------------------------------
1589 //
1590 // pattern
1591 //
1592 //--------------------------------------------------------------------------------
pattern() const1593 const RegexPattern &RegexMatcher::pattern() const {
1594 return *fPattern;
1595 }
1596
1597
1598
1599 //--------------------------------------------------------------------------------
1600 //
1601 // region
1602 //
1603 //--------------------------------------------------------------------------------
region(int64_t regionStart,int64_t regionLimit,int64_t startIndex,UErrorCode & status)1604 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1605 if (U_FAILURE(status)) {
1606 return *this;
1607 }
1608
1609 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1610 status = U_ILLEGAL_ARGUMENT_ERROR;
1611 }
1612
1613 int64_t nativeStart = regionStart;
1614 int64_t nativeLimit = regionLimit;
1615 if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1616 status = U_ILLEGAL_ARGUMENT_ERROR;
1617 }
1618
1619 if (startIndex == -1)
1620 this->reset();
1621 else
1622 resetPreserveRegion();
1623
1624 fRegionStart = nativeStart;
1625 fRegionLimit = nativeLimit;
1626 fActiveStart = nativeStart;
1627 fActiveLimit = nativeLimit;
1628
1629 if (startIndex != -1) {
1630 if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1631 status = U_INDEX_OUTOFBOUNDS_ERROR;
1632 }
1633 fMatchEnd = startIndex;
1634 }
1635
1636 if (!fTransparentBounds) {
1637 fLookStart = nativeStart;
1638 fLookLimit = nativeLimit;
1639 }
1640 if (fAnchoringBounds) {
1641 fAnchorStart = nativeStart;
1642 fAnchorLimit = nativeLimit;
1643 }
1644 return *this;
1645 }
1646
region(int64_t start,int64_t limit,UErrorCode & status)1647 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1648 return region(start, limit, -1, status);
1649 }
1650
1651 //--------------------------------------------------------------------------------
1652 //
1653 // regionEnd
1654 //
1655 //--------------------------------------------------------------------------------
regionEnd() const1656 int32_t RegexMatcher::regionEnd() const {
1657 return (int32_t)fRegionLimit;
1658 }
1659
regionEnd64() const1660 int64_t RegexMatcher::regionEnd64() const {
1661 return fRegionLimit;
1662 }
1663
1664 //--------------------------------------------------------------------------------
1665 //
1666 // regionStart
1667 //
1668 //--------------------------------------------------------------------------------
regionStart() const1669 int32_t RegexMatcher::regionStart() const {
1670 return (int32_t)fRegionStart;
1671 }
1672
regionStart64() const1673 int64_t RegexMatcher::regionStart64() const {
1674 return fRegionStart;
1675 }
1676
1677
1678 //--------------------------------------------------------------------------------
1679 //
1680 // replaceAll
1681 //
1682 //--------------------------------------------------------------------------------
replaceAll(const UnicodeString & replacement,UErrorCode & status)1683 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1684 UText replacementText = UTEXT_INITIALIZER;
1685 UText resultText = UTEXT_INITIALIZER;
1686 UnicodeString resultString;
1687 if (U_FAILURE(status)) {
1688 return resultString;
1689 }
1690
1691 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1692 utext_openUnicodeString(&resultText, &resultString, &status);
1693
1694 replaceAll(&replacementText, &resultText, status);
1695
1696 utext_close(&resultText);
1697 utext_close(&replacementText);
1698
1699 return resultString;
1700 }
1701
1702
1703 //
1704 // replaceAll, UText mode
1705 //
replaceAll(UText * replacement,UText * dest,UErrorCode & status)1706 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1707 if (U_FAILURE(status)) {
1708 return dest;
1709 }
1710 if (U_FAILURE(fDeferredStatus)) {
1711 status = fDeferredStatus;
1712 return dest;
1713 }
1714
1715 if (dest == NULL) {
1716 UnicodeString emptyString;
1717 UText empty = UTEXT_INITIALIZER;
1718
1719 utext_openUnicodeString(&empty, &emptyString, &status);
1720 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1721 utext_close(&empty);
1722 }
1723
1724 if (U_SUCCESS(status)) {
1725 reset();
1726 while (find()) {
1727 appendReplacement(dest, replacement, status);
1728 if (U_FAILURE(status)) {
1729 break;
1730 }
1731 }
1732 appendTail(dest, status);
1733 }
1734
1735 return dest;
1736 }
1737
1738
1739 //--------------------------------------------------------------------------------
1740 //
1741 // replaceFirst
1742 //
1743 //--------------------------------------------------------------------------------
replaceFirst(const UnicodeString & replacement,UErrorCode & status)1744 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1745 UText replacementText = UTEXT_INITIALIZER;
1746 UText resultText = UTEXT_INITIALIZER;
1747 UnicodeString resultString;
1748
1749 utext_openConstUnicodeString(&replacementText, &replacement, &status);
1750 utext_openUnicodeString(&resultText, &resultString, &status);
1751
1752 replaceFirst(&replacementText, &resultText, status);
1753
1754 utext_close(&resultText);
1755 utext_close(&replacementText);
1756
1757 return resultString;
1758 }
1759
1760 //
1761 // replaceFirst, UText mode
1762 //
replaceFirst(UText * replacement,UText * dest,UErrorCode & status)1763 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1764 if (U_FAILURE(status)) {
1765 return dest;
1766 }
1767 if (U_FAILURE(fDeferredStatus)) {
1768 status = fDeferredStatus;
1769 return dest;
1770 }
1771
1772 reset();
1773 if (!find()) {
1774 return getInput(dest, status);
1775 }
1776
1777 if (dest == NULL) {
1778 UnicodeString emptyString;
1779 UText empty = UTEXT_INITIALIZER;
1780
1781 utext_openUnicodeString(&empty, &emptyString, &status);
1782 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1783 utext_close(&empty);
1784 }
1785
1786 appendReplacement(dest, replacement, status);
1787 appendTail(dest, status);
1788
1789 return dest;
1790 }
1791
1792
1793 //--------------------------------------------------------------------------------
1794 //
1795 // requireEnd
1796 //
1797 //--------------------------------------------------------------------------------
requireEnd() const1798 UBool RegexMatcher::requireEnd() const {
1799 return fRequireEnd;
1800 }
1801
1802
1803 //--------------------------------------------------------------------------------
1804 //
1805 // reset
1806 //
1807 //--------------------------------------------------------------------------------
reset()1808 RegexMatcher &RegexMatcher::reset() {
1809 fRegionStart = 0;
1810 fRegionLimit = fInputLength;
1811 fActiveStart = 0;
1812 fActiveLimit = fInputLength;
1813 fAnchorStart = 0;
1814 fAnchorLimit = fInputLength;
1815 fLookStart = 0;
1816 fLookLimit = fInputLength;
1817 resetPreserveRegion();
1818 return *this;
1819 }
1820
1821
1822
resetPreserveRegion()1823 void RegexMatcher::resetPreserveRegion() {
1824 fMatchStart = 0;
1825 fMatchEnd = 0;
1826 fLastMatchEnd = -1;
1827 fAppendPosition = 0;
1828 fMatch = FALSE;
1829 fHitEnd = FALSE;
1830 fRequireEnd = FALSE;
1831 fTime = 0;
1832 fTickCounter = TIMER_INITIAL_VALUE;
1833 //resetStack(); // more expensive than it looks...
1834 }
1835
1836
reset(const UnicodeString & input)1837 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1838 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1839 if (fPattern->fNeedsAltInput) {
1840 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1841 }
1842 if (U_FAILURE(fDeferredStatus)) {
1843 return *this;
1844 }
1845 fInputLength = utext_nativeLength(fInputText);
1846
1847 reset();
1848 delete fInput;
1849 fInput = NULL;
1850
1851 // Do the following for any UnicodeString.
1852 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1853 fInputUniStrMaybeMutable = TRUE;
1854
1855 if (fWordBreakItr != NULL) {
1856 #if UCONFIG_NO_BREAK_ITERATION==0
1857 UErrorCode status = U_ZERO_ERROR;
1858 fWordBreakItr->setText(fInputText, status);
1859 #endif
1860 }
1861 return *this;
1862 }
1863
1864
reset(UText * input)1865 RegexMatcher &RegexMatcher::reset(UText *input) {
1866 if (fInputText != input) {
1867 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1868 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1869 if (U_FAILURE(fDeferredStatus)) {
1870 return *this;
1871 }
1872 fInputLength = utext_nativeLength(fInputText);
1873
1874 delete fInput;
1875 fInput = NULL;
1876
1877 if (fWordBreakItr != NULL) {
1878 #if UCONFIG_NO_BREAK_ITERATION==0
1879 UErrorCode status = U_ZERO_ERROR;
1880 fWordBreakItr->setText(input, status);
1881 #endif
1882 }
1883 }
1884 reset();
1885 fInputUniStrMaybeMutable = FALSE;
1886
1887 return *this;
1888 }
1889
1890 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1891 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1892 return *this;
1893 }*/
1894
reset(int64_t position,UErrorCode & status)1895 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1896 if (U_FAILURE(status)) {
1897 return *this;
1898 }
1899 reset(); // Reset also resets the region to be the entire string.
1900
1901 if (position < 0 || position > fActiveLimit) {
1902 status = U_INDEX_OUTOFBOUNDS_ERROR;
1903 return *this;
1904 }
1905 fMatchEnd = position;
1906 return *this;
1907 }
1908
1909
1910 //--------------------------------------------------------------------------------
1911 //
1912 // refresh
1913 //
1914 //--------------------------------------------------------------------------------
refreshInputText(UText * input,UErrorCode & status)1915 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1916 if (U_FAILURE(status)) {
1917 return *this;
1918 }
1919 if (input == NULL) {
1920 status = U_ILLEGAL_ARGUMENT_ERROR;
1921 return *this;
1922 }
1923 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1924 status = U_ILLEGAL_ARGUMENT_ERROR;
1925 return *this;
1926 }
1927 int64_t pos = utext_getNativeIndex(fInputText);
1928 // Shallow read-only clone of the new UText into the existing input UText
1929 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1930 if (U_FAILURE(status)) {
1931 return *this;
1932 }
1933 utext_setNativeIndex(fInputText, pos);
1934
1935 if (fAltInputText != NULL) {
1936 pos = utext_getNativeIndex(fAltInputText);
1937 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1938 if (U_FAILURE(status)) {
1939 return *this;
1940 }
1941 utext_setNativeIndex(fAltInputText, pos);
1942 }
1943 return *this;
1944 }
1945
1946
1947
1948 //--------------------------------------------------------------------------------
1949 //
1950 // setTrace
1951 //
1952 //--------------------------------------------------------------------------------
setTrace(UBool state)1953 void RegexMatcher::setTrace(UBool state) {
1954 fTraceDebug = state;
1955 }
1956
1957
1958
1959 /**
1960 * UText, replace entire contents of the destination UText with a substring of the source UText.
1961 *
1962 * @param src The source UText
1963 * @param dest The destination UText. Must be writable.
1964 * May be NULL, in which case a new UText will be allocated.
1965 * @param start Start index of source substring.
1966 * @param limit Limit index of source substring.
1967 * @param status An error code.
1968 */
utext_extract_replace(UText * src,UText * dest,int64_t start,int64_t limit,UErrorCode * status)1969 static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
1970 if (U_FAILURE(*status)) {
1971 return dest;
1972 }
1973 if (start == limit) {
1974 if (dest) {
1975 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
1976 return dest;
1977 } else {
1978 return utext_openUChars(NULL, NULL, 0, status);
1979 }
1980 }
1981 int32_t length = utext_extract(src, start, limit, NULL, 0, status);
1982 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
1983 return dest;
1984 }
1985 *status = U_ZERO_ERROR;
1986 MaybeStackArray<UChar, 40> buffer;
1987 if (length >= buffer.getCapacity()) {
1988 UChar *newBuf = buffer.resize(length+1); // Leave space for terminating Nul.
1989 if (newBuf == NULL) {
1990 *status = U_MEMORY_ALLOCATION_ERROR;
1991 }
1992 }
1993 utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
1994 if (dest) {
1995 utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
1996 return dest;
1997 }
1998
1999 // Caller did not provide a prexisting UText.
2000 // Open a new one, and have it adopt the text buffer storage.
2001 if (U_FAILURE(*status)) {
2002 return NULL;
2003 }
2004 int32_t ownedLength = 0;
2005 UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
2006 if (ownedBuf == NULL) {
2007 *status = U_MEMORY_ALLOCATION_ERROR;
2008 return NULL;
2009 }
2010 UText *result = utext_openUChars(NULL, ownedBuf, length, status);
2011 if (U_FAILURE(*status)) {
2012 uprv_free(ownedBuf);
2013 return NULL;
2014 }
2015 result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
2016 return result;
2017 }
2018
2019
2020 //---------------------------------------------------------------------
2021 //
2022 // split
2023 //
2024 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status)2025 int32_t RegexMatcher::split(const UnicodeString &input,
2026 UnicodeString dest[],
2027 int32_t destCapacity,
2028 UErrorCode &status)
2029 {
2030 UText inputText = UTEXT_INITIALIZER;
2031 utext_openConstUnicodeString(&inputText, &input, &status);
2032 if (U_FAILURE(status)) {
2033 return 0;
2034 }
2035
2036 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2037 if (destText == NULL) {
2038 status = U_MEMORY_ALLOCATION_ERROR;
2039 return 0;
2040 }
2041 int32_t i;
2042 for (i = 0; i < destCapacity; i++) {
2043 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2044 }
2045
2046 int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2047
2048 for (i = 0; i < destCapacity; i++) {
2049 utext_close(destText[i]);
2050 }
2051
2052 uprv_free(destText);
2053 utext_close(&inputText);
2054 return fieldCount;
2055 }
2056
2057 //
2058 // split, UText mode
2059 //
split(UText * input,UText * dest[],int32_t destCapacity,UErrorCode & status)2060 int32_t RegexMatcher::split(UText *input,
2061 UText *dest[],
2062 int32_t destCapacity,
2063 UErrorCode &status)
2064 {
2065 //
2066 // Check arguements for validity
2067 //
2068 if (U_FAILURE(status)) {
2069 return 0;
2070 };
2071
2072 if (destCapacity < 1) {
2073 status = U_ILLEGAL_ARGUMENT_ERROR;
2074 return 0;
2075 }
2076
2077 //
2078 // Reset for the input text
2079 //
2080 reset(input);
2081 int64_t nextOutputStringStart = 0;
2082 if (fActiveLimit == 0) {
2083 return 0;
2084 }
2085
2086 //
2087 // Loop through the input text, searching for the delimiter pattern
2088 //
2089 int32_t i;
2090 int32_t numCaptureGroups = fPattern->fGroupMap->size();
2091 for (i=0; ; i++) {
2092 if (i>=destCapacity-1) {
2093 // There is one or zero output string left.
2094 // Fill the last output string with whatever is left from the input, then exit the loop.
2095 // ( i will be == destCapacity if we filled the output array while processing
2096 // capture groups of the delimiter expression, in which case we will discard the
2097 // last capture group saved in favor of the unprocessed remainder of the
2098 // input string.)
2099 i = destCapacity-1;
2100 if (fActiveLimit > nextOutputStringStart) {
2101 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2102 if (dest[i]) {
2103 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2104 input->chunkContents+nextOutputStringStart,
2105 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2106 } else {
2107 UText remainingText = UTEXT_INITIALIZER;
2108 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2109 fActiveLimit-nextOutputStringStart, &status);
2110 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2111 utext_close(&remainingText);
2112 }
2113 } else {
2114 UErrorCode lengthStatus = U_ZERO_ERROR;
2115 int32_t remaining16Length =
2116 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2117 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2118 if (remainingChars == NULL) {
2119 status = U_MEMORY_ALLOCATION_ERROR;
2120 break;
2121 }
2122
2123 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2124 if (dest[i]) {
2125 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2126 } else {
2127 UText remainingText = UTEXT_INITIALIZER;
2128 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2129 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2130 utext_close(&remainingText);
2131 }
2132
2133 uprv_free(remainingChars);
2134 }
2135 }
2136 break;
2137 }
2138 if (find()) {
2139 // We found another delimiter. Move everything from where we started looking
2140 // up until the start of the delimiter into the next output string.
2141 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2142 if (dest[i]) {
2143 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2144 input->chunkContents+nextOutputStringStart,
2145 (int32_t)(fMatchStart-nextOutputStringStart), &status);
2146 } else {
2147 UText remainingText = UTEXT_INITIALIZER;
2148 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2149 fMatchStart-nextOutputStringStart, &status);
2150 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2151 utext_close(&remainingText);
2152 }
2153 } else {
2154 UErrorCode lengthStatus = U_ZERO_ERROR;
2155 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2156 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2157 if (remainingChars == NULL) {
2158 status = U_MEMORY_ALLOCATION_ERROR;
2159 break;
2160 }
2161 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2162 if (dest[i]) {
2163 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2164 } else {
2165 UText remainingText = UTEXT_INITIALIZER;
2166 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2167 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2168 utext_close(&remainingText);
2169 }
2170
2171 uprv_free(remainingChars);
2172 }
2173 nextOutputStringStart = fMatchEnd;
2174
2175 // If the delimiter pattern has capturing parentheses, the captured
2176 // text goes out into the next n destination strings.
2177 int32_t groupNum;
2178 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2179 if (i >= destCapacity-2) {
2180 // Never fill the last available output string with capture group text.
2181 // It will filled with the last field, the remainder of the
2182 // unsplit input text.
2183 break;
2184 }
2185 i++;
2186 dest[i] = utext_extract_replace(fInputText, dest[i],
2187 start64(groupNum, status), end64(groupNum, status), &status);
2188 }
2189
2190 if (nextOutputStringStart == fActiveLimit) {
2191 // The delimiter was at the end of the string. We're done, but first
2192 // we output one last empty string, for the empty field following
2193 // the delimiter at the end of input.
2194 if (i+1 < destCapacity) {
2195 ++i;
2196 if (dest[i] == NULL) {
2197 dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2198 } else {
2199 static UChar emptyString[] = {(UChar)0};
2200 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2201 }
2202 }
2203 break;
2204
2205 }
2206 }
2207 else
2208 {
2209 // We ran off the end of the input while looking for the next delimiter.
2210 // All the remaining text goes into the current output string.
2211 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2212 if (dest[i]) {
2213 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2214 input->chunkContents+nextOutputStringStart,
2215 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2216 } else {
2217 UText remainingText = UTEXT_INITIALIZER;
2218 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2219 fActiveLimit-nextOutputStringStart, &status);
2220 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2221 utext_close(&remainingText);
2222 }
2223 } else {
2224 UErrorCode lengthStatus = U_ZERO_ERROR;
2225 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2226 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2227 if (remainingChars == NULL) {
2228 status = U_MEMORY_ALLOCATION_ERROR;
2229 break;
2230 }
2231
2232 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2233 if (dest[i]) {
2234 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2235 } else {
2236 UText remainingText = UTEXT_INITIALIZER;
2237 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2238 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2239 utext_close(&remainingText);
2240 }
2241
2242 uprv_free(remainingChars);
2243 }
2244 break;
2245 }
2246 if (U_FAILURE(status)) {
2247 break;
2248 }
2249 } // end of for loop
2250 return i+1;
2251 }
2252
2253
2254 //--------------------------------------------------------------------------------
2255 //
2256 // start
2257 //
2258 //--------------------------------------------------------------------------------
start(UErrorCode & status) const2259 int32_t RegexMatcher::start(UErrorCode &status) const {
2260 return start(0, status);
2261 }
2262
start64(UErrorCode & status) const2263 int64_t RegexMatcher::start64(UErrorCode &status) const {
2264 return start64(0, status);
2265 }
2266
2267 //--------------------------------------------------------------------------------
2268 //
2269 // start(int32_t group, UErrorCode &status)
2270 //
2271 //--------------------------------------------------------------------------------
2272
start64(int32_t group,UErrorCode & status) const2273 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2274 if (U_FAILURE(status)) {
2275 return -1;
2276 }
2277 if (U_FAILURE(fDeferredStatus)) {
2278 status = fDeferredStatus;
2279 return -1;
2280 }
2281 if (fMatch == FALSE) {
2282 status = U_REGEX_INVALID_STATE;
2283 return -1;
2284 }
2285 if (group < 0 || group > fPattern->fGroupMap->size()) {
2286 status = U_INDEX_OUTOFBOUNDS_ERROR;
2287 return -1;
2288 }
2289 int64_t s;
2290 if (group == 0) {
2291 s = fMatchStart;
2292 } else {
2293 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2294 U_ASSERT(groupOffset < fPattern->fFrameSize);
2295 U_ASSERT(groupOffset >= 0);
2296 s = fFrame->fExtra[groupOffset];
2297 }
2298
2299 return s;
2300 }
2301
2302
start(int32_t group,UErrorCode & status) const2303 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2304 return (int32_t)start64(group, status);
2305 }
2306
2307 //--------------------------------------------------------------------------------
2308 //
2309 // useAnchoringBounds
2310 //
2311 //--------------------------------------------------------------------------------
useAnchoringBounds(UBool b)2312 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2313 fAnchoringBounds = b;
2314 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2315 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2316 return *this;
2317 }
2318
2319
2320 //--------------------------------------------------------------------------------
2321 //
2322 // useTransparentBounds
2323 //
2324 //--------------------------------------------------------------------------------
useTransparentBounds(UBool b)2325 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2326 fTransparentBounds = b;
2327 fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2328 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2329 return *this;
2330 }
2331
2332 //--------------------------------------------------------------------------------
2333 //
2334 // setTimeLimit
2335 //
2336 //--------------------------------------------------------------------------------
setTimeLimit(int32_t limit,UErrorCode & status)2337 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2338 if (U_FAILURE(status)) {
2339 return;
2340 }
2341 if (U_FAILURE(fDeferredStatus)) {
2342 status = fDeferredStatus;
2343 return;
2344 }
2345 if (limit < 0) {
2346 status = U_ILLEGAL_ARGUMENT_ERROR;
2347 return;
2348 }
2349 fTimeLimit = limit;
2350 }
2351
2352
2353 //--------------------------------------------------------------------------------
2354 //
2355 // getTimeLimit
2356 //
2357 //--------------------------------------------------------------------------------
getTimeLimit() const2358 int32_t RegexMatcher::getTimeLimit() const {
2359 return fTimeLimit;
2360 }
2361
2362
2363 //--------------------------------------------------------------------------------
2364 //
2365 // setStackLimit
2366 //
2367 //--------------------------------------------------------------------------------
setStackLimit(int32_t limit,UErrorCode & status)2368 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2369 if (U_FAILURE(status)) {
2370 return;
2371 }
2372 if (U_FAILURE(fDeferredStatus)) {
2373 status = fDeferredStatus;
2374 return;
2375 }
2376 if (limit < 0) {
2377 status = U_ILLEGAL_ARGUMENT_ERROR;
2378 return;
2379 }
2380
2381 // Reset the matcher. This is needed here in case there is a current match
2382 // whose final stack frame (containing the match results, pointed to by fFrame)
2383 // would be lost by resizing to a smaller stack size.
2384 reset();
2385
2386 if (limit == 0) {
2387 // Unlimited stack expansion
2388 fStack->setMaxCapacity(0);
2389 } else {
2390 // Change the units of the limit from bytes to ints, and bump the size up
2391 // to be big enough to hold at least one stack frame for the pattern,
2392 // if it isn't there already.
2393 int32_t adjustedLimit = limit / sizeof(int32_t);
2394 if (adjustedLimit < fPattern->fFrameSize) {
2395 adjustedLimit = fPattern->fFrameSize;
2396 }
2397 fStack->setMaxCapacity(adjustedLimit);
2398 }
2399 fStackLimit = limit;
2400 }
2401
2402
2403 //--------------------------------------------------------------------------------
2404 //
2405 // getStackLimit
2406 //
2407 //--------------------------------------------------------------------------------
getStackLimit() const2408 int32_t RegexMatcher::getStackLimit() const {
2409 return fStackLimit;
2410 }
2411
2412
2413 //--------------------------------------------------------------------------------
2414 //
2415 // setMatchCallback
2416 //
2417 //--------------------------------------------------------------------------------
setMatchCallback(URegexMatchCallback * callback,const void * context,UErrorCode & status)2418 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
2419 const void *context,
2420 UErrorCode &status) {
2421 if (U_FAILURE(status)) {
2422 return;
2423 }
2424 fCallbackFn = callback;
2425 fCallbackContext = context;
2426 }
2427
2428
2429 //--------------------------------------------------------------------------------
2430 //
2431 // getMatchCallback
2432 //
2433 //--------------------------------------------------------------------------------
getMatchCallback(URegexMatchCallback * & callback,const void * & context,UErrorCode & status)2434 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
2435 const void *&context,
2436 UErrorCode &status) {
2437 if (U_FAILURE(status)) {
2438 return;
2439 }
2440 callback = fCallbackFn;
2441 context = fCallbackContext;
2442 }
2443
2444
2445 //--------------------------------------------------------------------------------
2446 //
2447 // setMatchCallback
2448 //
2449 //--------------------------------------------------------------------------------
setFindProgressCallback(URegexFindProgressCallback * callback,const void * context,UErrorCode & status)2450 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
2451 const void *context,
2452 UErrorCode &status) {
2453 if (U_FAILURE(status)) {
2454 return;
2455 }
2456 fFindProgressCallbackFn = callback;
2457 fFindProgressCallbackContext = context;
2458 }
2459
2460
2461 //--------------------------------------------------------------------------------
2462 //
2463 // getMatchCallback
2464 //
2465 //--------------------------------------------------------------------------------
getFindProgressCallback(URegexFindProgressCallback * & callback,const void * & context,UErrorCode & status)2466 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
2467 const void *&context,
2468 UErrorCode &status) {
2469 if (U_FAILURE(status)) {
2470 return;
2471 }
2472 callback = fFindProgressCallbackFn;
2473 context = fFindProgressCallbackContext;
2474 }
2475
2476
2477 //================================================================================
2478 //
2479 // Code following this point in this file is the internal
2480 // Match Engine Implementation.
2481 //
2482 //================================================================================
2483
2484
2485 //--------------------------------------------------------------------------------
2486 //
2487 // resetStack
2488 // Discard any previous contents of the state save stack, and initialize a
2489 // new stack frame to all -1. The -1s are needed for capture group limits,
2490 // where they indicate that a group has not yet matched anything.
2491 //--------------------------------------------------------------------------------
resetStack()2492 REStackFrame *RegexMatcher::resetStack() {
2493 // Discard any previous contents of the state save stack, and initialize a
2494 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2495 // where they indicate that a group has not yet matched anything.
2496 fStack->removeAllElements();
2497
2498 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2499 if(U_FAILURE(fDeferredStatus)) {
2500 return NULL;
2501 }
2502
2503 int32_t i;
2504 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2505 iFrame->fExtra[i] = -1;
2506 }
2507 return iFrame;
2508 }
2509
2510
2511
2512 //--------------------------------------------------------------------------------
2513 //
2514 // isWordBoundary
2515 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
2516 // For us,
2517 // If the current char is a combining mark,
2518 // \b is FALSE.
2519 // Else Scan backwards to the first non-combining char.
2520 // We are at a boundary if the this char and the original chars are
2521 // opposite in membership in \w set
2522 //
2523 // parameters: pos - the current position in the input buffer
2524 //
2525 // TODO: double-check edge cases at region boundaries.
2526 //
2527 //--------------------------------------------------------------------------------
isWordBoundary(int64_t pos)2528 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2529 UBool isBoundary = FALSE;
2530 UBool cIsWord = FALSE;
2531
2532 if (pos >= fLookLimit) {
2533 fHitEnd = TRUE;
2534 } else {
2535 // Determine whether char c at current position is a member of the word set of chars.
2536 // If we're off the end of the string, behave as though we're not at a word char.
2537 UTEXT_SETNATIVEINDEX(fInputText, pos);
2538 UChar32 c = UTEXT_CURRENT32(fInputText);
2539 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2540 // Current char is a combining one. Not a boundary.
2541 return FALSE;
2542 }
2543 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2544 }
2545
2546 // Back up until we come to a non-combining char, determine whether
2547 // that char is a word char.
2548 UBool prevCIsWord = FALSE;
2549 for (;;) {
2550 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2551 break;
2552 }
2553 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2554 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2555 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2556 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2557 break;
2558 }
2559 }
2560 isBoundary = cIsWord ^ prevCIsWord;
2561 return isBoundary;
2562 }
2563
isChunkWordBoundary(int32_t pos)2564 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2565 UBool isBoundary = FALSE;
2566 UBool cIsWord = FALSE;
2567
2568 const UChar *inputBuf = fInputText->chunkContents;
2569
2570 if (pos >= fLookLimit) {
2571 fHitEnd = TRUE;
2572 } else {
2573 // Determine whether char c at current position is a member of the word set of chars.
2574 // If we're off the end of the string, behave as though we're not at a word char.
2575 UChar32 c;
2576 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2577 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2578 // Current char is a combining one. Not a boundary.
2579 return FALSE;
2580 }
2581 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2582 }
2583
2584 // Back up until we come to a non-combining char, determine whether
2585 // that char is a word char.
2586 UBool prevCIsWord = FALSE;
2587 for (;;) {
2588 if (pos <= fLookStart) {
2589 break;
2590 }
2591 UChar32 prevChar;
2592 U16_PREV(inputBuf, fLookStart, pos, prevChar);
2593 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2594 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2595 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2596 break;
2597 }
2598 }
2599 isBoundary = cIsWord ^ prevCIsWord;
2600 return isBoundary;
2601 }
2602
2603 //--------------------------------------------------------------------------------
2604 //
2605 // isUWordBoundary
2606 //
2607 // Test for a word boundary using RBBI word break.
2608 //
2609 // parameters: pos - the current position in the input buffer
2610 //
2611 //--------------------------------------------------------------------------------
isUWordBoundary(int64_t pos)2612 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2613 UBool returnVal = FALSE;
2614 #if UCONFIG_NO_BREAK_ITERATION==0
2615
2616 // If we haven't yet created a break iterator for this matcher, do it now.
2617 if (fWordBreakItr == NULL) {
2618 fWordBreakItr =
2619 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2620 if (U_FAILURE(fDeferredStatus)) {
2621 return FALSE;
2622 }
2623 fWordBreakItr->setText(fInputText, fDeferredStatus);
2624 }
2625
2626 if (pos >= fLookLimit) {
2627 fHitEnd = TRUE;
2628 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
2629 // words are not boundaries. All non-word chars stand by themselves,
2630 // with word boundaries on both sides.
2631 } else {
2632 if (!UTEXT_USES_U16(fInputText)) {
2633 // !!!: Would like a better way to do this!
2634 UErrorCode status = U_ZERO_ERROR;
2635 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2636 }
2637 returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2638 }
2639 #endif
2640 return returnVal;
2641 }
2642
2643 //--------------------------------------------------------------------------------
2644 //
2645 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2646 // saves. Increment the "time" counter, and call the
2647 // user callback function if there is one installed.
2648 //
2649 // If the match operation needs to be aborted, either for a time-out
2650 // or because the user callback asked for it, just set an error status.
2651 // The engine will pick that up and stop in its outer loop.
2652 //
2653 //--------------------------------------------------------------------------------
IncrementTime(UErrorCode & status)2654 void RegexMatcher::IncrementTime(UErrorCode &status) {
2655 fTickCounter = TIMER_INITIAL_VALUE;
2656 fTime++;
2657 if (fCallbackFn != NULL) {
2658 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2659 status = U_REGEX_STOPPED_BY_CALLER;
2660 return;
2661 }
2662 }
2663 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2664 status = U_REGEX_TIME_OUT;
2665 }
2666 }
2667
2668 //--------------------------------------------------------------------------------
2669 //
2670 // StateSave
2671 // Make a new stack frame, initialized as a copy of the current stack frame.
2672 // Set the pattern index in the original stack frame from the operand value
2673 // in the opcode. Execution of the engine continues with the state in
2674 // the newly created stack frame
2675 //
2676 // Note that reserveBlock() may grow the stack, resulting in the
2677 // whole thing being relocated in memory.
2678 //
2679 // Parameters:
2680 // fp The top frame pointer when called. At return, a new
2681 // fame will be present
2682 // savePatIdx An index into the compiled pattern. Goes into the original
2683 // (not new) frame. If execution ever back-tracks out of the
2684 // new frame, this will be where we continue from in the pattern.
2685 // Return
2686 // The new frame pointer.
2687 //
2688 //--------------------------------------------------------------------------------
StateSave(REStackFrame * fp,int64_t savePatIdx,UErrorCode & status)2689 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2690 if (U_FAILURE(status)) {
2691 return fp;
2692 }
2693 // push storage for a new frame.
2694 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2695 if (U_FAILURE(status)) {
2696 // Failure on attempted stack expansion.
2697 // Stack function set some other error code, change it to a more
2698 // specific one for regular expressions.
2699 status = U_REGEX_STACK_OVERFLOW;
2700 // We need to return a writable stack frame, so just return the
2701 // previous frame. The match operation will stop quickly
2702 // because of the error status, after which the frame will never
2703 // be looked at again.
2704 return fp;
2705 }
2706 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
2707
2708 // New stack frame = copy of old top frame.
2709 int64_t *source = (int64_t *)fp;
2710 int64_t *dest = newFP;
2711 for (;;) {
2712 *dest++ = *source++;
2713 if (source == newFP) {
2714 break;
2715 }
2716 }
2717
2718 fTickCounter--;
2719 if (fTickCounter <= 0) {
2720 IncrementTime(status); // Re-initializes fTickCounter
2721 }
2722 fp->fPatIdx = savePatIdx;
2723 return (REStackFrame *)newFP;
2724 }
2725
2726
2727 //--------------------------------------------------------------------------------
2728 //
2729 // MatchAt This is the actual matching engine.
2730 //
2731 // startIdx: begin matching a this index.
2732 // toEnd: if true, match must extend to end of the input region
2733 //
2734 //--------------------------------------------------------------------------------
MatchAt(int64_t startIdx,UBool toEnd,UErrorCode & status)2735 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2736 UBool isMatch = FALSE; // True if the we have a match.
2737
2738 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2739
2740 int32_t op; // Operation from the compiled pattern, split into
2741 int32_t opType; // the opcode
2742 int32_t opValue; // and the operand value.
2743
2744 #ifdef REGEX_RUN_DEBUG
2745 if (fTraceDebug)
2746 {
2747 printf("MatchAt(startIdx=%ld)\n", startIdx);
2748 printf("Original Pattern: ");
2749 UChar32 c = utext_next32From(fPattern->fPattern, 0);
2750 while (c != U_SENTINEL) {
2751 if (c<32 || c>256) {
2752 c = '.';
2753 }
2754 printf("%c", c);
2755
2756 c = UTEXT_NEXT32(fPattern->fPattern);
2757 }
2758 printf("\n");
2759 printf("Input String: ");
2760 c = utext_next32From(fInputText, 0);
2761 while (c != U_SENTINEL) {
2762 if (c<32 || c>256) {
2763 c = '.';
2764 }
2765 printf("%c", c);
2766
2767 c = UTEXT_NEXT32(fInputText);
2768 }
2769 printf("\n");
2770 printf("\n");
2771 }
2772 #endif
2773
2774 if (U_FAILURE(status)) {
2775 return;
2776 }
2777
2778 // Cache frequently referenced items from the compiled pattern
2779 //
2780 int64_t *pat = fPattern->fCompiledPat->getBuffer();
2781
2782 const UChar *litText = fPattern->fLiteralText.getBuffer();
2783 UVector *sets = fPattern->fSets;
2784
2785 fFrameSize = fPattern->fFrameSize;
2786 REStackFrame *fp = resetStack();
2787 if (U_FAILURE(fDeferredStatus)) {
2788 status = fDeferredStatus;
2789 return;
2790 }
2791
2792 fp->fPatIdx = 0;
2793 fp->fInputIdx = startIdx;
2794
2795 // Zero out the pattern's static data
2796 int32_t i;
2797 for (i = 0; i<fPattern->fDataSize; i++) {
2798 fData[i] = 0;
2799 }
2800
2801 //
2802 // Main loop for interpreting the compiled pattern.
2803 // One iteration of the loop per pattern operation performed.
2804 //
2805 for (;;) {
2806 op = (int32_t)pat[fp->fPatIdx];
2807 opType = URX_TYPE(op);
2808 opValue = URX_VAL(op);
2809 #ifdef REGEX_RUN_DEBUG
2810 if (fTraceDebug) {
2811 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2812 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
2813 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2814 fPattern->dumpOp(fp->fPatIdx);
2815 }
2816 #endif
2817 fp->fPatIdx++;
2818
2819 switch (opType) {
2820
2821
2822 case URX_NOP:
2823 break;
2824
2825
2826 case URX_BACKTRACK:
2827 // Force a backtrack. In some circumstances, the pattern compiler
2828 // will notice that the pattern can't possibly match anything, and will
2829 // emit one of these at that point.
2830 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2831 break;
2832
2833
2834 case URX_ONECHAR:
2835 if (fp->fInputIdx < fActiveLimit) {
2836 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2837 UChar32 c = UTEXT_NEXT32(fInputText);
2838 if (c == opValue) {
2839 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2840 break;
2841 }
2842 } else {
2843 fHitEnd = TRUE;
2844 }
2845 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2846 break;
2847
2848
2849 case URX_STRING:
2850 {
2851 // Test input against a literal string.
2852 // Strings require two slots in the compiled pattern, one for the
2853 // offset to the string text, and one for the length.
2854
2855 int32_t stringStartIdx = opValue;
2856 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
2857 fp->fPatIdx++;
2858 opType = URX_TYPE(op);
2859 int32_t stringLen = URX_VAL(op);
2860 U_ASSERT(opType == URX_STRING_LEN);
2861 U_ASSERT(stringLen >= 2);
2862
2863 const UChar *patternString = litText+stringStartIdx;
2864 int32_t patternStringIndex = 0;
2865 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2866 UChar32 inputChar;
2867 UChar32 patternChar;
2868 UBool success = TRUE;
2869 while (patternStringIndex < stringLen) {
2870 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2871 success = FALSE;
2872 fHitEnd = TRUE;
2873 break;
2874 }
2875 inputChar = UTEXT_NEXT32(fInputText);
2876 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2877 if (patternChar != inputChar) {
2878 success = FALSE;
2879 break;
2880 }
2881 }
2882
2883 if (success) {
2884 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2885 } else {
2886 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2887 }
2888 }
2889 break;
2890
2891
2892 case URX_STATE_SAVE:
2893 fp = StateSave(fp, opValue, status);
2894 break;
2895
2896
2897 case URX_END:
2898 // The match loop will exit via this path on a successful match,
2899 // when we reach the end of the pattern.
2900 if (toEnd && fp->fInputIdx != fActiveLimit) {
2901 // The pattern matched, but not to the end of input. Try some more.
2902 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2903 break;
2904 }
2905 isMatch = TRUE;
2906 goto breakFromLoop;
2907
2908 // Start and End Capture stack frame variables are laid out out like this:
2909 // fp->fExtra[opValue] - The start of a completed capture group
2910 // opValue+1 - The end of a completed capture group
2911 // opValue+2 - the start of a capture group whose end
2912 // has not yet been reached (and might not ever be).
2913 case URX_START_CAPTURE:
2914 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2915 fp->fExtra[opValue+2] = fp->fInputIdx;
2916 break;
2917
2918
2919 case URX_END_CAPTURE:
2920 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2921 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
2922 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
2923 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
2924 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2925 break;
2926
2927
2928 case URX_DOLLAR: // $, test for End of line
2929 // or for position before new line at end of input
2930 {
2931 if (fp->fInputIdx >= fAnchorLimit) {
2932 // We really are at the end of input. Success.
2933 fHitEnd = TRUE;
2934 fRequireEnd = TRUE;
2935 break;
2936 }
2937
2938 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2939
2940 // If we are positioned just before a new-line that is located at the
2941 // end of input, succeed.
2942 UChar32 c = UTEXT_NEXT32(fInputText);
2943 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2944 if (isLineTerminator(c)) {
2945 // If not in the middle of a CR/LF sequence
2946 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2947 // At new-line at end of input. Success
2948 fHitEnd = TRUE;
2949 fRequireEnd = TRUE;
2950
2951 break;
2952 }
2953 }
2954 } else {
2955 UChar32 nextC = UTEXT_NEXT32(fInputText);
2956 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2957 fHitEnd = TRUE;
2958 fRequireEnd = TRUE;
2959 break; // At CR/LF at end of input. Success
2960 }
2961 }
2962
2963 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2964 }
2965 break;
2966
2967
2968 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
2969 if (fp->fInputIdx >= fAnchorLimit) {
2970 // Off the end of input. Success.
2971 fHitEnd = TRUE;
2972 fRequireEnd = TRUE;
2973 break;
2974 } else {
2975 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2976 UChar32 c = UTEXT_NEXT32(fInputText);
2977 // Either at the last character of input, or off the end.
2978 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
2979 fHitEnd = TRUE;
2980 fRequireEnd = TRUE;
2981 break;
2982 }
2983 }
2984
2985 // Not at end of input. Back-track out.
2986 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2987 break;
2988
2989
2990 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
2991 {
2992 if (fp->fInputIdx >= fAnchorLimit) {
2993 // We really are at the end of input. Success.
2994 fHitEnd = TRUE;
2995 fRequireEnd = TRUE;
2996 break;
2997 }
2998 // If we are positioned just before a new-line, succeed.
2999 // It makes no difference where the new-line is within the input.
3000 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3001 UChar32 c = UTEXT_CURRENT32(fInputText);
3002 if (isLineTerminator(c)) {
3003 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
3004 // In multi-line mode, hitting a new-line just before the end of input does not
3005 // set the hitEnd or requireEnd flags
3006 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3007 break;
3008 }
3009 }
3010 // not at a new line. Fail.
3011 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3012 }
3013 break;
3014
3015
3016 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
3017 {
3018 if (fp->fInputIdx >= fAnchorLimit) {
3019 // We really are at the end of input. Success.
3020 fHitEnd = TRUE;
3021 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
3022 break; // adding a new-line would not lose the match.
3023 }
3024 // If we are not positioned just before a new-line, the test fails; backtrack out.
3025 // It makes no difference where the new-line is within the input.
3026 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3027 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3028 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3029 }
3030 }
3031 break;
3032
3033
3034 case URX_CARET: // ^, test for start of line
3035 if (fp->fInputIdx != fAnchorStart) {
3036 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3037 }
3038 break;
3039
3040
3041 case URX_CARET_M: // ^, test for start of line in mulit-line mode
3042 {
3043 if (fp->fInputIdx == fAnchorStart) {
3044 // We are at the start input. Success.
3045 break;
3046 }
3047 // Check whether character just before the current pos is a new-line
3048 // unless we are at the end of input
3049 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3050 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3051 if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
3052 // It's a new-line. ^ is true. Success.
3053 // TODO: what should be done with positions between a CR and LF?
3054 break;
3055 }
3056 // Not at the start of a line. Fail.
3057 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3058 }
3059 break;
3060
3061
3062 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
3063 {
3064 U_ASSERT(fp->fInputIdx >= fAnchorStart);
3065 if (fp->fInputIdx <= fAnchorStart) {
3066 // We are at the start input. Success.
3067 break;
3068 }
3069 // Check whether character just before the current pos is a new-line
3070 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3071 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3072 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3073 if (c != 0x0a) {
3074 // Not at the start of a line. Back-track out.
3075 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3076 }
3077 }
3078 break;
3079
3080 case URX_BACKSLASH_B: // Test for word boundaries
3081 {
3082 UBool success = isWordBoundary(fp->fInputIdx);
3083 success ^= (UBool)(opValue != 0); // flip sense for \B
3084 if (!success) {
3085 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3086 }
3087 }
3088 break;
3089
3090
3091 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
3092 {
3093 UBool success = isUWordBoundary(fp->fInputIdx);
3094 success ^= (UBool)(opValue != 0); // flip sense for \B
3095 if (!success) {
3096 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3097 }
3098 }
3099 break;
3100
3101
3102 case URX_BACKSLASH_D: // Test for decimal digit
3103 {
3104 if (fp->fInputIdx >= fActiveLimit) {
3105 fHitEnd = TRUE;
3106 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3107 break;
3108 }
3109
3110 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3111
3112 UChar32 c = UTEXT_NEXT32(fInputText);
3113 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
3114 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3115 success ^= (UBool)(opValue != 0); // flip sense for \D
3116 if (success) {
3117 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3118 } else {
3119 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3120 }
3121 }
3122 break;
3123
3124
3125 case URX_BACKSLASH_G: // Test for position at end of previous match
3126 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3127 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3128 }
3129 break;
3130
3131
3132 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
3133 {
3134 if (fp->fInputIdx >= fActiveLimit) {
3135 fHitEnd = TRUE;
3136 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3137 break;
3138 }
3139 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3140 UChar32 c = UTEXT_NEXT32(fInputText);
3141 int8_t ctype = u_charType(c);
3142 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
3143 success ^= (UBool)(opValue != 0); // flip sense for \H
3144 if (success) {
3145 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3146 } else {
3147 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3148 }
3149 }
3150 break;
3151
3152
3153 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
3154 {
3155 if (fp->fInputIdx >= fActiveLimit) {
3156 fHitEnd = TRUE;
3157 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3158 break;
3159 }
3160 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3161 UChar32 c = UTEXT_NEXT32(fInputText);
3162 if (isLineTerminator(c)) {
3163 if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
3164 utext_next32(fInputText);
3165 }
3166 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3167 } else {
3168 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3169 }
3170 }
3171 break;
3172
3173
3174 case URX_BACKSLASH_V: // \v, any single line ending character.
3175 {
3176 if (fp->fInputIdx >= fActiveLimit) {
3177 fHitEnd = TRUE;
3178 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3179 break;
3180 }
3181 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3182 UChar32 c = UTEXT_NEXT32(fInputText);
3183 UBool success = isLineTerminator(c);
3184 success ^= (UBool)(opValue != 0); // flip sense for \V
3185 if (success) {
3186 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3187 } else {
3188 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3189 }
3190 }
3191 break;
3192
3193
3194 case URX_BACKSLASH_X:
3195 // Match a Grapheme, as defined by Unicode TR 29.
3196 // Differs slightly from Perl, which consumes combining marks independently
3197 // of context.
3198 {
3199
3200 // Fail if at end of input
3201 if (fp->fInputIdx >= fActiveLimit) {
3202 fHitEnd = TRUE;
3203 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3204 break;
3205 }
3206
3207 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3208
3209 // Examine (and consume) the current char.
3210 // Dispatch into a little state machine, based on the char.
3211 UChar32 c;
3212 c = UTEXT_NEXT32(fInputText);
3213 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3214 UnicodeSet **sets = fPattern->fStaticSets;
3215 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
3216 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3217 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3218 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3219 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3220 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3221 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3222 goto GC_Extend;
3223
3224
3225
3226 GC_L:
3227 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3228 c = UTEXT_NEXT32(fInputText);
3229 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3230 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3231 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3232 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3233 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3234 (void)UTEXT_PREVIOUS32(fInputText);
3235 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3236 goto GC_Extend;
3237
3238 GC_V:
3239 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3240 c = UTEXT_NEXT32(fInputText);
3241 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3242 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3243 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3244 (void)UTEXT_PREVIOUS32(fInputText);
3245 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3246 goto GC_Extend;
3247
3248 GC_T:
3249 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3250 c = UTEXT_NEXT32(fInputText);
3251 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3252 if (sets[URX_GC_T]->contains(c)) goto GC_T;
3253 (void)UTEXT_PREVIOUS32(fInputText);
3254 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3255 goto GC_Extend;
3256
3257 GC_Extend:
3258 // Combining characters are consumed here
3259 for (;;) {
3260 if (fp->fInputIdx >= fActiveLimit) {
3261 break;
3262 }
3263 c = UTEXT_CURRENT32(fInputText);
3264 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3265 break;
3266 }
3267 (void)UTEXT_NEXT32(fInputText);
3268 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3269 }
3270 goto GC_Done;
3271
3272 GC_Control:
3273 // Most control chars stand alone (don't combine with combining chars),
3274 // except for that CR/LF sequence is a single grapheme cluster.
3275 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3276 c = UTEXT_NEXT32(fInputText);
3277 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3278 }
3279
3280 GC_Done:
3281 if (fp->fInputIdx >= fActiveLimit) {
3282 fHitEnd = TRUE;
3283 }
3284 break;
3285 }
3286
3287
3288
3289
3290 case URX_BACKSLASH_Z: // Test for end of Input
3291 if (fp->fInputIdx < fAnchorLimit) {
3292 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3293 } else {
3294 fHitEnd = TRUE;
3295 fRequireEnd = TRUE;
3296 }
3297 break;
3298
3299
3300
3301 case URX_STATIC_SETREF:
3302 {
3303 // Test input character against one of the predefined sets
3304 // (Word Characters, for example)
3305 // The high bit of the op value is a flag for the match polarity.
3306 // 0: success if input char is in set.
3307 // 1: success if input char is not in set.
3308 if (fp->fInputIdx >= fActiveLimit) {
3309 fHitEnd = TRUE;
3310 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3311 break;
3312 }
3313
3314 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3315 opValue &= ~URX_NEG_SET;
3316 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3317
3318 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3319 UChar32 c = UTEXT_NEXT32(fInputText);
3320 if (c < 256) {
3321 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3322 if (s8->contains(c)) {
3323 success = !success;
3324 }
3325 } else {
3326 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3327 if (s->contains(c)) {
3328 success = !success;
3329 }
3330 }
3331 if (success) {
3332 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3333 } else {
3334 // the character wasn't in the set.
3335 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3336 }
3337 }
3338 break;
3339
3340
3341 case URX_STAT_SETREF_N:
3342 {
3343 // Test input character for NOT being a member of one of
3344 // the predefined sets (Word Characters, for example)
3345 if (fp->fInputIdx >= fActiveLimit) {
3346 fHitEnd = TRUE;
3347 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3348 break;
3349 }
3350
3351 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3352
3353 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3354
3355 UChar32 c = UTEXT_NEXT32(fInputText);
3356 if (c < 256) {
3357 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3358 if (s8->contains(c) == FALSE) {
3359 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3360 break;
3361 }
3362 } else {
3363 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3364 if (s->contains(c) == FALSE) {
3365 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3366 break;
3367 }
3368 }
3369 // the character wasn't in the set.
3370 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3371 }
3372 break;
3373
3374
3375 case URX_SETREF:
3376 if (fp->fInputIdx >= fActiveLimit) {
3377 fHitEnd = TRUE;
3378 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3379 break;
3380 } else {
3381 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3382
3383 // There is input left. Pick up one char and test it for set membership.
3384 UChar32 c = UTEXT_NEXT32(fInputText);
3385 U_ASSERT(opValue > 0 && opValue < sets->size());
3386 if (c<256) {
3387 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3388 if (s8->contains(c)) {
3389 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3390 break;
3391 }
3392 } else {
3393 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3394 if (s->contains(c)) {
3395 // The character is in the set. A Match.
3396 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3397 break;
3398 }
3399 }
3400
3401 // the character wasn't in the set.
3402 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3403 }
3404 break;
3405
3406
3407 case URX_DOTANY:
3408 {
3409 // . matches anything, but stops at end-of-line.
3410 if (fp->fInputIdx >= fActiveLimit) {
3411 // At end of input. Match failed. Backtrack out.
3412 fHitEnd = TRUE;
3413 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3414 break;
3415 }
3416
3417 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3418
3419 // There is input left. Advance over one char, unless we've hit end-of-line
3420 UChar32 c = UTEXT_NEXT32(fInputText);
3421 if (isLineTerminator(c)) {
3422 // End of line in normal mode. . does not match.
3423 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3424 break;
3425 }
3426 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3427 }
3428 break;
3429
3430
3431 case URX_DOTANY_ALL:
3432 {
3433 // ., in dot-matches-all (including new lines) mode
3434 if (fp->fInputIdx >= fActiveLimit) {
3435 // At end of input. Match failed. Backtrack out.
3436 fHitEnd = TRUE;
3437 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3438 break;
3439 }
3440
3441 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3442
3443 // There is input left. Advance over one char, except if we are
3444 // at a cr/lf, advance over both of them.
3445 UChar32 c;
3446 c = UTEXT_NEXT32(fInputText);
3447 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3448 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3449 // In the case of a CR/LF, we need to advance over both.
3450 UChar32 nextc = UTEXT_CURRENT32(fInputText);
3451 if (nextc == 0x0a) {
3452 (void)UTEXT_NEXT32(fInputText);
3453 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3454 }
3455 }
3456 }
3457 break;
3458
3459
3460 case URX_DOTANY_UNIX:
3461 {
3462 // '.' operator, matches all, but stops at end-of-line.
3463 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3464 if (fp->fInputIdx >= fActiveLimit) {
3465 // At end of input. Match failed. Backtrack out.
3466 fHitEnd = TRUE;
3467 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3468 break;
3469 }
3470
3471 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3472
3473 // There is input left. Advance over one char, unless we've hit end-of-line
3474 UChar32 c = UTEXT_NEXT32(fInputText);
3475 if (c == 0x0a) {
3476 // End of line in normal mode. '.' does not match the \n
3477 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3478 } else {
3479 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3480 }
3481 }
3482 break;
3483
3484
3485 case URX_JMP:
3486 fp->fPatIdx = opValue;
3487 break;
3488
3489 case URX_FAIL:
3490 isMatch = FALSE;
3491 goto breakFromLoop;
3492
3493 case URX_JMP_SAV:
3494 U_ASSERT(opValue < fPattern->fCompiledPat->size());
3495 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3496 fp->fPatIdx = opValue; // Then JMP.
3497 break;
3498
3499 case URX_JMP_SAV_X:
3500 // This opcode is used with (x)+, when x can match a zero length string.
3501 // Same as JMP_SAV, except conditional on the match having made forward progress.
3502 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3503 // data address of the input position at the start of the loop.
3504 {
3505 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3506 int32_t stoOp = (int32_t)pat[opValue-1];
3507 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3508 int32_t frameLoc = URX_VAL(stoOp);
3509 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3510 int64_t prevInputIdx = fp->fExtra[frameLoc];
3511 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3512 if (prevInputIdx < fp->fInputIdx) {
3513 // The match did make progress. Repeat the loop.
3514 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3515 fp->fPatIdx = opValue;
3516 fp->fExtra[frameLoc] = fp->fInputIdx;
3517 }
3518 // If the input position did not advance, we do nothing here,
3519 // execution will fall out of the loop.
3520 }
3521 break;
3522
3523 case URX_CTR_INIT:
3524 {
3525 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3526 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3527
3528 // Pick up the three extra operands that CTR_INIT has, and
3529 // skip the pattern location counter past
3530 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3531 fp->fPatIdx += 3;
3532 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3533 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3534 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3535 U_ASSERT(minCount>=0);
3536 U_ASSERT(maxCount>=minCount || maxCount==-1);
3537 U_ASSERT(loopLoc>=fp->fPatIdx);
3538
3539 if (minCount == 0) {
3540 fp = StateSave(fp, loopLoc+1, status);
3541 }
3542 if (maxCount == -1) {
3543 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
3544 } else if (maxCount == 0) {
3545 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3546 }
3547 }
3548 break;
3549
3550 case URX_CTR_LOOP:
3551 {
3552 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3553 int32_t initOp = (int32_t)pat[opValue];
3554 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3555 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3556 int32_t minCount = (int32_t)pat[opValue+2];
3557 int32_t maxCount = (int32_t)pat[opValue+3];
3558 (*pCounter)++;
3559 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3560 U_ASSERT(*pCounter == maxCount);
3561 break;
3562 }
3563 if (*pCounter >= minCount) {
3564 if (maxCount == -1) {
3565 // Loop has no hard upper bound.
3566 // Check that it is progressing through the input, break if it is not.
3567 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3568 if (fp->fInputIdx == *pLastInputIdx) {
3569 break;
3570 } else {
3571 *pLastInputIdx = fp->fInputIdx;
3572 }
3573 }
3574 fp = StateSave(fp, fp->fPatIdx, status);
3575 }
3576 fp->fPatIdx = opValue + 4; // Loop back.
3577 }
3578 break;
3579
3580 case URX_CTR_INIT_NG:
3581 {
3582 // Initialize a non-greedy loop
3583 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3584 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3585
3586 // Pick up the three extra operands that CTR_INIT_NG has, and
3587 // skip the pattern location counter past
3588 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3589 fp->fPatIdx += 3;
3590 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3591 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3592 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3593 U_ASSERT(minCount>=0);
3594 U_ASSERT(maxCount>=minCount || maxCount==-1);
3595 U_ASSERT(loopLoc>fp->fPatIdx);
3596 if (maxCount == -1) {
3597 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
3598 }
3599
3600 if (minCount == 0) {
3601 if (maxCount != 0) {
3602 fp = StateSave(fp, fp->fPatIdx, status);
3603 }
3604 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
3605 }
3606 }
3607 break;
3608
3609 case URX_CTR_LOOP_NG:
3610 {
3611 // Non-greedy {min, max} loops
3612 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3613 int32_t initOp = (int32_t)pat[opValue];
3614 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3615 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3616 int32_t minCount = (int32_t)pat[opValue+2];
3617 int32_t maxCount = (int32_t)pat[opValue+3];
3618
3619 (*pCounter)++;
3620 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3621 // The loop has matched the maximum permitted number of times.
3622 // Break out of here with no action. Matching will
3623 // continue with the following pattern.
3624 U_ASSERT(*pCounter == maxCount);
3625 break;
3626 }
3627
3628 if (*pCounter < minCount) {
3629 // We haven't met the minimum number of matches yet.
3630 // Loop back for another one.
3631 fp->fPatIdx = opValue + 4; // Loop back.
3632 } else {
3633 // We do have the minimum number of matches.
3634
3635 // If there is no upper bound on the loop iterations, check that the input index
3636 // is progressing, and stop the loop if it is not.
3637 if (maxCount == -1) {
3638 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3639 if (fp->fInputIdx == *pLastInputIdx) {
3640 break;
3641 }
3642 *pLastInputIdx = fp->fInputIdx;
3643 }
3644
3645 // Loop Continuation: we will fall into the pattern following the loop
3646 // (non-greedy, don't execute loop body first), but first do
3647 // a state save to the top of the loop, so that a match failure
3648 // in the following pattern will try another iteration of the loop.
3649 fp = StateSave(fp, opValue + 4, status);
3650 }
3651 }
3652 break;
3653
3654 case URX_STO_SP:
3655 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3656 fData[opValue] = fStack->size();
3657 break;
3658
3659 case URX_LD_SP:
3660 {
3661 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3662 int32_t newStackSize = (int32_t)fData[opValue];
3663 U_ASSERT(newStackSize <= fStack->size());
3664 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3665 if (newFP == (int64_t *)fp) {
3666 break;
3667 }
3668 int32_t i;
3669 for (i=0; i<fFrameSize; i++) {
3670 newFP[i] = ((int64_t *)fp)[i];
3671 }
3672 fp = (REStackFrame *)newFP;
3673 fStack->setSize(newStackSize);
3674 }
3675 break;
3676
3677 case URX_BACKREF:
3678 {
3679 U_ASSERT(opValue < fFrameSize);
3680 int64_t groupStartIdx = fp->fExtra[opValue];
3681 int64_t groupEndIdx = fp->fExtra[opValue+1];
3682 U_ASSERT(groupStartIdx <= groupEndIdx);
3683 if (groupStartIdx < 0) {
3684 // This capture group has not participated in the match thus far,
3685 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3686 break;
3687 }
3688 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3689 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3690
3691 // Note: if the capture group match was of an empty string the backref
3692 // match succeeds. Verified by testing: Perl matches succeed
3693 // in this case, so we do too.
3694
3695 UBool success = TRUE;
3696 for (;;) {
3697 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3698 success = TRUE;
3699 break;
3700 }
3701 if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3702 success = FALSE;
3703 fHitEnd = TRUE;
3704 break;
3705 }
3706 UChar32 captureGroupChar = utext_next32(fAltInputText);
3707 UChar32 inputChar = utext_next32(fInputText);
3708 if (inputChar != captureGroupChar) {
3709 success = FALSE;
3710 break;
3711 }
3712 }
3713
3714 if (success) {
3715 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3716 } else {
3717 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3718 }
3719 }
3720 break;
3721
3722
3723
3724 case URX_BACKREF_I:
3725 {
3726 U_ASSERT(opValue < fFrameSize);
3727 int64_t groupStartIdx = fp->fExtra[opValue];
3728 int64_t groupEndIdx = fp->fExtra[opValue+1];
3729 U_ASSERT(groupStartIdx <= groupEndIdx);
3730 if (groupStartIdx < 0) {
3731 // This capture group has not participated in the match thus far,
3732 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3733 break;
3734 }
3735 utext_setNativeIndex(fAltInputText, groupStartIdx);
3736 utext_setNativeIndex(fInputText, fp->fInputIdx);
3737 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3738 CaseFoldingUTextIterator inputItr(*fInputText);
3739
3740 // Note: if the capture group match was of an empty string the backref
3741 // match succeeds. Verified by testing: Perl matches succeed
3742 // in this case, so we do too.
3743
3744 UBool success = TRUE;
3745 for (;;) {
3746 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3747 success = TRUE;
3748 break;
3749 }
3750 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3751 success = FALSE;
3752 fHitEnd = TRUE;
3753 break;
3754 }
3755 UChar32 captureGroupChar = captureGroupItr.next();
3756 UChar32 inputChar = inputItr.next();
3757 if (inputChar != captureGroupChar) {
3758 success = FALSE;
3759 break;
3760 }
3761 }
3762
3763 if (success && inputItr.inExpansion()) {
3764 // We otained a match by consuming part of a string obtained from
3765 // case-folding a single code point of the input text.
3766 // This does not count as an overall match.
3767 success = FALSE;
3768 }
3769
3770 if (success) {
3771 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3772 } else {
3773 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3774 }
3775
3776 }
3777 break;
3778
3779 case URX_STO_INP_LOC:
3780 {
3781 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3782 fp->fExtra[opValue] = fp->fInputIdx;
3783 }
3784 break;
3785
3786 case URX_JMPX:
3787 {
3788 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3789 fp->fPatIdx += 1;
3790 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
3791 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3792 int64_t savedInputIdx = fp->fExtra[dataLoc];
3793 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3794 if (savedInputIdx < fp->fInputIdx) {
3795 fp->fPatIdx = opValue; // JMP
3796 } else {
3797 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
3798 }
3799 }
3800 break;
3801
3802 case URX_LA_START:
3803 {
3804 // Entering a lookahead block.
3805 // Save Stack Ptr, Input Pos.
3806 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3807 fData[opValue] = fStack->size();
3808 fData[opValue+1] = fp->fInputIdx;
3809 fActiveStart = fLookStart; // Set the match region change for
3810 fActiveLimit = fLookLimit; // transparent bounds.
3811 }
3812 break;
3813
3814 case URX_LA_END:
3815 {
3816 // Leaving a look-ahead block.
3817 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3818 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3819 int32_t stackSize = fStack->size();
3820 int32_t newStackSize =(int32_t)fData[opValue];
3821 U_ASSERT(stackSize >= newStackSize);
3822 if (stackSize > newStackSize) {
3823 // Copy the current top frame back to the new (cut back) top frame.
3824 // This makes the capture groups from within the look-ahead
3825 // expression available.
3826 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3827 int32_t i;
3828 for (i=0; i<fFrameSize; i++) {
3829 newFP[i] = ((int64_t *)fp)[i];
3830 }
3831 fp = (REStackFrame *)newFP;
3832 fStack->setSize(newStackSize);
3833 }
3834 fp->fInputIdx = fData[opValue+1];
3835
3836 // Restore the active region bounds in the input string; they may have
3837 // been changed because of transparent bounds on a Region.
3838 fActiveStart = fRegionStart;
3839 fActiveLimit = fRegionLimit;
3840 }
3841 break;
3842
3843 case URX_ONECHAR_I:
3844 // Case insensitive one char. The char from the pattern is already case folded.
3845 // Input text is not, but case folding the input can not reduce two or more code
3846 // points to one.
3847 if (fp->fInputIdx < fActiveLimit) {
3848 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3849
3850 UChar32 c = UTEXT_NEXT32(fInputText);
3851 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3852 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3853 break;
3854 }
3855 } else {
3856 fHitEnd = TRUE;
3857 }
3858
3859 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3860 break;
3861
3862 case URX_STRING_I:
3863 {
3864 // Case-insensitive test input against a literal string.
3865 // Strings require two slots in the compiled pattern, one for the
3866 // offset to the string text, and one for the length.
3867 // The compiled string has already been case folded.
3868 {
3869 const UChar *patternString = litText + opValue;
3870 int32_t patternStringIdx = 0;
3871
3872 op = (int32_t)pat[fp->fPatIdx];
3873 fp->fPatIdx++;
3874 opType = URX_TYPE(op);
3875 opValue = URX_VAL(op);
3876 U_ASSERT(opType == URX_STRING_LEN);
3877 int32_t patternStringLen = opValue; // Length of the string from the pattern.
3878
3879
3880 UChar32 cPattern;
3881 UChar32 cText;
3882 UBool success = TRUE;
3883
3884 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3885 CaseFoldingUTextIterator inputIterator(*fInputText);
3886 while (patternStringIdx < patternStringLen) {
3887 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3888 success = FALSE;
3889 fHitEnd = TRUE;
3890 break;
3891 }
3892 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3893 cText = inputIterator.next();
3894 if (cText != cPattern) {
3895 success = FALSE;
3896 break;
3897 }
3898 }
3899 if (inputIterator.inExpansion()) {
3900 success = FALSE;
3901 }
3902
3903 if (success) {
3904 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3905 } else {
3906 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3907 }
3908 }
3909 }
3910 break;
3911
3912 case URX_LB_START:
3913 {
3914 // Entering a look-behind block.
3915 // Save Stack Ptr, Input Pos.
3916 // TODO: implement transparent bounds. Ticket #6067
3917 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3918 fData[opValue] = fStack->size();
3919 fData[opValue+1] = fp->fInputIdx;
3920 // Init the variable containing the start index for attempted matches.
3921 fData[opValue+2] = -1;
3922 // Save input string length, then reset to pin any matches to end at
3923 // the current position.
3924 fData[opValue+3] = fActiveLimit;
3925 fActiveLimit = fp->fInputIdx;
3926 }
3927 break;
3928
3929
3930 case URX_LB_CONT:
3931 {
3932 // Positive Look-Behind, at top of loop checking for matches of LB expression
3933 // at all possible input starting positions.
3934
3935 // Fetch the min and max possible match lengths. They are the operands
3936 // of this op in the pattern.
3937 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3938 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3939 U_ASSERT(minML <= maxML);
3940 U_ASSERT(minML >= 0);
3941
3942 // Fetch (from data) the last input index where a match was attempted.
3943 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3944 int64_t *lbStartIdx = &fData[opValue+2];
3945 if (*lbStartIdx < 0) {
3946 // First time through loop.
3947 *lbStartIdx = fp->fInputIdx - minML;
3948 } else {
3949 // 2nd through nth time through the loop.
3950 // Back up start position for match by one.
3951 if (*lbStartIdx == 0) {
3952 (*lbStartIdx)--;
3953 } else {
3954 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
3955 (void)UTEXT_PREVIOUS32(fInputText);
3956 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3957 }
3958 }
3959
3960 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
3961 // We have tried all potential match starting points without
3962 // getting a match. Backtrack out, and out of the
3963 // Look Behind altogether.
3964 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3965 int64_t restoreInputLen = fData[opValue+3];
3966 U_ASSERT(restoreInputLen >= fActiveLimit);
3967 U_ASSERT(restoreInputLen <= fInputLength);
3968 fActiveLimit = restoreInputLen;
3969 break;
3970 }
3971
3972 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3973 // (successful match will fall off the end of the loop.)
3974 fp = StateSave(fp, fp->fPatIdx-3, status);
3975 fp->fInputIdx = *lbStartIdx;
3976 }
3977 break;
3978
3979 case URX_LB_END:
3980 // End of a look-behind block, after a successful match.
3981 {
3982 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3983 if (fp->fInputIdx != fActiveLimit) {
3984 // The look-behind expression matched, but the match did not
3985 // extend all the way to the point that we are looking behind from.
3986 // FAIL out of here, which will take us back to the LB_CONT, which
3987 // will retry the match starting at another position or fail
3988 // the look-behind altogether, whichever is appropriate.
3989 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3990 break;
3991 }
3992
3993 // Look-behind match is good. Restore the orignal input string length,
3994 // which had been truncated to pin the end of the lookbehind match to the
3995 // position being looked-behind.
3996 int64_t originalInputLen = fData[opValue+3];
3997 U_ASSERT(originalInputLen >= fActiveLimit);
3998 U_ASSERT(originalInputLen <= fInputLength);
3999 fActiveLimit = originalInputLen;
4000 }
4001 break;
4002
4003
4004 case URX_LBN_CONT:
4005 {
4006 // Negative Look-Behind, at top of loop checking for matches of LB expression
4007 // at all possible input starting positions.
4008
4009 // Fetch the extra parameters of this op.
4010 int32_t minML = (int32_t)pat[fp->fPatIdx++];
4011 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
4012 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4013 continueLoc = URX_VAL(continueLoc);
4014 U_ASSERT(minML <= maxML);
4015 U_ASSERT(minML >= 0);
4016 U_ASSERT(continueLoc > fp->fPatIdx);
4017
4018 // Fetch (from data) the last input index where a match was attempted.
4019 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4020 int64_t *lbStartIdx = &fData[opValue+2];
4021 if (*lbStartIdx < 0) {
4022 // First time through loop.
4023 *lbStartIdx = fp->fInputIdx - minML;
4024 } else {
4025 // 2nd through nth time through the loop.
4026 // Back up start position for match by one.
4027 if (*lbStartIdx == 0) {
4028 (*lbStartIdx)--;
4029 } else {
4030 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
4031 (void)UTEXT_PREVIOUS32(fInputText);
4032 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4033 }
4034 }
4035
4036 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
4037 // We have tried all potential match starting points without
4038 // getting a match, which means that the negative lookbehind as
4039 // a whole has succeeded. Jump forward to the continue location
4040 int64_t restoreInputLen = fData[opValue+3];
4041 U_ASSERT(restoreInputLen >= fActiveLimit);
4042 U_ASSERT(restoreInputLen <= fInputLength);
4043 fActiveLimit = restoreInputLen;
4044 fp->fPatIdx = continueLoc;
4045 break;
4046 }
4047
4048 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4049 // (successful match will cause a FAIL out of the loop altogether.)
4050 fp = StateSave(fp, fp->fPatIdx-4, status);
4051 fp->fInputIdx = *lbStartIdx;
4052 }
4053 break;
4054
4055 case URX_LBN_END:
4056 // End of a negative look-behind block, after a successful match.
4057 {
4058 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4059 if (fp->fInputIdx != fActiveLimit) {
4060 // The look-behind expression matched, but the match did not
4061 // extend all the way to the point that we are looking behind from.
4062 // FAIL out of here, which will take us back to the LB_CONT, which
4063 // will retry the match starting at another position or succeed
4064 // the look-behind altogether, whichever is appropriate.
4065 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4066 break;
4067 }
4068
4069 // Look-behind expression matched, which means look-behind test as
4070 // a whole Fails
4071
4072 // Restore the orignal input string length, which had been truncated
4073 // inorder to pin the end of the lookbehind match
4074 // to the position being looked-behind.
4075 int64_t originalInputLen = fData[opValue+3];
4076 U_ASSERT(originalInputLen >= fActiveLimit);
4077 U_ASSERT(originalInputLen <= fInputLength);
4078 fActiveLimit = originalInputLen;
4079
4080 // Restore original stack position, discarding any state saved
4081 // by the successful pattern match.
4082 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4083 int32_t newStackSize = (int32_t)fData[opValue];
4084 U_ASSERT(fStack->size() > newStackSize);
4085 fStack->setSize(newStackSize);
4086
4087 // FAIL, which will take control back to someplace
4088 // prior to entering the look-behind test.
4089 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4090 }
4091 break;
4092
4093
4094 case URX_LOOP_SR_I:
4095 // Loop Initialization for the optimized implementation of
4096 // [some character set]*
4097 // This op scans through all matching input.
4098 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4099 {
4100 U_ASSERT(opValue > 0 && opValue < sets->size());
4101 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4102 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4103
4104 // Loop through input, until either the input is exhausted or
4105 // we reach a character that is not a member of the set.
4106 int64_t ix = fp->fInputIdx;
4107 UTEXT_SETNATIVEINDEX(fInputText, ix);
4108 for (;;) {
4109 if (ix >= fActiveLimit) {
4110 fHitEnd = TRUE;
4111 break;
4112 }
4113 UChar32 c = UTEXT_NEXT32(fInputText);
4114 if (c<256) {
4115 if (s8->contains(c) == FALSE) {
4116 break;
4117 }
4118 } else {
4119 if (s->contains(c) == FALSE) {
4120 break;
4121 }
4122 }
4123 ix = UTEXT_GETNATIVEINDEX(fInputText);
4124 }
4125
4126 // If there were no matching characters, skip over the loop altogether.
4127 // The loop doesn't run at all, a * op always succeeds.
4128 if (ix == fp->fInputIdx) {
4129 fp->fPatIdx++; // skip the URX_LOOP_C op.
4130 break;
4131 }
4132
4133 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4134 // must follow. It's operand is the stack location
4135 // that holds the starting input index for the match of this [set]*
4136 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4137 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4138 int32_t stackLoc = URX_VAL(loopcOp);
4139 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4140 fp->fExtra[stackLoc] = fp->fInputIdx;
4141 fp->fInputIdx = ix;
4142
4143 // Save State to the URX_LOOP_C op that follows this one,
4144 // so that match failures in the following code will return to there.
4145 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4146 fp = StateSave(fp, fp->fPatIdx, status);
4147 fp->fPatIdx++;
4148 }
4149 break;
4150
4151
4152 case URX_LOOP_DOT_I:
4153 // Loop Initialization for the optimized implementation of .*
4154 // This op scans through all remaining input.
4155 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4156 {
4157 // Loop through input until the input is exhausted (we reach an end-of-line)
4158 // In DOTALL mode, we can just go straight to the end of the input.
4159 int64_t ix;
4160 if ((opValue & 1) == 1) {
4161 // Dot-matches-All mode. Jump straight to the end of the string.
4162 ix = fActiveLimit;
4163 fHitEnd = TRUE;
4164 } else {
4165 // NOT DOT ALL mode. Line endings do not match '.'
4166 // Scan forward until a line ending or end of input.
4167 ix = fp->fInputIdx;
4168 UTEXT_SETNATIVEINDEX(fInputText, ix);
4169 for (;;) {
4170 if (ix >= fActiveLimit) {
4171 fHitEnd = TRUE;
4172 break;
4173 }
4174 UChar32 c = UTEXT_NEXT32(fInputText);
4175 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4176 if ((c == 0x0a) || // 0x0a is newline in both modes.
4177 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
4178 isLineTerminator(c))) {
4179 // char is a line ending. Exit the scanning loop.
4180 break;
4181 }
4182 }
4183 ix = UTEXT_GETNATIVEINDEX(fInputText);
4184 }
4185 }
4186
4187 // If there were no matching characters, skip over the loop altogether.
4188 // The loop doesn't run at all, a * op always succeeds.
4189 if (ix == fp->fInputIdx) {
4190 fp->fPatIdx++; // skip the URX_LOOP_C op.
4191 break;
4192 }
4193
4194 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4195 // must follow. It's operand is the stack location
4196 // that holds the starting input index for the match of this .*
4197 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4198 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4199 int32_t stackLoc = URX_VAL(loopcOp);
4200 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4201 fp->fExtra[stackLoc] = fp->fInputIdx;
4202 fp->fInputIdx = ix;
4203
4204 // Save State to the URX_LOOP_C op that follows this one,
4205 // so that match failures in the following code will return to there.
4206 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4207 fp = StateSave(fp, fp->fPatIdx, status);
4208 fp->fPatIdx++;
4209 }
4210 break;
4211
4212
4213 case URX_LOOP_C:
4214 {
4215 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4216 backSearchIndex = fp->fExtra[opValue];
4217 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4218 if (backSearchIndex == fp->fInputIdx) {
4219 // We've backed up the input idx to the point that the loop started.
4220 // The loop is done. Leave here without saving state.
4221 // Subsequent failures won't come back here.
4222 break;
4223 }
4224 // Set up for the next iteration of the loop, with input index
4225 // backed up by one from the last time through,
4226 // and a state save to this instruction in case the following code fails again.
4227 // (We're going backwards because this loop emulates stack unwinding, not
4228 // the initial scan forward.)
4229 U_ASSERT(fp->fInputIdx > 0);
4230 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4231 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4232 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4233
4234 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4235 if (prevC == 0x0a &&
4236 fp->fInputIdx > backSearchIndex &&
4237 twoPrevC == 0x0d) {
4238 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4239 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4240 // .*, stepping back over CRLF pair.
4241 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4242 }
4243 }
4244
4245
4246 fp = StateSave(fp, fp->fPatIdx-1, status);
4247 }
4248 break;
4249
4250
4251
4252 default:
4253 // Trouble. The compiled pattern contains an entry with an
4254 // unrecognized type tag.
4255 U_ASSERT(FALSE);
4256 }
4257
4258 if (U_FAILURE(status)) {
4259 isMatch = FALSE;
4260 break;
4261 }
4262 }
4263
4264 breakFromLoop:
4265 fMatch = isMatch;
4266 if (isMatch) {
4267 fLastMatchEnd = fMatchEnd;
4268 fMatchStart = startIdx;
4269 fMatchEnd = fp->fInputIdx;
4270 }
4271
4272 #ifdef REGEX_RUN_DEBUG
4273 if (fTraceDebug) {
4274 if (isMatch) {
4275 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
4276 } else {
4277 printf("No match\n\n");
4278 }
4279 }
4280 #endif
4281
4282 fFrame = fp; // The active stack frame when the engine stopped.
4283 // Contains the capture group results that we need to
4284 // access later.
4285 return;
4286 }
4287
4288
4289 //--------------------------------------------------------------------------------
4290 //
4291 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4292 // assumption that the entire string is available in the UText's
4293 // chunk buffer. For now, that means we can use int32_t indexes,
4294 // except for anything that needs to be saved (like group starts
4295 // and ends).
4296 //
4297 // startIdx: begin matching a this index.
4298 // toEnd: if true, match must extend to end of the input region
4299 //
4300 //--------------------------------------------------------------------------------
MatchChunkAt(int32_t startIdx,UBool toEnd,UErrorCode & status)4301 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4302 UBool isMatch = FALSE; // True if the we have a match.
4303
4304 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4305
4306 int32_t op; // Operation from the compiled pattern, split into
4307 int32_t opType; // the opcode
4308 int32_t opValue; // and the operand value.
4309
4310 #ifdef REGEX_RUN_DEBUG
4311 if (fTraceDebug) {
4312 printf("MatchAt(startIdx=%d)\n", startIdx);
4313 printf("Original Pattern: ");
4314 UChar32 c = utext_next32From(fPattern->fPattern, 0);
4315 while (c != U_SENTINEL) {
4316 if (c<32 || c>256) {
4317 c = '.';
4318 }
4319 printf("%c", c);
4320
4321 c = UTEXT_NEXT32(fPattern->fPattern);
4322 }
4323 printf("\n");
4324 printf("Input String: ");
4325 c = utext_next32From(fInputText, 0);
4326 while (c != U_SENTINEL) {
4327 if (c<32 || c>256) {
4328 c = '.';
4329 }
4330 printf("%c", c);
4331
4332 c = UTEXT_NEXT32(fInputText);
4333 }
4334 printf("\n");
4335 printf("\n");
4336 }
4337 #endif
4338
4339 if (U_FAILURE(status)) {
4340 return;
4341 }
4342
4343 // Cache frequently referenced items from the compiled pattern
4344 //
4345 int64_t *pat = fPattern->fCompiledPat->getBuffer();
4346
4347 const UChar *litText = fPattern->fLiteralText.getBuffer();
4348 UVector *sets = fPattern->fSets;
4349
4350 const UChar *inputBuf = fInputText->chunkContents;
4351
4352 fFrameSize = fPattern->fFrameSize;
4353 REStackFrame *fp = resetStack();
4354 if (U_FAILURE(fDeferredStatus)) {
4355 status = fDeferredStatus;
4356 return;
4357 }
4358
4359 fp->fPatIdx = 0;
4360 fp->fInputIdx = startIdx;
4361
4362 // Zero out the pattern's static data
4363 int32_t i;
4364 for (i = 0; i<fPattern->fDataSize; i++) {
4365 fData[i] = 0;
4366 }
4367
4368 //
4369 // Main loop for interpreting the compiled pattern.
4370 // One iteration of the loop per pattern operation performed.
4371 //
4372 for (;;) {
4373 op = (int32_t)pat[fp->fPatIdx];
4374 opType = URX_TYPE(op);
4375 opValue = URX_VAL(op);
4376 #ifdef REGEX_RUN_DEBUG
4377 if (fTraceDebug) {
4378 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4379 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
4380 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4381 fPattern->dumpOp(fp->fPatIdx);
4382 }
4383 #endif
4384 fp->fPatIdx++;
4385
4386 switch (opType) {
4387
4388
4389 case URX_NOP:
4390 break;
4391
4392
4393 case URX_BACKTRACK:
4394 // Force a backtrack. In some circumstances, the pattern compiler
4395 // will notice that the pattern can't possibly match anything, and will
4396 // emit one of these at that point.
4397 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4398 break;
4399
4400
4401 case URX_ONECHAR:
4402 if (fp->fInputIdx < fActiveLimit) {
4403 UChar32 c;
4404 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4405 if (c == opValue) {
4406 break;
4407 }
4408 } else {
4409 fHitEnd = TRUE;
4410 }
4411 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4412 break;
4413
4414
4415 case URX_STRING:
4416 {
4417 // Test input against a literal string.
4418 // Strings require two slots in the compiled pattern, one for the
4419 // offset to the string text, and one for the length.
4420 int32_t stringStartIdx = opValue;
4421 int32_t stringLen;
4422
4423 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
4424 fp->fPatIdx++;
4425 opType = URX_TYPE(op);
4426 stringLen = URX_VAL(op);
4427 U_ASSERT(opType == URX_STRING_LEN);
4428 U_ASSERT(stringLen >= 2);
4429
4430 const UChar * pInp = inputBuf + fp->fInputIdx;
4431 const UChar * pInpLimit = inputBuf + fActiveLimit;
4432 const UChar * pPat = litText+stringStartIdx;
4433 const UChar * pEnd = pInp + stringLen;
4434 UBool success = TRUE;
4435 while (pInp < pEnd) {
4436 if (pInp >= pInpLimit) {
4437 fHitEnd = TRUE;
4438 success = FALSE;
4439 break;
4440 }
4441 if (*pInp++ != *pPat++) {
4442 success = FALSE;
4443 break;
4444 }
4445 }
4446
4447 if (success) {
4448 fp->fInputIdx += stringLen;
4449 } else {
4450 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4451 }
4452 }
4453 break;
4454
4455
4456 case URX_STATE_SAVE:
4457 fp = StateSave(fp, opValue, status);
4458 break;
4459
4460
4461 case URX_END:
4462 // The match loop will exit via this path on a successful match,
4463 // when we reach the end of the pattern.
4464 if (toEnd && fp->fInputIdx != fActiveLimit) {
4465 // The pattern matched, but not to the end of input. Try some more.
4466 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4467 break;
4468 }
4469 isMatch = TRUE;
4470 goto breakFromLoop;
4471
4472 // Start and End Capture stack frame variables are laid out out like this:
4473 // fp->fExtra[opValue] - The start of a completed capture group
4474 // opValue+1 - The end of a completed capture group
4475 // opValue+2 - the start of a capture group whose end
4476 // has not yet been reached (and might not ever be).
4477 case URX_START_CAPTURE:
4478 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4479 fp->fExtra[opValue+2] = fp->fInputIdx;
4480 break;
4481
4482
4483 case URX_END_CAPTURE:
4484 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4485 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
4486 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
4487 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
4488 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4489 break;
4490
4491
4492 case URX_DOLLAR: // $, test for End of line
4493 // or for position before new line at end of input
4494 if (fp->fInputIdx < fAnchorLimit-2) {
4495 // We are no where near the end of input. Fail.
4496 // This is the common case. Keep it first.
4497 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4498 break;
4499 }
4500 if (fp->fInputIdx >= fAnchorLimit) {
4501 // We really are at the end of input. Success.
4502 fHitEnd = TRUE;
4503 fRequireEnd = TRUE;
4504 break;
4505 }
4506
4507 // If we are positioned just before a new-line that is located at the
4508 // end of input, succeed.
4509 if (fp->fInputIdx == fAnchorLimit-1) {
4510 UChar32 c;
4511 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4512
4513 if (isLineTerminator(c)) {
4514 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4515 // At new-line at end of input. Success
4516 fHitEnd = TRUE;
4517 fRequireEnd = TRUE;
4518 break;
4519 }
4520 }
4521 } else if (fp->fInputIdx == fAnchorLimit-2 &&
4522 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4523 fHitEnd = TRUE;
4524 fRequireEnd = TRUE;
4525 break; // At CR/LF at end of input. Success
4526 }
4527
4528 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4529
4530 break;
4531
4532
4533 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
4534 if (fp->fInputIdx >= fAnchorLimit-1) {
4535 // Either at the last character of input, or off the end.
4536 if (fp->fInputIdx == fAnchorLimit-1) {
4537 // At last char of input. Success if it's a new line.
4538 if (inputBuf[fp->fInputIdx] == 0x0a) {
4539 fHitEnd = TRUE;
4540 fRequireEnd = TRUE;
4541 break;
4542 }
4543 } else {
4544 // Off the end of input. Success.
4545 fHitEnd = TRUE;
4546 fRequireEnd = TRUE;
4547 break;
4548 }
4549 }
4550
4551 // Not at end of input. Back-track out.
4552 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4553 break;
4554
4555
4556 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
4557 {
4558 if (fp->fInputIdx >= fAnchorLimit) {
4559 // We really are at the end of input. Success.
4560 fHitEnd = TRUE;
4561 fRequireEnd = TRUE;
4562 break;
4563 }
4564 // If we are positioned just before a new-line, succeed.
4565 // It makes no difference where the new-line is within the input.
4566 UChar32 c = inputBuf[fp->fInputIdx];
4567 if (isLineTerminator(c)) {
4568 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4569 // In multi-line mode, hitting a new-line just before the end of input does not
4570 // set the hitEnd or requireEnd flags
4571 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4572 break;
4573 }
4574 }
4575 // not at a new line. Fail.
4576 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4577 }
4578 break;
4579
4580
4581 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
4582 {
4583 if (fp->fInputIdx >= fAnchorLimit) {
4584 // We really are at the end of input. Success.
4585 fHitEnd = TRUE;
4586 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
4587 break; // adding a new-line would not lose the match.
4588 }
4589 // If we are not positioned just before a new-line, the test fails; backtrack out.
4590 // It makes no difference where the new-line is within the input.
4591 if (inputBuf[fp->fInputIdx] != 0x0a) {
4592 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4593 }
4594 }
4595 break;
4596
4597
4598 case URX_CARET: // ^, test for start of line
4599 if (fp->fInputIdx != fAnchorStart) {
4600 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4601 }
4602 break;
4603
4604
4605 case URX_CARET_M: // ^, test for start of line in mulit-line mode
4606 {
4607 if (fp->fInputIdx == fAnchorStart) {
4608 // We are at the start input. Success.
4609 break;
4610 }
4611 // Check whether character just before the current pos is a new-line
4612 // unless we are at the end of input
4613 UChar c = inputBuf[fp->fInputIdx - 1];
4614 if ((fp->fInputIdx < fAnchorLimit) &&
4615 isLineTerminator(c)) {
4616 // It's a new-line. ^ is true. Success.
4617 // TODO: what should be done with positions between a CR and LF?
4618 break;
4619 }
4620 // Not at the start of a line. Fail.
4621 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4622 }
4623 break;
4624
4625
4626 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
4627 {
4628 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4629 if (fp->fInputIdx <= fAnchorStart) {
4630 // We are at the start input. Success.
4631 break;
4632 }
4633 // Check whether character just before the current pos is a new-line
4634 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4635 UChar c = inputBuf[fp->fInputIdx - 1];
4636 if (c != 0x0a) {
4637 // Not at the start of a line. Back-track out.
4638 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4639 }
4640 }
4641 break;
4642
4643 case URX_BACKSLASH_B: // Test for word boundaries
4644 {
4645 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4646 success ^= (UBool)(opValue != 0); // flip sense for \B
4647 if (!success) {
4648 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4649 }
4650 }
4651 break;
4652
4653
4654 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
4655 {
4656 UBool success = isUWordBoundary(fp->fInputIdx);
4657 success ^= (UBool)(opValue != 0); // flip sense for \B
4658 if (!success) {
4659 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4660 }
4661 }
4662 break;
4663
4664
4665 case URX_BACKSLASH_D: // Test for decimal digit
4666 {
4667 if (fp->fInputIdx >= fActiveLimit) {
4668 fHitEnd = TRUE;
4669 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4670 break;
4671 }
4672
4673 UChar32 c;
4674 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4675 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
4676 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4677 success ^= (UBool)(opValue != 0); // flip sense for \D
4678 if (!success) {
4679 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4680 }
4681 }
4682 break;
4683
4684
4685 case URX_BACKSLASH_G: // Test for position at end of previous match
4686 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4687 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4688 }
4689 break;
4690
4691
4692 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
4693 {
4694 if (fp->fInputIdx >= fActiveLimit) {
4695 fHitEnd = TRUE;
4696 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4697 break;
4698 }
4699 UChar32 c;
4700 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4701 int8_t ctype = u_charType(c);
4702 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
4703 success ^= (UBool)(opValue != 0); // flip sense for \H
4704 if (!success) {
4705 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4706 }
4707 }
4708 break;
4709
4710
4711 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
4712 {
4713 if (fp->fInputIdx >= fActiveLimit) {
4714 fHitEnd = TRUE;
4715 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4716 break;
4717 }
4718 UChar32 c;
4719 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4720 if (isLineTerminator(c)) {
4721 if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4722 // Check for CR/LF sequence. Consume both together when found.
4723 UChar c2;
4724 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
4725 if (c2 != 0x0a) {
4726 U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
4727 }
4728 }
4729 } else {
4730 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4731 }
4732 }
4733 break;
4734
4735
4736 case URX_BACKSLASH_V: // Any single code point line ending.
4737 {
4738 if (fp->fInputIdx >= fActiveLimit) {
4739 fHitEnd = TRUE;
4740 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4741 break;
4742 }
4743 UChar32 c;
4744 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4745 UBool success = isLineTerminator(c);
4746 success ^= (UBool)(opValue != 0); // flip sense for \V
4747 if (!success) {
4748 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4749 }
4750 }
4751 break;
4752
4753
4754
4755 case URX_BACKSLASH_X:
4756 // Match a Grapheme, as defined by Unicode TR 29.
4757 // Differs slightly from Perl, which consumes combining marks independently
4758 // of context.
4759 {
4760
4761 // Fail if at end of input
4762 if (fp->fInputIdx >= fActiveLimit) {
4763 fHitEnd = TRUE;
4764 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4765 break;
4766 }
4767
4768 // Examine (and consume) the current char.
4769 // Dispatch into a little state machine, based on the char.
4770 UChar32 c;
4771 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4772 UnicodeSet **sets = fPattern->fStaticSets;
4773 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
4774 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4775 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4776 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4777 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4778 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4779 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4780 goto GC_Extend;
4781
4782
4783
4784 GC_L:
4785 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4786 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4787 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4788 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4789 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4790 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4791 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4792 goto GC_Extend;
4793
4794 GC_V:
4795 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4796 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4797 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4798 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4799 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4800 goto GC_Extend;
4801
4802 GC_T:
4803 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4804 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4805 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4806 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4807 goto GC_Extend;
4808
4809 GC_Extend:
4810 // Combining characters are consumed here
4811 for (;;) {
4812 if (fp->fInputIdx >= fActiveLimit) {
4813 break;
4814 }
4815 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4816 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4817 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4818 break;
4819 }
4820 }
4821 goto GC_Done;
4822
4823 GC_Control:
4824 // Most control chars stand alone (don't combine with combining chars),
4825 // except for that CR/LF sequence is a single grapheme cluster.
4826 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4827 fp->fInputIdx++;
4828 }
4829
4830 GC_Done:
4831 if (fp->fInputIdx >= fActiveLimit) {
4832 fHitEnd = TRUE;
4833 }
4834 break;
4835 }
4836
4837
4838
4839
4840 case URX_BACKSLASH_Z: // Test for end of Input
4841 if (fp->fInputIdx < fAnchorLimit) {
4842 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4843 } else {
4844 fHitEnd = TRUE;
4845 fRequireEnd = TRUE;
4846 }
4847 break;
4848
4849
4850
4851 case URX_STATIC_SETREF:
4852 {
4853 // Test input character against one of the predefined sets
4854 // (Word Characters, for example)
4855 // The high bit of the op value is a flag for the match polarity.
4856 // 0: success if input char is in set.
4857 // 1: success if input char is not in set.
4858 if (fp->fInputIdx >= fActiveLimit) {
4859 fHitEnd = TRUE;
4860 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4861 break;
4862 }
4863
4864 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4865 opValue &= ~URX_NEG_SET;
4866 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4867
4868 UChar32 c;
4869 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4870 if (c < 256) {
4871 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4872 if (s8->contains(c)) {
4873 success = !success;
4874 }
4875 } else {
4876 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4877 if (s->contains(c)) {
4878 success = !success;
4879 }
4880 }
4881 if (!success) {
4882 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4883 }
4884 }
4885 break;
4886
4887
4888 case URX_STAT_SETREF_N:
4889 {
4890 // Test input character for NOT being a member of one of
4891 // the predefined sets (Word Characters, for example)
4892 if (fp->fInputIdx >= fActiveLimit) {
4893 fHitEnd = TRUE;
4894 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4895 break;
4896 }
4897
4898 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4899
4900 UChar32 c;
4901 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4902 if (c < 256) {
4903 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4904 if (s8->contains(c) == FALSE) {
4905 break;
4906 }
4907 } else {
4908 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4909 if (s->contains(c) == FALSE) {
4910 break;
4911 }
4912 }
4913 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4914 }
4915 break;
4916
4917
4918 case URX_SETREF:
4919 {
4920 if (fp->fInputIdx >= fActiveLimit) {
4921 fHitEnd = TRUE;
4922 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4923 break;
4924 }
4925
4926 U_ASSERT(opValue > 0 && opValue < sets->size());
4927
4928 // There is input left. Pick up one char and test it for set membership.
4929 UChar32 c;
4930 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4931 if (c<256) {
4932 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4933 if (s8->contains(c)) {
4934 // The character is in the set. A Match.
4935 break;
4936 }
4937 } else {
4938 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4939 if (s->contains(c)) {
4940 // The character is in the set. A Match.
4941 break;
4942 }
4943 }
4944
4945 // the character wasn't in the set.
4946 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4947 }
4948 break;
4949
4950
4951 case URX_DOTANY:
4952 {
4953 // . matches anything, but stops at end-of-line.
4954 if (fp->fInputIdx >= fActiveLimit) {
4955 // At end of input. Match failed. Backtrack out.
4956 fHitEnd = TRUE;
4957 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4958 break;
4959 }
4960
4961 // There is input left. Advance over one char, unless we've hit end-of-line
4962 UChar32 c;
4963 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4964 if (isLineTerminator(c)) {
4965 // End of line in normal mode. . does not match.
4966 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4967 break;
4968 }
4969 }
4970 break;
4971
4972
4973 case URX_DOTANY_ALL:
4974 {
4975 // . in dot-matches-all (including new lines) mode
4976 if (fp->fInputIdx >= fActiveLimit) {
4977 // At end of input. Match failed. Backtrack out.
4978 fHitEnd = TRUE;
4979 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4980 break;
4981 }
4982
4983 // There is input left. Advance over one char, except if we are
4984 // at a cr/lf, advance over both of them.
4985 UChar32 c;
4986 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4987 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4988 // In the case of a CR/LF, we need to advance over both.
4989 if (inputBuf[fp->fInputIdx] == 0x0a) {
4990 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
4991 }
4992 }
4993 }
4994 break;
4995
4996
4997 case URX_DOTANY_UNIX:
4998 {
4999 // '.' operator, matches all, but stops at end-of-line.
5000 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
5001 if (fp->fInputIdx >= fActiveLimit) {
5002 // At end of input. Match failed. Backtrack out.
5003 fHitEnd = TRUE;
5004 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5005 break;
5006 }
5007
5008 // There is input left. Advance over one char, unless we've hit end-of-line
5009 UChar32 c;
5010 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5011 if (c == 0x0a) {
5012 // End of line in normal mode. '.' does not match the \n
5013 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5014 }
5015 }
5016 break;
5017
5018
5019 case URX_JMP:
5020 fp->fPatIdx = opValue;
5021 break;
5022
5023 case URX_FAIL:
5024 isMatch = FALSE;
5025 goto breakFromLoop;
5026
5027 case URX_JMP_SAV:
5028 U_ASSERT(opValue < fPattern->fCompiledPat->size());
5029 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
5030 fp->fPatIdx = opValue; // Then JMP.
5031 break;
5032
5033 case URX_JMP_SAV_X:
5034 // This opcode is used with (x)+, when x can match a zero length string.
5035 // Same as JMP_SAV, except conditional on the match having made forward progress.
5036 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5037 // data address of the input position at the start of the loop.
5038 {
5039 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
5040 int32_t stoOp = (int32_t)pat[opValue-1];
5041 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
5042 int32_t frameLoc = URX_VAL(stoOp);
5043 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
5044 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
5045 U_ASSERT(prevInputIdx <= fp->fInputIdx);
5046 if (prevInputIdx < fp->fInputIdx) {
5047 // The match did make progress. Repeat the loop.
5048 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
5049 fp->fPatIdx = opValue;
5050 fp->fExtra[frameLoc] = fp->fInputIdx;
5051 }
5052 // If the input position did not advance, we do nothing here,
5053 // execution will fall out of the loop.
5054 }
5055 break;
5056
5057 case URX_CTR_INIT:
5058 {
5059 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5060 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5061
5062 // Pick up the three extra operands that CTR_INIT has, and
5063 // skip the pattern location counter past
5064 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5065 fp->fPatIdx += 3;
5066 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5067 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5068 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5069 U_ASSERT(minCount>=0);
5070 U_ASSERT(maxCount>=minCount || maxCount==-1);
5071 U_ASSERT(loopLoc>=fp->fPatIdx);
5072
5073 if (minCount == 0) {
5074 fp = StateSave(fp, loopLoc+1, status);
5075 }
5076 if (maxCount == -1) {
5077 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
5078 } else if (maxCount == 0) {
5079 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5080 }
5081 }
5082 break;
5083
5084 case URX_CTR_LOOP:
5085 {
5086 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5087 int32_t initOp = (int32_t)pat[opValue];
5088 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
5089 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5090 int32_t minCount = (int32_t)pat[opValue+2];
5091 int32_t maxCount = (int32_t)pat[opValue+3];
5092 (*pCounter)++;
5093 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5094 U_ASSERT(*pCounter == maxCount);
5095 break;
5096 }
5097 if (*pCounter >= minCount) {
5098 if (maxCount == -1) {
5099 // Loop has no hard upper bound.
5100 // Check that it is progressing through the input, break if it is not.
5101 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5102 if (fp->fInputIdx == *pLastInputIdx) {
5103 break;
5104 } else {
5105 *pLastInputIdx = fp->fInputIdx;
5106 }
5107 }
5108 fp = StateSave(fp, fp->fPatIdx, status);
5109 }
5110 fp->fPatIdx = opValue + 4; // Loop back.
5111 }
5112 break;
5113
5114 case URX_CTR_INIT_NG:
5115 {
5116 // Initialize a non-greedy loop
5117 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5118 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5119
5120 // Pick up the three extra operands that CTR_INIT_NG has, and
5121 // skip the pattern location counter past
5122 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5123 fp->fPatIdx += 3;
5124 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5125 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5126 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5127 U_ASSERT(minCount>=0);
5128 U_ASSERT(maxCount>=minCount || maxCount==-1);
5129 U_ASSERT(loopLoc>fp->fPatIdx);
5130 if (maxCount == -1) {
5131 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
5132 }
5133
5134 if (minCount == 0) {
5135 if (maxCount != 0) {
5136 fp = StateSave(fp, fp->fPatIdx, status);
5137 }
5138 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
5139 }
5140 }
5141 break;
5142
5143 case URX_CTR_LOOP_NG:
5144 {
5145 // Non-greedy {min, max} loops
5146 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5147 int32_t initOp = (int32_t)pat[opValue];
5148 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5149 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5150 int32_t minCount = (int32_t)pat[opValue+2];
5151 int32_t maxCount = (int32_t)pat[opValue+3];
5152
5153 (*pCounter)++;
5154 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5155 // The loop has matched the maximum permitted number of times.
5156 // Break out of here with no action. Matching will
5157 // continue with the following pattern.
5158 U_ASSERT(*pCounter == maxCount);
5159 break;
5160 }
5161
5162 if (*pCounter < minCount) {
5163 // We haven't met the minimum number of matches yet.
5164 // Loop back for another one.
5165 fp->fPatIdx = opValue + 4; // Loop back.
5166 } else {
5167 // We do have the minimum number of matches.
5168
5169 // If there is no upper bound on the loop iterations, check that the input index
5170 // is progressing, and stop the loop if it is not.
5171 if (maxCount == -1) {
5172 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5173 if (fp->fInputIdx == *pLastInputIdx) {
5174 break;
5175 }
5176 *pLastInputIdx = fp->fInputIdx;
5177 }
5178
5179 // Loop Continuation: we will fall into the pattern following the loop
5180 // (non-greedy, don't execute loop body first), but first do
5181 // a state save to the top of the loop, so that a match failure
5182 // in the following pattern will try another iteration of the loop.
5183 fp = StateSave(fp, opValue + 4, status);
5184 }
5185 }
5186 break;
5187
5188 case URX_STO_SP:
5189 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5190 fData[opValue] = fStack->size();
5191 break;
5192
5193 case URX_LD_SP:
5194 {
5195 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5196 int32_t newStackSize = (int32_t)fData[opValue];
5197 U_ASSERT(newStackSize <= fStack->size());
5198 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5199 if (newFP == (int64_t *)fp) {
5200 break;
5201 }
5202 int32_t i;
5203 for (i=0; i<fFrameSize; i++) {
5204 newFP[i] = ((int64_t *)fp)[i];
5205 }
5206 fp = (REStackFrame *)newFP;
5207 fStack->setSize(newStackSize);
5208 }
5209 break;
5210
5211 case URX_BACKREF:
5212 {
5213 U_ASSERT(opValue < fFrameSize);
5214 int64_t groupStartIdx = fp->fExtra[opValue];
5215 int64_t groupEndIdx = fp->fExtra[opValue+1];
5216 U_ASSERT(groupStartIdx <= groupEndIdx);
5217 int64_t inputIndex = fp->fInputIdx;
5218 if (groupStartIdx < 0) {
5219 // This capture group has not participated in the match thus far,
5220 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5221 break;
5222 }
5223 UBool success = TRUE;
5224 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5225 if (inputIndex >= fActiveLimit) {
5226 success = FALSE;
5227 fHitEnd = TRUE;
5228 break;
5229 }
5230 if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5231 success = FALSE;
5232 break;
5233 }
5234 }
5235 if (success) {
5236 fp->fInputIdx = inputIndex;
5237 } else {
5238 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5239 }
5240 }
5241 break;
5242
5243 case URX_BACKREF_I:
5244 {
5245 U_ASSERT(opValue < fFrameSize);
5246 int64_t groupStartIdx = fp->fExtra[opValue];
5247 int64_t groupEndIdx = fp->fExtra[opValue+1];
5248 U_ASSERT(groupStartIdx <= groupEndIdx);
5249 if (groupStartIdx < 0) {
5250 // This capture group has not participated in the match thus far,
5251 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5252 break;
5253 }
5254 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5255 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5256
5257 // Note: if the capture group match was of an empty string the backref
5258 // match succeeds. Verified by testing: Perl matches succeed
5259 // in this case, so we do too.
5260
5261 UBool success = TRUE;
5262 for (;;) {
5263 UChar32 captureGroupChar = captureGroupItr.next();
5264 if (captureGroupChar == U_SENTINEL) {
5265 success = TRUE;
5266 break;
5267 }
5268 UChar32 inputChar = inputItr.next();
5269 if (inputChar == U_SENTINEL) {
5270 success = FALSE;
5271 fHitEnd = TRUE;
5272 break;
5273 }
5274 if (inputChar != captureGroupChar) {
5275 success = FALSE;
5276 break;
5277 }
5278 }
5279
5280 if (success && inputItr.inExpansion()) {
5281 // We otained a match by consuming part of a string obtained from
5282 // case-folding a single code point of the input text.
5283 // This does not count as an overall match.
5284 success = FALSE;
5285 }
5286
5287 if (success) {
5288 fp->fInputIdx = inputItr.getIndex();
5289 } else {
5290 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5291 }
5292 }
5293 break;
5294
5295 case URX_STO_INP_LOC:
5296 {
5297 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5298 fp->fExtra[opValue] = fp->fInputIdx;
5299 }
5300 break;
5301
5302 case URX_JMPX:
5303 {
5304 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5305 fp->fPatIdx += 1;
5306 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
5307 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5308 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5309 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5310 if (savedInputIdx < fp->fInputIdx) {
5311 fp->fPatIdx = opValue; // JMP
5312 } else {
5313 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
5314 }
5315 }
5316 break;
5317
5318 case URX_LA_START:
5319 {
5320 // Entering a lookahead block.
5321 // Save Stack Ptr, Input Pos.
5322 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5323 fData[opValue] = fStack->size();
5324 fData[opValue+1] = fp->fInputIdx;
5325 fActiveStart = fLookStart; // Set the match region change for
5326 fActiveLimit = fLookLimit; // transparent bounds.
5327 }
5328 break;
5329
5330 case URX_LA_END:
5331 {
5332 // Leaving a look-ahead block.
5333 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5334 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5335 int32_t stackSize = fStack->size();
5336 int32_t newStackSize = (int32_t)fData[opValue];
5337 U_ASSERT(stackSize >= newStackSize);
5338 if (stackSize > newStackSize) {
5339 // Copy the current top frame back to the new (cut back) top frame.
5340 // This makes the capture groups from within the look-ahead
5341 // expression available.
5342 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5343 int32_t i;
5344 for (i=0; i<fFrameSize; i++) {
5345 newFP[i] = ((int64_t *)fp)[i];
5346 }
5347 fp = (REStackFrame *)newFP;
5348 fStack->setSize(newStackSize);
5349 }
5350 fp->fInputIdx = fData[opValue+1];
5351
5352 // Restore the active region bounds in the input string; they may have
5353 // been changed because of transparent bounds on a Region.
5354 fActiveStart = fRegionStart;
5355 fActiveLimit = fRegionLimit;
5356 }
5357 break;
5358
5359 case URX_ONECHAR_I:
5360 if (fp->fInputIdx < fActiveLimit) {
5361 UChar32 c;
5362 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5363 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5364 break;
5365 }
5366 } else {
5367 fHitEnd = TRUE;
5368 }
5369 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5370 break;
5371
5372 case URX_STRING_I:
5373 // Case-insensitive test input against a literal string.
5374 // Strings require two slots in the compiled pattern, one for the
5375 // offset to the string text, and one for the length.
5376 // The compiled string has already been case folded.
5377 {
5378 const UChar *patternString = litText + opValue;
5379
5380 op = (int32_t)pat[fp->fPatIdx];
5381 fp->fPatIdx++;
5382 opType = URX_TYPE(op);
5383 opValue = URX_VAL(op);
5384 U_ASSERT(opType == URX_STRING_LEN);
5385 int32_t patternStringLen = opValue; // Length of the string from the pattern.
5386
5387 UChar32 cText;
5388 UChar32 cPattern;
5389 UBool success = TRUE;
5390 int32_t patternStringIdx = 0;
5391 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5392 while (patternStringIdx < patternStringLen) {
5393 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5394 cText = inputIterator.next();
5395 if (cText != cPattern) {
5396 success = FALSE;
5397 if (cText == U_SENTINEL) {
5398 fHitEnd = TRUE;
5399 }
5400 break;
5401 }
5402 }
5403 if (inputIterator.inExpansion()) {
5404 success = FALSE;
5405 }
5406
5407 if (success) {
5408 fp->fInputIdx = inputIterator.getIndex();
5409 } else {
5410 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5411 }
5412 }
5413 break;
5414
5415 case URX_LB_START:
5416 {
5417 // Entering a look-behind block.
5418 // Save Stack Ptr, Input Pos.
5419 // TODO: implement transparent bounds. Ticket #6067
5420 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5421 fData[opValue] = fStack->size();
5422 fData[opValue+1] = fp->fInputIdx;
5423 // Init the variable containing the start index for attempted matches.
5424 fData[opValue+2] = -1;
5425 // Save input string length, then reset to pin any matches to end at
5426 // the current position.
5427 fData[opValue+3] = fActiveLimit;
5428 fActiveLimit = fp->fInputIdx;
5429 }
5430 break;
5431
5432
5433 case URX_LB_CONT:
5434 {
5435 // Positive Look-Behind, at top of loop checking for matches of LB expression
5436 // at all possible input starting positions.
5437
5438 // Fetch the min and max possible match lengths. They are the operands
5439 // of this op in the pattern.
5440 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5441 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5442 U_ASSERT(minML <= maxML);
5443 U_ASSERT(minML >= 0);
5444
5445 // Fetch (from data) the last input index where a match was attempted.
5446 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5447 int64_t *lbStartIdx = &fData[opValue+2];
5448 if (*lbStartIdx < 0) {
5449 // First time through loop.
5450 *lbStartIdx = fp->fInputIdx - minML;
5451 } else {
5452 // 2nd through nth time through the loop.
5453 // Back up start position for match by one.
5454 if (*lbStartIdx == 0) {
5455 (*lbStartIdx)--;
5456 } else {
5457 U16_BACK_1(inputBuf, 0, *lbStartIdx);
5458 }
5459 }
5460
5461 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5462 // We have tried all potential match starting points without
5463 // getting a match. Backtrack out, and out of the
5464 // Look Behind altogether.
5465 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5466 int64_t restoreInputLen = fData[opValue+3];
5467 U_ASSERT(restoreInputLen >= fActiveLimit);
5468 U_ASSERT(restoreInputLen <= fInputLength);
5469 fActiveLimit = restoreInputLen;
5470 break;
5471 }
5472
5473 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5474 // (successful match will fall off the end of the loop.)
5475 fp = StateSave(fp, fp->fPatIdx-3, status);
5476 fp->fInputIdx = *lbStartIdx;
5477 }
5478 break;
5479
5480 case URX_LB_END:
5481 // End of a look-behind block, after a successful match.
5482 {
5483 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5484 if (fp->fInputIdx != fActiveLimit) {
5485 // The look-behind expression matched, but the match did not
5486 // extend all the way to the point that we are looking behind from.
5487 // FAIL out of here, which will take us back to the LB_CONT, which
5488 // will retry the match starting at another position or fail
5489 // the look-behind altogether, whichever is appropriate.
5490 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5491 break;
5492 }
5493
5494 // Look-behind match is good. Restore the orignal input string length,
5495 // which had been truncated to pin the end of the lookbehind match to the
5496 // position being looked-behind.
5497 int64_t originalInputLen = fData[opValue+3];
5498 U_ASSERT(originalInputLen >= fActiveLimit);
5499 U_ASSERT(originalInputLen <= fInputLength);
5500 fActiveLimit = originalInputLen;
5501 }
5502 break;
5503
5504
5505 case URX_LBN_CONT:
5506 {
5507 // Negative Look-Behind, at top of loop checking for matches of LB expression
5508 // at all possible input starting positions.
5509
5510 // Fetch the extra parameters of this op.
5511 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5512 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5513 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5514 continueLoc = URX_VAL(continueLoc);
5515 U_ASSERT(minML <= maxML);
5516 U_ASSERT(minML >= 0);
5517 U_ASSERT(continueLoc > fp->fPatIdx);
5518
5519 // Fetch (from data) the last input index where a match was attempted.
5520 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5521 int64_t *lbStartIdx = &fData[opValue+2];
5522 if (*lbStartIdx < 0) {
5523 // First time through loop.
5524 *lbStartIdx = fp->fInputIdx - minML;
5525 } else {
5526 // 2nd through nth time through the loop.
5527 // Back up start position for match by one.
5528 if (*lbStartIdx == 0) {
5529 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0.
5530 } else {
5531 U16_BACK_1(inputBuf, 0, *lbStartIdx);
5532 }
5533 }
5534
5535 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5536 // We have tried all potential match starting points without
5537 // getting a match, which means that the negative lookbehind as
5538 // a whole has succeeded. Jump forward to the continue location
5539 int64_t restoreInputLen = fData[opValue+3];
5540 U_ASSERT(restoreInputLen >= fActiveLimit);
5541 U_ASSERT(restoreInputLen <= fInputLength);
5542 fActiveLimit = restoreInputLen;
5543 fp->fPatIdx = continueLoc;
5544 break;
5545 }
5546
5547 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5548 // (successful match will cause a FAIL out of the loop altogether.)
5549 fp = StateSave(fp, fp->fPatIdx-4, status);
5550 fp->fInputIdx = *lbStartIdx;
5551 }
5552 break;
5553
5554 case URX_LBN_END:
5555 // End of a negative look-behind block, after a successful match.
5556 {
5557 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5558 if (fp->fInputIdx != fActiveLimit) {
5559 // The look-behind expression matched, but the match did not
5560 // extend all the way to the point that we are looking behind from.
5561 // FAIL out of here, which will take us back to the LB_CONT, which
5562 // will retry the match starting at another position or succeed
5563 // the look-behind altogether, whichever is appropriate.
5564 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5565 break;
5566 }
5567
5568 // Look-behind expression matched, which means look-behind test as
5569 // a whole Fails
5570
5571 // Restore the orignal input string length, which had been truncated
5572 // inorder to pin the end of the lookbehind match
5573 // to the position being looked-behind.
5574 int64_t originalInputLen = fData[opValue+3];
5575 U_ASSERT(originalInputLen >= fActiveLimit);
5576 U_ASSERT(originalInputLen <= fInputLength);
5577 fActiveLimit = originalInputLen;
5578
5579 // Restore original stack position, discarding any state saved
5580 // by the successful pattern match.
5581 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5582 int32_t newStackSize = (int32_t)fData[opValue];
5583 U_ASSERT(fStack->size() > newStackSize);
5584 fStack->setSize(newStackSize);
5585
5586 // FAIL, which will take control back to someplace
5587 // prior to entering the look-behind test.
5588 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5589 }
5590 break;
5591
5592
5593 case URX_LOOP_SR_I:
5594 // Loop Initialization for the optimized implementation of
5595 // [some character set]*
5596 // This op scans through all matching input.
5597 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5598 {
5599 U_ASSERT(opValue > 0 && opValue < sets->size());
5600 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5601 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5602
5603 // Loop through input, until either the input is exhausted or
5604 // we reach a character that is not a member of the set.
5605 int32_t ix = (int32_t)fp->fInputIdx;
5606 for (;;) {
5607 if (ix >= fActiveLimit) {
5608 fHitEnd = TRUE;
5609 break;
5610 }
5611 UChar32 c;
5612 U16_NEXT(inputBuf, ix, fActiveLimit, c);
5613 if (c<256) {
5614 if (s8->contains(c) == FALSE) {
5615 U16_BACK_1(inputBuf, 0, ix);
5616 break;
5617 }
5618 } else {
5619 if (s->contains(c) == FALSE) {
5620 U16_BACK_1(inputBuf, 0, ix);
5621 break;
5622 }
5623 }
5624 }
5625
5626 // If there were no matching characters, skip over the loop altogether.
5627 // The loop doesn't run at all, a * op always succeeds.
5628 if (ix == fp->fInputIdx) {
5629 fp->fPatIdx++; // skip the URX_LOOP_C op.
5630 break;
5631 }
5632
5633 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5634 // must follow. It's operand is the stack location
5635 // that holds the starting input index for the match of this [set]*
5636 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5637 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5638 int32_t stackLoc = URX_VAL(loopcOp);
5639 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5640 fp->fExtra[stackLoc] = fp->fInputIdx;
5641 fp->fInputIdx = ix;
5642
5643 // Save State to the URX_LOOP_C op that follows this one,
5644 // so that match failures in the following code will return to there.
5645 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5646 fp = StateSave(fp, fp->fPatIdx, status);
5647 fp->fPatIdx++;
5648 }
5649 break;
5650
5651
5652 case URX_LOOP_DOT_I:
5653 // Loop Initialization for the optimized implementation of .*
5654 // This op scans through all remaining input.
5655 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5656 {
5657 // Loop through input until the input is exhausted (we reach an end-of-line)
5658 // In DOTALL mode, we can just go straight to the end of the input.
5659 int32_t ix;
5660 if ((opValue & 1) == 1) {
5661 // Dot-matches-All mode. Jump straight to the end of the string.
5662 ix = (int32_t)fActiveLimit;
5663 fHitEnd = TRUE;
5664 } else {
5665 // NOT DOT ALL mode. Line endings do not match '.'
5666 // Scan forward until a line ending or end of input.
5667 ix = (int32_t)fp->fInputIdx;
5668 for (;;) {
5669 if (ix >= fActiveLimit) {
5670 fHitEnd = TRUE;
5671 break;
5672 }
5673 UChar32 c;
5674 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
5675 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5676 if ((c == 0x0a) || // 0x0a is newline in both modes.
5677 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
5678 isLineTerminator(c))) {
5679 // char is a line ending. Put the input pos back to the
5680 // line ending char, and exit the scanning loop.
5681 U16_BACK_1(inputBuf, 0, ix);
5682 break;
5683 }
5684 }
5685 }
5686 }
5687
5688 // If there were no matching characters, skip over the loop altogether.
5689 // The loop doesn't run at all, a * op always succeeds.
5690 if (ix == fp->fInputIdx) {
5691 fp->fPatIdx++; // skip the URX_LOOP_C op.
5692 break;
5693 }
5694
5695 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5696 // must follow. It's operand is the stack location
5697 // that holds the starting input index for the match of this .*
5698 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5699 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5700 int32_t stackLoc = URX_VAL(loopcOp);
5701 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5702 fp->fExtra[stackLoc] = fp->fInputIdx;
5703 fp->fInputIdx = ix;
5704
5705 // Save State to the URX_LOOP_C op that follows this one,
5706 // so that match failures in the following code will return to there.
5707 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5708 fp = StateSave(fp, fp->fPatIdx, status);
5709 fp->fPatIdx++;
5710 }
5711 break;
5712
5713
5714 case URX_LOOP_C:
5715 {
5716 U_ASSERT(opValue>=0 && opValue<fFrameSize);
5717 backSearchIndex = (int32_t)fp->fExtra[opValue];
5718 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5719 if (backSearchIndex == fp->fInputIdx) {
5720 // We've backed up the input idx to the point that the loop started.
5721 // The loop is done. Leave here without saving state.
5722 // Subsequent failures won't come back here.
5723 break;
5724 }
5725 // Set up for the next iteration of the loop, with input index
5726 // backed up by one from the last time through,
5727 // and a state save to this instruction in case the following code fails again.
5728 // (We're going backwards because this loop emulates stack unwinding, not
5729 // the initial scan forward.)
5730 U_ASSERT(fp->fInputIdx > 0);
5731 UChar32 prevC;
5732 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5733
5734 if (prevC == 0x0a &&
5735 fp->fInputIdx > backSearchIndex &&
5736 inputBuf[fp->fInputIdx-1] == 0x0d) {
5737 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5738 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5739 // .*, stepping back over CRLF pair.
5740 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5741 }
5742 }
5743
5744
5745 fp = StateSave(fp, fp->fPatIdx-1, status);
5746 }
5747 break;
5748
5749
5750
5751 default:
5752 // Trouble. The compiled pattern contains an entry with an
5753 // unrecognized type tag.
5754 U_ASSERT(FALSE);
5755 }
5756
5757 if (U_FAILURE(status)) {
5758 isMatch = FALSE;
5759 break;
5760 }
5761 }
5762
5763 breakFromLoop:
5764 fMatch = isMatch;
5765 if (isMatch) {
5766 fLastMatchEnd = fMatchEnd;
5767 fMatchStart = startIdx;
5768 fMatchEnd = fp->fInputIdx;
5769 }
5770
5771 #ifdef REGEX_RUN_DEBUG
5772 if (fTraceDebug) {
5773 if (isMatch) {
5774 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
5775 } else {
5776 printf("No match\n\n");
5777 }
5778 }
5779 #endif
5780
5781 fFrame = fp; // The active stack frame when the engine stopped.
5782 // Contains the capture group results that we need to
5783 // access later.
5784
5785 return;
5786 }
5787
5788
5789 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5790
5791 U_NAMESPACE_END
5792
5793 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
5794