1 /*
2 **************************************************************************
3 *   Copyright (C) 2002-2015 International Business Machines Corporation  *
4 *   and others. All rights reserved.                                     *
5 **************************************************************************
6 */
7 //
8 //  file:  rematch.cpp
9 //
10 //         Contains the implementation of class RegexMatcher,
11 //         which is one of the main API classes for the ICU regular expression package.
12 //
13 
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16 
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
22 #include "unicode/utf.h"
23 #include "unicode/utf16.h"
24 #include "uassert.h"
25 #include "cmemory.h"
26 #include "uvector.h"
27 #include "uvectr32.h"
28 #include "uvectr64.h"
29 #include "regeximp.h"
30 #include "regexst.h"
31 #include "regextxt.h"
32 #include "ucase.h"
33 
34 // #include <malloc.h>        // Needed for heapcheck testing
35 
36 U_NAMESPACE_BEGIN
37 
38 // Default limit for the size of the back track stack, to avoid system
39 //    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
40 // This value puts ICU's limits higher than most other regexp implementations,
41 //    which use recursion rather than the heap, and take more storage per
42 //    backtrack point.
43 //
44 static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
45 
46 // Time limit counter constant.
47 //   Time limits for expression evaluation are in terms of quanta of work by
48 //   the engine, each of which is 10,000 state saves.
49 //   This constant determines that state saves per tick number.
50 static const int32_t TIMER_INITIAL_VALUE = 10000;
51 
52 
53 // Test for any of the Unicode line terminating characters.
isLineTerminator(UChar32 c)54 static inline UBool isLineTerminator(UChar32 c) {
55     if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
56         return false;
57     }
58     return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
59 }
60 
61 //-----------------------------------------------------------------------------
62 //
63 //   Constructor and Destructor
64 //
65 //-----------------------------------------------------------------------------
RegexMatcher(const RegexPattern * pat)66 RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
67     fDeferredStatus = U_ZERO_ERROR;
68     init(fDeferredStatus);
69     if (U_FAILURE(fDeferredStatus)) {
70         return;
71     }
72     if (pat==NULL) {
73         fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
74         return;
75     }
76     fPattern = pat;
77     init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
78 }
79 
80 
81 
RegexMatcher(const UnicodeString & regexp,const UnicodeString & input,uint32_t flags,UErrorCode & status)82 RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
83                            uint32_t flags, UErrorCode &status) {
84     init(status);
85     if (U_FAILURE(status)) {
86         return;
87     }
88     UParseError    pe;
89     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
90     fPattern           = fPatternOwned;
91 
92     UText inputText = UTEXT_INITIALIZER;
93     utext_openConstUnicodeString(&inputText, &input, &status);
94     init2(&inputText, status);
95     utext_close(&inputText);
96 
97     fInputUniStrMaybeMutable = TRUE;
98 }
99 
100 
RegexMatcher(UText * regexp,UText * input,uint32_t flags,UErrorCode & status)101 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
102                            uint32_t flags, UErrorCode &status) {
103     init(status);
104     if (U_FAILURE(status)) {
105         return;
106     }
107     UParseError    pe;
108     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
109     if (U_FAILURE(status)) {
110         return;
111     }
112 
113     fPattern           = fPatternOwned;
114     init2(input, status);
115 }
116 
117 
RegexMatcher(const UnicodeString & regexp,uint32_t flags,UErrorCode & status)118 RegexMatcher::RegexMatcher(const UnicodeString &regexp,
119                            uint32_t flags, UErrorCode &status) {
120     init(status);
121     if (U_FAILURE(status)) {
122         return;
123     }
124     UParseError    pe;
125     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
126     if (U_FAILURE(status)) {
127         return;
128     }
129     fPattern           = fPatternOwned;
130     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
131 }
132 
RegexMatcher(UText * regexp,uint32_t flags,UErrorCode & status)133 RegexMatcher::RegexMatcher(UText *regexp,
134                            uint32_t flags, UErrorCode &status) {
135     init(status);
136     if (U_FAILURE(status)) {
137         return;
138     }
139     UParseError    pe;
140     fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
141         if (U_FAILURE(status)) {
142         return;
143     }
144 
145     fPattern           = fPatternOwned;
146     init2(RegexStaticSets::gStaticSets->fEmptyText, status);
147 }
148 
149 
150 
151 
~RegexMatcher()152 RegexMatcher::~RegexMatcher() {
153     delete fStack;
154     if (fData != fSmallData) {
155         uprv_free(fData);
156         fData = NULL;
157     }
158     if (fPatternOwned) {
159         delete fPatternOwned;
160         fPatternOwned = NULL;
161         fPattern = NULL;
162     }
163 
164     if (fInput) {
165         delete fInput;
166     }
167     if (fInputText) {
168         utext_close(fInputText);
169     }
170     if (fAltInputText) {
171         utext_close(fAltInputText);
172     }
173 
174     #if UCONFIG_NO_BREAK_ITERATION==0
175     delete fWordBreakItr;
176     #endif
177 }
178 
179 //
180 //   init()   common initialization for use by all constructors.
181 //            Initialize all fields, get the object into a consistent state.
182 //            This must be done even when the initial status shows an error,
183 //            so that the object is initialized sufficiently well for the destructor
184 //            to run safely.
185 //
init(UErrorCode & status)186 void RegexMatcher::init(UErrorCode &status) {
187     fPattern           = NULL;
188     fPatternOwned      = NULL;
189     fFrameSize         = 0;
190     fRegionStart       = 0;
191     fRegionLimit       = 0;
192     fAnchorStart       = 0;
193     fAnchorLimit       = 0;
194     fLookStart         = 0;
195     fLookLimit         = 0;
196     fActiveStart       = 0;
197     fActiveLimit       = 0;
198     fTransparentBounds = FALSE;
199     fAnchoringBounds   = TRUE;
200     fMatch             = FALSE;
201     fMatchStart        = 0;
202     fMatchEnd          = 0;
203     fLastMatchEnd      = -1;
204     fAppendPosition    = 0;
205     fHitEnd            = FALSE;
206     fRequireEnd        = FALSE;
207     fStack             = NULL;
208     fFrame             = NULL;
209     fTimeLimit         = 0;
210     fTime              = 0;
211     fTickCounter       = 0;
212     fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
213     fCallbackFn        = NULL;
214     fCallbackContext   = NULL;
215     fFindProgressCallbackFn      = NULL;
216     fFindProgressCallbackContext = NULL;
217     fTraceDebug        = FALSE;
218     fDeferredStatus    = status;
219     fData              = fSmallData;
220     fWordBreakItr      = NULL;
221 
222     fStack             = NULL;
223     fInputText         = NULL;
224     fAltInputText      = NULL;
225     fInput             = NULL;
226     fInputLength       = 0;
227     fInputUniStrMaybeMutable = FALSE;
228 }
229 
230 //
231 //  init2()   Common initialization for use by RegexMatcher constructors, part 2.
232 //            This handles the common setup to be done after the Pattern is available.
233 //
init2(UText * input,UErrorCode & status)234 void RegexMatcher::init2(UText *input, UErrorCode &status) {
235     if (U_FAILURE(status)) {
236         fDeferredStatus = status;
237         return;
238     }
239 
240     if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
241         fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
242         if (fData == NULL) {
243             status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
244             return;
245         }
246     }
247 
248     fStack = new UVector64(status);
249     if (fStack == NULL) {
250         status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
251         return;
252     }
253 
254     reset(input);
255     setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
256     if (U_FAILURE(status)) {
257         fDeferredStatus = status;
258         return;
259     }
260 }
261 
262 
263 static const UChar BACKSLASH  = 0x5c;
264 static const UChar DOLLARSIGN = 0x24;
265 static const UChar LEFTBRACKET = 0x7b;
266 static const UChar RIGHTBRACKET = 0x7d;
267 
268 //--------------------------------------------------------------------------------
269 //
270 //    appendReplacement
271 //
272 //--------------------------------------------------------------------------------
appendReplacement(UnicodeString & dest,const UnicodeString & replacement,UErrorCode & status)273 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
274                                               const UnicodeString &replacement,
275                                               UErrorCode &status) {
276     UText replacementText = UTEXT_INITIALIZER;
277 
278     utext_openConstUnicodeString(&replacementText, &replacement, &status);
279     if (U_SUCCESS(status)) {
280         UText resultText = UTEXT_INITIALIZER;
281         utext_openUnicodeString(&resultText, &dest, &status);
282 
283         if (U_SUCCESS(status)) {
284             appendReplacement(&resultText, &replacementText, status);
285             utext_close(&resultText);
286         }
287         utext_close(&replacementText);
288     }
289 
290     return *this;
291 }
292 
293 //
294 //    appendReplacement, UText mode
295 //
appendReplacement(UText * dest,UText * replacement,UErrorCode & status)296 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
297                                               UText *replacement,
298                                               UErrorCode &status) {
299     if (U_FAILURE(status)) {
300         return *this;
301     }
302     if (U_FAILURE(fDeferredStatus)) {
303         status = fDeferredStatus;
304         return *this;
305     }
306     if (fMatch == FALSE) {
307         status = U_REGEX_INVALID_STATE;
308         return *this;
309     }
310 
311     // Copy input string from the end of previous match to start of current match
312     int64_t  destLen = utext_nativeLength(dest);
313     if (fMatchStart > fAppendPosition) {
314         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
315             destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
316                                      (int32_t)(fMatchStart-fAppendPosition), &status);
317         } else {
318             int32_t len16;
319             if (UTEXT_USES_U16(fInputText)) {
320                 len16 = (int32_t)(fMatchStart-fAppendPosition);
321             } else {
322                 UErrorCode lengthStatus = U_ZERO_ERROR;
323                 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
324             }
325             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
326             if (inputChars == NULL) {
327                 status = U_MEMORY_ALLOCATION_ERROR;
328                 return *this;
329             }
330             utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
331             destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
332             uprv_free(inputChars);
333         }
334     }
335     fAppendPosition = fMatchEnd;
336 
337 
338     // scan the replacement text, looking for substitutions ($n) and \escapes.
339     //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
340     //         move entire ranges not containing substitutions.
341     UTEXT_SETNATIVEINDEX(replacement, 0);
342     for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL;  c = UTEXT_NEXT32(replacement)) {
343         if (c == BACKSLASH) {
344             // Backslash Escape.  Copy the following char out without further checks.
345             //                    Note:  Surrogate pairs don't need any special handling
346             //                           The second half wont be a '$' or a '\', and
347             //                           will move to the dest normally on the next
348             //                           loop iteration.
349             c = UTEXT_CURRENT32(replacement);
350             if (c == U_SENTINEL) {
351                 break;
352             }
353 
354             if (c==0x55/*U*/ || c==0x75/*u*/) {
355                 // We have a \udddd or \Udddddddd escape sequence.
356                 int32_t offset = 0;
357                 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
358                 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
359                 if (escapedChar != (UChar32)0xFFFFFFFF) {
360                     if (U_IS_BMP(escapedChar)) {
361                         UChar c16 = (UChar)escapedChar;
362                         destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
363                     } else {
364                         UChar surrogate[2];
365                         surrogate[0] = U16_LEAD(escapedChar);
366                         surrogate[1] = U16_TRAIL(escapedChar);
367                         if (U_SUCCESS(status)) {
368                             destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
369                         }
370                     }
371                     // TODO:  Report errors for mal-formed \u escapes?
372                     //        As this is, the original sequence is output, which may be OK.
373                     if (context.lastOffset == offset) {
374                         (void)UTEXT_PREVIOUS32(replacement);
375                     } else if (context.lastOffset != offset-1) {
376                         utext_moveIndex32(replacement, offset - context.lastOffset - 1);
377                     }
378                 }
379             } else {
380                 (void)UTEXT_NEXT32(replacement);
381                 // Plain backslash escape.  Just put out the escaped character.
382                 if (U_IS_BMP(c)) {
383                     UChar c16 = (UChar)c;
384                     destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
385                 } else {
386                     UChar surrogate[2];
387                     surrogate[0] = U16_LEAD(c);
388                     surrogate[1] = U16_TRAIL(c);
389                     if (U_SUCCESS(status)) {
390                         destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
391                     }
392                 }
393             }
394         } else if (c != DOLLARSIGN) {
395             // Normal char, not a $.  Copy it out without further checks.
396             if (U_IS_BMP(c)) {
397                 UChar c16 = (UChar)c;
398                 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
399             } else {
400                 UChar surrogate[2];
401                 surrogate[0] = U16_LEAD(c);
402                 surrogate[1] = U16_TRAIL(c);
403                 if (U_SUCCESS(status)) {
404                     destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
405                 }
406             }
407         } else {
408             // We've got a $.  Pick up a capture group name or number if one follows.
409             // Consume digits so long as the resulting group number <= the number of
410             // number of capture groups in the pattern.
411 
412             int32_t groupNum  = 0;
413             int32_t numDigits = 0;
414             UChar32 nextChar = utext_current32(replacement);
415             if (nextChar == LEFTBRACKET) {
416                 // Scan for a Named Capture Group, ${name}.
417                 UnicodeString groupName;
418                 utext_next32(replacement);
419                 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
420                     nextChar = utext_next32(replacement);
421                     if (nextChar == U_SENTINEL) {
422                         status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
423                     } else if ((nextChar >= 0x41 && nextChar <= 0x5a) ||       // A..Z
424                                (nextChar >= 0x61 && nextChar <= 0x7a) ||       // a..z
425                                (nextChar >= 0x31 && nextChar <= 0x39)) {       // 0..9
426                         groupName.append(nextChar);
427                     } else if (nextChar == RIGHTBRACKET) {
428                         groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName);
429                         if (groupNum == 0) {
430                             status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
431                         }
432                     } else {
433                         // Character was something other than a name char or a closing '}'
434                         status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
435                     }
436                 }
437 
438             } else if (u_isdigit(nextChar)) {
439                 // $n    Scan for a capture group number
440                 int32_t numCaptureGroups = fPattern->fGroupMap->size();
441                 for (;;) {
442                     nextChar = UTEXT_CURRENT32(replacement);
443                     if (nextChar == U_SENTINEL) {
444                         break;
445                     }
446                     if (u_isdigit(nextChar) == FALSE) {
447                         break;
448                     }
449                     int32_t nextDigitVal = u_charDigitValue(nextChar);
450                     if (groupNum*10 + nextDigitVal > numCaptureGroups) {
451                         // Don't consume the next digit if it makes the capture group number too big.
452                         if (numDigits == 0) {
453                             status = U_INDEX_OUTOFBOUNDS_ERROR;
454                         }
455                         break;
456                     }
457                     (void)UTEXT_NEXT32(replacement);
458                     groupNum=groupNum*10 + nextDigitVal;
459                     ++numDigits;
460                 }
461             } else {
462                 // $ not followed by capture group name or number.
463                 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
464             }
465 
466             if (U_SUCCESS(status)) {
467                 destLen += appendGroup(groupNum, dest, status);
468             }
469         }  // End of $ capture group handling
470     }  // End of per-character loop through the replacement string.
471 
472     return *this;
473 }
474 
475 
476 
477 //--------------------------------------------------------------------------------
478 //
479 //    appendTail     Intended to be used in conjunction with appendReplacement()
480 //                   To the destination string, append everything following
481 //                   the last match position from the input string.
482 //
483 //                   Note:  Match ranges do not affect appendTail or appendReplacement
484 //
485 //--------------------------------------------------------------------------------
appendTail(UnicodeString & dest)486 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
487     UErrorCode status = U_ZERO_ERROR;
488     UText resultText = UTEXT_INITIALIZER;
489     utext_openUnicodeString(&resultText, &dest, &status);
490 
491     if (U_SUCCESS(status)) {
492         appendTail(&resultText, status);
493         utext_close(&resultText);
494     }
495 
496     return dest;
497 }
498 
499 //
500 //   appendTail, UText mode
501 //
appendTail(UText * dest,UErrorCode & status)502 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
503     if (U_FAILURE(status)) {
504         return dest;
505     }
506     if (U_FAILURE(fDeferredStatus)) {
507         status = fDeferredStatus;
508         return dest;
509     }
510 
511     if (fInputLength > fAppendPosition) {
512         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
513             int64_t destLen = utext_nativeLength(dest);
514             utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
515                           (int32_t)(fInputLength-fAppendPosition), &status);
516         } else {
517             int32_t len16;
518             if (UTEXT_USES_U16(fInputText)) {
519                 len16 = (int32_t)(fInputLength-fAppendPosition);
520             } else {
521                 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
522                 status = U_ZERO_ERROR; // buffer overflow
523             }
524 
525             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
526             if (inputChars == NULL) {
527                 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
528             } else {
529                 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
530                 int64_t destLen = utext_nativeLength(dest);
531                 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
532                 uprv_free(inputChars);
533             }
534         }
535     }
536     return dest;
537 }
538 
539 
540 
541 //--------------------------------------------------------------------------------
542 //
543 //   end
544 //
545 //--------------------------------------------------------------------------------
end(UErrorCode & err) const546 int32_t RegexMatcher::end(UErrorCode &err) const {
547     return end(0, err);
548 }
549 
end64(UErrorCode & err) const550 int64_t RegexMatcher::end64(UErrorCode &err) const {
551     return end64(0, err);
552 }
553 
end64(int32_t group,UErrorCode & err) const554 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
555     if (U_FAILURE(err)) {
556         return -1;
557     }
558     if (fMatch == FALSE) {
559         err = U_REGEX_INVALID_STATE;
560         return -1;
561     }
562     if (group < 0 || group > fPattern->fGroupMap->size()) {
563         err = U_INDEX_OUTOFBOUNDS_ERROR;
564         return -1;
565     }
566     int64_t e = -1;
567     if (group == 0) {
568         e = fMatchEnd;
569     } else {
570         // Get the position within the stack frame of the variables for
571         //    this capture group.
572         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
573         U_ASSERT(groupOffset < fPattern->fFrameSize);
574         U_ASSERT(groupOffset >= 0);
575         e = fFrame->fExtra[groupOffset + 1];
576     }
577 
578         return e;
579 }
580 
end(int32_t group,UErrorCode & err) const581 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
582     return (int32_t)end64(group, err);
583 }
584 
585 //--------------------------------------------------------------------------------
586 //
587 //   findProgressInterrupt  This function is called once for each advance in the target
588 //                          string from the find() function, and calls the user progress callback
589 //                          function if there is one installed.
590 //
591 //         Return:  TRUE if the find operation is to be terminated.
592 //                  FALSE if the find operation is to continue running.
593 //
594 //--------------------------------------------------------------------------------
findProgressInterrupt(int64_t pos,UErrorCode & status)595 UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
596     if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
597         status = U_REGEX_STOPPED_BY_CALLER;
598         return TRUE;
599     }
600     return FALSE;
601 }
602 
603 //--------------------------------------------------------------------------------
604 //
605 //   find()
606 //
607 //--------------------------------------------------------------------------------
find()608 UBool RegexMatcher::find() {
609     if (U_FAILURE(fDeferredStatus)) {
610         return FALSE;
611     }
612     UErrorCode status = U_ZERO_ERROR;
613     UBool result = find(status);
614     return result;
615 }
616 
617 //--------------------------------------------------------------------------------
618 //
619 //   find()
620 //
621 //--------------------------------------------------------------------------------
find(UErrorCode & status)622 UBool RegexMatcher::find(UErrorCode &status) {
623     // Start at the position of the last match end.  (Will be zero if the
624     //   matcher has been reset.)
625     //
626     if (U_FAILURE(status)) {
627         return FALSE;
628     }
629     if (U_FAILURE(fDeferredStatus)) {
630         status = fDeferredStatus;
631         return FALSE;
632     }
633 
634     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
635         return findUsingChunk(status);
636     }
637 
638     int64_t startPos = fMatchEnd;
639     if (startPos==0) {
640         startPos = fActiveStart;
641     }
642 
643     if (fMatch) {
644         // Save the position of any previous successful match.
645         fLastMatchEnd = fMatchEnd;
646 
647         if (fMatchStart == fMatchEnd) {
648             // Previous match had zero length.  Move start position up one position
649             //  to avoid sending find() into a loop on zero-length matches.
650             if (startPos >= fActiveLimit) {
651                 fMatch = FALSE;
652                 fHitEnd = TRUE;
653                 return FALSE;
654             }
655             UTEXT_SETNATIVEINDEX(fInputText, startPos);
656             (void)UTEXT_NEXT32(fInputText);
657             startPos = UTEXT_GETNATIVEINDEX(fInputText);
658         }
659     } else {
660         if (fLastMatchEnd >= 0) {
661             // A previous find() failed to match.  Don't try again.
662             //   (without this test, a pattern with a zero-length match
663             //    could match again at the end of an input string.)
664             fHitEnd = TRUE;
665             return FALSE;
666         }
667     }
668 
669 
670     // Compute the position in the input string beyond which a match can not begin, because
671     //   the minimum length match would extend past the end of the input.
672     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
673     //          Be aware of possible overflows if making changes here.
674     int64_t testStartLimit;
675     if (UTEXT_USES_U16(fInputText)) {
676         testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
677         if (startPos > testStartLimit) {
678             fMatch = FALSE;
679             fHitEnd = TRUE;
680             return FALSE;
681         }
682     } else {
683         // We don't know exactly how long the minimum match length is in native characters.
684         // Treat anything > 0 as 1.
685         testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
686     }
687 
688     UChar32  c;
689     U_ASSERT(startPos >= 0);
690 
691     switch (fPattern->fStartType) {
692     case START_NO_INFO:
693         // No optimization was found.
694         //  Try a match at each input position.
695         for (;;) {
696             MatchAt(startPos, FALSE, status);
697             if (U_FAILURE(status)) {
698                 return FALSE;
699             }
700             if (fMatch) {
701                 return TRUE;
702             }
703             if (startPos >= testStartLimit) {
704                 fHitEnd = TRUE;
705                 return FALSE;
706             }
707             UTEXT_SETNATIVEINDEX(fInputText, startPos);
708             (void)UTEXT_NEXT32(fInputText);
709             startPos = UTEXT_GETNATIVEINDEX(fInputText);
710             // Note that it's perfectly OK for a pattern to have a zero-length
711             //   match at the end of a string, so we must make sure that the loop
712             //   runs with startPos == testStartLimit the last time through.
713             if  (findProgressInterrupt(startPos, status))
714                 return FALSE;
715         }
716         U_ASSERT(FALSE);
717 
718     case START_START:
719         // Matches are only possible at the start of the input string
720         //   (pattern begins with ^ or \A)
721         if (startPos > fActiveStart) {
722             fMatch = FALSE;
723             return FALSE;
724         }
725         MatchAt(startPos, FALSE, status);
726         if (U_FAILURE(status)) {
727             return FALSE;
728         }
729         return fMatch;
730 
731 
732     case START_SET:
733         {
734             // Match may start on any char from a pre-computed set.
735             U_ASSERT(fPattern->fMinMatchLen > 0);
736             UTEXT_SETNATIVEINDEX(fInputText, startPos);
737             for (;;) {
738                 int64_t pos = startPos;
739                 c = UTEXT_NEXT32(fInputText);
740                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
741                 // c will be -1 (U_SENTINEL) at end of text, in which case we
742                 // skip this next block (so we don't have a negative array index)
743                 // and handle end of text in the following block.
744                 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
745                               (c>=256 && fPattern->fInitialChars->contains(c)))) {
746                     MatchAt(pos, FALSE, status);
747                     if (U_FAILURE(status)) {
748                         return FALSE;
749                     }
750                     if (fMatch) {
751                         return TRUE;
752                     }
753                     UTEXT_SETNATIVEINDEX(fInputText, pos);
754                 }
755                 if (startPos > testStartLimit) {
756                     fMatch = FALSE;
757                     fHitEnd = TRUE;
758                     return FALSE;
759                 }
760                 if  (findProgressInterrupt(startPos, status))
761                     return FALSE;
762             }
763         }
764         U_ASSERT(FALSE);
765 
766     case START_STRING:
767     case START_CHAR:
768         {
769             // Match starts on exactly one char.
770             U_ASSERT(fPattern->fMinMatchLen > 0);
771             UChar32 theChar = fPattern->fInitialChar;
772             UTEXT_SETNATIVEINDEX(fInputText, startPos);
773             for (;;) {
774                 int64_t pos = startPos;
775                 c = UTEXT_NEXT32(fInputText);
776                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
777                 if (c == theChar) {
778                     MatchAt(pos, FALSE, status);
779                     if (U_FAILURE(status)) {
780                         return FALSE;
781                     }
782                     if (fMatch) {
783                         return TRUE;
784                     }
785                     UTEXT_SETNATIVEINDEX(fInputText, pos);
786                 }
787                 if (startPos > testStartLimit) {
788                     fMatch = FALSE;
789                     fHitEnd = TRUE;
790                     return FALSE;
791                 }
792                 if  (findProgressInterrupt(startPos, status))
793                     return FALSE;
794            }
795         }
796         U_ASSERT(FALSE);
797 
798     case START_LINE:
799         {
800             UChar32  c;
801             if (startPos == fAnchorStart) {
802                 MatchAt(startPos, FALSE, status);
803                 if (U_FAILURE(status)) {
804                     return FALSE;
805                 }
806                 if (fMatch) {
807                     return TRUE;
808                 }
809                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
810                 c = UTEXT_NEXT32(fInputText);
811                 startPos = UTEXT_GETNATIVEINDEX(fInputText);
812             } else {
813                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
814                 c = UTEXT_PREVIOUS32(fInputText);
815                 UTEXT_SETNATIVEINDEX(fInputText, startPos);
816             }
817 
818             if (fPattern->fFlags & UREGEX_UNIX_LINES) {
819                 for (;;) {
820                     if (c == 0x0a) {
821                             MatchAt(startPos, FALSE, status);
822                             if (U_FAILURE(status)) {
823                                 return FALSE;
824                             }
825                             if (fMatch) {
826                                 return TRUE;
827                             }
828                             UTEXT_SETNATIVEINDEX(fInputText, startPos);
829                     }
830                     if (startPos >= testStartLimit) {
831                         fMatch = FALSE;
832                         fHitEnd = TRUE;
833                         return FALSE;
834                     }
835                     c = UTEXT_NEXT32(fInputText);
836                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
837                     // Note that it's perfectly OK for a pattern to have a zero-length
838                     //   match at the end of a string, so we must make sure that the loop
839                     //   runs with startPos == testStartLimit the last time through.
840                     if  (findProgressInterrupt(startPos, status))
841                         return FALSE;
842                 }
843             } else {
844                 for (;;) {
845                     if (isLineTerminator(c)) {
846                         if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
847                             (void)UTEXT_NEXT32(fInputText);
848                             startPos = UTEXT_GETNATIVEINDEX(fInputText);
849                         }
850                         MatchAt(startPos, FALSE, status);
851                         if (U_FAILURE(status)) {
852                             return FALSE;
853                         }
854                         if (fMatch) {
855                             return TRUE;
856                         }
857                         UTEXT_SETNATIVEINDEX(fInputText, startPos);
858                     }
859                     if (startPos >= testStartLimit) {
860                         fMatch = FALSE;
861                         fHitEnd = TRUE;
862                         return FALSE;
863                     }
864                     c = UTEXT_NEXT32(fInputText);
865                     startPos = UTEXT_GETNATIVEINDEX(fInputText);
866                     // Note that it's perfectly OK for a pattern to have a zero-length
867                     //   match at the end of a string, so we must make sure that the loop
868                     //   runs with startPos == testStartLimit the last time through.
869                     if  (findProgressInterrupt(startPos, status))
870                         return FALSE;
871                 }
872             }
873         }
874 
875     default:
876         U_ASSERT(FALSE);
877     }
878 
879     U_ASSERT(FALSE);
880     return FALSE;
881 }
882 
883 
884 
find(int64_t start,UErrorCode & status)885 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
886     if (U_FAILURE(status)) {
887         return FALSE;
888     }
889     if (U_FAILURE(fDeferredStatus)) {
890         status = fDeferredStatus;
891         return FALSE;
892     }
893     this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
894                                           //        This will reset the region to be the full input length.
895     if (start < 0) {
896         status = U_INDEX_OUTOFBOUNDS_ERROR;
897         return FALSE;
898     }
899 
900     int64_t nativeStart = start;
901     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
902         status = U_INDEX_OUTOFBOUNDS_ERROR;
903         return FALSE;
904     }
905     fMatchEnd = nativeStart;
906     return find(status);
907 }
908 
909 
910 //--------------------------------------------------------------------------------
911 //
912 //   findUsingChunk() -- like find(), but with the advance knowledge that the
913 //                       entire string is available in the UText's chunk buffer.
914 //
915 //--------------------------------------------------------------------------------
findUsingChunk(UErrorCode & status)916 UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
917     // Start at the position of the last match end.  (Will be zero if the
918     //   matcher has been reset.
919     //
920 
921     int32_t startPos = (int32_t)fMatchEnd;
922     if (startPos==0) {
923         startPos = (int32_t)fActiveStart;
924     }
925 
926     const UChar *inputBuf = fInputText->chunkContents;
927 
928     if (fMatch) {
929         // Save the position of any previous successful match.
930         fLastMatchEnd = fMatchEnd;
931 
932         if (fMatchStart == fMatchEnd) {
933             // Previous match had zero length.  Move start position up one position
934             //  to avoid sending find() into a loop on zero-length matches.
935             if (startPos >= fActiveLimit) {
936                 fMatch = FALSE;
937                 fHitEnd = TRUE;
938                 return FALSE;
939             }
940             U16_FWD_1(inputBuf, startPos, fInputLength);
941         }
942     } else {
943         if (fLastMatchEnd >= 0) {
944             // A previous find() failed to match.  Don't try again.
945             //   (without this test, a pattern with a zero-length match
946             //    could match again at the end of an input string.)
947             fHitEnd = TRUE;
948             return FALSE;
949         }
950     }
951 
952 
953     // Compute the position in the input string beyond which a match can not begin, because
954     //   the minimum length match would extend past the end of the input.
955     //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
956     //          Be aware of possible overflows if making changes here.
957     //   Note:  a match can begin at inputBuf + testLen; it is an inclusive limit.
958     int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
959     if (startPos > testLen) {
960         fMatch = FALSE;
961         fHitEnd = TRUE;
962         return FALSE;
963     }
964 
965     UChar32  c;
966     U_ASSERT(startPos >= 0);
967 
968     switch (fPattern->fStartType) {
969     case START_NO_INFO:
970         // No optimization was found.
971         //  Try a match at each input position.
972         for (;;) {
973             MatchChunkAt(startPos, FALSE, status);
974             if (U_FAILURE(status)) {
975                 return FALSE;
976             }
977             if (fMatch) {
978                 return TRUE;
979             }
980             if (startPos >= testLen) {
981                 fHitEnd = TRUE;
982                 return FALSE;
983             }
984             U16_FWD_1(inputBuf, startPos, fActiveLimit);
985             // Note that it's perfectly OK for a pattern to have a zero-length
986             //   match at the end of a string, so we must make sure that the loop
987             //   runs with startPos == testLen the last time through.
988             if  (findProgressInterrupt(startPos, status))
989                 return FALSE;
990         }
991         U_ASSERT(FALSE);
992 
993     case START_START:
994         // Matches are only possible at the start of the input string
995         //   (pattern begins with ^ or \A)
996         if (startPos > fActiveStart) {
997             fMatch = FALSE;
998             return FALSE;
999         }
1000         MatchChunkAt(startPos, FALSE, status);
1001         if (U_FAILURE(status)) {
1002             return FALSE;
1003         }
1004         return fMatch;
1005 
1006 
1007     case START_SET:
1008     {
1009         // Match may start on any char from a pre-computed set.
1010         U_ASSERT(fPattern->fMinMatchLen > 0);
1011         for (;;) {
1012             int32_t pos = startPos;
1013             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1014             if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
1015                 (c>=256 && fPattern->fInitialChars->contains(c))) {
1016                 MatchChunkAt(pos, FALSE, status);
1017                 if (U_FAILURE(status)) {
1018                     return FALSE;
1019                 }
1020                 if (fMatch) {
1021                     return TRUE;
1022                 }
1023             }
1024             if (startPos > testLen) {
1025                 fMatch = FALSE;
1026                 fHitEnd = TRUE;
1027                 return FALSE;
1028             }
1029             if  (findProgressInterrupt(startPos, status))
1030                 return FALSE;
1031         }
1032     }
1033         U_ASSERT(FALSE);
1034 
1035     case START_STRING:
1036     case START_CHAR:
1037     {
1038         // Match starts on exactly one char.
1039         U_ASSERT(fPattern->fMinMatchLen > 0);
1040         UChar32 theChar = fPattern->fInitialChar;
1041         for (;;) {
1042             int32_t pos = startPos;
1043             U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1044             if (c == theChar) {
1045                 MatchChunkAt(pos, FALSE, status);
1046                 if (U_FAILURE(status)) {
1047                     return FALSE;
1048                 }
1049                 if (fMatch) {
1050                     return TRUE;
1051                 }
1052             }
1053             if (startPos > testLen) {
1054                 fMatch = FALSE;
1055                 fHitEnd = TRUE;
1056                 return FALSE;
1057             }
1058             if  (findProgressInterrupt(startPos, status))
1059                 return FALSE;
1060         }
1061     }
1062     U_ASSERT(FALSE);
1063 
1064     case START_LINE:
1065     {
1066         UChar32  c;
1067         if (startPos == fAnchorStart) {
1068             MatchChunkAt(startPos, FALSE, status);
1069             if (U_FAILURE(status)) {
1070                 return FALSE;
1071             }
1072             if (fMatch) {
1073                 return TRUE;
1074             }
1075             U16_FWD_1(inputBuf, startPos, fActiveLimit);
1076         }
1077 
1078         if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1079             for (;;) {
1080                 c = inputBuf[startPos-1];
1081                 if (c == 0x0a) {
1082                     MatchChunkAt(startPos, FALSE, status);
1083                     if (U_FAILURE(status)) {
1084                         return FALSE;
1085                     }
1086                     if (fMatch) {
1087                         return TRUE;
1088                     }
1089                 }
1090                 if (startPos >= testLen) {
1091                     fMatch = FALSE;
1092                     fHitEnd = TRUE;
1093                     return FALSE;
1094                 }
1095                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1096                 // Note that it's perfectly OK for a pattern to have a zero-length
1097                 //   match at the end of a string, so we must make sure that the loop
1098                 //   runs with startPos == testLen the last time through.
1099                 if  (findProgressInterrupt(startPos, status))
1100                     return FALSE;
1101             }
1102         } else {
1103             for (;;) {
1104                 c = inputBuf[startPos-1];
1105                 if (isLineTerminator(c)) {
1106                     if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1107                         startPos++;
1108                     }
1109                     MatchChunkAt(startPos, FALSE, status);
1110                     if (U_FAILURE(status)) {
1111                         return FALSE;
1112                     }
1113                     if (fMatch) {
1114                         return TRUE;
1115                     }
1116                 }
1117                 if (startPos >= testLen) {
1118                     fMatch = FALSE;
1119                     fHitEnd = TRUE;
1120                     return FALSE;
1121                 }
1122                 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1123                 // Note that it's perfectly OK for a pattern to have a zero-length
1124                 //   match at the end of a string, so we must make sure that the loop
1125                 //   runs with startPos == testLen the last time through.
1126                 if  (findProgressInterrupt(startPos, status))
1127                     return FALSE;
1128             }
1129         }
1130     }
1131 
1132     default:
1133         U_ASSERT(FALSE);
1134     }
1135 
1136     U_ASSERT(FALSE);
1137     return FALSE;
1138 }
1139 
1140 
1141 
1142 //--------------------------------------------------------------------------------
1143 //
1144 //  group()
1145 //
1146 //--------------------------------------------------------------------------------
group(UErrorCode & status) const1147 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1148     return group(0, status);
1149 }
1150 
1151 //  Return immutable shallow clone
group(UText * dest,int64_t & group_len,UErrorCode & status) const1152 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1153     return group(0, dest, group_len, status);
1154 }
1155 
1156 //  Return immutable shallow clone
group(int32_t groupNum,UText * dest,int64_t & group_len,UErrorCode & status) const1157 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1158     group_len = 0;
1159     if (U_FAILURE(status)) {
1160         return dest;
1161     }
1162     if (U_FAILURE(fDeferredStatus)) {
1163         status = fDeferredStatus;
1164     } else if (fMatch == FALSE) {
1165         status = U_REGEX_INVALID_STATE;
1166     } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1167         status = U_INDEX_OUTOFBOUNDS_ERROR;
1168     }
1169 
1170     if (U_FAILURE(status)) {
1171         return dest;
1172     }
1173 
1174     int64_t s, e;
1175     if (groupNum == 0) {
1176         s = fMatchStart;
1177         e = fMatchEnd;
1178     } else {
1179         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1180         U_ASSERT(groupOffset < fPattern->fFrameSize);
1181         U_ASSERT(groupOffset >= 0);
1182         s = fFrame->fExtra[groupOffset];
1183         e = fFrame->fExtra[groupOffset+1];
1184     }
1185 
1186     if (s < 0) {
1187         // A capture group wasn't part of the match
1188         return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1189     }
1190     U_ASSERT(s <= e);
1191     group_len = e - s;
1192 
1193     dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1194     if (dest)
1195         UTEXT_SETNATIVEINDEX(dest, s);
1196     return dest;
1197 }
1198 
group(int32_t groupNum,UErrorCode & status) const1199 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1200     UnicodeString result;
1201     int64_t groupStart = start64(groupNum, status);
1202     int64_t groupEnd = end64(groupNum, status);
1203     if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
1204         return result;
1205     }
1206 
1207     // Get the group length using a utext_extract preflight.
1208     //    UText is actually pretty efficient at this when underlying encoding is UTF-16.
1209     int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status);
1210     if (status != U_BUFFER_OVERFLOW_ERROR) {
1211         return result;
1212     }
1213 
1214     status = U_ZERO_ERROR;
1215     UChar *buf = result.getBuffer(length);
1216     if (buf == NULL) {
1217         status = U_MEMORY_ALLOCATION_ERROR;
1218     } else {
1219         int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
1220         result.releaseBuffer(extractLength);
1221         U_ASSERT(length == extractLength);
1222     }
1223     return result;
1224 }
1225 
1226 
1227 //--------------------------------------------------------------------------------
1228 //
1229 //  appendGroup() -- currently internal only, appends a group to a UText rather
1230 //                   than replacing its contents
1231 //
1232 //--------------------------------------------------------------------------------
1233 
appendGroup(int32_t groupNum,UText * dest,UErrorCode & status) const1234 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1235     if (U_FAILURE(status)) {
1236         return 0;
1237     }
1238     if (U_FAILURE(fDeferredStatus)) {
1239         status = fDeferredStatus;
1240         return 0;
1241     }
1242     int64_t destLen = utext_nativeLength(dest);
1243 
1244     if (fMatch == FALSE) {
1245         status = U_REGEX_INVALID_STATE;
1246         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1247     }
1248     if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1249         status = U_INDEX_OUTOFBOUNDS_ERROR;
1250         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1251     }
1252 
1253     int64_t s, e;
1254     if (groupNum == 0) {
1255         s = fMatchStart;
1256         e = fMatchEnd;
1257     } else {
1258         int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1259         U_ASSERT(groupOffset < fPattern->fFrameSize);
1260         U_ASSERT(groupOffset >= 0);
1261         s = fFrame->fExtra[groupOffset];
1262         e = fFrame->fExtra[groupOffset+1];
1263     }
1264 
1265     if (s < 0) {
1266         // A capture group wasn't part of the match
1267         return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1268     }
1269     U_ASSERT(s <= e);
1270 
1271     int64_t deltaLen;
1272     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1273         U_ASSERT(e <= fInputLength);
1274         deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1275     } else {
1276         int32_t len16;
1277         if (UTEXT_USES_U16(fInputText)) {
1278             len16 = (int32_t)(e-s);
1279         } else {
1280             UErrorCode lengthStatus = U_ZERO_ERROR;
1281             len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1282         }
1283         UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1284         if (groupChars == NULL) {
1285             status = U_MEMORY_ALLOCATION_ERROR;
1286             return 0;
1287         }
1288         utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1289 
1290         deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1291         uprv_free(groupChars);
1292     }
1293     return deltaLen;
1294 }
1295 
1296 
1297 
1298 //--------------------------------------------------------------------------------
1299 //
1300 //  groupCount()
1301 //
1302 //--------------------------------------------------------------------------------
groupCount() const1303 int32_t RegexMatcher::groupCount() const {
1304     return fPattern->fGroupMap->size();
1305 }
1306 
1307 //--------------------------------------------------------------------------------
1308 //
1309 //  hasAnchoringBounds()
1310 //
1311 //--------------------------------------------------------------------------------
hasAnchoringBounds() const1312 UBool RegexMatcher::hasAnchoringBounds() const {
1313     return fAnchoringBounds;
1314 }
1315 
1316 
1317 //--------------------------------------------------------------------------------
1318 //
1319 //  hasTransparentBounds()
1320 //
1321 //--------------------------------------------------------------------------------
hasTransparentBounds() const1322 UBool RegexMatcher::hasTransparentBounds() const {
1323     return fTransparentBounds;
1324 }
1325 
1326 
1327 
1328 //--------------------------------------------------------------------------------
1329 //
1330 //  hitEnd()
1331 //
1332 //--------------------------------------------------------------------------------
hitEnd() const1333 UBool RegexMatcher::hitEnd() const {
1334     return fHitEnd;
1335 }
1336 
1337 
1338 //--------------------------------------------------------------------------------
1339 //
1340 //  input()
1341 //
1342 //--------------------------------------------------------------------------------
input() const1343 const UnicodeString &RegexMatcher::input() const {
1344     if (!fInput) {
1345         UErrorCode status = U_ZERO_ERROR;
1346         int32_t len16;
1347         if (UTEXT_USES_U16(fInputText)) {
1348             len16 = (int32_t)fInputLength;
1349         } else {
1350             len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1351             status = U_ZERO_ERROR; // overflow, length status
1352         }
1353         UnicodeString *result = new UnicodeString(len16, 0, 0);
1354 
1355         UChar *inputChars = result->getBuffer(len16);
1356         utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1357         result->releaseBuffer(len16);
1358 
1359         (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1360     }
1361 
1362     return *fInput;
1363 }
1364 
1365 //--------------------------------------------------------------------------------
1366 //
1367 //  inputText()
1368 //
1369 //--------------------------------------------------------------------------------
inputText() const1370 UText *RegexMatcher::inputText() const {
1371     return fInputText;
1372 }
1373 
1374 
1375 //--------------------------------------------------------------------------------
1376 //
1377 //  getInput() -- like inputText(), but makes a clone or copies into another UText
1378 //
1379 //--------------------------------------------------------------------------------
getInput(UText * dest,UErrorCode & status) const1380 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1381     if (U_FAILURE(status)) {
1382         return dest;
1383     }
1384     if (U_FAILURE(fDeferredStatus)) {
1385         status = fDeferredStatus;
1386         return dest;
1387     }
1388 
1389     if (dest) {
1390         if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1391             utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1392         } else {
1393             int32_t input16Len;
1394             if (UTEXT_USES_U16(fInputText)) {
1395                 input16Len = (int32_t)fInputLength;
1396             } else {
1397                 UErrorCode lengthStatus = U_ZERO_ERROR;
1398                 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1399             }
1400             UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1401             if (inputChars == NULL) {
1402                 return dest;
1403             }
1404 
1405             status = U_ZERO_ERROR;
1406             utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1407             status = U_ZERO_ERROR;
1408             utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1409 
1410             uprv_free(inputChars);
1411         }
1412         return dest;
1413     } else {
1414         return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1415     }
1416 }
1417 
1418 
1419 static UBool compat_SyncMutableUTextContents(UText *ut);
compat_SyncMutableUTextContents(UText * ut)1420 static UBool compat_SyncMutableUTextContents(UText *ut) {
1421     UBool retVal = FALSE;
1422 
1423     //  In the following test, we're really only interested in whether the UText should switch
1424     //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
1425     //  will still point to the correct data.
1426     if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1427         UnicodeString *us=(UnicodeString *)ut->context;
1428 
1429         // Update to the latest length.
1430         // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1431         int32_t newLength = us->length();
1432 
1433         // Update the chunk description.
1434         // The buffer may have switched between stack- and heap-based.
1435         ut->chunkContents    = us->getBuffer();
1436         ut->chunkLength      = newLength;
1437         ut->chunkNativeLimit = newLength;
1438         ut->nativeIndexingLimit = newLength;
1439         retVal = TRUE;
1440     }
1441 
1442     return retVal;
1443 }
1444 
1445 //--------------------------------------------------------------------------------
1446 //
1447 //  lookingAt()
1448 //
1449 //--------------------------------------------------------------------------------
lookingAt(UErrorCode & status)1450 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1451     if (U_FAILURE(status)) {
1452         return FALSE;
1453     }
1454     if (U_FAILURE(fDeferredStatus)) {
1455         status = fDeferredStatus;
1456         return FALSE;
1457     }
1458 
1459     if (fInputUniStrMaybeMutable) {
1460         if (compat_SyncMutableUTextContents(fInputText)) {
1461         fInputLength = utext_nativeLength(fInputText);
1462         reset();
1463         }
1464     }
1465     else {
1466         resetPreserveRegion();
1467     }
1468     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1469         MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1470     } else {
1471         MatchAt(fActiveStart, FALSE, status);
1472     }
1473     return fMatch;
1474 }
1475 
1476 
lookingAt(int64_t start,UErrorCode & status)1477 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1478     if (U_FAILURE(status)) {
1479         return FALSE;
1480     }
1481     if (U_FAILURE(fDeferredStatus)) {
1482         status = fDeferredStatus;
1483         return FALSE;
1484     }
1485     reset();
1486 
1487     if (start < 0) {
1488         status = U_INDEX_OUTOFBOUNDS_ERROR;
1489         return FALSE;
1490     }
1491 
1492     if (fInputUniStrMaybeMutable) {
1493         if (compat_SyncMutableUTextContents(fInputText)) {
1494         fInputLength = utext_nativeLength(fInputText);
1495         reset();
1496         }
1497     }
1498 
1499     int64_t nativeStart;
1500     nativeStart = start;
1501     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1502         status = U_INDEX_OUTOFBOUNDS_ERROR;
1503         return FALSE;
1504     }
1505 
1506     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1507         MatchChunkAt((int32_t)nativeStart, FALSE, status);
1508     } else {
1509         MatchAt(nativeStart, FALSE, status);
1510     }
1511     return fMatch;
1512 }
1513 
1514 
1515 
1516 //--------------------------------------------------------------------------------
1517 //
1518 //  matches()
1519 //
1520 //--------------------------------------------------------------------------------
matches(UErrorCode & status)1521 UBool RegexMatcher::matches(UErrorCode &status) {
1522     if (U_FAILURE(status)) {
1523         return FALSE;
1524     }
1525     if (U_FAILURE(fDeferredStatus)) {
1526         status = fDeferredStatus;
1527         return FALSE;
1528     }
1529 
1530     if (fInputUniStrMaybeMutable) {
1531         if (compat_SyncMutableUTextContents(fInputText)) {
1532         fInputLength = utext_nativeLength(fInputText);
1533         reset();
1534         }
1535     }
1536     else {
1537         resetPreserveRegion();
1538     }
1539 
1540     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1541         MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1542     } else {
1543         MatchAt(fActiveStart, TRUE, status);
1544     }
1545     return fMatch;
1546 }
1547 
1548 
matches(int64_t start,UErrorCode & status)1549 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1550     if (U_FAILURE(status)) {
1551         return FALSE;
1552     }
1553     if (U_FAILURE(fDeferredStatus)) {
1554         status = fDeferredStatus;
1555         return FALSE;
1556     }
1557     reset();
1558 
1559     if (start < 0) {
1560         status = U_INDEX_OUTOFBOUNDS_ERROR;
1561         return FALSE;
1562     }
1563 
1564     if (fInputUniStrMaybeMutable) {
1565         if (compat_SyncMutableUTextContents(fInputText)) {
1566         fInputLength = utext_nativeLength(fInputText);
1567         reset();
1568         }
1569     }
1570 
1571     int64_t nativeStart;
1572     nativeStart = start;
1573     if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1574         status = U_INDEX_OUTOFBOUNDS_ERROR;
1575         return FALSE;
1576     }
1577 
1578     if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1579         MatchChunkAt((int32_t)nativeStart, TRUE, status);
1580     } else {
1581         MatchAt(nativeStart, TRUE, status);
1582     }
1583     return fMatch;
1584 }
1585 
1586 
1587 
1588 //--------------------------------------------------------------------------------
1589 //
1590 //    pattern
1591 //
1592 //--------------------------------------------------------------------------------
pattern() const1593 const RegexPattern &RegexMatcher::pattern() const {
1594     return *fPattern;
1595 }
1596 
1597 
1598 
1599 //--------------------------------------------------------------------------------
1600 //
1601 //    region
1602 //
1603 //--------------------------------------------------------------------------------
region(int64_t regionStart,int64_t regionLimit,int64_t startIndex,UErrorCode & status)1604 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1605     if (U_FAILURE(status)) {
1606         return *this;
1607     }
1608 
1609     if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1610         status = U_ILLEGAL_ARGUMENT_ERROR;
1611     }
1612 
1613     int64_t nativeStart = regionStart;
1614     int64_t nativeLimit = regionLimit;
1615     if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1616       status = U_ILLEGAL_ARGUMENT_ERROR;
1617     }
1618 
1619     if (startIndex == -1)
1620       this->reset();
1621     else
1622       resetPreserveRegion();
1623 
1624     fRegionStart = nativeStart;
1625     fRegionLimit = nativeLimit;
1626     fActiveStart = nativeStart;
1627     fActiveLimit = nativeLimit;
1628 
1629     if (startIndex != -1) {
1630       if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1631           status = U_INDEX_OUTOFBOUNDS_ERROR;
1632       }
1633       fMatchEnd = startIndex;
1634     }
1635 
1636     if (!fTransparentBounds) {
1637         fLookStart = nativeStart;
1638         fLookLimit = nativeLimit;
1639     }
1640     if (fAnchoringBounds) {
1641         fAnchorStart = nativeStart;
1642         fAnchorLimit = nativeLimit;
1643     }
1644     return *this;
1645 }
1646 
region(int64_t start,int64_t limit,UErrorCode & status)1647 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1648   return region(start, limit, -1, status);
1649 }
1650 
1651 //--------------------------------------------------------------------------------
1652 //
1653 //    regionEnd
1654 //
1655 //--------------------------------------------------------------------------------
regionEnd() const1656 int32_t RegexMatcher::regionEnd() const {
1657     return (int32_t)fRegionLimit;
1658 }
1659 
regionEnd64() const1660 int64_t RegexMatcher::regionEnd64() const {
1661     return fRegionLimit;
1662 }
1663 
1664 //--------------------------------------------------------------------------------
1665 //
1666 //    regionStart
1667 //
1668 //--------------------------------------------------------------------------------
regionStart() const1669 int32_t RegexMatcher::regionStart() const {
1670     return (int32_t)fRegionStart;
1671 }
1672 
regionStart64() const1673 int64_t RegexMatcher::regionStart64() const {
1674     return fRegionStart;
1675 }
1676 
1677 
1678 //--------------------------------------------------------------------------------
1679 //
1680 //    replaceAll
1681 //
1682 //--------------------------------------------------------------------------------
replaceAll(const UnicodeString & replacement,UErrorCode & status)1683 UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1684     UText replacementText = UTEXT_INITIALIZER;
1685     UText resultText = UTEXT_INITIALIZER;
1686     UnicodeString resultString;
1687     if (U_FAILURE(status)) {
1688         return resultString;
1689     }
1690 
1691     utext_openConstUnicodeString(&replacementText, &replacement, &status);
1692     utext_openUnicodeString(&resultText, &resultString, &status);
1693 
1694     replaceAll(&replacementText, &resultText, status);
1695 
1696     utext_close(&resultText);
1697     utext_close(&replacementText);
1698 
1699     return resultString;
1700 }
1701 
1702 
1703 //
1704 //    replaceAll, UText mode
1705 //
replaceAll(UText * replacement,UText * dest,UErrorCode & status)1706 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1707     if (U_FAILURE(status)) {
1708         return dest;
1709     }
1710     if (U_FAILURE(fDeferredStatus)) {
1711         status = fDeferredStatus;
1712         return dest;
1713     }
1714 
1715     if (dest == NULL) {
1716         UnicodeString emptyString;
1717         UText empty = UTEXT_INITIALIZER;
1718 
1719         utext_openUnicodeString(&empty, &emptyString, &status);
1720         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1721         utext_close(&empty);
1722     }
1723 
1724     if (U_SUCCESS(status)) {
1725         reset();
1726         while (find()) {
1727             appendReplacement(dest, replacement, status);
1728             if (U_FAILURE(status)) {
1729                 break;
1730             }
1731         }
1732         appendTail(dest, status);
1733     }
1734 
1735     return dest;
1736 }
1737 
1738 
1739 //--------------------------------------------------------------------------------
1740 //
1741 //    replaceFirst
1742 //
1743 //--------------------------------------------------------------------------------
replaceFirst(const UnicodeString & replacement,UErrorCode & status)1744 UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1745     UText replacementText = UTEXT_INITIALIZER;
1746     UText resultText = UTEXT_INITIALIZER;
1747     UnicodeString resultString;
1748 
1749     utext_openConstUnicodeString(&replacementText, &replacement, &status);
1750     utext_openUnicodeString(&resultText, &resultString, &status);
1751 
1752     replaceFirst(&replacementText, &resultText, status);
1753 
1754     utext_close(&resultText);
1755     utext_close(&replacementText);
1756 
1757     return resultString;
1758 }
1759 
1760 //
1761 //    replaceFirst, UText mode
1762 //
replaceFirst(UText * replacement,UText * dest,UErrorCode & status)1763 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1764     if (U_FAILURE(status)) {
1765         return dest;
1766     }
1767     if (U_FAILURE(fDeferredStatus)) {
1768         status = fDeferredStatus;
1769         return dest;
1770     }
1771 
1772     reset();
1773     if (!find()) {
1774         return getInput(dest, status);
1775     }
1776 
1777     if (dest == NULL) {
1778         UnicodeString emptyString;
1779         UText empty = UTEXT_INITIALIZER;
1780 
1781         utext_openUnicodeString(&empty, &emptyString, &status);
1782         dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1783         utext_close(&empty);
1784     }
1785 
1786     appendReplacement(dest, replacement, status);
1787     appendTail(dest, status);
1788 
1789     return dest;
1790 }
1791 
1792 
1793 //--------------------------------------------------------------------------------
1794 //
1795 //     requireEnd
1796 //
1797 //--------------------------------------------------------------------------------
requireEnd() const1798 UBool RegexMatcher::requireEnd() const {
1799     return fRequireEnd;
1800 }
1801 
1802 
1803 //--------------------------------------------------------------------------------
1804 //
1805 //     reset
1806 //
1807 //--------------------------------------------------------------------------------
reset()1808 RegexMatcher &RegexMatcher::reset() {
1809     fRegionStart    = 0;
1810     fRegionLimit    = fInputLength;
1811     fActiveStart    = 0;
1812     fActiveLimit    = fInputLength;
1813     fAnchorStart    = 0;
1814     fAnchorLimit    = fInputLength;
1815     fLookStart      = 0;
1816     fLookLimit      = fInputLength;
1817     resetPreserveRegion();
1818     return *this;
1819 }
1820 
1821 
1822 
resetPreserveRegion()1823 void RegexMatcher::resetPreserveRegion() {
1824     fMatchStart     = 0;
1825     fMatchEnd       = 0;
1826     fLastMatchEnd   = -1;
1827     fAppendPosition = 0;
1828     fMatch          = FALSE;
1829     fHitEnd         = FALSE;
1830     fRequireEnd     = FALSE;
1831     fTime           = 0;
1832     fTickCounter    = TIMER_INITIAL_VALUE;
1833     //resetStack(); // more expensive than it looks...
1834 }
1835 
1836 
reset(const UnicodeString & input)1837 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1838     fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1839     if (fPattern->fNeedsAltInput) {
1840         fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1841     }
1842     if (U_FAILURE(fDeferredStatus)) {
1843         return *this;
1844     }
1845     fInputLength = utext_nativeLength(fInputText);
1846 
1847     reset();
1848     delete fInput;
1849     fInput = NULL;
1850 
1851     //  Do the following for any UnicodeString.
1852     //  This is for compatibility for those clients who modify the input string "live" during regex operations.
1853     fInputUniStrMaybeMutable = TRUE;
1854 
1855     if (fWordBreakItr != NULL) {
1856 #if UCONFIG_NO_BREAK_ITERATION==0
1857         UErrorCode status = U_ZERO_ERROR;
1858         fWordBreakItr->setText(fInputText, status);
1859 #endif
1860     }
1861     return *this;
1862 }
1863 
1864 
reset(UText * input)1865 RegexMatcher &RegexMatcher::reset(UText *input) {
1866     if (fInputText != input) {
1867         fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1868         if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1869         if (U_FAILURE(fDeferredStatus)) {
1870             return *this;
1871         }
1872         fInputLength = utext_nativeLength(fInputText);
1873 
1874         delete fInput;
1875         fInput = NULL;
1876 
1877         if (fWordBreakItr != NULL) {
1878 #if UCONFIG_NO_BREAK_ITERATION==0
1879             UErrorCode status = U_ZERO_ERROR;
1880             fWordBreakItr->setText(input, status);
1881 #endif
1882         }
1883     }
1884     reset();
1885     fInputUniStrMaybeMutable = FALSE;
1886 
1887     return *this;
1888 }
1889 
1890 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1891     fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1892     return *this;
1893 }*/
1894 
reset(int64_t position,UErrorCode & status)1895 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1896     if (U_FAILURE(status)) {
1897         return *this;
1898     }
1899     reset();       // Reset also resets the region to be the entire string.
1900 
1901     if (position < 0 || position > fActiveLimit) {
1902         status = U_INDEX_OUTOFBOUNDS_ERROR;
1903         return *this;
1904     }
1905     fMatchEnd = position;
1906     return *this;
1907 }
1908 
1909 
1910 //--------------------------------------------------------------------------------
1911 //
1912 //    refresh
1913 //
1914 //--------------------------------------------------------------------------------
refreshInputText(UText * input,UErrorCode & status)1915 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1916     if (U_FAILURE(status)) {
1917         return *this;
1918     }
1919     if (input == NULL) {
1920         status = U_ILLEGAL_ARGUMENT_ERROR;
1921         return *this;
1922     }
1923     if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1924         status = U_ILLEGAL_ARGUMENT_ERROR;
1925         return *this;
1926     }
1927     int64_t  pos = utext_getNativeIndex(fInputText);
1928     //  Shallow read-only clone of the new UText into the existing input UText
1929     fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1930     if (U_FAILURE(status)) {
1931         return *this;
1932     }
1933     utext_setNativeIndex(fInputText, pos);
1934 
1935     if (fAltInputText != NULL) {
1936         pos = utext_getNativeIndex(fAltInputText);
1937         fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1938         if (U_FAILURE(status)) {
1939             return *this;
1940         }
1941         utext_setNativeIndex(fAltInputText, pos);
1942     }
1943     return *this;
1944 }
1945 
1946 
1947 
1948 //--------------------------------------------------------------------------------
1949 //
1950 //    setTrace
1951 //
1952 //--------------------------------------------------------------------------------
setTrace(UBool state)1953 void RegexMatcher::setTrace(UBool state) {
1954     fTraceDebug = state;
1955 }
1956 
1957 
1958 
1959 /**
1960   *  UText, replace entire contents of the destination UText with a substring of the source UText.
1961   *
1962   *     @param src    The source UText
1963   *     @param dest   The destination UText. Must be writable.
1964   *                   May be NULL, in which case a new UText will be allocated.
1965   *     @param start  Start index of source substring.
1966   *     @param limit  Limit index of source substring.
1967   *     @param status An error code.
1968   */
utext_extract_replace(UText * src,UText * dest,int64_t start,int64_t limit,UErrorCode * status)1969 static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
1970     if (U_FAILURE(*status)) {
1971         return dest;
1972     }
1973     if (start == limit) {
1974         if (dest) {
1975             utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
1976             return dest;
1977         } else {
1978             return utext_openUChars(NULL, NULL, 0, status);
1979         }
1980     }
1981     int32_t length = utext_extract(src, start, limit, NULL, 0, status);
1982     if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
1983         return dest;
1984     }
1985     *status = U_ZERO_ERROR;
1986     MaybeStackArray<UChar, 40> buffer;
1987     if (length >= buffer.getCapacity()) {
1988         UChar *newBuf = buffer.resize(length+1);   // Leave space for terminating Nul.
1989         if (newBuf == NULL) {
1990             *status = U_MEMORY_ALLOCATION_ERROR;
1991         }
1992     }
1993     utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
1994     if (dest) {
1995         utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
1996         return dest;
1997     }
1998 
1999     // Caller did not provide a prexisting UText.
2000     // Open a new one, and have it adopt the text buffer storage.
2001     if (U_FAILURE(*status)) {
2002         return NULL;
2003     }
2004     int32_t ownedLength = 0;
2005     UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
2006     if (ownedBuf == NULL) {
2007         *status = U_MEMORY_ALLOCATION_ERROR;
2008         return NULL;
2009     }
2010     UText *result = utext_openUChars(NULL, ownedBuf, length, status);
2011     if (U_FAILURE(*status)) {
2012         uprv_free(ownedBuf);
2013         return NULL;
2014     }
2015     result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
2016     return result;
2017 }
2018 
2019 
2020 //---------------------------------------------------------------------
2021 //
2022 //   split
2023 //
2024 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status)2025 int32_t  RegexMatcher::split(const UnicodeString &input,
2026         UnicodeString    dest[],
2027         int32_t          destCapacity,
2028         UErrorCode      &status)
2029 {
2030     UText inputText = UTEXT_INITIALIZER;
2031     utext_openConstUnicodeString(&inputText, &input, &status);
2032     if (U_FAILURE(status)) {
2033         return 0;
2034     }
2035 
2036     UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2037     if (destText == NULL) {
2038         status = U_MEMORY_ALLOCATION_ERROR;
2039         return 0;
2040     }
2041     int32_t i;
2042     for (i = 0; i < destCapacity; i++) {
2043         destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2044     }
2045 
2046     int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2047 
2048     for (i = 0; i < destCapacity; i++) {
2049         utext_close(destText[i]);
2050     }
2051 
2052     uprv_free(destText);
2053     utext_close(&inputText);
2054     return fieldCount;
2055 }
2056 
2057 //
2058 //   split, UText mode
2059 //
split(UText * input,UText * dest[],int32_t destCapacity,UErrorCode & status)2060 int32_t  RegexMatcher::split(UText *input,
2061         UText           *dest[],
2062         int32_t          destCapacity,
2063         UErrorCode      &status)
2064 {
2065     //
2066     // Check arguements for validity
2067     //
2068     if (U_FAILURE(status)) {
2069         return 0;
2070     };
2071 
2072     if (destCapacity < 1) {
2073         status = U_ILLEGAL_ARGUMENT_ERROR;
2074         return 0;
2075     }
2076 
2077     //
2078     // Reset for the input text
2079     //
2080     reset(input);
2081     int64_t   nextOutputStringStart = 0;
2082     if (fActiveLimit == 0) {
2083         return 0;
2084     }
2085 
2086     //
2087     // Loop through the input text, searching for the delimiter pattern
2088     //
2089     int32_t i;
2090     int32_t numCaptureGroups = fPattern->fGroupMap->size();
2091     for (i=0; ; i++) {
2092         if (i>=destCapacity-1) {
2093             // There is one or zero output string left.
2094             // Fill the last output string with whatever is left from the input, then exit the loop.
2095             //  ( i will be == destCapacity if we filled the output array while processing
2096             //    capture groups of the delimiter expression, in which case we will discard the
2097             //    last capture group saved in favor of the unprocessed remainder of the
2098             //    input string.)
2099             i = destCapacity-1;
2100             if (fActiveLimit > nextOutputStringStart) {
2101                 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2102                     if (dest[i]) {
2103                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2104                                       input->chunkContents+nextOutputStringStart,
2105                                       (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2106                     } else {
2107                         UText remainingText = UTEXT_INITIALIZER;
2108                         utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2109                                          fActiveLimit-nextOutputStringStart, &status);
2110                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2111                         utext_close(&remainingText);
2112                     }
2113                 } else {
2114                     UErrorCode lengthStatus = U_ZERO_ERROR;
2115                     int32_t remaining16Length =
2116                         utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2117                     UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2118                     if (remainingChars == NULL) {
2119                         status = U_MEMORY_ALLOCATION_ERROR;
2120                         break;
2121                     }
2122 
2123                     utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2124                     if (dest[i]) {
2125                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2126                     } else {
2127                         UText remainingText = UTEXT_INITIALIZER;
2128                         utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2129                         dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2130                         utext_close(&remainingText);
2131                     }
2132 
2133                     uprv_free(remainingChars);
2134                 }
2135             }
2136             break;
2137         }
2138         if (find()) {
2139             // We found another delimiter.  Move everything from where we started looking
2140             //  up until the start of the delimiter into the next output string.
2141             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2142                 if (dest[i]) {
2143                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2144                                   input->chunkContents+nextOutputStringStart,
2145                                   (int32_t)(fMatchStart-nextOutputStringStart), &status);
2146                 } else {
2147                     UText remainingText = UTEXT_INITIALIZER;
2148                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2149                                       fMatchStart-nextOutputStringStart, &status);
2150                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2151                     utext_close(&remainingText);
2152                 }
2153             } else {
2154                 UErrorCode lengthStatus = U_ZERO_ERROR;
2155                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2156                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2157                 if (remainingChars == NULL) {
2158                     status = U_MEMORY_ALLOCATION_ERROR;
2159                     break;
2160                 }
2161                 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2162                 if (dest[i]) {
2163                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2164                 } else {
2165                     UText remainingText = UTEXT_INITIALIZER;
2166                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2167                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2168                     utext_close(&remainingText);
2169                 }
2170 
2171                 uprv_free(remainingChars);
2172             }
2173             nextOutputStringStart = fMatchEnd;
2174 
2175             // If the delimiter pattern has capturing parentheses, the captured
2176             //  text goes out into the next n destination strings.
2177             int32_t groupNum;
2178             for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2179                 if (i >= destCapacity-2) {
2180                     // Never fill the last available output string with capture group text.
2181                     // It will filled with the last field, the remainder of the
2182                     //  unsplit input text.
2183                     break;
2184                 }
2185                 i++;
2186                 dest[i] = utext_extract_replace(fInputText, dest[i],
2187                                                start64(groupNum, status), end64(groupNum, status), &status);
2188             }
2189 
2190             if (nextOutputStringStart == fActiveLimit) {
2191                 // The delimiter was at the end of the string.  We're done, but first
2192                 // we output one last empty string, for the empty field following
2193                 //   the delimiter at the end of input.
2194                 if (i+1 < destCapacity) {
2195                     ++i;
2196                     if (dest[i] == NULL) {
2197                         dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2198                     } else {
2199                         static UChar emptyString[] = {(UChar)0};
2200                         utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2201                     }
2202                 }
2203                 break;
2204 
2205             }
2206         }
2207         else
2208         {
2209             // We ran off the end of the input while looking for the next delimiter.
2210             // All the remaining text goes into the current output string.
2211             if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2212                 if (dest[i]) {
2213                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2214                                   input->chunkContents+nextOutputStringStart,
2215                                   (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2216                 } else {
2217                     UText remainingText = UTEXT_INITIALIZER;
2218                     utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2219                                      fActiveLimit-nextOutputStringStart, &status);
2220                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2221                     utext_close(&remainingText);
2222                 }
2223             } else {
2224                 UErrorCode lengthStatus = U_ZERO_ERROR;
2225                 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2226                 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2227                 if (remainingChars == NULL) {
2228                     status = U_MEMORY_ALLOCATION_ERROR;
2229                     break;
2230                 }
2231 
2232                 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2233                 if (dest[i]) {
2234                     utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2235                 } else {
2236                     UText remainingText = UTEXT_INITIALIZER;
2237                     utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2238                     dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2239                     utext_close(&remainingText);
2240                 }
2241 
2242                 uprv_free(remainingChars);
2243             }
2244             break;
2245         }
2246         if (U_FAILURE(status)) {
2247             break;
2248         }
2249     }   // end of for loop
2250     return i+1;
2251 }
2252 
2253 
2254 //--------------------------------------------------------------------------------
2255 //
2256 //     start
2257 //
2258 //--------------------------------------------------------------------------------
start(UErrorCode & status) const2259 int32_t RegexMatcher::start(UErrorCode &status) const {
2260     return start(0, status);
2261 }
2262 
start64(UErrorCode & status) const2263 int64_t RegexMatcher::start64(UErrorCode &status) const {
2264     return start64(0, status);
2265 }
2266 
2267 //--------------------------------------------------------------------------------
2268 //
2269 //     start(int32_t group, UErrorCode &status)
2270 //
2271 //--------------------------------------------------------------------------------
2272 
start64(int32_t group,UErrorCode & status) const2273 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2274     if (U_FAILURE(status)) {
2275         return -1;
2276     }
2277     if (U_FAILURE(fDeferredStatus)) {
2278         status = fDeferredStatus;
2279         return -1;
2280     }
2281     if (fMatch == FALSE) {
2282         status = U_REGEX_INVALID_STATE;
2283         return -1;
2284     }
2285     if (group < 0 || group > fPattern->fGroupMap->size()) {
2286         status = U_INDEX_OUTOFBOUNDS_ERROR;
2287         return -1;
2288     }
2289     int64_t s;
2290     if (group == 0) {
2291         s = fMatchStart;
2292     } else {
2293         int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2294         U_ASSERT(groupOffset < fPattern->fFrameSize);
2295         U_ASSERT(groupOffset >= 0);
2296         s = fFrame->fExtra[groupOffset];
2297     }
2298 
2299     return s;
2300 }
2301 
2302 
start(int32_t group,UErrorCode & status) const2303 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2304     return (int32_t)start64(group, status);
2305 }
2306 
2307 //--------------------------------------------------------------------------------
2308 //
2309 //     useAnchoringBounds
2310 //
2311 //--------------------------------------------------------------------------------
useAnchoringBounds(UBool b)2312 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2313     fAnchoringBounds = b;
2314     fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2315     fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2316     return *this;
2317 }
2318 
2319 
2320 //--------------------------------------------------------------------------------
2321 //
2322 //     useTransparentBounds
2323 //
2324 //--------------------------------------------------------------------------------
useTransparentBounds(UBool b)2325 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2326     fTransparentBounds = b;
2327     fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2328     fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2329     return *this;
2330 }
2331 
2332 //--------------------------------------------------------------------------------
2333 //
2334 //     setTimeLimit
2335 //
2336 //--------------------------------------------------------------------------------
setTimeLimit(int32_t limit,UErrorCode & status)2337 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2338     if (U_FAILURE(status)) {
2339         return;
2340     }
2341     if (U_FAILURE(fDeferredStatus)) {
2342         status = fDeferredStatus;
2343         return;
2344     }
2345     if (limit < 0) {
2346         status = U_ILLEGAL_ARGUMENT_ERROR;
2347         return;
2348     }
2349     fTimeLimit = limit;
2350 }
2351 
2352 
2353 //--------------------------------------------------------------------------------
2354 //
2355 //     getTimeLimit
2356 //
2357 //--------------------------------------------------------------------------------
getTimeLimit() const2358 int32_t RegexMatcher::getTimeLimit() const {
2359     return fTimeLimit;
2360 }
2361 
2362 
2363 //--------------------------------------------------------------------------------
2364 //
2365 //     setStackLimit
2366 //
2367 //--------------------------------------------------------------------------------
setStackLimit(int32_t limit,UErrorCode & status)2368 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2369     if (U_FAILURE(status)) {
2370         return;
2371     }
2372     if (U_FAILURE(fDeferredStatus)) {
2373         status = fDeferredStatus;
2374         return;
2375     }
2376     if (limit < 0) {
2377         status = U_ILLEGAL_ARGUMENT_ERROR;
2378         return;
2379     }
2380 
2381     // Reset the matcher.  This is needed here in case there is a current match
2382     //    whose final stack frame (containing the match results, pointed to by fFrame)
2383     //    would be lost by resizing to a smaller stack size.
2384     reset();
2385 
2386     if (limit == 0) {
2387         // Unlimited stack expansion
2388         fStack->setMaxCapacity(0);
2389     } else {
2390         // Change the units of the limit  from bytes to ints, and bump the size up
2391         //   to be big enough to hold at least one stack frame for the pattern,
2392         //   if it isn't there already.
2393         int32_t adjustedLimit = limit / sizeof(int32_t);
2394         if (adjustedLimit < fPattern->fFrameSize) {
2395             adjustedLimit = fPattern->fFrameSize;
2396         }
2397         fStack->setMaxCapacity(adjustedLimit);
2398     }
2399     fStackLimit = limit;
2400 }
2401 
2402 
2403 //--------------------------------------------------------------------------------
2404 //
2405 //     getStackLimit
2406 //
2407 //--------------------------------------------------------------------------------
getStackLimit() const2408 int32_t RegexMatcher::getStackLimit() const {
2409     return fStackLimit;
2410 }
2411 
2412 
2413 //--------------------------------------------------------------------------------
2414 //
2415 //     setMatchCallback
2416 //
2417 //--------------------------------------------------------------------------------
setMatchCallback(URegexMatchCallback * callback,const void * context,UErrorCode & status)2418 void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
2419                                     const void              *context,
2420                                     UErrorCode              &status) {
2421     if (U_FAILURE(status)) {
2422         return;
2423     }
2424     fCallbackFn = callback;
2425     fCallbackContext = context;
2426 }
2427 
2428 
2429 //--------------------------------------------------------------------------------
2430 //
2431 //     getMatchCallback
2432 //
2433 //--------------------------------------------------------------------------------
getMatchCallback(URegexMatchCallback * & callback,const void * & context,UErrorCode & status)2434 void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
2435                                   const void              *&context,
2436                                   UErrorCode              &status) {
2437     if (U_FAILURE(status)) {
2438        return;
2439     }
2440     callback = fCallbackFn;
2441     context  = fCallbackContext;
2442 }
2443 
2444 
2445 //--------------------------------------------------------------------------------
2446 //
2447 //     setMatchCallback
2448 //
2449 //--------------------------------------------------------------------------------
setFindProgressCallback(URegexFindProgressCallback * callback,const void * context,UErrorCode & status)2450 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
2451                                                 const void                      *context,
2452                                                 UErrorCode                      &status) {
2453     if (U_FAILURE(status)) {
2454         return;
2455     }
2456     fFindProgressCallbackFn = callback;
2457     fFindProgressCallbackContext = context;
2458 }
2459 
2460 
2461 //--------------------------------------------------------------------------------
2462 //
2463 //     getMatchCallback
2464 //
2465 //--------------------------------------------------------------------------------
getFindProgressCallback(URegexFindProgressCallback * & callback,const void * & context,UErrorCode & status)2466 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
2467                                                 const void                    *&context,
2468                                                 UErrorCode                    &status) {
2469     if (U_FAILURE(status)) {
2470        return;
2471     }
2472     callback = fFindProgressCallbackFn;
2473     context  = fFindProgressCallbackContext;
2474 }
2475 
2476 
2477 //================================================================================
2478 //
2479 //    Code following this point in this file is the internal
2480 //    Match Engine Implementation.
2481 //
2482 //================================================================================
2483 
2484 
2485 //--------------------------------------------------------------------------------
2486 //
2487 //   resetStack
2488 //           Discard any previous contents of the state save stack, and initialize a
2489 //           new stack frame to all -1.  The -1s are needed for capture group limits,
2490 //           where they indicate that a group has not yet matched anything.
2491 //--------------------------------------------------------------------------------
resetStack()2492 REStackFrame *RegexMatcher::resetStack() {
2493     // Discard any previous contents of the state save stack, and initialize a
2494     //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
2495     //  where they indicate that a group has not yet matched anything.
2496     fStack->removeAllElements();
2497 
2498     REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2499     if(U_FAILURE(fDeferredStatus)) {
2500         return NULL;
2501     }
2502 
2503     int32_t i;
2504     for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2505         iFrame->fExtra[i] = -1;
2506     }
2507     return iFrame;
2508 }
2509 
2510 
2511 
2512 //--------------------------------------------------------------------------------
2513 //
2514 //   isWordBoundary
2515 //                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
2516 //                     For us,
2517 //                       If the current char is a combining mark,
2518 //                          \b is FALSE.
2519 //                       Else Scan backwards to the first non-combining char.
2520 //                            We are at a boundary if the this char and the original chars are
2521 //                               opposite in membership in \w set
2522 //
2523 //          parameters:   pos   - the current position in the input buffer
2524 //
2525 //              TODO:  double-check edge cases at region boundaries.
2526 //
2527 //--------------------------------------------------------------------------------
isWordBoundary(int64_t pos)2528 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2529     UBool isBoundary = FALSE;
2530     UBool cIsWord    = FALSE;
2531 
2532     if (pos >= fLookLimit) {
2533         fHitEnd = TRUE;
2534     } else {
2535         // Determine whether char c at current position is a member of the word set of chars.
2536         // If we're off the end of the string, behave as though we're not at a word char.
2537         UTEXT_SETNATIVEINDEX(fInputText, pos);
2538         UChar32  c = UTEXT_CURRENT32(fInputText);
2539         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2540             // Current char is a combining one.  Not a boundary.
2541             return FALSE;
2542         }
2543         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2544     }
2545 
2546     // Back up until we come to a non-combining char, determine whether
2547     //  that char is a word char.
2548     UBool prevCIsWord = FALSE;
2549     for (;;) {
2550         if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2551             break;
2552         }
2553         UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2554         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2555               || u_charType(prevChar) == U_FORMAT_CHAR)) {
2556             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2557             break;
2558         }
2559     }
2560     isBoundary = cIsWord ^ prevCIsWord;
2561     return isBoundary;
2562 }
2563 
isChunkWordBoundary(int32_t pos)2564 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2565     UBool isBoundary = FALSE;
2566     UBool cIsWord    = FALSE;
2567 
2568     const UChar *inputBuf = fInputText->chunkContents;
2569 
2570     if (pos >= fLookLimit) {
2571         fHitEnd = TRUE;
2572     } else {
2573         // Determine whether char c at current position is a member of the word set of chars.
2574         // If we're off the end of the string, behave as though we're not at a word char.
2575         UChar32 c;
2576         U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2577         if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2578             // Current char is a combining one.  Not a boundary.
2579             return FALSE;
2580         }
2581         cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2582     }
2583 
2584     // Back up until we come to a non-combining char, determine whether
2585     //  that char is a word char.
2586     UBool prevCIsWord = FALSE;
2587     for (;;) {
2588         if (pos <= fLookStart) {
2589             break;
2590         }
2591         UChar32 prevChar;
2592         U16_PREV(inputBuf, fLookStart, pos, prevChar);
2593         if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2594               || u_charType(prevChar) == U_FORMAT_CHAR)) {
2595             prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2596             break;
2597         }
2598     }
2599     isBoundary = cIsWord ^ prevCIsWord;
2600     return isBoundary;
2601 }
2602 
2603 //--------------------------------------------------------------------------------
2604 //
2605 //   isUWordBoundary
2606 //
2607 //         Test for a word boundary using RBBI word break.
2608 //
2609 //          parameters:   pos   - the current position in the input buffer
2610 //
2611 //--------------------------------------------------------------------------------
isUWordBoundary(int64_t pos)2612 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2613     UBool       returnVal = FALSE;
2614 #if UCONFIG_NO_BREAK_ITERATION==0
2615 
2616     // If we haven't yet created a break iterator for this matcher, do it now.
2617     if (fWordBreakItr == NULL) {
2618         fWordBreakItr =
2619             (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2620         if (U_FAILURE(fDeferredStatus)) {
2621             return FALSE;
2622         }
2623         fWordBreakItr->setText(fInputText, fDeferredStatus);
2624     }
2625 
2626     if (pos >= fLookLimit) {
2627         fHitEnd = TRUE;
2628         returnVal = TRUE;   // With Unicode word rules, only positions within the interior of "real"
2629                             //    words are not boundaries.  All non-word chars stand by themselves,
2630                             //    with word boundaries on both sides.
2631     } else {
2632         if (!UTEXT_USES_U16(fInputText)) {
2633             // !!!: Would like a better way to do this!
2634             UErrorCode status = U_ZERO_ERROR;
2635             pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2636         }
2637         returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2638     }
2639 #endif
2640     return   returnVal;
2641 }
2642 
2643 //--------------------------------------------------------------------------------
2644 //
2645 //   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
2646 //                     saves. Increment the "time" counter, and call the
2647 //                     user callback function if there is one installed.
2648 //
2649 //                     If the match operation needs to be aborted, either for a time-out
2650 //                     or because the user callback asked for it, just set an error status.
2651 //                     The engine will pick that up and stop in its outer loop.
2652 //
2653 //--------------------------------------------------------------------------------
IncrementTime(UErrorCode & status)2654 void RegexMatcher::IncrementTime(UErrorCode &status) {
2655     fTickCounter = TIMER_INITIAL_VALUE;
2656     fTime++;
2657     if (fCallbackFn != NULL) {
2658         if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2659             status = U_REGEX_STOPPED_BY_CALLER;
2660             return;
2661         }
2662     }
2663     if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2664         status = U_REGEX_TIME_OUT;
2665     }
2666 }
2667 
2668 //--------------------------------------------------------------------------------
2669 //
2670 //   StateSave
2671 //       Make a new stack frame, initialized as a copy of the current stack frame.
2672 //       Set the pattern index in the original stack frame from the operand value
2673 //       in the opcode.  Execution of the engine continues with the state in
2674 //       the newly created stack frame
2675 //
2676 //       Note that reserveBlock() may grow the stack, resulting in the
2677 //       whole thing being relocated in memory.
2678 //
2679 //    Parameters:
2680 //       fp           The top frame pointer when called.  At return, a new
2681 //                    fame will be present
2682 //       savePatIdx   An index into the compiled pattern.  Goes into the original
2683 //                    (not new) frame.  If execution ever back-tracks out of the
2684 //                    new frame, this will be where we continue from in the pattern.
2685 //    Return
2686 //                    The new frame pointer.
2687 //
2688 //--------------------------------------------------------------------------------
StateSave(REStackFrame * fp,int64_t savePatIdx,UErrorCode & status)2689 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2690     if (U_FAILURE(status)) {
2691         return fp;
2692     }
2693     // push storage for a new frame.
2694     int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2695     if (U_FAILURE(status)) {
2696         // Failure on attempted stack expansion.
2697         //   Stack function set some other error code, change it to a more
2698         //   specific one for regular expressions.
2699         status = U_REGEX_STACK_OVERFLOW;
2700         // We need to return a writable stack frame, so just return the
2701         //    previous frame.  The match operation will stop quickly
2702         //    because of the error status, after which the frame will never
2703         //    be looked at again.
2704         return fp;
2705     }
2706     fp = (REStackFrame *)(newFP - fFrameSize);  // in case of realloc of stack.
2707 
2708     // New stack frame = copy of old top frame.
2709     int64_t *source = (int64_t *)fp;
2710     int64_t *dest   = newFP;
2711     for (;;) {
2712         *dest++ = *source++;
2713         if (source == newFP) {
2714             break;
2715         }
2716     }
2717 
2718     fTickCounter--;
2719     if (fTickCounter <= 0) {
2720        IncrementTime(status);    // Re-initializes fTickCounter
2721     }
2722     fp->fPatIdx = savePatIdx;
2723     return (REStackFrame *)newFP;
2724 }
2725 
2726 
2727 //--------------------------------------------------------------------------------
2728 //
2729 //   MatchAt      This is the actual matching engine.
2730 //
2731 //                  startIdx:    begin matching a this index.
2732 //                  toEnd:       if true, match must extend to end of the input region
2733 //
2734 //--------------------------------------------------------------------------------
MatchAt(int64_t startIdx,UBool toEnd,UErrorCode & status)2735 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2736     UBool       isMatch  = FALSE;      // True if the we have a match.
2737 
2738     int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2739 
2740     int32_t     op;                    // Operation from the compiled pattern, split into
2741     int32_t     opType;                //    the opcode
2742     int32_t     opValue;               //    and the operand value.
2743 
2744 #ifdef REGEX_RUN_DEBUG
2745     if (fTraceDebug)
2746     {
2747         printf("MatchAt(startIdx=%ld)\n", startIdx);
2748         printf("Original Pattern: ");
2749         UChar32 c = utext_next32From(fPattern->fPattern, 0);
2750         while (c != U_SENTINEL) {
2751             if (c<32 || c>256) {
2752                 c = '.';
2753             }
2754             printf("%c", c);
2755 
2756             c = UTEXT_NEXT32(fPattern->fPattern);
2757         }
2758         printf("\n");
2759         printf("Input String: ");
2760         c = utext_next32From(fInputText, 0);
2761         while (c != U_SENTINEL) {
2762             if (c<32 || c>256) {
2763                 c = '.';
2764             }
2765             printf("%c", c);
2766 
2767             c = UTEXT_NEXT32(fInputText);
2768         }
2769         printf("\n");
2770         printf("\n");
2771     }
2772 #endif
2773 
2774     if (U_FAILURE(status)) {
2775         return;
2776     }
2777 
2778     //  Cache frequently referenced items from the compiled pattern
2779     //
2780     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
2781 
2782     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
2783     UVector             *sets          = fPattern->fSets;
2784 
2785     fFrameSize = fPattern->fFrameSize;
2786     REStackFrame        *fp            = resetStack();
2787     if (U_FAILURE(fDeferredStatus)) {
2788         status = fDeferredStatus;
2789         return;
2790     }
2791 
2792     fp->fPatIdx   = 0;
2793     fp->fInputIdx = startIdx;
2794 
2795     // Zero out the pattern's static data
2796     int32_t i;
2797     for (i = 0; i<fPattern->fDataSize; i++) {
2798         fData[i] = 0;
2799     }
2800 
2801     //
2802     //  Main loop for interpreting the compiled pattern.
2803     //  One iteration of the loop per pattern operation performed.
2804     //
2805     for (;;) {
2806         op      = (int32_t)pat[fp->fPatIdx];
2807         opType  = URX_TYPE(op);
2808         opValue = URX_VAL(op);
2809 #ifdef REGEX_RUN_DEBUG
2810         if (fTraceDebug) {
2811             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2812             printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
2813                 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2814             fPattern->dumpOp(fp->fPatIdx);
2815         }
2816 #endif
2817         fp->fPatIdx++;
2818 
2819         switch (opType) {
2820 
2821 
2822         case URX_NOP:
2823             break;
2824 
2825 
2826         case URX_BACKTRACK:
2827             // Force a backtrack.  In some circumstances, the pattern compiler
2828             //   will notice that the pattern can't possibly match anything, and will
2829             //   emit one of these at that point.
2830             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2831             break;
2832 
2833 
2834         case URX_ONECHAR:
2835             if (fp->fInputIdx < fActiveLimit) {
2836                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2837                 UChar32 c = UTEXT_NEXT32(fInputText);
2838                 if (c == opValue) {
2839                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2840                     break;
2841                 }
2842             } else {
2843                 fHitEnd = TRUE;
2844             }
2845             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2846             break;
2847 
2848 
2849         case URX_STRING:
2850             {
2851                 // Test input against a literal string.
2852                 // Strings require two slots in the compiled pattern, one for the
2853                 //   offset to the string text, and one for the length.
2854 
2855                 int32_t   stringStartIdx = opValue;
2856                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
2857                 fp->fPatIdx++;
2858                 opType    = URX_TYPE(op);
2859                 int32_t stringLen = URX_VAL(op);
2860                 U_ASSERT(opType == URX_STRING_LEN);
2861                 U_ASSERT(stringLen >= 2);
2862 
2863                 const UChar *patternString = litText+stringStartIdx;
2864                 int32_t patternStringIndex = 0;
2865                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2866                 UChar32 inputChar;
2867                 UChar32 patternChar;
2868                 UBool success = TRUE;
2869                 while (patternStringIndex < stringLen) {
2870                     if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2871                         success = FALSE;
2872                         fHitEnd = TRUE;
2873                         break;
2874                     }
2875                     inputChar = UTEXT_NEXT32(fInputText);
2876                     U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2877                     if (patternChar != inputChar) {
2878                         success = FALSE;
2879                         break;
2880                     }
2881                 }
2882 
2883                 if (success) {
2884                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2885                 } else {
2886                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2887                 }
2888             }
2889             break;
2890 
2891 
2892         case URX_STATE_SAVE:
2893             fp = StateSave(fp, opValue, status);
2894             break;
2895 
2896 
2897         case URX_END:
2898             // The match loop will exit via this path on a successful match,
2899             //   when we reach the end of the pattern.
2900             if (toEnd && fp->fInputIdx != fActiveLimit) {
2901                 // The pattern matched, but not to the end of input.  Try some more.
2902                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2903                 break;
2904             }
2905             isMatch = TRUE;
2906             goto  breakFromLoop;
2907 
2908         // Start and End Capture stack frame variables are laid out out like this:
2909             //  fp->fExtra[opValue]  - The start of a completed capture group
2910             //             opValue+1 - The end   of a completed capture group
2911             //             opValue+2 - the start of a capture group whose end
2912             //                          has not yet been reached (and might not ever be).
2913         case URX_START_CAPTURE:
2914             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2915             fp->fExtra[opValue+2] = fp->fInputIdx;
2916             break;
2917 
2918 
2919         case URX_END_CAPTURE:
2920             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2921             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
2922             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
2923             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
2924             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2925             break;
2926 
2927 
2928         case URX_DOLLAR:                   //  $, test for End of line
2929                                            //     or for position before new line at end of input
2930             {
2931                 if (fp->fInputIdx >= fAnchorLimit) {
2932                     // We really are at the end of input.  Success.
2933                     fHitEnd = TRUE;
2934                     fRequireEnd = TRUE;
2935                     break;
2936                 }
2937 
2938                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2939 
2940                 // If we are positioned just before a new-line that is located at the
2941                 //   end of input, succeed.
2942                 UChar32 c = UTEXT_NEXT32(fInputText);
2943                 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2944                     if (isLineTerminator(c)) {
2945                         // If not in the middle of a CR/LF sequence
2946                         if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2947                             // At new-line at end of input. Success
2948                             fHitEnd = TRUE;
2949                             fRequireEnd = TRUE;
2950 
2951                             break;
2952                         }
2953                     }
2954                 } else {
2955                     UChar32 nextC = UTEXT_NEXT32(fInputText);
2956                     if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2957                         fHitEnd = TRUE;
2958                         fRequireEnd = TRUE;
2959                         break;                         // At CR/LF at end of input.  Success
2960                     }
2961                 }
2962 
2963                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2964             }
2965             break;
2966 
2967 
2968          case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
2969             if (fp->fInputIdx >= fAnchorLimit) {
2970                 // Off the end of input.  Success.
2971                 fHitEnd = TRUE;
2972                 fRequireEnd = TRUE;
2973                 break;
2974             } else {
2975                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2976                 UChar32 c = UTEXT_NEXT32(fInputText);
2977                 // Either at the last character of input, or off the end.
2978                 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
2979                     fHitEnd = TRUE;
2980                     fRequireEnd = TRUE;
2981                     break;
2982                 }
2983             }
2984 
2985             // Not at end of input.  Back-track out.
2986             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2987             break;
2988 
2989 
2990          case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
2991              {
2992                  if (fp->fInputIdx >= fAnchorLimit) {
2993                      // We really are at the end of input.  Success.
2994                      fHitEnd = TRUE;
2995                      fRequireEnd = TRUE;
2996                      break;
2997                  }
2998                  // If we are positioned just before a new-line, succeed.
2999                  // It makes no difference where the new-line is within the input.
3000                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3001                  UChar32 c = UTEXT_CURRENT32(fInputText);
3002                  if (isLineTerminator(c)) {
3003                      // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
3004                      //  In multi-line mode, hitting a new-line just before the end of input does not
3005                      //   set the hitEnd or requireEnd flags
3006                      if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3007                         break;
3008                      }
3009                  }
3010                  // not at a new line.  Fail.
3011                  fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3012              }
3013              break;
3014 
3015 
3016          case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
3017              {
3018                  if (fp->fInputIdx >= fAnchorLimit) {
3019                      // We really are at the end of input.  Success.
3020                      fHitEnd = TRUE;
3021                      fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
3022                      break;               //   adding a new-line would not lose the match.
3023                  }
3024                  // If we are not positioned just before a new-line, the test fails; backtrack out.
3025                  // It makes no difference where the new-line is within the input.
3026                  UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3027                  if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3028                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3029                  }
3030              }
3031              break;
3032 
3033 
3034        case URX_CARET:                    //  ^, test for start of line
3035             if (fp->fInputIdx != fAnchorStart) {
3036                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3037             }
3038             break;
3039 
3040 
3041        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
3042            {
3043                if (fp->fInputIdx == fAnchorStart) {
3044                    // We are at the start input.  Success.
3045                    break;
3046                }
3047                // Check whether character just before the current pos is a new-line
3048                //   unless we are at the end of input
3049                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3050                UChar32  c = UTEXT_PREVIOUS32(fInputText);
3051                if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
3052                    //  It's a new-line.  ^ is true.  Success.
3053                    //  TODO:  what should be done with positions between a CR and LF?
3054                    break;
3055                }
3056                // Not at the start of a line.  Fail.
3057                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3058            }
3059            break;
3060 
3061 
3062        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
3063            {
3064                U_ASSERT(fp->fInputIdx >= fAnchorStart);
3065                if (fp->fInputIdx <= fAnchorStart) {
3066                    // We are at the start input.  Success.
3067                    break;
3068                }
3069                // Check whether character just before the current pos is a new-line
3070                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3071                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3072                UChar32  c = UTEXT_PREVIOUS32(fInputText);
3073                if (c != 0x0a) {
3074                    // Not at the start of a line.  Back-track out.
3075                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3076                }
3077            }
3078            break;
3079 
3080         case URX_BACKSLASH_B:          // Test for word boundaries
3081             {
3082                 UBool success = isWordBoundary(fp->fInputIdx);
3083                 success ^= (UBool)(opValue != 0);     // flip sense for \B
3084                 if (!success) {
3085                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3086                 }
3087             }
3088             break;
3089 
3090 
3091         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
3092             {
3093                 UBool success = isUWordBoundary(fp->fInputIdx);
3094                 success ^= (UBool)(opValue != 0);     // flip sense for \B
3095                 if (!success) {
3096                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3097                 }
3098             }
3099             break;
3100 
3101 
3102         case URX_BACKSLASH_D:            // Test for decimal digit
3103             {
3104                 if (fp->fInputIdx >= fActiveLimit) {
3105                     fHitEnd = TRUE;
3106                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3107                     break;
3108                 }
3109 
3110                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3111 
3112                 UChar32 c = UTEXT_NEXT32(fInputText);
3113                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
3114                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3115                 success ^= (UBool)(opValue != 0);        // flip sense for \D
3116                 if (success) {
3117                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3118                 } else {
3119                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3120                 }
3121             }
3122             break;
3123 
3124 
3125         case URX_BACKSLASH_G:          // Test for position at end of previous match
3126             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3127                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3128             }
3129             break;
3130 
3131 
3132         case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
3133             {
3134                 if (fp->fInputIdx >= fActiveLimit) {
3135                     fHitEnd = TRUE;
3136                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3137                     break;
3138                 }
3139                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3140                 UChar32 c = UTEXT_NEXT32(fInputText);
3141                 int8_t ctype = u_charType(c);
3142                 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
3143                 success ^= (UBool)(opValue != 0);        // flip sense for \H
3144                 if (success) {
3145                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3146                 } else {
3147                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3148                 }
3149             }
3150             break;
3151 
3152 
3153         case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
3154             {
3155                 if (fp->fInputIdx >= fActiveLimit) {
3156                     fHitEnd = TRUE;
3157                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3158                     break;
3159                 }
3160                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3161                 UChar32 c = UTEXT_NEXT32(fInputText);
3162                 if (isLineTerminator(c)) {
3163                     if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
3164                         utext_next32(fInputText);
3165                     }
3166                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3167                 } else {
3168                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3169                 }
3170             }
3171             break;
3172 
3173 
3174         case URX_BACKSLASH_V:            // \v, any single line ending character.
3175             {
3176                 if (fp->fInputIdx >= fActiveLimit) {
3177                     fHitEnd = TRUE;
3178                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3179                     break;
3180                 }
3181                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3182                 UChar32 c = UTEXT_NEXT32(fInputText);
3183                 UBool success = isLineTerminator(c);
3184                 success ^= (UBool)(opValue != 0);        // flip sense for \V
3185                 if (success) {
3186                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3187                 } else {
3188                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3189                 }
3190             }
3191             break;
3192 
3193 
3194         case URX_BACKSLASH_X:
3195             //  Match a Grapheme, as defined by Unicode TR 29.
3196             //  Differs slightly from Perl, which consumes combining marks independently
3197             //    of context.
3198             {
3199 
3200                 // Fail if at end of input
3201                 if (fp->fInputIdx >= fActiveLimit) {
3202                     fHitEnd = TRUE;
3203                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3204                     break;
3205                 }
3206 
3207                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3208 
3209                 // Examine (and consume) the current char.
3210                 //   Dispatch into a little state machine, based on the char.
3211                 UChar32  c;
3212                 c = UTEXT_NEXT32(fInputText);
3213                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3214                 UnicodeSet **sets = fPattern->fStaticSets;
3215                 if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
3216                 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3217                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
3218                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3219                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3220                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
3221                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
3222                 goto GC_Extend;
3223 
3224 
3225 
3226 GC_L:
3227                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3228                 c = UTEXT_NEXT32(fInputText);
3229                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3230                 if (sets[URX_GC_L]->contains(c))       goto GC_L;
3231                 if (sets[URX_GC_LV]->contains(c))      goto GC_V;
3232                 if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
3233                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
3234                 (void)UTEXT_PREVIOUS32(fInputText);
3235                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3236                 goto GC_Extend;
3237 
3238 GC_V:
3239                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3240                 c = UTEXT_NEXT32(fInputText);
3241                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3242                 if (sets[URX_GC_V]->contains(c))       goto GC_V;
3243                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
3244                 (void)UTEXT_PREVIOUS32(fInputText);
3245                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3246                 goto GC_Extend;
3247 
3248 GC_T:
3249                 if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
3250                 c = UTEXT_NEXT32(fInputText);
3251                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3252                 if (sets[URX_GC_T]->contains(c))       goto GC_T;
3253                 (void)UTEXT_PREVIOUS32(fInputText);
3254                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3255                 goto GC_Extend;
3256 
3257 GC_Extend:
3258                 // Combining characters are consumed here
3259                 for (;;) {
3260                     if (fp->fInputIdx >= fActiveLimit) {
3261                         break;
3262                     }
3263                     c = UTEXT_CURRENT32(fInputText);
3264                     if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3265                         break;
3266                     }
3267                     (void)UTEXT_NEXT32(fInputText);
3268                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3269                 }
3270                 goto GC_Done;
3271 
3272 GC_Control:
3273                 // Most control chars stand alone (don't combine with combining chars),
3274                 //   except for that CR/LF sequence is a single grapheme cluster.
3275                 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3276                     c = UTEXT_NEXT32(fInputText);
3277                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3278                 }
3279 
3280 GC_Done:
3281                 if (fp->fInputIdx >= fActiveLimit) {
3282                     fHitEnd = TRUE;
3283                 }
3284                 break;
3285             }
3286 
3287 
3288 
3289 
3290         case URX_BACKSLASH_Z:          // Test for end of Input
3291             if (fp->fInputIdx < fAnchorLimit) {
3292                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3293             } else {
3294                 fHitEnd = TRUE;
3295                 fRequireEnd = TRUE;
3296             }
3297             break;
3298 
3299 
3300 
3301         case URX_STATIC_SETREF:
3302             {
3303                 // Test input character against one of the predefined sets
3304                 //    (Word Characters, for example)
3305                 // The high bit of the op value is a flag for the match polarity.
3306                 //    0:   success if input char is in set.
3307                 //    1:   success if input char is not in set.
3308                 if (fp->fInputIdx >= fActiveLimit) {
3309                     fHitEnd = TRUE;
3310                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3311                     break;
3312                 }
3313 
3314                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3315                 opValue &= ~URX_NEG_SET;
3316                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3317 
3318                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3319                 UChar32 c = UTEXT_NEXT32(fInputText);
3320                 if (c < 256) {
3321                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3322                     if (s8->contains(c)) {
3323                         success = !success;
3324                     }
3325                 } else {
3326                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
3327                     if (s->contains(c)) {
3328                         success = !success;
3329                     }
3330                 }
3331                 if (success) {
3332                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3333                 } else {
3334                     // the character wasn't in the set.
3335                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3336                 }
3337             }
3338             break;
3339 
3340 
3341         case URX_STAT_SETREF_N:
3342             {
3343                 // Test input character for NOT being a member of  one of
3344                 //    the predefined sets (Word Characters, for example)
3345                 if (fp->fInputIdx >= fActiveLimit) {
3346                     fHitEnd = TRUE;
3347                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3348                     break;
3349                 }
3350 
3351                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3352 
3353                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3354 
3355                 UChar32 c = UTEXT_NEXT32(fInputText);
3356                 if (c < 256) {
3357                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3358                     if (s8->contains(c) == FALSE) {
3359                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3360                         break;
3361                     }
3362                 } else {
3363                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
3364                     if (s->contains(c) == FALSE) {
3365                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3366                         break;
3367                     }
3368                 }
3369                 // the character wasn't in the set.
3370                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3371             }
3372             break;
3373 
3374 
3375         case URX_SETREF:
3376             if (fp->fInputIdx >= fActiveLimit) {
3377                 fHitEnd = TRUE;
3378                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3379                 break;
3380             } else {
3381                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3382 
3383                 // There is input left.  Pick up one char and test it for set membership.
3384                 UChar32 c = UTEXT_NEXT32(fInputText);
3385                 U_ASSERT(opValue > 0 && opValue < sets->size());
3386                 if (c<256) {
3387                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3388                     if (s8->contains(c)) {
3389                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3390                         break;
3391                     }
3392                 } else {
3393                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3394                     if (s->contains(c)) {
3395                         // The character is in the set.  A Match.
3396                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3397                         break;
3398                     }
3399                 }
3400 
3401                 // the character wasn't in the set.
3402                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3403             }
3404             break;
3405 
3406 
3407         case URX_DOTANY:
3408             {
3409                 // . matches anything, but stops at end-of-line.
3410                 if (fp->fInputIdx >= fActiveLimit) {
3411                     // At end of input.  Match failed.  Backtrack out.
3412                     fHitEnd = TRUE;
3413                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3414                     break;
3415                 }
3416 
3417                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3418 
3419                 // There is input left.  Advance over one char, unless we've hit end-of-line
3420                 UChar32 c = UTEXT_NEXT32(fInputText);
3421                 if (isLineTerminator(c)) {
3422                     // End of line in normal mode.   . does not match.
3423                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3424                     break;
3425                 }
3426                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3427             }
3428             break;
3429 
3430 
3431         case URX_DOTANY_ALL:
3432             {
3433                 // ., in dot-matches-all (including new lines) mode
3434                 if (fp->fInputIdx >= fActiveLimit) {
3435                     // At end of input.  Match failed.  Backtrack out.
3436                     fHitEnd = TRUE;
3437                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3438                     break;
3439                 }
3440 
3441                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3442 
3443                 // There is input left.  Advance over one char, except if we are
3444                 //   at a cr/lf, advance over both of them.
3445                 UChar32 c;
3446                 c = UTEXT_NEXT32(fInputText);
3447                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3448                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3449                     // In the case of a CR/LF, we need to advance over both.
3450                     UChar32 nextc = UTEXT_CURRENT32(fInputText);
3451                     if (nextc == 0x0a) {
3452                         (void)UTEXT_NEXT32(fInputText);
3453                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3454                     }
3455                 }
3456             }
3457             break;
3458 
3459 
3460         case URX_DOTANY_UNIX:
3461             {
3462                 // '.' operator, matches all, but stops at end-of-line.
3463                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
3464                 if (fp->fInputIdx >= fActiveLimit) {
3465                     // At end of input.  Match failed.  Backtrack out.
3466                     fHitEnd = TRUE;
3467                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3468                     break;
3469                 }
3470 
3471                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3472 
3473                 // There is input left.  Advance over one char, unless we've hit end-of-line
3474                 UChar32 c = UTEXT_NEXT32(fInputText);
3475                 if (c == 0x0a) {
3476                     // End of line in normal mode.   '.' does not match the \n
3477                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3478                 } else {
3479                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3480                 }
3481             }
3482             break;
3483 
3484 
3485         case URX_JMP:
3486             fp->fPatIdx = opValue;
3487             break;
3488 
3489         case URX_FAIL:
3490             isMatch = FALSE;
3491             goto breakFromLoop;
3492 
3493         case URX_JMP_SAV:
3494             U_ASSERT(opValue < fPattern->fCompiledPat->size());
3495             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
3496             fp->fPatIdx = opValue;                         // Then JMP.
3497             break;
3498 
3499         case URX_JMP_SAV_X:
3500             // This opcode is used with (x)+, when x can match a zero length string.
3501             // Same as JMP_SAV, except conditional on the match having made forward progress.
3502             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3503             //   data address of the input position at the start of the loop.
3504             {
3505                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3506                 int32_t  stoOp = (int32_t)pat[opValue-1];
3507                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3508                 int32_t  frameLoc = URX_VAL(stoOp);
3509                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3510                 int64_t prevInputIdx = fp->fExtra[frameLoc];
3511                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3512                 if (prevInputIdx < fp->fInputIdx) {
3513                     // The match did make progress.  Repeat the loop.
3514                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
3515                     fp->fPatIdx = opValue;
3516                     fp->fExtra[frameLoc] = fp->fInputIdx;
3517                 }
3518                 // If the input position did not advance, we do nothing here,
3519                 //   execution will fall out of the loop.
3520             }
3521             break;
3522 
3523         case URX_CTR_INIT:
3524             {
3525                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3526                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
3527 
3528                 // Pick up the three extra operands that CTR_INIT has, and
3529                 //    skip the pattern location counter past
3530                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3531                 fp->fPatIdx += 3;
3532                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3533                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3534                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3535                 U_ASSERT(minCount>=0);
3536                 U_ASSERT(maxCount>=minCount || maxCount==-1);
3537                 U_ASSERT(loopLoc>=fp->fPatIdx);
3538 
3539                 if (minCount == 0) {
3540                     fp = StateSave(fp, loopLoc+1, status);
3541                 }
3542                 if (maxCount == -1) {
3543                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
3544                 } else if (maxCount == 0) {
3545                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3546                 }
3547             }
3548             break;
3549 
3550         case URX_CTR_LOOP:
3551             {
3552                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3553                 int32_t initOp = (int32_t)pat[opValue];
3554                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3555                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3556                 int32_t minCount  = (int32_t)pat[opValue+2];
3557                 int32_t maxCount  = (int32_t)pat[opValue+3];
3558                 (*pCounter)++;
3559                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3560                     U_ASSERT(*pCounter == maxCount);
3561                     break;
3562                 }
3563                 if (*pCounter >= minCount) {
3564                     if (maxCount == -1) {
3565                         // Loop has no hard upper bound.
3566                         // Check that it is progressing through the input, break if it is not.
3567                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
3568                         if (fp->fInputIdx == *pLastInputIdx) {
3569                             break;
3570                         } else {
3571                             *pLastInputIdx = fp->fInputIdx;
3572                         }
3573                     }
3574                     fp = StateSave(fp, fp->fPatIdx, status);
3575                 }
3576                 fp->fPatIdx = opValue + 4;    // Loop back.
3577             }
3578             break;
3579 
3580         case URX_CTR_INIT_NG:
3581             {
3582                 // Initialize a non-greedy loop
3583                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3584                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
3585 
3586                 // Pick up the three extra operands that CTR_INIT_NG has, and
3587                 //    skip the pattern location counter past
3588                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3589                 fp->fPatIdx += 3;
3590                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3591                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3592                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3593                 U_ASSERT(minCount>=0);
3594                 U_ASSERT(maxCount>=minCount || maxCount==-1);
3595                 U_ASSERT(loopLoc>fp->fPatIdx);
3596                 if (maxCount == -1) {
3597                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
3598                 }
3599 
3600                 if (minCount == 0) {
3601                     if (maxCount != 0) {
3602                         fp = StateSave(fp, fp->fPatIdx, status);
3603                     }
3604                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
3605                 }
3606             }
3607             break;
3608 
3609         case URX_CTR_LOOP_NG:
3610             {
3611                 // Non-greedy {min, max} loops
3612                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3613                 int32_t initOp = (int32_t)pat[opValue];
3614                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3615                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3616                 int32_t minCount  = (int32_t)pat[opValue+2];
3617                 int32_t maxCount  = (int32_t)pat[opValue+3];
3618 
3619                 (*pCounter)++;
3620                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3621                     // The loop has matched the maximum permitted number of times.
3622                     //   Break out of here with no action.  Matching will
3623                     //   continue with the following pattern.
3624                     U_ASSERT(*pCounter == maxCount);
3625                     break;
3626                 }
3627 
3628                 if (*pCounter < minCount) {
3629                     // We haven't met the minimum number of matches yet.
3630                     //   Loop back for another one.
3631                     fp->fPatIdx = opValue + 4;    // Loop back.
3632                 } else {
3633                     // We do have the minimum number of matches.
3634 
3635                     // If there is no upper bound on the loop iterations, check that the input index
3636                     // is progressing, and stop the loop if it is not.
3637                     if (maxCount == -1) {
3638                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
3639                         if (fp->fInputIdx == *pLastInputIdx) {
3640                             break;
3641                         }
3642                         *pLastInputIdx = fp->fInputIdx;
3643                     }
3644 
3645                     // Loop Continuation: we will fall into the pattern following the loop
3646                     //   (non-greedy, don't execute loop body first), but first do
3647                     //   a state save to the top of the loop, so that a match failure
3648                     //   in the following pattern will try another iteration of the loop.
3649                     fp = StateSave(fp, opValue + 4, status);
3650                 }
3651             }
3652             break;
3653 
3654         case URX_STO_SP:
3655             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3656             fData[opValue] = fStack->size();
3657             break;
3658 
3659         case URX_LD_SP:
3660             {
3661                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3662                 int32_t newStackSize = (int32_t)fData[opValue];
3663                 U_ASSERT(newStackSize <= fStack->size());
3664                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3665                 if (newFP == (int64_t *)fp) {
3666                     break;
3667                 }
3668                 int32_t i;
3669                 for (i=0; i<fFrameSize; i++) {
3670                     newFP[i] = ((int64_t *)fp)[i];
3671                 }
3672                 fp = (REStackFrame *)newFP;
3673                 fStack->setSize(newStackSize);
3674             }
3675             break;
3676 
3677         case URX_BACKREF:
3678             {
3679                 U_ASSERT(opValue < fFrameSize);
3680                 int64_t groupStartIdx = fp->fExtra[opValue];
3681                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
3682                 U_ASSERT(groupStartIdx <= groupEndIdx);
3683                 if (groupStartIdx < 0) {
3684                     // This capture group has not participated in the match thus far,
3685                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3686                     break;
3687                 }
3688                 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3689                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3690 
3691                 //   Note: if the capture group match was of an empty string the backref
3692                 //         match succeeds.  Verified by testing:  Perl matches succeed
3693                 //         in this case, so we do too.
3694 
3695                 UBool success = TRUE;
3696                 for (;;) {
3697                     if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3698                         success = TRUE;
3699                         break;
3700                     }
3701                     if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3702                         success = FALSE;
3703                         fHitEnd = TRUE;
3704                         break;
3705                     }
3706                     UChar32 captureGroupChar = utext_next32(fAltInputText);
3707                     UChar32 inputChar = utext_next32(fInputText);
3708                     if (inputChar != captureGroupChar) {
3709                         success = FALSE;
3710                         break;
3711                     }
3712                 }
3713 
3714                 if (success) {
3715                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3716                 } else {
3717                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3718                 }
3719             }
3720             break;
3721 
3722 
3723 
3724         case URX_BACKREF_I:
3725             {
3726                 U_ASSERT(opValue < fFrameSize);
3727                 int64_t groupStartIdx = fp->fExtra[opValue];
3728                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
3729                 U_ASSERT(groupStartIdx <= groupEndIdx);
3730                 if (groupStartIdx < 0) {
3731                     // This capture group has not participated in the match thus far,
3732                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3733                     break;
3734                 }
3735                 utext_setNativeIndex(fAltInputText, groupStartIdx);
3736                 utext_setNativeIndex(fInputText, fp->fInputIdx);
3737                 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3738                 CaseFoldingUTextIterator inputItr(*fInputText);
3739 
3740                 //   Note: if the capture group match was of an empty string the backref
3741                 //         match succeeds.  Verified by testing:  Perl matches succeed
3742                 //         in this case, so we do too.
3743 
3744                 UBool success = TRUE;
3745                 for (;;) {
3746                     if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3747                         success = TRUE;
3748                         break;
3749                     }
3750                     if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3751                         success = FALSE;
3752                         fHitEnd = TRUE;
3753                         break;
3754                     }
3755                     UChar32 captureGroupChar = captureGroupItr.next();
3756                     UChar32 inputChar = inputItr.next();
3757                     if (inputChar != captureGroupChar) {
3758                         success = FALSE;
3759                         break;
3760                     }
3761                 }
3762 
3763                 if (success && inputItr.inExpansion()) {
3764                     // We otained a match by consuming part of a string obtained from
3765                     // case-folding a single code point of the input text.
3766                     // This does not count as an overall match.
3767                     success = FALSE;
3768                 }
3769 
3770                 if (success) {
3771                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3772                 } else {
3773                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3774                 }
3775 
3776             }
3777             break;
3778 
3779         case URX_STO_INP_LOC:
3780             {
3781                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3782                 fp->fExtra[opValue] = fp->fInputIdx;
3783             }
3784             break;
3785 
3786         case URX_JMPX:
3787             {
3788                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3789                 fp->fPatIdx += 1;
3790                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
3791                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3792                 int64_t savedInputIdx = fp->fExtra[dataLoc];
3793                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3794                 if (savedInputIdx < fp->fInputIdx) {
3795                     fp->fPatIdx = opValue;                               // JMP
3796                 } else {
3797                      fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
3798                 }
3799             }
3800             break;
3801 
3802         case URX_LA_START:
3803             {
3804                 // Entering a lookahead block.
3805                 // Save Stack Ptr, Input Pos.
3806                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3807                 fData[opValue]   = fStack->size();
3808                 fData[opValue+1] = fp->fInputIdx;
3809                 fActiveStart     = fLookStart;          // Set the match region change for
3810                 fActiveLimit     = fLookLimit;          //   transparent bounds.
3811             }
3812             break;
3813 
3814         case URX_LA_END:
3815             {
3816                 // Leaving a look-ahead block.
3817                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
3818                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3819                 int32_t stackSize = fStack->size();
3820                 int32_t newStackSize =(int32_t)fData[opValue];
3821                 U_ASSERT(stackSize >= newStackSize);
3822                 if (stackSize > newStackSize) {
3823                     // Copy the current top frame back to the new (cut back) top frame.
3824                     //   This makes the capture groups from within the look-ahead
3825                     //   expression available.
3826                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3827                     int32_t i;
3828                     for (i=0; i<fFrameSize; i++) {
3829                         newFP[i] = ((int64_t *)fp)[i];
3830                     }
3831                     fp = (REStackFrame *)newFP;
3832                     fStack->setSize(newStackSize);
3833                 }
3834                 fp->fInputIdx = fData[opValue+1];
3835 
3836                 // Restore the active region bounds in the input string; they may have
3837                 //    been changed because of transparent bounds on a Region.
3838                 fActiveStart = fRegionStart;
3839                 fActiveLimit = fRegionLimit;
3840             }
3841             break;
3842 
3843         case URX_ONECHAR_I:
3844             // Case insensitive one char.  The char from the pattern is already case folded.
3845             // Input text is not, but case folding the input can not reduce two or more code
3846             // points to one.
3847             if (fp->fInputIdx < fActiveLimit) {
3848                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3849 
3850                 UChar32 c = UTEXT_NEXT32(fInputText);
3851                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3852                     fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3853                     break;
3854                 }
3855             } else {
3856                 fHitEnd = TRUE;
3857             }
3858 
3859             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3860             break;
3861 
3862         case URX_STRING_I:
3863             {
3864                 // Case-insensitive test input against a literal string.
3865                 // Strings require two slots in the compiled pattern, one for the
3866                 //   offset to the string text, and one for the length.
3867                 //   The compiled string has already been case folded.
3868                 {
3869                     const UChar *patternString = litText + opValue;
3870                     int32_t      patternStringIdx  = 0;
3871 
3872                     op      = (int32_t)pat[fp->fPatIdx];
3873                     fp->fPatIdx++;
3874                     opType  = URX_TYPE(op);
3875                     opValue = URX_VAL(op);
3876                     U_ASSERT(opType == URX_STRING_LEN);
3877                     int32_t patternStringLen = opValue;  // Length of the string from the pattern.
3878 
3879 
3880                     UChar32   cPattern;
3881                     UChar32   cText;
3882                     UBool     success = TRUE;
3883 
3884                     UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3885                     CaseFoldingUTextIterator inputIterator(*fInputText);
3886                     while (patternStringIdx < patternStringLen) {
3887                         if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3888                             success = FALSE;
3889                             fHitEnd = TRUE;
3890                             break;
3891                         }
3892                         U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3893                         cText = inputIterator.next();
3894                         if (cText != cPattern) {
3895                             success = FALSE;
3896                             break;
3897                         }
3898                     }
3899                     if (inputIterator.inExpansion()) {
3900                         success = FALSE;
3901                     }
3902 
3903                     if (success) {
3904                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3905                     } else {
3906                         fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3907                     }
3908                 }
3909             }
3910             break;
3911 
3912         case URX_LB_START:
3913             {
3914                 // Entering a look-behind block.
3915                 // Save Stack Ptr, Input Pos.
3916                 //   TODO:  implement transparent bounds.  Ticket #6067
3917                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3918                 fData[opValue]   = fStack->size();
3919                 fData[opValue+1] = fp->fInputIdx;
3920                 // Init the variable containing the start index for attempted matches.
3921                 fData[opValue+2] = -1;
3922                 // Save input string length, then reset to pin any matches to end at
3923                 //   the current position.
3924                 fData[opValue+3] = fActiveLimit;
3925                 fActiveLimit     = fp->fInputIdx;
3926             }
3927             break;
3928 
3929 
3930         case URX_LB_CONT:
3931             {
3932                 // Positive Look-Behind, at top of loop checking for matches of LB expression
3933                 //    at all possible input starting positions.
3934 
3935                 // Fetch the min and max possible match lengths.  They are the operands
3936                 //   of this op in the pattern.
3937                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3938                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3939                 U_ASSERT(minML <= maxML);
3940                 U_ASSERT(minML >= 0);
3941 
3942                 // Fetch (from data) the last input index where a match was attempted.
3943                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3944                 int64_t  *lbStartIdx = &fData[opValue+2];
3945                 if (*lbStartIdx < 0) {
3946                     // First time through loop.
3947                     *lbStartIdx = fp->fInputIdx - minML;
3948                 } else {
3949                     // 2nd through nth time through the loop.
3950                     // Back up start position for match by one.
3951                     if (*lbStartIdx == 0) {
3952                         (*lbStartIdx)--;
3953                     } else {
3954                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
3955                         (void)UTEXT_PREVIOUS32(fInputText);
3956                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3957                     }
3958                 }
3959 
3960                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
3961                     // We have tried all potential match starting points without
3962                     //  getting a match.  Backtrack out, and out of the
3963                     //   Look Behind altogether.
3964                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3965                     int64_t restoreInputLen = fData[opValue+3];
3966                     U_ASSERT(restoreInputLen >= fActiveLimit);
3967                     U_ASSERT(restoreInputLen <= fInputLength);
3968                     fActiveLimit = restoreInputLen;
3969                     break;
3970                 }
3971 
3972                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3973                 //      (successful match will fall off the end of the loop.)
3974                 fp = StateSave(fp, fp->fPatIdx-3, status);
3975                 fp->fInputIdx = *lbStartIdx;
3976             }
3977             break;
3978 
3979         case URX_LB_END:
3980             // End of a look-behind block, after a successful match.
3981             {
3982                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3983                 if (fp->fInputIdx != fActiveLimit) {
3984                     //  The look-behind expression matched, but the match did not
3985                     //    extend all the way to the point that we are looking behind from.
3986                     //  FAIL out of here, which will take us back to the LB_CONT, which
3987                     //     will retry the match starting at another position or fail
3988                     //     the look-behind altogether, whichever is appropriate.
3989                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3990                     break;
3991                 }
3992 
3993                 // Look-behind match is good.  Restore the orignal input string length,
3994                 //   which had been truncated to pin the end of the lookbehind match to the
3995                 //   position being looked-behind.
3996                 int64_t originalInputLen = fData[opValue+3];
3997                 U_ASSERT(originalInputLen >= fActiveLimit);
3998                 U_ASSERT(originalInputLen <= fInputLength);
3999                 fActiveLimit = originalInputLen;
4000             }
4001             break;
4002 
4003 
4004         case URX_LBN_CONT:
4005             {
4006                 // Negative Look-Behind, at top of loop checking for matches of LB expression
4007                 //    at all possible input starting positions.
4008 
4009                 // Fetch the extra parameters of this op.
4010                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
4011                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
4012                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4013                         continueLoc = URX_VAL(continueLoc);
4014                 U_ASSERT(minML <= maxML);
4015                 U_ASSERT(minML >= 0);
4016                 U_ASSERT(continueLoc > fp->fPatIdx);
4017 
4018                 // Fetch (from data) the last input index where a match was attempted.
4019                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4020                 int64_t  *lbStartIdx = &fData[opValue+2];
4021                 if (*lbStartIdx < 0) {
4022                     // First time through loop.
4023                     *lbStartIdx = fp->fInputIdx - minML;
4024                 } else {
4025                     // 2nd through nth time through the loop.
4026                     // Back up start position for match by one.
4027                     if (*lbStartIdx == 0) {
4028                         (*lbStartIdx)--;
4029                     } else {
4030                         UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
4031                         (void)UTEXT_PREVIOUS32(fInputText);
4032                         *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4033                     }
4034                 }
4035 
4036                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
4037                     // We have tried all potential match starting points without
4038                     //  getting a match, which means that the negative lookbehind as
4039                     //  a whole has succeeded.  Jump forward to the continue location
4040                     int64_t restoreInputLen = fData[opValue+3];
4041                     U_ASSERT(restoreInputLen >= fActiveLimit);
4042                     U_ASSERT(restoreInputLen <= fInputLength);
4043                     fActiveLimit = restoreInputLen;
4044                     fp->fPatIdx = continueLoc;
4045                     break;
4046                 }
4047 
4048                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4049                 //      (successful match will cause a FAIL out of the loop altogether.)
4050                 fp = StateSave(fp, fp->fPatIdx-4, status);
4051                 fp->fInputIdx = *lbStartIdx;
4052             }
4053             break;
4054 
4055         case URX_LBN_END:
4056             // End of a negative look-behind block, after a successful match.
4057             {
4058                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4059                 if (fp->fInputIdx != fActiveLimit) {
4060                     //  The look-behind expression matched, but the match did not
4061                     //    extend all the way to the point that we are looking behind from.
4062                     //  FAIL out of here, which will take us back to the LB_CONT, which
4063                     //     will retry the match starting at another position or succeed
4064                     //     the look-behind altogether, whichever is appropriate.
4065                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4066                     break;
4067                 }
4068 
4069                 // Look-behind expression matched, which means look-behind test as
4070                 //   a whole Fails
4071 
4072                 //   Restore the orignal input string length, which had been truncated
4073                 //   inorder to pin the end of the lookbehind match
4074                 //   to the position being looked-behind.
4075                 int64_t originalInputLen = fData[opValue+3];
4076                 U_ASSERT(originalInputLen >= fActiveLimit);
4077                 U_ASSERT(originalInputLen <= fInputLength);
4078                 fActiveLimit = originalInputLen;
4079 
4080                 // Restore original stack position, discarding any state saved
4081                 //   by the successful pattern match.
4082                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4083                 int32_t newStackSize = (int32_t)fData[opValue];
4084                 U_ASSERT(fStack->size() > newStackSize);
4085                 fStack->setSize(newStackSize);
4086 
4087                 //  FAIL, which will take control back to someplace
4088                 //  prior to entering the look-behind test.
4089                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4090             }
4091             break;
4092 
4093 
4094         case URX_LOOP_SR_I:
4095             // Loop Initialization for the optimized implementation of
4096             //     [some character set]*
4097             //   This op scans through all matching input.
4098             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4099             {
4100                 U_ASSERT(opValue > 0 && opValue < sets->size());
4101                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4102                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
4103 
4104                 // Loop through input, until either the input is exhausted or
4105                 //   we reach a character that is not a member of the set.
4106                 int64_t ix = fp->fInputIdx;
4107                 UTEXT_SETNATIVEINDEX(fInputText, ix);
4108                 for (;;) {
4109                     if (ix >= fActiveLimit) {
4110                         fHitEnd = TRUE;
4111                         break;
4112                     }
4113                     UChar32 c = UTEXT_NEXT32(fInputText);
4114                     if (c<256) {
4115                         if (s8->contains(c) == FALSE) {
4116                             break;
4117                         }
4118                     } else {
4119                         if (s->contains(c) == FALSE) {
4120                             break;
4121                         }
4122                     }
4123                     ix = UTEXT_GETNATIVEINDEX(fInputText);
4124                 }
4125 
4126                 // If there were no matching characters, skip over the loop altogether.
4127                 //   The loop doesn't run at all, a * op always succeeds.
4128                 if (ix == fp->fInputIdx) {
4129                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
4130                     break;
4131                 }
4132 
4133                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4134                 //   must follow.  It's operand is the stack location
4135                 //   that holds the starting input index for the match of this [set]*
4136                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4137                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4138                 int32_t stackLoc = URX_VAL(loopcOp);
4139                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4140                 fp->fExtra[stackLoc] = fp->fInputIdx;
4141                 fp->fInputIdx = ix;
4142 
4143                 // Save State to the URX_LOOP_C op that follows this one,
4144                 //   so that match failures in the following code will return to there.
4145                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4146                 fp = StateSave(fp, fp->fPatIdx, status);
4147                 fp->fPatIdx++;
4148             }
4149             break;
4150 
4151 
4152         case URX_LOOP_DOT_I:
4153             // Loop Initialization for the optimized implementation of .*
4154             //   This op scans through all remaining input.
4155             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4156             {
4157                 // Loop through input until the input is exhausted (we reach an end-of-line)
4158                 // In DOTALL mode, we can just go straight to the end of the input.
4159                 int64_t ix;
4160                 if ((opValue & 1) == 1) {
4161                     // Dot-matches-All mode.  Jump straight to the end of the string.
4162                     ix = fActiveLimit;
4163                     fHitEnd = TRUE;
4164                 } else {
4165                     // NOT DOT ALL mode.  Line endings do not match '.'
4166                     // Scan forward until a line ending or end of input.
4167                     ix = fp->fInputIdx;
4168                     UTEXT_SETNATIVEINDEX(fInputText, ix);
4169                     for (;;) {
4170                         if (ix >= fActiveLimit) {
4171                             fHitEnd = TRUE;
4172                             break;
4173                         }
4174                         UChar32 c = UTEXT_NEXT32(fInputText);
4175                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
4176                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
4177                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
4178                                     isLineTerminator(c))) {
4179                                 //  char is a line ending.  Exit the scanning loop.
4180                                 break;
4181                             }
4182                         }
4183                         ix = UTEXT_GETNATIVEINDEX(fInputText);
4184                     }
4185                 }
4186 
4187                 // If there were no matching characters, skip over the loop altogether.
4188                 //   The loop doesn't run at all, a * op always succeeds.
4189                 if (ix == fp->fInputIdx) {
4190                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
4191                     break;
4192                 }
4193 
4194                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4195                 //   must follow.  It's operand is the stack location
4196                 //   that holds the starting input index for the match of this .*
4197                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4198                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4199                 int32_t stackLoc = URX_VAL(loopcOp);
4200                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4201                 fp->fExtra[stackLoc] = fp->fInputIdx;
4202                 fp->fInputIdx = ix;
4203 
4204                 // Save State to the URX_LOOP_C op that follows this one,
4205                 //   so that match failures in the following code will return to there.
4206                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4207                 fp = StateSave(fp, fp->fPatIdx, status);
4208                 fp->fPatIdx++;
4209             }
4210             break;
4211 
4212 
4213         case URX_LOOP_C:
4214             {
4215                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4216                 backSearchIndex = fp->fExtra[opValue];
4217                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4218                 if (backSearchIndex == fp->fInputIdx) {
4219                     // We've backed up the input idx to the point that the loop started.
4220                     // The loop is done.  Leave here without saving state.
4221                     //  Subsequent failures won't come back here.
4222                     break;
4223                 }
4224                 // Set up for the next iteration of the loop, with input index
4225                 //   backed up by one from the last time through,
4226                 //   and a state save to this instruction in case the following code fails again.
4227                 //   (We're going backwards because this loop emulates stack unwinding, not
4228                 //    the initial scan forward.)
4229                 U_ASSERT(fp->fInputIdx > 0);
4230                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4231                 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4232                 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4233 
4234                 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4235                 if (prevC == 0x0a &&
4236                     fp->fInputIdx > backSearchIndex &&
4237                     twoPrevC == 0x0d) {
4238                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4239                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4240                         // .*, stepping back over CRLF pair.
4241                         fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4242                     }
4243                 }
4244 
4245 
4246                 fp = StateSave(fp, fp->fPatIdx-1, status);
4247             }
4248             break;
4249 
4250 
4251 
4252         default:
4253             // Trouble.  The compiled pattern contains an entry with an
4254             //           unrecognized type tag.
4255             U_ASSERT(FALSE);
4256         }
4257 
4258         if (U_FAILURE(status)) {
4259             isMatch = FALSE;
4260             break;
4261         }
4262     }
4263 
4264 breakFromLoop:
4265     fMatch = isMatch;
4266     if (isMatch) {
4267         fLastMatchEnd = fMatchEnd;
4268         fMatchStart   = startIdx;
4269         fMatchEnd     = fp->fInputIdx;
4270     }
4271 
4272 #ifdef REGEX_RUN_DEBUG
4273     if (fTraceDebug) {
4274         if (isMatch) {
4275             printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
4276         } else {
4277             printf("No match\n\n");
4278         }
4279     }
4280 #endif
4281 
4282     fFrame = fp;                // The active stack frame when the engine stopped.
4283                                 //   Contains the capture group results that we need to
4284                                 //    access later.
4285     return;
4286 }
4287 
4288 
4289 //--------------------------------------------------------------------------------
4290 //
4291 //   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
4292 //                  assumption that the entire string is available in the UText's
4293 //                  chunk buffer. For now, that means we can use int32_t indexes,
4294 //                  except for anything that needs to be saved (like group starts
4295 //                  and ends).
4296 //
4297 //                  startIdx:    begin matching a this index.
4298 //                  toEnd:       if true, match must extend to end of the input region
4299 //
4300 //--------------------------------------------------------------------------------
MatchChunkAt(int32_t startIdx,UBool toEnd,UErrorCode & status)4301 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4302     UBool       isMatch  = FALSE;      // True if the we have a match.
4303 
4304     int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4305 
4306     int32_t     op;                    // Operation from the compiled pattern, split into
4307     int32_t     opType;                //    the opcode
4308     int32_t     opValue;               //    and the operand value.
4309 
4310 #ifdef REGEX_RUN_DEBUG
4311     if (fTraceDebug) {
4312         printf("MatchAt(startIdx=%d)\n", startIdx);
4313         printf("Original Pattern: ");
4314         UChar32 c = utext_next32From(fPattern->fPattern, 0);
4315         while (c != U_SENTINEL) {
4316             if (c<32 || c>256) {
4317                 c = '.';
4318             }
4319             printf("%c", c);
4320 
4321             c = UTEXT_NEXT32(fPattern->fPattern);
4322         }
4323         printf("\n");
4324         printf("Input String: ");
4325         c = utext_next32From(fInputText, 0);
4326         while (c != U_SENTINEL) {
4327             if (c<32 || c>256) {
4328                 c = '.';
4329             }
4330             printf("%c", c);
4331 
4332             c = UTEXT_NEXT32(fInputText);
4333         }
4334         printf("\n");
4335         printf("\n");
4336     }
4337 #endif
4338 
4339     if (U_FAILURE(status)) {
4340         return;
4341     }
4342 
4343     //  Cache frequently referenced items from the compiled pattern
4344     //
4345     int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
4346 
4347     const UChar         *litText       = fPattern->fLiteralText.getBuffer();
4348     UVector             *sets          = fPattern->fSets;
4349 
4350     const UChar         *inputBuf      = fInputText->chunkContents;
4351 
4352     fFrameSize = fPattern->fFrameSize;
4353     REStackFrame        *fp            = resetStack();
4354     if (U_FAILURE(fDeferredStatus)) {
4355         status = fDeferredStatus;
4356         return;
4357     }
4358 
4359     fp->fPatIdx   = 0;
4360     fp->fInputIdx = startIdx;
4361 
4362     // Zero out the pattern's static data
4363     int32_t i;
4364     for (i = 0; i<fPattern->fDataSize; i++) {
4365         fData[i] = 0;
4366     }
4367 
4368     //
4369     //  Main loop for interpreting the compiled pattern.
4370     //  One iteration of the loop per pattern operation performed.
4371     //
4372     for (;;) {
4373         op      = (int32_t)pat[fp->fPatIdx];
4374         opType  = URX_TYPE(op);
4375         opValue = URX_VAL(op);
4376 #ifdef REGEX_RUN_DEBUG
4377         if (fTraceDebug) {
4378             UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4379             printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
4380                    UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4381             fPattern->dumpOp(fp->fPatIdx);
4382         }
4383 #endif
4384         fp->fPatIdx++;
4385 
4386         switch (opType) {
4387 
4388 
4389         case URX_NOP:
4390             break;
4391 
4392 
4393         case URX_BACKTRACK:
4394             // Force a backtrack.  In some circumstances, the pattern compiler
4395             //   will notice that the pattern can't possibly match anything, and will
4396             //   emit one of these at that point.
4397             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4398             break;
4399 
4400 
4401         case URX_ONECHAR:
4402             if (fp->fInputIdx < fActiveLimit) {
4403                 UChar32 c;
4404                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4405                 if (c == opValue) {
4406                     break;
4407                 }
4408             } else {
4409                 fHitEnd = TRUE;
4410             }
4411             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4412             break;
4413 
4414 
4415         case URX_STRING:
4416             {
4417                 // Test input against a literal string.
4418                 // Strings require two slots in the compiled pattern, one for the
4419                 //   offset to the string text, and one for the length.
4420                 int32_t   stringStartIdx = opValue;
4421                 int32_t   stringLen;
4422 
4423                 op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
4424                 fp->fPatIdx++;
4425                 opType    = URX_TYPE(op);
4426                 stringLen = URX_VAL(op);
4427                 U_ASSERT(opType == URX_STRING_LEN);
4428                 U_ASSERT(stringLen >= 2);
4429 
4430                 const UChar * pInp = inputBuf + fp->fInputIdx;
4431                 const UChar * pInpLimit = inputBuf + fActiveLimit;
4432                 const UChar * pPat = litText+stringStartIdx;
4433                 const UChar * pEnd = pInp + stringLen;
4434                 UBool success = TRUE;
4435                 while (pInp < pEnd) {
4436                     if (pInp >= pInpLimit) {
4437                         fHitEnd = TRUE;
4438                         success = FALSE;
4439                         break;
4440                     }
4441                     if (*pInp++ != *pPat++) {
4442                         success = FALSE;
4443                         break;
4444                     }
4445                 }
4446 
4447                 if (success) {
4448                     fp->fInputIdx += stringLen;
4449                 } else {
4450                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4451                 }
4452             }
4453             break;
4454 
4455 
4456         case URX_STATE_SAVE:
4457             fp = StateSave(fp, opValue, status);
4458             break;
4459 
4460 
4461         case URX_END:
4462             // The match loop will exit via this path on a successful match,
4463             //   when we reach the end of the pattern.
4464             if (toEnd && fp->fInputIdx != fActiveLimit) {
4465                 // The pattern matched, but not to the end of input.  Try some more.
4466                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4467                 break;
4468             }
4469             isMatch = TRUE;
4470             goto  breakFromLoop;
4471 
4472             // Start and End Capture stack frame variables are laid out out like this:
4473             //  fp->fExtra[opValue]  - The start of a completed capture group
4474             //             opValue+1 - The end   of a completed capture group
4475             //             opValue+2 - the start of a capture group whose end
4476             //                          has not yet been reached (and might not ever be).
4477         case URX_START_CAPTURE:
4478             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4479             fp->fExtra[opValue+2] = fp->fInputIdx;
4480             break;
4481 
4482 
4483         case URX_END_CAPTURE:
4484             U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4485             U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
4486             fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
4487             fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
4488             U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4489             break;
4490 
4491 
4492         case URX_DOLLAR:                   //  $, test for End of line
4493             //     or for position before new line at end of input
4494             if (fp->fInputIdx < fAnchorLimit-2) {
4495                 // We are no where near the end of input.  Fail.
4496                 //   This is the common case.  Keep it first.
4497                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4498                 break;
4499             }
4500             if (fp->fInputIdx >= fAnchorLimit) {
4501                 // We really are at the end of input.  Success.
4502                 fHitEnd = TRUE;
4503                 fRequireEnd = TRUE;
4504                 break;
4505             }
4506 
4507             // If we are positioned just before a new-line that is located at the
4508             //   end of input, succeed.
4509             if (fp->fInputIdx == fAnchorLimit-1) {
4510                 UChar32 c;
4511                 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4512 
4513                 if (isLineTerminator(c)) {
4514                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4515                         // At new-line at end of input. Success
4516                         fHitEnd = TRUE;
4517                         fRequireEnd = TRUE;
4518                         break;
4519                     }
4520                 }
4521             } else if (fp->fInputIdx == fAnchorLimit-2 &&
4522                 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4523                     fHitEnd = TRUE;
4524                     fRequireEnd = TRUE;
4525                     break;                         // At CR/LF at end of input.  Success
4526             }
4527 
4528             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4529 
4530             break;
4531 
4532 
4533         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
4534             if (fp->fInputIdx >= fAnchorLimit-1) {
4535                 // Either at the last character of input, or off the end.
4536                 if (fp->fInputIdx == fAnchorLimit-1) {
4537                     // At last char of input.  Success if it's a new line.
4538                     if (inputBuf[fp->fInputIdx] == 0x0a) {
4539                         fHitEnd = TRUE;
4540                         fRequireEnd = TRUE;
4541                         break;
4542                     }
4543                 } else {
4544                     // Off the end of input.  Success.
4545                     fHitEnd = TRUE;
4546                     fRequireEnd = TRUE;
4547                     break;
4548                 }
4549             }
4550 
4551             // Not at end of input.  Back-track out.
4552             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4553             break;
4554 
4555 
4556         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
4557             {
4558                 if (fp->fInputIdx >= fAnchorLimit) {
4559                     // We really are at the end of input.  Success.
4560                     fHitEnd = TRUE;
4561                     fRequireEnd = TRUE;
4562                     break;
4563                 }
4564                 // If we are positioned just before a new-line, succeed.
4565                 // It makes no difference where the new-line is within the input.
4566                 UChar32 c = inputBuf[fp->fInputIdx];
4567                 if (isLineTerminator(c)) {
4568                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
4569                     //  In multi-line mode, hitting a new-line just before the end of input does not
4570                     //   set the hitEnd or requireEnd flags
4571                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4572                         break;
4573                     }
4574                 }
4575                 // not at a new line.  Fail.
4576                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4577             }
4578             break;
4579 
4580 
4581         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
4582             {
4583                 if (fp->fInputIdx >= fAnchorLimit) {
4584                     // We really are at the end of input.  Success.
4585                     fHitEnd = TRUE;
4586                     fRequireEnd = TRUE;  // Java set requireEnd in this case, even though
4587                     break;               //   adding a new-line would not lose the match.
4588                 }
4589                 // If we are not positioned just before a new-line, the test fails; backtrack out.
4590                 // It makes no difference where the new-line is within the input.
4591                 if (inputBuf[fp->fInputIdx] != 0x0a) {
4592                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4593                 }
4594             }
4595             break;
4596 
4597 
4598         case URX_CARET:                    //  ^, test for start of line
4599             if (fp->fInputIdx != fAnchorStart) {
4600                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4601             }
4602             break;
4603 
4604 
4605         case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
4606             {
4607                 if (fp->fInputIdx == fAnchorStart) {
4608                     // We are at the start input.  Success.
4609                     break;
4610                 }
4611                 // Check whether character just before the current pos is a new-line
4612                 //   unless we are at the end of input
4613                 UChar  c = inputBuf[fp->fInputIdx - 1];
4614                 if ((fp->fInputIdx < fAnchorLimit) &&
4615                     isLineTerminator(c)) {
4616                     //  It's a new-line.  ^ is true.  Success.
4617                     //  TODO:  what should be done with positions between a CR and LF?
4618                     break;
4619                 }
4620                 // Not at the start of a line.  Fail.
4621                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4622             }
4623             break;
4624 
4625 
4626         case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
4627             {
4628                 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4629                 if (fp->fInputIdx <= fAnchorStart) {
4630                     // We are at the start input.  Success.
4631                     break;
4632                 }
4633                 // Check whether character just before the current pos is a new-line
4634                 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4635                 UChar  c = inputBuf[fp->fInputIdx - 1];
4636                 if (c != 0x0a) {
4637                     // Not at the start of a line.  Back-track out.
4638                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4639                 }
4640             }
4641             break;
4642 
4643         case URX_BACKSLASH_B:          // Test for word boundaries
4644             {
4645                 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4646                 success ^= (UBool)(opValue != 0);     // flip sense for \B
4647                 if (!success) {
4648                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4649                 }
4650             }
4651             break;
4652 
4653 
4654         case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
4655             {
4656                 UBool success = isUWordBoundary(fp->fInputIdx);
4657                 success ^= (UBool)(opValue != 0);     // flip sense for \B
4658                 if (!success) {
4659                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4660                 }
4661             }
4662             break;
4663 
4664 
4665         case URX_BACKSLASH_D:            // Test for decimal digit
4666             {
4667                 if (fp->fInputIdx >= fActiveLimit) {
4668                     fHitEnd = TRUE;
4669                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4670                     break;
4671                 }
4672 
4673                 UChar32 c;
4674                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4675                 int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
4676                 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4677                 success ^= (UBool)(opValue != 0);        // flip sense for \D
4678                 if (!success) {
4679                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4680                 }
4681             }
4682             break;
4683 
4684 
4685         case URX_BACKSLASH_G:          // Test for position at end of previous match
4686             if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4687                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4688             }
4689             break;
4690 
4691 
4692         case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
4693             {
4694                 if (fp->fInputIdx >= fActiveLimit) {
4695                     fHitEnd = TRUE;
4696                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4697                     break;
4698                 }
4699                 UChar32 c;
4700                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4701                 int8_t ctype = u_charType(c);
4702                 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
4703                 success ^= (UBool)(opValue != 0);        // flip sense for \H
4704                 if (!success) {
4705                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4706                 }
4707             }
4708             break;
4709 
4710 
4711         case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
4712             {
4713                 if (fp->fInputIdx >= fActiveLimit) {
4714                     fHitEnd = TRUE;
4715                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4716                     break;
4717                 }
4718                 UChar32 c;
4719                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4720                 if (isLineTerminator(c)) {
4721                     if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4722                         // Check for CR/LF sequence. Consume both together when found.
4723                         UChar c2;
4724                         U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
4725                         if (c2 != 0x0a) {
4726                             U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
4727                         }
4728                     }
4729                 } else {
4730                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4731                 }
4732             }
4733             break;
4734 
4735 
4736         case URX_BACKSLASH_V:         // Any single code point line ending.
4737             {
4738                 if (fp->fInputIdx >= fActiveLimit) {
4739                     fHitEnd = TRUE;
4740                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4741                     break;
4742                 }
4743                 UChar32 c;
4744                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4745                 UBool success = isLineTerminator(c);
4746                 success ^= (UBool)(opValue != 0);        // flip sense for \V
4747                 if (!success) {
4748                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4749                 }
4750             }
4751             break;
4752 
4753 
4754 
4755         case URX_BACKSLASH_X:
4756         //  Match a Grapheme, as defined by Unicode TR 29.
4757         //  Differs slightly from Perl, which consumes combining marks independently
4758         //    of context.
4759         {
4760 
4761             // Fail if at end of input
4762             if (fp->fInputIdx >= fActiveLimit) {
4763                 fHitEnd = TRUE;
4764                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4765                 break;
4766             }
4767 
4768             // Examine (and consume) the current char.
4769             //   Dispatch into a little state machine, based on the char.
4770             UChar32  c;
4771             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4772             UnicodeSet **sets = fPattern->fStaticSets;
4773             if (sets[URX_GC_NORMAL]->contains(c))  goto GC_Extend;
4774             if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4775             if (sets[URX_GC_L]->contains(c))       goto GC_L;
4776             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4777             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4778             if (sets[URX_GC_V]->contains(c))       goto GC_V;
4779             if (sets[URX_GC_T]->contains(c))       goto GC_T;
4780             goto GC_Extend;
4781 
4782 
4783 
4784 GC_L:
4785             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4786             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4787             if (sets[URX_GC_L]->contains(c))       goto GC_L;
4788             if (sets[URX_GC_LV]->contains(c))      goto GC_V;
4789             if (sets[URX_GC_LVT]->contains(c))     goto GC_T;
4790             if (sets[URX_GC_V]->contains(c))       goto GC_V;
4791             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4792             goto GC_Extend;
4793 
4794 GC_V:
4795             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4796             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4797             if (sets[URX_GC_V]->contains(c))       goto GC_V;
4798             if (sets[URX_GC_T]->contains(c))       goto GC_T;
4799             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4800             goto GC_Extend;
4801 
4802 GC_T:
4803             if (fp->fInputIdx >= fActiveLimit)         goto GC_Done;
4804             U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4805             if (sets[URX_GC_T]->contains(c))       goto GC_T;
4806             U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4807             goto GC_Extend;
4808 
4809 GC_Extend:
4810             // Combining characters are consumed here
4811             for (;;) {
4812                 if (fp->fInputIdx >= fActiveLimit) {
4813                     break;
4814                 }
4815                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4816                 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4817                     U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4818                     break;
4819                 }
4820             }
4821             goto GC_Done;
4822 
4823 GC_Control:
4824             // Most control chars stand alone (don't combine with combining chars),
4825             //   except for that CR/LF sequence is a single grapheme cluster.
4826             if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4827                 fp->fInputIdx++;
4828             }
4829 
4830 GC_Done:
4831             if (fp->fInputIdx >= fActiveLimit) {
4832                 fHitEnd = TRUE;
4833             }
4834             break;
4835         }
4836 
4837 
4838 
4839 
4840         case URX_BACKSLASH_Z:          // Test for end of Input
4841             if (fp->fInputIdx < fAnchorLimit) {
4842                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4843             } else {
4844                 fHitEnd = TRUE;
4845                 fRequireEnd = TRUE;
4846             }
4847             break;
4848 
4849 
4850 
4851         case URX_STATIC_SETREF:
4852             {
4853                 // Test input character against one of the predefined sets
4854                 //    (Word Characters, for example)
4855                 // The high bit of the op value is a flag for the match polarity.
4856                 //    0:   success if input char is in set.
4857                 //    1:   success if input char is not in set.
4858                 if (fp->fInputIdx >= fActiveLimit) {
4859                     fHitEnd = TRUE;
4860                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4861                     break;
4862                 }
4863 
4864                 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4865                 opValue &= ~URX_NEG_SET;
4866                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4867 
4868                 UChar32 c;
4869                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4870                 if (c < 256) {
4871                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4872                     if (s8->contains(c)) {
4873                         success = !success;
4874                     }
4875                 } else {
4876                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
4877                     if (s->contains(c)) {
4878                         success = !success;
4879                     }
4880                 }
4881                 if (!success) {
4882                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4883                 }
4884             }
4885             break;
4886 
4887 
4888         case URX_STAT_SETREF_N:
4889             {
4890                 // Test input character for NOT being a member of  one of
4891                 //    the predefined sets (Word Characters, for example)
4892                 if (fp->fInputIdx >= fActiveLimit) {
4893                     fHitEnd = TRUE;
4894                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4895                     break;
4896                 }
4897 
4898                 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4899 
4900                 UChar32  c;
4901                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4902                 if (c < 256) {
4903                     Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4904                     if (s8->contains(c) == FALSE) {
4905                         break;
4906                     }
4907                 } else {
4908                     const UnicodeSet *s = fPattern->fStaticSets[opValue];
4909                     if (s->contains(c) == FALSE) {
4910                         break;
4911                     }
4912                 }
4913                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4914             }
4915             break;
4916 
4917 
4918         case URX_SETREF:
4919             {
4920                 if (fp->fInputIdx >= fActiveLimit) {
4921                     fHitEnd = TRUE;
4922                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4923                     break;
4924                 }
4925 
4926                 U_ASSERT(opValue > 0 && opValue < sets->size());
4927 
4928                 // There is input left.  Pick up one char and test it for set membership.
4929                 UChar32  c;
4930                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4931                 if (c<256) {
4932                     Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4933                     if (s8->contains(c)) {
4934                         // The character is in the set.  A Match.
4935                         break;
4936                     }
4937                 } else {
4938                     UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4939                     if (s->contains(c)) {
4940                         // The character is in the set.  A Match.
4941                         break;
4942                     }
4943                 }
4944 
4945                 // the character wasn't in the set.
4946                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4947             }
4948             break;
4949 
4950 
4951         case URX_DOTANY:
4952             {
4953                 // . matches anything, but stops at end-of-line.
4954                 if (fp->fInputIdx >= fActiveLimit) {
4955                     // At end of input.  Match failed.  Backtrack out.
4956                     fHitEnd = TRUE;
4957                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4958                     break;
4959                 }
4960 
4961                 // There is input left.  Advance over one char, unless we've hit end-of-line
4962                 UChar32  c;
4963                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4964                 if (isLineTerminator(c)) {
4965                     // End of line in normal mode.   . does not match.
4966                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4967                     break;
4968                 }
4969             }
4970             break;
4971 
4972 
4973         case URX_DOTANY_ALL:
4974             {
4975                 // . in dot-matches-all (including new lines) mode
4976                 if (fp->fInputIdx >= fActiveLimit) {
4977                     // At end of input.  Match failed.  Backtrack out.
4978                     fHitEnd = TRUE;
4979                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4980                     break;
4981                 }
4982 
4983                 // There is input left.  Advance over one char, except if we are
4984                 //   at a cr/lf, advance over both of them.
4985                 UChar32 c;
4986                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4987                 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4988                     // In the case of a CR/LF, we need to advance over both.
4989                     if (inputBuf[fp->fInputIdx] == 0x0a) {
4990                         U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
4991                     }
4992                 }
4993             }
4994             break;
4995 
4996 
4997         case URX_DOTANY_UNIX:
4998             {
4999                 // '.' operator, matches all, but stops at end-of-line.
5000                 //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
5001                 if (fp->fInputIdx >= fActiveLimit) {
5002                     // At end of input.  Match failed.  Backtrack out.
5003                     fHitEnd = TRUE;
5004                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5005                     break;
5006                 }
5007 
5008                 // There is input left.  Advance over one char, unless we've hit end-of-line
5009                 UChar32 c;
5010                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5011                 if (c == 0x0a) {
5012                     // End of line in normal mode.   '.' does not match the \n
5013                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5014                 }
5015             }
5016             break;
5017 
5018 
5019         case URX_JMP:
5020             fp->fPatIdx = opValue;
5021             break;
5022 
5023         case URX_FAIL:
5024             isMatch = FALSE;
5025             goto breakFromLoop;
5026 
5027         case URX_JMP_SAV:
5028             U_ASSERT(opValue < fPattern->fCompiledPat->size());
5029             fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
5030             fp->fPatIdx = opValue;                         // Then JMP.
5031             break;
5032 
5033         case URX_JMP_SAV_X:
5034             // This opcode is used with (x)+, when x can match a zero length string.
5035             // Same as JMP_SAV, except conditional on the match having made forward progress.
5036             // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5037             //   data address of the input position at the start of the loop.
5038             {
5039                 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
5040                 int32_t  stoOp = (int32_t)pat[opValue-1];
5041                 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
5042                 int32_t  frameLoc = URX_VAL(stoOp);
5043                 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
5044                 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
5045                 U_ASSERT(prevInputIdx <= fp->fInputIdx);
5046                 if (prevInputIdx < fp->fInputIdx) {
5047                     // The match did make progress.  Repeat the loop.
5048                     fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
5049                     fp->fPatIdx = opValue;
5050                     fp->fExtra[frameLoc] = fp->fInputIdx;
5051                 }
5052                 // If the input position did not advance, we do nothing here,
5053                 //   execution will fall out of the loop.
5054             }
5055             break;
5056 
5057         case URX_CTR_INIT:
5058             {
5059                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5060                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
5061 
5062                 // Pick up the three extra operands that CTR_INIT has, and
5063                 //    skip the pattern location counter past
5064                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5065                 fp->fPatIdx += 3;
5066                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5067                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5068                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5069                 U_ASSERT(minCount>=0);
5070                 U_ASSERT(maxCount>=minCount || maxCount==-1);
5071                 U_ASSERT(loopLoc>=fp->fPatIdx);
5072 
5073                 if (minCount == 0) {
5074                     fp = StateSave(fp, loopLoc+1, status);
5075                 }
5076                 if (maxCount == -1) {
5077                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
5078                 } else if (maxCount == 0) {
5079                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5080                 }
5081             }
5082             break;
5083 
5084         case URX_CTR_LOOP:
5085             {
5086                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5087                 int32_t initOp = (int32_t)pat[opValue];
5088                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
5089                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5090                 int32_t minCount  = (int32_t)pat[opValue+2];
5091                 int32_t maxCount  = (int32_t)pat[opValue+3];
5092                 (*pCounter)++;
5093                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5094                     U_ASSERT(*pCounter == maxCount);
5095                     break;
5096                 }
5097                 if (*pCounter >= minCount) {
5098                     if (maxCount == -1) {
5099                         // Loop has no hard upper bound.
5100                         // Check that it is progressing through the input, break if it is not.
5101                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
5102                         if (fp->fInputIdx == *pLastInputIdx) {
5103                             break;
5104                         } else {
5105                             *pLastInputIdx = fp->fInputIdx;
5106                         }
5107                     }
5108                     fp = StateSave(fp, fp->fPatIdx, status);
5109                 }
5110                 fp->fPatIdx = opValue + 4;    // Loop back.
5111             }
5112             break;
5113 
5114         case URX_CTR_INIT_NG:
5115             {
5116                 // Initialize a non-greedy loop
5117                 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5118                 fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
5119 
5120                 // Pick up the three extra operands that CTR_INIT_NG has, and
5121                 //    skip the pattern location counter past
5122                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5123                 fp->fPatIdx += 3;
5124                 int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5125                 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5126                 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5127                 U_ASSERT(minCount>=0);
5128                 U_ASSERT(maxCount>=minCount || maxCount==-1);
5129                 U_ASSERT(loopLoc>fp->fPatIdx);
5130                 if (maxCount == -1) {
5131                     fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
5132                 }
5133 
5134                 if (minCount == 0) {
5135                     if (maxCount != 0) {
5136                         fp = StateSave(fp, fp->fPatIdx, status);
5137                     }
5138                     fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
5139                 }
5140             }
5141             break;
5142 
5143         case URX_CTR_LOOP_NG:
5144             {
5145                 // Non-greedy {min, max} loops
5146                 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5147                 int32_t initOp = (int32_t)pat[opValue];
5148                 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5149                 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5150                 int32_t minCount  = (int32_t)pat[opValue+2];
5151                 int32_t maxCount  = (int32_t)pat[opValue+3];
5152 
5153                 (*pCounter)++;
5154                 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5155                     // The loop has matched the maximum permitted number of times.
5156                     //   Break out of here with no action.  Matching will
5157                     //   continue with the following pattern.
5158                     U_ASSERT(*pCounter == maxCount);
5159                     break;
5160                 }
5161 
5162                 if (*pCounter < minCount) {
5163                     // We haven't met the minimum number of matches yet.
5164                     //   Loop back for another one.
5165                     fp->fPatIdx = opValue + 4;    // Loop back.
5166                 } else {
5167                     // We do have the minimum number of matches.
5168 
5169                     // If there is no upper bound on the loop iterations, check that the input index
5170                     // is progressing, and stop the loop if it is not.
5171                     if (maxCount == -1) {
5172                         int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
5173                         if (fp->fInputIdx == *pLastInputIdx) {
5174                             break;
5175                         }
5176                         *pLastInputIdx = fp->fInputIdx;
5177                     }
5178 
5179                     // Loop Continuation: we will fall into the pattern following the loop
5180                     //   (non-greedy, don't execute loop body first), but first do
5181                     //   a state save to the top of the loop, so that a match failure
5182                     //   in the following pattern will try another iteration of the loop.
5183                     fp = StateSave(fp, opValue + 4, status);
5184                 }
5185             }
5186             break;
5187 
5188         case URX_STO_SP:
5189             U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5190             fData[opValue] = fStack->size();
5191             break;
5192 
5193         case URX_LD_SP:
5194             {
5195                 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5196                 int32_t newStackSize = (int32_t)fData[opValue];
5197                 U_ASSERT(newStackSize <= fStack->size());
5198                 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5199                 if (newFP == (int64_t *)fp) {
5200                     break;
5201                 }
5202                 int32_t i;
5203                 for (i=0; i<fFrameSize; i++) {
5204                     newFP[i] = ((int64_t *)fp)[i];
5205                 }
5206                 fp = (REStackFrame *)newFP;
5207                 fStack->setSize(newStackSize);
5208             }
5209             break;
5210 
5211         case URX_BACKREF:
5212             {
5213                 U_ASSERT(opValue < fFrameSize);
5214                 int64_t groupStartIdx = fp->fExtra[opValue];
5215                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
5216                 U_ASSERT(groupStartIdx <= groupEndIdx);
5217                 int64_t inputIndex = fp->fInputIdx;
5218                 if (groupStartIdx < 0) {
5219                     // This capture group has not participated in the match thus far,
5220                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5221                     break;
5222                 }
5223                 UBool success = TRUE;
5224                 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5225                     if (inputIndex >= fActiveLimit) {
5226                         success = FALSE;
5227                         fHitEnd = TRUE;
5228                         break;
5229                     }
5230                     if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5231                         success = FALSE;
5232                         break;
5233                     }
5234                 }
5235                 if (success) {
5236                     fp->fInputIdx = inputIndex;
5237                 } else {
5238                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5239                 }
5240             }
5241             break;
5242 
5243         case URX_BACKREF_I:
5244             {
5245                 U_ASSERT(opValue < fFrameSize);
5246                 int64_t groupStartIdx = fp->fExtra[opValue];
5247                 int64_t groupEndIdx   = fp->fExtra[opValue+1];
5248                 U_ASSERT(groupStartIdx <= groupEndIdx);
5249                 if (groupStartIdx < 0) {
5250                     // This capture group has not participated in the match thus far,
5251                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5252                     break;
5253                 }
5254                 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5255                 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5256 
5257                 //   Note: if the capture group match was of an empty string the backref
5258                 //         match succeeds.  Verified by testing:  Perl matches succeed
5259                 //         in this case, so we do too.
5260 
5261                 UBool success = TRUE;
5262                 for (;;) {
5263                     UChar32 captureGroupChar = captureGroupItr.next();
5264                     if (captureGroupChar == U_SENTINEL) {
5265                         success = TRUE;
5266                         break;
5267                     }
5268                     UChar32 inputChar = inputItr.next();
5269                     if (inputChar == U_SENTINEL) {
5270                         success = FALSE;
5271                         fHitEnd = TRUE;
5272                         break;
5273                     }
5274                     if (inputChar != captureGroupChar) {
5275                         success = FALSE;
5276                         break;
5277                     }
5278                 }
5279 
5280                 if (success && inputItr.inExpansion()) {
5281                     // We otained a match by consuming part of a string obtained from
5282                     // case-folding a single code point of the input text.
5283                     // This does not count as an overall match.
5284                     success = FALSE;
5285                 }
5286 
5287                 if (success) {
5288                     fp->fInputIdx = inputItr.getIndex();
5289                 } else {
5290                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5291                 }
5292             }
5293             break;
5294 
5295         case URX_STO_INP_LOC:
5296             {
5297                 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5298                 fp->fExtra[opValue] = fp->fInputIdx;
5299             }
5300             break;
5301 
5302         case URX_JMPX:
5303             {
5304                 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5305                 fp->fPatIdx += 1;
5306                 int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
5307                 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5308                 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5309                 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5310                 if (savedInputIdx < fp->fInputIdx) {
5311                     fp->fPatIdx = opValue;                               // JMP
5312                 } else {
5313                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
5314                 }
5315             }
5316             break;
5317 
5318         case URX_LA_START:
5319             {
5320                 // Entering a lookahead block.
5321                 // Save Stack Ptr, Input Pos.
5322                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5323                 fData[opValue]   = fStack->size();
5324                 fData[opValue+1] = fp->fInputIdx;
5325                 fActiveStart     = fLookStart;          // Set the match region change for
5326                 fActiveLimit     = fLookLimit;          //   transparent bounds.
5327             }
5328             break;
5329 
5330         case URX_LA_END:
5331             {
5332                 // Leaving a look-ahead block.
5333                 //  restore Stack Ptr, Input Pos to positions they had on entry to block.
5334                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5335                 int32_t stackSize = fStack->size();
5336                 int32_t newStackSize = (int32_t)fData[opValue];
5337                 U_ASSERT(stackSize >= newStackSize);
5338                 if (stackSize > newStackSize) {
5339                     // Copy the current top frame back to the new (cut back) top frame.
5340                     //   This makes the capture groups from within the look-ahead
5341                     //   expression available.
5342                     int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5343                     int32_t i;
5344                     for (i=0; i<fFrameSize; i++) {
5345                         newFP[i] = ((int64_t *)fp)[i];
5346                     }
5347                     fp = (REStackFrame *)newFP;
5348                     fStack->setSize(newStackSize);
5349                 }
5350                 fp->fInputIdx = fData[opValue+1];
5351 
5352                 // Restore the active region bounds in the input string; they may have
5353                 //    been changed because of transparent bounds on a Region.
5354                 fActiveStart = fRegionStart;
5355                 fActiveLimit = fRegionLimit;
5356             }
5357             break;
5358 
5359         case URX_ONECHAR_I:
5360             if (fp->fInputIdx < fActiveLimit) {
5361                 UChar32 c;
5362                 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5363                 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5364                     break;
5365                 }
5366             } else {
5367                 fHitEnd = TRUE;
5368             }
5369             fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5370             break;
5371 
5372         case URX_STRING_I:
5373             // Case-insensitive test input against a literal string.
5374             // Strings require two slots in the compiled pattern, one for the
5375             //   offset to the string text, and one for the length.
5376             //   The compiled string has already been case folded.
5377             {
5378                 const UChar *patternString = litText + opValue;
5379 
5380                 op      = (int32_t)pat[fp->fPatIdx];
5381                 fp->fPatIdx++;
5382                 opType  = URX_TYPE(op);
5383                 opValue = URX_VAL(op);
5384                 U_ASSERT(opType == URX_STRING_LEN);
5385                 int32_t patternStringLen = opValue;  // Length of the string from the pattern.
5386 
5387                 UChar32      cText;
5388                 UChar32      cPattern;
5389                 UBool        success = TRUE;
5390                 int32_t      patternStringIdx  = 0;
5391                 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5392                 while (patternStringIdx < patternStringLen) {
5393                     U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5394                     cText = inputIterator.next();
5395                     if (cText != cPattern) {
5396                         success = FALSE;
5397                         if (cText == U_SENTINEL) {
5398                             fHitEnd = TRUE;
5399                         }
5400                         break;
5401                     }
5402                 }
5403                 if (inputIterator.inExpansion()) {
5404                     success = FALSE;
5405                 }
5406 
5407                 if (success) {
5408                     fp->fInputIdx = inputIterator.getIndex();
5409                 } else {
5410                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5411                 }
5412             }
5413             break;
5414 
5415         case URX_LB_START:
5416             {
5417                 // Entering a look-behind block.
5418                 // Save Stack Ptr, Input Pos.
5419                 //   TODO:  implement transparent bounds.  Ticket #6067
5420                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5421                 fData[opValue]   = fStack->size();
5422                 fData[opValue+1] = fp->fInputIdx;
5423                 // Init the variable containing the start index for attempted matches.
5424                 fData[opValue+2] = -1;
5425                 // Save input string length, then reset to pin any matches to end at
5426                 //   the current position.
5427                 fData[opValue+3] = fActiveLimit;
5428                 fActiveLimit     = fp->fInputIdx;
5429             }
5430             break;
5431 
5432 
5433         case URX_LB_CONT:
5434             {
5435                 // Positive Look-Behind, at top of loop checking for matches of LB expression
5436                 //    at all possible input starting positions.
5437 
5438                 // Fetch the min and max possible match lengths.  They are the operands
5439                 //   of this op in the pattern.
5440                 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5441                 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5442                 U_ASSERT(minML <= maxML);
5443                 U_ASSERT(minML >= 0);
5444 
5445                 // Fetch (from data) the last input index where a match was attempted.
5446                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5447                 int64_t  *lbStartIdx = &fData[opValue+2];
5448                 if (*lbStartIdx < 0) {
5449                     // First time through loop.
5450                     *lbStartIdx = fp->fInputIdx - minML;
5451                 } else {
5452                     // 2nd through nth time through the loop.
5453                     // Back up start position for match by one.
5454                     if (*lbStartIdx == 0) {
5455                         (*lbStartIdx)--;
5456                     } else {
5457                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
5458                     }
5459                 }
5460 
5461                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5462                     // We have tried all potential match starting points without
5463                     //  getting a match.  Backtrack out, and out of the
5464                     //   Look Behind altogether.
5465                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5466                     int64_t restoreInputLen = fData[opValue+3];
5467                     U_ASSERT(restoreInputLen >= fActiveLimit);
5468                     U_ASSERT(restoreInputLen <= fInputLength);
5469                     fActiveLimit = restoreInputLen;
5470                     break;
5471                 }
5472 
5473                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5474                 //      (successful match will fall off the end of the loop.)
5475                 fp = StateSave(fp, fp->fPatIdx-3, status);
5476                 fp->fInputIdx =  *lbStartIdx;
5477             }
5478             break;
5479 
5480         case URX_LB_END:
5481             // End of a look-behind block, after a successful match.
5482             {
5483                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5484                 if (fp->fInputIdx != fActiveLimit) {
5485                     //  The look-behind expression matched, but the match did not
5486                     //    extend all the way to the point that we are looking behind from.
5487                     //  FAIL out of here, which will take us back to the LB_CONT, which
5488                     //     will retry the match starting at another position or fail
5489                     //     the look-behind altogether, whichever is appropriate.
5490                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5491                     break;
5492                 }
5493 
5494                 // Look-behind match is good.  Restore the orignal input string length,
5495                 //   which had been truncated to pin the end of the lookbehind match to the
5496                 //   position being looked-behind.
5497                 int64_t originalInputLen = fData[opValue+3];
5498                 U_ASSERT(originalInputLen >= fActiveLimit);
5499                 U_ASSERT(originalInputLen <= fInputLength);
5500                 fActiveLimit = originalInputLen;
5501             }
5502             break;
5503 
5504 
5505         case URX_LBN_CONT:
5506             {
5507                 // Negative Look-Behind, at top of loop checking for matches of LB expression
5508                 //    at all possible input starting positions.
5509 
5510                 // Fetch the extra parameters of this op.
5511                 int32_t minML       = (int32_t)pat[fp->fPatIdx++];
5512                 int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
5513                 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5514                 continueLoc = URX_VAL(continueLoc);
5515                 U_ASSERT(minML <= maxML);
5516                 U_ASSERT(minML >= 0);
5517                 U_ASSERT(continueLoc > fp->fPatIdx);
5518 
5519                 // Fetch (from data) the last input index where a match was attempted.
5520                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5521                 int64_t  *lbStartIdx = &fData[opValue+2];
5522                 if (*lbStartIdx < 0) {
5523                     // First time through loop.
5524                     *lbStartIdx = fp->fInputIdx - minML;
5525                 } else {
5526                     // 2nd through nth time through the loop.
5527                     // Back up start position for match by one.
5528                     if (*lbStartIdx == 0) {
5529                         (*lbStartIdx)--;   // Because U16_BACK is unsafe starting at 0.
5530                     } else {
5531                         U16_BACK_1(inputBuf, 0, *lbStartIdx);
5532                     }
5533                 }
5534 
5535                 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5536                     // We have tried all potential match starting points without
5537                     //  getting a match, which means that the negative lookbehind as
5538                     //  a whole has succeeded.  Jump forward to the continue location
5539                     int64_t restoreInputLen = fData[opValue+3];
5540                     U_ASSERT(restoreInputLen >= fActiveLimit);
5541                     U_ASSERT(restoreInputLen <= fInputLength);
5542                     fActiveLimit = restoreInputLen;
5543                     fp->fPatIdx = continueLoc;
5544                     break;
5545                 }
5546 
5547                 //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5548                 //      (successful match will cause a FAIL out of the loop altogether.)
5549                 fp = StateSave(fp, fp->fPatIdx-4, status);
5550                 fp->fInputIdx =  *lbStartIdx;
5551             }
5552             break;
5553 
5554         case URX_LBN_END:
5555             // End of a negative look-behind block, after a successful match.
5556             {
5557                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5558                 if (fp->fInputIdx != fActiveLimit) {
5559                     //  The look-behind expression matched, but the match did not
5560                     //    extend all the way to the point that we are looking behind from.
5561                     //  FAIL out of here, which will take us back to the LB_CONT, which
5562                     //     will retry the match starting at another position or succeed
5563                     //     the look-behind altogether, whichever is appropriate.
5564                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5565                     break;
5566                 }
5567 
5568                 // Look-behind expression matched, which means look-behind test as
5569                 //   a whole Fails
5570 
5571                 //   Restore the orignal input string length, which had been truncated
5572                 //   inorder to pin the end of the lookbehind match
5573                 //   to the position being looked-behind.
5574                 int64_t originalInputLen = fData[opValue+3];
5575                 U_ASSERT(originalInputLen >= fActiveLimit);
5576                 U_ASSERT(originalInputLen <= fInputLength);
5577                 fActiveLimit = originalInputLen;
5578 
5579                 // Restore original stack position, discarding any state saved
5580                 //   by the successful pattern match.
5581                 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5582                 int32_t newStackSize = (int32_t)fData[opValue];
5583                 U_ASSERT(fStack->size() > newStackSize);
5584                 fStack->setSize(newStackSize);
5585 
5586                 //  FAIL, which will take control back to someplace
5587                 //  prior to entering the look-behind test.
5588                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5589             }
5590             break;
5591 
5592 
5593         case URX_LOOP_SR_I:
5594             // Loop Initialization for the optimized implementation of
5595             //     [some character set]*
5596             //   This op scans through all matching input.
5597             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5598             {
5599                 U_ASSERT(opValue > 0 && opValue < sets->size());
5600                 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5601                 UnicodeSet   *s  = (UnicodeSet *)sets->elementAt(opValue);
5602 
5603                 // Loop through input, until either the input is exhausted or
5604                 //   we reach a character that is not a member of the set.
5605                 int32_t ix = (int32_t)fp->fInputIdx;
5606                 for (;;) {
5607                     if (ix >= fActiveLimit) {
5608                         fHitEnd = TRUE;
5609                         break;
5610                     }
5611                     UChar32   c;
5612                     U16_NEXT(inputBuf, ix, fActiveLimit, c);
5613                     if (c<256) {
5614                         if (s8->contains(c) == FALSE) {
5615                             U16_BACK_1(inputBuf, 0, ix);
5616                             break;
5617                         }
5618                     } else {
5619                         if (s->contains(c) == FALSE) {
5620                             U16_BACK_1(inputBuf, 0, ix);
5621                             break;
5622                         }
5623                     }
5624                 }
5625 
5626                 // If there were no matching characters, skip over the loop altogether.
5627                 //   The loop doesn't run at all, a * op always succeeds.
5628                 if (ix == fp->fInputIdx) {
5629                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
5630                     break;
5631                 }
5632 
5633                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5634                 //   must follow.  It's operand is the stack location
5635                 //   that holds the starting input index for the match of this [set]*
5636                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5637                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5638                 int32_t stackLoc = URX_VAL(loopcOp);
5639                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5640                 fp->fExtra[stackLoc] = fp->fInputIdx;
5641                 fp->fInputIdx = ix;
5642 
5643                 // Save State to the URX_LOOP_C op that follows this one,
5644                 //   so that match failures in the following code will return to there.
5645                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5646                 fp = StateSave(fp, fp->fPatIdx, status);
5647                 fp->fPatIdx++;
5648             }
5649             break;
5650 
5651 
5652         case URX_LOOP_DOT_I:
5653             // Loop Initialization for the optimized implementation of .*
5654             //   This op scans through all remaining input.
5655             //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5656             {
5657                 // Loop through input until the input is exhausted (we reach an end-of-line)
5658                 // In DOTALL mode, we can just go straight to the end of the input.
5659                 int32_t ix;
5660                 if ((opValue & 1) == 1) {
5661                     // Dot-matches-All mode.  Jump straight to the end of the string.
5662                     ix = (int32_t)fActiveLimit;
5663                     fHitEnd = TRUE;
5664                 } else {
5665                     // NOT DOT ALL mode.  Line endings do not match '.'
5666                     // Scan forward until a line ending or end of input.
5667                     ix = (int32_t)fp->fInputIdx;
5668                     for (;;) {
5669                         if (ix >= fActiveLimit) {
5670                             fHitEnd = TRUE;
5671                             break;
5672                         }
5673                         UChar32   c;
5674                         U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
5675                         if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
5676                             if ((c == 0x0a) ||             //  0x0a is newline in both modes.
5677                                 (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
5678                                    isLineTerminator(c))) {
5679                                 //  char is a line ending.  Put the input pos back to the
5680                                 //    line ending char, and exit the scanning loop.
5681                                 U16_BACK_1(inputBuf, 0, ix);
5682                                 break;
5683                             }
5684                         }
5685                     }
5686                 }
5687 
5688                 // If there were no matching characters, skip over the loop altogether.
5689                 //   The loop doesn't run at all, a * op always succeeds.
5690                 if (ix == fp->fInputIdx) {
5691                     fp->fPatIdx++;   // skip the URX_LOOP_C op.
5692                     break;
5693                 }
5694 
5695                 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5696                 //   must follow.  It's operand is the stack location
5697                 //   that holds the starting input index for the match of this .*
5698                 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5699                 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5700                 int32_t stackLoc = URX_VAL(loopcOp);
5701                 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5702                 fp->fExtra[stackLoc] = fp->fInputIdx;
5703                 fp->fInputIdx = ix;
5704 
5705                 // Save State to the URX_LOOP_C op that follows this one,
5706                 //   so that match failures in the following code will return to there.
5707                 //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5708                 fp = StateSave(fp, fp->fPatIdx, status);
5709                 fp->fPatIdx++;
5710             }
5711             break;
5712 
5713 
5714         case URX_LOOP_C:
5715             {
5716                 U_ASSERT(opValue>=0 && opValue<fFrameSize);
5717                 backSearchIndex = (int32_t)fp->fExtra[opValue];
5718                 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5719                 if (backSearchIndex == fp->fInputIdx) {
5720                     // We've backed up the input idx to the point that the loop started.
5721                     // The loop is done.  Leave here without saving state.
5722                     //  Subsequent failures won't come back here.
5723                     break;
5724                 }
5725                 // Set up for the next iteration of the loop, with input index
5726                 //   backed up by one from the last time through,
5727                 //   and a state save to this instruction in case the following code fails again.
5728                 //   (We're going backwards because this loop emulates stack unwinding, not
5729                 //    the initial scan forward.)
5730                 U_ASSERT(fp->fInputIdx > 0);
5731                 UChar32 prevC;
5732                 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5733 
5734                 if (prevC == 0x0a &&
5735                     fp->fInputIdx > backSearchIndex &&
5736                     inputBuf[fp->fInputIdx-1] == 0x0d) {
5737                     int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5738                     if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5739                         // .*, stepping back over CRLF pair.
5740                         U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5741                     }
5742                 }
5743 
5744 
5745                 fp = StateSave(fp, fp->fPatIdx-1, status);
5746             }
5747             break;
5748 
5749 
5750 
5751         default:
5752             // Trouble.  The compiled pattern contains an entry with an
5753             //           unrecognized type tag.
5754             U_ASSERT(FALSE);
5755         }
5756 
5757         if (U_FAILURE(status)) {
5758             isMatch = FALSE;
5759             break;
5760         }
5761     }
5762 
5763 breakFromLoop:
5764     fMatch = isMatch;
5765     if (isMatch) {
5766         fLastMatchEnd = fMatchEnd;
5767         fMatchStart   = startIdx;
5768         fMatchEnd     = fp->fInputIdx;
5769     }
5770 
5771 #ifdef REGEX_RUN_DEBUG
5772     if (fTraceDebug) {
5773         if (isMatch) {
5774             printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
5775         } else {
5776             printf("No match\n\n");
5777         }
5778     }
5779 #endif
5780 
5781     fFrame = fp;                // The active stack frame when the engine stopped.
5782                                 //   Contains the capture group results that we need to
5783                                 //    access later.
5784 
5785     return;
5786 }
5787 
5788 
5789 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5790 
5791 U_NAMESPACE_END
5792 
5793 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
5794