1 /*
2 **************************************************************************
3 * Copyright (C) 2002-2015 International Business Machines Corporation *
4 * and others. All rights reserved. *
5 **************************************************************************
6 */
7 //
8 // file: rematch.cpp
9 //
10 // Contains the implementation of class RegexMatcher,
11 // which is one of the main API classes for the ICU regular expression package.
12 //
13
14 #include "unicode/utypes.h"
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16
17 #include "unicode/regex.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/ustring.h"
21 #include "unicode/rbbi.h"
22 #include "unicode/utf.h"
23 #include "unicode/utf16.h"
24 #include "uassert.h"
25 #include "cmemory.h"
26 #include "uvector.h"
27 #include "uvectr32.h"
28 #include "uvectr64.h"
29 #include "regeximp.h"
30 #include "regexst.h"
31 #include "regextxt.h"
32 #include "ucase.h"
33
34 // #include <malloc.h> // Needed for heapcheck testing
35
36 U_NAMESPACE_BEGIN
37
// Default limit for the size of the backtrack stack. Capping it avoids system
// failures caused by heap exhaustion. Units are in 32 bit words, not bytes.
// This value puts ICU's limit higher than most other regexp implementations,
// which use recursion rather than the heap, and take more storage per
// backtrack point.
//
static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8 * 1000 * 1000;
45
// Time limit counter constant.
//   Time limits for expression evaluation are expressed in quanta of work done
//   by the engine, each of which is 10,000 state saves.
//   This constant is that number of state saves per tick.
static const int32_t TIMER_INITIAL_VALUE = 10 * 1000;
51
52
53 // Test for any of the Unicode line terminating characters.
isLineTerminator(UChar32 c)54 static inline UBool isLineTerminator(UChar32 c) {
55 if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
56 return false;
57 }
58 return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
59 }
60
61 //-----------------------------------------------------------------------------
62 //
63 // Constructor and Destructor
64 //
65 //-----------------------------------------------------------------------------
RegexMatcher(const RegexPattern * pat)66 RegexMatcher::RegexMatcher(const RegexPattern *pat) {
67 fDeferredStatus = U_ZERO_ERROR;
68 init(fDeferredStatus);
69 if (U_FAILURE(fDeferredStatus)) {
70 return;
71 }
72 if (pat==NULL) {
73 fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
74 return;
75 }
76 fPattern = pat;
77 init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
78 }
79
80
81
RegexMatcher(const UnicodeString & regexp,const UnicodeString & input,uint32_t flags,UErrorCode & status)82 RegexMatcher::RegexMatcher(const UnicodeString ®exp, const UnicodeString &input,
83 uint32_t flags, UErrorCode &status) {
84 init(status);
85 if (U_FAILURE(status)) {
86 return;
87 }
88 UParseError pe;
89 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
90 fPattern = fPatternOwned;
91
92 UText inputText = UTEXT_INITIALIZER;
93 utext_openConstUnicodeString(&inputText, &input, &status);
94 init2(&inputText, status);
95 utext_close(&inputText);
96
97 fInputUniStrMaybeMutable = TRUE;
98 }
99
100
RegexMatcher(UText * regexp,UText * input,uint32_t flags,UErrorCode & status)101 RegexMatcher::RegexMatcher(UText *regexp, UText *input,
102 uint32_t flags, UErrorCode &status) {
103 init(status);
104 if (U_FAILURE(status)) {
105 return;
106 }
107 UParseError pe;
108 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
109 if (U_FAILURE(status)) {
110 return;
111 }
112
113 fPattern = fPatternOwned;
114 init2(input, status);
115 }
116
117
RegexMatcher(const UnicodeString & regexp,uint32_t flags,UErrorCode & status)118 RegexMatcher::RegexMatcher(const UnicodeString ®exp,
119 uint32_t flags, UErrorCode &status) {
120 init(status);
121 if (U_FAILURE(status)) {
122 return;
123 }
124 UParseError pe;
125 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
126 if (U_FAILURE(status)) {
127 return;
128 }
129 fPattern = fPatternOwned;
130 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
131 }
132
RegexMatcher(UText * regexp,uint32_t flags,UErrorCode & status)133 RegexMatcher::RegexMatcher(UText *regexp,
134 uint32_t flags, UErrorCode &status) {
135 init(status);
136 if (U_FAILURE(status)) {
137 return;
138 }
139 UParseError pe;
140 fPatternOwned = RegexPattern::compile(regexp, flags, pe, status);
141 if (U_FAILURE(status)) {
142 return;
143 }
144
145 fPattern = fPatternOwned;
146 init2(RegexStaticSets::gStaticSets->fEmptyText, status);
147 }
148
149
150
151
~RegexMatcher()152 RegexMatcher::~RegexMatcher() {
153 delete fStack;
154 if (fData != fSmallData) {
155 uprv_free(fData);
156 fData = NULL;
157 }
158 if (fPatternOwned) {
159 delete fPatternOwned;
160 fPatternOwned = NULL;
161 fPattern = NULL;
162 }
163
164 if (fInput) {
165 delete fInput;
166 }
167 if (fInputText) {
168 utext_close(fInputText);
169 }
170 if (fAltInputText) {
171 utext_close(fAltInputText);
172 }
173
174 #if UCONFIG_NO_BREAK_ITERATION==0
175 delete fWordBreakItr;
176 #endif
177 }
178
179 //
180 // init() common initialization for use by all constructors.
181 // Initialize all fields, get the object into a consistent state.
182 // This must be done even when the initial status shows an error,
183 // so that the object is initialized sufficiently well for the destructor
184 // to run safely.
185 //
init(UErrorCode & status)186 void RegexMatcher::init(UErrorCode &status) {
187 fPattern = NULL;
188 fPatternOwned = NULL;
189 fFrameSize = 0;
190 fRegionStart = 0;
191 fRegionLimit = 0;
192 fAnchorStart = 0;
193 fAnchorLimit = 0;
194 fLookStart = 0;
195 fLookLimit = 0;
196 fActiveStart = 0;
197 fActiveLimit = 0;
198 fTransparentBounds = FALSE;
199 fAnchoringBounds = TRUE;
200 fMatch = FALSE;
201 fMatchStart = 0;
202 fMatchEnd = 0;
203 fLastMatchEnd = -1;
204 fAppendPosition = 0;
205 fHitEnd = FALSE;
206 fRequireEnd = FALSE;
207 fStack = NULL;
208 fFrame = NULL;
209 fTimeLimit = 0;
210 fTime = 0;
211 fTickCounter = 0;
212 fStackLimit = DEFAULT_BACKTRACK_STACK_CAPACITY;
213 fCallbackFn = NULL;
214 fCallbackContext = NULL;
215 fFindProgressCallbackFn = NULL;
216 fFindProgressCallbackContext = NULL;
217 fTraceDebug = FALSE;
218 fDeferredStatus = status;
219 fData = fSmallData;
220 fWordBreakItr = NULL;
221
222 fStack = NULL;
223 fInputText = NULL;
224 fAltInputText = NULL;
225 fInput = NULL;
226 fInputLength = 0;
227 fInputUniStrMaybeMutable = FALSE;
228
229 if (U_FAILURE(status)) {
230 fDeferredStatus = status;
231 }
232 }
233
234 //
235 // init2() Common initialization for use by RegexMatcher constructors, part 2.
236 // This handles the common setup to be done after the Pattern is available.
237 //
init2(UText * input,UErrorCode & status)238 void RegexMatcher::init2(UText *input, UErrorCode &status) {
239 if (U_FAILURE(status)) {
240 fDeferredStatus = status;
241 return;
242 }
243
244 if (fPattern->fDataSize > (int32_t)(sizeof(fSmallData)/sizeof(fSmallData[0]))) {
245 fData = (int64_t *)uprv_malloc(fPattern->fDataSize * sizeof(int64_t));
246 if (fData == NULL) {
247 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
248 return;
249 }
250 }
251
252 fStack = new UVector64(status);
253 if (fStack == NULL) {
254 status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
255 return;
256 }
257
258 reset(input);
259 setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
260 if (U_FAILURE(status)) {
261 fDeferredStatus = status;
262 return;
263 }
264 }
265
266
267 static const UChar BACKSLASH = 0x5c;
268 static const UChar DOLLARSIGN = 0x24;
269 static const UChar LEFTBRACKET = 0x7b;
270 static const UChar RIGHTBRACKET = 0x7d;
271
272 //--------------------------------------------------------------------------------
273 //
274 // appendReplacement
275 //
276 //--------------------------------------------------------------------------------
appendReplacement(UnicodeString & dest,const UnicodeString & replacement,UErrorCode & status)277 RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
278 const UnicodeString &replacement,
279 UErrorCode &status) {
280 UText replacementText = UTEXT_INITIALIZER;
281
282 utext_openConstUnicodeString(&replacementText, &replacement, &status);
283 if (U_SUCCESS(status)) {
284 UText resultText = UTEXT_INITIALIZER;
285 utext_openUnicodeString(&resultText, &dest, &status);
286
287 if (U_SUCCESS(status)) {
288 appendReplacement(&resultText, &replacementText, status);
289 utext_close(&resultText);
290 }
291 utext_close(&replacementText);
292 }
293
294 return *this;
295 }
296
297 //
298 // appendReplacement, UText mode
299 //
appendReplacement(UText * dest,UText * replacement,UErrorCode & status)300 RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
301 UText *replacement,
302 UErrorCode &status) {
303 if (U_FAILURE(status)) {
304 return *this;
305 }
306 if (U_FAILURE(fDeferredStatus)) {
307 status = fDeferredStatus;
308 return *this;
309 }
310 if (fMatch == FALSE) {
311 status = U_REGEX_INVALID_STATE;
312 return *this;
313 }
314
315 // Copy input string from the end of previous match to start of current match
316 int64_t destLen = utext_nativeLength(dest);
317 if (fMatchStart > fAppendPosition) {
318 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
319 destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
320 (int32_t)(fMatchStart-fAppendPosition), &status);
321 } else {
322 int32_t len16;
323 if (UTEXT_USES_U16(fInputText)) {
324 len16 = (int32_t)(fMatchStart-fAppendPosition);
325 } else {
326 UErrorCode lengthStatus = U_ZERO_ERROR;
327 len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
328 }
329 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
330 if (inputChars == NULL) {
331 status = U_MEMORY_ALLOCATION_ERROR;
332 return *this;
333 }
334 utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
335 destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
336 uprv_free(inputChars);
337 }
338 }
339 fAppendPosition = fMatchEnd;
340
341
342 // scan the replacement text, looking for substitutions ($n) and \escapes.
343 // TODO: optimize this loop by efficiently scanning for '$' or '\',
344 // move entire ranges not containing substitutions.
345 UTEXT_SETNATIVEINDEX(replacement, 0);
346 for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL; c = UTEXT_NEXT32(replacement)) {
347 if (c == BACKSLASH) {
348 // Backslash Escape. Copy the following char out without further checks.
349 // Note: Surrogate pairs don't need any special handling
350 // The second half wont be a '$' or a '\', and
351 // will move to the dest normally on the next
352 // loop iteration.
353 c = UTEXT_CURRENT32(replacement);
354 if (c == U_SENTINEL) {
355 break;
356 }
357
358 if (c==0x55/*U*/ || c==0x75/*u*/) {
359 // We have a \udddd or \Udddddddd escape sequence.
360 int32_t offset = 0;
361 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
362 UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
363 if (escapedChar != (UChar32)0xFFFFFFFF) {
364 if (U_IS_BMP(escapedChar)) {
365 UChar c16 = (UChar)escapedChar;
366 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
367 } else {
368 UChar surrogate[2];
369 surrogate[0] = U16_LEAD(escapedChar);
370 surrogate[1] = U16_TRAIL(escapedChar);
371 if (U_SUCCESS(status)) {
372 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
373 }
374 }
375 // TODO: Report errors for mal-formed \u escapes?
376 // As this is, the original sequence is output, which may be OK.
377 if (context.lastOffset == offset) {
378 (void)UTEXT_PREVIOUS32(replacement);
379 } else if (context.lastOffset != offset-1) {
380 utext_moveIndex32(replacement, offset - context.lastOffset - 1);
381 }
382 }
383 } else {
384 (void)UTEXT_NEXT32(replacement);
385 // Plain backslash escape. Just put out the escaped character.
386 if (U_IS_BMP(c)) {
387 UChar c16 = (UChar)c;
388 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
389 } else {
390 UChar surrogate[2];
391 surrogate[0] = U16_LEAD(c);
392 surrogate[1] = U16_TRAIL(c);
393 if (U_SUCCESS(status)) {
394 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
395 }
396 }
397 }
398 } else if (c != DOLLARSIGN) {
399 // Normal char, not a $. Copy it out without further checks.
400 if (U_IS_BMP(c)) {
401 UChar c16 = (UChar)c;
402 destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
403 } else {
404 UChar surrogate[2];
405 surrogate[0] = U16_LEAD(c);
406 surrogate[1] = U16_TRAIL(c);
407 if (U_SUCCESS(status)) {
408 destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
409 }
410 }
411 } else {
412 // We've got a $. Pick up a capture group name or number if one follows.
413 // Consume digits so long as the resulting group number <= the number of
414 // number of capture groups in the pattern.
415
416 int32_t groupNum = 0;
417 int32_t numDigits = 0;
418 UChar32 nextChar = utext_current32(replacement);
419 if (nextChar == LEFTBRACKET) {
420 // Scan for a Named Capture Group, ${name}.
421 UnicodeString groupName;
422 utext_next32(replacement);
423 while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
424 nextChar = utext_next32(replacement);
425 if (nextChar == U_SENTINEL) {
426 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
427 } else if ((nextChar >= 0x41 && nextChar <= 0x5a) || // A..Z
428 (nextChar >= 0x61 && nextChar <= 0x7a) || // a..z
429 (nextChar >= 0x31 && nextChar <= 0x39)) { // 0..9
430 groupName.append(nextChar);
431 } else if (nextChar == RIGHTBRACKET) {
432 groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName);
433 if (groupNum == 0) {
434 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
435 }
436 } else {
437 // Character was something other than a name char or a closing '}'
438 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
439 }
440 }
441
442 } else if (u_isdigit(nextChar)) {
443 // $n Scan for a capture group number
444 int32_t numCaptureGroups = fPattern->fGroupMap->size();
445 for (;;) {
446 nextChar = UTEXT_CURRENT32(replacement);
447 if (nextChar == U_SENTINEL) {
448 break;
449 }
450 if (u_isdigit(nextChar) == FALSE) {
451 break;
452 }
453 int32_t nextDigitVal = u_charDigitValue(nextChar);
454 if (groupNum*10 + nextDigitVal > numCaptureGroups) {
455 // Don't consume the next digit if it makes the capture group number too big.
456 if (numDigits == 0) {
457 status = U_INDEX_OUTOFBOUNDS_ERROR;
458 }
459 break;
460 }
461 (void)UTEXT_NEXT32(replacement);
462 groupNum=groupNum*10 + nextDigitVal;
463 ++numDigits;
464 }
465 } else {
466 // $ not followed by capture group name or number.
467 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
468 }
469
470 if (U_SUCCESS(status)) {
471 destLen += appendGroup(groupNum, dest, status);
472 }
473 } // End of $ capture group handling
474 } // End of per-character loop through the replacement string.
475
476 return *this;
477 }
478
479
480
481 //--------------------------------------------------------------------------------
482 //
483 // appendTail Intended to be used in conjunction with appendReplacement()
484 // To the destination string, append everything following
485 // the last match position from the input string.
486 //
487 // Note: Match ranges do not affect appendTail or appendReplacement
488 //
489 //--------------------------------------------------------------------------------
appendTail(UnicodeString & dest)490 UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
491 UErrorCode status = U_ZERO_ERROR;
492 UText resultText = UTEXT_INITIALIZER;
493 utext_openUnicodeString(&resultText, &dest, &status);
494
495 if (U_SUCCESS(status)) {
496 appendTail(&resultText, status);
497 utext_close(&resultText);
498 }
499
500 return dest;
501 }
502
503 //
504 // appendTail, UText mode
505 //
appendTail(UText * dest,UErrorCode & status)506 UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
507 if (U_FAILURE(status)) {
508 return dest;
509 }
510 if (U_FAILURE(fDeferredStatus)) {
511 status = fDeferredStatus;
512 return dest;
513 }
514
515 if (fInputLength > fAppendPosition) {
516 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
517 int64_t destLen = utext_nativeLength(dest);
518 utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
519 (int32_t)(fInputLength-fAppendPosition), &status);
520 } else {
521 int32_t len16;
522 if (UTEXT_USES_U16(fInputText)) {
523 len16 = (int32_t)(fInputLength-fAppendPosition);
524 } else {
525 len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &status);
526 status = U_ZERO_ERROR; // buffer overflow
527 }
528
529 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
530 if (inputChars == NULL) {
531 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
532 } else {
533 utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
534 int64_t destLen = utext_nativeLength(dest);
535 utext_replace(dest, destLen, destLen, inputChars, len16, &status);
536 uprv_free(inputChars);
537 }
538 }
539 }
540 return dest;
541 }
542
543
544
545 //--------------------------------------------------------------------------------
546 //
547 // end
548 //
549 //--------------------------------------------------------------------------------
end(UErrorCode & err) const550 int32_t RegexMatcher::end(UErrorCode &err) const {
551 return end(0, err);
552 }
553
end64(UErrorCode & err) const554 int64_t RegexMatcher::end64(UErrorCode &err) const {
555 return end64(0, err);
556 }
557
end64(int32_t group,UErrorCode & err) const558 int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
559 if (U_FAILURE(err)) {
560 return -1;
561 }
562 if (fMatch == FALSE) {
563 err = U_REGEX_INVALID_STATE;
564 return -1;
565 }
566 if (group < 0 || group > fPattern->fGroupMap->size()) {
567 err = U_INDEX_OUTOFBOUNDS_ERROR;
568 return -1;
569 }
570 int64_t e = -1;
571 if (group == 0) {
572 e = fMatchEnd;
573 } else {
574 // Get the position within the stack frame of the variables for
575 // this capture group.
576 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
577 U_ASSERT(groupOffset < fPattern->fFrameSize);
578 U_ASSERT(groupOffset >= 0);
579 e = fFrame->fExtra[groupOffset + 1];
580 }
581
582 return e;
583 }
584
end(int32_t group,UErrorCode & err) const585 int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
586 return (int32_t)end64(group, err);
587 }
588
589 //--------------------------------------------------------------------------------
590 //
591 // findProgressInterrupt This function is called once for each advance in the target
592 // string from the find() function, and calls the user progress callback
593 // function if there is one installed.
594 //
595 // Return: TRUE if the find operation is to be terminated.
596 // FALSE if the find operation is to continue running.
597 //
598 //--------------------------------------------------------------------------------
findProgressInterrupt(int64_t pos,UErrorCode & status)599 UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
600 if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
601 status = U_REGEX_STOPPED_BY_CALLER;
602 return TRUE;
603 }
604 return FALSE;
605 }
606
607 //--------------------------------------------------------------------------------
608 //
609 // find()
610 //
611 //--------------------------------------------------------------------------------
find()612 UBool RegexMatcher::find() {
613 if (U_FAILURE(fDeferredStatus)) {
614 return FALSE;
615 }
616 UErrorCode status = U_ZERO_ERROR;
617 UBool result = find(status);
618 return result;
619 }
620
621 //--------------------------------------------------------------------------------
622 //
623 // find()
624 //
625 //--------------------------------------------------------------------------------
find(UErrorCode & status)626 UBool RegexMatcher::find(UErrorCode &status) {
627 // Start at the position of the last match end. (Will be zero if the
628 // matcher has been reset.)
629 //
630 if (U_FAILURE(status)) {
631 return FALSE;
632 }
633 if (U_FAILURE(fDeferredStatus)) {
634 status = fDeferredStatus;
635 return FALSE;
636 }
637
638 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
639 return findUsingChunk(status);
640 }
641
642 int64_t startPos = fMatchEnd;
643 if (startPos==0) {
644 startPos = fActiveStart;
645 }
646
647 if (fMatch) {
648 // Save the position of any previous successful match.
649 fLastMatchEnd = fMatchEnd;
650
651 if (fMatchStart == fMatchEnd) {
652 // Previous match had zero length. Move start position up one position
653 // to avoid sending find() into a loop on zero-length matches.
654 if (startPos >= fActiveLimit) {
655 fMatch = FALSE;
656 fHitEnd = TRUE;
657 return FALSE;
658 }
659 UTEXT_SETNATIVEINDEX(fInputText, startPos);
660 (void)UTEXT_NEXT32(fInputText);
661 startPos = UTEXT_GETNATIVEINDEX(fInputText);
662 }
663 } else {
664 if (fLastMatchEnd >= 0) {
665 // A previous find() failed to match. Don't try again.
666 // (without this test, a pattern with a zero-length match
667 // could match again at the end of an input string.)
668 fHitEnd = TRUE;
669 return FALSE;
670 }
671 }
672
673
674 // Compute the position in the input string beyond which a match can not begin, because
675 // the minimum length match would extend past the end of the input.
676 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
677 // Be aware of possible overflows if making changes here.
678 int64_t testStartLimit;
679 if (UTEXT_USES_U16(fInputText)) {
680 testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
681 if (startPos > testStartLimit) {
682 fMatch = FALSE;
683 fHitEnd = TRUE;
684 return FALSE;
685 }
686 } else {
687 // We don't know exactly how long the minimum match length is in native characters.
688 // Treat anything > 0 as 1.
689 testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
690 }
691
692 UChar32 c;
693 U_ASSERT(startPos >= 0);
694
695 switch (fPattern->fStartType) {
696 case START_NO_INFO:
697 // No optimization was found.
698 // Try a match at each input position.
699 for (;;) {
700 MatchAt(startPos, FALSE, status);
701 if (U_FAILURE(status)) {
702 return FALSE;
703 }
704 if (fMatch) {
705 return TRUE;
706 }
707 if (startPos >= testStartLimit) {
708 fHitEnd = TRUE;
709 return FALSE;
710 }
711 UTEXT_SETNATIVEINDEX(fInputText, startPos);
712 (void)UTEXT_NEXT32(fInputText);
713 startPos = UTEXT_GETNATIVEINDEX(fInputText);
714 // Note that it's perfectly OK for a pattern to have a zero-length
715 // match at the end of a string, so we must make sure that the loop
716 // runs with startPos == testStartLimit the last time through.
717 if (findProgressInterrupt(startPos, status))
718 return FALSE;
719 }
720 U_ASSERT(FALSE);
721
722 case START_START:
723 // Matches are only possible at the start of the input string
724 // (pattern begins with ^ or \A)
725 if (startPos > fActiveStart) {
726 fMatch = FALSE;
727 return FALSE;
728 }
729 MatchAt(startPos, FALSE, status);
730 if (U_FAILURE(status)) {
731 return FALSE;
732 }
733 return fMatch;
734
735
736 case START_SET:
737 {
738 // Match may start on any char from a pre-computed set.
739 U_ASSERT(fPattern->fMinMatchLen > 0);
740 UTEXT_SETNATIVEINDEX(fInputText, startPos);
741 for (;;) {
742 int64_t pos = startPos;
743 c = UTEXT_NEXT32(fInputText);
744 startPos = UTEXT_GETNATIVEINDEX(fInputText);
745 // c will be -1 (U_SENTINEL) at end of text, in which case we
746 // skip this next block (so we don't have a negative array index)
747 // and handle end of text in the following block.
748 if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
749 (c>=256 && fPattern->fInitialChars->contains(c)))) {
750 MatchAt(pos, FALSE, status);
751 if (U_FAILURE(status)) {
752 return FALSE;
753 }
754 if (fMatch) {
755 return TRUE;
756 }
757 UTEXT_SETNATIVEINDEX(fInputText, pos);
758 }
759 if (startPos > testStartLimit) {
760 fMatch = FALSE;
761 fHitEnd = TRUE;
762 return FALSE;
763 }
764 if (findProgressInterrupt(startPos, status))
765 return FALSE;
766 }
767 }
768 U_ASSERT(FALSE);
769
770 case START_STRING:
771 case START_CHAR:
772 {
773 // Match starts on exactly one char.
774 U_ASSERT(fPattern->fMinMatchLen > 0);
775 UChar32 theChar = fPattern->fInitialChar;
776 UTEXT_SETNATIVEINDEX(fInputText, startPos);
777 for (;;) {
778 int64_t pos = startPos;
779 c = UTEXT_NEXT32(fInputText);
780 startPos = UTEXT_GETNATIVEINDEX(fInputText);
781 if (c == theChar) {
782 MatchAt(pos, FALSE, status);
783 if (U_FAILURE(status)) {
784 return FALSE;
785 }
786 if (fMatch) {
787 return TRUE;
788 }
789 UTEXT_SETNATIVEINDEX(fInputText, pos);
790 }
791 if (startPos > testStartLimit) {
792 fMatch = FALSE;
793 fHitEnd = TRUE;
794 return FALSE;
795 }
796 if (findProgressInterrupt(startPos, status))
797 return FALSE;
798 }
799 }
800 U_ASSERT(FALSE);
801
802 case START_LINE:
803 {
804 UChar32 c;
805 if (startPos == fAnchorStart) {
806 MatchAt(startPos, FALSE, status);
807 if (U_FAILURE(status)) {
808 return FALSE;
809 }
810 if (fMatch) {
811 return TRUE;
812 }
813 UTEXT_SETNATIVEINDEX(fInputText, startPos);
814 c = UTEXT_NEXT32(fInputText);
815 startPos = UTEXT_GETNATIVEINDEX(fInputText);
816 } else {
817 UTEXT_SETNATIVEINDEX(fInputText, startPos);
818 c = UTEXT_PREVIOUS32(fInputText);
819 UTEXT_SETNATIVEINDEX(fInputText, startPos);
820 }
821
822 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
823 for (;;) {
824 if (c == 0x0a) {
825 MatchAt(startPos, FALSE, status);
826 if (U_FAILURE(status)) {
827 return FALSE;
828 }
829 if (fMatch) {
830 return TRUE;
831 }
832 UTEXT_SETNATIVEINDEX(fInputText, startPos);
833 }
834 if (startPos >= testStartLimit) {
835 fMatch = FALSE;
836 fHitEnd = TRUE;
837 return FALSE;
838 }
839 c = UTEXT_NEXT32(fInputText);
840 startPos = UTEXT_GETNATIVEINDEX(fInputText);
841 // Note that it's perfectly OK for a pattern to have a zero-length
842 // match at the end of a string, so we must make sure that the loop
843 // runs with startPos == testStartLimit the last time through.
844 if (findProgressInterrupt(startPos, status))
845 return FALSE;
846 }
847 } else {
848 for (;;) {
849 if (isLineTerminator(c)) {
850 if (c == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
851 (void)UTEXT_NEXT32(fInputText);
852 startPos = UTEXT_GETNATIVEINDEX(fInputText);
853 }
854 MatchAt(startPos, FALSE, status);
855 if (U_FAILURE(status)) {
856 return FALSE;
857 }
858 if (fMatch) {
859 return TRUE;
860 }
861 UTEXT_SETNATIVEINDEX(fInputText, startPos);
862 }
863 if (startPos >= testStartLimit) {
864 fMatch = FALSE;
865 fHitEnd = TRUE;
866 return FALSE;
867 }
868 c = UTEXT_NEXT32(fInputText);
869 startPos = UTEXT_GETNATIVEINDEX(fInputText);
870 // Note that it's perfectly OK for a pattern to have a zero-length
871 // match at the end of a string, so we must make sure that the loop
872 // runs with startPos == testStartLimit the last time through.
873 if (findProgressInterrupt(startPos, status))
874 return FALSE;
875 }
876 }
877 }
878
879 default:
880 U_ASSERT(FALSE);
881 }
882
883 U_ASSERT(FALSE);
884 return FALSE;
885 }
886
887
888
find(int64_t start,UErrorCode & status)889 UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
890 if (U_FAILURE(status)) {
891 return FALSE;
892 }
893 if (U_FAILURE(fDeferredStatus)) {
894 status = fDeferredStatus;
895 return FALSE;
896 }
897 this->reset(); // Note: Reset() is specified by Java Matcher documentation.
898 // This will reset the region to be the full input length.
899 if (start < 0) {
900 status = U_INDEX_OUTOFBOUNDS_ERROR;
901 return FALSE;
902 }
903
904 int64_t nativeStart = start;
905 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
906 status = U_INDEX_OUTOFBOUNDS_ERROR;
907 return FALSE;
908 }
909 fMatchEnd = nativeStart;
910 return find(status);
911 }
912
913
914 //--------------------------------------------------------------------------------
915 //
916 // findUsingChunk() -- like find(), but with the advance knowledge that the
917 // entire string is available in the UText's chunk buffer.
918 //
919 //--------------------------------------------------------------------------------
findUsingChunk(UErrorCode & status)920 UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
921 // Start at the position of the last match end. (Will be zero if the
922 // matcher has been reset.
923 //
924
925 int32_t startPos = (int32_t)fMatchEnd;
926 if (startPos==0) {
927 startPos = (int32_t)fActiveStart;
928 }
929
930 const UChar *inputBuf = fInputText->chunkContents;
931
932 if (fMatch) {
933 // Save the position of any previous successful match.
934 fLastMatchEnd = fMatchEnd;
935
936 if (fMatchStart == fMatchEnd) {
937 // Previous match had zero length. Move start position up one position
938 // to avoid sending find() into a loop on zero-length matches.
939 if (startPos >= fActiveLimit) {
940 fMatch = FALSE;
941 fHitEnd = TRUE;
942 return FALSE;
943 }
944 U16_FWD_1(inputBuf, startPos, fInputLength);
945 }
946 } else {
947 if (fLastMatchEnd >= 0) {
948 // A previous find() failed to match. Don't try again.
949 // (without this test, a pattern with a zero-length match
950 // could match again at the end of an input string.)
951 fHitEnd = TRUE;
952 return FALSE;
953 }
954 }
955
956
957 // Compute the position in the input string beyond which a match can not begin, because
958 // the minimum length match would extend past the end of the input.
959 // Note: some patterns that cannot match anything will have fMinMatchLength==Max Int.
960 // Be aware of possible overflows if making changes here.
961 // Note: a match can begin at inputBuf + testLen; it is an inclusive limit.
962 int32_t testLen = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
963 if (startPos > testLen) {
964 fMatch = FALSE;
965 fHitEnd = TRUE;
966 return FALSE;
967 }
968
969 UChar32 c;
970 U_ASSERT(startPos >= 0);
971
972 switch (fPattern->fStartType) {
973 case START_NO_INFO:
974 // No optimization was found.
975 // Try a match at each input position.
976 for (;;) {
977 MatchChunkAt(startPos, FALSE, status);
978 if (U_FAILURE(status)) {
979 return FALSE;
980 }
981 if (fMatch) {
982 return TRUE;
983 }
984 if (startPos >= testLen) {
985 fHitEnd = TRUE;
986 return FALSE;
987 }
988 U16_FWD_1(inputBuf, startPos, fActiveLimit);
989 // Note that it's perfectly OK for a pattern to have a zero-length
990 // match at the end of a string, so we must make sure that the loop
991 // runs with startPos == testLen the last time through.
992 if (findProgressInterrupt(startPos, status))
993 return FALSE;
994 }
995 U_ASSERT(FALSE);
996
997 case START_START:
998 // Matches are only possible at the start of the input string
999 // (pattern begins with ^ or \A)
1000 if (startPos > fActiveStart) {
1001 fMatch = FALSE;
1002 return FALSE;
1003 }
1004 MatchChunkAt(startPos, FALSE, status);
1005 if (U_FAILURE(status)) {
1006 return FALSE;
1007 }
1008 return fMatch;
1009
1010
1011 case START_SET:
1012 {
1013 // Match may start on any char from a pre-computed set.
1014 U_ASSERT(fPattern->fMinMatchLen > 0);
1015 for (;;) {
1016 int32_t pos = startPos;
1017 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1018 if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
1019 (c>=256 && fPattern->fInitialChars->contains(c))) {
1020 MatchChunkAt(pos, FALSE, status);
1021 if (U_FAILURE(status)) {
1022 return FALSE;
1023 }
1024 if (fMatch) {
1025 return TRUE;
1026 }
1027 }
1028 if (startPos > testLen) {
1029 fMatch = FALSE;
1030 fHitEnd = TRUE;
1031 return FALSE;
1032 }
1033 if (findProgressInterrupt(startPos, status))
1034 return FALSE;
1035 }
1036 }
1037 U_ASSERT(FALSE);
1038
1039 case START_STRING:
1040 case START_CHAR:
1041 {
1042 // Match starts on exactly one char.
1043 U_ASSERT(fPattern->fMinMatchLen > 0);
1044 UChar32 theChar = fPattern->fInitialChar;
1045 for (;;) {
1046 int32_t pos = startPos;
1047 U16_NEXT(inputBuf, startPos, fActiveLimit, c); // like c = inputBuf[startPos++];
1048 if (c == theChar) {
1049 MatchChunkAt(pos, FALSE, status);
1050 if (U_FAILURE(status)) {
1051 return FALSE;
1052 }
1053 if (fMatch) {
1054 return TRUE;
1055 }
1056 }
1057 if (startPos > testLen) {
1058 fMatch = FALSE;
1059 fHitEnd = TRUE;
1060 return FALSE;
1061 }
1062 if (findProgressInterrupt(startPos, status))
1063 return FALSE;
1064 }
1065 }
1066 U_ASSERT(FALSE);
1067
1068 case START_LINE:
1069 {
1070 UChar32 c;
1071 if (startPos == fAnchorStart) {
1072 MatchChunkAt(startPos, FALSE, status);
1073 if (U_FAILURE(status)) {
1074 return FALSE;
1075 }
1076 if (fMatch) {
1077 return TRUE;
1078 }
1079 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1080 }
1081
1082 if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1083 for (;;) {
1084 c = inputBuf[startPos-1];
1085 if (c == 0x0a) {
1086 MatchChunkAt(startPos, FALSE, status);
1087 if (U_FAILURE(status)) {
1088 return FALSE;
1089 }
1090 if (fMatch) {
1091 return TRUE;
1092 }
1093 }
1094 if (startPos >= testLen) {
1095 fMatch = FALSE;
1096 fHitEnd = TRUE;
1097 return FALSE;
1098 }
1099 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1100 // Note that it's perfectly OK for a pattern to have a zero-length
1101 // match at the end of a string, so we must make sure that the loop
1102 // runs with startPos == testLen the last time through.
1103 if (findProgressInterrupt(startPos, status))
1104 return FALSE;
1105 }
1106 } else {
1107 for (;;) {
1108 c = inputBuf[startPos-1];
1109 if (isLineTerminator(c)) {
1110 if (c == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1111 startPos++;
1112 }
1113 MatchChunkAt(startPos, FALSE, status);
1114 if (U_FAILURE(status)) {
1115 return FALSE;
1116 }
1117 if (fMatch) {
1118 return TRUE;
1119 }
1120 }
1121 if (startPos >= testLen) {
1122 fMatch = FALSE;
1123 fHitEnd = TRUE;
1124 return FALSE;
1125 }
1126 U16_FWD_1(inputBuf, startPos, fActiveLimit);
1127 // Note that it's perfectly OK for a pattern to have a zero-length
1128 // match at the end of a string, so we must make sure that the loop
1129 // runs with startPos == testLen the last time through.
1130 if (findProgressInterrupt(startPos, status))
1131 return FALSE;
1132 }
1133 }
1134 }
1135
1136 default:
1137 U_ASSERT(FALSE);
1138 }
1139
1140 U_ASSERT(FALSE);
1141 return FALSE;
1142 }
1143
1144
1145
1146 //--------------------------------------------------------------------------------
1147 //
1148 // group()
1149 //
1150 //--------------------------------------------------------------------------------
group(UErrorCode & status) const1151 UnicodeString RegexMatcher::group(UErrorCode &status) const {
1152 return group(0, status);
1153 }
1154
1155 // Return immutable shallow clone
group(UText * dest,int64_t & group_len,UErrorCode & status) const1156 UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1157 return group(0, dest, group_len, status);
1158 }
1159
1160 // Return immutable shallow clone
group(int32_t groupNum,UText * dest,int64_t & group_len,UErrorCode & status) const1161 UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1162 group_len = 0;
1163 if (U_FAILURE(status)) {
1164 return dest;
1165 }
1166 if (U_FAILURE(fDeferredStatus)) {
1167 status = fDeferredStatus;
1168 } else if (fMatch == FALSE) {
1169 status = U_REGEX_INVALID_STATE;
1170 } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1171 status = U_INDEX_OUTOFBOUNDS_ERROR;
1172 }
1173
1174 if (U_FAILURE(status)) {
1175 return dest;
1176 }
1177
1178 int64_t s, e;
1179 if (groupNum == 0) {
1180 s = fMatchStart;
1181 e = fMatchEnd;
1182 } else {
1183 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1184 U_ASSERT(groupOffset < fPattern->fFrameSize);
1185 U_ASSERT(groupOffset >= 0);
1186 s = fFrame->fExtra[groupOffset];
1187 e = fFrame->fExtra[groupOffset+1];
1188 }
1189
1190 if (s < 0) {
1191 // A capture group wasn't part of the match
1192 return utext_clone(dest, fInputText, FALSE, TRUE, &status);
1193 }
1194 U_ASSERT(s <= e);
1195 group_len = e - s;
1196
1197 dest = utext_clone(dest, fInputText, FALSE, TRUE, &status);
1198 if (dest)
1199 UTEXT_SETNATIVEINDEX(dest, s);
1200 return dest;
1201 }
1202
group(int32_t groupNum,UErrorCode & status) const1203 UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1204 UnicodeString result;
1205 int64_t groupStart = start64(groupNum, status);
1206 int64_t groupEnd = end64(groupNum, status);
1207 if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
1208 return result;
1209 }
1210
1211 // Get the group length using a utext_extract preflight.
1212 // UText is actually pretty efficient at this when underlying encoding is UTF-16.
1213 int32_t length = utext_extract(fInputText, groupStart, groupEnd, NULL, 0, &status);
1214 if (status != U_BUFFER_OVERFLOW_ERROR) {
1215 return result;
1216 }
1217
1218 status = U_ZERO_ERROR;
1219 UChar *buf = result.getBuffer(length);
1220 if (buf == NULL) {
1221 status = U_MEMORY_ALLOCATION_ERROR;
1222 } else {
1223 int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
1224 result.releaseBuffer(extractLength);
1225 U_ASSERT(length == extractLength);
1226 }
1227 return result;
1228 }
1229
1230
1231 //--------------------------------------------------------------------------------
1232 //
1233 // appendGroup() -- currently internal only, appends a group to a UText rather
1234 // than replacing its contents
1235 //
1236 //--------------------------------------------------------------------------------
1237
appendGroup(int32_t groupNum,UText * dest,UErrorCode & status) const1238 int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1239 if (U_FAILURE(status)) {
1240 return 0;
1241 }
1242 if (U_FAILURE(fDeferredStatus)) {
1243 status = fDeferredStatus;
1244 return 0;
1245 }
1246 int64_t destLen = utext_nativeLength(dest);
1247
1248 if (fMatch == FALSE) {
1249 status = U_REGEX_INVALID_STATE;
1250 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1251 }
1252 if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1253 status = U_INDEX_OUTOFBOUNDS_ERROR;
1254 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1255 }
1256
1257 int64_t s, e;
1258 if (groupNum == 0) {
1259 s = fMatchStart;
1260 e = fMatchEnd;
1261 } else {
1262 int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1263 U_ASSERT(groupOffset < fPattern->fFrameSize);
1264 U_ASSERT(groupOffset >= 0);
1265 s = fFrame->fExtra[groupOffset];
1266 e = fFrame->fExtra[groupOffset+1];
1267 }
1268
1269 if (s < 0) {
1270 // A capture group wasn't part of the match
1271 return utext_replace(dest, destLen, destLen, NULL, 0, &status);
1272 }
1273 U_ASSERT(s <= e);
1274
1275 int64_t deltaLen;
1276 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1277 U_ASSERT(e <= fInputLength);
1278 deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents+s, (int32_t)(e-s), &status);
1279 } else {
1280 int32_t len16;
1281 if (UTEXT_USES_U16(fInputText)) {
1282 len16 = (int32_t)(e-s);
1283 } else {
1284 UErrorCode lengthStatus = U_ZERO_ERROR;
1285 len16 = utext_extract(fInputText, s, e, NULL, 0, &lengthStatus);
1286 }
1287 UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
1288 if (groupChars == NULL) {
1289 status = U_MEMORY_ALLOCATION_ERROR;
1290 return 0;
1291 }
1292 utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1293
1294 deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1295 uprv_free(groupChars);
1296 }
1297 return deltaLen;
1298 }
1299
1300
1301
1302 //--------------------------------------------------------------------------------
1303 //
1304 // groupCount()
1305 //
1306 //--------------------------------------------------------------------------------
groupCount() const1307 int32_t RegexMatcher::groupCount() const {
1308 return fPattern->fGroupMap->size();
1309 }
1310
1311 //--------------------------------------------------------------------------------
1312 //
1313 // hasAnchoringBounds()
1314 //
1315 //--------------------------------------------------------------------------------
hasAnchoringBounds() const1316 UBool RegexMatcher::hasAnchoringBounds() const {
1317 return fAnchoringBounds;
1318 }
1319
1320
1321 //--------------------------------------------------------------------------------
1322 //
1323 // hasTransparentBounds()
1324 //
1325 //--------------------------------------------------------------------------------
hasTransparentBounds() const1326 UBool RegexMatcher::hasTransparentBounds() const {
1327 return fTransparentBounds;
1328 }
1329
1330
1331
1332 //--------------------------------------------------------------------------------
1333 //
1334 // hitEnd()
1335 //
1336 //--------------------------------------------------------------------------------
hitEnd() const1337 UBool RegexMatcher::hitEnd() const {
1338 return fHitEnd;
1339 }
1340
1341
1342 //--------------------------------------------------------------------------------
1343 //
1344 // input()
1345 //
1346 //--------------------------------------------------------------------------------
input() const1347 const UnicodeString &RegexMatcher::input() const {
1348 if (!fInput) {
1349 UErrorCode status = U_ZERO_ERROR;
1350 int32_t len16;
1351 if (UTEXT_USES_U16(fInputText)) {
1352 len16 = (int32_t)fInputLength;
1353 } else {
1354 len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
1355 status = U_ZERO_ERROR; // overflow, length status
1356 }
1357 UnicodeString *result = new UnicodeString(len16, 0, 0);
1358
1359 UChar *inputChars = result->getBuffer(len16);
1360 utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1361 result->releaseBuffer(len16);
1362
1363 (*(const UnicodeString **)&fInput) = result; // pointer assignment, rather than operator=
1364 }
1365
1366 return *fInput;
1367 }
1368
1369 //--------------------------------------------------------------------------------
1370 //
1371 // inputText()
1372 //
1373 //--------------------------------------------------------------------------------
inputText() const1374 UText *RegexMatcher::inputText() const {
1375 return fInputText;
1376 }
1377
1378
1379 //--------------------------------------------------------------------------------
1380 //
1381 // getInput() -- like inputText(), but makes a clone or copies into another UText
1382 //
1383 //--------------------------------------------------------------------------------
getInput(UText * dest,UErrorCode & status) const1384 UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1385 if (U_FAILURE(status)) {
1386 return dest;
1387 }
1388 if (U_FAILURE(fDeferredStatus)) {
1389 status = fDeferredStatus;
1390 return dest;
1391 }
1392
1393 if (dest) {
1394 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1395 utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
1396 } else {
1397 int32_t input16Len;
1398 if (UTEXT_USES_U16(fInputText)) {
1399 input16Len = (int32_t)fInputLength;
1400 } else {
1401 UErrorCode lengthStatus = U_ZERO_ERROR;
1402 input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
1403 }
1404 UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
1405 if (inputChars == NULL) {
1406 return dest;
1407 }
1408
1409 status = U_ZERO_ERROR;
1410 utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1411 status = U_ZERO_ERROR;
1412 utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1413
1414 uprv_free(inputChars);
1415 }
1416 return dest;
1417 } else {
1418 return utext_clone(NULL, fInputText, FALSE, TRUE, &status);
1419 }
1420 }
1421
1422
1423 static UBool compat_SyncMutableUTextContents(UText *ut);
compat_SyncMutableUTextContents(UText * ut)1424 static UBool compat_SyncMutableUTextContents(UText *ut) {
1425 UBool retVal = FALSE;
1426
1427 // In the following test, we're really only interested in whether the UText should switch
1428 // between heap and stack allocation. If length hasn't changed, we won't, so the chunkContents
1429 // will still point to the correct data.
1430 if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1431 UnicodeString *us=(UnicodeString *)ut->context;
1432
1433 // Update to the latest length.
1434 // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1435 int32_t newLength = us->length();
1436
1437 // Update the chunk description.
1438 // The buffer may have switched between stack- and heap-based.
1439 ut->chunkContents = us->getBuffer();
1440 ut->chunkLength = newLength;
1441 ut->chunkNativeLimit = newLength;
1442 ut->nativeIndexingLimit = newLength;
1443 retVal = TRUE;
1444 }
1445
1446 return retVal;
1447 }
1448
1449 //--------------------------------------------------------------------------------
1450 //
1451 // lookingAt()
1452 //
1453 //--------------------------------------------------------------------------------
lookingAt(UErrorCode & status)1454 UBool RegexMatcher::lookingAt(UErrorCode &status) {
1455 if (U_FAILURE(status)) {
1456 return FALSE;
1457 }
1458 if (U_FAILURE(fDeferredStatus)) {
1459 status = fDeferredStatus;
1460 return FALSE;
1461 }
1462
1463 if (fInputUniStrMaybeMutable) {
1464 if (compat_SyncMutableUTextContents(fInputText)) {
1465 fInputLength = utext_nativeLength(fInputText);
1466 reset();
1467 }
1468 }
1469 else {
1470 resetPreserveRegion();
1471 }
1472 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1473 MatchChunkAt((int32_t)fActiveStart, FALSE, status);
1474 } else {
1475 MatchAt(fActiveStart, FALSE, status);
1476 }
1477 return fMatch;
1478 }
1479
1480
lookingAt(int64_t start,UErrorCode & status)1481 UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1482 if (U_FAILURE(status)) {
1483 return FALSE;
1484 }
1485 if (U_FAILURE(fDeferredStatus)) {
1486 status = fDeferredStatus;
1487 return FALSE;
1488 }
1489 reset();
1490
1491 if (start < 0) {
1492 status = U_INDEX_OUTOFBOUNDS_ERROR;
1493 return FALSE;
1494 }
1495
1496 if (fInputUniStrMaybeMutable) {
1497 if (compat_SyncMutableUTextContents(fInputText)) {
1498 fInputLength = utext_nativeLength(fInputText);
1499 reset();
1500 }
1501 }
1502
1503 int64_t nativeStart;
1504 nativeStart = start;
1505 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1506 status = U_INDEX_OUTOFBOUNDS_ERROR;
1507 return FALSE;
1508 }
1509
1510 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1511 MatchChunkAt((int32_t)nativeStart, FALSE, status);
1512 } else {
1513 MatchAt(nativeStart, FALSE, status);
1514 }
1515 return fMatch;
1516 }
1517
1518
1519
1520 //--------------------------------------------------------------------------------
1521 //
1522 // matches()
1523 //
1524 //--------------------------------------------------------------------------------
matches(UErrorCode & status)1525 UBool RegexMatcher::matches(UErrorCode &status) {
1526 if (U_FAILURE(status)) {
1527 return FALSE;
1528 }
1529 if (U_FAILURE(fDeferredStatus)) {
1530 status = fDeferredStatus;
1531 return FALSE;
1532 }
1533
1534 if (fInputUniStrMaybeMutable) {
1535 if (compat_SyncMutableUTextContents(fInputText)) {
1536 fInputLength = utext_nativeLength(fInputText);
1537 reset();
1538 }
1539 }
1540 else {
1541 resetPreserveRegion();
1542 }
1543
1544 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1545 MatchChunkAt((int32_t)fActiveStart, TRUE, status);
1546 } else {
1547 MatchAt(fActiveStart, TRUE, status);
1548 }
1549 return fMatch;
1550 }
1551
1552
matches(int64_t start,UErrorCode & status)1553 UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1554 if (U_FAILURE(status)) {
1555 return FALSE;
1556 }
1557 if (U_FAILURE(fDeferredStatus)) {
1558 status = fDeferredStatus;
1559 return FALSE;
1560 }
1561 reset();
1562
1563 if (start < 0) {
1564 status = U_INDEX_OUTOFBOUNDS_ERROR;
1565 return FALSE;
1566 }
1567
1568 if (fInputUniStrMaybeMutable) {
1569 if (compat_SyncMutableUTextContents(fInputText)) {
1570 fInputLength = utext_nativeLength(fInputText);
1571 reset();
1572 }
1573 }
1574
1575 int64_t nativeStart;
1576 nativeStart = start;
1577 if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1578 status = U_INDEX_OUTOFBOUNDS_ERROR;
1579 return FALSE;
1580 }
1581
1582 if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1583 MatchChunkAt((int32_t)nativeStart, TRUE, status);
1584 } else {
1585 MatchAt(nativeStart, TRUE, status);
1586 }
1587 return fMatch;
1588 }
1589
1590
1591
1592 //--------------------------------------------------------------------------------
1593 //
1594 // pattern
1595 //
1596 //--------------------------------------------------------------------------------
pattern() const1597 const RegexPattern &RegexMatcher::pattern() const {
1598 return *fPattern;
1599 }
1600
1601
1602
1603 //--------------------------------------------------------------------------------
1604 //
1605 // region
1606 //
1607 //--------------------------------------------------------------------------------
region(int64_t regionStart,int64_t regionLimit,int64_t startIndex,UErrorCode & status)1608 RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1609 if (U_FAILURE(status)) {
1610 return *this;
1611 }
1612
1613 if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1614 status = U_ILLEGAL_ARGUMENT_ERROR;
1615 }
1616
1617 int64_t nativeStart = regionStart;
1618 int64_t nativeLimit = regionLimit;
1619 if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1620 status = U_ILLEGAL_ARGUMENT_ERROR;
1621 }
1622
1623 if (startIndex == -1)
1624 this->reset();
1625 else
1626 resetPreserveRegion();
1627
1628 fRegionStart = nativeStart;
1629 fRegionLimit = nativeLimit;
1630 fActiveStart = nativeStart;
1631 fActiveLimit = nativeLimit;
1632
1633 if (startIndex != -1) {
1634 if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1635 status = U_INDEX_OUTOFBOUNDS_ERROR;
1636 }
1637 fMatchEnd = startIndex;
1638 }
1639
1640 if (!fTransparentBounds) {
1641 fLookStart = nativeStart;
1642 fLookLimit = nativeLimit;
1643 }
1644 if (fAnchoringBounds) {
1645 fAnchorStart = nativeStart;
1646 fAnchorLimit = nativeLimit;
1647 }
1648 return *this;
1649 }
1650
region(int64_t start,int64_t limit,UErrorCode & status)1651 RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1652 return region(start, limit, -1, status);
1653 }
1654
1655 //--------------------------------------------------------------------------------
1656 //
1657 // regionEnd
1658 //
1659 //--------------------------------------------------------------------------------
regionEnd() const1660 int32_t RegexMatcher::regionEnd() const {
1661 return (int32_t)fRegionLimit;
1662 }
1663
regionEnd64() const1664 int64_t RegexMatcher::regionEnd64() const {
1665 return fRegionLimit;
1666 }
1667
1668 //--------------------------------------------------------------------------------
1669 //
1670 // regionStart
1671 //
1672 //--------------------------------------------------------------------------------
regionStart() const1673 int32_t RegexMatcher::regionStart() const {
1674 return (int32_t)fRegionStart;
1675 }
1676
regionStart64() const1677 int64_t RegexMatcher::regionStart64() const {
1678 return fRegionStart;
1679 }
1680
1681
1682 //--------------------------------------------------------------------------------
1683 //
1684 // replaceAll
1685 //
1686 //--------------------------------------------------------------------------------
// UnicodeString flavor of replaceAll(): wraps the replacement string and a
// fresh result string in stack UTexts, runs the UText overload, and returns
// the result string. An empty string is returned if status is already a
// failure on entry.
UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
    UnicodeString resultString;
    if (U_FAILURE(status)) {
        return resultString;
    }

    UText replacementText = UTEXT_INITIALIZER;
    UText resultText      = UTEXT_INITIALIZER;
    utext_openConstUnicodeString(&replacementText, &replacement, &status);
    utext_openUnicodeString(&resultText, &resultString, &status);

    replaceAll(&replacementText, &resultText, status);

    // Close in the reverse order of opening.
    utext_close(&resultText);
    utext_close(&replacementText);

    return resultString;
}
1705
1706
1707 //
1708 // replaceAll, UText mode
1709 //
replaceAll(UText * replacement,UText * dest,UErrorCode & status)1710 UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1711 if (U_FAILURE(status)) {
1712 return dest;
1713 }
1714 if (U_FAILURE(fDeferredStatus)) {
1715 status = fDeferredStatus;
1716 return dest;
1717 }
1718
1719 if (dest == NULL) {
1720 UnicodeString emptyString;
1721 UText empty = UTEXT_INITIALIZER;
1722
1723 utext_openUnicodeString(&empty, &emptyString, &status);
1724 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1725 utext_close(&empty);
1726 }
1727
1728 if (U_SUCCESS(status)) {
1729 reset();
1730 while (find()) {
1731 appendReplacement(dest, replacement, status);
1732 if (U_FAILURE(status)) {
1733 break;
1734 }
1735 }
1736 appendTail(dest, status);
1737 }
1738
1739 return dest;
1740 }
1741
1742
1743 //--------------------------------------------------------------------------------
1744 //
1745 // replaceFirst
1746 //
1747 //--------------------------------------------------------------------------------
// UnicodeString flavor of replaceFirst(): wraps the replacement string and a
// fresh result string in stack UTexts, runs the UText overload, and returns
// the result string. An empty string is returned if status is already a
// failure on entry (same guard as the UnicodeString replaceAll()).
UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
    UText replacementText = UTEXT_INITIALIZER;
    UText resultText = UTEXT_INITIALIZER;
    UnicodeString resultString;
    if (U_FAILURE(status)) {
        // Nothing to do; avoid driving the open/close calls on a failed status.
        return resultString;
    }

    utext_openConstUnicodeString(&replacementText, &replacement, &status);
    utext_openUnicodeString(&resultText, &resultString, &status);

    replaceFirst(&replacementText, &resultText, status);

    utext_close(&resultText);
    utext_close(&replacementText);

    return resultString;
}
1763
1764 //
1765 // replaceFirst, UText mode
1766 //
replaceFirst(UText * replacement,UText * dest,UErrorCode & status)1767 UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1768 if (U_FAILURE(status)) {
1769 return dest;
1770 }
1771 if (U_FAILURE(fDeferredStatus)) {
1772 status = fDeferredStatus;
1773 return dest;
1774 }
1775
1776 reset();
1777 if (!find()) {
1778 return getInput(dest, status);
1779 }
1780
1781 if (dest == NULL) {
1782 UnicodeString emptyString;
1783 UText empty = UTEXT_INITIALIZER;
1784
1785 utext_openUnicodeString(&empty, &emptyString, &status);
1786 dest = utext_clone(NULL, &empty, TRUE, FALSE, &status);
1787 utext_close(&empty);
1788 }
1789
1790 appendReplacement(dest, replacement, status);
1791 appendTail(dest, status);
1792
1793 return dest;
1794 }
1795
1796
1797 //--------------------------------------------------------------------------------
1798 //
1799 // requireEnd
1800 //
1801 //--------------------------------------------------------------------------------
requireEnd() const1802 UBool RegexMatcher::requireEnd() const {
1803 return fRequireEnd;
1804 }
1805
1806
1807 //--------------------------------------------------------------------------------
1808 //
1809 // reset
1810 //
1811 //--------------------------------------------------------------------------------
reset()1812 RegexMatcher &RegexMatcher::reset() {
1813 fRegionStart = 0;
1814 fRegionLimit = fInputLength;
1815 fActiveStart = 0;
1816 fActiveLimit = fInputLength;
1817 fAnchorStart = 0;
1818 fAnchorLimit = fInputLength;
1819 fLookStart = 0;
1820 fLookLimit = fInputLength;
1821 resetPreserveRegion();
1822 return *this;
1823 }
1824
1825
1826
resetPreserveRegion()1827 void RegexMatcher::resetPreserveRegion() {
1828 fMatchStart = 0;
1829 fMatchEnd = 0;
1830 fLastMatchEnd = -1;
1831 fAppendPosition = 0;
1832 fMatch = FALSE;
1833 fHitEnd = FALSE;
1834 fRequireEnd = FALSE;
1835 fTime = 0;
1836 fTickCounter = TIMER_INITIAL_VALUE;
1837 //resetStack(); // more expensive than it looks...
1838 }
1839
1840
reset(const UnicodeString & input)1841 RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1842 fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1843 if (fPattern->fNeedsAltInput) {
1844 fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1845 }
1846 if (U_FAILURE(fDeferredStatus)) {
1847 return *this;
1848 }
1849 fInputLength = utext_nativeLength(fInputText);
1850
1851 reset();
1852 delete fInput;
1853 fInput = NULL;
1854
1855 // Do the following for any UnicodeString.
1856 // This is for compatibility for those clients who modify the input string "live" during regex operations.
1857 fInputUniStrMaybeMutable = TRUE;
1858
1859 if (fWordBreakItr != NULL) {
1860 #if UCONFIG_NO_BREAK_ITERATION==0
1861 UErrorCode status = U_ZERO_ERROR;
1862 fWordBreakItr->setText(fInputText, status);
1863 #endif
1864 }
1865 return *this;
1866 }
1867
1868
reset(UText * input)1869 RegexMatcher &RegexMatcher::reset(UText *input) {
1870 if (fInputText != input) {
1871 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &fDeferredStatus);
1872 if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, FALSE, TRUE, &fDeferredStatus);
1873 if (U_FAILURE(fDeferredStatus)) {
1874 return *this;
1875 }
1876 fInputLength = utext_nativeLength(fInputText);
1877
1878 delete fInput;
1879 fInput = NULL;
1880
1881 if (fWordBreakItr != NULL) {
1882 #if UCONFIG_NO_BREAK_ITERATION==0
1883 UErrorCode status = U_ZERO_ERROR;
1884 fWordBreakItr->setText(input, status);
1885 #endif
1886 }
1887 }
1888 reset();
1889 fInputUniStrMaybeMutable = FALSE;
1890
1891 return *this;
1892 }
1893
1894 /*RegexMatcher &RegexMatcher::reset(const UChar *) {
1895 fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1896 return *this;
1897 }*/
1898
reset(int64_t position,UErrorCode & status)1899 RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1900 if (U_FAILURE(status)) {
1901 return *this;
1902 }
1903 reset(); // Reset also resets the region to be the entire string.
1904
1905 if (position < 0 || position > fActiveLimit) {
1906 status = U_INDEX_OUTOFBOUNDS_ERROR;
1907 return *this;
1908 }
1909 fMatchEnd = position;
1910 return *this;
1911 }
1912
1913
1914 //--------------------------------------------------------------------------------
1915 //
1916 // refresh
1917 //
1918 //--------------------------------------------------------------------------------
refreshInputText(UText * input,UErrorCode & status)1919 RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1920 if (U_FAILURE(status)) {
1921 return *this;
1922 }
1923 if (input == NULL) {
1924 status = U_ILLEGAL_ARGUMENT_ERROR;
1925 return *this;
1926 }
1927 if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1928 status = U_ILLEGAL_ARGUMENT_ERROR;
1929 return *this;
1930 }
1931 int64_t pos = utext_getNativeIndex(fInputText);
1932 // Shallow read-only clone of the new UText into the existing input UText
1933 fInputText = utext_clone(fInputText, input, FALSE, TRUE, &status);
1934 if (U_FAILURE(status)) {
1935 return *this;
1936 }
1937 utext_setNativeIndex(fInputText, pos);
1938
1939 if (fAltInputText != NULL) {
1940 pos = utext_getNativeIndex(fAltInputText);
1941 fAltInputText = utext_clone(fAltInputText, input, FALSE, TRUE, &status);
1942 if (U_FAILURE(status)) {
1943 return *this;
1944 }
1945 utext_setNativeIndex(fAltInputText, pos);
1946 }
1947 return *this;
1948 }
1949
1950
1951
1952 //--------------------------------------------------------------------------------
1953 //
1954 // setTrace
1955 //
1956 //--------------------------------------------------------------------------------
setTrace(UBool state)1957 void RegexMatcher::setTrace(UBool state) {
1958 fTraceDebug = state;
1959 }
1960
1961
1962
1963 /**
1964 * UText, replace entire contents of the destination UText with a substring of the source UText.
1965 *
1966 * @param src The source UText
1967 * @param dest The destination UText. Must be writable.
1968 * May be NULL, in which case a new UText will be allocated.
1969 * @param start Start index of source substring.
1970 * @param limit Limit index of source substring.
1971 * @param status An error code.
1972 */
utext_extract_replace(UText * src,UText * dest,int64_t start,int64_t limit,UErrorCode * status)1973 static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
1974 if (U_FAILURE(*status)) {
1975 return dest;
1976 }
1977 if (start == limit) {
1978 if (dest) {
1979 utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
1980 return dest;
1981 } else {
1982 return utext_openUChars(NULL, NULL, 0, status);
1983 }
1984 }
1985 int32_t length = utext_extract(src, start, limit, NULL, 0, status);
1986 if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
1987 return dest;
1988 }
1989 *status = U_ZERO_ERROR;
1990 MaybeStackArray<UChar, 40> buffer;
1991 if (length >= buffer.getCapacity()) {
1992 UChar *newBuf = buffer.resize(length+1); // Leave space for terminating Nul.
1993 if (newBuf == NULL) {
1994 *status = U_MEMORY_ALLOCATION_ERROR;
1995 }
1996 }
1997 utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
1998 if (dest) {
1999 utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
2000 return dest;
2001 }
2002
2003 // Caller did not provide a prexisting UText.
2004 // Open a new one, and have it adopt the text buffer storage.
2005 if (U_FAILURE(*status)) {
2006 return NULL;
2007 }
2008 int32_t ownedLength = 0;
2009 UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
2010 if (ownedBuf == NULL) {
2011 *status = U_MEMORY_ALLOCATION_ERROR;
2012 return NULL;
2013 }
2014 UText *result = utext_openUChars(NULL, ownedBuf, length, status);
2015 if (U_FAILURE(*status)) {
2016 uprv_free(ownedBuf);
2017 return NULL;
2018 }
2019 result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
2020 return result;
2021 }
2022
2023
2024 //---------------------------------------------------------------------
2025 //
2026 // split
2027 //
2028 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status)2029 int32_t RegexMatcher::split(const UnicodeString &input,
2030 UnicodeString dest[],
2031 int32_t destCapacity,
2032 UErrorCode &status)
2033 {
2034 UText inputText = UTEXT_INITIALIZER;
2035 utext_openConstUnicodeString(&inputText, &input, &status);
2036 if (U_FAILURE(status)) {
2037 return 0;
2038 }
2039
2040 UText **destText = (UText **)uprv_malloc(sizeof(UText*)*destCapacity);
2041 if (destText == NULL) {
2042 status = U_MEMORY_ALLOCATION_ERROR;
2043 return 0;
2044 }
2045 int32_t i;
2046 for (i = 0; i < destCapacity; i++) {
2047 destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
2048 }
2049
2050 int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2051
2052 for (i = 0; i < destCapacity; i++) {
2053 utext_close(destText[i]);
2054 }
2055
2056 uprv_free(destText);
2057 utext_close(&inputText);
2058 return fieldCount;
2059 }
2060
2061 //
2062 // split, UText mode
2063 //
split(UText * input,UText * dest[],int32_t destCapacity,UErrorCode & status)2064 int32_t RegexMatcher::split(UText *input,
2065 UText *dest[],
2066 int32_t destCapacity,
2067 UErrorCode &status)
2068 {
2069 //
2070 // Check arguements for validity
2071 //
2072 if (U_FAILURE(status)) {
2073 return 0;
2074 };
2075
2076 if (destCapacity < 1) {
2077 status = U_ILLEGAL_ARGUMENT_ERROR;
2078 return 0;
2079 }
2080
2081 //
2082 // Reset for the input text
2083 //
2084 reset(input);
2085 int64_t nextOutputStringStart = 0;
2086 if (fActiveLimit == 0) {
2087 return 0;
2088 }
2089
2090 //
2091 // Loop through the input text, searching for the delimiter pattern
2092 //
2093 int32_t i;
2094 int32_t numCaptureGroups = fPattern->fGroupMap->size();
2095 for (i=0; ; i++) {
2096 if (i>=destCapacity-1) {
2097 // There is one or zero output string left.
2098 // Fill the last output string with whatever is left from the input, then exit the loop.
2099 // ( i will be == destCapacity if we filled the output array while processing
2100 // capture groups of the delimiter expression, in which case we will discard the
2101 // last capture group saved in favor of the unprocessed remainder of the
2102 // input string.)
2103 i = destCapacity-1;
2104 if (fActiveLimit > nextOutputStringStart) {
2105 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2106 if (dest[i]) {
2107 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2108 input->chunkContents+nextOutputStringStart,
2109 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2110 } else {
2111 UText remainingText = UTEXT_INITIALIZER;
2112 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2113 fActiveLimit-nextOutputStringStart, &status);
2114 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2115 utext_close(&remainingText);
2116 }
2117 } else {
2118 UErrorCode lengthStatus = U_ZERO_ERROR;
2119 int32_t remaining16Length =
2120 utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2121 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2122 if (remainingChars == NULL) {
2123 status = U_MEMORY_ALLOCATION_ERROR;
2124 break;
2125 }
2126
2127 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2128 if (dest[i]) {
2129 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2130 } else {
2131 UText remainingText = UTEXT_INITIALIZER;
2132 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2133 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2134 utext_close(&remainingText);
2135 }
2136
2137 uprv_free(remainingChars);
2138 }
2139 }
2140 break;
2141 }
2142 if (find()) {
2143 // We found another delimiter. Move everything from where we started looking
2144 // up until the start of the delimiter into the next output string.
2145 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2146 if (dest[i]) {
2147 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2148 input->chunkContents+nextOutputStringStart,
2149 (int32_t)(fMatchStart-nextOutputStringStart), &status);
2150 } else {
2151 UText remainingText = UTEXT_INITIALIZER;
2152 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2153 fMatchStart-nextOutputStringStart, &status);
2154 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2155 utext_close(&remainingText);
2156 }
2157 } else {
2158 UErrorCode lengthStatus = U_ZERO_ERROR;
2159 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2160 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2161 if (remainingChars == NULL) {
2162 status = U_MEMORY_ALLOCATION_ERROR;
2163 break;
2164 }
2165 utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2166 if (dest[i]) {
2167 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2168 } else {
2169 UText remainingText = UTEXT_INITIALIZER;
2170 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2171 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2172 utext_close(&remainingText);
2173 }
2174
2175 uprv_free(remainingChars);
2176 }
2177 nextOutputStringStart = fMatchEnd;
2178
2179 // If the delimiter pattern has capturing parentheses, the captured
2180 // text goes out into the next n destination strings.
2181 int32_t groupNum;
2182 for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2183 if (i >= destCapacity-2) {
2184 // Never fill the last available output string with capture group text.
2185 // It will filled with the last field, the remainder of the
2186 // unsplit input text.
2187 break;
2188 }
2189 i++;
2190 dest[i] = utext_extract_replace(fInputText, dest[i],
2191 start64(groupNum, status), end64(groupNum, status), &status);
2192 }
2193
2194 if (nextOutputStringStart == fActiveLimit) {
2195 // The delimiter was at the end of the string. We're done, but first
2196 // we output one last empty string, for the empty field following
2197 // the delimiter at the end of input.
2198 if (i+1 < destCapacity) {
2199 ++i;
2200 if (dest[i] == NULL) {
2201 dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2202 } else {
2203 static UChar emptyString[] = {(UChar)0};
2204 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2205 }
2206 }
2207 break;
2208
2209 }
2210 }
2211 else
2212 {
2213 // We ran off the end of the input while looking for the next delimiter.
2214 // All the remaining text goes into the current output string.
2215 if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2216 if (dest[i]) {
2217 utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2218 input->chunkContents+nextOutputStringStart,
2219 (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2220 } else {
2221 UText remainingText = UTEXT_INITIALIZER;
2222 utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2223 fActiveLimit-nextOutputStringStart, &status);
2224 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2225 utext_close(&remainingText);
2226 }
2227 } else {
2228 UErrorCode lengthStatus = U_ZERO_ERROR;
2229 int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2230 UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2231 if (remainingChars == NULL) {
2232 status = U_MEMORY_ALLOCATION_ERROR;
2233 break;
2234 }
2235
2236 utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2237 if (dest[i]) {
2238 utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2239 } else {
2240 UText remainingText = UTEXT_INITIALIZER;
2241 utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2242 dest[i] = utext_clone(NULL, &remainingText, TRUE, FALSE, &status);
2243 utext_close(&remainingText);
2244 }
2245
2246 uprv_free(remainingChars);
2247 }
2248 break;
2249 }
2250 if (U_FAILURE(status)) {
2251 break;
2252 }
2253 } // end of for loop
2254 return i+1;
2255 }
2256
2257
2258 //--------------------------------------------------------------------------------
2259 //
2260 // start
2261 //
2262 //--------------------------------------------------------------------------------
start(UErrorCode & status) const2263 int32_t RegexMatcher::start(UErrorCode &status) const {
2264 return start(0, status);
2265 }
2266
start64(UErrorCode & status) const2267 int64_t RegexMatcher::start64(UErrorCode &status) const {
2268 return start64(0, status);
2269 }
2270
2271 //--------------------------------------------------------------------------------
2272 //
2273 // start(int32_t group, UErrorCode &status)
2274 //
2275 //--------------------------------------------------------------------------------
2276
start64(int32_t group,UErrorCode & status) const2277 int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2278 if (U_FAILURE(status)) {
2279 return -1;
2280 }
2281 if (U_FAILURE(fDeferredStatus)) {
2282 status = fDeferredStatus;
2283 return -1;
2284 }
2285 if (fMatch == FALSE) {
2286 status = U_REGEX_INVALID_STATE;
2287 return -1;
2288 }
2289 if (group < 0 || group > fPattern->fGroupMap->size()) {
2290 status = U_INDEX_OUTOFBOUNDS_ERROR;
2291 return -1;
2292 }
2293 int64_t s;
2294 if (group == 0) {
2295 s = fMatchStart;
2296 } else {
2297 int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2298 U_ASSERT(groupOffset < fPattern->fFrameSize);
2299 U_ASSERT(groupOffset >= 0);
2300 s = fFrame->fExtra[groupOffset];
2301 }
2302
2303 return s;
2304 }
2305
2306
start(int32_t group,UErrorCode & status) const2307 int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2308 return (int32_t)start64(group, status);
2309 }
2310
2311 //--------------------------------------------------------------------------------
2312 //
2313 // useAnchoringBounds
2314 //
2315 //--------------------------------------------------------------------------------
useAnchoringBounds(UBool b)2316 RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2317 fAnchoringBounds = b;
2318 fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2319 fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2320 return *this;
2321 }
2322
2323
2324 //--------------------------------------------------------------------------------
2325 //
2326 // useTransparentBounds
2327 //
2328 //--------------------------------------------------------------------------------
useTransparentBounds(UBool b)2329 RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2330 fTransparentBounds = b;
2331 fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2332 fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2333 return *this;
2334 }
2335
2336 //--------------------------------------------------------------------------------
2337 //
2338 // setTimeLimit
2339 //
2340 //--------------------------------------------------------------------------------
setTimeLimit(int32_t limit,UErrorCode & status)2341 void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2342 if (U_FAILURE(status)) {
2343 return;
2344 }
2345 if (U_FAILURE(fDeferredStatus)) {
2346 status = fDeferredStatus;
2347 return;
2348 }
2349 if (limit < 0) {
2350 status = U_ILLEGAL_ARGUMENT_ERROR;
2351 return;
2352 }
2353 fTimeLimit = limit;
2354 }
2355
2356
2357 //--------------------------------------------------------------------------------
2358 //
2359 // getTimeLimit
2360 //
2361 //--------------------------------------------------------------------------------
getTimeLimit() const2362 int32_t RegexMatcher::getTimeLimit() const {
2363 return fTimeLimit;
2364 }
2365
2366
2367 //--------------------------------------------------------------------------------
2368 //
2369 // setStackLimit
2370 //
2371 //--------------------------------------------------------------------------------
setStackLimit(int32_t limit,UErrorCode & status)2372 void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2373 if (U_FAILURE(status)) {
2374 return;
2375 }
2376 if (U_FAILURE(fDeferredStatus)) {
2377 status = fDeferredStatus;
2378 return;
2379 }
2380 if (limit < 0) {
2381 status = U_ILLEGAL_ARGUMENT_ERROR;
2382 return;
2383 }
2384
2385 // Reset the matcher. This is needed here in case there is a current match
2386 // whose final stack frame (containing the match results, pointed to by fFrame)
2387 // would be lost by resizing to a smaller stack size.
2388 reset();
2389
2390 if (limit == 0) {
2391 // Unlimited stack expansion
2392 fStack->setMaxCapacity(0);
2393 } else {
2394 // Change the units of the limit from bytes to ints, and bump the size up
2395 // to be big enough to hold at least one stack frame for the pattern,
2396 // if it isn't there already.
2397 int32_t adjustedLimit = limit / sizeof(int32_t);
2398 if (adjustedLimit < fPattern->fFrameSize) {
2399 adjustedLimit = fPattern->fFrameSize;
2400 }
2401 fStack->setMaxCapacity(adjustedLimit);
2402 }
2403 fStackLimit = limit;
2404 }
2405
2406
2407 //--------------------------------------------------------------------------------
2408 //
2409 // getStackLimit
2410 //
2411 //--------------------------------------------------------------------------------
getStackLimit() const2412 int32_t RegexMatcher::getStackLimit() const {
2413 return fStackLimit;
2414 }
2415
2416
2417 //--------------------------------------------------------------------------------
2418 //
2419 // setMatchCallback
2420 //
2421 //--------------------------------------------------------------------------------
setMatchCallback(URegexMatchCallback * callback,const void * context,UErrorCode & status)2422 void RegexMatcher::setMatchCallback(URegexMatchCallback *callback,
2423 const void *context,
2424 UErrorCode &status) {
2425 if (U_FAILURE(status)) {
2426 return;
2427 }
2428 fCallbackFn = callback;
2429 fCallbackContext = context;
2430 }
2431
2432
2433 //--------------------------------------------------------------------------------
2434 //
2435 // getMatchCallback
2436 //
2437 //--------------------------------------------------------------------------------
getMatchCallback(URegexMatchCallback * & callback,const void * & context,UErrorCode & status)2438 void RegexMatcher::getMatchCallback(URegexMatchCallback *&callback,
2439 const void *&context,
2440 UErrorCode &status) {
2441 if (U_FAILURE(status)) {
2442 return;
2443 }
2444 callback = fCallbackFn;
2445 context = fCallbackContext;
2446 }
2447
2448
2449 //--------------------------------------------------------------------------------
2450 //
//     setFindProgressCallback
2452 //
2453 //--------------------------------------------------------------------------------
setFindProgressCallback(URegexFindProgressCallback * callback,const void * context,UErrorCode & status)2454 void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback *callback,
2455 const void *context,
2456 UErrorCode &status) {
2457 if (U_FAILURE(status)) {
2458 return;
2459 }
2460 fFindProgressCallbackFn = callback;
2461 fFindProgressCallbackContext = context;
2462 }
2463
2464
2465 //--------------------------------------------------------------------------------
2466 //
//     getFindProgressCallback
2468 //
2469 //--------------------------------------------------------------------------------
getFindProgressCallback(URegexFindProgressCallback * & callback,const void * & context,UErrorCode & status)2470 void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback *&callback,
2471 const void *&context,
2472 UErrorCode &status) {
2473 if (U_FAILURE(status)) {
2474 return;
2475 }
2476 callback = fFindProgressCallbackFn;
2477 context = fFindProgressCallbackContext;
2478 }
2479
2480
2481 //================================================================================
2482 //
2483 // Code following this point in this file is the internal
2484 // Match Engine Implementation.
2485 //
2486 //================================================================================
2487
2488
2489 //--------------------------------------------------------------------------------
2490 //
2491 // resetStack
2492 // Discard any previous contents of the state save stack, and initialize a
2493 // new stack frame to all -1. The -1s are needed for capture group limits,
2494 // where they indicate that a group has not yet matched anything.
2495 //--------------------------------------------------------------------------------
resetStack()2496 REStackFrame *RegexMatcher::resetStack() {
2497 // Discard any previous contents of the state save stack, and initialize a
2498 // new stack frame with all -1 data. The -1s are needed for capture group limits,
2499 // where they indicate that a group has not yet matched anything.
2500 fStack->removeAllElements();
2501
2502 REStackFrame *iFrame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
2503 int32_t i;
2504 for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2505 iFrame->fExtra[i] = -1;
2506 }
2507 return iFrame;
2508 }
2509
2510
2511
2512 //--------------------------------------------------------------------------------
2513 //
2514 // isWordBoundary
2515 // in perl, "xab..cd..", \b is true at positions 0,3,5,7
2516 // For us,
2517 // If the current char is a combining mark,
2518 // \b is FALSE.
2519 // Else Scan backwards to the first non-combining char.
//          We are at a boundary if this char and the original char are
//            opposite in membership in the \w set
2522 //
2523 // parameters: pos - the current position in the input buffer
2524 //
2525 // TODO: double-check edge cases at region boundaries.
2526 //
2527 //--------------------------------------------------------------------------------
isWordBoundary(int64_t pos)2528 UBool RegexMatcher::isWordBoundary(int64_t pos) {
2529 UBool isBoundary = FALSE;
2530 UBool cIsWord = FALSE;
2531
2532 if (pos >= fLookLimit) {
2533 fHitEnd = TRUE;
2534 } else {
2535 // Determine whether char c at current position is a member of the word set of chars.
2536 // If we're off the end of the string, behave as though we're not at a word char.
2537 UTEXT_SETNATIVEINDEX(fInputText, pos);
2538 UChar32 c = UTEXT_CURRENT32(fInputText);
2539 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2540 // Current char is a combining one. Not a boundary.
2541 return FALSE;
2542 }
2543 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2544 }
2545
2546 // Back up until we come to a non-combining char, determine whether
2547 // that char is a word char.
2548 UBool prevCIsWord = FALSE;
2549 for (;;) {
2550 if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2551 break;
2552 }
2553 UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2554 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2555 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2556 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2557 break;
2558 }
2559 }
2560 isBoundary = cIsWord ^ prevCIsWord;
2561 return isBoundary;
2562 }
2563
isChunkWordBoundary(int32_t pos)2564 UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2565 UBool isBoundary = FALSE;
2566 UBool cIsWord = FALSE;
2567
2568 const UChar *inputBuf = fInputText->chunkContents;
2569
2570 if (pos >= fLookLimit) {
2571 fHitEnd = TRUE;
2572 } else {
2573 // Determine whether char c at current position is a member of the word set of chars.
2574 // If we're off the end of the string, behave as though we're not at a word char.
2575 UChar32 c;
2576 U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2577 if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2578 // Current char is a combining one. Not a boundary.
2579 return FALSE;
2580 }
2581 cIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(c);
2582 }
2583
2584 // Back up until we come to a non-combining char, determine whether
2585 // that char is a word char.
2586 UBool prevCIsWord = FALSE;
2587 for (;;) {
2588 if (pos <= fLookStart) {
2589 break;
2590 }
2591 UChar32 prevChar;
2592 U16_PREV(inputBuf, fLookStart, pos, prevChar);
2593 if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2594 || u_charType(prevChar) == U_FORMAT_CHAR)) {
2595 prevCIsWord = fPattern->fStaticSets[URX_ISWORD_SET]->contains(prevChar);
2596 break;
2597 }
2598 }
2599 isBoundary = cIsWord ^ prevCIsWord;
2600 return isBoundary;
2601 }
2602
2603 //--------------------------------------------------------------------------------
2604 //
2605 // isUWordBoundary
2606 //
2607 // Test for a word boundary using RBBI word break.
2608 //
2609 // parameters: pos - the current position in the input buffer
2610 //
2611 //--------------------------------------------------------------------------------
isUWordBoundary(int64_t pos)2612 UBool RegexMatcher::isUWordBoundary(int64_t pos) {
2613 UBool returnVal = FALSE;
2614 #if UCONFIG_NO_BREAK_ITERATION==0
2615
2616 // If we haven't yet created a break iterator for this matcher, do it now.
2617 if (fWordBreakItr == NULL) {
2618 fWordBreakItr =
2619 (RuleBasedBreakIterator *)BreakIterator::createWordInstance(Locale::getEnglish(), fDeferredStatus);
2620 if (U_FAILURE(fDeferredStatus)) {
2621 return FALSE;
2622 }
2623 fWordBreakItr->setText(fInputText, fDeferredStatus);
2624 }
2625
2626 if (pos >= fLookLimit) {
2627 fHitEnd = TRUE;
2628 returnVal = TRUE; // With Unicode word rules, only positions within the interior of "real"
2629 // words are not boundaries. All non-word chars stand by themselves,
2630 // with word boundaries on both sides.
2631 } else {
2632 if (!UTEXT_USES_U16(fInputText)) {
2633 // !!!: Would like a better way to do this!
2634 UErrorCode status = U_ZERO_ERROR;
2635 pos = utext_extract(fInputText, 0, pos, NULL, 0, &status);
2636 }
2637 returnVal = fWordBreakItr->isBoundary((int32_t)pos);
2638 }
2639 #endif
2640 return returnVal;
2641 }
2642
2643 //--------------------------------------------------------------------------------
2644 //
2645 // IncrementTime This function is called once each TIMER_INITIAL_VALUE state
2646 // saves. Increment the "time" counter, and call the
2647 // user callback function if there is one installed.
2648 //
2649 // If the match operation needs to be aborted, either for a time-out
2650 // or because the user callback asked for it, just set an error status.
2651 // The engine will pick that up and stop in its outer loop.
2652 //
2653 //--------------------------------------------------------------------------------
IncrementTime(UErrorCode & status)2654 void RegexMatcher::IncrementTime(UErrorCode &status) {
2655 fTickCounter = TIMER_INITIAL_VALUE;
2656 fTime++;
2657 if (fCallbackFn != NULL) {
2658 if ((*fCallbackFn)(fCallbackContext, fTime) == FALSE) {
2659 status = U_REGEX_STOPPED_BY_CALLER;
2660 return;
2661 }
2662 }
2663 if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2664 status = U_REGEX_TIME_OUT;
2665 }
2666 }
2667
2668 //--------------------------------------------------------------------------------
2669 //
2670 // StateSave
2671 // Make a new stack frame, initialized as a copy of the current stack frame.
2672 // Set the pattern index in the original stack frame from the operand value
2673 // in the opcode. Execution of the engine continues with the state in
2674 // the newly created stack frame
2675 //
2676 // Note that reserveBlock() may grow the stack, resulting in the
2677 // whole thing being relocated in memory.
2678 //
2679 // Parameters:
//    fp           The top frame pointer when called.  At return, a new
//                 frame will be present
2682 // savePatIdx An index into the compiled pattern. Goes into the original
2683 // (not new) frame. If execution ever back-tracks out of the
2684 // new frame, this will be where we continue from in the pattern.
2685 // Return
2686 // The new frame pointer.
2687 //
2688 //--------------------------------------------------------------------------------
StateSave(REStackFrame * fp,int64_t savePatIdx,UErrorCode & status)2689 inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2690 // push storage for a new frame.
2691 int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2692 if (newFP == NULL) {
2693 // Failure on attempted stack expansion.
2694 // Stack function set some other error code, change it to a more
2695 // specific one for regular expressions.
2696 status = U_REGEX_STACK_OVERFLOW;
2697 // We need to return a writable stack frame, so just return the
2698 // previous frame. The match operation will stop quickly
2699 // because of the error status, after which the frame will never
2700 // be looked at again.
2701 return fp;
2702 }
2703 fp = (REStackFrame *)(newFP - fFrameSize); // in case of realloc of stack.
2704
2705 // New stack frame = copy of old top frame.
2706 int64_t *source = (int64_t *)fp;
2707 int64_t *dest = newFP;
2708 for (;;) {
2709 *dest++ = *source++;
2710 if (source == newFP) {
2711 break;
2712 }
2713 }
2714
2715 fTickCounter--;
2716 if (fTickCounter <= 0) {
2717 IncrementTime(status); // Re-initializes fTickCounter
2718 }
2719 fp->fPatIdx = savePatIdx;
2720 return (REStackFrame *)newFP;
2721 }
2722
2723
2724 //--------------------------------------------------------------------------------
2725 //
2726 // MatchAt This is the actual matching engine.
2727 //
//                 startIdx:    begin matching at this index.
2729 // toEnd: if true, match must extend to end of the input region
2730 //
2731 //--------------------------------------------------------------------------------
MatchAt(int64_t startIdx,UBool toEnd,UErrorCode & status)2732 void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2733 UBool isMatch = FALSE; // True if the we have a match.
2734
2735 int64_t backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2736
2737 int32_t op; // Operation from the compiled pattern, split into
2738 int32_t opType; // the opcode
2739 int32_t opValue; // and the operand value.
2740
2741 #ifdef REGEX_RUN_DEBUG
2742 if (fTraceDebug)
2743 {
2744 printf("MatchAt(startIdx=%ld)\n", startIdx);
2745 printf("Original Pattern: ");
2746 UChar32 c = utext_next32From(fPattern->fPattern, 0);
2747 while (c != U_SENTINEL) {
2748 if (c<32 || c>256) {
2749 c = '.';
2750 }
2751 printf("%c", c);
2752
2753 c = UTEXT_NEXT32(fPattern->fPattern);
2754 }
2755 printf("\n");
2756 printf("Input String: ");
2757 c = utext_next32From(fInputText, 0);
2758 while (c != U_SENTINEL) {
2759 if (c<32 || c>256) {
2760 c = '.';
2761 }
2762 printf("%c", c);
2763
2764 c = UTEXT_NEXT32(fInputText);
2765 }
2766 printf("\n");
2767 printf("\n");
2768 }
2769 #endif
2770
2771 if (U_FAILURE(status)) {
2772 return;
2773 }
2774
2775 // Cache frequently referenced items from the compiled pattern
2776 //
2777 int64_t *pat = fPattern->fCompiledPat->getBuffer();
2778
2779 const UChar *litText = fPattern->fLiteralText.getBuffer();
2780 UVector *sets = fPattern->fSets;
2781
2782 fFrameSize = fPattern->fFrameSize;
2783 REStackFrame *fp = resetStack();
2784
2785 fp->fPatIdx = 0;
2786 fp->fInputIdx = startIdx;
2787
2788 // Zero out the pattern's static data
2789 int32_t i;
2790 for (i = 0; i<fPattern->fDataSize; i++) {
2791 fData[i] = 0;
2792 }
2793
2794 //
2795 // Main loop for interpreting the compiled pattern.
2796 // One iteration of the loop per pattern operation performed.
2797 //
2798 for (;;) {
2799 op = (int32_t)pat[fp->fPatIdx];
2800 opType = URX_TYPE(op);
2801 opValue = URX_VAL(op);
2802 #ifdef REGEX_RUN_DEBUG
2803 if (fTraceDebug) {
2804 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2805 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
2806 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2807 fPattern->dumpOp(fp->fPatIdx);
2808 }
2809 #endif
2810 fp->fPatIdx++;
2811
2812 switch (opType) {
2813
2814
2815 case URX_NOP:
2816 break;
2817
2818
2819 case URX_BACKTRACK:
2820 // Force a backtrack. In some circumstances, the pattern compiler
2821 // will notice that the pattern can't possibly match anything, and will
2822 // emit one of these at that point.
2823 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2824 break;
2825
2826
2827 case URX_ONECHAR:
2828 if (fp->fInputIdx < fActiveLimit) {
2829 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2830 UChar32 c = UTEXT_NEXT32(fInputText);
2831 if (c == opValue) {
2832 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2833 break;
2834 }
2835 } else {
2836 fHitEnd = TRUE;
2837 }
2838 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2839 break;
2840
2841
2842 case URX_STRING:
2843 {
2844 // Test input against a literal string.
2845 // Strings require two slots in the compiled pattern, one for the
2846 // offset to the string text, and one for the length.
2847
2848 int32_t stringStartIdx = opValue;
2849 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
2850 fp->fPatIdx++;
2851 opType = URX_TYPE(op);
2852 int32_t stringLen = URX_VAL(op);
2853 U_ASSERT(opType == URX_STRING_LEN);
2854 U_ASSERT(stringLen >= 2);
2855
2856 const UChar *patternString = litText+stringStartIdx;
2857 int32_t patternStringIndex = 0;
2858 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2859 UChar32 inputChar;
2860 UChar32 patternChar;
2861 UBool success = TRUE;
2862 while (patternStringIndex < stringLen) {
2863 if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2864 success = FALSE;
2865 fHitEnd = TRUE;
2866 break;
2867 }
2868 inputChar = UTEXT_NEXT32(fInputText);
2869 U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2870 if (patternChar != inputChar) {
2871 success = FALSE;
2872 break;
2873 }
2874 }
2875
2876 if (success) {
2877 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2878 } else {
2879 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2880 }
2881 }
2882 break;
2883
2884
2885 case URX_STATE_SAVE:
2886 fp = StateSave(fp, opValue, status);
2887 break;
2888
2889
2890 case URX_END:
2891 // The match loop will exit via this path on a successful match,
2892 // when we reach the end of the pattern.
2893 if (toEnd && fp->fInputIdx != fActiveLimit) {
2894 // The pattern matched, but not to the end of input. Try some more.
2895 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2896 break;
2897 }
2898 isMatch = TRUE;
2899 goto breakFromLoop;
2900
2901 // Start and End Capture stack frame variables are laid out out like this:
2902 // fp->fExtra[opValue] - The start of a completed capture group
2903 // opValue+1 - The end of a completed capture group
2904 // opValue+2 - the start of a capture group whose end
2905 // has not yet been reached (and might not ever be).
2906 case URX_START_CAPTURE:
2907 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2908 fp->fExtra[opValue+2] = fp->fInputIdx;
2909 break;
2910
2911
2912 case URX_END_CAPTURE:
2913 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2914 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
2915 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
2916 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
2917 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2918 break;
2919
2920
2921 case URX_DOLLAR: // $, test for End of line
2922 // or for position before new line at end of input
2923 {
2924 if (fp->fInputIdx >= fAnchorLimit) {
2925 // We really are at the end of input. Success.
2926 fHitEnd = TRUE;
2927 fRequireEnd = TRUE;
2928 break;
2929 }
2930
2931 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2932
2933 // If we are positioned just before a new-line that is located at the
2934 // end of input, succeed.
2935 UChar32 c = UTEXT_NEXT32(fInputText);
2936 if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2937 if (isLineTerminator(c)) {
2938 // If not in the middle of a CR/LF sequence
2939 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2940 // At new-line at end of input. Success
2941 fHitEnd = TRUE;
2942 fRequireEnd = TRUE;
2943
2944 break;
2945 }
2946 }
2947 } else {
2948 UChar32 nextC = UTEXT_NEXT32(fInputText);
2949 if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2950 fHitEnd = TRUE;
2951 fRequireEnd = TRUE;
2952 break; // At CR/LF at end of input. Success
2953 }
2954 }
2955
2956 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2957 }
2958 break;
2959
2960
// Handler for '$' in UNIX_LINES mode, where 0x0a is the only line ending tested.
// Succeeds at or beyond fAnchorLimit, or when the next character is 0x0a and it
// ends exactly at fAnchorLimit. Success sets fHitEnd and fRequireEnd; failure
// pops the current frame.
2961 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
2962 if (fp->fInputIdx >= fAnchorLimit) {
2963 // Off the end of input. Success.
2964 fHitEnd = TRUE;
2965 fRequireEnd = TRUE;
2966 break;
2967 } else {
2968 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2969 UChar32 c = UTEXT_NEXT32(fInputText);
2970 // Succeed only if c is a line feed and it is the final character before the anchoring limit.
2971 if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
2972 fHitEnd = TRUE;
2973 fRequireEnd = TRUE;
2974 break;
2975 }
2976 }
2977
2978 // Not at end of input. Back-track out.
2979 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2980 break;
2981
2982
// Handler for '$' in multi-line mode.
// Succeeds at or beyond fAnchorLimit (setting fHitEnd and fRequireEnd), or at any
// position whose current character is a line terminator, except the position
// between the CR and the LF of a CR/LF pair. Otherwise pops the current frame.
2983 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
2984 {
2985 if (fp->fInputIdx >= fAnchorLimit) {
2986 // We really are at the end of input. Success.
2987 fHitEnd = TRUE;
2988 fRequireEnd = TRUE;
2989 break;
2990 }
2991 // If we are positioned just before a new-line, succeed.
2992 // It makes no difference where the new-line is within the input.
2993 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2994 UChar32 c = UTEXT_CURRENT32(fInputText);
2995 if (isLineTerminator(c)) {
2996 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
2997 // In multi-line mode, hitting a new-line just before the end of input does not
2998 // set the hitEnd or requireEnd flags
// UTEXT_CURRENT32 did not advance the iterator, so a single UTEXT_PREVIOUS32
// returns the character immediately before the position being tested.
2999 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3000 break;
3001 }
3002 }
3003 // not at a new line. Fail.
3004 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3005 }
3006 break;
3007
3008
3009 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
3010 {
3011 if (fp->fInputIdx >= fAnchorLimit) {
3012 // We really are at the end of input. Success.
3013 fHitEnd = TRUE;
3014 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
3015 break; // adding a new-line would not lose the match.
3016 }
3017 // If we are not positioned just before a new-line, the test fails; backtrack out.
3018 // It makes no difference where the new-line is within the input.
3019 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3020 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3021 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3022 }
3023 }
3024 break;
3025
3026
3027 case URX_CARET: // ^, test for start of line
3028 if (fp->fInputIdx != fAnchorStart) {
3029 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3030 }
3031 break;
3032
3033
// Handler for '^' in multi-line mode.
// Succeeds at fAnchorStart, or at a position before fAnchorLimit whose preceding
// character is a line terminator. Otherwise pops the current frame.
3034 case URX_CARET_M: // ^, test for start of line in multi-line mode
3035 {
3036 if (fp->fInputIdx == fAnchorStart) {
3037 // We are at the start of input. Success.
3038 break;
3039 }
3040 // Check whether character just before the current pos is a new-line
3041 // unless we are at the end of input
3042 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3043 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3044 if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
3045 // It's a new-line. ^ is true. Success.
3046 // TODO: what should be done with positions between a CR and LF?
3047 break;
3048 }
3049 // Not at the start of a line. Fail.
3050 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3051 }
3052 break;
3053
3054
// Handler for '^' in multi-line + UNIX_LINES mode.
// Succeeds at fAnchorStart, or when the character immediately before the current
// position is 0x0a. Unlike URX_CARET_M, there is no "before fAnchorLimit" test here.
// Otherwise pops the current frame.
3055 case URX_CARET_M_UNIX: // ^, test for start of line in multi-line + Unix-line mode
3056 {
3057 U_ASSERT(fp->fInputIdx >= fAnchorStart);
3058 if (fp->fInputIdx <= fAnchorStart) {
3059 // We are at the start of input. Success.
3060 break;
3061 }
3062 // Check whether character just before the current pos is a new-line
3063 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3064 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3065 UChar32 c = UTEXT_PREVIOUS32(fInputText);
3066 if (c != 0x0a) {
3067 // Not at the start of a line. Back-track out.
3068 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3069 }
3070 }
3071 break;
3072
// Handler for \b and \B. Does not consume input.
// A non-zero opValue inverts the result of isWordBoundary() (the \B form).
// Pops the current frame when the (possibly inverted) test fails.
3073 case URX_BACKSLASH_B: // Test for word boundaries
3074 {
3075 UBool success = isWordBoundary(fp->fInputIdx);
3076 success ^= (UBool)(opValue != 0); // flip sense for \B
3077 if (!success) {
3078 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3079 }
3080 }
3081 break;
3082
3083
// Same shape as URX_BACKSLASH_B, but the boundary test is delegated to
// isUWordBoundary(). A non-zero opValue inverts the result (the \B form).
// Does not consume input; pops the current frame on failure.
3084 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
3085 {
3086 UBool success = isUWordBoundary(fp->fInputIdx);
3087 success ^= (UBool)(opValue != 0); // flip sense for \B
3088 if (!success) {
3089 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3090 }
3091 }
3092 break;
3093
3094
// Handler for \d and \D. Consumes one code point on success.
// At or beyond fActiveLimit: sets fHitEnd and backtracks.
// Otherwise the next code point matches when its general category is
// U_DECIMAL_DIGIT_NUMBER; a non-zero opValue inverts the test (the \D form).
3095 case URX_BACKSLASH_D: // Test for decimal digit
3096 {
3097 if (fp->fInputIdx >= fActiveLimit) {
3098 fHitEnd = TRUE;
3099 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3100 break;
3101 }
3102
3103 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3104
3105 UChar32 c = UTEXT_NEXT32(fInputText);
3106 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
3107 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3108 success ^= (UBool)(opValue != 0); // flip sense for \D
3109 if (success) {
// Advance the frame's input position past the consumed code point.
3110 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3111 } else {
3112 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3113 }
3114 }
3115 break;
3116
3117
3118 case URX_BACKSLASH_G: // Test for position at end of previous match
3119 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
3120 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3121 }
3122 break;
3123
3124
// Handler for \h and \H. Consumes one code point on success.
// At or beyond fActiveLimit: sets fHitEnd and backtracks.
// Otherwise the next code point matches when its general category is
// U_SPACE_SEPARATOR or it is code point 9 (TAB); a non-zero opValue inverts
// the test (the \H form).
3125 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
3126 {
3127 if (fp->fInputIdx >= fActiveLimit) {
3128 fHitEnd = TRUE;
3129 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3130 break;
3131 }
3132 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3133 UChar32 c = UTEXT_NEXT32(fInputText);
3134 int8_t ctype = u_charType(c);
3135 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
3136 success ^= (UBool)(opValue != 0); // flip sense for \H
3137 if (success) {
3138 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3139 } else {
3140 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3141 }
3142 }
3143 break;
3144
3145
3146 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
3147 {
3148 if (fp->fInputIdx >= fActiveLimit) {
3149 fHitEnd = TRUE;
3150 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3151 break;
3152 }
3153 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3154 UChar32 c = UTEXT_NEXT32(fInputText);
3155 if (isLineTerminator(c)) {
3156 if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
3157 utext_next32(fInputText);
3158 }
3159 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3160 } else {
3161 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3162 }
3163 }
3164 break;
3165
3166
// Handler for \v and \V. Consumes exactly one code point on success
// (a CR/LF pair is NOT treated as a unit here, unlike URX_BACKSLASH_R).
// At or beyond fActiveLimit: sets fHitEnd and backtracks.
// A non-zero opValue inverts the isLineTerminator() test (the \V form).
3167 case URX_BACKSLASH_V: // \v, any single line ending character.
3168 {
3169 if (fp->fInputIdx >= fActiveLimit) {
3170 fHitEnd = TRUE;
3171 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3172 break;
3173 }
3174 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3175 UChar32 c = UTEXT_NEXT32(fInputText);
3176 UBool success = isLineTerminator(c);
3177 success ^= (UBool)(opValue != 0); // flip sense for \V
3178 if (success) {
3179 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3180 } else {
3181 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3182 }
3183 }
3184 break;
3185
3186
// Handler for \X. Implemented as a goto-driven state machine:
//   - the first code point is always consumed, then classified against the
//     sets in fPattern->fStaticSets to pick the initial state;
//   - states GC_L, GC_V and GC_T each consume one more code point and either
//     move to another state or, if the code point does not continue the
//     sequence, un-read it and fall to GC_Extend;
//   - GC_Extend consumes any run of code points in the URX_GC_EXTEND set;
//   - GC_Control consumes an LF that directly follows a CR;
//   - GC_Done sets fHitEnd when the cluster reached fActiveLimit.
// fp->fInputIdx is kept in sync with the UText iterator after every move.
// The only failure path is being at or beyond fActiveLimit on entry.
3187 case URX_BACKSLASH_X:
3188 // Match a Grapheme, as defined by Unicode TR 29.
3189 // Differs slightly from Perl, which consumes combining marks independently
3190 // of context.
3191 {
3192
3193 // Fail if at end of input
3194 if (fp->fInputIdx >= fActiveLimit) {
3195 fHitEnd = TRUE;
3196 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3197 break;
3198 }
3199
3200 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3201
3202 // Examine (and consume) the current char.
3203 // Dispatch into a little state machine, based on the char.
3204 UChar32 c;
3205 c = UTEXT_NEXT32(fInputText);
3206 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
// NOTE: this local 'sets' (array of static UnicodeSet pointers) is scoped to this
// case's braces and hides any outer variable of the same name.
3207 UnicodeSet **sets = fPattern->fStaticSets;
3208 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
3209 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
3210 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3211 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3212 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3213 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3214 if (sets[URX_GC_T]->contains(c)) goto GC_T;
// A code point in none of the sets above is treated like the NORMAL case.
3215 goto GC_Extend;
3216
3217
3218
// State: previous code point was in the L set.
3219 GC_L:
3220 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3221 c = UTEXT_NEXT32(fInputText);
3222 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3223 if (sets[URX_GC_L]->contains(c)) goto GC_L;
3224 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
3225 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
3226 if (sets[URX_GC_V]->contains(c)) goto GC_V;
// c does not continue the sequence: un-read it before looking for extenders.
3227 (void)UTEXT_PREVIOUS32(fInputText);
3228 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3229 goto GC_Extend;
3230
// State: previous code point was in the V or LV set.
3231 GC_V:
3232 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3233 c = UTEXT_NEXT32(fInputText);
3234 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3235 if (sets[URX_GC_V]->contains(c)) goto GC_V;
3236 if (sets[URX_GC_T]->contains(c)) goto GC_T;
// c does not continue the sequence: un-read it before looking for extenders.
3237 (void)UTEXT_PREVIOUS32(fInputText);
3238 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3239 goto GC_Extend;
3240
// State: previous code point was in the T or LVT set.
3241 GC_T:
3242 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
3243 c = UTEXT_NEXT32(fInputText);
3244 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3245 if (sets[URX_GC_T]->contains(c)) goto GC_T;
// c does not continue the sequence: un-read it before looking for extenders.
3246 (void)UTEXT_PREVIOUS32(fInputText);
3247 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3248 goto GC_Extend;
3249
3250 GC_Extend:
3251 // Combining characters are consumed here
3252 for (;;) {
3253 if (fp->fInputIdx >= fActiveLimit) {
3254 break;
3255 }
// Peek without consuming; only advance if the code point is an extender.
3256 c = UTEXT_CURRENT32(fInputText);
3257 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
3258 break;
3259 }
3260 (void)UTEXT_NEXT32(fInputText);
3261 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3262 }
3263 goto GC_Done;
3264
3265 GC_Control:
3266 // Most control chars stand alone (don't combine with combining chars),
3267 // except for that CR/LF sequence is a single grapheme cluster.
3268 if (c == 0x0d && fp->fInputIdx < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
3269 c = UTEXT_NEXT32(fInputText);
3270 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3271 }
// Falls through into GC_Done.
3272
3273 GC_Done:
3274 if (fp->fInputIdx >= fActiveLimit) {
3275 fHitEnd = TRUE;
3276 }
3277 break;
3278 }
3279
3280
3281
3282
3283 case URX_BACKSLASH_Z: // Test for end of Input
3284 if (fp->fInputIdx < fAnchorLimit) {
3285 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3286 } else {
3287 fHitEnd = TRUE;
3288 fRequireEnd = TRUE;
3289 }
3290 break;
3291
3292
3293
// Matches one code point against a predefined set selected by opValue.
// Code points below 256 are looked up in fPattern->fStaticSets8, all others in
// fPattern->fStaticSets. At or beyond fActiveLimit: sets fHitEnd and backtracks.
3294 case URX_STATIC_SETREF:
3295 {
3296 // Test input character against one of the predefined sets
3297 // (Word Characters, for example)
3298 // The high bit of the op value is a flag for the match polarity.
3299 // 0: success if input char is in set.
3300 // 1: success if input char is not in set.
3301 if (fp->fInputIdx >= fActiveLimit) {
3302 fHitEnd = TRUE;
3303 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3304 break;
3305 }
3306
// 'success' starts as the polarity flag (TRUE for a negated set) and is toggled
// below when the code point is a member, giving "in set XOR negated".
3307 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
// Strip the polarity flag so opValue is a plain index into the static set tables.
3308 opValue &= ~URX_NEG_SET;
3309 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3310
3311 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3312 UChar32 c = UTEXT_NEXT32(fInputText);
3313 if (c < 256) {
3314 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3315 if (s8->contains(c)) {
3316 success = !success;
3317 }
3318 } else {
3319 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3320 if (s->contains(c)) {
3321 success = !success;
3322 }
3323 }
3324 if (success) {
3325 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3326 } else {
3327 // the character did not satisfy the (possibly negated) set test.
3328 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3329 }
3330 }
3331 break;
3332
3333
// Matches one code point that is NOT a member of the predefined set selected by
// opValue. Code points below 256 are looked up in fPattern->fStaticSets8, all
// others in fPattern->fStaticSets. At or beyond fActiveLimit: sets fHitEnd and
// backtracks.
3334 case URX_STAT_SETREF_N:
3335 {
3336 // Test input character for NOT being a member of one of
3337 // the predefined sets (Word Characters, for example)
3338 if (fp->fInputIdx >= fActiveLimit) {
3339 fHitEnd = TRUE;
3340 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3341 break;
3342 }
3343
3344 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3345
3346 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3347
3348 UChar32 c = UTEXT_NEXT32(fInputText);
3349 if (c < 256) {
3350 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
3351 if (s8->contains(c) == FALSE) {
// Not a member: success, consume the code point.
3352 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3353 break;
3354 }
3355 } else {
3356 const UnicodeSet *s = fPattern->fStaticSets[opValue];
3357 if (s->contains(c) == FALSE) {
// Not a member: success, consume the code point.
3358 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3359 break;
3360 }
3361 }
3362 // the character WAS in the set, so the negated test fails. Backtrack.
3363 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3364 }
3365 break;
3366
3367
// Matches one code point against a pattern-defined set selected by opValue.
// Code points below 256 are looked up in fPattern->fSets8; all others in the
// UnicodeSet stored at index opValue of 'sets'.
// NOTE(review): 'sets' is declared outside the visible part of this function;
// it is used here as a container with size() and elementAt().
// At or beyond fActiveLimit: sets fHitEnd and backtracks.
3368 case URX_SETREF:
3369 if (fp->fInputIdx >= fActiveLimit) {
3370 fHitEnd = TRUE;
3371 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3372 break;
3373 } else {
3374 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3375
3376 // There is input left. Pick up one char and test it for set membership.
3377 UChar32 c = UTEXT_NEXT32(fInputText);
3378 U_ASSERT(opValue > 0 && opValue < sets->size());
3379 if (c<256) {
3380 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3381 if (s8->contains(c)) {
// The character is in the set. A Match.
3382 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3383 break;
3384 }
3385 } else {
3386 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
3387 if (s->contains(c)) {
3388 // The character is in the set. A Match.
3389 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3390 break;
3391 }
3392 }
3393
3394 // the character wasn't in the set.
3395 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3396 }
3397 break;
3398
3399
// Handler for '.' in default mode: consumes one code point unless it is a line
// terminator (as decided by isLineTerminator()), in which case it backtracks.
// At or beyond fActiveLimit: sets fHitEnd and backtracks.
3400 case URX_DOTANY:
3401 {
3402 // . matches anything, but stops at end-of-line.
3403 if (fp->fInputIdx >= fActiveLimit) {
3404 // At end of input. Match failed. Backtrack out.
3405 fHitEnd = TRUE;
3406 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3407 break;
3408 }
3409
3410 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3411
3412 // There is input left. Advance over one char, unless we've hit end-of-line
3413 UChar32 c = UTEXT_NEXT32(fInputText);
3414 if (isLineTerminator(c)) {
3415 // End of line in normal mode. . does not match.
3416 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3417 break;
3418 }
3419 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3420 }
3421 break;
3422
3423
// Handler for '.' when it also matches line terminators.
// Always consumes one code point; if that code point is CR and an LF follows
// inside the active region, the LF is consumed as well.
// At or beyond fActiveLimit: sets fHitEnd and backtracks.
3424 case URX_DOTANY_ALL:
3425 {
3426 // ., in dot-matches-all (including new lines) mode
3427 if (fp->fInputIdx >= fActiveLimit) {
3428 // At end of input. Match failed. Backtrack out.
3429 fHitEnd = TRUE;
3430 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3431 break;
3432 }
3433
3434 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3435
3436 // There is input left. Advance over one char, except if we are
3437 // at a cr/lf, advance over both of them.
3438 UChar32 c;
3439 c = UTEXT_NEXT32(fInputText);
3440 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3441 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3442 // In the case of a CR/LF, we need to advance over both.
// Peek first; the LF is consumed only if it is really there.
3443 UChar32 nextc = UTEXT_CURRENT32(fInputText);
3444 if (nextc == 0x0a) {
3445 (void)UTEXT_NEXT32(fInputText);
3446 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3447 }
3448 }
3449 }
3450 break;
3451
3452
3453 case URX_DOTANY_UNIX:
3454 {
3455 // '.' operator, matches all, but stops at end-of-line.
3456 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
3457 if (fp->fInputIdx >= fActiveLimit) {
3458 // At end of input. Match failed. Backtrack out.
3459 fHitEnd = TRUE;
3460 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3461 break;
3462 }
3463
3464 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3465
3466 // There is input left. Advance over one char, unless we've hit end-of-line
3467 UChar32 c = UTEXT_NEXT32(fInputText);
3468 if (c == 0x0a) {
3469 // End of line in normal mode. '.' does not match the \n
3470 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3471 } else {
3472 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3473 }
3474 }
3475 break;
3476
3477
// Unconditional jump: continue pattern execution at index opValue.
3478 case URX_JMP:
3479 fp->fPatIdx = opValue;
3480 break;
3481
// Records that there is no match and leaves the dispatch loop via the
// breakFromLoop label (defined outside the visible part of this function).
3482 case URX_FAIL:
3483 isMatch = FALSE;
3484 goto breakFromLoop;
3485
// Saves a backtrack state that resumes at the instruction following this one,
// then jumps to pattern index opValue.
3486 case URX_JMP_SAV:
3487 U_ASSERT(opValue < fPattern->fCompiledPat->size());
3488 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3489 fp->fPatIdx = opValue; // Then JMP.
3490 break;
3491
// Conditional JMP_SAV: loops back only if the input position moved forward since
// the value stored in the frame slot named by the URX_STO_INP_LOC op at
// pat[opValue-1]. When it loops, that slot is refreshed with the current position.
3492 case URX_JMP_SAV_X:
3493 // This opcode is used with (x)+, when x can match a zero length string.
3494 // Same as JMP_SAV, except conditional on the match having made forward progress.
3495 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3496 // data address of the input position at the start of the loop.
3497 {
3498 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
// The URX_STO_INP_LOC op sits immediately before the jump destination.
3499 int32_t stoOp = (int32_t)pat[opValue-1];
3500 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3501 int32_t frameLoc = URX_VAL(stoOp);
3502 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3503 int64_t prevInputIdx = fp->fExtra[frameLoc];
3504 U_ASSERT(prevInputIdx <= fp->fInputIdx);
3505 if (prevInputIdx < fp->fInputIdx) {
3506 // The match did make progress. Repeat the loop.
3507 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
3508 fp->fPatIdx = opValue;
3509 fp->fExtra[frameLoc] = fp->fInputIdx;
3510 }
3511 // If the input position did not advance, we do nothing here,
3512 // execution will fall out of the loop.
3513 }
3514 break;
3515
// Initializes a counted loop (the counterpart of URX_CTR_LOOP).
// Frame slots used: fExtra[opValue] is the iteration counter; fExtra[opValue+1]
// holds the input position used for loop breaking when there is no upper bound.
// Three operand words follow this op in the pattern: the location of the loop op,
// the minimum count, and the maximum count (-1 meaning no upper bound).
3516 case URX_CTR_INIT:
3517 {
3518 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3519 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3520
3521 // Pick up the three extra operands that CTR_INIT has, and
3522 // skip the pattern location counter past
3523 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3524 fp->fPatIdx += 3;
3525 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3526 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3527 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3528 U_ASSERT(minCount>=0);
3529 U_ASSERT(maxCount>=minCount || maxCount==-1);
3530 U_ASSERT(loopLoc>=fp->fPatIdx);
3531
3532 if (minCount == 0) {
// Zero iterations are allowed: save a state that resumes just past the loop op.
3533 fp = StateSave(fp, loopLoc+1, status);
3534 }
3535 if (maxCount == -1) {
3536 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
3537 } else if (maxCount == 0) {
// The loop body may never run: backtrack instead of entering it.
3538 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3539 }
3540 }
3541 break;
3542
// Bottom of a counted loop. opValue is the pattern index of the matching
// URX_CTR_INIT op; its min and max counts are read from pat[opValue+2] and
// pat[opValue+3], and the loop body starts at opValue + 4.
// Increments the counter, then: stops at the maximum; otherwise loops back,
// first saving a state to the code after the loop once the minimum has been met.
3543 case URX_CTR_LOOP:
3544 {
3545 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3546 int32_t initOp = (int32_t)pat[opValue];
3547 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3548 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3549 int32_t minCount = (int32_t)pat[opValue+2];
3550 int32_t maxCount = (int32_t)pat[opValue+3];
3551 (*pCounter)++;
// maxCount == -1 marks "no upper bound"; it is excluded explicitly because the
// unsigned cast would otherwise turn it into a very large limit.
3552 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3553 U_ASSERT(*pCounter == maxCount);
// Maximum reached: fall through to the pattern following the loop.
3554 break;
3555 }
3556 if (*pCounter >= minCount) {
3557 if (maxCount == -1) {
3558 // Loop has no hard upper bound.
3559 // Check that it is progressing through the input, break if it is not.
3560 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3561 if (fp->fInputIdx == *pLastInputIdx) {
3562 break;
3563 } else {
3564 *pLastInputIdx = fp->fInputIdx;
3565 }
3566 }
// Minimum met: remember the exit path before trying another iteration.
3567 fp = StateSave(fp, fp->fPatIdx, status);
3568 }
3569 fp->fPatIdx = opValue + 4; // Loop back.
3570 }
3571 break;
3572
// Initializes a non-greedy counted loop (the counterpart of URX_CTR_LOOP_NG).
// Frame slots and operand words are laid out as for URX_CTR_INIT.
// When the minimum count is zero, execution continues after the loop first; a
// saved state allows entering the loop body later, unless the maximum is zero.
3573 case URX_CTR_INIT_NG:
3574 {
3575 // Initialize a non-greedy loop
3576 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3577 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
3578
3579 // Pick up the three extra operands that CTR_INIT_NG has, and
3580 // skip the pattern location counter past
3581 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3582 fp->fPatIdx += 3;
3583 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
3584 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3585 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3586 U_ASSERT(minCount>=0);
3587 U_ASSERT(maxCount>=minCount || maxCount==-1);
3588 U_ASSERT(loopLoc>fp->fPatIdx);
3589 if (maxCount == -1) {
3590 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
3591 }
3592
3593 if (minCount == 0) {
3594 if (maxCount != 0) {
// Backtracking into this state runs the loop body (fPatIdx already points at it).
3595 fp = StateSave(fp, fp->fPatIdx, status);
3596 }
3597 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
3598 }
3599 }
3600 break;
3601
// Bottom of a non-greedy counted loop. opValue is the pattern index of the
// matching URX_CTR_INIT_NG op; min and max counts are read from pat[opValue+2]
// and pat[opValue+3], and the loop body starts at opValue + 4.
3602 case URX_CTR_LOOP_NG:
3603 {
3604 // Non-greedy {min, max} loops
3605 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3606 int32_t initOp = (int32_t)pat[opValue];
3607 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3608 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3609 int32_t minCount = (int32_t)pat[opValue+2];
3610 int32_t maxCount = (int32_t)pat[opValue+3];
3611
3612 (*pCounter)++;
// maxCount == -1 marks "no upper bound" and is excluded explicitly.
3613 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3614 // The loop has matched the maximum permitted number of times.
3615 // Break out of here with no action. Matching will
3616 // continue with the following pattern.
3617 U_ASSERT(*pCounter == maxCount);
3618 break;
3619 }
3620
3621 if (*pCounter < minCount) {
3622 // We haven't met the minimum number of matches yet.
3623 // Loop back for another one.
3624 fp->fPatIdx = opValue + 4; // Loop back.
3625 } else {
3626 // We do have the minimum number of matches.
3627
3628 // If there is no upper bound on the loop iterations, check that the input index
3629 // is progressing, and stop the loop if it is not.
3630 if (maxCount == -1) {
3631 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
3632 if (fp->fInputIdx == *pLastInputIdx) {
3633 break;
3634 }
3635 *pLastInputIdx = fp->fInputIdx;
3636 }
3637
3638 // Loop Continuation: we will fall into the pattern following the loop
3639 // (non-greedy, don't execute loop body first), but first do
3640 // a state save to the top of the loop, so that a match failure
3641 // in the following pattern will try another iteration of the loop.
3642 fp = StateSave(fp, opValue + 4, status);
3643 }
3644 }
3645 break;
3646
// Stores the current size of the backtrack stack in data slot opValue
// (read back by URX_LD_SP).
3647 case URX_STO_SP:
3648 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3649 fData[opValue] = fStack->size();
3650 break;
3651
// Cuts the backtrack stack back to the size saved in data slot opValue while
// keeping the contents of the current frame: the current frame is copied into
// the frame that ends at the saved size, which then becomes the top frame.
3652 case URX_LD_SP:
3653 {
3654 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3655 int32_t newStackSize = (int32_t)fData[opValue];
3656 U_ASSERT(newStackSize <= fStack->size());
// Address of the frame that will be on top once the stack is truncated.
3657 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3658 if (newFP == (int64_t *)fp) {
// Already the top frame; nothing to copy or truncate.
3659 break;
3660 }
3661 int32_t i;
3662 for (i=0; i<fFrameSize; i++) {
3663 newFP[i] = ((int64_t *)fp)[i];
3664 }
3665 fp = (REStackFrame *)newFP;
3666 fStack->setSize(newStackSize);
3667 }
3668 break;
3669
// Case-sensitive back reference. The captured range is read from frame slots
// fExtra[opValue] (start) and fExtra[opValue+1] (end); a negative start means
// the group has not participated, which fails the match.
// The captured text is walked through fAltInputText while the candidate text is
// walked through fInputText, comparing code point by code point.
// Running out of input (fActiveLimit) before the group is exhausted sets fHitEnd.
3670 case URX_BACKREF:
3671 {
3672 U_ASSERT(opValue < fFrameSize);
3673 int64_t groupStartIdx = fp->fExtra[opValue];
3674 int64_t groupEndIdx = fp->fExtra[opValue+1];
3675 U_ASSERT(groupStartIdx <= groupEndIdx);
3676 if (groupStartIdx < 0) {
3677 // This capture group has not participated in the match thus far,
3678 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3679 break;
3680 }
3681 UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3682 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3683
3684 // Note: if the capture group match was of an empty string the backref
3685 // match succeeds. Verified by testing: Perl matches succeed
3686 // in this case, so we do too.
3687
3688 UBool success = TRUE;
3689 for (;;) {
// Whole captured range compared without a mismatch: success.
3690 if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3691 success = TRUE;
3692 break;
3693 }
3694 if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3695 success = FALSE;
3696 fHitEnd = TRUE;
3697 break;
3698 }
3699 UChar32 captureGroupChar = utext_next32(fAltInputText);
3700 UChar32 inputChar = utext_next32(fInputText);
3701 if (inputChar != captureGroupChar) {
3702 success = FALSE;
3703 break;
3704 }
3705 }
3706
3707 if (success) {
// Consume the text that matched the back reference.
3708 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3709 } else {
3710 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3711 }
3712 }
3713 break;
3714
3715
3716
// Case-insensitive back reference. Same structure as URX_BACKREF, but both the
// captured text and the candidate input are read through CaseFoldingUTextIterator
// objects, so the comparison is between case-folded code points.
// inExpansion() is consulted so that end-of-range / end-of-input are only tested
// between source code points, and so that a match ending inside an input
// expansion is rejected.
3717 case URX_BACKREF_I:
3718 {
3719 U_ASSERT(opValue < fFrameSize);
3720 int64_t groupStartIdx = fp->fExtra[opValue];
3721 int64_t groupEndIdx = fp->fExtra[opValue+1];
3722 U_ASSERT(groupStartIdx <= groupEndIdx);
3723 if (groupStartIdx < 0) {
3724 // This capture group has not participated in the match thus far,
3725 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
3726 break;
3727 }
3728 utext_setNativeIndex(fAltInputText, groupStartIdx);
3729 utext_setNativeIndex(fInputText, fp->fInputIdx);
3730 CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3731 CaseFoldingUTextIterator inputItr(*fInputText);
3732
3733 // Note: if the capture group match was of an empty string the backref
3734 // match succeeds. Verified by testing: Perl matches succeed
3735 // in this case, so we do too.
3736
3737 UBool success = TRUE;
3738 for (;;) {
3739 if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3740 success = TRUE;
3741 break;
3742 }
3743 if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3744 success = FALSE;
3745 fHitEnd = TRUE;
3746 break;
3747 }
3748 UChar32 captureGroupChar = captureGroupItr.next();
3749 UChar32 inputChar = inputItr.next();
3750 if (inputChar != captureGroupChar) {
3751 success = FALSE;
3752 break;
3753 }
3754 }
3755
3756 if (success && inputItr.inExpansion()) {
3757 // We obtained a match by consuming part of a string obtained from
3758 // case-folding a single code point of the input text.
3759 // This does not count as an overall match.
3760 success = FALSE;
3761 }
3762
3763 if (success) {
3764 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3765 } else {
3766 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3767 }
3768
3769 }
3770 break;
3771
// Stores the current input position in frame slot opValue. URX_JMP_SAV_X and
// URX_JMPX later compare against such a slot to detect forward progress.
3772 case URX_STO_INP_LOC:
3773 {
3774 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3775 fp->fExtra[opValue] = fp->fInputIdx;
3776 }
3777 break;
3778
// Conditional jump with one operand word following the op: the operand names a
// frame slot holding a previously saved input position. Jumps to opValue if the
// input position has advanced past that saved value; otherwise backtracks.
3779 case URX_JMPX:
3780 {
3781 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
// Step over the operand word.
3782 fp->fPatIdx += 1;
3783 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
3784 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3785 int64_t savedInputIdx = fp->fExtra[dataLoc];
3786 U_ASSERT(savedInputIdx <= fp->fInputIdx);
3787 if (savedInputIdx < fp->fInputIdx) {
3788 fp->fPatIdx = opValue; // JMP
3789 } else {
3790 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
3791 }
3792 }
3793 break;
3794
// Start of a look-ahead block. Saves the stack size in fData[opValue] and the
// input position in fData[opValue+1] (both restored by URX_LA_END), then
// switches the active region to the look-around bounds.
3795 case URX_LA_START:
3796 {
3797 // Entering a lookahead block.
3798 // Save Stack Ptr, Input Pos.
3799 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3800 fData[opValue] = fStack->size();
3801 fData[opValue+1] = fp->fInputIdx;
3802 fActiveStart = fLookStart; // Set the match region change for
3803 fActiveLimit = fLookLimit; // transparent bounds.
3804 }
3805 break;
3806
// End of a look-ahead block. Discards any backtrack states pushed inside the
// block (keeping the current frame's contents), rewinds the input position to
// where the block was entered, and restores the active region to the region bounds.
3807 case URX_LA_END:
3808 {
3809 // Leaving a look-ahead block.
3810 // restore Stack Ptr, Input Pos to positions they had on entry to block.
3811 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3812 int32_t stackSize = fStack->size();
3813 int32_t newStackSize =(int32_t)fData[opValue];
3814 U_ASSERT(stackSize >= newStackSize);
3815 if (stackSize > newStackSize) {
3816 // Copy the current top frame back to the new (cut back) top frame.
3817 // This makes the capture groups from within the look-ahead
3818 // expression available.
3819 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3820 int32_t i;
3821 for (i=0; i<fFrameSize; i++) {
3822 newFP[i] = ((int64_t *)fp)[i];
3823 }
3824 fp = (REStackFrame *)newFP;
3825 fStack->setSize(newStackSize);
3826 }
// A look-ahead consumes no input: go back to the position saved by URX_LA_START.
3827 fp->fInputIdx = fData[opValue+1];
3828
3829 // Restore the active region bounds in the input string; they may have
3830 // been changed because of transparent bounds on a Region.
3831 fActiveStart = fRegionStart;
3832 fActiveLimit = fRegionLimit;
3833 }
3834 break;
3835
// Matches one code point case-insensitively: the input code point is folded with
// u_foldCase(..., U_FOLD_CASE_DEFAULT) and compared with opValue.
// At or beyond fActiveLimit: sets fHitEnd. Every non-matching path reaches the
// shared popFrame at the bottom.
3836 case URX_ONECHAR_I:
3837 // Case insensitive one char. The char from the pattern is already case folded.
3838 // Input text is not, but case folding the input can not reduce two or more code
3839 // points to one.
3840 if (fp->fInputIdx < fActiveLimit) {
3841 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3842
3843 UChar32 c = UTEXT_NEXT32(fInputText);
3844 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3845 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3846 break;
3847 }
3848 } else {
3849 fHitEnd = TRUE;
3850 }
3851
// Mismatch or end of input: backtrack.
3852 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3853 break;
3854
// Case-insensitive literal string match. opValue is the offset of the (already
// case-folded) string within litText; the following pattern word must be a
// URX_STRING_LEN op carrying the string length.
// The input is read through a CaseFoldingUTextIterator and compared code point
// by code point with the pattern string. A match that would end in the middle
// of an input case-folding expansion is rejected.
3855 case URX_STRING_I:
3856 {
3857 // Case-insensitive test input against a literal string.
3858 // Strings require two slots in the compiled pattern, one for the
3859 // offset to the string text, and one for the length.
3860 // The compiled string has already been case folded.
3861 {
3862 const UChar *patternString = litText + opValue;
3863 int32_t patternStringIdx = 0;
3864
// Fetch and decode the second slot (the length op). Note that this overwrites
// op, opType and opValue for the remainder of this case.
3865 op = (int32_t)pat[fp->fPatIdx];
3866 fp->fPatIdx++;
3867 opType = URX_TYPE(op);
3868 opValue = URX_VAL(op);
3869 U_ASSERT(opType == URX_STRING_LEN);
3870 int32_t patternStringLen = opValue; // Length of the string from the pattern.
3871
3872
3873 UChar32 cPattern;
3874 UChar32 cText;
3875 UBool success = TRUE;
3876
3877 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3878 CaseFoldingUTextIterator inputIterator(*fInputText);
3879 while (patternStringIdx < patternStringLen) {
// End of input is only tested between source code points, not while the
// iterator is still delivering the expansion of one folded code point.
3880 if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3881 success = FALSE;
3882 fHitEnd = TRUE;
3883 break;
3884 }
3885 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3886 cText = inputIterator.next();
3887 if (cText != cPattern) {
3888 success = FALSE;
3889 break;
3890 }
3891 }
3892 if (inputIterator.inExpansion()) {
3893 success = FALSE;
3894 }
3895
3896 if (success) {
3897 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3898 } else {
3899 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3900 }
3901 }
3902 }
3903 break;
3904
3905 case URX_LB_START:
3906 {
3907 // Entering a look-behind block.
3908 // Save Stack Ptr, Input Pos.
3909 // TODO: implement transparent bounds. Ticket #6067
3910 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3911 fData[opValue] = fStack->size();
3912 fData[opValue+1] = fp->fInputIdx;
3913 // Init the variable containing the start index for attempted matches.
3914 fData[opValue+2] = -1;
3915 // Save input string length, then reset to pin any matches to end at
3916 // the current position.
3917 fData[opValue+3] = fActiveLimit;
3918 fActiveLimit = fp->fInputIdx;
3919 }
3920 break;
3921
3922
3923 case URX_LB_CONT:
3924 {
3925 // Positive Look-Behind, at top of loop checking for matches of LB expression
3926 // at all possible input starting positions.
3927
3928 // Fetch the min and max possible match lengths. They are the operands
3929 // of this op in the pattern.
3930 int32_t minML = (int32_t)pat[fp->fPatIdx++];
3931 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3932 U_ASSERT(minML <= maxML);
3933 U_ASSERT(minML >= 0);
3934
3935 // Fetch (from data) the last input index where a match was attempted.
3936 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3937 int64_t *lbStartIdx = &fData[opValue+2];
3938 if (*lbStartIdx < 0) {
3939 // First time through loop.
3940 *lbStartIdx = fp->fInputIdx - minML;
3941 } else {
3942 // 2nd through nth time through the loop.
3943 // Back up start position for match by one.
3944 if (*lbStartIdx == 0) {
3945 (*lbStartIdx)--;
3946 } else {
3947 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
3948 (void)UTEXT_PREVIOUS32(fInputText);
3949 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3950 }
3951 }
3952
3953 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
3954 // We have tried all potential match starting points without
3955 // getting a match. Backtrack out, and out of the
3956 // Look Behind altogether.
3957 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3958 int64_t restoreInputLen = fData[opValue+3];
3959 U_ASSERT(restoreInputLen >= fActiveLimit);
3960 U_ASSERT(restoreInputLen <= fInputLength);
3961 fActiveLimit = restoreInputLen;
3962 break;
3963 }
3964
3965 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3966 // (successful match will fall off the end of the loop.)
3967 fp = StateSave(fp, fp->fPatIdx-3, status);
3968 fp->fInputIdx = *lbStartIdx;
3969 }
3970 break;
3971
3972 case URX_LB_END:
3973 // End of a look-behind block, after a successful match.
3974 {
3975 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
3976 if (fp->fInputIdx != fActiveLimit) {
3977 // The look-behind expression matched, but the match did not
3978 // extend all the way to the point that we are looking behind from.
3979 // FAIL out of here, which will take us back to the LB_CONT, which
3980 // will retry the match starting at another position or fail
3981 // the look-behind altogether, whichever is appropriate.
3982 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3983 break;
3984 }
3985
3986 // Look-behind match is good. Restore the original input string length,
3987 // which had been truncated to pin the end of the lookbehind match to the
3988 // position being looked-behind.
3989 int64_t originalInputLen = fData[opValue+3];
3990 U_ASSERT(originalInputLen >= fActiveLimit);
3991 U_ASSERT(originalInputLen <= fInputLength);
3992 fActiveLimit = originalInputLen;
3993 }
3994 break;
3995
3996
3997 case URX_LBN_CONT:
3998 {
3999 // Negative Look-Behind, at top of loop checking for matches of LB expression
4000 // at all possible input starting positions.
4001
4002 // Fetch the extra parameters of this op.
4003 int32_t minML = (int32_t)pat[fp->fPatIdx++];
4004 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
4005 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4006 continueLoc = URX_VAL(continueLoc);
4007 U_ASSERT(minML <= maxML);
4008 U_ASSERT(minML >= 0);
4009 U_ASSERT(continueLoc > fp->fPatIdx);
4010
4011 // Fetch (from data) the last input index where a match was attempted.
4012 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4013 int64_t *lbStartIdx = &fData[opValue+2];
4014 if (*lbStartIdx < 0) {
4015 // First time through loop.
4016 *lbStartIdx = fp->fInputIdx - minML;
4017 } else {
4018 // 2nd through nth time through the loop.
4019 // Back up start position for match by one.
4020 if (*lbStartIdx == 0) {
4021 (*lbStartIdx)--;
4022 } else {
4023 UTEXT_SETNATIVEINDEX(fInputText, *lbStartIdx);
4024 (void)UTEXT_PREVIOUS32(fInputText);
4025 *lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4026 }
4027 }
4028
4029 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
4030 // We have tried all potential match starting points without
4031 // getting a match, which means that the negative lookbehind as
4032 // a whole has succeeded. Jump forward to the continue location
4033 int64_t restoreInputLen = fData[opValue+3];
4034 U_ASSERT(restoreInputLen >= fActiveLimit);
4035 U_ASSERT(restoreInputLen <= fInputLength);
4036 fActiveLimit = restoreInputLen;
4037 fp->fPatIdx = continueLoc;
4038 break;
4039 }
4040
4041 // Save state to this URX_LBN_CONT op, so failure to match will repeat the loop.
4042 // (successful match will cause a FAIL out of the loop altogether.)
4043 fp = StateSave(fp, fp->fPatIdx-4, status);
4044 fp->fInputIdx = *lbStartIdx;
4045 }
4046 break;
4047
4048 case URX_LBN_END:
4049 // End of a negative look-behind block, after a successful match.
4050 {
4051 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4052 if (fp->fInputIdx != fActiveLimit) {
4053 // The look-behind expression matched, but the match did not
4054 // extend all the way to the point that we are looking behind from.
4055 // FAIL out of here, which will take us back to the LB_CONT, which
4056 // will retry the match starting at another position or succeed
4057 // the look-behind altogether, whichever is appropriate.
4058 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4059 break;
4060 }
4061
4062 // Look-behind expression matched, which means look-behind test as
4063 // a whole Fails
4064
4065 // Restore the original input string length, which had been truncated
4066 // in order to pin the end of the lookbehind match
4067 // to the position being looked-behind.
4068 int64_t originalInputLen = fData[opValue+3];
4069 U_ASSERT(originalInputLen >= fActiveLimit);
4070 U_ASSERT(originalInputLen <= fInputLength);
4071 fActiveLimit = originalInputLen;
4072
4073 // Restore original stack position, discarding any state saved
4074 // by the successful pattern match.
4075 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4076 int32_t newStackSize = (int32_t)fData[opValue];
4077 U_ASSERT(fStack->size() > newStackSize);
4078 fStack->setSize(newStackSize);
4079
4080 // FAIL, which will take control back to someplace
4081 // prior to entering the look-behind test.
4082 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4083 }
4084 break;
4085
4086
4087 case URX_LOOP_SR_I:
4088 // Loop Initialization for the optimized implementation of
4089 // [some character set]*
4090 // This op scans through all matching input.
4091 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4092 {
4093 U_ASSERT(opValue > 0 && opValue < sets->size());
4094 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4095 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4096
4097 // Loop through input, until either the input is exhausted or
4098 // we reach a character that is not a member of the set.
4099 int64_t ix = fp->fInputIdx;
4100 UTEXT_SETNATIVEINDEX(fInputText, ix);
4101 for (;;) {
4102 if (ix >= fActiveLimit) {
4103 fHitEnd = TRUE;
4104 break;
4105 }
4106 UChar32 c = UTEXT_NEXT32(fInputText);
4107 if (c<256) {
4108 if (s8->contains(c) == FALSE) {
4109 break;
4110 }
4111 } else {
4112 if (s->contains(c) == FALSE) {
4113 break;
4114 }
4115 }
4116 ix = UTEXT_GETNATIVEINDEX(fInputText);
4117 }
4118
4119 // If there were no matching characters, skip over the loop altogether.
4120 // The loop doesn't run at all, a * op always succeeds.
4121 if (ix == fp->fInputIdx) {
4122 fp->fPatIdx++; // skip the URX_LOOP_C op.
4123 break;
4124 }
4125
4126 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4127 // must follow. Its operand is the stack location
4128 // that holds the starting input index for the match of this [set]*
4129 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4130 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4131 int32_t stackLoc = URX_VAL(loopcOp);
4132 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4133 fp->fExtra[stackLoc] = fp->fInputIdx;
4134 fp->fInputIdx = ix;
4135
4136 // Save State to the URX_LOOP_C op that follows this one,
4137 // so that match failures in the following code will return to there.
4138 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4139 fp = StateSave(fp, fp->fPatIdx, status);
4140 fp->fPatIdx++;
4141 }
4142 break;
4143
4144
4145 case URX_LOOP_DOT_I:
4146 // Loop Initialization for the optimized implementation of .*
4147 // This op scans through all remaining input.
4148 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
4149 {
4150 // Loop through input until the input is exhausted (we reach an end-of-line)
4151 // In DOTALL mode, we can just go straight to the end of the input.
4152 int64_t ix;
4153 if ((opValue & 1) == 1) {
4154 // Dot-matches-All mode. Jump straight to the end of the string.
4155 ix = fActiveLimit;
4156 fHitEnd = TRUE;
4157 } else {
4158 // NOT DOT ALL mode. Line endings do not match '.'
4159 // Scan forward until a line ending or end of input.
4160 ix = fp->fInputIdx;
4161 UTEXT_SETNATIVEINDEX(fInputText, ix);
4162 for (;;) {
4163 if (ix >= fActiveLimit) {
4164 fHitEnd = TRUE;
4165 break;
4166 }
4167 UChar32 c = UTEXT_NEXT32(fInputText);
4168 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
4169 if ((c == 0x0a) || // 0x0a is newline in both modes.
4170 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
4171 isLineTerminator(c))) {
4172 // char is a line ending. Exit the scanning loop.
4173 break;
4174 }
4175 }
4176 ix = UTEXT_GETNATIVEINDEX(fInputText);
4177 }
4178 }
4179
4180 // If there were no matching characters, skip over the loop altogether.
4181 // The loop doesn't run at all, a * op always succeeds.
4182 if (ix == fp->fInputIdx) {
4183 fp->fPatIdx++; // skip the URX_LOOP_C op.
4184 break;
4185 }
4186
4187 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4188 // must follow. Its operand is the stack location
4189 // that holds the starting input index for the match of this .*
4190 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4191 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4192 int32_t stackLoc = URX_VAL(loopcOp);
4193 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4194 fp->fExtra[stackLoc] = fp->fInputIdx;
4195 fp->fInputIdx = ix;
4196
4197 // Save State to the URX_LOOP_C op that follows this one,
4198 // so that match failures in the following code will return to there.
4199 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4200 fp = StateSave(fp, fp->fPatIdx, status);
4201 fp->fPatIdx++;
4202 }
4203 break;
4204
4205
4206 case URX_LOOP_C:
4207 {
4208 U_ASSERT(opValue>=0 && opValue<fFrameSize);
4209 backSearchIndex = fp->fExtra[opValue];
4210 U_ASSERT(backSearchIndex <= fp->fInputIdx);
4211 if (backSearchIndex == fp->fInputIdx) {
4212 // We've backed up the input idx to the point that the loop started.
4213 // The loop is done. Leave here without saving state.
4214 // Subsequent failures won't come back here.
4215 break;
4216 }
4217 // Set up for the next iteration of the loop, with input index
4218 // backed up by one from the last time through,
4219 // and a state save to this instruction in case the following code fails again.
4220 // (We're going backwards because this loop emulates stack unwinding, not
4221 // the initial scan forward.)
4222 U_ASSERT(fp->fInputIdx > 0);
4223 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4224 UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4225 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4226
4227 UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4228 if (prevC == 0x0a &&
4229 fp->fInputIdx > backSearchIndex &&
4230 twoPrevC == 0x0d) {
4231 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4232 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4233 // .*, stepping back over CRLF pair.
4234 fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4235 }
4236 }
4237
4238
4239 fp = StateSave(fp, fp->fPatIdx-1, status);
4240 }
4241 break;
4242
4243
4244
4245 default:
4246 // Trouble. The compiled pattern contains an entry with an
4247 // unrecognized type tag.
4248 U_ASSERT(FALSE);
4249 }
4250
4251 if (U_FAILURE(status)) {
4252 isMatch = FALSE;
4253 break;
4254 }
4255 }
4256
4257 breakFromLoop:
4258 fMatch = isMatch;
4259 if (isMatch) {
4260 fLastMatchEnd = fMatchEnd;
4261 fMatchStart = startIdx;
4262 fMatchEnd = fp->fInputIdx;
4263 }
4264
4265 #ifdef REGEX_RUN_DEBUG
4266 if (fTraceDebug) {
4267 if (isMatch) {
4268 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
4269 } else {
4270 printf("No match\n\n");
4271 }
4272 }
4273 #endif
4274
4275 fFrame = fp; // The active stack frame when the engine stopped.
4276 // Contains the capture group results that we need to
4277 // access later.
4278 return;
4279 }
4280
4281
4282 //--------------------------------------------------------------------------------
4283 //
4284 // MatchChunkAt This is the actual matching engine. Like MatchAt, but with the
4285 // assumption that the entire string is available in the UText's
4286 // chunk buffer. For now, that means we can use int32_t indexes,
4287 // except for anything that needs to be saved (like group starts
4288 // and ends).
4289 //
4290 // startIdx: begin matching at this index.
4291 // toEnd: if true, match must extend to end of the input region
4292 //
4293 //--------------------------------------------------------------------------------
MatchChunkAt(int32_t startIdx,UBool toEnd,UErrorCode & status)4294 void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4295 UBool isMatch = FALSE; // True if the we have a match.
4296
4297 int32_t backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4298
4299 int32_t op; // Operation from the compiled pattern, split into
4300 int32_t opType; // the opcode
4301 int32_t opValue; // and the operand value.
4302
4303 #ifdef REGEX_RUN_DEBUG
4304 if (fTraceDebug) {
4305 printf("MatchAt(startIdx=%d)\n", startIdx);
4306 printf("Original Pattern: ");
4307 UChar32 c = utext_next32From(fPattern->fPattern, 0);
4308 while (c != U_SENTINEL) {
4309 if (c<32 || c>256) {
4310 c = '.';
4311 }
4312 printf("%c", c);
4313
4314 c = UTEXT_NEXT32(fPattern->fPattern);
4315 }
4316 printf("\n");
4317 printf("Input String: ");
4318 c = utext_next32From(fInputText, 0);
4319 while (c != U_SENTINEL) {
4320 if (c<32 || c>256) {
4321 c = '.';
4322 }
4323 printf("%c", c);
4324
4325 c = UTEXT_NEXT32(fInputText);
4326 }
4327 printf("\n");
4328 printf("\n");
4329 }
4330 #endif
4331
4332 if (U_FAILURE(status)) {
4333 return;
4334 }
4335
4336 // Cache frequently referenced items from the compiled pattern
4337 //
4338 int64_t *pat = fPattern->fCompiledPat->getBuffer();
4339
4340 const UChar *litText = fPattern->fLiteralText.getBuffer();
4341 UVector *sets = fPattern->fSets;
4342
4343 const UChar *inputBuf = fInputText->chunkContents;
4344
4345 fFrameSize = fPattern->fFrameSize;
4346 REStackFrame *fp = resetStack();
4347
4348 fp->fPatIdx = 0;
4349 fp->fInputIdx = startIdx;
4350
4351 // Zero out the pattern's static data
4352 int32_t i;
4353 for (i = 0; i<fPattern->fDataSize; i++) {
4354 fData[i] = 0;
4355 }
4356
4357 //
4358 // Main loop for interpreting the compiled pattern.
4359 // One iteration of the loop per pattern operation performed.
4360 //
4361 for (;;) {
4362 op = (int32_t)pat[fp->fPatIdx];
4363 opType = URX_TYPE(op);
4364 opValue = URX_VAL(op);
4365 #ifdef REGEX_RUN_DEBUG
4366 if (fTraceDebug) {
4367 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4368 printf("inputIdx=%ld inputChar=%x sp=%3ld activeLimit=%ld ", fp->fInputIdx,
4369 UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4370 fPattern->dumpOp(fp->fPatIdx);
4371 }
4372 #endif
4373 fp->fPatIdx++;
4374
4375 switch (opType) {
4376
4377
4378 case URX_NOP:
4379 break;
4380
4381
4382 case URX_BACKTRACK:
4383 // Force a backtrack. In some circumstances, the pattern compiler
4384 // will notice that the pattern can't possibly match anything, and will
4385 // emit one of these at that point.
4386 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4387 break;
4388
4389
4390 case URX_ONECHAR:
4391 if (fp->fInputIdx < fActiveLimit) {
4392 UChar32 c;
4393 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4394 if (c == opValue) {
4395 break;
4396 }
4397 } else {
4398 fHitEnd = TRUE;
4399 }
4400 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4401 break;
4402
4403
4404 case URX_STRING:
4405 {
4406 // Test input against a literal string.
4407 // Strings require two slots in the compiled pattern, one for the
4408 // offset to the string text, and one for the length.
4409 int32_t stringStartIdx = opValue;
4410 int32_t stringLen;
4411
4412 op = (int32_t)pat[fp->fPatIdx]; // Fetch the second operand
4413 fp->fPatIdx++;
4414 opType = URX_TYPE(op);
4415 stringLen = URX_VAL(op);
4416 U_ASSERT(opType == URX_STRING_LEN);
4417 U_ASSERT(stringLen >= 2);
4418
4419 const UChar * pInp = inputBuf + fp->fInputIdx;
4420 const UChar * pInpLimit = inputBuf + fActiveLimit;
4421 const UChar * pPat = litText+stringStartIdx;
4422 const UChar * pEnd = pInp + stringLen;
4423 UBool success = TRUE;
4424 while (pInp < pEnd) {
4425 if (pInp >= pInpLimit) {
4426 fHitEnd = TRUE;
4427 success = FALSE;
4428 break;
4429 }
4430 if (*pInp++ != *pPat++) {
4431 success = FALSE;
4432 break;
4433 }
4434 }
4435
4436 if (success) {
4437 fp->fInputIdx += stringLen;
4438 } else {
4439 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4440 }
4441 }
4442 break;
4443
4444
4445 case URX_STATE_SAVE:
4446 fp = StateSave(fp, opValue, status);
4447 break;
4448
4449
4450 case URX_END:
4451 // The match loop will exit via this path on a successful match,
4452 // when we reach the end of the pattern.
4453 if (toEnd && fp->fInputIdx != fActiveLimit) {
4454 // The pattern matched, but not to the end of input. Try some more.
4455 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4456 break;
4457 }
4458 isMatch = TRUE;
4459 goto breakFromLoop;
4460
4461 // Start and End Capture stack frame variables are laid out like this:
4462 // fp->fExtra[opValue] - The start of a completed capture group
4463 // opValue+1 - The end of a completed capture group
4464 // opValue+2 - the start of a capture group whose end
4465 // has not yet been reached (and might not ever be).
4466 case URX_START_CAPTURE:
4467 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4468 fp->fExtra[opValue+2] = fp->fInputIdx;
4469 break;
4470
4471
4472 case URX_END_CAPTURE:
4473 U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4474 U_ASSERT(fp->fExtra[opValue+2] >= 0); // Start pos for this group must be set.
4475 fp->fExtra[opValue] = fp->fExtra[opValue+2]; // Tentative start becomes real.
4476 fp->fExtra[opValue+1] = fp->fInputIdx; // End position
4477 U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4478 break;
4479
4480
4481 case URX_DOLLAR: // $, test for End of line
4482 // or for position before new line at end of input
4483 if (fp->fInputIdx < fAnchorLimit-2) {
4484 // We are no where near the end of input. Fail.
4485 // This is the common case. Keep it first.
4486 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4487 break;
4488 }
4489 if (fp->fInputIdx >= fAnchorLimit) {
4490 // We really are at the end of input. Success.
4491 fHitEnd = TRUE;
4492 fRequireEnd = TRUE;
4493 break;
4494 }
4495
4496 // If we are positioned just before a new-line that is located at the
4497 // end of input, succeed.
4498 if (fp->fInputIdx == fAnchorLimit-1) {
4499 UChar32 c;
4500 U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4501
4502 if (isLineTerminator(c)) {
4503 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4504 // At new-line at end of input. Success
4505 fHitEnd = TRUE;
4506 fRequireEnd = TRUE;
4507 break;
4508 }
4509 }
4510 } else if (fp->fInputIdx == fAnchorLimit-2 &&
4511 inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4512 fHitEnd = TRUE;
4513 fRequireEnd = TRUE;
4514 break; // At CR/LF at end of input. Success
4515 }
4516
4517 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4518
4519 break;
4520
4521
4522 case URX_DOLLAR_D: // $, test for End of Line, in UNIX_LINES mode.
4523 if (fp->fInputIdx >= fAnchorLimit-1) {
4524 // Either at the last character of input, or off the end.
4525 if (fp->fInputIdx == fAnchorLimit-1) {
4526 // At last char of input. Success if it's a new line.
4527 if (inputBuf[fp->fInputIdx] == 0x0a) {
4528 fHitEnd = TRUE;
4529 fRequireEnd = TRUE;
4530 break;
4531 }
4532 } else {
4533 // Off the end of input. Success.
4534 fHitEnd = TRUE;
4535 fRequireEnd = TRUE;
4536 break;
4537 }
4538 }
4539
4540 // Not at end of input. Back-track out.
4541 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4542 break;
4543
4544
4545 case URX_DOLLAR_M: // $, test for End of line in multi-line mode
4546 {
4547 if (fp->fInputIdx >= fAnchorLimit) {
4548 // We really are at the end of input. Success.
4549 fHitEnd = TRUE;
4550 fRequireEnd = TRUE;
4551 break;
4552 }
4553 // If we are positioned just before a new-line, succeed.
4554 // It makes no difference where the new-line is within the input.
4555 UChar32 c = inputBuf[fp->fInputIdx];
4556 if (isLineTerminator(c)) {
4557 // At a line end, except for the odd chance of being in the middle of a CR/LF sequence
4558 // In multi-line mode, hitting a new-line just before the end of input does not
4559 // set the hitEnd or requireEnd flags
4560 if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4561 break;
4562 }
4563 }
4564 // not at a new line. Fail.
4565 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4566 }
4567 break;
4568
4569
4570 case URX_DOLLAR_MD: // $, test for End of line in multi-line and UNIX_LINES mode
4571 {
4572 if (fp->fInputIdx >= fAnchorLimit) {
4573 // We really are at the end of input. Success.
4574 fHitEnd = TRUE;
4575 fRequireEnd = TRUE; // Java set requireEnd in this case, even though
4576 break; // adding a new-line would not lose the match.
4577 }
4578 // If we are not positioned just before a new-line, the test fails; backtrack out.
4579 // It makes no difference where the new-line is within the input.
4580 if (inputBuf[fp->fInputIdx] != 0x0a) {
4581 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4582 }
4583 }
4584 break;
4585
4586
4587 case URX_CARET: // ^, test for start of line
4588 if (fp->fInputIdx != fAnchorStart) {
4589 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4590 }
4591 break;
4592
4593
4594 case URX_CARET_M: // ^, test for start of line in mulit-line mode
4595 {
4596 if (fp->fInputIdx == fAnchorStart) {
4597 // We are at the start input. Success.
4598 break;
4599 }
4600 // Check whether character just before the current pos is a new-line
4601 // unless we are at the end of input
4602 UChar c = inputBuf[fp->fInputIdx - 1];
4603 if ((fp->fInputIdx < fAnchorLimit) &&
4604 isLineTerminator(c)) {
4605 // It's a new-line. ^ is true. Success.
4606 // TODO: what should be done with positions between a CR and LF?
4607 break;
4608 }
4609 // Not at the start of a line. Fail.
4610 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4611 }
4612 break;
4613
4614
4615 case URX_CARET_M_UNIX: // ^, test for start of line in mulit-line + Unix-line mode
4616 {
4617 U_ASSERT(fp->fInputIdx >= fAnchorStart);
4618 if (fp->fInputIdx <= fAnchorStart) {
4619 // We are at the start input. Success.
4620 break;
4621 }
4622 // Check whether character just before the current pos is a new-line
4623 U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4624 UChar c = inputBuf[fp->fInputIdx - 1];
4625 if (c != 0x0a) {
4626 // Not at the start of a line. Back-track out.
4627 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4628 }
4629 }
4630 break;
4631
4632 case URX_BACKSLASH_B: // Test for word boundaries
4633 {
4634 UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4635 success ^= (UBool)(opValue != 0); // flip sense for \B
4636 if (!success) {
4637 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4638 }
4639 }
4640 break;
4641
4642
4643 case URX_BACKSLASH_BU: // Test for word boundaries, Unicode-style
4644 {
4645 UBool success = isUWordBoundary(fp->fInputIdx);
4646 success ^= (UBool)(opValue != 0); // flip sense for \B
4647 if (!success) {
4648 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4649 }
4650 }
4651 break;
4652
4653
4654 case URX_BACKSLASH_D: // Test for decimal digit
4655 {
4656 if (fp->fInputIdx >= fActiveLimit) {
4657 fHitEnd = TRUE;
4658 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4659 break;
4660 }
4661
4662 UChar32 c;
4663 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4664 int8_t ctype = u_charType(c); // TODO: make a unicode set for this. Will be faster.
4665 UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4666 success ^= (UBool)(opValue != 0); // flip sense for \D
4667 if (!success) {
4668 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4669 }
4670 }
4671 break;
4672
4673
4674 case URX_BACKSLASH_G: // Test for position at end of previous match
4675 if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==FALSE && fp->fInputIdx==fActiveStart))) {
4676 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4677 }
4678 break;
4679
4680
4681 case URX_BACKSLASH_H: // Test for \h, horizontal white space.
4682 {
4683 if (fp->fInputIdx >= fActiveLimit) {
4684 fHitEnd = TRUE;
4685 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4686 break;
4687 }
4688 UChar32 c;
4689 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4690 int8_t ctype = u_charType(c);
4691 UBool success = (ctype == U_SPACE_SEPARATOR || c == 9); // SPACE_SEPARATOR || TAB
4692 success ^= (UBool)(opValue != 0); // flip sense for \H
4693 if (!success) {
4694 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4695 }
4696 }
4697 break;
4698
4699
4700 case URX_BACKSLASH_R: // Test for \R, any line break sequence.
4701 {
4702 if (fp->fInputIdx >= fActiveLimit) {
4703 fHitEnd = TRUE;
4704 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4705 break;
4706 }
4707 UChar32 c;
4708 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4709 if (isLineTerminator(c)) {
4710 if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4711 // Check for CR/LF sequence. Consume both together when found.
4712 UChar c2;
4713 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
4714 if (c2 != 0x0a) {
4715 U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
4716 }
4717 }
4718 } else {
4719 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4720 }
4721 }
4722 break;
4723
4724
4725 case URX_BACKSLASH_V: // Any single code point line ending.
4726 {
4727 if (fp->fInputIdx >= fActiveLimit) {
4728 fHitEnd = TRUE;
4729 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4730 break;
4731 }
4732 UChar32 c;
4733 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4734 UBool success = isLineTerminator(c);
4735 success ^= (UBool)(opValue != 0); // flip sense for \V
4736 if (!success) {
4737 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4738 }
4739 }
4740 break;
4741
4742
4743
4744 case URX_BACKSLASH_X:
4745 // Match a Grapheme, as defined by Unicode TR 29.
4746 // Differs slightly from Perl, which consumes combining marks independently
4747 // of context.
4748 {
4749
4750 // Fail if at end of input
4751 if (fp->fInputIdx >= fActiveLimit) {
4752 fHitEnd = TRUE;
4753 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4754 break;
4755 }
4756
4757 // Examine (and consume) the current char.
4758 // Dispatch into a little state machine, based on the char.
4759 UChar32 c;
4760 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4761 UnicodeSet **sets = fPattern->fStaticSets;
4762 if (sets[URX_GC_NORMAL]->contains(c)) goto GC_Extend;
4763 if (sets[URX_GC_CONTROL]->contains(c)) goto GC_Control;
4764 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4765 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4766 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4767 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4768 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4769 goto GC_Extend;
4770
4771
4772
4773 GC_L:
4774 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4775 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4776 if (sets[URX_GC_L]->contains(c)) goto GC_L;
4777 if (sets[URX_GC_LV]->contains(c)) goto GC_V;
4778 if (sets[URX_GC_LVT]->contains(c)) goto GC_T;
4779 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4780 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4781 goto GC_Extend;
4782
4783 GC_V:
4784 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4785 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4786 if (sets[URX_GC_V]->contains(c)) goto GC_V;
4787 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4788 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4789 goto GC_Extend;
4790
4791 GC_T:
4792 if (fp->fInputIdx >= fActiveLimit) goto GC_Done;
4793 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4794 if (sets[URX_GC_T]->contains(c)) goto GC_T;
4795 U16_PREV(inputBuf, 0, fp->fInputIdx, c);
4796 goto GC_Extend;
4797
4798 GC_Extend:
4799 // Combining characters are consumed here
4800 for (;;) {
4801 if (fp->fInputIdx >= fActiveLimit) {
4802 break;
4803 }
4804 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4805 if (sets[URX_GC_EXTEND]->contains(c) == FALSE) {
4806 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
4807 break;
4808 }
4809 }
4810 goto GC_Done;
4811
4812 GC_Control:
4813 // Most control chars stand alone (don't combine with combining chars),
4814 // except for that CR/LF sequence is a single grapheme cluster.
4815 if (c == 0x0d && fp->fInputIdx < fActiveLimit && inputBuf[fp->fInputIdx] == 0x0a) {
4816 fp->fInputIdx++;
4817 }
4818
4819 GC_Done:
4820 if (fp->fInputIdx >= fActiveLimit) {
4821 fHitEnd = TRUE;
4822 }
4823 break;
4824 }
4825
4826
4827
4828
4829 case URX_BACKSLASH_Z: // Test for end of Input
4830 if (fp->fInputIdx < fAnchorLimit) {
4831 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4832 } else {
4833 fHitEnd = TRUE;
4834 fRequireEnd = TRUE;
4835 }
4836 break;
4837
4838
4839
4840 case URX_STATIC_SETREF:
4841 {
4842 // Test input character against one of the predefined sets
4843 // (Word Characters, for example)
4844 // The high bit of the op value is a flag for the match polarity.
4845 // 0: success if input char is in set.
4846 // 1: success if input char is not in set.
4847 if (fp->fInputIdx >= fActiveLimit) {
4848 fHitEnd = TRUE;
4849 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4850 break;
4851 }
4852
4853 UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4854 opValue &= ~URX_NEG_SET;
4855 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4856
4857 UChar32 c;
4858 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4859 if (c < 256) {
4860 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4861 if (s8->contains(c)) {
4862 success = !success;
4863 }
4864 } else {
4865 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4866 if (s->contains(c)) {
4867 success = !success;
4868 }
4869 }
4870 if (!success) {
4871 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4872 }
4873 }
4874 break;
4875
4876
4877 case URX_STAT_SETREF_N:
4878 {
4879 // Test input character for NOT being a member of one of
4880 // the predefined sets (Word Characters, for example)
4881 if (fp->fInputIdx >= fActiveLimit) {
4882 fHitEnd = TRUE;
4883 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4884 break;
4885 }
4886
4887 U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4888
4889 UChar32 c;
4890 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4891 if (c < 256) {
4892 Regex8BitSet *s8 = &fPattern->fStaticSets8[opValue];
4893 if (s8->contains(c) == FALSE) {
4894 break;
4895 }
4896 } else {
4897 const UnicodeSet *s = fPattern->fStaticSets[opValue];
4898 if (s->contains(c) == FALSE) {
4899 break;
4900 }
4901 }
4902 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4903 }
4904 break;
4905
4906
4907 case URX_SETREF:
4908 {
4909 if (fp->fInputIdx >= fActiveLimit) {
4910 fHitEnd = TRUE;
4911 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4912 break;
4913 }
4914
4915 U_ASSERT(opValue > 0 && opValue < sets->size());
4916
4917 // There is input left. Pick up one char and test it for set membership.
4918 UChar32 c;
4919 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4920 if (c<256) {
4921 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4922 if (s8->contains(c)) {
4923 // The character is in the set. A Match.
4924 break;
4925 }
4926 } else {
4927 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
4928 if (s->contains(c)) {
4929 // The character is in the set. A Match.
4930 break;
4931 }
4932 }
4933
4934 // the character wasn't in the set.
4935 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4936 }
4937 break;
4938
4939
4940 case URX_DOTANY:
4941 {
4942 // . matches anything, but stops at end-of-line.
4943 if (fp->fInputIdx >= fActiveLimit) {
4944 // At end of input. Match failed. Backtrack out.
4945 fHitEnd = TRUE;
4946 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4947 break;
4948 }
4949
4950 // There is input left. Advance over one char, unless we've hit end-of-line
4951 UChar32 c;
4952 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4953 if (isLineTerminator(c)) {
4954 // End of line in normal mode. . does not match.
4955 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4956 break;
4957 }
4958 }
4959 break;
4960
4961
4962 case URX_DOTANY_ALL:
4963 {
4964 // . in dot-matches-all (including new lines) mode
4965 if (fp->fInputIdx >= fActiveLimit) {
4966 // At end of input. Match failed. Backtrack out.
4967 fHitEnd = TRUE;
4968 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4969 break;
4970 }
4971
4972 // There is input left. Advance over one char, except if we are
4973 // at a cr/lf, advance over both of them.
4974 UChar32 c;
4975 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4976 if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4977 // In the case of a CR/LF, we need to advance over both.
4978 if (inputBuf[fp->fInputIdx] == 0x0a) {
4979 U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
4980 }
4981 }
4982 }
4983 break;
4984
4985
4986 case URX_DOTANY_UNIX:
4987 {
4988 // '.' operator, matches all, but stops at end-of-line.
4989 // UNIX_LINES mode, so 0x0a is the only recognized line ending.
4990 if (fp->fInputIdx >= fActiveLimit) {
4991 // At end of input. Match failed. Backtrack out.
4992 fHitEnd = TRUE;
4993 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4994 break;
4995 }
4996
4997 // There is input left. Advance over one char, unless we've hit end-of-line
4998 UChar32 c;
4999 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5000 if (c == 0x0a) {
5001 // End of line in UNIX_LINES mode. '.' does not match the \n
5002 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5003 }
5004 }
5005 break;
5006
5007
5008 case URX_JMP:
5009 fp->fPatIdx = opValue;
5010 break;
5011
5012 case URX_FAIL:
5013 isMatch = FALSE;
5014 goto breakFromLoop;
5015
5016 case URX_JMP_SAV:
5017 U_ASSERT(opValue < fPattern->fCompiledPat->size());
5018 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
5019 fp->fPatIdx = opValue; // Then JMP.
5020 break;
5021
5022 case URX_JMP_SAV_X:
5023 // This opcode is used with (x)+, when x can match a zero length string.
5024 // Same as JMP_SAV, except conditional on the match having made forward progress.
5025 // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
5026 // data address of the input position at the start of the loop.
5027 {
5028 U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
5029 int32_t stoOp = (int32_t)pat[opValue-1];
5030 U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
5031 int32_t frameLoc = URX_VAL(stoOp);
5032 U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
5033 int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
5034 U_ASSERT(prevInputIdx <= fp->fInputIdx);
5035 if (prevInputIdx < fp->fInputIdx) {
5036 // The match did make progress. Repeat the loop.
5037 fp = StateSave(fp, fp->fPatIdx, status); // State save to loc following current
5038 fp->fPatIdx = opValue;
5039 fp->fExtra[frameLoc] = fp->fInputIdx;
5040 }
5041 // If the input position did not advance, we do nothing here,
5042 // execution will fall out of the loop.
5043 }
5044 break;
5045
5046 case URX_CTR_INIT:
5047 {
5048 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5049 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5050
5051 // Pick up the three extra operands that CTR_INIT has, and
5052 // skip the pattern location counter past them.
5053 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5054 fp->fPatIdx += 3;
5055 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5056 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5057 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5058 U_ASSERT(minCount>=0);
5059 U_ASSERT(maxCount>=minCount || maxCount==-1);
5060 U_ASSERT(loopLoc>=fp->fPatIdx);
5061
5062 if (minCount == 0) {
5063 fp = StateSave(fp, loopLoc+1, status);
5064 }
5065 if (maxCount == -1) {
5066 fp->fExtra[opValue+1] = fp->fInputIdx; // For loop breaking.
5067 } else if (maxCount == 0) {
5068 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5069 }
5070 }
5071 break;
5072
5073 case URX_CTR_LOOP:
5074 {
5075 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5076 int32_t initOp = (int32_t)pat[opValue];
5077 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
5078 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5079 int32_t minCount = (int32_t)pat[opValue+2];
5080 int32_t maxCount = (int32_t)pat[opValue+3];
5081 (*pCounter)++;
5082 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5083 U_ASSERT(*pCounter == maxCount);
5084 break;
5085 }
5086 if (*pCounter >= minCount) {
5087 if (maxCount == -1) {
5088 // Loop has no hard upper bound.
5089 // Check that it is progressing through the input, break if it is not.
5090 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5091 if (fp->fInputIdx == *pLastInputIdx) {
5092 break;
5093 } else {
5094 *pLastInputIdx = fp->fInputIdx;
5095 }
5096 }
5097 fp = StateSave(fp, fp->fPatIdx, status);
5098 }
5099 fp->fPatIdx = opValue + 4; // Loop back.
5100 }
5101 break;
5102
5103 case URX_CTR_INIT_NG:
5104 {
5105 // Initialize a non-greedy loop
5106 U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5107 fp->fExtra[opValue] = 0; // Set the loop counter variable to zero
5108
5109 // Pick up the three extra operands that CTR_INIT_NG has, and
5110 // skip the pattern location counter past them.
5111 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5112 fp->fPatIdx += 3;
5113 int32_t loopLoc = URX_VAL(pat[instrOperandLoc]);
5114 int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5115 int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5116 U_ASSERT(minCount>=0);
5117 U_ASSERT(maxCount>=minCount || maxCount==-1);
5118 U_ASSERT(loopLoc>fp->fPatIdx);
5119 if (maxCount == -1) {
5120 fp->fExtra[opValue+1] = fp->fInputIdx; // Save initial input index for loop breaking.
5121 }
5122
5123 if (minCount == 0) {
5124 if (maxCount != 0) {
5125 fp = StateSave(fp, fp->fPatIdx, status);
5126 }
5127 fp->fPatIdx = loopLoc+1; // Continue with stuff after repeated block
5128 }
5129 }
5130 break;
5131
5132 case URX_CTR_LOOP_NG:
5133 {
5134 // Non-greedy {min, max} loops
5135 U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5136 int32_t initOp = (int32_t)pat[opValue];
5137 U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5138 int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5139 int32_t minCount = (int32_t)pat[opValue+2];
5140 int32_t maxCount = (int32_t)pat[opValue+3];
5141
5142 (*pCounter)++;
5143 if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5144 // The loop has matched the maximum permitted number of times.
5145 // Break out of here with no action. Matching will
5146 // continue with the following pattern.
5147 U_ASSERT(*pCounter == maxCount);
5148 break;
5149 }
5150
5151 if (*pCounter < minCount) {
5152 // We haven't met the minimum number of matches yet.
5153 // Loop back for another one.
5154 fp->fPatIdx = opValue + 4; // Loop back.
5155 } else {
5156 // We do have the minimum number of matches.
5157
5158 // If there is no upper bound on the loop iterations, check that the input index
5159 // is progressing, and stop the loop if it is not.
5160 if (maxCount == -1) {
5161 int64_t *pLastInputIdx = &fp->fExtra[URX_VAL(initOp) + 1];
5162 if (fp->fInputIdx == *pLastInputIdx) {
5163 break;
5164 }
5165 *pLastInputIdx = fp->fInputIdx;
5166 }
5167
5168 // Loop Continuation: we will fall into the pattern following the loop
5169 // (non-greedy, don't execute loop body first), but first do
5170 // a state save to the top of the loop, so that a match failure
5171 // in the following pattern will try another iteration of the loop.
5172 fp = StateSave(fp, opValue + 4, status);
5173 }
5174 }
5175 break;
5176
5177 case URX_STO_SP:
5178 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5179 fData[opValue] = fStack->size();
5180 break;
5181
5182 case URX_LD_SP:
5183 {
5184 U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5185 int32_t newStackSize = (int32_t)fData[opValue];
5186 U_ASSERT(newStackSize <= fStack->size());
5187 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5188 if (newFP == (int64_t *)fp) {
5189 break;
5190 }
5191 int32_t i;
5192 for (i=0; i<fFrameSize; i++) {
5193 newFP[i] = ((int64_t *)fp)[i];
5194 }
5195 fp = (REStackFrame *)newFP;
5196 fStack->setSize(newStackSize);
5197 }
5198 break;
5199
5200 case URX_BACKREF:
5201 {
5202 U_ASSERT(opValue < fFrameSize);
5203 int64_t groupStartIdx = fp->fExtra[opValue];
5204 int64_t groupEndIdx = fp->fExtra[opValue+1];
5205 U_ASSERT(groupStartIdx <= groupEndIdx);
5206 int64_t inputIndex = fp->fInputIdx;
5207 if (groupStartIdx < 0) {
5208 // This capture group has not participated in the match thus far,
5209 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5210 break;
5211 }
5212 UBool success = TRUE;
5213 for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5214 if (inputIndex >= fActiveLimit) {
5215 success = FALSE;
5216 fHitEnd = TRUE;
5217 break;
5218 }
5219 if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5220 success = FALSE;
5221 break;
5222 }
5223 }
5224 if (success) {
5225 fp->fInputIdx = inputIndex;
5226 } else {
5227 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5228 }
5229 }
5230 break;
5231
5232 case URX_BACKREF_I:
5233 {
5234 U_ASSERT(opValue < fFrameSize);
5235 int64_t groupStartIdx = fp->fExtra[opValue];
5236 int64_t groupEndIdx = fp->fExtra[opValue+1];
5237 U_ASSERT(groupStartIdx <= groupEndIdx);
5238 if (groupStartIdx < 0) {
5239 // This capture group has not participated in the match thus far,
5240 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no match.
5241 break;
5242 }
5243 CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5244 CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5245
5246 // Note: if the capture group match was of an empty string the backref
5247 // match succeeds. Verified by testing: Perl matches succeed
5248 // in this case, so we do too.
5249
5250 UBool success = TRUE;
5251 for (;;) {
5252 UChar32 captureGroupChar = captureGroupItr.next();
5253 if (captureGroupChar == U_SENTINEL) {
5254 success = TRUE;
5255 break;
5256 }
5257 UChar32 inputChar = inputItr.next();
5258 if (inputChar == U_SENTINEL) {
5259 success = FALSE;
5260 fHitEnd = TRUE;
5261 break;
5262 }
5263 if (inputChar != captureGroupChar) {
5264 success = FALSE;
5265 break;
5266 }
5267 }
5268
5269 if (success && inputItr.inExpansion()) {
5270 // We obtained a match by consuming part of a string obtained from
5271 // case-folding a single code point of the input text.
5272 // This does not count as an overall match.
5273 success = FALSE;
5274 }
5275
5276 if (success) {
5277 fp->fInputIdx = inputItr.getIndex();
5278 } else {
5279 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5280 }
5281 }
5282 break;
5283
5284 case URX_STO_INP_LOC:
5285 {
5286 U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5287 fp->fExtra[opValue] = fp->fInputIdx;
5288 }
5289 break;
5290
5291 case URX_JMPX:
5292 {
5293 int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5294 fp->fPatIdx += 1;
5295 int32_t dataLoc = URX_VAL(pat[instrOperandLoc]);
5296 U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5297 int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5298 U_ASSERT(savedInputIdx <= fp->fInputIdx);
5299 if (savedInputIdx < fp->fInputIdx) {
5300 fp->fPatIdx = opValue; // JMP
5301 } else {
5302 fp = (REStackFrame *)fStack->popFrame(fFrameSize); // FAIL, no progress in loop.
5303 }
5304 }
5305 break;
5306
5307 case URX_LA_START:
5308 {
5309 // Entering a lookahead block.
5310 // Save Stack Ptr, Input Pos.
5311 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5312 fData[opValue] = fStack->size();
5313 fData[opValue+1] = fp->fInputIdx;
5314 fActiveStart = fLookStart; // Set the match region change for
5315 fActiveLimit = fLookLimit; // transparent bounds.
5316 }
5317 break;
5318
5319 case URX_LA_END:
5320 {
5321 // Leaving a look-ahead block.
5322 // restore Stack Ptr, Input Pos to positions they had on entry to block.
5323 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5324 int32_t stackSize = fStack->size();
5325 int32_t newStackSize = (int32_t)fData[opValue];
5326 U_ASSERT(stackSize >= newStackSize);
5327 if (stackSize > newStackSize) {
5328 // Copy the current top frame back to the new (cut back) top frame.
5329 // This makes the capture groups from within the look-ahead
5330 // expression available.
5331 int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5332 int32_t i;
5333 for (i=0; i<fFrameSize; i++) {
5334 newFP[i] = ((int64_t *)fp)[i];
5335 }
5336 fp = (REStackFrame *)newFP;
5337 fStack->setSize(newStackSize);
5338 }
5339 fp->fInputIdx = fData[opValue+1];
5340
5341 // Restore the active region bounds in the input string; they may have
5342 // been changed because of transparent bounds on a Region.
5343 fActiveStart = fRegionStart;
5344 fActiveLimit = fRegionLimit;
5345 }
5346 break;
5347
5348 case URX_ONECHAR_I:
5349 if (fp->fInputIdx < fActiveLimit) {
5350 UChar32 c;
5351 U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5352 if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5353 break;
5354 }
5355 } else {
5356 fHitEnd = TRUE;
5357 }
5358 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5359 break;
5360
5361 case URX_STRING_I:
5362 // Case-insensitive test input against a literal string.
5363 // Strings require two slots in the compiled pattern, one for the
5364 // offset to the string text, and one for the length.
5365 // The compiled string has already been case folded.
5366 {
5367 const UChar *patternString = litText + opValue;
5368
5369 op = (int32_t)pat[fp->fPatIdx];
5370 fp->fPatIdx++;
5371 opType = URX_TYPE(op);
5372 opValue = URX_VAL(op);
5373 U_ASSERT(opType == URX_STRING_LEN);
5374 int32_t patternStringLen = opValue; // Length of the string from the pattern.
5375
5376 UChar32 cText;
5377 UChar32 cPattern;
5378 UBool success = TRUE;
5379 int32_t patternStringIdx = 0;
5380 CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5381 while (patternStringIdx < patternStringLen) {
5382 U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5383 cText = inputIterator.next();
5384 if (cText != cPattern) {
5385 success = FALSE;
5386 if (cText == U_SENTINEL) {
5387 fHitEnd = TRUE;
5388 }
5389 break;
5390 }
5391 }
5392 if (inputIterator.inExpansion()) {
5393 success = FALSE;
5394 }
5395
5396 if (success) {
5397 fp->fInputIdx = inputIterator.getIndex();
5398 } else {
5399 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5400 }
5401 }
5402 break;
5403
5404 case URX_LB_START:
5405 {
5406 // Entering a look-behind block.
5407 // Save Stack Ptr, Input Pos.
5408 // TODO: implement transparent bounds. Ticket #6067
5409 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5410 fData[opValue] = fStack->size();
5411 fData[opValue+1] = fp->fInputIdx;
5412 // Init the variable containing the start index for attempted matches.
5413 fData[opValue+2] = -1;
5414 // Save input string length, then reset to pin any matches to end at
5415 // the current position.
5416 fData[opValue+3] = fActiveLimit;
5417 fActiveLimit = fp->fInputIdx;
5418 }
5419 break;
5420
5421
5422 case URX_LB_CONT:
5423 {
5424 // Positive Look-Behind, at top of loop checking for matches of LB expression
5425 // at all possible input starting positions.
5426
5427 // Fetch the min and max possible match lengths. They are the operands
5428 // of this op in the pattern.
5429 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5430 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5431 U_ASSERT(minML <= maxML);
5432 U_ASSERT(minML >= 0);
5433
5434 // Fetch (from data) the last input index where a match was attempted.
5435 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5436 int64_t *lbStartIdx = &fData[opValue+2];
5437 if (*lbStartIdx < 0) {
5438 // First time through loop.
5439 *lbStartIdx = fp->fInputIdx - minML;
5440 } else {
5441 // 2nd through nth time through the loop.
5442 // Back up start position for match by one.
5443 if (*lbStartIdx == 0) {
5444 (*lbStartIdx)--;
5445 } else {
5446 U16_BACK_1(inputBuf, 0, *lbStartIdx);
5447 }
5448 }
5449
5450 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5451 // We have tried all potential match starting points without
5452 // getting a match. Backtrack out, and out of the
5453 // Look Behind altogether.
5454 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5455 int64_t restoreInputLen = fData[opValue+3];
5456 U_ASSERT(restoreInputLen >= fActiveLimit);
5457 U_ASSERT(restoreInputLen <= fInputLength);
5458 fActiveLimit = restoreInputLen;
5459 break;
5460 }
5461
5462 // Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5463 // (successful match will fall off the end of the loop.)
5464 fp = StateSave(fp, fp->fPatIdx-3, status);
5465 fp->fInputIdx = *lbStartIdx;
5466 }
5467 break;
5468
5469 case URX_LB_END:
5470 // End of a look-behind block, after a successful match.
5471 {
5472 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5473 if (fp->fInputIdx != fActiveLimit) {
5474 // The look-behind expression matched, but the match did not
5475 // extend all the way to the point that we are looking behind from.
5476 // FAIL out of here, which will take us back to the LB_CONT, which
5477 // will retry the match starting at another position or fail
5478 // the look-behind altogether, whichever is appropriate.
5479 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5480 break;
5481 }
5482
5483 // Look-behind match is good. Restore the original input string length,
5484 // which had been truncated to pin the end of the lookbehind match to the
5485 // position being looked-behind.
5486 int64_t originalInputLen = fData[opValue+3];
5487 U_ASSERT(originalInputLen >= fActiveLimit);
5488 U_ASSERT(originalInputLen <= fInputLength);
5489 fActiveLimit = originalInputLen;
5490 }
5491 break;
5492
5493
5494 case URX_LBN_CONT:
5495 {
5496 // Negative Look-Behind, at top of loop checking for matches of LB expression
5497 // at all possible input starting positions.
5498
5499 // Fetch the extra parameters of this op.
5500 int32_t minML = (int32_t)pat[fp->fPatIdx++];
5501 int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5502 int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5503 continueLoc = URX_VAL(continueLoc);
5504 U_ASSERT(minML <= maxML);
5505 U_ASSERT(minML >= 0);
5506 U_ASSERT(continueLoc > fp->fPatIdx);
5507
5508 // Fetch (from data) the last input index where a match was attempted.
5509 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5510 int64_t *lbStartIdx = &fData[opValue+2];
5511 if (*lbStartIdx < 0) {
5512 // First time through loop.
5513 *lbStartIdx = fp->fInputIdx - minML;
5514 } else {
5515 // 2nd through nth time through the loop.
5516 // Back up start position for match by one.
5517 if (*lbStartIdx == 0) {
5518 (*lbStartIdx)--; // Because U16_BACK is unsafe starting at 0.
5519 } else {
5520 U16_BACK_1(inputBuf, 0, *lbStartIdx);
5521 }
5522 }
5523
5524 if (*lbStartIdx < 0 || *lbStartIdx < fp->fInputIdx - maxML) {
5525 // We have tried all potential match starting points without
5526 // getting a match, which means that the negative lookbehind as
5527 // a whole has succeeded. Jump forward to the continue location
5528 int64_t restoreInputLen = fData[opValue+3];
5529 U_ASSERT(restoreInputLen >= fActiveLimit);
5530 U_ASSERT(restoreInputLen <= fInputLength);
5531 fActiveLimit = restoreInputLen;
5532 fp->fPatIdx = continueLoc;
5533 break;
5534 }
5535
5536 // Save state to this URX_LBN_CONT op, so failure to match will repeat the loop.
5537 // (successful match will cause a FAIL out of the loop altogether.)
5538 fp = StateSave(fp, fp->fPatIdx-4, status);
5539 fp->fInputIdx = *lbStartIdx;
5540 }
5541 break;
5542
5543 case URX_LBN_END:
5544 // End of a negative look-behind block, after a successful match.
5545 {
5546 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5547 if (fp->fInputIdx != fActiveLimit) {
5548 // The look-behind expression matched, but the match did not
5549 // extend all the way to the point that we are looking behind from.
5550 // FAIL out of here, which will take us back to the LB_CONT, which
5551 // will retry the match starting at another position or succeed
5552 // the look-behind altogether, whichever is appropriate.
5553 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5554 break;
5555 }
5556
5557 // Look-behind expression matched, which means look-behind test as
5558 // a whole Fails
5559
5560 // Restore the original input string length, which had been truncated
5561 // in order to pin the end of the lookbehind match
5562 // to the position being looked-behind.
5563 int64_t originalInputLen = fData[opValue+3];
5564 U_ASSERT(originalInputLen >= fActiveLimit);
5565 U_ASSERT(originalInputLen <= fInputLength);
5566 fActiveLimit = originalInputLen;
5567
5568 // Restore original stack position, discarding any state saved
5569 // by the successful pattern match.
5570 U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5571 int32_t newStackSize = (int32_t)fData[opValue];
5572 U_ASSERT(fStack->size() > newStackSize);
5573 fStack->setSize(newStackSize);
5574
5575 // FAIL, which will take control back to someplace
5576 // prior to entering the look-behind test.
5577 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5578 }
5579 break;
5580
5581
5582 case URX_LOOP_SR_I:
5583 // Loop Initialization for the optimized implementation of
5584 // [some character set]*
5585 // This op scans through all matching input.
5586 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5587 {
5588 U_ASSERT(opValue > 0 && opValue < sets->size());
5589 Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5590 UnicodeSet *s = (UnicodeSet *)sets->elementAt(opValue);
5591
5592 // Loop through input, until either the input is exhausted or
5593 // we reach a character that is not a member of the set.
5594 int32_t ix = (int32_t)fp->fInputIdx;
5595 for (;;) {
5596 if (ix >= fActiveLimit) {
5597 fHitEnd = TRUE;
5598 break;
5599 }
5600 UChar32 c;
5601 U16_NEXT(inputBuf, ix, fActiveLimit, c);
5602 if (c<256) {
5603 if (s8->contains(c) == FALSE) {
5604 U16_BACK_1(inputBuf, 0, ix);
5605 break;
5606 }
5607 } else {
5608 if (s->contains(c) == FALSE) {
5609 U16_BACK_1(inputBuf, 0, ix);
5610 break;
5611 }
5612 }
5613 }
5614
5615 // If there were no matching characters, skip over the loop altogether.
5616 // The loop doesn't run at all, a * op always succeeds.
5617 if (ix == fp->fInputIdx) {
5618 fp->fPatIdx++; // skip the URX_LOOP_C op.
5619 break;
5620 }
5621
5622 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5623 // must follow. Its operand is the stack location
5624 // that holds the starting input index for the match of this [set]*
5625 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5626 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5627 int32_t stackLoc = URX_VAL(loopcOp);
5628 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5629 fp->fExtra[stackLoc] = fp->fInputIdx;
5630 fp->fInputIdx = ix;
5631
5632 // Save State to the URX_LOOP_C op that follows this one,
5633 // so that match failures in the following code will return to there.
5634 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5635 fp = StateSave(fp, fp->fPatIdx, status);
5636 fp->fPatIdx++;
5637 }
5638 break;
5639
5640
5641 case URX_LOOP_DOT_I:
5642 // Loop Initialization for the optimized implementation of .*
5643 // This op scans through all remaining input.
5644 // The following LOOP_C op emulates stack unwinding if the following pattern fails.
5645 {
5646 // Loop through input until the input is exhausted (we reach an end-of-line)
5647 // In DOTALL mode, we can just go straight to the end of the input.
5648 int32_t ix;
5649 if ((opValue & 1) == 1) {
5650 // Dot-matches-All mode. Jump straight to the end of the string.
5651 ix = (int32_t)fActiveLimit;
5652 fHitEnd = TRUE;
5653 } else {
5654 // NOT DOT ALL mode. Line endings do not match '.'
5655 // Scan forward until a line ending or end of input.
5656 ix = (int32_t)fp->fInputIdx;
5657 for (;;) {
5658 if (ix >= fActiveLimit) {
5659 fHitEnd = TRUE;
5660 break;
5661 }
5662 UChar32 c;
5663 U16_NEXT(inputBuf, ix, fActiveLimit, c); // c = inputBuf[ix++]
5664 if ((c & 0x7f) <= 0x29) { // Fast filter of non-new-line-s
5665 if ((c == 0x0a) || // 0x0a is newline in both modes.
5666 (((opValue & 2) == 0) && // IF not UNIX_LINES mode
5667 isLineTerminator(c))) {
5668 // char is a line ending. Put the input pos back to the
5669 // line ending char, and exit the scanning loop.
5670 U16_BACK_1(inputBuf, 0, ix);
5671 break;
5672 }
5673 }
5674 }
5675 }
5676
5677 // If there were no matching characters, skip over the loop altogether.
5678 // The loop doesn't run at all, a * op always succeeds.
5679 if (ix == fp->fInputIdx) {
5680 fp->fPatIdx++; // skip the URX_LOOP_C op.
5681 break;
5682 }
5683
5684 // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5685 // must follow. Its operand is the stack location
5686 // that holds the starting input index for the match of this .*
5687 int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5688 U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5689 int32_t stackLoc = URX_VAL(loopcOp);
5690 U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5691 fp->fExtra[stackLoc] = fp->fInputIdx;
5692 fp->fInputIdx = ix;
5693
5694 // Save State to the URX_LOOP_C op that follows this one,
5695 // so that match failures in the following code will return to there.
5696 // Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5697 fp = StateSave(fp, fp->fPatIdx, status);
5698 fp->fPatIdx++;
5699 }
5700 break;
5701
5702
5703 case URX_LOOP_C:
5704 {
5705 U_ASSERT(opValue>=0 && opValue<fFrameSize);
5706 backSearchIndex = (int32_t)fp->fExtra[opValue];
5707 U_ASSERT(backSearchIndex <= fp->fInputIdx);
5708 if (backSearchIndex == fp->fInputIdx) {
5709 // We've backed up the input idx to the point that the loop started.
5710 // The loop is done. Leave here without saving state.
5711 // Subsequent failures won't come back here.
5712 break;
5713 }
5714 // Set up for the next iteration of the loop, with input index
5715 // backed up by one from the last time through,
5716 // and a state save to this instruction in case the following code fails again.
5717 // (We're going backwards because this loop emulates stack unwinding, not
5718 // the initial scan forward.)
5719 U_ASSERT(fp->fInputIdx > 0);
5720 UChar32 prevC;
5721 U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5722
5723 if (prevC == 0x0a &&
5724 fp->fInputIdx > backSearchIndex &&
5725 inputBuf[fp->fInputIdx-1] == 0x0d) {
5726 int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5727 if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5728 // .*, stepping back over CRLF pair.
5729 U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5730 }
5731 }
5732
5733
5734 fp = StateSave(fp, fp->fPatIdx-1, status);
5735 }
5736 break;
5737
5738
5739
5740 default:
5741 // Trouble. The compiled pattern contains an entry with an
5742 // unrecognized type tag.
5743 U_ASSERT(FALSE);
5744 }
5745
5746 if (U_FAILURE(status)) {
5747 isMatch = FALSE;
5748 break;
5749 }
5750 }
5751
5752 breakFromLoop:
5753 fMatch = isMatch;
5754 if (isMatch) {
5755 fLastMatchEnd = fMatchEnd;
5756 fMatchStart = startIdx;
5757 fMatchEnd = fp->fInputIdx;
5758 }
5759
5760 #ifdef REGEX_RUN_DEBUG
5761 if (fTraceDebug) {
5762 if (isMatch) {
5763 printf("Match. start=%ld end=%ld\n\n", fMatchStart, fMatchEnd);
5764 } else {
5765 printf("No match\n\n");
5766 }
5767 }
5768 #endif
5769
5770 fFrame = fp; // The active stack frame when the engine stopped.
5771 // Contains the capture group results that we need to
5772 // access later.
5773
5774 return;
5775 }
5776
5777
5778 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5779
5780 U_NAMESPACE_END
5781
5782 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
5783