1 //
2 // file: regexcmp.cpp
3 //
4 // Copyright (C) 2002-2015 International Business Machines Corporation and others.
5 // All Rights Reserved.
6 //
7 // This file contains the ICU regular expression compiler, which is responsible
8 // for processing a regular expression pattern into the compiled form that
9 // is used by the match finding engine.
10 //
11
12 #include "unicode/utypes.h"
13
14 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
15
16 #include "unicode/ustring.h"
17 #include "unicode/unistr.h"
18 #include "unicode/uniset.h"
19 #include "unicode/uchar.h"
20 #include "unicode/uchriter.h"
21 #include "unicode/parsepos.h"
22 #include "unicode/parseerr.h"
23 #include "unicode/regex.h"
24 #include "unicode/utf.h"
25 #include "unicode/utf16.h"
26 #include "patternprops.h"
27 #include "putilimp.h"
28 #include "cmemory.h"
29 #include "cstring.h"
30 #include "uvectr32.h"
31 #include "uvectr64.h"
32 #include "uassert.h"
33 #include "uinvchar.h"
34
35 #include "regeximp.h"
36 #include "regexcst.h" // Contains state table for the regex pattern parser.
37 // generated by a Perl script.
38 #include "regexcmp.h"
39 #include "regexst.h"
40 #include "regextxt.h"
41
42
43
44 U_NAMESPACE_BEGIN
45
46
47 //------------------------------------------------------------------------------
48 //
49 // Constructor.
50 //
51 //------------------------------------------------------------------------------
RegexCompile(RegexPattern * rxp,UErrorCode & status)52 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
53 fParenStack(status), fSetStack(status), fSetOpStack(status)
54 {
55 // Lazy init of all shared global sets (needed for init()'s empty text)
56 RegexStaticSets::initGlobals(&status);
57
58 fStatus = &status;
59
60 fRXPat = rxp;
61 fScanIndex = 0;
62 fLastChar = -1;
63 fPeekChar = -1;
64 fLineNum = 1;
65 fCharNum = 0;
66 fQuoteMode = FALSE;
67 fInBackslashQuote = FALSE;
68 fModeFlags = fRXPat->fFlags | 0x80000000;
69 fEOLComments = TRUE;
70
71 fMatchOpenParen = -1;
72 fMatchCloseParen = -1;
73 fCaptureName = NULL;
74
75 if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
76 status = rxp->fDeferredStatus;
77 }
78 }
79
80 static const UChar chAmp = 0x26; // '&'
81 static const UChar chDash = 0x2d; // '-'
82
83
84 //------------------------------------------------------------------------------
85 //
86 // Destructor
87 //
88 //------------------------------------------------------------------------------
~RegexCompile()89 RegexCompile::~RegexCompile() {
90 delete fCaptureName; // Normally will be NULL, but can exist if pattern
91 // compilation stops with a syntax error.
92 }
93
addCategory(UnicodeSet * set,int32_t value,UErrorCode & ec)94 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
95 set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
96 }
97
98 //------------------------------------------------------------------------------
99 //
100 // Compile regex pattern. The state machine for rexexp pattern parsing is here.
101 // The state tables are hand-written in the file regexcst.txt,
102 // and converted to the form used here by a perl
103 // script regexcst.pl
104 //
105 //------------------------------------------------------------------------------
compile(const UnicodeString & pat,UParseError & pp,UErrorCode & e)106 void RegexCompile::compile(
107 const UnicodeString &pat, // Source pat to be compiled.
108 UParseError &pp, // Error position info
109 UErrorCode &e) // Error Code
110 {
111 fRXPat->fPatternString = new UnicodeString(pat);
112 UText patternText = UTEXT_INITIALIZER;
113 utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
114
115 if (U_SUCCESS(e)) {
116 compile(&patternText, pp, e);
117 utext_close(&patternText);
118 }
119 }
120
121 //
122 // compile, UText mode
123 // All the work is actually done here.
124 //
compile(UText * pat,UParseError & pp,UErrorCode & e)125 void RegexCompile::compile(
126 UText *pat, // Source pat to be compiled.
127 UParseError &pp, // Error position info
128 UErrorCode &e) // Error Code
129 {
130 fStatus = &e;
131 fParseErr = &pp;
132 fStackPtr = 0;
133 fStack[fStackPtr] = 0;
134
135 if (U_FAILURE(*fStatus)) {
136 return;
137 }
138
139 // There should be no pattern stuff in the RegexPattern object. They can not be reused.
140 U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
141
142 // Prepare the RegexPattern object to receive the compiled pattern.
143 fRXPat->fPattern = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
144 if (U_FAILURE(*fStatus)) {
145 return;
146 }
147 fRXPat->fStaticSets = RegexStaticSets::gStaticSets->fPropSets;
148 fRXPat->fStaticSets8 = RegexStaticSets::gStaticSets->fPropSets8;
149
150
151 // Initialize the pattern scanning state machine
152 fPatternLength = utext_nativeLength(pat);
153 uint16_t state = 1;
154 const RegexTableEl *tableEl;
155
156 // UREGEX_LITERAL force entire pattern to be treated as a literal string.
157 if (fModeFlags & UREGEX_LITERAL) {
158 fQuoteMode = TRUE;
159 }
160
161 nextChar(fC); // Fetch the first char from the pattern string.
162
163 //
164 // Main loop for the regex pattern parsing state machine.
165 // Runs once per state transition.
166 // Each time through optionally performs, depending on the state table,
167 // - an advance to the the next pattern char
168 // - an action to be performed.
169 // - pushing or popping a state to/from the local state return stack.
170 // file regexcst.txt is the source for the state table. The logic behind
171 // recongizing the pattern syntax is there, not here.
172 //
173 for (;;) {
174 // Bail out if anything has gone wrong.
175 // Regex pattern parsing stops on the first error encountered.
176 if (U_FAILURE(*fStatus)) {
177 break;
178 }
179
180 U_ASSERT(state != 0);
181
182 // Find the state table element that matches the input char from the pattern, or the
183 // class of the input character. Start with the first table row for this
184 // state, then linearly scan forward until we find a row that matches the
185 // character. The last row for each state always matches all characters, so
186 // the search will stop there, if not before.
187 //
188 tableEl = &gRuleParseStateTable[state];
189 REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d) state=%s ",
190 fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
191
192 for (;;) { // loop through table rows belonging to this state, looking for one
193 // that matches the current input char.
194 REGEX_SCAN_DEBUG_PRINTF(("."));
195 if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE && tableEl->fCharClass == fC.fChar) {
196 // Table row specified an individual character, not a set, and
197 // the input character is not quoted, and
198 // the input character matched it.
199 break;
200 }
201 if (tableEl->fCharClass == 255) {
202 // Table row specified default, match anything character class.
203 break;
204 }
205 if (tableEl->fCharClass == 254 && fC.fQuoted) {
206 // Table row specified "quoted" and the char was quoted.
207 break;
208 }
209 if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1) {
210 // Table row specified eof and we hit eof on the input.
211 break;
212 }
213
214 if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 && // Table specs a char class &&
215 fC.fQuoted == FALSE && // char is not escaped &&
216 fC.fChar != (UChar32)-1) { // char is not EOF
217 U_ASSERT(tableEl->fCharClass <= 137);
218 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
219 // Table row specified a character class, or set of characters,
220 // and the current char matches it.
221 break;
222 }
223 }
224
225 // No match on this row, advance to the next row for this state,
226 tableEl++;
227 }
228 REGEX_SCAN_DEBUG_PRINTF(("\n"));
229
230 //
231 // We've found the row of the state table that matches the current input
232 // character from the rules string.
233 // Perform any action specified by this row in the state table.
234 if (doParseActions(tableEl->fAction) == FALSE) {
235 // Break out of the state machine loop if the
236 // the action signalled some kind of error, or
237 // the action was to exit, occurs on normal end-of-rules-input.
238 break;
239 }
240
241 if (tableEl->fPushState != 0) {
242 fStackPtr++;
243 if (fStackPtr >= kStackSize) {
244 error(U_REGEX_INTERNAL_ERROR);
245 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
246 fStackPtr--;
247 }
248 fStack[fStackPtr] = tableEl->fPushState;
249 }
250
251 //
252 // NextChar. This is where characters are actually fetched from the pattern.
253 // Happens under control of the 'n' tag in the state table.
254 //
255 if (tableEl->fNextChar) {
256 nextChar(fC);
257 }
258
259 // Get the next state from the table entry, or from the
260 // state stack if the next state was specified as "pop".
261 if (tableEl->fNextState != 255) {
262 state = tableEl->fNextState;
263 } else {
264 state = fStack[fStackPtr];
265 fStackPtr--;
266 if (fStackPtr < 0) {
267 // state stack underflow
268 // This will occur if the user pattern has mis-matched parentheses,
269 // with extra close parens.
270 //
271 fStackPtr++;
272 error(U_REGEX_MISMATCHED_PAREN);
273 }
274 }
275
276 }
277
278 if (U_FAILURE(*fStatus)) {
279 // Bail out if the pattern had errors.
280 // Set stack cleanup: a successful compile would have left it empty,
281 // but errors can leave temporary sets hanging around.
282 while (!fSetStack.empty()) {
283 delete (UnicodeSet *)fSetStack.pop();
284 }
285 return;
286 }
287
288 //
289 // The pattern has now been read and processed, and the compiled code generated.
290 //
291
292 //
293 // The pattern's fFrameSize so far has accumulated the requirements for
294 // storage for capture parentheses, counters, etc. that are encountered
295 // in the pattern. Add space for the two variables that are always
296 // present in the saved state: the input string position (int64_t) and
297 // the position in the compiled pattern.
298 //
299 allocateStackData(RESTACKFRAME_HDRCOUNT);
300
301 //
302 // Optimization pass 1: NOPs, back-references, and case-folding
303 //
304 stripNOPs();
305
306 //
307 // Get bounds for the minimum and maximum length of a string that this
308 // pattern can match. Used to avoid looking for matches in strings that
309 // are too short.
310 //
311 fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
312
313 //
314 // Optimization pass 2: match start type
315 //
316 matchStartType();
317
318 //
319 // Set up fast latin-1 range sets
320 //
321 int32_t numSets = fRXPat->fSets->size();
322 fRXPat->fSets8 = new Regex8BitSet[numSets];
323 // Null pointer check.
324 if (fRXPat->fSets8 == NULL) {
325 e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
326 return;
327 }
328 int32_t i;
329 for (i=0; i<numSets; i++) {
330 UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
331 fRXPat->fSets8[i].init(s);
332 }
333
334 }
335
336
337
338
339
340 //------------------------------------------------------------------------------
341 //
342 // doParseAction Do some action during regex pattern parsing.
343 // Called by the parse state machine.
344 //
345 // Generation of the match engine PCode happens here, or
346 // in functions called from the parse actions defined here.
347 //
348 //
349 //------------------------------------------------------------------------------
doParseActions(int32_t action)350 UBool RegexCompile::doParseActions(int32_t action)
351 {
352 UBool returnVal = TRUE;
353
354 switch ((Regex_PatternParseAction)action) {
355
356 case doPatStart:
357 // Start of pattern compiles to:
358 //0 SAVE 2 Fall back to position of FAIL
359 //1 jmp 3
360 //2 FAIL Stop if we ever reach here.
361 //3 NOP Dummy, so start of pattern looks the same as
362 // the start of an ( grouping.
363 //4 NOP Resreved, will be replaced by a save if there are
364 // OR | operators at the top level
365 appendOp(URX_STATE_SAVE, 2);
366 appendOp(URX_JMP, 3);
367 appendOp(URX_FAIL, 0);
368
369 // Standard open nonCapture paren action emits the two NOPs and
370 // sets up the paren stack frame.
371 doParseActions(doOpenNonCaptureParen);
372 break;
373
374 case doPatFinish:
375 // We've scanned to the end of the pattern
376 // The end of pattern compiles to:
377 // URX_END
378 // which will stop the runtime match engine.
379 // Encountering end of pattern also behaves like a close paren,
380 // and forces fixups of the State Save at the beginning of the compiled pattern
381 // and of any OR operations at the top level.
382 //
383 handleCloseParen();
384 if (fParenStack.size() > 0) {
385 // Missing close paren in pattern.
386 error(U_REGEX_MISMATCHED_PAREN);
387 }
388
389 // add the END operation to the compiled pattern.
390 appendOp(URX_END, 0);
391
392 // Terminate the pattern compilation state machine.
393 returnVal = FALSE;
394 break;
395
396
397
398 case doOrOperator:
399 // Scanning a '|', as in (A|B)
400 {
401 // Generate code for any pending literals preceding the '|'
402 fixLiterals(FALSE);
403
404 // Insert a SAVE operation at the start of the pattern section preceding
405 // this OR at this level. This SAVE will branch the match forward
406 // to the right hand side of the OR in the event that the left hand
407 // side fails to match and backtracks. Locate the position for the
408 // save from the location on the top of the parentheses stack.
409 int32_t savePosition = fParenStack.popi();
410 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
411 U_ASSERT(URX_TYPE(op) == URX_NOP); // original contents of reserved location
412 op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
413 fRXPat->fCompiledPat->setElementAt(op, savePosition);
414
415 // Append an JMP operation into the compiled pattern. The operand for
416 // the JMP will eventually be the location following the ')' for the
417 // group. This will be patched in later, when the ')' is encountered.
418 appendOp(URX_JMP, 0);
419
420 // Push the position of the newly added JMP op onto the parentheses stack.
421 // This registers if for fixup when this block's close paren is encountered.
422 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
423
424 // Append a NOP to the compiled pattern. This is the slot reserved
425 // for a SAVE in the event that there is yet another '|' following
426 // this one.
427 appendOp(URX_NOP, 0);
428 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
429 }
430 break;
431
432
433 case doBeginNamedCapture:
434 // Scanning (?<letter.
435 // The first letter of the name will come through again under doConinueNamedCapture.
436 fCaptureName = new UnicodeString();
437 if (fCaptureName == NULL) {
438 error(U_MEMORY_ALLOCATION_ERROR);
439 }
440 break;
441
442 case doContinueNamedCapture:
443 fCaptureName->append(fC.fChar);
444 break;
445
446 case doBadNamedCapture:
447 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
448 break;
449
450 case doOpenCaptureParen:
451 // Open Capturing Paren, possibly named.
452 // Compile to a
453 // - NOP, which later may be replaced by a save-state if the
454 // parenthesized group gets a * quantifier, followed by
455 // - START_CAPTURE n where n is stack frame offset to the capture group variables.
456 // - NOP, which may later be replaced by a save-state if there
457 // is an '|' alternation within the parens.
458 //
459 // Each capture group gets three slots in the save stack frame:
460 // 0: Capture Group start position (in input string being matched.)
461 // 1: Capture Group end position.
462 // 2: Start of Match-in-progress.
463 // The first two locations are for a completed capture group, and are
464 // referred to by back references and the like.
465 // The third location stores the capture start position when an START_CAPTURE is
466 // encountered. This will be promoted to a completed capture when (and if) the corresponding
467 // END_CAPTURE is encountered.
468 {
469 fixLiterals();
470 appendOp(URX_NOP, 0);
471 int32_t varsLoc = allocateStackData(3); // Reserve three slots in match stack frame.
472 appendOp(URX_START_CAPTURE, varsLoc);
473 appendOp(URX_NOP, 0);
474
475 // On the Parentheses stack, start a new frame and add the postions
476 // of the two NOPs. Depending on what follows in the pattern, the
477 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
478 // address of the end of the parenthesized group.
479 fParenStack.push(fModeFlags, *fStatus); // Match mode state
480 fParenStack.push(capturing, *fStatus); // Frame type.
481 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP location
482 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
483
484 // Save the mapping from group number to stack frame variable position.
485 fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
486
487 // If this is a named capture group, add the name->group number mapping.
488 if (fCaptureName != NULL) {
489 int32_t groupNumber = fRXPat->fGroupMap->size();
490 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
491 fCaptureName = NULL; // hash table takes ownership of the name (key) string.
492 if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
493 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
494 }
495 }
496 }
497 break;
498
499 case doOpenNonCaptureParen:
500 // Open non-caputuring (grouping only) Paren.
501 // Compile to a
502 // - NOP, which later may be replaced by a save-state if the
503 // parenthesized group gets a * quantifier, followed by
504 // - NOP, which may later be replaced by a save-state if there
505 // is an '|' alternation within the parens.
506 {
507 fixLiterals();
508 appendOp(URX_NOP, 0);
509 appendOp(URX_NOP, 0);
510
511 // On the Parentheses stack, start a new frame and add the postions
512 // of the two NOPs.
513 fParenStack.push(fModeFlags, *fStatus); // Match mode state
514 fParenStack.push(plain, *fStatus); // Begin a new frame.
515 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
516 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP loc
517 }
518 break;
519
520
521 case doOpenAtomicParen:
522 // Open Atomic Paren. (?>
523 // Compile to a
524 // - NOP, which later may be replaced if the parenthesized group
525 // has a quantifier, followed by
526 // - STO_SP save state stack position, so it can be restored at the ")"
527 // - NOP, which may later be replaced by a save-state if there
528 // is an '|' alternation within the parens.
529 {
530 fixLiterals();
531 appendOp(URX_NOP, 0);
532 int32_t varLoc = allocateData(1); // Reserve a data location for saving the state stack ptr.
533 appendOp(URX_STO_SP, varLoc);
534 appendOp(URX_NOP, 0);
535
536 // On the Parentheses stack, start a new frame and add the postions
537 // of the two NOPs. Depending on what follows in the pattern, the
538 // NOPs may be changed to SAVE_STATE or JMP ops, with a target
539 // address of the end of the parenthesized group.
540 fParenStack.push(fModeFlags, *fStatus); // Match mode state
541 fParenStack.push(atomic, *fStatus); // Frame type.
542 fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus); // The first NOP
543 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
544 }
545 break;
546
547
548 case doOpenLookAhead:
549 // Positive Look-ahead (?= stuff )
550 //
551 // Note: Addition of transparent input regions, with the need to
552 // restore the original regions when failing out of a lookahead
553 // block, complicated this sequence. Some conbined opcodes
554 // might make sense - or might not, lookahead aren't that common.
555 //
556 // Caution: min match length optimization knows about this
557 // sequence; don't change without making updates there too.
558 //
559 // Compiles to
560 // 1 START_LA dataLoc Saves SP, Input Pos
561 // 2. STATE_SAVE 4 on failure of lookahead, goto 4
562 // 3 JMP 6 continue ...
563 //
564 // 4. LA_END Look Ahead failed. Restore regions.
565 // 5. BACKTRACK and back track again.
566 //
567 // 6. NOP reserved for use by quantifiers on the block.
568 // Look-ahead can't have quantifiers, but paren stack
569 // compile time conventions require the slot anyhow.
570 // 7. NOP may be replaced if there is are '|' ops in the block.
571 // 8. code for parenthesized stuff.
572 // 9. LA_END
573 //
574 // Two data slots are reserved, for saving the stack ptr and the input position.
575 {
576 fixLiterals();
577 int32_t dataLoc = allocateData(2);
578 appendOp(URX_LA_START, dataLoc);
579 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
580 appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
581 appendOp(URX_LA_END, dataLoc);
582 appendOp(URX_BACKTRACK, 0);
583 appendOp(URX_NOP, 0);
584 appendOp(URX_NOP, 0);
585
586 // On the Parentheses stack, start a new frame and add the postions
587 // of the NOPs.
588 fParenStack.push(fModeFlags, *fStatus); // Match mode state
589 fParenStack.push(lookAhead, *fStatus); // Frame type.
590 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
591 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
592 }
593 break;
594
595 case doOpenLookAheadNeg:
596 // Negated Lookahead. (?! stuff )
597 // Compiles to
598 // 1. START_LA dataloc
599 // 2. SAVE_STATE 7 // Fail within look-ahead block restores to this state,
600 // // which continues with the match.
601 // 3. NOP // Std. Open Paren sequence, for possible '|'
602 // 4. code for parenthesized stuff.
603 // 5. END_LA // Cut back stack, remove saved state from step 2.
604 // 6. BACKTRACK // code in block succeeded, so neg. lookahead fails.
605 // 7. END_LA // Restore match region, in case look-ahead was using
606 // an alternate (transparent) region.
607 {
608 fixLiterals();
609 int32_t dataLoc = allocateData(2);
610 appendOp(URX_LA_START, dataLoc);
611 appendOp(URX_STATE_SAVE, 0); // dest address will be patched later.
612 appendOp(URX_NOP, 0);
613
614 // On the Parentheses stack, start a new frame and add the postions
615 // of the StateSave and NOP.
616 fParenStack.push(fModeFlags, *fStatus); // Match mode state
617 fParenStack.push(negLookAhead, *fStatus); // Frame type
618 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The STATE_SAVE location
619 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP location
620
621 // Instructions #5 - #7 will be added when the ')' is encountered.
622 }
623 break;
624
625 case doOpenLookBehind:
626 {
627 // Compile a (?<= look-behind open paren.
628 //
629 // Compiles to
630 // 0 URX_LB_START dataLoc
631 // 1 URX_LB_CONT dataLoc
632 // 2 MinMatchLen
633 // 3 MaxMatchLen
634 // 4 URX_NOP Standard '(' boilerplate.
635 // 5 URX_NOP Reserved slot for use with '|' ops within (block).
636 // 6 <code for LookBehind expression>
637 // 7 URX_LB_END dataLoc # Check match len, restore input len
638 // 8 URX_LA_END dataLoc # Restore stack, input pos
639 //
640 // Allocate a block of matcher data, to contain (when running a match)
641 // 0: Stack ptr on entry
642 // 1: Input Index on entry
643 // 2: Start index of match current match attempt.
644 // 3: Original Input String len.
645
646 // Generate match code for any pending literals.
647 fixLiterals();
648
649 // Allocate data space
650 int32_t dataLoc = allocateData(4);
651
652 // Emit URX_LB_START
653 appendOp(URX_LB_START, dataLoc);
654
655 // Emit URX_LB_CONT
656 appendOp(URX_LB_CONT, dataLoc);
657 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later.
658 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later.
659
660 // Emit the NOPs
661 appendOp(URX_NOP, 0);
662 appendOp(URX_NOP, 0);
663
664 // On the Parentheses stack, start a new frame and add the postions
665 // of the URX_LB_CONT and the NOP.
666 fParenStack.push(fModeFlags, *fStatus); // Match mode state
667 fParenStack.push(lookBehind, *fStatus); // Frame type
668 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
669 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
670
671 // The final two instructions will be added when the ')' is encountered.
672 }
673
674 break;
675
676 case doOpenLookBehindNeg:
677 {
678 // Compile a (?<! negated look-behind open paren.
679 //
680 // Compiles to
681 // 0 URX_LB_START dataLoc # Save entry stack, input len
682 // 1 URX_LBN_CONT dataLoc # Iterate possible match positions
683 // 2 MinMatchLen
684 // 3 MaxMatchLen
685 // 4 continueLoc (9)
686 // 5 URX_NOP Standard '(' boilerplate.
687 // 6 URX_NOP Reserved slot for use with '|' ops within (block).
688 // 7 <code for LookBehind expression>
689 // 8 URX_LBN_END dataLoc # Check match len, cause a FAIL
690 // 9 ...
691 //
692 // Allocate a block of matcher data, to contain (when running a match)
693 // 0: Stack ptr on entry
694 // 1: Input Index on entry
695 // 2: Start index of match current match attempt.
696 // 3: Original Input String len.
697
698 // Generate match code for any pending literals.
699 fixLiterals();
700
701 // Allocate data space
702 int32_t dataLoc = allocateData(4);
703
704 // Emit URX_LB_START
705 appendOp(URX_LB_START, dataLoc);
706
707 // Emit URX_LBN_CONT
708 appendOp(URX_LBN_CONT, dataLoc);
709 appendOp(URX_RESERVED_OP, 0); // MinMatchLength. To be filled later.
710 appendOp(URX_RESERVED_OP, 0); // MaxMatchLength. To be filled later.
711 appendOp(URX_RESERVED_OP, 0); // Continue Loc. To be filled later.
712
713 // Emit the NOPs
714 appendOp(URX_NOP, 0);
715 appendOp(URX_NOP, 0);
716
717 // On the Parentheses stack, start a new frame and add the postions
718 // of the URX_LB_CONT and the NOP.
719 fParenStack.push(fModeFlags, *fStatus); // Match mode state
720 fParenStack.push(lookBehindN, *fStatus); // Frame type
721 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP location
722 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The 2nd NOP location
723
724 // The final two instructions will be added when the ')' is encountered.
725 }
726 break;
727
728 case doConditionalExpr:
729 // Conditionals such as (?(1)a:b)
730 case doPerlInline:
731 // Perl inline-condtionals. (?{perl code}a|b) We're not perl, no way to do them.
732 error(U_REGEX_UNIMPLEMENTED);
733 break;
734
735
736 case doCloseParen:
737 handleCloseParen();
738 if (fParenStack.size() <= 0) {
739 // Extra close paren, or missing open paren.
740 error(U_REGEX_MISMATCHED_PAREN);
741 }
742 break;
743
744 case doNOP:
745 break;
746
747
748 case doBadOpenParenType:
749 case doRuleError:
750 error(U_REGEX_RULE_SYNTAX);
751 break;
752
753
754 case doMismatchedParenErr:
755 error(U_REGEX_MISMATCHED_PAREN);
756 break;
757
758 case doPlus:
759 // Normal '+' compiles to
760 // 1. stuff to be repeated (already built)
761 // 2. jmp-sav 1
762 // 3. ...
763 //
764 // Or, if the item to be repeated can match a zero length string,
765 // 1. STO_INP_LOC data-loc
766 // 2. body of stuff to be repeated
767 // 3. JMP_SAV_X 2
768 // 4. ...
769
770 //
771 // Or, if the item to be repeated is simple
772 // 1. Item to be repeated.
773 // 2. LOOP_SR_I set number (assuming repeated item is a set ref)
774 // 3. LOOP_C stack location
775 {
776 int32_t topLoc = blockTopLoc(FALSE); // location of item #1
777 int32_t frameLoc;
778
779 // Check for simple constructs, which may get special optimized code.
780 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
781 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
782
783 if (URX_TYPE(repeatedOp) == URX_SETREF) {
784 // Emit optimized code for [char set]+
785 appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
786 frameLoc = allocateStackData(1);
787 appendOp(URX_LOOP_C, frameLoc);
788 break;
789 }
790
791 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
792 URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
793 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
794 // Emit Optimized code for .+ operations.
795 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
796 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
797 // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
798 loopOpI |= 1;
799 }
800 if (fModeFlags & UREGEX_UNIX_LINES) {
801 loopOpI |= 2;
802 }
803 appendOp(loopOpI);
804 frameLoc = allocateStackData(1);
805 appendOp(URX_LOOP_C, frameLoc);
806 break;
807 }
808
809 }
810
811 // General case.
812
813 // Check for minimum match length of zero, which requires
814 // extra loop-breaking code.
815 if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
816 // Zero length match is possible.
817 // Emit the code sequence that can handle it.
818 insertOp(topLoc);
819 frameLoc = allocateStackData(1);
820
821 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
822 fRXPat->fCompiledPat->setElementAt(op, topLoc);
823
824 appendOp(URX_JMP_SAV_X, topLoc+1);
825 } else {
826 // Simpler code when the repeated body must match something non-empty
827 appendOp(URX_JMP_SAV, topLoc);
828 }
829 }
830 break;
831
832 case doNGPlus:
833 // Non-greedy '+?' compiles to
834 // 1. stuff to be repeated (already built)
835 // 2. state-save 1
836 // 3. ...
837 {
838 int32_t topLoc = blockTopLoc(FALSE);
839 appendOp(URX_STATE_SAVE, topLoc);
840 }
841 break;
842
843
844 case doOpt:
845 // Normal (greedy) ? quantifier.
846 // Compiles to
847 // 1. state save 3
848 // 2. body of optional block
849 // 3. ...
850 // Insert the state save into the compiled pattern, and we're done.
851 {
852 int32_t saveStateLoc = blockTopLoc(TRUE);
853 int32_t saveStateOp = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
854 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
855 }
856 break;
857
858 case doNGOpt:
859 // Non-greedy ?? quantifier
860 // compiles to
861 // 1. jmp 4
862 // 2. body of optional block
863 // 3 jmp 5
864 // 4. state save 2
865 // 5 ...
866 // This code is less than ideal, with two jmps instead of one, because we can only
867 // insert one instruction at the top of the block being iterated.
868 {
869 int32_t jmp1_loc = blockTopLoc(TRUE);
870 int32_t jmp2_loc = fRXPat->fCompiledPat->size();
871
872 int32_t jmp1_op = buildOp(URX_JMP, jmp2_loc+1);
873 fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
874
875 appendOp(URX_JMP, jmp2_loc+2);
876
877 appendOp(URX_STATE_SAVE, jmp1_loc+1);
878 }
879 break;
880
881
882 case doStar:
883 // Normal (greedy) * quantifier.
884 // Compiles to
885 // 1. STATE_SAVE 4
886 // 2. body of stuff being iterated over
887 // 3. JMP_SAV 2
888 // 4. ...
889 //
890 // Or, if the body is a simple [Set],
891 // 1. LOOP_SR_I set number
892 // 2. LOOP_C stack location
893 // ...
894 //
895 // Or if this is a .*
896 // 1. LOOP_DOT_I (. matches all mode flag)
897 // 2. LOOP_C stack location
898 //
899 // Or, if the body can match a zero-length string, to inhibit infinite loops,
900 // 1. STATE_SAVE 5
901 // 2. STO_INP_LOC data-loc
902 // 3. body of stuff
903 // 4. JMP_SAV_X 2
904 // 5. ...
905 {
906 // location of item #1, the STATE_SAVE
907 int32_t topLoc = blockTopLoc(FALSE);
908 int32_t dataLoc = -1;
909
910 // Check for simple *, where the construct being repeated
911 // compiled to single opcode, and might be optimizable.
912 if (topLoc == fRXPat->fCompiledPat->size() - 1) {
913 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
914
915 if (URX_TYPE(repeatedOp) == URX_SETREF) {
916 // Emit optimized code for a [char set]*
917 int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
918 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
919 dataLoc = allocateStackData(1);
920 appendOp(URX_LOOP_C, dataLoc);
921 break;
922 }
923
924 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
925 URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
926 URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
927 // Emit Optimized code for .* operations.
928 int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
929 if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
930 // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
931 loopOpI |= 1;
932 }
933 if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
934 loopOpI |= 2;
935 }
936 fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
937 dataLoc = allocateStackData(1);
938 appendOp(URX_LOOP_C, dataLoc);
939 break;
940 }
941 }
942
943 // Emit general case code for this *
944 // The optimizations did not apply.
945
946 int32_t saveStateLoc = blockTopLoc(TRUE);
947 int32_t jmpOp = buildOp(URX_JMP_SAV, saveStateLoc+1);
948
949 // Check for minimum match length of zero, which requires
950 // extra loop-breaking code.
951 if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
952 insertOp(saveStateLoc);
953 dataLoc = allocateStackData(1);
954
955 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
956 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
957 jmpOp = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
958 }
959
960 // Locate the position in the compiled pattern where the match will continue
961 // after completing the *. (4 or 5 in the comment above)
962 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
963
964 // Put together the save state op and store it into the compiled code.
965 int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
966 fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
967
968 // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
969 appendOp(jmpOp);
970 }
971 break;
972
973 case doNGStar:
974 // Non-greedy *? quantifier
975 // compiles to
976 // 1. JMP 3
977 // 2. body of stuff being iterated over
978 // 3. STATE_SAVE 2
979 // 4 ...
980 {
981 int32_t jmpLoc = blockTopLoc(TRUE); // loc 1.
982 int32_t saveLoc = fRXPat->fCompiledPat->size(); // loc 3.
983 int32_t jmpOp = buildOp(URX_JMP, saveLoc);
984 fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
985 appendOp(URX_STATE_SAVE, jmpLoc+1);
986 }
987 break;
988
989
990 case doIntervalInit:
991 // The '{' opening an interval quantifier was just scanned.
992 // Init the counter varaiables that will accumulate the values as the digits
993 // are scanned.
994 fIntervalLow = 0;
995 fIntervalUpper = -1;
996 break;
997
998 case doIntevalLowerDigit:
999 // Scanned a digit from the lower value of an {lower,upper} interval
1000 {
1001 int32_t digitValue = u_charDigitValue(fC.fChar);
1002 U_ASSERT(digitValue >= 0);
1003 int64_t val = (int64_t)fIntervalLow*10 + digitValue;
1004 if (val > INT32_MAX) {
1005 error(U_REGEX_NUMBER_TOO_BIG);
1006 } else {
1007 fIntervalLow = (int32_t)val;
1008 }
1009 }
1010 break;
1011
1012 case doIntervalUpperDigit:
1013 // Scanned a digit from the upper value of an {lower,upper} interval
1014 {
1015 if (fIntervalUpper < 0) {
1016 fIntervalUpper = 0;
1017 }
1018 int32_t digitValue = u_charDigitValue(fC.fChar);
1019 U_ASSERT(digitValue >= 0);
1020 int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
1021 if (val > INT32_MAX) {
1022 error(U_REGEX_NUMBER_TOO_BIG);
1023 } else {
1024 fIntervalUpper = (int32_t)val;
1025 }
1026 }
1027 break;
1028
1029 case doIntervalSame:
1030 // Scanned a single value interval like {27}. Upper = Lower.
1031 fIntervalUpper = fIntervalLow;
1032 break;
1033
1034 case doInterval:
1035 // Finished scanning a normal {lower,upper} interval. Generate the code for it.
1036 if (compileInlineInterval() == FALSE) {
1037 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1038 }
1039 break;
1040
1041 case doPossessiveInterval:
1042 // Finished scanning a Possessive {lower,upper}+ interval. Generate the code for it.
1043 {
1044 // Remember the loc for the top of the block being looped over.
1045 // (Can not reserve a slot in the compiled pattern at this time, because
1046 // compileInterval needs to reserve also, and blockTopLoc can only reserve
1047 // once per block.)
1048 int32_t topLoc = blockTopLoc(FALSE);
1049
1050 // Produce normal looping code.
1051 compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1052
1053 // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1054 // just as if the loop was inclosed in atomic parentheses.
1055
1056 // First the STO_SP before the start of the loop
1057 insertOp(topLoc);
1058
1059 int32_t varLoc = allocateData(1); // Reserve a data location for saving the
1060 int32_t op = buildOp(URX_STO_SP, varLoc);
1061 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1062
1063 int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1064 U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1065 loopOp++; // point LoopOp after the just-inserted STO_SP
1066 fRXPat->fCompiledPat->push(loopOp, *fStatus);
1067
1068 // Then the LD_SP after the end of the loop
1069 appendOp(URX_LD_SP, varLoc);
1070 }
1071
1072 break;
1073
1074 case doNGInterval:
1075 // Finished scanning a non-greedy {lower,upper}? interval. Generate the code for it.
1076 compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1077 break;
1078
1079 case doIntervalError:
1080 error(U_REGEX_BAD_INTERVAL);
1081 break;
1082
1083 case doLiteralChar:
1084 // We've just scanned a "normal" character from the pattern,
1085 literalChar(fC.fChar);
1086 break;
1087
1088
1089 case doEscapedLiteralChar:
1090 // We've just scanned an backslashed escaped character with no
1091 // special meaning. It represents itself.
1092 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1093 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z]
1094 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
1095 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1096 }
1097 literalChar(fC.fChar);
1098 break;
1099
1100
1101 case doDotAny:
1102 // scanned a ".", match any single character.
1103 {
1104 fixLiterals(FALSE);
1105 if (fModeFlags & UREGEX_DOTALL) {
1106 appendOp(URX_DOTANY_ALL, 0);
1107 } else if (fModeFlags & UREGEX_UNIX_LINES) {
1108 appendOp(URX_DOTANY_UNIX, 0);
1109 } else {
1110 appendOp(URX_DOTANY, 0);
1111 }
1112 }
1113 break;
1114
1115 case doCaret:
1116 {
1117 fixLiterals(FALSE);
1118 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1119 appendOp(URX_CARET, 0);
1120 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1121 appendOp(URX_CARET_M, 0);
1122 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1123 appendOp(URX_CARET, 0); // Only testing true start of input.
1124 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1125 appendOp(URX_CARET_M_UNIX, 0);
1126 }
1127 }
1128 break;
1129
1130 case doDollar:
1131 {
1132 fixLiterals(FALSE);
1133 if ( (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1134 appendOp(URX_DOLLAR, 0);
1135 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1136 appendOp(URX_DOLLAR_M, 0);
1137 } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1138 appendOp(URX_DOLLAR_D, 0);
1139 } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1140 appendOp(URX_DOLLAR_MD, 0);
1141 }
1142 }
1143 break;
1144
1145 case doBackslashA:
1146 fixLiterals(FALSE);
1147 appendOp(URX_CARET, 0);
1148 break;
1149
1150 case doBackslashB:
1151 {
1152 #if UCONFIG_NO_BREAK_ITERATION==1
1153 if (fModeFlags & UREGEX_UWORD) {
1154 error(U_UNSUPPORTED_ERROR);
1155 }
1156 #endif
1157 fixLiterals(FALSE);
1158 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1159 appendOp(op, 1);
1160 }
1161 break;
1162
1163 case doBackslashb:
1164 {
1165 #if UCONFIG_NO_BREAK_ITERATION==1
1166 if (fModeFlags & UREGEX_UWORD) {
1167 error(U_UNSUPPORTED_ERROR);
1168 }
1169 #endif
1170 fixLiterals(FALSE);
1171 int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1172 appendOp(op, 0);
1173 }
1174 break;
1175
1176 case doBackslashD:
1177 fixLiterals(FALSE);
1178 appendOp(URX_BACKSLASH_D, 1);
1179 break;
1180
1181 case doBackslashd:
1182 fixLiterals(FALSE);
1183 appendOp(URX_BACKSLASH_D, 0);
1184 break;
1185
1186 case doBackslashG:
1187 fixLiterals(FALSE);
1188 appendOp(URX_BACKSLASH_G, 0);
1189 break;
1190
1191 case doBackslashH:
1192 fixLiterals(FALSE);
1193 appendOp(URX_BACKSLASH_H, 1);
1194 break;
1195
1196 case doBackslashh:
1197 fixLiterals(FALSE);
1198 appendOp(URX_BACKSLASH_H, 0);
1199 break;
1200
1201 case doBackslashR:
1202 fixLiterals(FALSE);
1203 appendOp(URX_BACKSLASH_R, 0);
1204 break;
1205
1206 case doBackslashS:
1207 fixLiterals(FALSE);
1208 appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1209 break;
1210
1211 case doBackslashs:
1212 fixLiterals(FALSE);
1213 appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1214 break;
1215
1216 case doBackslashV:
1217 fixLiterals(FALSE);
1218 appendOp(URX_BACKSLASH_V, 1);
1219 break;
1220
1221 case doBackslashv:
1222 fixLiterals(FALSE);
1223 appendOp(URX_BACKSLASH_V, 0);
1224 break;
1225
1226 case doBackslashW:
1227 fixLiterals(FALSE);
1228 appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1229 break;
1230
1231 case doBackslashw:
1232 fixLiterals(FALSE);
1233 appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1234 break;
1235
1236 case doBackslashX:
1237 fixLiterals(FALSE);
1238 appendOp(URX_BACKSLASH_X, 0);
1239 break;
1240
1241
1242 case doBackslashZ:
1243 fixLiterals(FALSE);
1244 appendOp(URX_DOLLAR, 0);
1245 break;
1246
1247 case doBackslashz:
1248 fixLiterals(FALSE);
1249 appendOp(URX_BACKSLASH_Z, 0);
1250 break;
1251
1252 case doEscapeError:
1253 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1254 break;
1255
1256 case doExit:
1257 fixLiterals(FALSE);
1258 returnVal = FALSE;
1259 break;
1260
1261 case doProperty:
1262 {
1263 fixLiterals(FALSE);
1264 UnicodeSet *theSet = scanProp();
1265 compileSet(theSet);
1266 }
1267 break;
1268
1269 case doNamedChar:
1270 {
1271 UChar32 c = scanNamedChar();
1272 literalChar(c);
1273 }
1274 break;
1275
1276
1277 case doBackRef:
1278 // BackReference. Somewhat unusual in that the front-end can not completely parse
1279 // the regular expression, because the number of digits to be consumed
1280 // depends on the number of capture groups that have been defined. So
1281 // we have to do it here instead.
1282 {
1283 int32_t numCaptureGroups = fRXPat->fGroupMap->size();
1284 int32_t groupNum = 0;
1285 UChar32 c = fC.fChar;
1286
1287 for (;;) {
1288 // Loop once per digit, for max allowed number of digits in a back reference.
1289 int32_t digit = u_charDigitValue(c);
1290 groupNum = groupNum * 10 + digit;
1291 if (groupNum >= numCaptureGroups) {
1292 break;
1293 }
1294 c = peekCharLL();
1295 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
1296 break;
1297 }
1298 nextCharLL();
1299 }
1300
1301 // Scan of the back reference in the source regexp is complete. Now generate
1302 // the compiled code for it.
1303 // Because capture groups can be forward-referenced by back-references,
1304 // we fill the operand with the capture group number. At the end
1305 // of compilation, it will be changed to the variable's location.
1306 U_ASSERT(groupNum > 0); // Shouldn't happen. '\0' begins an octal escape sequence,
1307 // and shouldn't enter this code path at all.
1308 fixLiterals(FALSE);
1309 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1310 appendOp(URX_BACKREF_I, groupNum);
1311 } else {
1312 appendOp(URX_BACKREF, groupNum);
1313 }
1314 }
1315 break;
1316
1317 case doBeginNamedBackRef:
1318 U_ASSERT(fCaptureName == NULL);
1319 fCaptureName = new UnicodeString;
1320 if (fCaptureName == NULL) {
1321 error(U_MEMORY_ALLOCATION_ERROR);
1322 }
1323 break;
1324
1325 case doContinueNamedBackRef:
1326 fCaptureName->append(fC.fChar);
1327 break;
1328
1329 case doCompleteNamedBackRef:
1330 {
1331 int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
1332 if (groupNumber == 0) {
1333 // Group name has not been defined.
1334 // Could be a forward reference. If we choose to support them at some
1335 // future time, extra mechanism will be required at this point.
1336 error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1337 } else {
1338 // Given the number, handle identically to a \n numbered back reference.
1339 // See comments above, under doBackRef
1340 fixLiterals(FALSE);
1341 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1342 appendOp(URX_BACKREF_I, groupNumber);
1343 } else {
1344 appendOp(URX_BACKREF, groupNumber);
1345 }
1346 }
1347 delete fCaptureName;
1348 fCaptureName = NULL;
1349 break;
1350 }
1351
1352 case doPossessivePlus:
1353 // Possessive ++ quantifier.
1354 // Compiles to
1355 // 1. STO_SP
1356 // 2. body of stuff being iterated over
1357 // 3. STATE_SAVE 5
1358 // 4. JMP 2
1359 // 5. LD_SP
1360 // 6. ...
1361 //
1362 // Note: TODO: This is pretty inefficient. A mass of saved state is built up
1363 // then unconditionally discarded. Perhaps introduce a new opcode. Ticket 6056
1364 //
1365 {
1366 // Emit the STO_SP
1367 int32_t topLoc = blockTopLoc(TRUE);
1368 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1369 int32_t op = buildOp(URX_STO_SP, stoLoc);
1370 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1371
1372 // Emit the STATE_SAVE
1373 appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1374
1375 // Emit the JMP
1376 appendOp(URX_JMP, topLoc+1);
1377
1378 // Emit the LD_SP
1379 appendOp(URX_LD_SP, stoLoc);
1380 }
1381 break;
1382
1383 case doPossessiveStar:
1384 // Possessive *+ quantifier.
1385 // Compiles to
1386 // 1. STO_SP loc
1387 // 2. STATE_SAVE 5
1388 // 3. body of stuff being iterated over
1389 // 4. JMP 2
1390 // 5. LD_SP loc
1391 // 6 ...
1392 // TODO: do something to cut back the state stack each time through the loop.
1393 {
1394 // Reserve two slots at the top of the block.
1395 int32_t topLoc = blockTopLoc(TRUE);
1396 insertOp(topLoc);
1397
1398 // emit STO_SP loc
1399 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1400 int32_t op = buildOp(URX_STO_SP, stoLoc);
1401 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1402
1403 // Emit the SAVE_STATE 5
1404 int32_t L7 = fRXPat->fCompiledPat->size()+1;
1405 op = buildOp(URX_STATE_SAVE, L7);
1406 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1407
1408 // Append the JMP operation.
1409 appendOp(URX_JMP, topLoc+1);
1410
1411 // Emit the LD_SP loc
1412 appendOp(URX_LD_SP, stoLoc);
1413 }
1414 break;
1415
1416 case doPossessiveOpt:
1417 // Possessive ?+ quantifier.
1418 // Compiles to
1419 // 1. STO_SP loc
1420 // 2. SAVE_STATE 5
1421 // 3. body of optional block
1422 // 4. LD_SP loc
1423 // 5. ...
1424 //
1425 {
1426 // Reserve two slots at the top of the block.
1427 int32_t topLoc = blockTopLoc(TRUE);
1428 insertOp(topLoc);
1429
1430 // Emit the STO_SP
1431 int32_t stoLoc = allocateData(1); // Reserve the data location for storing save stack ptr.
1432 int32_t op = buildOp(URX_STO_SP, stoLoc);
1433 fRXPat->fCompiledPat->setElementAt(op, topLoc);
1434
1435 // Emit the SAVE_STATE
1436 int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
1437 op = buildOp(URX_STATE_SAVE, continueLoc);
1438 fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1439
1440 // Emit the LD_SP
1441 appendOp(URX_LD_SP, stoLoc);
1442 }
1443 break;
1444
1445
1446 case doBeginMatchMode:
1447 fNewModeFlags = fModeFlags;
1448 fSetModeFlag = TRUE;
1449 break;
1450
1451 case doMatchMode: // (?i) and similar
1452 {
1453 int32_t bit = 0;
1454 switch (fC.fChar) {
1455 case 0x69: /* 'i' */ bit = UREGEX_CASE_INSENSITIVE; break;
1456 case 0x64: /* 'd' */ bit = UREGEX_UNIX_LINES; break;
1457 case 0x6d: /* 'm' */ bit = UREGEX_MULTILINE; break;
1458 case 0x73: /* 's' */ bit = UREGEX_DOTALL; break;
1459 case 0x75: /* 'u' */ bit = 0; /* Unicode casing */ break;
1460 case 0x77: /* 'w' */ bit = UREGEX_UWORD; break;
1461 case 0x78: /* 'x' */ bit = UREGEX_COMMENTS; break;
1462 case 0x2d: /* '-' */ fSetModeFlag = FALSE; break;
1463 default:
1464 U_ASSERT(FALSE); // Should never happen. Other chars are filtered out
1465 // by the scanner.
1466 }
1467 if (fSetModeFlag) {
1468 fNewModeFlags |= bit;
1469 } else {
1470 fNewModeFlags &= ~bit;
1471 }
1472 }
1473 break;
1474
1475 case doSetMatchMode:
1476 // Emit code to match any pending literals, using the not-yet changed match mode.
1477 fixLiterals();
1478
1479 // We've got a (?i) or similar. The match mode is being changed, but
1480 // the change is not scoped to a parenthesized block.
1481 U_ASSERT(fNewModeFlags < 0);
1482 fModeFlags = fNewModeFlags;
1483
1484 break;
1485
1486
1487 case doMatchModeParen:
1488 // We've got a (?i: or similar. Begin a parenthesized block, save old
1489 // mode flags so they can be restored at the close of the block.
1490 //
1491 // Compile to a
1492 // - NOP, which later may be replaced by a save-state if the
1493 // parenthesized group gets a * quantifier, followed by
1494 // - NOP, which may later be replaced by a save-state if there
1495 // is an '|' alternation within the parens.
1496 {
1497 fixLiterals(FALSE);
1498 appendOp(URX_NOP, 0);
1499 appendOp(URX_NOP, 0);
1500
1501 // On the Parentheses stack, start a new frame and add the postions
1502 // of the two NOPs (a normal non-capturing () frame, except for the
1503 // saving of the orignal mode flags.)
1504 fParenStack.push(fModeFlags, *fStatus);
1505 fParenStack.push(flags, *fStatus); // Frame Marker
1506 fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus); // The first NOP
1507 fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus); // The second NOP
1508
1509 // Set the current mode flags to the new values.
1510 U_ASSERT(fNewModeFlags < 0);
1511 fModeFlags = fNewModeFlags;
1512 }
1513 break;
1514
1515 case doBadModeFlag:
1516 error(U_REGEX_INVALID_FLAG);
1517 break;
1518
1519 case doSuppressComments:
1520 // We have just scanned a '(?'. We now need to prevent the character scanner from
1521 // treating a '#' as a to-the-end-of-line comment.
1522 // (This Perl compatibility just gets uglier and uglier to do...)
1523 fEOLComments = FALSE;
1524 break;
1525
1526
1527 case doSetAddAmp:
1528 {
1529 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1530 set->add(chAmp);
1531 }
1532 break;
1533
1534 case doSetAddDash:
1535 {
1536 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1537 set->add(chDash);
1538 }
1539 break;
1540
1541 case doSetBackslash_s:
1542 {
1543 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1544 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1545 break;
1546 }
1547
1548 case doSetBackslash_S:
1549 {
1550 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1551 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1552 SSet.complement();
1553 set->addAll(SSet);
1554 break;
1555 }
1556
1557 case doSetBackslash_d:
1558 {
1559 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1560 // TODO - make a static set, ticket 6058.
1561 addCategory(set, U_GC_ND_MASK, *fStatus);
1562 break;
1563 }
1564
1565 case doSetBackslash_D:
1566 {
1567 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1568 UnicodeSet digits;
1569 // TODO - make a static set, ticket 6058.
1570 digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1571 digits.complement();
1572 set->addAll(digits);
1573 break;
1574 }
1575
1576 case doSetBackslash_h:
1577 {
1578 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1579 UnicodeSet h;
1580 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1581 h.add((UChar32)9); // Tab
1582 set->addAll(h);
1583 break;
1584 }
1585
1586 case doSetBackslash_H:
1587 {
1588 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1589 UnicodeSet h;
1590 h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1591 h.add((UChar32)9); // Tab
1592 h.complement();
1593 set->addAll(h);
1594 break;
1595 }
1596
1597 case doSetBackslash_v:
1598 {
1599 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1600 set->add((UChar32)0x0a, (UChar32)0x0d); // add range
1601 set->add((UChar32)0x85);
1602 set->add((UChar32)0x2028, (UChar32)0x2029);
1603 break;
1604 }
1605
1606 case doSetBackslash_V:
1607 {
1608 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1609 UnicodeSet v;
1610 v.add((UChar32)0x0a, (UChar32)0x0d); // add range
1611 v.add((UChar32)0x85);
1612 v.add((UChar32)0x2028, (UChar32)0x2029);
1613 v.complement();
1614 set->addAll(v);
1615 break;
1616 }
1617
1618 case doSetBackslash_w:
1619 {
1620 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1621 set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1622 break;
1623 }
1624
1625 case doSetBackslash_W:
1626 {
1627 UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1628 UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1629 SSet.complement();
1630 set->addAll(SSet);
1631 break;
1632 }
1633
1634 case doSetBegin:
1635 fixLiterals(FALSE);
1636 fSetStack.push(new UnicodeSet(), *fStatus);
1637 fSetOpStack.push(setStart, *fStatus);
1638 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1639 fSetOpStack.push(setCaseClose, *fStatus);
1640 }
1641 break;
1642
1643 case doSetBeginDifference1:
1644 // We have scanned something like [[abc]-[
1645 // Set up a new UnicodeSet for the set beginning with the just-scanned '['
1646 // Push a Difference operator, which will cause the new set to be subtracted from what
1647 // went before once it is created.
1648 setPushOp(setDifference1);
1649 fSetOpStack.push(setStart, *fStatus);
1650 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1651 fSetOpStack.push(setCaseClose, *fStatus);
1652 }
1653 break;
1654
1655 case doSetBeginIntersection1:
1656 // We have scanned something like [[abc]&[
1657 // Need both the '&' operator and the open '[' operator.
1658 setPushOp(setIntersection1);
1659 fSetOpStack.push(setStart, *fStatus);
1660 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1661 fSetOpStack.push(setCaseClose, *fStatus);
1662 }
1663 break;
1664
1665 case doSetBeginUnion:
1666 // We have scanned something like [[abc][
1667 // Need to handle the union operation explicitly [[abc] | [
1668 setPushOp(setUnion);
1669 fSetOpStack.push(setStart, *fStatus);
1670 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1671 fSetOpStack.push(setCaseClose, *fStatus);
1672 }
1673 break;
1674
1675 case doSetDifference2:
1676 // We have scanned something like [abc--
1677 // Consider this to unambiguously be a set difference operator.
1678 setPushOp(setDifference2);
1679 break;
1680
1681 case doSetEnd:
1682 // Have encountered the ']' that closes a set.
1683 // Force the evaluation of any pending operations within this set,
1684 // leave the completed set on the top of the set stack.
1685 setEval(setEnd);
1686 U_ASSERT(fSetOpStack.peeki()==setStart);
1687 fSetOpStack.popi();
1688 break;
1689
1690 case doSetFinish:
1691 {
1692 // Finished a complete set expression, including all nested sets.
1693 // The close bracket has already triggered clearing out pending set operators,
1694 // the operator stack should be empty and the operand stack should have just
1695 // one entry, the result set.
1696 U_ASSERT(fSetOpStack.empty());
1697 UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1698 U_ASSERT(fSetStack.empty());
1699 compileSet(theSet);
1700 break;
1701 }
1702
1703 case doSetIntersection2:
1704 // Have scanned something like [abc&&
1705 setPushOp(setIntersection2);
1706 break;
1707
1708 case doSetLiteral:
1709 // Union the just-scanned literal character into the set being built.
1710 // This operation is the highest precedence set operation, so we can always do
1711 // it immediately, without waiting to see what follows. It is necessary to perform
1712 // any pending '-' or '&' operation first, because these have the same precedence
1713 // as union-ing in a literal'
1714 {
1715 setEval(setUnion);
1716 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1717 s->add(fC.fChar);
1718 fLastSetLiteral = fC.fChar;
1719 break;
1720 }
1721
1722 case doSetLiteralEscaped:
1723 // A back-slash escaped literal character was encountered.
1724 // Processing is the same as with setLiteral, above, with the addition of
1725 // the optional check for errors on escaped ASCII letters.
1726 {
1727 if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1728 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) || // in [A-Z]
1729 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) { // in [a-z]
1730 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1731 }
1732 setEval(setUnion);
1733 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1734 s->add(fC.fChar);
1735 fLastSetLiteral = fC.fChar;
1736 break;
1737 }
1738
1739 case doSetNamedChar:
1740 // Scanning a \N{UNICODE CHARACTER NAME}
1741 // Aside from the source of the character, the processing is identical to doSetLiteral,
1742 // above.
1743 {
1744 UChar32 c = scanNamedChar();
1745 setEval(setUnion);
1746 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1747 s->add(c);
1748 fLastSetLiteral = c;
1749 break;
1750 }
1751
1752 case doSetNamedRange:
1753 // We have scanned literal-\N{CHAR NAME}. Add the range to the set.
1754 // The left character is already in the set, and is saved in fLastSetLiteral.
1755 // The right side needs to be picked up, the scan is at the 'N'.
1756 // Lower Limit > Upper limit being an error matches both Java
1757 // and ICU UnicodeSet behavior.
1758 {
1759 UChar32 c = scanNamedChar();
1760 if (U_SUCCESS(*fStatus) && fLastSetLiteral > c) {
1761 error(U_REGEX_INVALID_RANGE);
1762 }
1763 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1764 s->add(fLastSetLiteral, c);
1765 fLastSetLiteral = c;
1766 break;
1767 }
1768
1769
1770 case doSetNegate:
1771 // Scanned a '^' at the start of a set.
1772 // Push the negation operator onto the set op stack.
1773 // A twist for case-insensitive matching:
1774 // the case closure operation must happen _before_ negation.
1775 // But the case closure operation will already be on the stack if it's required.
1776 // This requires checking for case closure, and swapping the stack order
1777 // if it is present.
1778 {
1779 int32_t tosOp = fSetOpStack.peeki();
1780 if (tosOp == setCaseClose) {
1781 fSetOpStack.popi();
1782 fSetOpStack.push(setNegation, *fStatus);
1783 fSetOpStack.push(setCaseClose, *fStatus);
1784 } else {
1785 fSetOpStack.push(setNegation, *fStatus);
1786 }
1787 }
1788 break;
1789
1790 case doSetNoCloseError:
1791 error(U_REGEX_MISSING_CLOSE_BRACKET);
1792 break;
1793
1794 case doSetOpError:
1795 error(U_REGEX_RULE_SYNTAX); // -- or && at the end of a set. Illegal.
1796 break;
1797
1798 case doSetPosixProp:
1799 {
1800 UnicodeSet *s = scanPosixProp();
1801 if (s != NULL) {
1802 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1803 tos->addAll(*s);
1804 delete s;
1805 } // else error. scanProp() reported the error status already.
1806 }
1807 break;
1808
1809 case doSetProp:
1810 // Scanned a \p \P within [brackets].
1811 {
1812 UnicodeSet *s = scanProp();
1813 if (s != NULL) {
1814 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1815 tos->addAll(*s);
1816 delete s;
1817 } // else error. scanProp() reported the error status already.
1818 }
1819 break;
1820
1821
1822 case doSetRange:
1823 // We have scanned literal-literal. Add the range to the set.
1824 // The left character is already in the set, and is saved in fLastSetLiteral.
1825 // The right side is the current character.
1826 // Lower Limit > Upper limit being an error matches both Java
1827 // and ICU UnicodeSet behavior.
1828 {
1829 if (fLastSetLiteral > fC.fChar) {
1830 error(U_REGEX_INVALID_RANGE);
1831 }
1832 UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1833 s->add(fLastSetLiteral, fC.fChar);
1834 break;
1835 }
1836
1837 default:
1838 U_ASSERT(FALSE);
1839 error(U_REGEX_INTERNAL_ERROR);
1840 break;
1841 }
1842
1843 if (U_FAILURE(*fStatus)) {
1844 returnVal = FALSE;
1845 }
1846
1847 return returnVal;
1848 }
1849
1850
1851
1852 //------------------------------------------------------------------------------
1853 //
1854 // literalChar We've encountered a literal character from the pattern,
1855 // or an escape sequence that reduces to a character.
1856 // Add it to the string containing all literal chars/strings from
1857 // the pattern.
1858 //
1859 //------------------------------------------------------------------------------
literalChar(UChar32 c)1860 void RegexCompile::literalChar(UChar32 c) {
1861 fLiteralChars.append(c);
1862 }
1863
1864
1865 //------------------------------------------------------------------------------
1866 //
1867 // fixLiterals When compiling something that can follow a literal
1868 // string in a pattern, emit the code to match the
1869 // accumulated literal string.
1870 //
1871 // Optionally, split the last char of the string off into
1872 // a single "ONE_CHAR" operation, so that quantifiers can
1873 // apply to that char alone. Example: abc*
1874 // The * must apply to the 'c' only.
1875 //
1876 //------------------------------------------------------------------------------
fixLiterals(UBool split)1877 void RegexCompile::fixLiterals(UBool split) {
1878
1879 // If no literal characters have been scanned but not yet had code generated
1880 // for them, nothing needs to be done.
1881 if (fLiteralChars.length() == 0) {
1882 return;
1883 }
1884
1885 int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1886 UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1887
1888 // Split: We need to ensure that the last item in the compiled pattern
1889 // refers only to the last literal scanned in the pattern, so that
1890 // quantifiers (*, +, etc.) affect only it, and not a longer string.
1891 // Split before case folding for case insensitive matches.
1892
1893 if (split) {
1894 fLiteralChars.truncate(indexOfLastCodePoint);
1895 fixLiterals(FALSE); // Recursive call, emit code to match the first part of the string.
1896 // Note that the truncated literal string may be empty, in which case
1897 // nothing will be emitted.
1898
1899 literalChar(lastCodePoint); // Re-add the last code point as if it were a new literal.
1900 fixLiterals(FALSE); // Second recursive call, code for the final code point.
1901 return;
1902 }
1903
1904 // If we are doing case-insensitive matching, case fold the string. This may expand
1905 // the string, e.g. the German sharp-s turns into "ss"
1906 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1907 fLiteralChars.foldCase();
1908 indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1909 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1910 }
1911
1912 if (indexOfLastCodePoint == 0) {
1913 // Single character, emit a URX_ONECHAR op to match it.
1914 if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1915 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1916 appendOp(URX_ONECHAR_I, lastCodePoint);
1917 } else {
1918 appendOp(URX_ONECHAR, lastCodePoint);
1919 }
1920 } else {
1921 // Two or more chars, emit a URX_STRING to match them.
1922 if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
1923 error(U_REGEX_PATTERN_TOO_BIG);
1924 }
1925 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1926 appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1927 } else {
1928 // TODO here: add optimization to split case sensitive strings of length two
1929 // into two single char ops, for efficiency.
1930 appendOp(URX_STRING, fRXPat->fLiteralText.length());
1931 }
1932 appendOp(URX_STRING_LEN, fLiteralChars.length());
1933
1934 // Add this string into the accumulated strings of the compiled pattern.
1935 fRXPat->fLiteralText.append(fLiteralChars);
1936 }
1937
1938 fLiteralChars.remove();
1939 }
1940
1941
buildOp(int32_t type,int32_t val)1942 int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1943 if (U_FAILURE(*fStatus)) {
1944 return 0;
1945 }
1946 if (type < 0 || type > 255) {
1947 U_ASSERT(FALSE);
1948 error(U_REGEX_INTERNAL_ERROR);
1949 type = URX_RESERVED_OP;
1950 }
1951 if (val > 0x00ffffff) {
1952 U_ASSERT(FALSE);
1953 error(U_REGEX_INTERNAL_ERROR);
1954 val = 0;
1955 }
1956 if (val < 0) {
1957 if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
1958 U_ASSERT(FALSE);
1959 error(U_REGEX_INTERNAL_ERROR);
1960 return -1;
1961 }
1962 if (URX_TYPE(val) != 0xff) {
1963 U_ASSERT(FALSE);
1964 error(U_REGEX_INTERNAL_ERROR);
1965 return -1;
1966 }
1967 type = URX_RESERVED_OP_N;
1968 }
1969 return (type << 24) | val;
1970 }
1971
1972
1973 //------------------------------------------------------------------------------
1974 //
1975 // appendOp() Append a new instruction onto the compiled pattern
1976 // Includes error checking, limiting the size of the
1977 // pattern to lengths that can be represented in the
1978 // 24 bit operand field of an instruction.
1979 //
1980 //------------------------------------------------------------------------------
appendOp(int32_t op)1981 void RegexCompile::appendOp(int32_t op) {
1982 if (U_FAILURE(*fStatus)) {
1983 return;
1984 }
1985 fRXPat->fCompiledPat->addElement(op, *fStatus);
1986 if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
1987 error(U_REGEX_PATTERN_TOO_BIG);
1988 }
1989 }
1990
appendOp(int32_t type,int32_t val)1991 void RegexCompile::appendOp(int32_t type, int32_t val) {
1992 appendOp(buildOp(type, val));
1993 }
1994
1995
1996 //------------------------------------------------------------------------------
1997 //
1998 // insertOp() Insert a slot for a new opcode into the already
1999 // compiled pattern code.
2000 //
2001 // Fill the slot with a NOP. Our caller will replace it
2002 // with what they really wanted.
2003 //
2004 //------------------------------------------------------------------------------
insertOp(int32_t where)2005 void RegexCompile::insertOp(int32_t where) {
2006 UVector64 *code = fRXPat->fCompiledPat;
2007 U_ASSERT(where>0 && where < code->size());
2008
2009 int32_t nop = buildOp(URX_NOP, 0);
2010 code->insertElementAt(nop, where, *fStatus);
2011
2012 // Walk through the pattern, looking for any ops with targets that
2013 // were moved down by the insert. Fix them.
2014 int32_t loc;
2015 for (loc=0; loc<code->size(); loc++) {
2016 int32_t op = (int32_t)code->elementAti(loc);
2017 int32_t opType = URX_TYPE(op);
2018 int32_t opValue = URX_VAL(op);
2019 if ((opType == URX_JMP ||
2020 opType == URX_JMPX ||
2021 opType == URX_STATE_SAVE ||
2022 opType == URX_CTR_LOOP ||
2023 opType == URX_CTR_LOOP_NG ||
2024 opType == URX_JMP_SAV ||
2025 opType == URX_JMP_SAV_X ||
2026 opType == URX_RELOC_OPRND) && opValue > where) {
2027 // Target location for this opcode is after the insertion point and
2028 // needs to be incremented to adjust for the insertion.
2029 opValue++;
2030 op = buildOp(opType, opValue);
2031 code->setElementAt(op, loc);
2032 }
2033 }
2034
2035 // Now fix up the parentheses stack. All positive values in it are locations in
2036 // the compiled pattern. (Negative values are frame boundaries, and don't need fixing.)
2037 for (loc=0; loc<fParenStack.size(); loc++) {
2038 int32_t x = fParenStack.elementAti(loc);
2039 U_ASSERT(x < code->size());
2040 if (x>where) {
2041 x++;
2042 fParenStack.setElementAt(x, loc);
2043 }
2044 }
2045
2046 if (fMatchCloseParen > where) {
2047 fMatchCloseParen++;
2048 }
2049 if (fMatchOpenParen > where) {
2050 fMatchOpenParen++;
2051 }
2052 }
2053
2054
2055 //------------------------------------------------------------------------------
2056 //
2057 // allocateData() Allocate storage in the matcher's static data area.
2058 // Return the index for the newly allocated data.
2059 // The storage won't actually exist until we are running a match
2060 // operation, but the storage indexes are inserted into various
2061 // opcodes while compiling the pattern.
2062 //
2063 //------------------------------------------------------------------------------
allocateData(int32_t size)2064 int32_t RegexCompile::allocateData(int32_t size) {
2065 if (U_FAILURE(*fStatus)) {
2066 return 0;
2067 }
2068 if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
2069 error(U_REGEX_INTERNAL_ERROR);
2070 return 0;
2071 }
2072 int32_t dataIndex = fRXPat->fDataSize;
2073 fRXPat->fDataSize += size;
2074 if (fRXPat->fDataSize >= 0x00fffff0) {
2075 error(U_REGEX_INTERNAL_ERROR);
2076 }
2077 return dataIndex;
2078 }
2079
2080
2081 //------------------------------------------------------------------------------
2082 //
2083 // allocateStackData() Allocate space in the back-tracking stack frame.
2084 // Return the index for the newly allocated data.
2085 // The frame indexes are inserted into various
2086 // opcodes while compiling the pattern, meaning that frame
2087 // size must be restricted to the size that will fit
2088 // as an operand (24 bits).
2089 //
2090 //------------------------------------------------------------------------------
allocateStackData(int32_t size)2091 int32_t RegexCompile::allocateStackData(int32_t size) {
2092 if (U_FAILURE(*fStatus)) {
2093 return 0;
2094 }
2095 if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
2096 error(U_REGEX_INTERNAL_ERROR);
2097 return 0;
2098 }
2099 int32_t dataIndex = fRXPat->fFrameSize;
2100 fRXPat->fFrameSize += size;
2101 if (fRXPat->fFrameSize >= 0x00fffff0) {
2102 error(U_REGEX_PATTERN_TOO_BIG);
2103 }
2104 return dataIndex;
2105 }
2106
2107
2108 //------------------------------------------------------------------------------
2109 //
2110 // blockTopLoc() Find or create a location in the compiled pattern
2111 // at the start of the operation or block that has
2112 // just been compiled. Needed when a quantifier (* or
2113 // whatever) appears, and we need to add an operation
2114 // at the start of the thing being quantified.
2115 //
2116 // (Parenthesized Blocks) have a slot with a NOP that
2117 // is reserved for this purpose. .* or similar don't
2118 // and a slot needs to be added.
2119 //
2120 // parameter reserveLoc : TRUE - ensure that there is space to add an opcode
2121 // at the returned location.
2122 // FALSE - just return the address,
2123 // do not reserve a location there.
2124 //
2125 //------------------------------------------------------------------------------
blockTopLoc(UBool reserveLoc)2126 int32_t RegexCompile::blockTopLoc(UBool reserveLoc) {
2127 int32_t theLoc;
2128 fixLiterals(TRUE); // Emit code for any pending literals.
2129 // If last item was a string, emit separate op for the its last char.
2130 if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2131 {
2132 // The item just processed is a parenthesized block.
2133 theLoc = fMatchOpenParen; // A slot is already reserved for us.
2134 U_ASSERT(theLoc > 0);
2135 U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2136 }
2137 else {
2138 // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2139 // No slot for STATE_SAVE was pre-reserved in the compiled code.
2140 // We need to make space now.
2141 theLoc = fRXPat->fCompiledPat->size()-1;
2142 int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2143 if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2144 // Strings take two opcode, we want the position of the first one.
2145 // We can have a string at this point if a single character case-folded to two.
2146 theLoc--;
2147 }
2148 if (reserveLoc) {
2149 int32_t nop = buildOp(URX_NOP, 0);
2150 fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2151 }
2152 }
2153 return theLoc;
2154 }
2155
2156
2157
2158 //------------------------------------------------------------------------------
2159 //
2160 // handleCloseParen When compiling a close paren, we need to go back
2161 // and fix up any JMP or SAVE operations within the
2162 // parenthesized block that need to target the end
2163 // of the block. The locations of these are kept on
2164 // the paretheses stack.
2165 //
2166 // This function is called both when encountering a
2167 // real ) and at the end of the pattern.
2168 //
2169 //------------------------------------------------------------------------------
handleCloseParen()2170 void RegexCompile::handleCloseParen() {
2171 int32_t patIdx;
2172 int32_t patOp;
2173 if (fParenStack.size() <= 0) {
2174 error(U_REGEX_MISMATCHED_PAREN);
2175 return;
2176 }
2177
2178 // Emit code for any pending literals.
2179 fixLiterals(FALSE);
2180
2181 // Fixup any operations within the just-closed parenthesized group
2182 // that need to reference the end of the (block).
2183 // (The first one popped from the stack is an unused slot for
2184 // alternation (OR) state save, but applying the fixup to it does no harm.)
2185 for (;;) {
2186 patIdx = fParenStack.popi();
2187 if (patIdx < 0) {
2188 // value < 0 flags the start of the frame on the paren stack.
2189 break;
2190 }
2191 U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2192 patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2193 U_ASSERT(URX_VAL(patOp) == 0); // Branch target for JMP should not be set.
2194 patOp |= fRXPat->fCompiledPat->size(); // Set it now.
2195 fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2196 fMatchOpenParen = patIdx;
2197 }
2198
2199 // At the close of any parenthesized block, restore the match mode flags to
2200 // the value they had at the open paren. Saved value is
2201 // at the top of the paren stack.
2202 fModeFlags = fParenStack.popi();
2203 U_ASSERT(fModeFlags < 0);
2204
2205 // DO any additional fixups, depending on the specific kind of
2206 // parentesized grouping this is
2207
2208 switch (patIdx) {
2209 case plain:
2210 case flags:
2211 // No additional fixups required.
2212 // (Grouping-only parentheses)
2213 break;
2214 case capturing:
2215 // Capturing Parentheses.
2216 // Insert a End Capture op into the pattern.
2217 // The frame offset of the variables for this cg is obtained from the
2218 // start capture op and put it into the end-capture op.
2219 {
2220 int32_t captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2221 U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2222
2223 int32_t frameVarLocation = URX_VAL(captureOp);
2224 appendOp(URX_END_CAPTURE, frameVarLocation);
2225 }
2226 break;
2227 case atomic:
2228 // Atomic Parenthesis.
2229 // Insert a LD_SP operation to restore the state stack to the position
2230 // it was when the atomic parens were entered.
2231 {
2232 int32_t stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2233 U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2234 int32_t stoLoc = URX_VAL(stoOp);
2235 appendOp(URX_LD_SP, stoLoc);
2236 }
2237 break;
2238
2239 case lookAhead:
2240 {
2241 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2242 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2243 int32_t dataLoc = URX_VAL(startOp);
2244 appendOp(URX_LA_END, dataLoc);
2245 }
2246 break;
2247
2248 case negLookAhead:
2249 {
2250 // See comment at doOpenLookAheadNeg
2251 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2252 U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2253 int32_t dataLoc = URX_VAL(startOp);
2254 appendOp(URX_LA_END, dataLoc);
2255 appendOp(URX_BACKTRACK, 0);
2256 appendOp(URX_LA_END, dataLoc);
2257
2258 // Patch the URX_SAVE near the top of the block.
2259 // The destination of the SAVE is the final LA_END that was just added.
2260 int32_t saveOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2261 U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2262 int32_t dest = fRXPat->fCompiledPat->size()-1;
2263 saveOp = buildOp(URX_STATE_SAVE, dest);
2264 fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2265 }
2266 break;
2267
2268 case lookBehind:
2269 {
2270 // See comment at doOpenLookBehind.
2271
2272 // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2273 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2274 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2275 int32_t dataLoc = URX_VAL(startOp);
2276 appendOp(URX_LB_END, dataLoc);
2277 appendOp(URX_LA_END, dataLoc);
2278
2279 // Determine the min and max bounds for the length of the
2280 // string that the pattern can match.
2281 // An unbounded upper limit is an error.
2282 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
2283 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2284 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2285 if (URX_TYPE(maxML) != 0) {
2286 error(U_REGEX_LOOK_BEHIND_LIMIT);
2287 break;
2288 }
2289 if (maxML == INT32_MAX) {
2290 error(U_REGEX_LOOK_BEHIND_LIMIT);
2291 break;
2292 }
2293 U_ASSERT(minML <= maxML);
2294
2295 // Insert the min and max match len bounds into the URX_LB_CONT op that
2296 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2297 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-2);
2298 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-1);
2299
2300 }
2301 break;
2302
2303
2304
2305 case lookBehindN:
2306 {
2307 // See comment at doOpenLookBehindNeg.
2308
2309 // Append the URX_LBN_END to the compiled pattern.
2310 int32_t startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2311 U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2312 int32_t dataLoc = URX_VAL(startOp);
2313 appendOp(URX_LBN_END, dataLoc);
2314
2315 // Determine the min and max bounds for the length of the
2316 // string that the pattern can match.
2317 // An unbounded upper limit is an error.
2318 int32_t patEnd = fRXPat->fCompiledPat->size() - 1;
2319 int32_t minML = minMatchLength(fMatchOpenParen, patEnd);
2320 int32_t maxML = maxMatchLength(fMatchOpenParen, patEnd);
2321 if (URX_TYPE(maxML) != 0) {
2322 error(U_REGEX_LOOK_BEHIND_LIMIT);
2323 break;
2324 }
2325 if (maxML == INT32_MAX) {
2326 error(U_REGEX_LOOK_BEHIND_LIMIT);
2327 break;
2328 }
2329 U_ASSERT(minML <= maxML);
2330
2331 // Insert the min and max match len bounds into the URX_LB_CONT op that
2332 // appears at the top of the look-behind block, at location fMatchOpenParen+1
2333 fRXPat->fCompiledPat->setElementAt(minML, fMatchOpenParen-3);
2334 fRXPat->fCompiledPat->setElementAt(maxML, fMatchOpenParen-2);
2335
2336 // Insert the pattern location to continue at after a successful match
2337 // as the last operand of the URX_LBN_CONT
2338 int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2339 fRXPat->fCompiledPat->setElementAt(op, fMatchOpenParen-1);
2340 }
2341 break;
2342
2343
2344
2345 default:
2346 U_ASSERT(FALSE);
2347 }
2348
2349 // remember the next location in the compiled pattern.
2350 // The compilation of Quantifiers will look at this to see whether its looping
2351 // over a parenthesized block or a single item
2352 fMatchCloseParen = fRXPat->fCompiledPat->size();
2353 }
2354
2355
2356
2357 //------------------------------------------------------------------------------
2358 //
2359 // compileSet Compile the pattern operations for a reference to a
2360 // UnicodeSet.
2361 //
2362 //------------------------------------------------------------------------------
compileSet(UnicodeSet * theSet)2363 void RegexCompile::compileSet(UnicodeSet *theSet)
2364 {
2365 if (theSet == NULL) {
2366 return;
2367 }
2368 // Remove any strings from the set.
2369 // There shoudn't be any, but just in case.
2370 // (Case Closure can add them; if we had a simple case closure avaialble that
2371 // ignored strings, that would be better.)
2372 theSet->removeAllStrings();
2373 int32_t setSize = theSet->size();
2374
2375 switch (setSize) {
2376 case 0:
2377 {
2378 // Set of no elements. Always fails to match.
2379 appendOp(URX_BACKTRACK, 0);
2380 delete theSet;
2381 }
2382 break;
2383
2384 case 1:
2385 {
2386 // The set contains only a single code point. Put it into
2387 // the compiled pattern as a single char operation rather
2388 // than a set, and discard the set itself.
2389 literalChar(theSet->charAt(0));
2390 delete theSet;
2391 }
2392 break;
2393
2394 default:
2395 {
2396 // The set contains two or more chars. (the normal case)
2397 // Put it into the compiled pattern as a set.
2398 int32_t setNumber = fRXPat->fSets->size();
2399 fRXPat->fSets->addElement(theSet, *fStatus);
2400 appendOp(URX_SETREF, setNumber);
2401 }
2402 }
2403 }
2404
2405
2406 //------------------------------------------------------------------------------
2407 //
2408 // compileInterval Generate the code for a {min, max} style interval quantifier.
2409 // Except for the specific opcodes used, the code is the same
2410 // for all three types (greedy, non-greedy, possessive) of
2411 // intervals. The opcodes are supplied as parameters.
2412 // (There are two sets of opcodes - greedy & possessive use the
2413 // same ones, while non-greedy has it's own.)
2414 //
2415 // The code for interval loops has this form:
2416 // 0 CTR_INIT counter loc (in stack frame)
2417 // 1 5 patt address of CTR_LOOP at bottom of block
2418 // 2 min count
2419 // 3 max count (-1 for unbounded)
2420 // 4 ... block to be iterated over
2421 // 5 CTR_LOOP
2422 //
2423 // In
2424 //------------------------------------------------------------------------------
compileInterval(int32_t InitOp,int32_t LoopOp)2425 void RegexCompile::compileInterval(int32_t InitOp, int32_t LoopOp)
2426 {
2427 // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2428 // four slots in the compiled code. Reserve them.
2429 int32_t topOfBlock = blockTopLoc(TRUE);
2430 insertOp(topOfBlock);
2431 insertOp(topOfBlock);
2432 insertOp(topOfBlock);
2433
2434 // The operands for the CTR_INIT opcode include the index in the matcher data
2435 // of the counter. Allocate it now. There are two data items
2436 // counterLoc --> Loop counter
2437 // +1 --> Input index (for breaking non-progressing loops)
2438 // (Only present if unbounded upper limit on loop)
2439 int32_t dataSize = fIntervalUpper < 0 ? 2 : 1;
2440 int32_t counterLoc = allocateStackData(dataSize);
2441
2442 int32_t op = buildOp(InitOp, counterLoc);
2443 fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2444
2445 // The second operand of CTR_INIT is the location following the end of the loop.
2446 // Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2447 // compilation of something later on causes the code to grow and the target
2448 // position to move.
2449 int32_t loopEnd = fRXPat->fCompiledPat->size();
2450 op = buildOp(URX_RELOC_OPRND, loopEnd);
2451 fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2452
2453 // Followed by the min and max counts.
2454 fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2455 fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2456
2457 // Apend the CTR_LOOP op. The operand is the location of the CTR_INIT op.
2458 // Goes at end of the block being looped over, so just append to the code so far.
2459 appendOp(LoopOp, topOfBlock);
2460
2461 if ((fIntervalLow & 0xff000000) != 0 ||
2462 (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2463 error(U_REGEX_NUMBER_TOO_BIG);
2464 }
2465
2466 if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2467 error(U_REGEX_MAX_LT_MIN);
2468 }
2469 }
2470
2471
2472
compileInlineInterval()2473 UBool RegexCompile::compileInlineInterval() {
2474 if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2475 // Too big to inline. Fail, which will cause looping code to be generated.
2476 // (Upper < Lower picks up unbounded upper and errors, both.)
2477 return FALSE;
2478 }
2479
2480 int32_t topOfBlock = blockTopLoc(FALSE);
2481 if (fIntervalUpper == 0) {
2482 // Pathological case. Attempt no matches, as if the block doesn't exist.
2483 // Discard the generated code for the block.
2484 // If the block included parens, discard the info pertaining to them as well.
2485 fRXPat->fCompiledPat->setSize(topOfBlock);
2486 if (fMatchOpenParen >= topOfBlock) {
2487 fMatchOpenParen = -1;
2488 }
2489 if (fMatchCloseParen >= topOfBlock) {
2490 fMatchCloseParen = -1;
2491 }
2492 return TRUE;
2493 }
2494
2495 if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2496 // The thing being repeated is not a single op, but some
2497 // more complex block. Do it as a loop, not inlines.
2498 // Note that things "repeated" a max of once are handled as inline, because
2499 // the one copy of the code already generated is just fine.
2500 return FALSE;
2501 }
2502
2503 // Pick up the opcode that is to be repeated
2504 //
2505 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2506
2507 // Compute the pattern location where the inline sequence
2508 // will end, and set up the state save op that will be needed.
2509 //
2510 int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2511 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2512 int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2513 if (fIntervalLow == 0) {
2514 insertOp(topOfBlock);
2515 fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2516 }
2517
2518
2519
2520 // Loop, emitting the op for the thing being repeated each time.
2521 // Loop starts at 1 because one instance of the op already exists in the pattern,
2522 // it was put there when it was originally encountered.
2523 int32_t i;
2524 for (i=1; i<fIntervalUpper; i++ ) {
2525 if (i >= fIntervalLow) {
2526 appendOp(saveOp);
2527 }
2528 appendOp(op);
2529 }
2530 return TRUE;
2531 }
2532
2533
2534
2535 //------------------------------------------------------------------------------
2536 //
2537 // caseInsensitiveStart given a single code point from a pattern string, determine the
2538 // set of characters that could potentially begin a case-insensitive
2539 // match of a string beginning with that character, using full Unicode
2540 // case insensitive matching.
2541 //
2542 // This is used in optimizing find().
2543 //
2544 // closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2545 // misses cases like this:
2546 // A string from the pattern begins with 'ss' (although all we know
2547 // in this context is that it begins with 's')
2548 // The pattern could match a string beginning with a German sharp-s
2549 //
2550 // To the ordinary case closure for a character c, we add all other
2551 // characters cx where the case closure of cx incudes a string form that begins
2552 // with the original character c.
2553 //
2554 // This function could be made smarter. The full pattern string is available
2555 // and it would be possible to verify that the extra characters being added
2556 // to the starting set fully match, rather than having just a first-char of the
2557 // folded form match.
2558 //
2559 //------------------------------------------------------------------------------
findCaseInsensitiveStarters(UChar32 c,UnicodeSet * starterChars)2560 void RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2561
2562 // Machine Generated below.
2563 // It may need updating with new versions of Unicode.
2564 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2565 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2566
2567 // Machine Generated Data. Do not hand edit.
2568 static const UChar32 RECaseFixCodePoints[] = {
2569 0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2570 0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2571 0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2572 0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2573 0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2574
2575 static const int16_t RECaseFixStringOffsets[] = {
2576 0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2577 0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2578 0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2579 0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2580 0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2581
2582 static const int16_t RECaseFixCounts[] = {
2583 0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2584 0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2585 0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2586 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2587 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2588
2589 static const UChar RECaseFixData[] = {
2590 0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2591 0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2592 0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2593 0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2594 0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2595 0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2596 0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2597 0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2598 0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2599 0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2600 0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2601
2602 // End of machine generated data.
2603
2604 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2605 UChar32 caseFoldedC = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2606 starterChars->set(caseFoldedC, caseFoldedC);
2607
2608 int32_t i;
2609 for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2610 // Simple linear search through the sorted list of interesting code points.
2611 }
2612
2613 if (RECaseFixCodePoints[i] == c) {
2614 int32_t dataIndex = RECaseFixStringOffsets[i];
2615 int32_t numCharsToAdd = RECaseFixCounts[i];
2616 UChar32 cpToAdd = 0;
2617 for (int32_t j=0; j<numCharsToAdd; j++) {
2618 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2619 starterChars->add(cpToAdd);
2620 }
2621 }
2622
2623 starterChars->closeOver(USET_CASE_INSENSITIVE);
2624 starterChars->removeAllStrings();
2625 } else {
2626 // Not a cased character. Just return it alone.
2627 starterChars->set(c, c);
2628 }
2629 }
2630
2631
2632
2633
2634 //------------------------------------------------------------------------------
2635 //
2636 // matchStartType Determine how a match can start.
2637 // Used to optimize find() operations.
2638 //
2639 // Operation is very similar to minMatchLength(). Walk the compiled
2640 // pattern, keeping an on-going minimum-match-length. For any
2641 // op where the min match coming in is zero, add that ops possible
2642 // starting matches to the possible starts for the overall pattern.
2643 //
2644 //------------------------------------------------------------------------------
matchStartType()2645 void RegexCompile::matchStartType() {
2646 if (U_FAILURE(*fStatus)) {
2647 return;
2648 }
2649
2650
2651 int32_t loc; // Location in the pattern of the current op being processed.
2652 int32_t op; // The op being processed
2653 int32_t opType; // The opcode type of the op
2654 int32_t currentLen = 0; // Minimum length of a match to this point (loc) in the pattern
2655 int32_t numInitialStrings = 0; // Number of strings encountered that could match at start.
2656
2657 UBool atStart = TRUE; // True if no part of the pattern yet encountered
2658 // could have advanced the position in a match.
2659 // (Maximum match length so far == 0)
2660
2661 // forwardedLength is a vector holding minimum-match-length values that
2662 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
2663 // It must be one longer than the pattern being checked because some ops
2664 // will jmp to a end-of-block+1 location from within a block, and we must
2665 // count those when checking the block.
2666 int32_t end = fRXPat->fCompiledPat->size();
2667 UVector32 forwardedLength(end+1, *fStatus);
2668 forwardedLength.setSize(end+1);
2669 for (loc=3; loc<end; loc++) {
2670 forwardedLength.setElementAt(INT32_MAX, loc);
2671 }
2672
2673 for (loc = 3; loc<end; loc++) {
2674 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2675 opType = URX_TYPE(op);
2676
2677 // The loop is advancing linearly through the pattern.
2678 // If the op we are now at was the destination of a branch in the pattern,
2679 // and that path has a shorter minimum length than the current accumulated value,
2680 // replace the current accumulated value.
2681 if (forwardedLength.elementAti(loc) < currentLen) {
2682 currentLen = forwardedLength.elementAti(loc);
2683 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2684 }
2685
2686 switch (opType) {
2687 // Ops that don't change the total length matched
2688 case URX_RESERVED_OP:
2689 case URX_END:
2690 case URX_FAIL:
2691 case URX_STRING_LEN:
2692 case URX_NOP:
2693 case URX_START_CAPTURE:
2694 case URX_END_CAPTURE:
2695 case URX_BACKSLASH_B:
2696 case URX_BACKSLASH_BU:
2697 case URX_BACKSLASH_G:
2698 case URX_BACKSLASH_Z:
2699 case URX_DOLLAR:
2700 case URX_DOLLAR_M:
2701 case URX_DOLLAR_D:
2702 case URX_DOLLAR_MD:
2703 case URX_RELOC_OPRND:
2704 case URX_STO_INP_LOC:
2705 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
2706 case URX_BACKREF_I:
2707
2708 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
2709 case URX_LD_SP:
2710 break;
2711
2712 case URX_CARET:
2713 if (atStart) {
2714 fRXPat->fStartType = START_START;
2715 }
2716 break;
2717
2718 case URX_CARET_M:
2719 case URX_CARET_M_UNIX:
2720 if (atStart) {
2721 fRXPat->fStartType = START_LINE;
2722 }
2723 break;
2724
2725 case URX_ONECHAR:
2726 if (currentLen == 0) {
2727 // This character could appear at the start of a match.
2728 // Add it to the set of possible starting characters.
2729 fRXPat->fInitialChars->add(URX_VAL(op));
2730 numInitialStrings += 2;
2731 }
2732 currentLen++;
2733 atStart = FALSE;
2734 break;
2735
2736
2737 case URX_SETREF:
2738 if (currentLen == 0) {
2739 int32_t sn = URX_VAL(op);
2740 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2741 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2742 fRXPat->fInitialChars->addAll(*s);
2743 numInitialStrings += 2;
2744 }
2745 currentLen++;
2746 atStart = FALSE;
2747 break;
2748
2749 case URX_LOOP_SR_I:
2750 // [Set]*, like a SETREF, above, in what it can match,
2751 // but may not match at all, so currentLen is not incremented.
2752 if (currentLen == 0) {
2753 int32_t sn = URX_VAL(op);
2754 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2755 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2756 fRXPat->fInitialChars->addAll(*s);
2757 numInitialStrings += 2;
2758 }
2759 atStart = FALSE;
2760 break;
2761
2762 case URX_LOOP_DOT_I:
2763 if (currentLen == 0) {
2764 // .* at the start of a pattern.
2765 // Any character can begin the match.
2766 fRXPat->fInitialChars->clear();
2767 fRXPat->fInitialChars->complement();
2768 numInitialStrings += 2;
2769 }
2770 atStart = FALSE;
2771 break;
2772
2773
2774 case URX_STATIC_SETREF:
2775 if (currentLen == 0) {
2776 int32_t sn = URX_VAL(op);
2777 U_ASSERT(sn>0 && sn<URX_LAST_SET);
2778 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2779 fRXPat->fInitialChars->addAll(*s);
2780 numInitialStrings += 2;
2781 }
2782 currentLen++;
2783 atStart = FALSE;
2784 break;
2785
2786
2787
2788 case URX_STAT_SETREF_N:
2789 if (currentLen == 0) {
2790 int32_t sn = URX_VAL(op);
2791 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2792 UnicodeSet sc(*s);
2793 sc.complement();
2794 fRXPat->fInitialChars->addAll(sc);
2795 numInitialStrings += 2;
2796 }
2797 currentLen++;
2798 atStart = FALSE;
2799 break;
2800
2801
2802
2803 case URX_BACKSLASH_D:
2804 // Digit Char
2805 if (currentLen == 0) {
2806 UnicodeSet s;
2807 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2808 if (URX_VAL(op) != 0) {
2809 s.complement();
2810 }
2811 fRXPat->fInitialChars->addAll(s);
2812 numInitialStrings += 2;
2813 }
2814 currentLen++;
2815 atStart = FALSE;
2816 break;
2817
2818
2819 case URX_BACKSLASH_H:
2820 // Horiz white space
2821 if (currentLen == 0) {
2822 UnicodeSet s;
2823 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2824 s.add((UChar32)9); // Tab
2825 if (URX_VAL(op) != 0) {
2826 s.complement();
2827 }
2828 fRXPat->fInitialChars->addAll(s);
2829 numInitialStrings += 2;
2830 }
2831 currentLen++;
2832 atStart = FALSE;
2833 break;
2834
2835
2836 case URX_BACKSLASH_R: // Any line ending sequence
2837 case URX_BACKSLASH_V: // Any line ending code point, with optional negation
2838 if (currentLen == 0) {
2839 UnicodeSet s;
2840 s.add((UChar32)0x0a, (UChar32)0x0d); // add range
2841 s.add((UChar32)0x85);
2842 s.add((UChar32)0x2028, (UChar32)0x2029);
2843 if (URX_VAL(op) != 0) {
2844 // Complement option applies to URX_BACKSLASH_V only.
2845 s.complement();
2846 }
2847 fRXPat->fInitialChars->addAll(s);
2848 numInitialStrings += 2;
2849 }
2850 currentLen++;
2851 atStart = FALSE;
2852 break;
2853
2854
2855
2856 case URX_ONECHAR_I:
2857 // Case Insensitive Single Character.
2858 if (currentLen == 0) {
2859 UChar32 c = URX_VAL(op);
2860 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2861 UnicodeSet starters(c, c);
2862 starters.closeOver(USET_CASE_INSENSITIVE);
2863 // findCaseInsensitiveStarters(c, &starters);
2864 // For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2865 // The expanded folding can't match the pattern.
2866 fRXPat->fInitialChars->addAll(starters);
2867 } else {
2868 // Char has no case variants. Just add it as-is to the
2869 // set of possible starting chars.
2870 fRXPat->fInitialChars->add(c);
2871 }
2872 numInitialStrings += 2;
2873 }
2874 currentLen++;
2875 atStart = FALSE;
2876 break;
2877
2878
2879 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
2880 case URX_DOTANY_ALL: // . matches one or two.
2881 case URX_DOTANY:
2882 case URX_DOTANY_UNIX:
2883 if (currentLen == 0) {
2884 // These constructs are all bad news when they appear at the start
2885 // of a match. Any character can begin the match.
2886 fRXPat->fInitialChars->clear();
2887 fRXPat->fInitialChars->complement();
2888 numInitialStrings += 2;
2889 }
2890 currentLen++;
2891 atStart = FALSE;
2892 break;
2893
2894
2895 case URX_JMPX:
2896 loc++; // Except for extra operand on URX_JMPX, same as URX_JMP.
2897 case URX_JMP:
2898 {
2899 int32_t jmpDest = URX_VAL(op);
2900 if (jmpDest < loc) {
2901 // Loop of some kind. Can safely ignore, the worst that will happen
2902 // is that we understate the true minimum length
2903 currentLen = forwardedLength.elementAti(loc+1);
2904
2905 } else {
2906 // Forward jump. Propagate the current min length to the target loc of the jump.
2907 U_ASSERT(jmpDest <= end+1);
2908 if (forwardedLength.elementAti(jmpDest) > currentLen) {
2909 forwardedLength.setElementAt(currentLen, jmpDest);
2910 }
2911 }
2912 }
2913 atStart = FALSE;
2914 break;
2915
2916 case URX_JMP_SAV:
2917 case URX_JMP_SAV_X:
2918 // Combo of state save to the next loc, + jmp backwards.
2919 // Net effect on min. length computation is nothing.
2920 atStart = FALSE;
2921 break;
2922
2923 case URX_BACKTRACK:
2924 // Fails are kind of like a branch, except that the min length was
2925 // propagated already, by the state save.
2926 currentLen = forwardedLength.elementAti(loc+1);
2927 atStart = FALSE;
2928 break;
2929
2930
2931 case URX_STATE_SAVE:
2932 {
2933 // State Save, for forward jumps, propagate the current minimum.
2934 // of the state save.
2935 int32_t jmpDest = URX_VAL(op);
2936 if (jmpDest > loc) {
2937 if (currentLen < forwardedLength.elementAti(jmpDest)) {
2938 forwardedLength.setElementAt(currentLen, jmpDest);
2939 }
2940 }
2941 }
2942 atStart = FALSE;
2943 break;
2944
2945
2946
2947
2948 case URX_STRING:
2949 {
2950 loc++;
2951 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2952 int32_t stringLen = URX_VAL(stringLenOp);
2953 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2954 U_ASSERT(stringLenOp >= 2);
2955 if (currentLen == 0) {
2956 // Add the starting character of this string to the set of possible starting
2957 // characters for this pattern.
2958 int32_t stringStartIdx = URX_VAL(op);
2959 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
2960 fRXPat->fInitialChars->add(c);
2961
2962 // Remember this string. After the entire pattern has been checked,
2963 // if nothing else is identified that can start a match, we'll use it.
2964 numInitialStrings++;
2965 fRXPat->fInitialStringIdx = stringStartIdx;
2966 fRXPat->fInitialStringLen = stringLen;
2967 }
2968
2969 currentLen += stringLen;
2970 atStart = FALSE;
2971 }
2972 break;
2973
2974 case URX_STRING_I:
2975 {
2976 // Case-insensitive string. Unlike exact-match strings, we won't
2977 // attempt a string search for possible match positions. But we
2978 // do update the set of possible starting characters.
2979 loc++;
2980 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2981 int32_t stringLen = URX_VAL(stringLenOp);
2982 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2983 U_ASSERT(stringLenOp >= 2);
2984 if (currentLen == 0) {
2985 // Add the starting character of this string to the set of possible starting
2986 // characters for this pattern.
2987 int32_t stringStartIdx = URX_VAL(op);
2988 UChar32 c = fRXPat->fLiteralText.char32At(stringStartIdx);
2989 UnicodeSet s;
2990 findCaseInsensitiveStarters(c, &s);
2991 fRXPat->fInitialChars->addAll(s);
2992 numInitialStrings += 2; // Matching on an initial string not possible.
2993 }
2994 currentLen += stringLen;
2995 atStart = FALSE;
2996 }
2997 break;
2998
2999 case URX_CTR_INIT:
3000 case URX_CTR_INIT_NG:
3001 {
3002 // Loop Init Ops. These don't change the min length, but they are 4 word ops
3003 // so location must be updated accordingly.
3004 // Loop Init Ops.
3005 // If the min loop count == 0
3006 // move loc forwards to the end of the loop, skipping over the body.
3007 // If the min count is > 0,
3008 // continue normal processing of the body of the loop.
3009 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3010 loopEndLoc = URX_VAL(loopEndLoc);
3011 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3012 if (minLoopCount == 0) {
3013 // Min Loop Count of 0, treat like a forward branch and
3014 // move the current minimum length up to the target
3015 // (end of loop) location.
3016 U_ASSERT(loopEndLoc <= end+1);
3017 if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3018 forwardedLength.setElementAt(currentLen, loopEndLoc);
3019 }
3020 }
3021 loc+=3; // Skips over operands of CTR_INIT
3022 }
3023 atStart = FALSE;
3024 break;
3025
3026
3027 case URX_CTR_LOOP:
3028 case URX_CTR_LOOP_NG:
3029 // Loop ops.
3030 // The jump is conditional, backwards only.
3031 atStart = FALSE;
3032 break;
3033
3034 case URX_LOOP_C:
3035 // More loop ops. These state-save to themselves.
3036 // don't change the minimum match
3037 atStart = FALSE;
3038 break;
3039
3040
3041 case URX_LA_START:
3042 case URX_LB_START:
3043 {
3044 // Look-around. Scan forward until the matching look-ahead end,
3045 // without processing the look-around block. This is overly pessimistic.
3046
3047 // Keep track of the nesting depth of look-around blocks. Boilerplate code for
3048 // lookahead contains two LA_END instructions, so count goes up by two
3049 // for each LA_START.
3050 int32_t depth = (opType == URX_LA_START? 2: 1);
3051 for (;;) {
3052 loc++;
3053 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3054 if (URX_TYPE(op) == URX_LA_START) {
3055 depth+=2;
3056 }
3057 if (URX_TYPE(op) == URX_LB_START) {
3058 depth++;
3059 }
3060 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3061 depth--;
3062 if (depth == 0) {
3063 break;
3064 }
3065 }
3066 if (URX_TYPE(op) == URX_STATE_SAVE) {
3067 // Need this because neg lookahead blocks will FAIL to outside
3068 // of the block.
3069 int32_t jmpDest = URX_VAL(op);
3070 if (jmpDest > loc) {
3071 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3072 forwardedLength.setElementAt(currentLen, jmpDest);
3073 }
3074 }
3075 }
3076 U_ASSERT(loc <= end);
3077 }
3078 }
3079 break;
3080
3081 case URX_LA_END:
3082 case URX_LB_CONT:
3083 case URX_LB_END:
3084 case URX_LBN_CONT:
3085 case URX_LBN_END:
3086 U_ASSERT(FALSE); // Shouldn't get here. These ops should be
3087 // consumed by the scan in URX_LA_START and LB_START
3088
3089 break;
3090
3091 default:
3092 U_ASSERT(FALSE);
3093 }
3094
3095 }
3096
3097
3098 // We have finished walking through the ops. Check whether some forward jump
3099 // propagated a shorter length to location end+1.
3100 if (forwardedLength.elementAti(end+1) < currentLen) {
3101 currentLen = forwardedLength.elementAti(end+1);
3102 }
3103
3104
3105 fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3106
3107
3108 // Sort out what we should check for when looking for candidate match start positions.
3109 // In order of preference,
3110 // 1. Start of input text buffer.
3111 // 2. A literal string.
3112 // 3. Start of line in multi-line mode.
3113 // 4. A single literal character.
3114 // 5. A character from a set of characters.
3115 //
3116 if (fRXPat->fStartType == START_START) {
3117 // Match only at the start of an input text string.
3118 // start type is already set. We're done.
3119 } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3120 // Match beginning only with a literal string.
3121 UChar32 c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3122 U_ASSERT(fRXPat->fInitialChars->contains(c));
3123 fRXPat->fStartType = START_STRING;
3124 fRXPat->fInitialChar = c;
3125 } else if (fRXPat->fStartType == START_LINE) {
3126 // Match at start of line in Multi-Line mode.
3127 // Nothing to do here; everything is already set.
3128 } else if (fRXPat->fMinMatchLen == 0) {
3129 // Zero length match possible. We could start anywhere.
3130 fRXPat->fStartType = START_NO_INFO;
3131 } else if (fRXPat->fInitialChars->size() == 1) {
3132 // All matches begin with the same char.
3133 fRXPat->fStartType = START_CHAR;
3134 fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3135 U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
3136 } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
3137 fRXPat->fMinMatchLen > 0) {
3138 // Matches start with a set of character smaller than the set of all chars.
3139 fRXPat->fStartType = START_SET;
3140 } else {
3141 // Matches can start with anything
3142 fRXPat->fStartType = START_NO_INFO;
3143 }
3144
3145 return;
3146 }
3147
3148
3149
3150 //------------------------------------------------------------------------------
3151 //
3152 // minMatchLength Calculate the length of the shortest string that could
3153 // match the specified pattern.
3154 // Length is in 16 bit code units, not code points.
3155 //
3156 // The calculated length may not be exact. The returned
3157 // value may be shorter than the actual minimum; it must
3158 // never be longer.
3159 //
3160 // start and end are the range of p-code operations to be
3161 // examined. The endpoints are included in the range.
3162 //
3163 //------------------------------------------------------------------------------
minMatchLength(int32_t start,int32_t end)3164 int32_t RegexCompile::minMatchLength(int32_t start, int32_t end) {
3165 if (U_FAILURE(*fStatus)) {
3166 return 0;
3167 }
3168
3169 U_ASSERT(start <= end);
3170 U_ASSERT(end < fRXPat->fCompiledPat->size());
3171
3172
3173 int32_t loc;
3174 int32_t op;
3175 int32_t opType;
3176 int32_t currentLen = 0;
3177
3178
3179 // forwardedLength is a vector holding minimum-match-length values that
3180 // are propagated forward in the pattern by JMP or STATE_SAVE operations.
3181 // It must be one longer than the pattern being checked because some ops
3182 // will jmp to a end-of-block+1 location from within a block, and we must
3183 // count those when checking the block.
3184 UVector32 forwardedLength(end+2, *fStatus);
3185 forwardedLength.setSize(end+2);
3186 for (loc=start; loc<=end+1; loc++) {
3187 forwardedLength.setElementAt(INT32_MAX, loc);
3188 }
3189
3190 for (loc = start; loc<=end; loc++) {
3191 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3192 opType = URX_TYPE(op);
3193
3194 // The loop is advancing linearly through the pattern.
3195 // If the op we are now at was the destination of a branch in the pattern,
3196 // and that path has a shorter minimum length than the current accumulated value,
3197 // replace the current accumulated value.
3198 // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX); // MinLength == INT32_MAX for some
3199 // no-match-possible cases.
3200 if (forwardedLength.elementAti(loc) < currentLen) {
3201 currentLen = forwardedLength.elementAti(loc);
3202 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3203 }
3204
3205 switch (opType) {
3206 // Ops that don't change the total length matched
3207 case URX_RESERVED_OP:
3208 case URX_END:
3209 case URX_STRING_LEN:
3210 case URX_NOP:
3211 case URX_START_CAPTURE:
3212 case URX_END_CAPTURE:
3213 case URX_BACKSLASH_B:
3214 case URX_BACKSLASH_BU:
3215 case URX_BACKSLASH_G:
3216 case URX_BACKSLASH_Z:
3217 case URX_CARET:
3218 case URX_DOLLAR:
3219 case URX_DOLLAR_M:
3220 case URX_DOLLAR_D:
3221 case URX_DOLLAR_MD:
3222 case URX_RELOC_OPRND:
3223 case URX_STO_INP_LOC:
3224 case URX_CARET_M:
3225 case URX_CARET_M_UNIX:
3226 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3227 case URX_BACKREF_I:
3228
3229 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3230 case URX_LD_SP:
3231
3232 case URX_JMP_SAV:
3233 case URX_JMP_SAV_X:
3234 break;
3235
3236
3237 // Ops that match a minimum of one character (one or two 16 bit code units.)
3238 //
3239 case URX_ONECHAR:
3240 case URX_STATIC_SETREF:
3241 case URX_STAT_SETREF_N:
3242 case URX_SETREF:
3243 case URX_BACKSLASH_D:
3244 case URX_BACKSLASH_H:
3245 case URX_BACKSLASH_R:
3246 case URX_BACKSLASH_V:
3247 case URX_ONECHAR_I:
3248 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
3249 case URX_DOTANY_ALL: // . matches one or two.
3250 case URX_DOTANY:
3251 case URX_DOTANY_UNIX:
3252 currentLen++;
3253 break;
3254
3255
3256 case URX_JMPX:
3257 loc++; // URX_JMPX has an extra operand, ignored here,
3258 // otherwise processed identically to URX_JMP.
3259 case URX_JMP:
3260 {
3261 int32_t jmpDest = URX_VAL(op);
3262 if (jmpDest < loc) {
3263 // Loop of some kind. Can safely ignore, the worst that will happen
3264 // is that we understate the true minimum length
3265 currentLen = forwardedLength.elementAti(loc+1);
3266 } else {
3267 // Forward jump. Propagate the current min length to the target loc of the jump.
3268 U_ASSERT(jmpDest <= end+1);
3269 if (forwardedLength.elementAti(jmpDest) > currentLen) {
3270 forwardedLength.setElementAt(currentLen, jmpDest);
3271 }
3272 }
3273 }
3274 break;
3275
3276 case URX_BACKTRACK:
3277 {
3278 // Back-tracks are kind of like a branch, except that the min length was
3279 // propagated already, by the state save.
3280 currentLen = forwardedLength.elementAti(loc+1);
3281 }
3282 break;
3283
3284
3285 case URX_STATE_SAVE:
3286 {
3287 // State Save, for forward jumps, propagate the current minimum.
3288 // of the state save.
3289 int32_t jmpDest = URX_VAL(op);
3290 if (jmpDest > loc) {
3291 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3292 forwardedLength.setElementAt(currentLen, jmpDest);
3293 }
3294 }
3295 }
3296 break;
3297
3298
3299 case URX_STRING:
3300 {
3301 loc++;
3302 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3303 currentLen += URX_VAL(stringLenOp);
3304 }
3305 break;
3306
3307
3308 case URX_STRING_I:
3309 {
3310 loc++;
3311 // TODO: with full case folding, matching input text may be shorter than
3312 // the string we have here. More smarts could put some bounds on it.
3313 // Assume a min length of one for now. A min length of zero causes
3314 // optimization failures for a pattern like "string"+
3315 // currentLen += URX_VAL(stringLenOp);
3316 currentLen += 1;
3317 }
3318 break;
3319
3320 case URX_CTR_INIT:
3321 case URX_CTR_INIT_NG:
3322 {
3323 // Loop Init Ops.
3324 // If the min loop count == 0
3325 // move loc forwards to the end of the loop, skipping over the body.
3326 // If the min count is > 0,
3327 // continue normal processing of the body of the loop.
3328 int32_t loopEndLoc = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3329 loopEndLoc = URX_VAL(loopEndLoc);
3330 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3331 if (minLoopCount == 0) {
3332 loc = loopEndLoc;
3333 } else {
3334 loc+=3; // Skips over operands of CTR_INIT
3335 }
3336 }
3337 break;
3338
3339
3340 case URX_CTR_LOOP:
3341 case URX_CTR_LOOP_NG:
3342 // Loop ops.
3343 // The jump is conditional, backwards only.
3344 break;
3345
3346 case URX_LOOP_SR_I:
3347 case URX_LOOP_DOT_I:
3348 case URX_LOOP_C:
3349 // More loop ops. These state-save to themselves.
3350 // don't change the minimum match - could match nothing at all.
3351 break;
3352
3353
3354 case URX_LA_START:
3355 case URX_LB_START:
3356 {
3357 // Look-around. Scan forward until the matching look-ahead end,
3358 // without processing the look-around block. This is overly pessimistic for look-ahead,
3359 // it assumes that the look-ahead match might be zero-length.
3360 // TODO: Positive lookahead could recursively do the block, then continue
3361 // with the longer of the block or the value coming in. Ticket 6060
3362 int32_t depth = (opType == URX_LA_START? 2: 1);;
3363 for (;;) {
3364 loc++;
3365 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3366 if (URX_TYPE(op) == URX_LA_START) {
3367 // The boilerplate for look-ahead includes two LA_END insturctions,
3368 // Depth will be decremented by each one when it is seen.
3369 depth += 2;
3370 }
3371 if (URX_TYPE(op) == URX_LB_START) {
3372 depth++;
3373 }
3374 if (URX_TYPE(op) == URX_LA_END) {
3375 depth--;
3376 if (depth == 0) {
3377 break;
3378 }
3379 }
3380 if (URX_TYPE(op)==URX_LBN_END) {
3381 depth--;
3382 if (depth == 0) {
3383 break;
3384 }
3385 }
3386 if (URX_TYPE(op) == URX_STATE_SAVE) {
3387 // Need this because neg lookahead blocks will FAIL to outside
3388 // of the block.
3389 int32_t jmpDest = URX_VAL(op);
3390 if (jmpDest > loc) {
3391 if (currentLen < forwardedLength.elementAti(jmpDest)) {
3392 forwardedLength.setElementAt(currentLen, jmpDest);
3393 }
3394 }
3395 }
3396 U_ASSERT(loc <= end);
3397 }
3398 }
3399 break;
3400
3401 case URX_LA_END:
3402 case URX_LB_CONT:
3403 case URX_LB_END:
3404 case URX_LBN_CONT:
3405 case URX_LBN_END:
3406 // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3407 // range being sized, which happens when measuring size of look-behind blocks.
3408 break;
3409
3410 default:
3411 U_ASSERT(FALSE);
3412 }
3413
3414 }
3415
3416 // We have finished walking through the ops. Check whether some forward jump
3417 // propagated a shorter length to location end+1.
3418 if (forwardedLength.elementAti(end+1) < currentLen) {
3419 currentLen = forwardedLength.elementAti(end+1);
3420 U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3421 }
3422
3423 return currentLen;
3424 }
3425
3426 // Increment with overflow check.
3427 // val and delta will both be positive.
3428
safeIncrement(int32_t val,int32_t delta)3429 static int32_t safeIncrement(int32_t val, int32_t delta) {
3430 if (INT32_MAX - val > delta) {
3431 return val + delta;
3432 } else {
3433 return INT32_MAX;
3434 }
3435 }
3436
3437
3438 //------------------------------------------------------------------------------
3439 //
3440 // maxMatchLength Calculate the length of the longest string that could
3441 // match the specified pattern.
3442 // Length is in 16 bit code units, not code points.
3443 //
3444 // The calculated length may not be exact. The returned
3445 // value may be longer than the actual maximum; it must
3446 // never be shorter.
3447 //
3448 //------------------------------------------------------------------------------
maxMatchLength(int32_t start,int32_t end)3449 int32_t RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3450 if (U_FAILURE(*fStatus)) {
3451 return 0;
3452 }
3453 U_ASSERT(start <= end);
3454 U_ASSERT(end < fRXPat->fCompiledPat->size());
3455
3456
3457 int32_t loc;
3458 int32_t op;
3459 int32_t opType;
3460 int32_t currentLen = 0;
3461 UVector32 forwardedLength(end+1, *fStatus);
3462 forwardedLength.setSize(end+1);
3463
3464 for (loc=start; loc<=end; loc++) {
3465 forwardedLength.setElementAt(0, loc);
3466 }
3467
3468 for (loc = start; loc<=end; loc++) {
3469 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3470 opType = URX_TYPE(op);
3471
3472 // The loop is advancing linearly through the pattern.
3473 // If the op we are now at was the destination of a branch in the pattern,
3474 // and that path has a longer maximum length than the current accumulated value,
3475 // replace the current accumulated value.
3476 if (forwardedLength.elementAti(loc) > currentLen) {
3477 currentLen = forwardedLength.elementAti(loc);
3478 }
3479
3480 switch (opType) {
3481 // Ops that don't change the total length matched
3482 case URX_RESERVED_OP:
3483 case URX_END:
3484 case URX_STRING_LEN:
3485 case URX_NOP:
3486 case URX_START_CAPTURE:
3487 case URX_END_CAPTURE:
3488 case URX_BACKSLASH_B:
3489 case URX_BACKSLASH_BU:
3490 case URX_BACKSLASH_G:
3491 case URX_BACKSLASH_Z:
3492 case URX_CARET:
3493 case URX_DOLLAR:
3494 case URX_DOLLAR_M:
3495 case URX_DOLLAR_D:
3496 case URX_DOLLAR_MD:
3497 case URX_RELOC_OPRND:
3498 case URX_STO_INP_LOC:
3499 case URX_CARET_M:
3500 case URX_CARET_M_UNIX:
3501
3502 case URX_STO_SP: // Setup for atomic or possessive blocks. Doesn't change what can match.
3503 case URX_LD_SP:
3504
3505 case URX_LB_END:
3506 case URX_LB_CONT:
3507 case URX_LBN_CONT:
3508 case URX_LBN_END:
3509 break;
3510
3511
3512 // Ops that increase that cause an unbounded increase in the length
3513 // of a matched string, or that increase it a hard to characterize way.
3514 // Call the max length unbounded, and stop further checking.
3515 case URX_BACKREF: // BackRef. Must assume that it might be a zero length match
3516 case URX_BACKREF_I:
3517 case URX_BACKSLASH_X: // Grahpeme Cluster. Minimum is 1, max unbounded.
3518 currentLen = INT32_MAX;
3519 break;
3520
3521
3522 // Ops that match a max of one character (possibly two 16 bit code units.)
3523 //
3524 case URX_STATIC_SETREF:
3525 case URX_STAT_SETREF_N:
3526 case URX_SETREF:
3527 case URX_BACKSLASH_D:
3528 case URX_BACKSLASH_H:
3529 case URX_BACKSLASH_R:
3530 case URX_BACKSLASH_V:
3531 case URX_ONECHAR_I:
3532 case URX_DOTANY_ALL:
3533 case URX_DOTANY:
3534 case URX_DOTANY_UNIX:
3535 currentLen = safeIncrement(currentLen, 2);
3536 break;
3537
3538 // Single literal character. Increase current max length by one or two,
3539 // depending on whether the char is in the supplementary range.
3540 case URX_ONECHAR:
3541 currentLen = safeIncrement(currentLen, 1);
3542 if (URX_VAL(op) > 0x10000) {
3543 currentLen = safeIncrement(currentLen, 1);
3544 }
3545 break;
3546
3547 // Jumps.
3548 //
3549 case URX_JMP:
3550 case URX_JMPX:
3551 case URX_JMP_SAV:
3552 case URX_JMP_SAV_X:
3553 {
3554 int32_t jmpDest = URX_VAL(op);
3555 if (jmpDest < loc) {
3556 // Loop of some kind. Max match length is unbounded.
3557 currentLen = INT32_MAX;
3558 } else {
3559 // Forward jump. Propagate the current min length to the target loc of the jump.
3560 if (forwardedLength.elementAti(jmpDest) < currentLen) {
3561 forwardedLength.setElementAt(currentLen, jmpDest);
3562 }
3563 currentLen = 0;
3564 }
3565 }
3566 break;
3567
3568 case URX_BACKTRACK:
3569 // back-tracks are kind of like a branch, except that the max length was
3570 // propagated already, by the state save.
3571 currentLen = forwardedLength.elementAti(loc+1);
3572 break;
3573
3574
3575 case URX_STATE_SAVE:
3576 {
3577 // State Save, for forward jumps, propagate the current minimum.
3578 // of the state save.
3579 // For backwards jumps, they create a loop, maximum
3580 // match length is unbounded.
3581 int32_t jmpDest = URX_VAL(op);
3582 if (jmpDest > loc) {
3583 if (currentLen > forwardedLength.elementAti(jmpDest)) {
3584 forwardedLength.setElementAt(currentLen, jmpDest);
3585 }
3586 } else {
3587 currentLen = INT32_MAX;
3588 }
3589 }
3590 break;
3591
3592
3593
3594
3595 case URX_STRING:
3596 {
3597 loc++;
3598 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3599 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3600 break;
3601 }
3602
3603 case URX_STRING_I:
3604 // TODO: This code assumes that any user string that matches will be no longer
3605 // than our compiled string, with case insensitive matching.
3606 // Our compiled string has been case-folded already.
3607 //
3608 // Any matching user string will have no more code points than our
3609 // compiled (folded) string. Folding may add code points, but
3610 // not remove them.
3611 //
3612 // There is a potential problem if a supplemental code point
3613 // case-folds to a BMP code point. In this case our compiled string
3614 // could be shorter (in code units) than a matching user string.
3615 //
3616 // At this time (Unicode 6.1) there are no such characters, and this case
3617 // is not being handled. A test, intltest regex/Bug9283, will fail if
3618 // any problematic characters are added to Unicode.
3619 //
3620 // If this happens, we can make a set of the BMP chars that the
3621 // troublesome supplementals fold to, scan our string, and bump the
3622 // currentLen one extra for each that is found.
3623 //
3624 {
3625 loc++;
3626 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3627 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3628 }
3629 break;
3630
3631 case URX_CTR_INIT:
3632 case URX_CTR_INIT_NG:
3633 // For Loops, recursively call this function on the pattern for the loop body,
3634 // then multiply the result by the maximum loop count.
3635 {
3636 int32_t loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3637 if (loopEndLoc == loc+4) {
3638 // Loop has an empty body. No affect on max match length.
3639 // Continue processing with code after the loop end.
3640 loc = loopEndLoc;
3641 break;
3642 }
3643
3644 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3645 if (maxLoopCount == -1) {
3646 // Unbounded Loop. No upper bound on match length.
3647 currentLen = INT32_MAX;
3648 break;
3649 }
3650
3651 U_ASSERT(loopEndLoc >= loc+4);
3652 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1); // Recursive call.
3653 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
3654 if (updatedLen >= INT32_MAX) {
3655 currentLen = INT32_MAX;
3656 break;
3657 }
3658 currentLen = (int32_t)updatedLen;
3659 loc = loopEndLoc;
3660 break;
3661 }
3662
3663 case URX_CTR_LOOP:
3664 case URX_CTR_LOOP_NG:
3665 // These opcodes will be skipped over by code for URX_CRT_INIT.
3666 // We shouldn't encounter them here.
3667 U_ASSERT(FALSE);
3668 break;
3669
3670 case URX_LOOP_SR_I:
3671 case URX_LOOP_DOT_I:
3672 case URX_LOOP_C:
3673 // For anything to do with loops, make the match length unbounded.
3674 currentLen = INT32_MAX;
3675 break;
3676
3677
3678
3679 case URX_LA_START:
3680 case URX_LA_END:
3681 // Look-ahead. Just ignore, treat the look-ahead block as if
3682 // it were normal pattern. Gives a too-long match length,
3683 // but good enough for now.
3684 break;
3685
3686 // End of look-ahead ops should always be consumed by the processing at
3687 // the URX_LA_START op.
3688 // U_ASSERT(FALSE);
3689 // break;
3690
3691 case URX_LB_START:
3692 {
3693 // Look-behind. Scan forward until the matching look-around end,
3694 // without processing the look-behind block.
3695 int32_t depth = 0;
3696 for (;;) {
3697 loc++;
3698 op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3699 if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
3700 depth++;
3701 }
3702 if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3703 if (depth == 0) {
3704 break;
3705 }
3706 depth--;
3707 }
3708 U_ASSERT(loc < end);
3709 }
3710 }
3711 break;
3712
3713 default:
3714 U_ASSERT(FALSE);
3715 }
3716
3717
3718 if (currentLen == INT32_MAX) {
3719 // The maximum length is unbounded.
3720 // Stop further processing of the pattern.
3721 break;
3722 }
3723
3724 }
3725 return currentLen;
3726
3727 }
3728
3729
3730 //------------------------------------------------------------------------------
3731 //
3732 // stripNOPs Remove any NOP operations from the compiled pattern code.
3733 // Extra NOPs are inserted for some constructs during the initial
3734 // code generation to provide locations that may be patched later.
3735 // Many end up unneeded, and are removed by this function.
3736 //
3737 // In order to minimize the number of passes through the pattern,
3738 // back-reference fixup is also performed here (adjusting
3739 // back-reference operands to point to the correct frame offsets).
3740 //
3741 //------------------------------------------------------------------------------
stripNOPs()3742 void RegexCompile::stripNOPs() {
3743
3744 if (U_FAILURE(*fStatus)) {
3745 return;
3746 }
3747
3748 int32_t end = fRXPat->fCompiledPat->size();
3749 UVector32 deltas(end, *fStatus);
3750
3751 // Make a first pass over the code, computing the amount that things
3752 // will be offset at each location in the original code.
3753 int32_t loc;
3754 int32_t d = 0;
3755 for (loc=0; loc<end; loc++) {
3756 deltas.addElement(d, *fStatus);
3757 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3758 if (URX_TYPE(op) == URX_NOP) {
3759 d++;
3760 }
3761 }
3762
3763 UnicodeString caseStringBuffer;
3764
3765 // Make a second pass over the code, removing the NOPs by moving following
3766 // code up, and patching operands that refer to code locations that
3767 // are being moved. The array of offsets from the first step is used
3768 // to compute the new operand values.
3769 int32_t src;
3770 int32_t dst = 0;
3771 for (src=0; src<end; src++) {
3772 int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3773 int32_t opType = URX_TYPE(op);
3774 switch (opType) {
3775 case URX_NOP:
3776 break;
3777
3778 case URX_STATE_SAVE:
3779 case URX_JMP:
3780 case URX_CTR_LOOP:
3781 case URX_CTR_LOOP_NG:
3782 case URX_RELOC_OPRND:
3783 case URX_JMPX:
3784 case URX_JMP_SAV:
3785 case URX_JMP_SAV_X:
3786 // These are instructions with operands that refer to code locations.
3787 {
3788 int32_t operandAddress = URX_VAL(op);
3789 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3790 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3791 op = buildOp(opType, fixedOperandAddress);
3792 fRXPat->fCompiledPat->setElementAt(op, dst);
3793 dst++;
3794 break;
3795 }
3796
3797 case URX_BACKREF:
3798 case URX_BACKREF_I:
3799 {
3800 int32_t where = URX_VAL(op);
3801 if (where > fRXPat->fGroupMap->size()) {
3802 error(U_REGEX_INVALID_BACK_REF);
3803 break;
3804 }
3805 where = fRXPat->fGroupMap->elementAti(where-1);
3806 op = buildOp(opType, where);
3807 fRXPat->fCompiledPat->setElementAt(op, dst);
3808 dst++;
3809
3810 fRXPat->fNeedsAltInput = TRUE;
3811 break;
3812 }
3813 case URX_RESERVED_OP:
3814 case URX_RESERVED_OP_N:
3815 case URX_BACKTRACK:
3816 case URX_END:
3817 case URX_ONECHAR:
3818 case URX_STRING:
3819 case URX_STRING_LEN:
3820 case URX_START_CAPTURE:
3821 case URX_END_CAPTURE:
3822 case URX_STATIC_SETREF:
3823 case URX_STAT_SETREF_N:
3824 case URX_SETREF:
3825 case URX_DOTANY:
3826 case URX_FAIL:
3827 case URX_BACKSLASH_B:
3828 case URX_BACKSLASH_BU:
3829 case URX_BACKSLASH_G:
3830 case URX_BACKSLASH_X:
3831 case URX_BACKSLASH_Z:
3832 case URX_DOTANY_ALL:
3833 case URX_BACKSLASH_D:
3834 case URX_CARET:
3835 case URX_DOLLAR:
3836 case URX_CTR_INIT:
3837 case URX_CTR_INIT_NG:
3838 case URX_DOTANY_UNIX:
3839 case URX_STO_SP:
3840 case URX_LD_SP:
3841 case URX_STO_INP_LOC:
3842 case URX_LA_START:
3843 case URX_LA_END:
3844 case URX_ONECHAR_I:
3845 case URX_STRING_I:
3846 case URX_DOLLAR_M:
3847 case URX_CARET_M:
3848 case URX_CARET_M_UNIX:
3849 case URX_LB_START:
3850 case URX_LB_CONT:
3851 case URX_LB_END:
3852 case URX_LBN_CONT:
3853 case URX_LBN_END:
3854 case URX_LOOP_SR_I:
3855 case URX_LOOP_DOT_I:
3856 case URX_LOOP_C:
3857 case URX_DOLLAR_D:
3858 case URX_DOLLAR_MD:
3859 case URX_BACKSLASH_H:
3860 case URX_BACKSLASH_R:
3861 case URX_BACKSLASH_V:
3862 // These instructions are unaltered by the relocation.
3863 fRXPat->fCompiledPat->setElementAt(op, dst);
3864 dst++;
3865 break;
3866
3867 default:
3868 // Some op is unaccounted for.
3869 U_ASSERT(FALSE);
3870 error(U_REGEX_INTERNAL_ERROR);
3871 }
3872 }
3873
3874 fRXPat->fCompiledPat->setSize(dst);
3875 }
3876
3877
3878
3879
3880 //------------------------------------------------------------------------------
3881 //
3882 // Error Report a rule parse error.
3883 // Only report it if no previous error has been recorded.
3884 //
3885 //------------------------------------------------------------------------------
error(UErrorCode e)3886 void RegexCompile::error(UErrorCode e) {
3887 if (U_SUCCESS(*fStatus)) {
3888 *fStatus = e;
3889 // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3890 // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3891 // int64_t. If the values of the latter are out of range for the former,
3892 // set them to the appropriate "field not supported" values.
3893 if (fLineNum > 0x7FFFFFFF) {
3894 fParseErr->line = 0;
3895 fParseErr->offset = -1;
3896 } else if (fCharNum > 0x7FFFFFFF) {
3897 fParseErr->line = (int32_t)fLineNum;
3898 fParseErr->offset = -1;
3899 } else {
3900 fParseErr->line = (int32_t)fLineNum;
3901 fParseErr->offset = (int32_t)fCharNum;
3902 }
3903
3904 UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3905
3906 // Fill in the context.
3907 // Note: extractBetween() pins supplied indicies to the string bounds.
3908 uprv_memset(fParseErr->preContext, 0, sizeof(fParseErr->preContext));
3909 uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3910 utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3911 utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3912 }
3913 }
3914
3915
3916 //
3917 // Assorted Unicode character constants.
3918 // Numeric because there is no portable way to enter them as literals.
3919 // (Think EBCDIC).
3920 //
3921 static const UChar chCR = 0x0d; // New lines, for terminating comments.
3922 static const UChar chLF = 0x0a; // Line Feed
3923 static const UChar chPound = 0x23; // '#', introduces a comment.
3924 static const UChar chDigit0 = 0x30; // '0'
3925 static const UChar chDigit7 = 0x37; // '9'
3926 static const UChar chColon = 0x3A; // ':'
3927 static const UChar chE = 0x45; // 'E'
3928 static const UChar chQ = 0x51; // 'Q'
3929 //static const UChar chN = 0x4E; // 'N'
3930 static const UChar chP = 0x50; // 'P'
3931 static const UChar chBackSlash = 0x5c; // '\' introduces a char escape
3932 //static const UChar chLBracket = 0x5b; // '['
3933 static const UChar chRBracket = 0x5d; // ']'
3934 static const UChar chUp = 0x5e; // '^'
3935 static const UChar chLowerP = 0x70;
3936 static const UChar chLBrace = 0x7b; // '{'
3937 static const UChar chRBrace = 0x7d; // '}'
3938 static const UChar chNEL = 0x85; // NEL newline variant
3939 static const UChar chLS = 0x2028; // Unicode Line Separator
3940
3941
3942 //------------------------------------------------------------------------------
3943 //
3944 // nextCharLL Low Level Next Char from the regex pattern.
3945 // Get a char from the string, keep track of input position
3946 // for error reporting.
3947 //
3948 //------------------------------------------------------------------------------
nextCharLL()3949 UChar32 RegexCompile::nextCharLL() {
3950 UChar32 ch;
3951
3952 if (fPeekChar != -1) {
3953 ch = fPeekChar;
3954 fPeekChar = -1;
3955 return ch;
3956 }
3957
3958 // assume we're already in the right place
3959 ch = UTEXT_NEXT32(fRXPat->fPattern);
3960 if (ch == U_SENTINEL) {
3961 return ch;
3962 }
3963
3964 if (ch == chCR ||
3965 ch == chNEL ||
3966 ch == chLS ||
3967 (ch == chLF && fLastChar != chCR)) {
3968 // Character is starting a new line. Bump up the line number, and
3969 // reset the column to 0.
3970 fLineNum++;
3971 fCharNum=0;
3972 }
3973 else {
3974 // Character is not starting a new line. Except in the case of a
3975 // LF following a CR, increment the column position.
3976 if (ch != chLF) {
3977 fCharNum++;
3978 }
3979 }
3980 fLastChar = ch;
3981 return ch;
3982 }
3983
3984 //------------------------------------------------------------------------------
3985 //
3986 // peekCharLL Low Level Character Scanning, sneak a peek at the next
3987 // character without actually getting it.
3988 //
3989 //------------------------------------------------------------------------------
peekCharLL()3990 UChar32 RegexCompile::peekCharLL() {
3991 if (fPeekChar == -1) {
3992 fPeekChar = nextCharLL();
3993 }
3994 return fPeekChar;
3995 }
3996
3997
3998 //------------------------------------------------------------------------------
3999 //
4000 // nextChar for pattern scanning. At this level, we handle stripping
4001 // out comments and processing some backslash character escapes.
4002 // The rest of the pattern grammar is handled at the next level up.
4003 //
4004 //------------------------------------------------------------------------------
nextChar(RegexPatternChar & c)4005 void RegexCompile::nextChar(RegexPatternChar &c) {
4006
4007 fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4008 c.fChar = nextCharLL();
4009 c.fQuoted = FALSE;
4010
4011 if (fQuoteMode) {
4012 c.fQuoted = TRUE;
4013 if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
4014 c.fChar == (UChar32)-1) {
4015 fQuoteMode = FALSE; // Exit quote mode,
4016 nextCharLL(); // discard the E
4017 nextChar(c); // recurse to get the real next char
4018 }
4019 }
4020 else if (fInBackslashQuote) {
4021 // The current character immediately follows a '\'
4022 // Don't check for any further escapes, just return it as-is.
4023 // Don't set c.fQuoted, because that would prevent the state machine from
4024 // dispatching on the character.
4025 fInBackslashQuote = FALSE;
4026 }
4027 else
4028 {
4029 // We are not in a \Q quoted region \E of the source.
4030 //
4031 if (fModeFlags & UREGEX_COMMENTS) {
4032 //
4033 // We are in free-spacing and comments mode.
4034 // Scan through any white space and comments, until we
4035 // reach a significant character or the end of inut.
4036 for (;;) {
4037 if (c.fChar == (UChar32)-1) {
4038 break; // End of Input
4039 }
4040 if (c.fChar == chPound && fEOLComments == TRUE) {
4041 // Start of a comment. Consume the rest of it, until EOF or a new line
4042 for (;;) {
4043 c.fChar = nextCharLL();
4044 if (c.fChar == (UChar32)-1 || // EOF
4045 c.fChar == chCR ||
4046 c.fChar == chLF ||
4047 c.fChar == chNEL ||
4048 c.fChar == chLS) {
4049 break;
4050 }
4051 }
4052 }
4053 // TODO: check what Java & Perl do with non-ASCII white spaces. Ticket 6061.
4054 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
4055 break;
4056 }
4057 c.fChar = nextCharLL();
4058 }
4059 }
4060
4061 //
4062 // check for backslash escaped characters.
4063 //
4064 if (c.fChar == chBackSlash) {
4065 int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4066 if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4067 //
4068 // A '\' sequence that is handled by ICU's standard unescapeAt function.
4069 // Includes \uxxxx, \n, \r, many others.
4070 // Return the single equivalent character.
4071 //
4072 nextCharLL(); // get & discard the peeked char.
4073 c.fQuoted = TRUE;
4074
4075 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4076 int32_t endIndex = (int32_t)pos;
4077 c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4078
4079 if (endIndex == pos) {
4080 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4081 }
4082 fCharNum += endIndex - pos;
4083 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4084 } else {
4085 int32_t offset = 0;
4086 struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4087
4088 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4089 c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4090
4091 if (offset == 0) {
4092 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4093 } else if (context.lastOffset == offset) {
4094 UTEXT_PREVIOUS32(fRXPat->fPattern);
4095 } else if (context.lastOffset != offset-1) {
4096 utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
4097 }
4098 fCharNum += offset;
4099 }
4100 }
4101 else if (peekCharLL() == chDigit0) {
4102 // Octal Escape, using Java Regexp Conventions
4103 // which are \0 followed by 1-3 octal digits.
4104 // Different from ICU Unescape handling of Octal, which does not
4105 // require the leading 0.
4106 // Java also has the convention of only consuming 2 octal digits if
4107 // the three digit number would be > 0xff
4108 //
4109 c.fChar = 0;
4110 nextCharLL(); // Consume the initial 0.
4111 int index;
4112 for (index=0; index<3; index++) {
4113 int32_t ch = peekCharLL();
4114 if (ch<chDigit0 || ch>chDigit7) {
4115 if (index==0) {
4116 // \0 is not followed by any octal digits.
4117 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4118 }
4119 break;
4120 }
4121 c.fChar <<= 3;
4122 c.fChar += ch&7;
4123 if (c.fChar <= 255) {
4124 nextCharLL();
4125 } else {
4126 // The last digit made the number too big. Forget we saw it.
4127 c.fChar >>= 3;
4128 }
4129 }
4130 c.fQuoted = TRUE;
4131 }
4132 else if (peekCharLL() == chQ) {
4133 // "\Q" enter quote mode, which will continue until "\E"
4134 fQuoteMode = TRUE;
4135 nextCharLL(); // discard the 'Q'.
4136 nextChar(c); // recurse to get the real next char.
4137 }
4138 else
4139 {
4140 // We are in a '\' escape that will be handled by the state table scanner.
4141 // Just return the backslash, but remember that the following char is to
4142 // be taken literally.
4143 fInBackslashQuote = TRUE;
4144 }
4145 }
4146 }
4147
4148 // re-enable # to end-of-line comments, in case they were disabled.
4149 // They are disabled by the parser upon seeing '(?', but this lasts for
4150 // the fetching of the next character only.
4151 fEOLComments = TRUE;
4152
4153 // putc(c.fChar, stdout);
4154 }
4155
4156
4157
4158 //------------------------------------------------------------------------------
4159 //
4160 // scanNamedChar
4161 // Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4162 //
4163 // The scan position will be at the 'N'. On return
4164 // the scan position should be just after the '}'
4165 //
4166 // Return the UChar32
4167 //
4168 //------------------------------------------------------------------------------
scanNamedChar()4169 UChar32 RegexCompile::scanNamedChar() {
4170 if (U_FAILURE(*fStatus)) {
4171 return 0;
4172 }
4173
4174 nextChar(fC);
4175 if (fC.fChar != chLBrace) {
4176 error(U_REGEX_PROPERTY_SYNTAX);
4177 return 0;
4178 }
4179
4180 UnicodeString charName;
4181 for (;;) {
4182 nextChar(fC);
4183 if (fC.fChar == chRBrace) {
4184 break;
4185 }
4186 if (fC.fChar == -1) {
4187 error(U_REGEX_PROPERTY_SYNTAX);
4188 return 0;
4189 }
4190 charName.append(fC.fChar);
4191 }
4192
4193 char name[100];
4194 if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
4195 (uint32_t)charName.length()>=sizeof(name)) {
4196 // All Unicode character names have only invariant characters.
4197 // The API to get a character, given a name, accepts only char *, forcing us to convert,
4198 // which requires this error check
4199 error(U_REGEX_PROPERTY_SYNTAX);
4200 return 0;
4201 }
4202 charName.extract(0, charName.length(), name, sizeof(name), US_INV);
4203
4204 UChar32 theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4205 if (U_FAILURE(*fStatus)) {
4206 error(U_REGEX_PROPERTY_SYNTAX);
4207 }
4208
4209 nextChar(fC); // Continue overall regex pattern processing with char after the '}'
4210 return theChar;
4211 }
4212
4213 //------------------------------------------------------------------------------
4214 //
4215 // scanProp Construct a UnicodeSet from the text at the current scan
4216 // position, which will be of the form \p{whaterver}
4217 //
4218 // The scan position will be at the 'p' or 'P'. On return
4219 // the scan position should be just after the '}'
4220 //
4221 // Return a UnicodeSet, constructed from the \P pattern,
4222 // or NULL if the pattern is invalid.
4223 //
4224 //------------------------------------------------------------------------------
scanProp()4225 UnicodeSet *RegexCompile::scanProp() {
4226 UnicodeSet *uset = NULL;
4227
4228 if (U_FAILURE(*fStatus)) {
4229 return NULL;
4230 }
4231 (void)chLowerP; // Suppress compiler unused variable warning.
4232 U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4233 UBool negated = (fC.fChar == chP);
4234
4235 UnicodeString propertyName;
4236 nextChar(fC);
4237 if (fC.fChar != chLBrace) {
4238 error(U_REGEX_PROPERTY_SYNTAX);
4239 return NULL;
4240 }
4241 for (;;) {
4242 nextChar(fC);
4243 if (fC.fChar == chRBrace) {
4244 break;
4245 }
4246 if (fC.fChar == -1) {
4247 // Hit the end of the input string without finding the closing '}'
4248 error(U_REGEX_PROPERTY_SYNTAX);
4249 return NULL;
4250 }
4251 propertyName.append(fC.fChar);
4252 }
4253 uset = createSetForProperty(propertyName, negated);
4254 nextChar(fC); // Move input scan to position following the closing '}'
4255 return uset;
4256 }
4257
4258 //------------------------------------------------------------------------------
4259 //
4260 // scanPosixProp Construct a UnicodeSet from the text at the current scan
4261 // position, which is expected be of the form [:property expression:]
4262 //
4263 // The scan position will be at the opening ':'. On return
4264 // the scan position must be on the closing ']'
4265 //
4266 // Return a UnicodeSet constructed from the pattern,
4267 // or NULL if this is not a valid POSIX-style set expression.
4268 // If not a property expression, restore the initial scan position
4269 // (to the opening ':')
4270 //
4271 // Note: the opening '[:' is not sufficient to guarantee that
4272 // this is a [:property:] expression.
4273 // [:'+=,] is a perfectly good ordinary set expression that
4274 // happens to include ':' as one of its characters.
4275 //
4276 //------------------------------------------------------------------------------
scanPosixProp()4277 UnicodeSet *RegexCompile::scanPosixProp() {
4278 UnicodeSet *uset = NULL;
4279
4280 if (U_FAILURE(*fStatus)) {
4281 return NULL;
4282 }
4283
4284 U_ASSERT(fC.fChar == chColon);
4285
4286 // Save the scanner state.
4287 // TODO: move this into the scanner, with the state encapsulated in some way. Ticket 6062
4288 int64_t savedScanIndex = fScanIndex;
4289 int64_t savedNextIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4290 UBool savedQuoteMode = fQuoteMode;
4291 UBool savedInBackslashQuote = fInBackslashQuote;
4292 UBool savedEOLComments = fEOLComments;
4293 int64_t savedLineNum = fLineNum;
4294 int64_t savedCharNum = fCharNum;
4295 UChar32 savedLastChar = fLastChar;
4296 UChar32 savedPeekChar = fPeekChar;
4297 RegexPatternChar savedfC = fC;
4298
4299 // Scan for a closing ]. A little tricky because there are some perverse
4300 // edge cases possible. "[:abc\Qdef:] \E]" is a valid non-property expression,
4301 // ending on the second closing ].
4302
4303 UnicodeString propName;
4304 UBool negated = FALSE;
4305
4306 // Check for and consume the '^' in a negated POSIX property, e.g. [:^Letter:]
4307 nextChar(fC);
4308 if (fC.fChar == chUp) {
4309 negated = TRUE;
4310 nextChar(fC);
4311 }
4312
4313 // Scan for the closing ":]", collecting the property name along the way.
4314 UBool sawPropSetTerminator = FALSE;
4315 for (;;) {
4316 propName.append(fC.fChar);
4317 nextChar(fC);
4318 if (fC.fQuoted || fC.fChar == -1) {
4319 // Escaped characters or end of input - either says this isn't a [:Property:]
4320 break;
4321 }
4322 if (fC.fChar == chColon) {
4323 nextChar(fC);
4324 if (fC.fChar == chRBracket) {
4325 sawPropSetTerminator = TRUE;
4326 }
4327 break;
4328 }
4329 }
4330
4331 if (sawPropSetTerminator) {
4332 uset = createSetForProperty(propName, negated);
4333 }
4334 else
4335 {
4336 // No closing ":]".
4337 // Restore the original scan position.
4338 // The main scanner will retry the input as a normal set expression,
4339 // not a [:Property:] expression.
4340 fScanIndex = savedScanIndex;
4341 fQuoteMode = savedQuoteMode;
4342 fInBackslashQuote = savedInBackslashQuote;
4343 fEOLComments = savedEOLComments;
4344 fLineNum = savedLineNum;
4345 fCharNum = savedCharNum;
4346 fLastChar = savedLastChar;
4347 fPeekChar = savedPeekChar;
4348 fC = savedfC;
4349 UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4350 }
4351 return uset;
4352 }
4353
addIdentifierIgnorable(UnicodeSet * set,UErrorCode & ec)4354 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4355 set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4356 addCategory(set, U_GC_CF_MASK, ec);
4357 }
4358
4359 //
4360 // Create a Unicode Set from a Unicode Property expression.
4361 // This is common code underlying both \p{...} ane [:...:] expressions.
4362 // Includes trying the Java "properties" that aren't supported as
4363 // normal ICU UnicodeSet properties
4364 //
4365 static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{"
4366 static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{"
createSetForProperty(const UnicodeString & propName,UBool negated)4367 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4368 UnicodeString setExpr;
4369 UnicodeSet *set;
4370 uint32_t usetFlags = 0;
4371
4372 if (U_FAILURE(*fStatus)) {
4373 return NULL;
4374 }
4375
4376 //
4377 // First try the property as we received it
4378 //
4379 if (negated) {
4380 setExpr.append(negSetPrefix, -1);
4381 } else {
4382 setExpr.append(posSetPrefix, -1);
4383 }
4384 setExpr.append(propName);
4385 setExpr.append(chRBrace);
4386 setExpr.append(chRBracket);
4387 if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4388 usetFlags |= USET_CASE_INSENSITIVE;
4389 }
4390 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
4391 if (U_SUCCESS(*fStatus)) {
4392 return set;
4393 }
4394 delete set;
4395 set = NULL;
4396
4397 //
4398 // The property as it was didn't work.
4399
4400 // Do [:word:]. It is not recognized as a property by UnicodeSet. "word" not standard POSIX
4401 // or standard Java, but many other regular expression packages do recognize it.
4402
4403 if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) {
4404 *fStatus = U_ZERO_ERROR;
4405 set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET]));
4406 if (set == NULL) {
4407 *fStatus = U_MEMORY_ALLOCATION_ERROR;
4408 return set;
4409 }
4410 if (negated) {
4411 set->complement();
4412 }
4413 return set;
4414 }
4415
4416
4417 // Do Java fixes -
4418 // InGreek -> InGreek or Coptic, that being the official Unicode name for that block.
4419 // InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
4420 //
4421 // Note on Spaces: either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
4422 // is accepted by Java. The property part of the name is compared
4423 // case-insenstively. The spaces must be exactly as shown, either
4424 // all there, or all omitted, with exactly one at each position
4425 // if they are present. From checking against JDK 1.6
4426 //
4427 // This code should be removed when ICU properties support the Java compatibility names
4428 // (ICU 4.0?)
4429 //
4430 UnicodeString mPropName = propName;
4431 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) {
4432 mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic");
4433 }
4434 if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 ||
4435 mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) {
4436 mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols");
4437 }
4438 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4439 mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint");
4440 }
4441
4442 // See if the property looks like a Java "InBlockName", which
4443 // we will recast as "Block=BlockName"
4444 //
4445 static const UChar IN[] = {0x49, 0x6E, 0}; // "In"
4446 static const UChar BLOCK[] = {0x42, 0x6C, 0x6f, 0x63, 0x6b, 0x3d, 00}; // "Block="
4447 if (mPropName.startsWith(IN, 2) && propName.length()>=3) {
4448 setExpr.truncate(4); // Leaves "[\p{", or "[\P{"
4449 setExpr.append(BLOCK, -1);
4450 setExpr.append(UnicodeString(mPropName, 2)); // Property with the leading "In" removed.
4451 setExpr.append(chRBrace);
4452 setExpr.append(chRBracket);
4453 *fStatus = U_ZERO_ERROR;
4454 set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
4455 if (U_SUCCESS(*fStatus)) {
4456 return set;
4457 }
4458 delete set;
4459 set = NULL;
4460 }
4461
4462 if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) ||
4463 propName.compare(UNICODE_STRING_SIMPLE("all")) == 0)
4464 {
4465 UErrorCode localStatus = U_ZERO_ERROR;
4466 //setExpr.remove();
4467 set = new UnicodeSet();
4468 //
4469 // Try the various Java specific properties.
4470 // These all begin with "java"
4471 //
4472 if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) {
4473 addCategory(set, U_GC_CN_MASK, localStatus);
4474 set->complement();
4475 }
4476 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) {
4477 addCategory(set, U_GC_ND_MASK, localStatus);
4478 }
4479 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) {
4480 addIdentifierIgnorable(set, localStatus);
4481 }
4482 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) {
4483 set->add(0, 0x1F).add(0x7F, 0x9F);
4484 }
4485 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) {
4486 addCategory(set, U_GC_L_MASK, localStatus);
4487 addCategory(set, U_GC_SC_MASK, localStatus);
4488 addCategory(set, U_GC_PC_MASK, localStatus);
4489 addCategory(set, U_GC_ND_MASK, localStatus);
4490 addCategory(set, U_GC_NL_MASK, localStatus);
4491 addCategory(set, U_GC_MC_MASK, localStatus);
4492 addCategory(set, U_GC_MN_MASK, localStatus);
4493 addIdentifierIgnorable(set, localStatus);
4494 }
4495 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) {
4496 addCategory(set, U_GC_L_MASK, localStatus);
4497 addCategory(set, U_GC_NL_MASK, localStatus);
4498 addCategory(set, U_GC_SC_MASK, localStatus);
4499 addCategory(set, U_GC_PC_MASK, localStatus);
4500 }
4501 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) {
4502 addCategory(set, U_GC_L_MASK, localStatus);
4503 }
4504 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) {
4505 addCategory(set, U_GC_L_MASK, localStatus);
4506 addCategory(set, U_GC_ND_MASK, localStatus);
4507 }
4508 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) {
4509 addCategory(set, U_GC_LL_MASK, localStatus);
4510 }
4511 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) {
4512 set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus);
4513 }
4514 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) {
4515 addCategory(set, U_GC_Z_MASK, localStatus);
4516 }
4517 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) {
4518 set->add(0x10000, UnicodeSet::MAX_VALUE);
4519 }
4520 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) {
4521 addCategory(set, U_GC_LT_MASK, localStatus);
4522 }
4523 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) {
4524 addCategory(set, U_GC_L_MASK, localStatus);
4525 addCategory(set, U_GC_NL_MASK, localStatus);
4526 }
4527 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) {
4528 addCategory(set, U_GC_L_MASK, localStatus);
4529 addCategory(set, U_GC_PC_MASK, localStatus);
4530 addCategory(set, U_GC_ND_MASK, localStatus);
4531 addCategory(set, U_GC_NL_MASK, localStatus);
4532 addCategory(set, U_GC_MC_MASK, localStatus);
4533 addCategory(set, U_GC_MN_MASK, localStatus);
4534 addIdentifierIgnorable(set, localStatus);
4535 }
4536 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) {
4537 addCategory(set, U_GC_LU_MASK, localStatus);
4538 }
4539 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) {
4540 set->add(0, UnicodeSet::MAX_VALUE);
4541 }
4542 else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) {
4543 addCategory(set, U_GC_Z_MASK, localStatus);
4544 set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4545 set->add(9, 0x0d).add(0x1c, 0x1f);
4546 }
4547 else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
4548 set->add(0, UnicodeSet::MAX_VALUE);
4549 }
4550
4551 if (U_SUCCESS(localStatus) && !set->isEmpty()) {
4552 *fStatus = U_ZERO_ERROR;
4553 if (usetFlags & USET_CASE_INSENSITIVE) {
4554 set->closeOver(USET_CASE_INSENSITIVE);
4555 }
4556 if (negated) {
4557 set->complement();
4558 }
4559 return set;
4560 }
4561 delete set;
4562 set = NULL;
4563 }
4564 error(*fStatus);
4565 return NULL;
4566 }
4567
4568
4569
4570 //
4571 // SetEval Part of the evaluation of [set expressions].
4572 // Perform any pending (stacked) operations with precedence
4573 // equal or greater to that of the next operator encountered
4574 // in the expression.
4575 //
setEval(int32_t nextOp)4576 void RegexCompile::setEval(int32_t nextOp) {
4577 UnicodeSet *rightOperand = NULL;
4578 UnicodeSet *leftOperand = NULL;
4579 for (;;) {
4580 U_ASSERT(fSetOpStack.empty()==FALSE);
4581 int32_t pendingSetOperation = fSetOpStack.peeki();
4582 if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4583 break;
4584 }
4585 fSetOpStack.popi();
4586 U_ASSERT(fSetStack.empty() == FALSE);
4587 rightOperand = (UnicodeSet *)fSetStack.peek();
4588 switch (pendingSetOperation) {
4589 case setNegation:
4590 rightOperand->complement();
4591 break;
4592 case setCaseClose:
4593 // TODO: need a simple close function. Ticket 6065
4594 rightOperand->closeOver(USET_CASE_INSENSITIVE);
4595 rightOperand->removeAllStrings();
4596 break;
4597 case setDifference1:
4598 case setDifference2:
4599 fSetStack.pop();
4600 leftOperand = (UnicodeSet *)fSetStack.peek();
4601 leftOperand->removeAll(*rightOperand);
4602 delete rightOperand;
4603 break;
4604 case setIntersection1:
4605 case setIntersection2:
4606 fSetStack.pop();
4607 leftOperand = (UnicodeSet *)fSetStack.peek();
4608 leftOperand->retainAll(*rightOperand);
4609 delete rightOperand;
4610 break;
4611 case setUnion:
4612 fSetStack.pop();
4613 leftOperand = (UnicodeSet *)fSetStack.peek();
4614 leftOperand->addAll(*rightOperand);
4615 delete rightOperand;
4616 break;
4617 default:
4618 U_ASSERT(FALSE);
4619 break;
4620 }
4621 }
4622 }
4623
setPushOp(int32_t op)4624 void RegexCompile::setPushOp(int32_t op) {
4625 setEval(op);
4626 fSetOpStack.push(op, *fStatus);
4627 fSetStack.push(new UnicodeSet(), *fStatus);
4628 }
4629
4630 U_NAMESPACE_END
4631 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
4632
4633