1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 //  file:  regexcmp.cpp
5 //
6 //  Copyright (C) 2002-2016 International Business Machines Corporation and others.
7 //  All Rights Reserved.
8 //
9 //  This file contains the ICU regular expression compiler, which is responsible
10 //  for processing a regular expression pattern into the compiled form that
11 //  is used by the match finding engine.
12 //
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
17 
18 #include "unicode/ustring.h"
19 #include "unicode/unistr.h"
20 #include "unicode/uniset.h"
21 #include "unicode/uchar.h"
22 #include "unicode/uchriter.h"
23 #include "unicode/parsepos.h"
24 #include "unicode/parseerr.h"
25 #include "unicode/regex.h"
26 #include "unicode/utf.h"
27 #include "unicode/utf16.h"
28 #include "patternprops.h"
29 #include "putilimp.h"
30 #include "cmemory.h"
31 #include "cstr.h"
32 #include "cstring.h"
33 #include "uvectr32.h"
34 #include "uvectr64.h"
35 #include "uassert.h"
36 #include "uinvchar.h"
37 
38 #include "regeximp.h"
39 #include "regexcst.h"   // Contains state table for the regex pattern parser.
40                         //   generated by a Perl script.
41 #include "regexcmp.h"
42 #include "regexst.h"
43 #include "regextxt.h"
44 
45 
46 
47 U_NAMESPACE_BEGIN
48 
49 
50 //------------------------------------------------------------------------------
51 //
52 //  Constructor.
53 //
54 //------------------------------------------------------------------------------
RegexCompile(RegexPattern * rxp,UErrorCode & status)55 RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
56    fParenStack(status), fSetStack(status), fSetOpStack(status)
57 {
58     // Lazy init of all shared global sets (needed for init()'s empty text)
59     RegexStaticSets::initGlobals(&status);
60 
61     fStatus           = &status;
62 
63     fRXPat            = rxp;
64     fScanIndex        = 0;
65     fLastChar         = -1;
66     fPeekChar         = -1;
67     fLineNum          = 1;
68     fCharNum          = 0;
69     fQuoteMode        = FALSE;
70     fInBackslashQuote = FALSE;
71     fModeFlags        = fRXPat->fFlags | 0x80000000;
72     fEOLComments      = TRUE;
73 
74     fMatchOpenParen   = -1;
75     fMatchCloseParen  = -1;
76     fCaptureName      = NULL;
77     fLastSetLiteral   = U_SENTINEL;
78 
79     if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
80         status = rxp->fDeferredStatus;
81     }
82 }
83 
84 static const UChar      chAmp       = 0x26;      // '&'
85 static const UChar      chDash      = 0x2d;      // '-'
86 
87 
88 //------------------------------------------------------------------------------
89 //
90 //  Destructor
91 //
92 //------------------------------------------------------------------------------
~RegexCompile()93 RegexCompile::~RegexCompile() {
94     delete fCaptureName;         // Normally will be NULL, but can exist if pattern
95                                  //   compilation stops with a syntax error.
96 }
97 
addCategory(UnicodeSet * set,int32_t value,UErrorCode & ec)98 static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
99     set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
100 }
101 
102 //------------------------------------------------------------------------------
103 //
104 //  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
105 //                           The state tables are hand-written in the file regexcst.txt,
106 //                           and converted to the form used here by a perl
107 //                           script regexcst.pl
108 //
109 //------------------------------------------------------------------------------
compile(const UnicodeString & pat,UParseError & pp,UErrorCode & e)110 void    RegexCompile::compile(
111                          const UnicodeString &pat,   // Source pat to be compiled.
112                          UParseError &pp,            // Error position info
113                          UErrorCode &e)              // Error Code
114 {
115     fRXPat->fPatternString = new UnicodeString(pat);
116     UText patternText = UTEXT_INITIALIZER;
117     utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
118 
119     if (U_SUCCESS(e)) {
120         compile(&patternText, pp, e);
121         utext_close(&patternText);
122     }
123 }
124 
125 //
126 //   compile, UText mode
127 //     All the work is actually done here.
128 //
compile(UText * pat,UParseError & pp,UErrorCode & e)129 void    RegexCompile::compile(
130                          UText *pat,                 // Source pat to be compiled.
131                          UParseError &pp,            // Error position info
132                          UErrorCode &e)              // Error Code
133 {
134     fStatus             = &e;
135     fParseErr           = &pp;
136     fStackPtr           = 0;
137     fStack[fStackPtr]   = 0;
138 
139     if (U_FAILURE(*fStatus)) {
140         return;
141     }
142 
143     // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
144     U_ASSERT(fRXPat->fPattern == NULL || utext_nativeLength(fRXPat->fPattern) == 0);
145 
146     // Prepare the RegexPattern object to receive the compiled pattern.
147     fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, FALSE, TRUE, fStatus);
148     if (U_FAILURE(*fStatus)) {
149         return;
150     }
151     fRXPat->fStaticSets     = RegexStaticSets::gStaticSets->fPropSets;
152     fRXPat->fStaticSets8    = RegexStaticSets::gStaticSets->fPropSets8;
153 
154 
155     // Initialize the pattern scanning state machine
156     fPatternLength = utext_nativeLength(pat);
157     uint16_t                state = 1;
158     const RegexTableEl      *tableEl;
159 
160     // UREGEX_LITERAL force entire pattern to be treated as a literal string.
161     if (fModeFlags & UREGEX_LITERAL) {
162         fQuoteMode = TRUE;
163     }
164 
165     nextChar(fC);                        // Fetch the first char from the pattern string.
166 
167     //
168     // Main loop for the regex pattern parsing state machine.
169     //   Runs once per state transition.
170     //   Each time through optionally performs, depending on the state table,
171     //      - an advance to the the next pattern char
172     //      - an action to be performed.
173     //      - pushing or popping a state to/from the local state return stack.
174     //   file regexcst.txt is the source for the state table.  The logic behind
175     //     recongizing the pattern syntax is there, not here.
176     //
177     for (;;) {
178         //  Bail out if anything has gone wrong.
179         //  Regex pattern parsing stops on the first error encountered.
180         if (U_FAILURE(*fStatus)) {
181             break;
182         }
183 
184         U_ASSERT(state != 0);
185 
186         // Find the state table element that matches the input char from the pattern, or the
187         //    class of the input character.  Start with the first table row for this
188         //    state, then linearly scan forward until we find a row that matches the
189         //    character.  The last row for each state always matches all characters, so
190         //    the search will stop there, if not before.
191         //
192         tableEl = &gRuleParseStateTable[state];
193         REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
194             fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
195 
196         for (;;) {    // loop through table rows belonging to this state, looking for one
197                       //   that matches the current input char.
198             REGEX_SCAN_DEBUG_PRINTF(("."));
199             if (tableEl->fCharClass < 127 && fC.fQuoted == FALSE &&   tableEl->fCharClass == fC.fChar) {
200                 // Table row specified an individual character, not a set, and
201                 //   the input character is not quoted, and
202                 //   the input character matched it.
203                 break;
204             }
205             if (tableEl->fCharClass == 255) {
206                 // Table row specified default, match anything character class.
207                 break;
208             }
209             if (tableEl->fCharClass == 254 && fC.fQuoted)  {
210                 // Table row specified "quoted" and the char was quoted.
211                 break;
212             }
213             if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
214                 // Table row specified eof and we hit eof on the input.
215                 break;
216             }
217 
218             if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
219                 fC.fQuoted == FALSE &&                                       //   char is not escaped &&
220                 fC.fChar != (UChar32)-1) {                                   //   char is not EOF
221                 U_ASSERT(tableEl->fCharClass <= 137);
222                 if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
223                     // Table row specified a character class, or set of characters,
224                     //   and the current char matches it.
225                     break;
226                 }
227             }
228 
229             // No match on this row, advance to the next  row for this state,
230             tableEl++;
231         }
232         REGEX_SCAN_DEBUG_PRINTF(("\n"));
233 
234         //
235         // We've found the row of the state table that matches the current input
236         //   character from the rules string.
237         // Perform any action specified  by this row in the state table.
238         if (doParseActions(tableEl->fAction) == FALSE) {
239             // Break out of the state machine loop if the
240             //   the action signalled some kind of error, or
241             //   the action was to exit, occurs on normal end-of-rules-input.
242             break;
243         }
244 
245         if (tableEl->fPushState != 0) {
246             fStackPtr++;
247             if (fStackPtr >= kStackSize) {
248                 error(U_REGEX_INTERNAL_ERROR);
249                 REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
250                 fStackPtr--;
251             }
252             fStack[fStackPtr] = tableEl->fPushState;
253         }
254 
255         //
256         //  NextChar.  This is where characters are actually fetched from the pattern.
257         //             Happens under control of the 'n' tag in the state table.
258         //
259         if (tableEl->fNextChar) {
260             nextChar(fC);
261         }
262 
263         // Get the next state from the table entry, or from the
264         //   state stack if the next state was specified as "pop".
265         if (tableEl->fNextState != 255) {
266             state = tableEl->fNextState;
267         } else {
268             state = fStack[fStackPtr];
269             fStackPtr--;
270             if (fStackPtr < 0) {
271                 // state stack underflow
272                 // This will occur if the user pattern has mis-matched parentheses,
273                 //   with extra close parens.
274                 //
275                 fStackPtr++;
276                 error(U_REGEX_MISMATCHED_PAREN);
277             }
278         }
279 
280     }
281 
282     if (U_FAILURE(*fStatus)) {
283         // Bail out if the pattern had errors.
284         //   Set stack cleanup:  a successful compile would have left it empty,
285         //   but errors can leave temporary sets hanging around.
286         while (!fSetStack.empty()) {
287             delete (UnicodeSet *)fSetStack.pop();
288         }
289         return;
290     }
291 
292     //
293     // The pattern has now been read and processed, and the compiled code generated.
294     //
295 
296     //
297     // The pattern's fFrameSize so far has accumulated the requirements for
298     //   storage for capture parentheses, counters, etc. that are encountered
299     //   in the pattern.  Add space for the two variables that are always
300     //   present in the saved state:  the input string position (int64_t) and
301     //   the position in the compiled pattern.
302     //
303     allocateStackData(RESTACKFRAME_HDRCOUNT);
304 
305     //
306     // Optimization pass 1: NOPs, back-references, and case-folding
307     //
308     stripNOPs();
309 
310     //
311     // Get bounds for the minimum and maximum length of a string that this
312     //   pattern can match.  Used to avoid looking for matches in strings that
313     //   are too short.
314     //
315     fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
316 
317     //
318     // Optimization pass 2: match start type
319     //
320     matchStartType();
321 
322     //
323     // Set up fast latin-1 range sets
324     //
325     int32_t numSets = fRXPat->fSets->size();
326     fRXPat->fSets8 = new Regex8BitSet[numSets];
327     // Null pointer check.
328     if (fRXPat->fSets8 == NULL) {
329         e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
330         return;
331     }
332     int32_t i;
333     for (i=0; i<numSets; i++) {
334         UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
335         fRXPat->fSets8[i].init(s);
336     }
337 
338 }
339 
340 
341 
342 
343 
344 //------------------------------------------------------------------------------
345 //
346 //  doParseAction        Do some action during regex pattern parsing.
347 //                       Called by the parse state machine.
348 //
349 //                       Generation of the match engine PCode happens here, or
350 //                       in functions called from the parse actions defined here.
351 //
352 //
353 //------------------------------------------------------------------------------
doParseActions(int32_t action)354 UBool RegexCompile::doParseActions(int32_t action)
355 {
356     UBool   returnVal = TRUE;
357 
358     switch ((Regex_PatternParseAction)action) {
359 
360     case doPatStart:
361         // Start of pattern compiles to:
362         //0   SAVE   2        Fall back to position of FAIL
363         //1   jmp    3
364         //2   FAIL            Stop if we ever reach here.
365         //3   NOP             Dummy, so start of pattern looks the same as
366         //                    the start of an ( grouping.
367         //4   NOP             Resreved, will be replaced by a save if there are
368         //                    OR | operators at the top level
369         appendOp(URX_STATE_SAVE, 2);
370         appendOp(URX_JMP,  3);
371         appendOp(URX_FAIL, 0);
372 
373         // Standard open nonCapture paren action emits the two NOPs and
374         //   sets up the paren stack frame.
375         doParseActions(doOpenNonCaptureParen);
376         break;
377 
378     case doPatFinish:
379         // We've scanned to the end of the pattern
380         //  The end of pattern compiles to:
381         //        URX_END
382         //    which will stop the runtime match engine.
383         //  Encountering end of pattern also behaves like a close paren,
384         //   and forces fixups of the State Save at the beginning of the compiled pattern
385         //   and of any OR operations at the top level.
386         //
387         handleCloseParen();
388         if (fParenStack.size() > 0) {
389             // Missing close paren in pattern.
390             error(U_REGEX_MISMATCHED_PAREN);
391         }
392 
393         // add the END operation to the compiled pattern.
394         appendOp(URX_END, 0);
395 
396         // Terminate the pattern compilation state machine.
397         returnVal = FALSE;
398         break;
399 
400 
401 
402     case doOrOperator:
403         // Scanning a '|', as in (A|B)
404         {
405             // Generate code for any pending literals preceding the '|'
406             fixLiterals(FALSE);
407 
408             // Insert a SAVE operation at the start of the pattern section preceding
409             //   this OR at this level.  This SAVE will branch the match forward
410             //   to the right hand side of the OR in the event that the left hand
411             //   side fails to match and backtracks.  Locate the position for the
412             //   save from the location on the top of the parentheses stack.
413             int32_t savePosition = fParenStack.popi();
414             int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
415             U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
416             op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
417             fRXPat->fCompiledPat->setElementAt(op, savePosition);
418 
419             // Append an JMP operation into the compiled pattern.  The operand for
420             //  the JMP will eventually be the location following the ')' for the
421             //  group.  This will be patched in later, when the ')' is encountered.
422             appendOp(URX_JMP, 0);
423 
424             // Push the position of the newly added JMP op onto the parentheses stack.
425             // This registers if for fixup when this block's close paren is encountered.
426             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
427 
428             // Append a NOP to the compiled pattern.  This is the slot reserved
429             //   for a SAVE in the event that there is yet another '|' following
430             //   this one.
431             appendOp(URX_NOP, 0);
432             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
433         }
434         break;
435 
436 
437     case doBeginNamedCapture:
438         // Scanning (?<letter.
439         //   The first letter of the name will come through again under doConinueNamedCapture.
440         fCaptureName = new UnicodeString();
441         if (fCaptureName == NULL) {
442             error(U_MEMORY_ALLOCATION_ERROR);
443         }
444         break;
445 
446     case  doContinueNamedCapture:
447         fCaptureName->append(fC.fChar);
448         break;
449 
450     case doBadNamedCapture:
451         error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
452         break;
453 
454     case doOpenCaptureParen:
455         // Open Capturing Paren, possibly named.
456         //   Compile to a
457         //      - NOP, which later may be replaced by a save-state if the
458         //         parenthesized group gets a * quantifier, followed by
459         //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
460         //      - NOP, which may later be replaced by a save-state if there
461         //             is an '|' alternation within the parens.
462         //
463         //    Each capture group gets three slots in the save stack frame:
464         //         0: Capture Group start position (in input string being matched.)
465         //         1: Capture Group end position.
466         //         2: Start of Match-in-progress.
467         //    The first two locations are for a completed capture group, and are
468         //     referred to by back references and the like.
469         //    The third location stores the capture start position when an START_CAPTURE is
470         //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
471         //      END_CAPTURE is encountered.
472         {
473             fixLiterals();
474             appendOp(URX_NOP, 0);
475             int32_t  varsLoc = allocateStackData(3);    // Reserve three slots in match stack frame.
476             appendOp(URX_START_CAPTURE, varsLoc);
477             appendOp(URX_NOP, 0);
478 
479             // On the Parentheses stack, start a new frame and add the postions
480             //   of the two NOPs.  Depending on what follows in the pattern, the
481             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
482             //   address of the end of the parenthesized group.
483             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
484             fParenStack.push(capturing, *fStatus);                        // Frame type.
485             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
486             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
487 
488             // Save the mapping from group number to stack frame variable position.
489             fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
490 
491             // If this is a named capture group, add the name->group number mapping.
492             if (fCaptureName != NULL) {
493                 int32_t groupNumber = fRXPat->fGroupMap->size();
494                 int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
495                 fCaptureName = NULL;    // hash table takes ownership of the name (key) string.
496                 if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
497                     error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
498                 }
499             }
500         }
501         break;
502 
503     case doOpenNonCaptureParen:
504         // Open non-caputuring (grouping only) Paren.
505         //   Compile to a
506         //      - NOP, which later may be replaced by a save-state if the
507         //         parenthesized group gets a * quantifier, followed by
508         //      - NOP, which may later be replaced by a save-state if there
509         //             is an '|' alternation within the parens.
510         {
511             fixLiterals();
512             appendOp(URX_NOP, 0);
513             appendOp(URX_NOP, 0);
514 
515             // On the Parentheses stack, start a new frame and add the postions
516             //   of the two NOPs.
517             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
518             fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
519             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
520             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
521         }
522          break;
523 
524 
525     case doOpenAtomicParen:
526         // Open Atomic Paren.  (?>
527         //   Compile to a
528         //      - NOP, which later may be replaced if the parenthesized group
529         //         has a quantifier, followed by
530         //      - STO_SP  save state stack position, so it can be restored at the ")"
531         //      - NOP, which may later be replaced by a save-state if there
532         //             is an '|' alternation within the parens.
533         {
534             fixLiterals();
535             appendOp(URX_NOP, 0);
536             int32_t  varLoc = allocateData(1);    // Reserve a data location for saving the state stack ptr.
537             appendOp(URX_STO_SP, varLoc);
538             appendOp(URX_NOP, 0);
539 
540             // On the Parentheses stack, start a new frame and add the postions
541             //   of the two NOPs.  Depending on what follows in the pattern, the
542             //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
543             //   address of the end of the parenthesized group.
544             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
545             fParenStack.push(atomic, *fStatus);                           // Frame type.
546             fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
547             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
548         }
549         break;
550 
551 
552     case doOpenLookAhead:
553         // Positive Look-ahead   (?=  stuff  )
554         //
555         //   Note:   Addition of transparent input regions, with the need to
556         //           restore the original regions when failing out of a lookahead
557         //           block, complicated this sequence.  Some conbined opcodes
558         //           might make sense - or might not, lookahead aren't that common.
559         //
560         //      Caution:  min match length optimization knows about this
561         //               sequence; don't change without making updates there too.
562         //
563         // Compiles to
564         //    1    START_LA     dataLoc     Saves SP, Input Pos
565         //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
566         //    3    JMP          6           continue ...
567         //
568         //    4.   LA_END                   Look Ahead failed.  Restore regions.
569         //    5.   BACKTRACK                and back track again.
570         //
571         //    6.   NOP              reserved for use by quantifiers on the block.
572         //                          Look-ahead can't have quantifiers, but paren stack
573         //                             compile time conventions require the slot anyhow.
574         //    7.   NOP              may be replaced if there is are '|' ops in the block.
575         //    8.     code for parenthesized stuff.
576         //    9.   LA_END
577         //
578         //  Two data slots are reserved, for saving the stack ptr and the input position.
579         {
580             fixLiterals();
581             int32_t dataLoc = allocateData(2);
582             appendOp(URX_LA_START, dataLoc);
583             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
584             appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
585             appendOp(URX_LA_END, dataLoc);
586             appendOp(URX_BACKTRACK, 0);
587             appendOp(URX_NOP, 0);
588             appendOp(URX_NOP, 0);
589 
590             // On the Parentheses stack, start a new frame and add the postions
591             //   of the NOPs.
592             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
593             fParenStack.push(lookAhead, *fStatus);                        // Frame type.
594             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
595             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
596         }
597         break;
598 
599     case doOpenLookAheadNeg:
600         // Negated Lookahead.   (?! stuff )
601         // Compiles to
602         //    1.    START_LA    dataloc
603         //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
604         //                                //   which continues with the match.
605         //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
606         //    4.       code for parenthesized stuff.
607         //    5.    END_LA                // Cut back stack, remove saved state from step 2.
608         //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
609         //    7.    END_LA                // Restore match region, in case look-ahead was using
610         //                                        an alternate (transparent) region.
611         {
612             fixLiterals();
613             int32_t dataLoc = allocateData(2);
614             appendOp(URX_LA_START, dataLoc);
615             appendOp(URX_STATE_SAVE, 0);    // dest address will be patched later.
616             appendOp(URX_NOP, 0);
617 
618             // On the Parentheses stack, start a new frame and add the postions
619             //   of the StateSave and NOP.
620             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
621             fParenStack.push(negLookAhead, *fStatus);                    // Frame type
622             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
623             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
624 
625             // Instructions #5 - #7 will be added when the ')' is encountered.
626         }
627         break;
628 
629     case doOpenLookBehind:
630         {
631             //   Compile a (?<= look-behind open paren.
632             //
633             //          Compiles to
634             //              0       URX_LB_START     dataLoc
635             //              1       URX_LB_CONT      dataLoc
636             //              2                        MinMatchLen
637             //              3                        MaxMatchLen
638             //              4       URX_NOP          Standard '(' boilerplate.
639             //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
640             //              6         <code for LookBehind expression>
641             //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
642             //              8       URX_LA_END       dataLoc    # Restore stack, input pos
643             //
644             //          Allocate a block of matcher data, to contain (when running a match)
645             //              0:    Stack ptr on entry
646             //              1:    Input Index on entry
647             //              2:    Start index of match current match attempt.
648             //              3:    Original Input String len.
649 
650             // Generate match code for any pending literals.
651             fixLiterals();
652 
653             // Allocate data space
654             int32_t dataLoc = allocateData(4);
655 
656             // Emit URX_LB_START
657             appendOp(URX_LB_START, dataLoc);
658 
659             // Emit URX_LB_CONT
660             appendOp(URX_LB_CONT, dataLoc);
661             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
662             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
663 
664             // Emit the NOPs
665             appendOp(URX_NOP, 0);
666             appendOp(URX_NOP, 0);
667 
668             // On the Parentheses stack, start a new frame and add the postions
669             //   of the URX_LB_CONT and the NOP.
670             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
671             fParenStack.push(lookBehind, *fStatus);                       // Frame type
672             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
673             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
674 
675             // The final two instructions will be added when the ')' is encountered.
676         }
677 
678         break;
679 
680     case doOpenLookBehindNeg:
681         {
682             //   Compile a (?<! negated look-behind open paren.
683             //
684             //          Compiles to
685             //              0       URX_LB_START     dataLoc    # Save entry stack, input len
686             //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
687             //              2                        MinMatchLen
688             //              3                        MaxMatchLen
689             //              4                        continueLoc (9)
690             //              5       URX_NOP          Standard '(' boilerplate.
691             //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
692             //              7         <code for LookBehind expression>
693             //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
694             //              9       ...
695             //
696             //          Allocate a block of matcher data, to contain (when running a match)
697             //              0:    Stack ptr on entry
698             //              1:    Input Index on entry
699             //              2:    Start index of match current match attempt.
700             //              3:    Original Input String len.
701 
702             // Generate match code for any pending literals.
703             fixLiterals();
704 
705             // Allocate data space
706             int32_t dataLoc = allocateData(4);
707 
708             // Emit URX_LB_START
709             appendOp(URX_LB_START, dataLoc);
710 
711             // Emit URX_LBN_CONT
712             appendOp(URX_LBN_CONT, dataLoc);
713             appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
714             appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
715             appendOp(URX_RESERVED_OP, 0);    // Continue Loc.    To be filled later.
716 
717             // Emit the NOPs
718             appendOp(URX_NOP, 0);
719             appendOp(URX_NOP, 0);
720 
721             // On the Parentheses stack, start a new frame and add the postions
722             //   of the URX_LB_CONT and the NOP.
723             fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
724             fParenStack.push(lookBehindN, *fStatus);                      // Frame type
725             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
726             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
727 
728             // The final two instructions will be added when the ')' is encountered.
729         }
730         break;
731 
732     case doConditionalExpr:
733         // Conditionals such as (?(1)a:b)
734     case doPerlInline:
735         // Perl inline-condtionals.  (?{perl code}a|b) We're not perl, no way to do them.
736         error(U_REGEX_UNIMPLEMENTED);
737         break;
738 
739 
740     case doCloseParen:
741         handleCloseParen();
742         if (fParenStack.size() <= 0) {
743             //  Extra close paren, or missing open paren.
744             error(U_REGEX_MISMATCHED_PAREN);
745         }
746         break;
747 
748     case doNOP:
749         break;
750 
751 
752     case doBadOpenParenType:
753     case doRuleError:
754         error(U_REGEX_RULE_SYNTAX);
755         break;
756 
757 
758     case doMismatchedParenErr:
759         error(U_REGEX_MISMATCHED_PAREN);
760         break;
761 
762     case doPlus:
763         //  Normal '+'  compiles to
764         //     1.   stuff to be repeated  (already built)
765         //     2.   jmp-sav 1
766         //     3.   ...
767         //
768         //  Or, if the item to be repeated can match a zero length string,
769         //     1.   STO_INP_LOC  data-loc
770         //     2.      body of stuff to be repeated
771         //     3.   JMP_SAV_X    2
772         //     4.   ...
773 
774         //
775         //  Or, if the item to be repeated is simple
776         //     1.   Item to be repeated.
777         //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
778         //     3.   LOOP_C       stack location
779         {
780             int32_t  topLoc = blockTopLoc(FALSE);        // location of item #1
781             int32_t  frameLoc;
782 
783             // Check for simple constructs, which may get special optimized code.
784             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
785                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
786 
787                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
788                     // Emit optimized code for [char set]+
789                     appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
790                     frameLoc = allocateStackData(1);
791                     appendOp(URX_LOOP_C, frameLoc);
792                     break;
793                 }
794 
795                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
796                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
797                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
798                     // Emit Optimized code for .+ operations.
799                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
800                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
801                         // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
802                         loopOpI |= 1;
803                     }
804                     if (fModeFlags & UREGEX_UNIX_LINES) {
805                         loopOpI |= 2;
806                     }
807                     appendOp(loopOpI);
808                     frameLoc = allocateStackData(1);
809                     appendOp(URX_LOOP_C, frameLoc);
810                     break;
811                 }
812 
813             }
814 
815             // General case.
816 
817             // Check for minimum match length of zero, which requires
818             //    extra loop-breaking code.
819             if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
820                 // Zero length match is possible.
821                 // Emit the code sequence that can handle it.
822                 insertOp(topLoc);
823                 frameLoc = allocateStackData(1);
824 
825                 int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
826                 fRXPat->fCompiledPat->setElementAt(op, topLoc);
827 
828                 appendOp(URX_JMP_SAV_X, topLoc+1);
829             } else {
830                 // Simpler code when the repeated body must match something non-empty
831                 appendOp(URX_JMP_SAV, topLoc);
832             }
833         }
834         break;
835 
836     case doNGPlus:
837         //  Non-greedy '+?'  compiles to
838         //     1.   stuff to be repeated  (already built)
839         //     2.   state-save  1
840         //     3.   ...
841         {
842             int32_t topLoc      = blockTopLoc(FALSE);
843             appendOp(URX_STATE_SAVE, topLoc);
844         }
845         break;
846 
847 
848     case doOpt:
849         // Normal (greedy) ? quantifier.
850         //  Compiles to
851         //     1. state save 3
852         //     2.    body of optional block
853         //     3. ...
854         // Insert the state save into the compiled pattern, and we're done.
855         {
856             int32_t   saveStateLoc = blockTopLoc(TRUE);
857             int32_t   saveStateOp  = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
858             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
859         }
860         break;
861 
862     case doNGOpt:
863         // Non-greedy ?? quantifier
864         //   compiles to
865         //    1.  jmp   4
866         //    2.     body of optional block
867         //    3   jmp   5
868         //    4.  state save 2
869         //    5    ...
870         //  This code is less than ideal, with two jmps instead of one, because we can only
871         //  insert one instruction at the top of the block being iterated.
872         {
873             int32_t  jmp1_loc = blockTopLoc(TRUE);
874             int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
875 
876             int32_t  jmp1_op  = buildOp(URX_JMP, jmp2_loc+1);
877             fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
878 
879             appendOp(URX_JMP, jmp2_loc+2);
880 
881             appendOp(URX_STATE_SAVE, jmp1_loc+1);
882         }
883         break;
884 
885 
886     case doStar:
887         // Normal (greedy) * quantifier.
888         // Compiles to
889         //       1.   STATE_SAVE   4
890         //       2.      body of stuff being iterated over
891         //       3.   JMP_SAV      2
892         //       4.   ...
893         //
894         // Or, if the body is a simple [Set],
895         //       1.   LOOP_SR_I    set number
896         //       2.   LOOP_C       stack location
897         //       ...
898         //
899         // Or if this is a .*
900         //       1.   LOOP_DOT_I    (. matches all mode flag)
901         //       2.   LOOP_C        stack location
902         //
903         // Or, if the body can match a zero-length string, to inhibit infinite loops,
904         //       1.   STATE_SAVE   5
905         //       2.   STO_INP_LOC  data-loc
906         //       3.      body of stuff
907         //       4.   JMP_SAV_X    2
908         //       5.   ...
909         {
910             // location of item #1, the STATE_SAVE
911             int32_t   topLoc = blockTopLoc(FALSE);
912             int32_t   dataLoc = -1;
913 
914             // Check for simple *, where the construct being repeated
915             //   compiled to single opcode, and might be optimizable.
916             if (topLoc == fRXPat->fCompiledPat->size() - 1) {
917                 int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
918 
919                 if (URX_TYPE(repeatedOp) == URX_SETREF) {
920                     // Emit optimized code for a [char set]*
921                     int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
922                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
923                     dataLoc = allocateStackData(1);
924                     appendOp(URX_LOOP_C, dataLoc);
925                     break;
926                 }
927 
928                 if (URX_TYPE(repeatedOp) == URX_DOTANY ||
929                     URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
930                     URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
931                     // Emit Optimized code for .* operations.
932                     int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
933                     if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
934                         // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
935                         loopOpI |= 1;
936                     }
937                     if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
938                         loopOpI |= 2;
939                     }
940                     fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
941                     dataLoc = allocateStackData(1);
942                     appendOp(URX_LOOP_C, dataLoc);
943                     break;
944                 }
945             }
946 
947             // Emit general case code for this *
948             // The optimizations did not apply.
949 
950             int32_t   saveStateLoc = blockTopLoc(TRUE);
951             int32_t   jmpOp        = buildOp(URX_JMP_SAV, saveStateLoc+1);
952 
953             // Check for minimum match length of zero, which requires
954             //    extra loop-breaking code.
955             if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
956                 insertOp(saveStateLoc);
957                 dataLoc = allocateStackData(1);
958 
959                 int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
960                 fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
961                 jmpOp      = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
962             }
963 
964             // Locate the position in the compiled pattern where the match will continue
965             //   after completing the *.   (4 or 5 in the comment above)
966             int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
967 
968             // Put together the save state op and store it into the compiled code.
969             int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
970             fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
971 
972             // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
973             appendOp(jmpOp);
974         }
975         break;
976 
977     case doNGStar:
978         // Non-greedy *? quantifier
979         // compiles to
980         //     1.   JMP    3
981         //     2.      body of stuff being iterated over
982         //     3.   STATE_SAVE  2
983         //     4    ...
984         {
985             int32_t     jmpLoc  = blockTopLoc(TRUE);                   // loc  1.
986             int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
987             int32_t     jmpOp   = buildOp(URX_JMP, saveLoc);
988             fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
989             appendOp(URX_STATE_SAVE, jmpLoc+1);
990         }
991         break;
992 
993 
994     case doIntervalInit:
995         // The '{' opening an interval quantifier was just scanned.
996         // Init the counter varaiables that will accumulate the values as the digits
997         //    are scanned.
998         fIntervalLow = 0;
999         fIntervalUpper = -1;
1000         break;
1001 
1002     case doIntevalLowerDigit:
1003         // Scanned a digit from the lower value of an {lower,upper} interval
1004         {
1005             int32_t digitValue = u_charDigitValue(fC.fChar);
1006             U_ASSERT(digitValue >= 0);
1007             int64_t val = (int64_t)fIntervalLow*10 + digitValue;
1008             if (val > INT32_MAX) {
1009                 error(U_REGEX_NUMBER_TOO_BIG);
1010             } else {
1011                 fIntervalLow = (int32_t)val;
1012             }
1013         }
1014         break;
1015 
1016     case doIntervalUpperDigit:
1017         // Scanned a digit from the upper value of an {lower,upper} interval
1018         {
1019             if (fIntervalUpper < 0) {
1020                 fIntervalUpper = 0;
1021             }
1022             int32_t digitValue = u_charDigitValue(fC.fChar);
1023             U_ASSERT(digitValue >= 0);
1024             int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
1025             if (val > INT32_MAX) {
1026                 error(U_REGEX_NUMBER_TOO_BIG);
1027             } else {
1028                 fIntervalUpper = (int32_t)val;
1029             }
1030         }
1031         break;
1032 
1033     case doIntervalSame:
1034         // Scanned a single value interval like {27}.  Upper = Lower.
1035         fIntervalUpper = fIntervalLow;
1036         break;
1037 
1038     case doInterval:
1039         // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
1040         if (compileInlineInterval() == FALSE) {
1041             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1042         }
1043         break;
1044 
1045     case doPossessiveInterval:
1046         // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
1047         {
1048             // Remember the loc for the top of the block being looped over.
1049             //   (Can not reserve a slot in the compiled pattern at this time, because
1050             //    compileInterval needs to reserve also, and blockTopLoc can only reserve
1051             //    once per block.)
1052             int32_t topLoc = blockTopLoc(FALSE);
1053 
1054             // Produce normal looping code.
1055             compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1056 
1057             // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1058             //  just as if the loop was inclosed in atomic parentheses.
1059 
1060             // First the STO_SP before the start of the loop
1061             insertOp(topLoc);
1062 
1063             int32_t  varLoc = allocateData(1);   // Reserve a data location for saving the
1064             int32_t  op     = buildOp(URX_STO_SP, varLoc);
1065             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1066 
1067             int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1068             U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1069             loopOp++;     // point LoopOp after the just-inserted STO_SP
1070             fRXPat->fCompiledPat->push(loopOp, *fStatus);
1071 
1072             // Then the LD_SP after the end of the loop
1073             appendOp(URX_LD_SP, varLoc);
1074         }
1075 
1076         break;
1077 
1078     case doNGInterval:
1079         // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
1080         compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1081         break;
1082 
1083     case doIntervalError:
1084         error(U_REGEX_BAD_INTERVAL);
1085         break;
1086 
1087     case doLiteralChar:
1088         // We've just scanned a "normal" character from the pattern,
1089         literalChar(fC.fChar);
1090         break;
1091 
1092 
1093     case doEscapedLiteralChar:
1094         // We've just scanned an backslashed escaped character with  no
1095         //   special meaning.  It represents itself.
1096         if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1097             ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1098             (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1099                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1100              }
1101         literalChar(fC.fChar);
1102         break;
1103 
1104 
1105     case doDotAny:
1106         // scanned a ".",  match any single character.
1107         {
1108             fixLiterals(FALSE);
1109             if (fModeFlags & UREGEX_DOTALL) {
1110                 appendOp(URX_DOTANY_ALL, 0);
1111             } else if (fModeFlags & UREGEX_UNIX_LINES) {
1112                 appendOp(URX_DOTANY_UNIX, 0);
1113             } else {
1114                 appendOp(URX_DOTANY, 0);
1115             }
1116         }
1117         break;
1118 
1119     case doCaret:
1120         {
1121             fixLiterals(FALSE);
1122             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1123                 appendOp(URX_CARET, 0);
1124             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1125                 appendOp(URX_CARET_M, 0);
1126             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1127                 appendOp(URX_CARET, 0);   // Only testing true start of input.
1128             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1129                 appendOp(URX_CARET_M_UNIX, 0);
1130             }
1131         }
1132         break;
1133 
1134     case doDollar:
1135         {
1136             fixLiterals(FALSE);
1137             if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1138                 appendOp(URX_DOLLAR, 0);
1139             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1140                 appendOp(URX_DOLLAR_M, 0);
1141             } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1142                 appendOp(URX_DOLLAR_D, 0);
1143             } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1144                 appendOp(URX_DOLLAR_MD, 0);
1145             }
1146         }
1147         break;
1148 
1149     case doBackslashA:
1150         fixLiterals(FALSE);
1151         appendOp(URX_CARET, 0);
1152         break;
1153 
1154     case doBackslashB:
1155         {
1156             #if  UCONFIG_NO_BREAK_ITERATION==1
1157             if (fModeFlags & UREGEX_UWORD) {
1158                 error(U_UNSUPPORTED_ERROR);
1159             }
1160             #endif
1161             fixLiterals(FALSE);
1162             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1163             appendOp(op, 1);
1164         }
1165         break;
1166 
1167     case doBackslashb:
1168         {
1169             #if  UCONFIG_NO_BREAK_ITERATION==1
1170             if (fModeFlags & UREGEX_UWORD) {
1171                 error(U_UNSUPPORTED_ERROR);
1172             }
1173             #endif
1174             fixLiterals(FALSE);
1175             int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1176             appendOp(op, 0);
1177         }
1178         break;
1179 
1180     case doBackslashD:
1181         fixLiterals(FALSE);
1182         appendOp(URX_BACKSLASH_D, 1);
1183         break;
1184 
1185     case doBackslashd:
1186         fixLiterals(FALSE);
1187         appendOp(URX_BACKSLASH_D, 0);
1188         break;
1189 
1190     case doBackslashG:
1191         fixLiterals(FALSE);
1192         appendOp(URX_BACKSLASH_G, 0);
1193         break;
1194 
1195     case doBackslashH:
1196         fixLiterals(FALSE);
1197         appendOp(URX_BACKSLASH_H, 1);
1198         break;
1199 
1200     case doBackslashh:
1201         fixLiterals(FALSE);
1202         appendOp(URX_BACKSLASH_H, 0);
1203         break;
1204 
1205     case doBackslashR:
1206         fixLiterals(FALSE);
1207         appendOp(URX_BACKSLASH_R, 0);
1208         break;
1209 
1210     case doBackslashS:
1211         fixLiterals(FALSE);
1212         appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1213         break;
1214 
1215     case doBackslashs:
1216         fixLiterals(FALSE);
1217         appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1218         break;
1219 
1220     case doBackslashV:
1221         fixLiterals(FALSE);
1222         appendOp(URX_BACKSLASH_V, 1);
1223         break;
1224 
1225     case doBackslashv:
1226         fixLiterals(FALSE);
1227         appendOp(URX_BACKSLASH_V, 0);
1228         break;
1229 
1230     case doBackslashW:
1231         fixLiterals(FALSE);
1232         appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1233         break;
1234 
1235     case doBackslashw:
1236         fixLiterals(FALSE);
1237         appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1238         break;
1239 
1240     case doBackslashX:
1241         fixLiterals(FALSE);
1242         appendOp(URX_BACKSLASH_X, 0);
1243         break;
1244 
1245 
1246     case doBackslashZ:
1247         fixLiterals(FALSE);
1248         appendOp(URX_DOLLAR, 0);
1249         break;
1250 
1251     case doBackslashz:
1252         fixLiterals(FALSE);
1253         appendOp(URX_BACKSLASH_Z, 0);
1254         break;
1255 
1256     case doEscapeError:
1257         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1258         break;
1259 
1260     case doExit:
1261         fixLiterals(FALSE);
1262         returnVal = FALSE;
1263         break;
1264 
1265     case doProperty:
1266         {
1267             fixLiterals(FALSE);
1268             UnicodeSet *theSet = scanProp();
1269             compileSet(theSet);
1270         }
1271         break;
1272 
1273     case doNamedChar:
1274         {
1275             UChar32 c = scanNamedChar();
1276             literalChar(c);
1277         }
1278         break;
1279 
1280 
1281     case doBackRef:
1282         // BackReference.  Somewhat unusual in that the front-end can not completely parse
1283         //                 the regular expression, because the number of digits to be consumed
1284         //                 depends on the number of capture groups that have been defined.  So
1285         //                 we have to do it here instead.
1286         {
1287             int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
1288             int32_t  groupNum = 0;
1289             UChar32  c        = fC.fChar;
1290 
1291             for (;;) {
1292                 // Loop once per digit, for max allowed number of digits in a back reference.
1293                 int32_t digit = u_charDigitValue(c);
1294                 groupNum = groupNum * 10 + digit;
1295                 if (groupNum >= numCaptureGroups) {
1296                     break;
1297                 }
1298                 c = peekCharLL();
1299                 if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == FALSE) {
1300                     break;
1301                 }
1302                 nextCharLL();
1303             }
1304 
1305             // Scan of the back reference in the source regexp is complete.  Now generate
1306             //  the compiled code for it.
1307             // Because capture groups can be forward-referenced by back-references,
1308             //  we fill the operand with the capture group number.  At the end
1309             //  of compilation, it will be changed to the variable's location.
1310             U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
1311                                      //    and shouldn't enter this code path at all.
1312             fixLiterals(FALSE);
1313             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1314                 appendOp(URX_BACKREF_I, groupNum);
1315             } else {
1316                 appendOp(URX_BACKREF, groupNum);
1317             }
1318         }
1319         break;
1320 
1321     case doBeginNamedBackRef:
1322         U_ASSERT(fCaptureName == NULL);
1323         fCaptureName = new UnicodeString;
1324         if (fCaptureName == NULL) {
1325             error(U_MEMORY_ALLOCATION_ERROR);
1326         }
1327         break;
1328 
1329     case doContinueNamedBackRef:
1330         fCaptureName->append(fC.fChar);
1331         break;
1332 
1333     case doCompleteNamedBackRef:
1334         {
1335         int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
1336         if (groupNumber == 0) {
1337             // Group name has not been defined.
1338             //   Could be a forward reference. If we choose to support them at some
1339             //   future time, extra mechanism will be required at this point.
1340             error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1341         } else {
1342             // Given the number, handle identically to a \n numbered back reference.
1343             // See comments above, under doBackRef
1344             fixLiterals(FALSE);
1345             if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1346                 appendOp(URX_BACKREF_I, groupNumber);
1347             } else {
1348                 appendOp(URX_BACKREF, groupNumber);
1349             }
1350         }
1351         delete fCaptureName;
1352         fCaptureName = NULL;
1353         break;
1354         }
1355 
1356     case doPossessivePlus:
1357         // Possessive ++ quantifier.
1358         // Compiles to
1359         //       1.   STO_SP
1360         //       2.      body of stuff being iterated over
1361         //       3.   STATE_SAVE 5
1362         //       4.   JMP        2
1363         //       5.   LD_SP
1364         //       6.   ...
1365         //
1366         //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
1367         //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
1368         //
1369         {
1370             // Emit the STO_SP
1371             int32_t   topLoc = blockTopLoc(TRUE);
1372             int32_t   stoLoc = allocateData(1);  // Reserve the data location for storing save stack ptr.
1373             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1374             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1375 
1376             // Emit the STATE_SAVE
1377             appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1378 
1379             // Emit the JMP
1380             appendOp(URX_JMP, topLoc+1);
1381 
1382             // Emit the LD_SP
1383             appendOp(URX_LD_SP, stoLoc);
1384         }
1385         break;
1386 
1387     case doPossessiveStar:
1388         // Possessive *+ quantifier.
1389         // Compiles to
1390         //       1.   STO_SP       loc
1391         //       2.   STATE_SAVE   5
1392         //       3.      body of stuff being iterated over
1393         //       4.   JMP          2
1394         //       5.   LD_SP        loc
1395         //       6    ...
1396         // TODO:  do something to cut back the state stack each time through the loop.
1397         {
1398             // Reserve two slots at the top of the block.
1399             int32_t   topLoc = blockTopLoc(TRUE);
1400             insertOp(topLoc);
1401 
1402             // emit   STO_SP     loc
1403             int32_t   stoLoc = allocateData(1);    // Reserve the data location for storing save stack ptr.
1404             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1405             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1406 
1407             // Emit the SAVE_STATE   5
1408             int32_t L7 = fRXPat->fCompiledPat->size()+1;
1409             op = buildOp(URX_STATE_SAVE, L7);
1410             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1411 
1412             // Append the JMP operation.
1413             appendOp(URX_JMP, topLoc+1);
1414 
1415             // Emit the LD_SP       loc
1416             appendOp(URX_LD_SP, stoLoc);
1417         }
1418         break;
1419 
1420     case doPossessiveOpt:
1421         // Possessive  ?+ quantifier.
1422         //  Compiles to
1423         //     1. STO_SP      loc
1424         //     2. SAVE_STATE  5
1425         //     3.    body of optional block
1426         //     4. LD_SP       loc
1427         //     5. ...
1428         //
1429         {
1430             // Reserve two slots at the top of the block.
1431             int32_t   topLoc = blockTopLoc(TRUE);
1432             insertOp(topLoc);
1433 
1434             // Emit the STO_SP
1435             int32_t   stoLoc = allocateData(1);   // Reserve the data location for storing save stack ptr.
1436             int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1437             fRXPat->fCompiledPat->setElementAt(op, topLoc);
1438 
1439             // Emit the SAVE_STATE
1440             int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
1441             op = buildOp(URX_STATE_SAVE, continueLoc);
1442             fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1443 
1444             // Emit the LD_SP
1445             appendOp(URX_LD_SP, stoLoc);
1446         }
1447         break;
1448 
1449 
1450     case doBeginMatchMode:
1451         fNewModeFlags = fModeFlags;
1452         fSetModeFlag  = TRUE;
1453         break;
1454 
1455     case doMatchMode:   //  (?i)    and similar
1456         {
1457             int32_t  bit = 0;
1458             switch (fC.fChar) {
1459             case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
1460             case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
1461             case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
1462             case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
1463             case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
1464             case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
1465             case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
1466             case 0x2d: /* '-' */   fSetModeFlag = FALSE;          break;
1467             default:
1468                 U_ASSERT(FALSE);   // Should never happen.  Other chars are filtered out
1469                                    // by the scanner.
1470             }
1471             if (fSetModeFlag) {
1472                 fNewModeFlags |= bit;
1473             } else {
1474                 fNewModeFlags &= ~bit;
1475             }
1476         }
1477         break;
1478 
1479     case doSetMatchMode:
1480         // Emit code to match any pending literals, using the not-yet changed match mode.
1481         fixLiterals();
1482 
1483         // We've got a (?i) or similar.  The match mode is being changed, but
1484         //   the change is not scoped to a parenthesized block.
1485         U_ASSERT(fNewModeFlags < 0);
1486         fModeFlags = fNewModeFlags;
1487 
1488         break;
1489 
1490 
1491     case doMatchModeParen:
1492         // We've got a (?i: or similar.  Begin a parenthesized block, save old
1493         //   mode flags so they can be restored at the close of the block.
1494         //
1495         //   Compile to a
1496         //      - NOP, which later may be replaced by a save-state if the
1497         //         parenthesized group gets a * quantifier, followed by
1498         //      - NOP, which may later be replaced by a save-state if there
1499         //             is an '|' alternation within the parens.
1500         {
1501             fixLiterals(FALSE);
1502             appendOp(URX_NOP, 0);
1503             appendOp(URX_NOP, 0);
1504 
1505             // On the Parentheses stack, start a new frame and add the postions
1506             //   of the two NOPs (a normal non-capturing () frame, except for the
1507             //   saving of the orignal mode flags.)
1508             fParenStack.push(fModeFlags, *fStatus);
1509             fParenStack.push(flags, *fStatus);                            // Frame Marker
1510             fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
1511             fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
1512 
1513             // Set the current mode flags to the new values.
1514             U_ASSERT(fNewModeFlags < 0);
1515             fModeFlags = fNewModeFlags;
1516         }
1517         break;
1518 
1519     case doBadModeFlag:
1520         error(U_REGEX_INVALID_FLAG);
1521         break;
1522 
1523     case doSuppressComments:
1524         // We have just scanned a '(?'.  We now need to prevent the character scanner from
1525         // treating a '#' as a to-the-end-of-line comment.
1526         //   (This Perl compatibility just gets uglier and uglier to do...)
1527         fEOLComments = FALSE;
1528         break;
1529 
1530 
1531     case doSetAddAmp:
1532         {
1533           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1534           set->add(chAmp);
1535         }
1536         break;
1537 
1538     case doSetAddDash:
1539         {
1540           UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1541           set->add(chDash);
1542         }
1543         break;
1544 
1545      case doSetBackslash_s:
1546         {
1547          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1548          set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1549          break;
1550         }
1551 
1552      case doSetBackslash_S:
1553         {
1554             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1555             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1556             SSet.complement();
1557             set->addAll(SSet);
1558             break;
1559         }
1560 
1561     case doSetBackslash_d:
1562         {
1563             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1564             // TODO - make a static set, ticket 6058.
1565             addCategory(set, U_GC_ND_MASK, *fStatus);
1566             break;
1567         }
1568 
1569     case doSetBackslash_D:
1570         {
1571             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1572             UnicodeSet digits;
1573             // TODO - make a static set, ticket 6058.
1574             digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1575             digits.complement();
1576             set->addAll(digits);
1577             break;
1578         }
1579 
1580     case doSetBackslash_h:
1581         {
1582             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1583             UnicodeSet h;
1584             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1585             h.add((UChar32)9);   // Tab
1586             set->addAll(h);
1587             break;
1588         }
1589 
1590     case doSetBackslash_H:
1591         {
1592             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1593             UnicodeSet h;
1594             h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1595             h.add((UChar32)9);   // Tab
1596             h.complement();
1597             set->addAll(h);
1598             break;
1599         }
1600 
1601     case doSetBackslash_v:
1602         {
1603             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1604             set->add((UChar32)0x0a, (UChar32)0x0d);  // add range
1605             set->add((UChar32)0x85);
1606             set->add((UChar32)0x2028, (UChar32)0x2029);
1607             break;
1608         }
1609 
1610     case doSetBackslash_V:
1611         {
1612             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1613             UnicodeSet v;
1614             v.add((UChar32)0x0a, (UChar32)0x0d);  // add range
1615             v.add((UChar32)0x85);
1616             v.add((UChar32)0x2028, (UChar32)0x2029);
1617             v.complement();
1618             set->addAll(v);
1619             break;
1620         }
1621 
1622     case doSetBackslash_w:
1623         {
1624             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1625             set->addAll(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1626             break;
1627         }
1628 
1629     case doSetBackslash_W:
1630         {
1631             UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1632             UnicodeSet SSet(*RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1633             SSet.complement();
1634             set->addAll(SSet);
1635             break;
1636         }
1637 
1638     case doSetBegin:
1639         fixLiterals(FALSE);
1640         fSetStack.push(new UnicodeSet(), *fStatus);
1641         fSetOpStack.push(setStart, *fStatus);
1642         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1643             fSetOpStack.push(setCaseClose, *fStatus);
1644         }
1645         break;
1646 
1647     case doSetBeginDifference1:
1648         //  We have scanned something like [[abc]-[
1649         //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
1650         //  Push a Difference operator, which will cause the new set to be subtracted from what
1651         //    went before once it is created.
1652         setPushOp(setDifference1);
1653         fSetOpStack.push(setStart, *fStatus);
1654         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1655             fSetOpStack.push(setCaseClose, *fStatus);
1656         }
1657         break;
1658 
1659     case doSetBeginIntersection1:
1660         //  We have scanned something like  [[abc]&[
1661         //   Need both the '&' operator and the open '[' operator.
1662         setPushOp(setIntersection1);
1663         fSetOpStack.push(setStart, *fStatus);
1664         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1665             fSetOpStack.push(setCaseClose, *fStatus);
1666         }
1667         break;
1668 
1669     case doSetBeginUnion:
1670         //  We have scanned something like  [[abc][
1671         //     Need to handle the union operation explicitly [[abc] | [
1672         setPushOp(setUnion);
1673         fSetOpStack.push(setStart, *fStatus);
1674         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1675             fSetOpStack.push(setCaseClose, *fStatus);
1676         }
1677         break;
1678 
1679     case doSetDifference2:
1680         // We have scanned something like [abc--
1681         //   Consider this to unambiguously be a set difference operator.
1682         setPushOp(setDifference2);
1683         break;
1684 
1685     case doSetEnd:
1686         // Have encountered the ']' that closes a set.
1687         //    Force the evaluation of any pending operations within this set,
1688         //    leave the completed set on the top of the set stack.
1689         setEval(setEnd);
1690         U_ASSERT(fSetOpStack.peeki()==setStart);
1691         fSetOpStack.popi();
1692         break;
1693 
1694     case doSetFinish:
1695         {
1696         // Finished a complete set expression, including all nested sets.
1697         //   The close bracket has already triggered clearing out pending set operators,
1698         //    the operator stack should be empty and the operand stack should have just
1699         //    one entry, the result set.
1700         U_ASSERT(fSetOpStack.empty());
1701         UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1702         U_ASSERT(fSetStack.empty());
1703         compileSet(theSet);
1704         break;
1705         }
1706 
1707     case doSetIntersection2:
1708         // Have scanned something like [abc&&
1709         setPushOp(setIntersection2);
1710         break;
1711 
1712     case doSetLiteral:
1713         // Union the just-scanned literal character into the set being built.
1714         //    This operation is the highest precedence set operation, so we can always do
1715         //    it immediately, without waiting to see what follows.  It is necessary to perform
1716         //    any pending '-' or '&' operation first, because these have the same precedence
1717         //    as union-ing in a literal'
1718         {
1719             setEval(setUnion);
1720             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1721             s->add(fC.fChar);
1722             fLastSetLiteral = fC.fChar;
1723             break;
1724         }
1725 
1726     case doSetLiteralEscaped:
1727         // A back-slash escaped literal character was encountered.
1728         // Processing is the same as with setLiteral, above, with the addition of
1729         //  the optional check for errors on escaped ASCII letters.
1730         {
1731             if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1732                 ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1733                  (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1734                 error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1735             }
1736             setEval(setUnion);
1737             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1738             s->add(fC.fChar);
1739             fLastSetLiteral = fC.fChar;
1740             break;
1741         }
1742 
1743         case doSetNamedChar:
1744         // Scanning a \N{UNICODE CHARACTER NAME}
1745         //  Aside from the source of the character, the processing is identical to doSetLiteral,
1746         //    above.
1747         {
1748             UChar32  c = scanNamedChar();
1749             setEval(setUnion);
1750             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1751             s->add(c);
1752             fLastSetLiteral = c;
1753             break;
1754         }
1755 
1756     case doSetNamedRange:
1757         // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
1758         // The left character is already in the set, and is saved in fLastSetLiteral.
1759         // The right side needs to be picked up, the scan is at the 'N'.
1760         // Lower Limit > Upper limit being an error matches both Java
1761         //        and ICU UnicodeSet behavior.
1762         {
1763             UChar32  c = scanNamedChar();
1764             if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
1765                 error(U_REGEX_INVALID_RANGE);
1766             }
1767             UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1768             s->add(fLastSetLiteral, c);
1769             fLastSetLiteral = c;
1770             break;
1771         }
1772 
1773 
1774     case  doSetNegate:
1775         // Scanned a '^' at the start of a set.
1776         // Push the negation operator onto the set op stack.
1777         // A twist for case-insensitive matching:
1778         //   the case closure operation must happen _before_ negation.
1779         //   But the case closure operation will already be on the stack if it's required.
1780         //   This requires checking for case closure, and swapping the stack order
1781         //    if it is present.
1782         {
1783             int32_t  tosOp = fSetOpStack.peeki();
1784             if (tosOp == setCaseClose) {
1785                 fSetOpStack.popi();
1786                 fSetOpStack.push(setNegation, *fStatus);
1787                 fSetOpStack.push(setCaseClose, *fStatus);
1788             } else {
1789                 fSetOpStack.push(setNegation, *fStatus);
1790             }
1791         }
1792         break;
1793 
1794     case doSetNoCloseError:
1795         error(U_REGEX_MISSING_CLOSE_BRACKET);
1796         break;
1797 
1798     case doSetOpError:
1799         error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
1800         break;
1801 
1802     case doSetPosixProp:
1803         {
1804             UnicodeSet *s = scanPosixProp();
1805             if (s != NULL) {
1806                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1807                 tos->addAll(*s);
1808                 delete s;
1809             }  // else error.  scanProp() reported the error status already.
1810         }
1811         break;
1812 
1813     case doSetProp:
1814         //  Scanned a \p \P within [brackets].
1815         {
1816             UnicodeSet *s = scanProp();
1817             if (s != NULL) {
1818                 UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1819                 tos->addAll(*s);
1820                 delete s;
1821             }  // else error.  scanProp() reported the error status already.
1822         }
1823         break;
1824 
1825 
1826     case doSetRange:
1827         // We have scanned literal-literal.  Add the range to the set.
1828         // The left character is already in the set, and is saved in fLastSetLiteral.
1829         // The right side is the current character.
1830         // Lower Limit > Upper limit being an error matches both Java
1831         //        and ICU UnicodeSet behavior.
1832         {
1833 
1834         if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
1835             error(U_REGEX_INVALID_RANGE);
1836         }
1837         UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1838         s->add(fLastSetLiteral, fC.fChar);
1839         break;
1840         }
1841 
1842     default:
1843         U_ASSERT(FALSE);
1844         error(U_REGEX_INTERNAL_ERROR);
1845         break;
1846     }
1847 
1848     if (U_FAILURE(*fStatus)) {
1849         returnVal = FALSE;
1850     }
1851 
1852     return returnVal;
1853 }
1854 
1855 
1856 
1857 //------------------------------------------------------------------------------
1858 //
1859 //   literalChar           We've encountered a literal character from the pattern,
1860 //                             or an escape sequence that reduces to a character.
1861 //                         Add it to the string containing all literal chars/strings from
1862 //                             the pattern.
1863 //
1864 //------------------------------------------------------------------------------
literalChar(UChar32 c)1865 void RegexCompile::literalChar(UChar32 c)  {
1866     fLiteralChars.append(c);
1867 }
1868 
1869 
1870 //------------------------------------------------------------------------------
1871 //
1872 //    fixLiterals           When compiling something that can follow a literal
1873 //                          string in a pattern, emit the code to match the
1874 //                          accumulated literal string.
1875 //
1876 //                          Optionally, split the last char of the string off into
1877 //                          a single "ONE_CHAR" operation, so that quantifiers can
1878 //                          apply to that char alone.  Example:   abc*
1879 //                          The * must apply to the 'c' only.
1880 //
1881 //------------------------------------------------------------------------------
fixLiterals(UBool split)1882 void    RegexCompile::fixLiterals(UBool split) {
1883 
1884     // If no literal characters have been scanned but not yet had code generated
1885     //   for them, nothing needs to be done.
1886     if (fLiteralChars.length() == 0) {
1887         return;
1888     }
1889 
1890     int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1891     UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1892 
1893     // Split:  We need to  ensure that the last item in the compiled pattern
1894     //     refers only to the last literal scanned in the pattern, so that
1895     //     quantifiers (*, +, etc.) affect only it, and not a longer string.
1896     //     Split before case folding for case insensitive matches.
1897 
1898     if (split) {
1899         fLiteralChars.truncate(indexOfLastCodePoint);
1900         fixLiterals(FALSE);   // Recursive call, emit code to match the first part of the string.
1901                               //  Note that the truncated literal string may be empty, in which case
1902                               //  nothing will be emitted.
1903 
1904         literalChar(lastCodePoint);  // Re-add the last code point as if it were a new literal.
1905         fixLiterals(FALSE);          // Second recursive call, code for the final code point.
1906         return;
1907     }
1908 
1909     // If we are doing case-insensitive matching, case fold the string.  This may expand
1910     //   the string, e.g. the German sharp-s turns into "ss"
1911     if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1912         fLiteralChars.foldCase();
1913         indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1914         lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1915     }
1916 
1917     if (indexOfLastCodePoint == 0) {
1918         // Single character, emit a URX_ONECHAR op to match it.
1919         if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1920                  u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1921             appendOp(URX_ONECHAR_I, lastCodePoint);
1922         } else {
1923             appendOp(URX_ONECHAR, lastCodePoint);
1924         }
1925     } else {
1926         // Two or more chars, emit a URX_STRING to match them.
1927         if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
1928             error(U_REGEX_PATTERN_TOO_BIG);
1929         }
1930         if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1931             appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1932         } else {
1933             // TODO here:  add optimization to split case sensitive strings of length two
1934             //             into two single char ops, for efficiency.
1935             appendOp(URX_STRING, fRXPat->fLiteralText.length());
1936         }
1937         appendOp(URX_STRING_LEN, fLiteralChars.length());
1938 
1939         // Add this string into the accumulated strings of the compiled pattern.
1940         fRXPat->fLiteralText.append(fLiteralChars);
1941     }
1942 
1943     fLiteralChars.remove();
1944 }
1945 
1946 
buildOp(int32_t type,int32_t val)1947 int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1948     if (U_FAILURE(*fStatus)) {
1949         return 0;
1950     }
1951     if (type < 0 || type > 255) {
1952         U_ASSERT(FALSE);
1953         error(U_REGEX_INTERNAL_ERROR);
1954         type = URX_RESERVED_OP;
1955     }
1956     if (val > 0x00ffffff) {
1957         U_ASSERT(FALSE);
1958         error(U_REGEX_INTERNAL_ERROR);
1959         val = 0;
1960     }
1961     if (val < 0) {
1962         if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
1963             U_ASSERT(FALSE);
1964             error(U_REGEX_INTERNAL_ERROR);
1965             return -1;
1966         }
1967         if (URX_TYPE(val) != 0xff) {
1968             U_ASSERT(FALSE);
1969             error(U_REGEX_INTERNAL_ERROR);
1970             return -1;
1971         }
1972         type = URX_RESERVED_OP_N;
1973     }
1974     return (type << 24) | val;
1975 }
1976 
1977 
1978 //------------------------------------------------------------------------------
1979 //
1980 //   appendOp()             Append a new instruction onto the compiled pattern
1981 //                          Includes error checking, limiting the size of the
1982 //                          pattern to lengths that can be represented in the
1983 //                          24 bit operand field of an instruction.
1984 //
1985 //------------------------------------------------------------------------------
appendOp(int32_t op)1986 void RegexCompile::appendOp(int32_t op) {
1987     if (U_FAILURE(*fStatus)) {
1988         return;
1989     }
1990     fRXPat->fCompiledPat->addElement(op, *fStatus);
1991     if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
1992         error(U_REGEX_PATTERN_TOO_BIG);
1993     }
1994 }
1995 
appendOp(int32_t type,int32_t val)1996 void RegexCompile::appendOp(int32_t type, int32_t val) {
1997     appendOp(buildOp(type, val));
1998 }
1999 
2000 
2001 //------------------------------------------------------------------------------
2002 //
2003 //   insertOp()             Insert a slot for a new opcode into the already
2004 //                          compiled pattern code.
2005 //
2006 //                          Fill the slot with a NOP.  Our caller will replace it
2007 //                          with what they really wanted.
2008 //
2009 //------------------------------------------------------------------------------
insertOp(int32_t where)2010 void   RegexCompile::insertOp(int32_t where) {
2011     UVector64 *code = fRXPat->fCompiledPat;
2012     U_ASSERT(where>0 && where < code->size());
2013 
2014     int32_t  nop = buildOp(URX_NOP, 0);
2015     code->insertElementAt(nop, where, *fStatus);
2016 
2017     // Walk through the pattern, looking for any ops with targets that
2018     //  were moved down by the insert.  Fix them.
2019     int32_t loc;
2020     for (loc=0; loc<code->size(); loc++) {
2021         int32_t op = (int32_t)code->elementAti(loc);
2022         int32_t opType = URX_TYPE(op);
2023         int32_t opValue = URX_VAL(op);
2024         if ((opType == URX_JMP         ||
2025             opType == URX_JMPX         ||
2026             opType == URX_STATE_SAVE   ||
2027             opType == URX_CTR_LOOP     ||
2028             opType == URX_CTR_LOOP_NG  ||
2029             opType == URX_JMP_SAV      ||
2030             opType == URX_JMP_SAV_X    ||
2031             opType == URX_RELOC_OPRND)    && opValue > where) {
2032             // Target location for this opcode is after the insertion point and
2033             //   needs to be incremented to adjust for the insertion.
2034             opValue++;
2035             op = buildOp(opType, opValue);
2036             code->setElementAt(op, loc);
2037         }
2038     }
2039 
2040     // Now fix up the parentheses stack.  All positive values in it are locations in
2041     //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
2042     for (loc=0; loc<fParenStack.size(); loc++) {
2043         int32_t x = fParenStack.elementAti(loc);
2044         U_ASSERT(x < code->size());
2045         if (x>where) {
2046             x++;
2047             fParenStack.setElementAt(x, loc);
2048         }
2049     }
2050 
2051     if (fMatchCloseParen > where) {
2052         fMatchCloseParen++;
2053     }
2054     if (fMatchOpenParen > where) {
2055         fMatchOpenParen++;
2056     }
2057 }
2058 
2059 
2060 //------------------------------------------------------------------------------
2061 //
2062 //   allocateData()        Allocate storage in the matcher's static data area.
2063 //                         Return the index for the newly allocated data.
2064 //                         The storage won't actually exist until we are running a match
2065 //                         operation, but the storage indexes are inserted into various
2066 //                         opcodes while compiling the pattern.
2067 //
2068 //------------------------------------------------------------------------------
allocateData(int32_t size)2069 int32_t RegexCompile::allocateData(int32_t size) {
2070     if (U_FAILURE(*fStatus)) {
2071         return 0;
2072     }
2073     if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
2074         error(U_REGEX_INTERNAL_ERROR);
2075         return 0;
2076     }
2077     int32_t dataIndex = fRXPat->fDataSize;
2078     fRXPat->fDataSize += size;
2079     if (fRXPat->fDataSize >= 0x00fffff0) {
2080         error(U_REGEX_INTERNAL_ERROR);
2081     }
2082     return dataIndex;
2083 }
2084 
2085 
2086 //------------------------------------------------------------------------------
2087 //
2088 //   allocateStackData()   Allocate space in the back-tracking stack frame.
2089 //                         Return the index for the newly allocated data.
2090 //                         The frame indexes are inserted into various
2091 //                         opcodes while compiling the pattern, meaning that frame
2092 //                         size must be restricted to the size that will fit
2093 //                         as an operand (24 bits).
2094 //
2095 //------------------------------------------------------------------------------
allocateStackData(int32_t size)2096 int32_t RegexCompile::allocateStackData(int32_t size) {
2097     if (U_FAILURE(*fStatus)) {
2098         return 0;
2099     }
2100     if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
2101         error(U_REGEX_INTERNAL_ERROR);
2102         return 0;
2103     }
2104     int32_t dataIndex = fRXPat->fFrameSize;
2105     fRXPat->fFrameSize += size;
2106     if (fRXPat->fFrameSize >= 0x00fffff0) {
2107         error(U_REGEX_PATTERN_TOO_BIG);
2108     }
2109     return dataIndex;
2110 }
2111 
2112 
2113 //------------------------------------------------------------------------------
2114 //
2115 //   blockTopLoc()          Find or create a location in the compiled pattern
2116 //                          at the start of the operation or block that has
2117 //                          just been compiled.  Needed when a quantifier (* or
2118 //                          whatever) appears, and we need to add an operation
2119 //                          at the start of the thing being quantified.
2120 //
2121 //                          (Parenthesized Blocks) have a slot with a NOP that
2122 //                          is reserved for this purpose.  .* or similar don't
2123 //                          and a slot needs to be added.
2124 //
2125 //       parameter reserveLoc   :  TRUE -  ensure that there is space to add an opcode
2126 //                                         at the returned location.
2127 //                                 FALSE - just return the address,
2128 //                                         do not reserve a location there.
2129 //
2130 //------------------------------------------------------------------------------
blockTopLoc(UBool reserveLoc)2131 int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
2132     int32_t   theLoc;
2133     fixLiterals(TRUE);  // Emit code for any pending literals.
2134                         //   If last item was a string, emit separate op for the its last char.
2135     if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2136     {
2137         // The item just processed is a parenthesized block.
2138         theLoc = fMatchOpenParen;   // A slot is already reserved for us.
2139         U_ASSERT(theLoc > 0);
2140         U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2141     }
2142     else {
2143         // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2144         // No slot for STATE_SAVE was pre-reserved in the compiled code.
2145         // We need to make space now.
2146         theLoc = fRXPat->fCompiledPat->size()-1;
2147         int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2148         if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2149             // Strings take two opcode, we want the position of the first one.
2150             // We can have a string at this point if a single character case-folded to two.
2151             theLoc--;
2152         }
2153         if (reserveLoc) {
2154             int32_t  nop = buildOp(URX_NOP, 0);
2155             fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2156         }
2157     }
2158     return theLoc;
2159 }
2160 
2161 
2162 
2163 //------------------------------------------------------------------------------
2164 //
2165 //    handleCloseParen      When compiling a close paren, we need to go back
2166 //                          and fix up any JMP or SAVE operations within the
2167 //                          parenthesized block that need to target the end
2168 //                          of the block.  The locations of these are kept on
2169 //                          the paretheses stack.
2170 //
2171 //                          This function is called both when encountering a
2172 //                          real ) and at the end of the pattern.
2173 //
2174 //------------------------------------------------------------------------------
handleCloseParen()2175 void  RegexCompile::handleCloseParen() {
2176     int32_t   patIdx;
2177     int32_t   patOp;
2178     if (fParenStack.size() <= 0) {
2179         error(U_REGEX_MISMATCHED_PAREN);
2180         return;
2181     }
2182 
2183     // Emit code for any pending literals.
2184     fixLiterals(FALSE);
2185 
2186     // Fixup any operations within the just-closed parenthesized group
2187     //    that need to reference the end of the (block).
2188     //    (The first one popped from the stack is an unused slot for
2189     //     alternation (OR) state save, but applying the fixup to it does no harm.)
2190     for (;;) {
2191         patIdx = fParenStack.popi();
2192         if (patIdx < 0) {
2193             // value < 0 flags the start of the frame on the paren stack.
2194             break;
2195         }
2196         U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2197         patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2198         U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
2199         patOp |= fRXPat->fCompiledPat->size();  // Set it now.
2200         fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2201         fMatchOpenParen     = patIdx;
2202     }
2203 
2204     //  At the close of any parenthesized block, restore the match mode flags  to
2205     //  the value they had at the open paren.  Saved value is
2206     //  at the top of the paren stack.
2207     fModeFlags = fParenStack.popi();
2208     U_ASSERT(fModeFlags < 0);
2209 
2210     // DO any additional fixups, depending on the specific kind of
2211     // parentesized grouping this is
2212 
2213     switch (patIdx) {
2214     case plain:
2215     case flags:
2216         // No additional fixups required.
2217         //   (Grouping-only parentheses)
2218         break;
2219     case capturing:
2220         // Capturing Parentheses.
2221         //   Insert a End Capture op into the pattern.
2222         //   The frame offset of the variables for this cg is obtained from the
2223         //       start capture op and put it into the end-capture op.
2224         {
2225             int32_t   captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2226             U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2227 
2228             int32_t   frameVarLocation = URX_VAL(captureOp);
2229             appendOp(URX_END_CAPTURE, frameVarLocation);
2230         }
2231         break;
2232     case atomic:
2233         // Atomic Parenthesis.
2234         //   Insert a LD_SP operation to restore the state stack to the position
2235         //   it was when the atomic parens were entered.
2236         {
2237             int32_t   stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2238             U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2239             int32_t   stoLoc = URX_VAL(stoOp);
2240             appendOp(URX_LD_SP, stoLoc);
2241         }
2242         break;
2243 
2244     case lookAhead:
2245         {
2246             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2247             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2248             int32_t dataLoc  = URX_VAL(startOp);
2249             appendOp(URX_LA_END, dataLoc);
2250         }
2251         break;
2252 
2253     case negLookAhead:
2254         {
2255             // See comment at doOpenLookAheadNeg
2256             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2257             U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2258             int32_t dataLoc  = URX_VAL(startOp);
2259             appendOp(URX_LA_END, dataLoc);
2260             appendOp(URX_BACKTRACK, 0);
2261             appendOp(URX_LA_END, dataLoc);
2262 
2263             // Patch the URX_SAVE near the top of the block.
2264             // The destination of the SAVE is the final LA_END that was just added.
2265             int32_t saveOp   = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2266             U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2267             int32_t dest     = fRXPat->fCompiledPat->size()-1;
2268             saveOp           = buildOp(URX_STATE_SAVE, dest);
2269             fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2270         }
2271         break;
2272 
2273     case lookBehind:
2274         {
2275             // See comment at doOpenLookBehind.
2276 
2277             // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2278             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2279             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2280             int32_t dataLoc  = URX_VAL(startOp);
2281             appendOp(URX_LB_END, dataLoc);
2282             appendOp(URX_LA_END, dataLoc);
2283 
2284             // Determine the min and max bounds for the length of the
2285             //  string that the pattern can match.
2286             //  An unbounded upper limit is an error.
2287             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2288             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2289             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2290             if (URX_TYPE(maxML) != 0) {
2291                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2292                 break;
2293             }
2294             if (maxML == INT32_MAX) {
2295                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2296                 break;
2297             }
2298             U_ASSERT(minML <= maxML);
2299 
2300             // Insert the min and max match len bounds into the URX_LB_CONT op that
2301             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2302             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
2303             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
2304 
2305         }
2306         break;
2307 
2308 
2309 
2310     case lookBehindN:
2311         {
2312             // See comment at doOpenLookBehindNeg.
2313 
2314             // Append the URX_LBN_END to the compiled pattern.
2315             int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2316             U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2317             int32_t dataLoc  = URX_VAL(startOp);
2318             appendOp(URX_LBN_END, dataLoc);
2319 
2320             // Determine the min and max bounds for the length of the
2321             //  string that the pattern can match.
2322             //  An unbounded upper limit is an error.
2323             int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2324             int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2325             int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2326             if (URX_TYPE(maxML) != 0) {
2327                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2328                 break;
2329             }
2330             if (maxML == INT32_MAX) {
2331                 error(U_REGEX_LOOK_BEHIND_LIMIT);
2332                 break;
2333             }
2334             U_ASSERT(minML <= maxML);
2335 
2336             // Insert the min and max match len bounds into the URX_LB_CONT op that
2337             //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2338             fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
2339             fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
2340 
2341             // Insert the pattern location to continue at after a successful match
2342             //  as the last operand of the URX_LBN_CONT
2343             int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2344             fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
2345         }
2346         break;
2347 
2348 
2349 
2350     default:
2351         U_ASSERT(FALSE);
2352     }
2353 
2354     // remember the next location in the compiled pattern.
2355     // The compilation of Quantifiers will look at this to see whether its looping
2356     //   over a parenthesized block or a single item
2357     fMatchCloseParen = fRXPat->fCompiledPat->size();
2358 }
2359 
2360 
2361 
2362 //------------------------------------------------------------------------------
2363 //
2364 //   compileSet       Compile the pattern operations for a reference to a
2365 //                    UnicodeSet.
2366 //
2367 //------------------------------------------------------------------------------
compileSet(UnicodeSet * theSet)2368 void        RegexCompile::compileSet(UnicodeSet *theSet)
2369 {
2370     if (theSet == NULL) {
2371         return;
2372     }
2373     //  Remove any strings from the set.
2374     //  There shoudn't be any, but just in case.
2375     //     (Case Closure can add them; if we had a simple case closure avaialble that
2376     //      ignored strings, that would be better.)
2377     theSet->removeAllStrings();
2378     int32_t  setSize = theSet->size();
2379 
2380     switch (setSize) {
2381     case 0:
2382         {
2383             // Set of no elements.   Always fails to match.
2384             appendOp(URX_BACKTRACK, 0);
2385             delete theSet;
2386         }
2387         break;
2388 
2389     case 1:
2390         {
2391             // The set contains only a single code point.  Put it into
2392             //   the compiled pattern as a single char operation rather
2393             //   than a set, and discard the set itself.
2394             literalChar(theSet->charAt(0));
2395             delete theSet;
2396         }
2397         break;
2398 
2399     default:
2400         {
2401             //  The set contains two or more chars.  (the normal case)
2402             //  Put it into the compiled pattern as a set.
2403             int32_t setNumber = fRXPat->fSets->size();
2404             fRXPat->fSets->addElement(theSet, *fStatus);
2405             appendOp(URX_SETREF, setNumber);
2406         }
2407     }
2408 }
2409 
2410 
2411 //------------------------------------------------------------------------------
2412 //
2413 //   compileInterval    Generate the code for a {min, max} style interval quantifier.
2414 //                      Except for the specific opcodes used, the code is the same
2415 //                      for all three types (greedy, non-greedy, possessive) of
2416 //                      intervals.  The opcodes are supplied as parameters.
2417 //                      (There are two sets of opcodes - greedy & possessive use the
2418 //                      same ones, while non-greedy has it's own.)
2419 //
2420 //                      The code for interval loops has this form:
2421 //                         0  CTR_INIT   counter loc (in stack frame)
2422 //                         1             5  patt address of CTR_LOOP at bottom of block
2423 //                         2             min count
2424 //                         3             max count   (-1 for unbounded)
2425 //                         4  ...        block to be iterated over
2426 //                         5  CTR_LOOP
2427 //
2428 //                       In
2429 //------------------------------------------------------------------------------
compileInterval(int32_t InitOp,int32_t LoopOp)2430 void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
2431 {
2432     // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2433     //   four slots in the compiled code.  Reserve them.
2434     int32_t   topOfBlock = blockTopLoc(TRUE);
2435     insertOp(topOfBlock);
2436     insertOp(topOfBlock);
2437     insertOp(topOfBlock);
2438 
2439     // The operands for the CTR_INIT opcode include the index in the matcher data
2440     //   of the counter.  Allocate it now. There are two data items
2441     //        counterLoc   -->  Loop counter
2442     //               +1    -->  Input index (for breaking non-progressing loops)
2443     //                          (Only present if unbounded upper limit on loop)
2444     int32_t   dataSize = fIntervalUpper < 0 ? 2 : 1;
2445     int32_t   counterLoc = allocateStackData(dataSize);
2446 
2447     int32_t   op = buildOp(InitOp, counterLoc);
2448     fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2449 
2450     // The second operand of CTR_INIT is the location following the end of the loop.
2451     //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2452     //   compilation of something later on causes the code to grow and the target
2453     //   position to move.
2454     int32_t loopEnd = fRXPat->fCompiledPat->size();
2455     op = buildOp(URX_RELOC_OPRND, loopEnd);
2456     fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2457 
2458     // Followed by the min and max counts.
2459     fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2460     fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2461 
2462     // Apend the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
2463     //   Goes at end of the block being looped over, so just append to the code so far.
2464     appendOp(LoopOp, topOfBlock);
2465 
2466     if ((fIntervalLow & 0xff000000) != 0 ||
2467         (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2468             error(U_REGEX_NUMBER_TOO_BIG);
2469         }
2470 
2471     if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2472         error(U_REGEX_MAX_LT_MIN);
2473     }
2474 }
2475 
2476 
2477 
compileInlineInterval()2478 UBool RegexCompile::compileInlineInterval() {
2479     if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2480         // Too big to inline.  Fail, which will cause looping code to be generated.
2481         //   (Upper < Lower picks up unbounded upper and errors, both.)
2482         return FALSE;
2483     }
2484 
2485     int32_t   topOfBlock = blockTopLoc(FALSE);
2486     if (fIntervalUpper == 0) {
2487         // Pathological case.  Attempt no matches, as if the block doesn't exist.
2488         // Discard the generated code for the block.
2489         // If the block included parens, discard the info pertaining to them as well.
2490         fRXPat->fCompiledPat->setSize(topOfBlock);
2491         if (fMatchOpenParen >= topOfBlock) {
2492             fMatchOpenParen = -1;
2493         }
2494         if (fMatchCloseParen >= topOfBlock) {
2495             fMatchCloseParen = -1;
2496         }
2497         return TRUE;
2498     }
2499 
2500     if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2501         // The thing being repeated is not a single op, but some
2502         //   more complex block.  Do it as a loop, not inlines.
2503         //   Note that things "repeated" a max of once are handled as inline, because
2504         //     the one copy of the code already generated is just fine.
2505         return FALSE;
2506     }
2507 
2508     // Pick up the opcode that is to be repeated
2509     //
2510     int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2511 
2512     // Compute the pattern location where the inline sequence
2513     //   will end, and set up the state save op that will be needed.
2514     //
2515     int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2516                                 + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2517     int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2518     if (fIntervalLow == 0) {
2519         insertOp(topOfBlock);
2520         fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2521     }
2522 
2523 
2524 
2525     //  Loop, emitting the op for the thing being repeated each time.
2526     //    Loop starts at 1 because one instance of the op already exists in the pattern,
2527     //    it was put there when it was originally encountered.
2528     int32_t i;
2529     for (i=1; i<fIntervalUpper; i++ ) {
2530         if (i >= fIntervalLow) {
2531             appendOp(saveOp);
2532         }
2533         appendOp(op);
2534     }
2535     return TRUE;
2536 }
2537 
2538 
2539 
2540 //------------------------------------------------------------------------------
2541 //
2542 //   caseInsensitiveStart  given a single code point from a pattern string, determine the
2543 //                         set of characters that could potentially begin a case-insensitive
2544 //                         match of a string beginning with that character, using full Unicode
2545 //                         case insensitive matching.
2546 //
2547 //          This is used in optimizing find().
2548 //
2549 //          closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2550 //          misses cases like this:
2551 //             A string from the pattern begins with 'ss' (although all we know
2552 //                 in this context is that it begins with 's')
2553 //             The pattern could match a string beginning with a German sharp-s
2554 //
2555 //           To the ordinary case closure for a character c, we add all other
2556 //           characters cx where the case closure of cx incudes a string form that begins
2557 //           with the original character c.
2558 //
2559 //           This function could be made smarter. The full pattern string is available
2560 //           and it would be possible to verify that the extra characters being added
2561 //           to the starting set fully match, rather than having just a first-char of the
2562 //           folded form match.
2563 //
2564 //------------------------------------------------------------------------------
findCaseInsensitiveStarters(UChar32 c,UnicodeSet * starterChars)2565 void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2566 
2567 // Machine Generated below.
2568 // It may need updating with new versions of Unicode.
2569 // Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2570 // The update tool is here: svn+ssh://source.icu-project.org/repos/icu/tools/trunk/unicode/c/genregexcasing
2571 
2572 // Machine Generated Data. Do not hand edit.
2573     static const UChar32 RECaseFixCodePoints[] = {
2574         0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc,
2575         0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565,
2576         0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07,
2577         0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61,
2578         0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2579 
2580     static const int16_t RECaseFixStringOffsets[] = {
2581         0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10,
2582         0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f,
2583         0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43,
2584         0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57,
2585         0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2586 
2587     static const int16_t RECaseFixCounts[] = {
2588         0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1,
2589         0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1,
2590         0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2591         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2,
2592         0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2593 
2594     static const UChar RECaseFixData[] = {
2595         0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf,
2596         0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3,
2597         0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3,
2598         0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3,
2599         0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14,
2600         0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83,
2601         0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90,
2602         0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95,
2603         0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2,
2604         0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7,
2605         0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2606 
2607 // End of machine generated data.
2608 
2609     if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) {
2610         // This function should never be called with an invalid input character.
2611         U_ASSERT(FALSE);
2612         starterChars->clear();
2613     } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2614         UChar32 caseFoldedC  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2615         starterChars->set(caseFoldedC, caseFoldedC);
2616 
2617         int32_t i;
2618         for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2619             // Simple linear search through the sorted list of interesting code points.
2620         }
2621 
2622         if (RECaseFixCodePoints[i] == c) {
2623             int32_t dataIndex = RECaseFixStringOffsets[i];
2624             int32_t numCharsToAdd = RECaseFixCounts[i];
2625             UChar32 cpToAdd = 0;
2626             for (int32_t j=0; j<numCharsToAdd; j++) {
2627                 U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2628                 starterChars->add(cpToAdd);
2629             }
2630         }
2631 
2632         starterChars->closeOver(USET_CASE_INSENSITIVE);
2633         starterChars->removeAllStrings();
2634     } else {
2635         // Not a cased character. Just return it alone.
2636         starterChars->set(c, c);
2637     }
2638 }
2639 
2640 
2641 // Increment with overflow check.
2642 // val and delta will both be positive.
2643 
safeIncrement(int32_t val,int32_t delta)2644 static int32_t safeIncrement(int32_t val, int32_t delta) {
2645     if (INT32_MAX - val > delta) {
2646         return val + delta;
2647     } else {
2648         return INT32_MAX;
2649     }
2650 }
2651 
2652 
2653 //------------------------------------------------------------------------------
2654 //
2655 //   matchStartType    Determine how a match can start.
2656 //                     Used to optimize find() operations.
2657 //
2658 //                     Operation is very similar to minMatchLength().  Walk the compiled
2659 //                     pattern, keeping an on-going minimum-match-length.  For any
2660 //                     op where the min match coming in is zero, add that ops possible
2661 //                     starting matches to the possible starts for the overall pattern.
2662 //
2663 //------------------------------------------------------------------------------
matchStartType()2664 void   RegexCompile::matchStartType() {
2665     if (U_FAILURE(*fStatus)) {
2666         return;
2667     }
2668 
2669 
2670     int32_t    loc;                    // Location in the pattern of the current op being processed.
2671     int32_t    op;                     // The op being processed
2672     int32_t    opType;                 // The opcode type of the op
2673     int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
2674     int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
2675 
2676     UBool      atStart = TRUE;         // True if no part of the pattern yet encountered
2677                                        //   could have advanced the position in a match.
2678                                        //   (Maximum match length so far == 0)
2679 
2680     // forwardedLength is a vector holding minimum-match-length values that
2681     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
2682     //   It must be one longer than the pattern being checked because some  ops
2683     //   will jmp to a end-of-block+1 location from within a block, and we must
2684     //   count those when checking the block.
2685     int32_t end = fRXPat->fCompiledPat->size();
2686     UVector32  forwardedLength(end+1, *fStatus);
2687     forwardedLength.setSize(end+1);
2688     for (loc=3; loc<end; loc++) {
2689         forwardedLength.setElementAt(INT32_MAX, loc);
2690     }
2691 
2692     for (loc = 3; loc<end; loc++) {
2693         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2694         opType = URX_TYPE(op);
2695 
2696         // The loop is advancing linearly through the pattern.
2697         // If the op we are now at was the destination of a branch in the pattern,
2698         // and that path has a shorter minimum length than the current accumulated value,
2699         // replace the current accumulated value.
2700         if (forwardedLength.elementAti(loc) < currentLen) {
2701             currentLen = forwardedLength.elementAti(loc);
2702             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2703         }
2704 
2705         switch (opType) {
2706             // Ops that don't change the total length matched
2707         case URX_RESERVED_OP:
2708         case URX_END:
2709         case URX_FAIL:
2710         case URX_STRING_LEN:
2711         case URX_NOP:
2712         case URX_START_CAPTURE:
2713         case URX_END_CAPTURE:
2714         case URX_BACKSLASH_B:
2715         case URX_BACKSLASH_BU:
2716         case URX_BACKSLASH_G:
2717         case URX_BACKSLASH_Z:
2718         case URX_DOLLAR:
2719         case URX_DOLLAR_M:
2720         case URX_DOLLAR_D:
2721         case URX_DOLLAR_MD:
2722         case URX_RELOC_OPRND:
2723         case URX_STO_INP_LOC:
2724         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
2725         case URX_BACKREF_I:
2726 
2727         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
2728         case URX_LD_SP:
2729             break;
2730 
2731         case URX_CARET:
2732             if (atStart) {
2733                 fRXPat->fStartType = START_START;
2734             }
2735             break;
2736 
2737         case URX_CARET_M:
2738         case URX_CARET_M_UNIX:
2739             if (atStart) {
2740                 fRXPat->fStartType = START_LINE;
2741             }
2742             break;
2743 
2744         case URX_ONECHAR:
2745             if (currentLen == 0) {
2746                 // This character could appear at the start of a match.
2747                 //   Add it to the set of possible starting characters.
2748                 fRXPat->fInitialChars->add(URX_VAL(op));
2749                 numInitialStrings += 2;
2750             }
2751             currentLen = safeIncrement(currentLen, 1);
2752             atStart = FALSE;
2753             break;
2754 
2755 
2756         case URX_SETREF:
2757             if (currentLen == 0) {
2758                 int32_t  sn = URX_VAL(op);
2759                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2760                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2761                 fRXPat->fInitialChars->addAll(*s);
2762                 numInitialStrings += 2;
2763             }
2764             currentLen = safeIncrement(currentLen, 1);
2765             atStart = FALSE;
2766             break;
2767 
2768         case URX_LOOP_SR_I:
2769             // [Set]*, like a SETREF, above, in what it can match,
2770             //  but may not match at all, so currentLen is not incremented.
2771             if (currentLen == 0) {
2772                 int32_t  sn = URX_VAL(op);
2773                 U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2774                 const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2775                 fRXPat->fInitialChars->addAll(*s);
2776                 numInitialStrings += 2;
2777             }
2778             atStart = FALSE;
2779             break;
2780 
2781         case URX_LOOP_DOT_I:
2782             if (currentLen == 0) {
2783                 // .* at the start of a pattern.
2784                 //    Any character can begin the match.
2785                 fRXPat->fInitialChars->clear();
2786                 fRXPat->fInitialChars->complement();
2787                 numInitialStrings += 2;
2788             }
2789             atStart = FALSE;
2790             break;
2791 
2792 
2793         case URX_STATIC_SETREF:
2794             if (currentLen == 0) {
2795                 int32_t  sn = URX_VAL(op);
2796                 U_ASSERT(sn>0 && sn<URX_LAST_SET);
2797                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2798                 fRXPat->fInitialChars->addAll(*s);
2799                 numInitialStrings += 2;
2800             }
2801             currentLen = safeIncrement(currentLen, 1);
2802             atStart = FALSE;
2803             break;
2804 
2805 
2806 
2807         case URX_STAT_SETREF_N:
2808             if (currentLen == 0) {
2809                 int32_t  sn = URX_VAL(op);
2810                 const UnicodeSet *s = fRXPat->fStaticSets[sn];
2811                 UnicodeSet sc(*s);
2812                 sc.complement();
2813                 fRXPat->fInitialChars->addAll(sc);
2814                 numInitialStrings += 2;
2815             }
2816             currentLen = safeIncrement(currentLen, 1);
2817             atStart = FALSE;
2818             break;
2819 
2820 
2821 
2822         case URX_BACKSLASH_D:
2823             // Digit Char
2824              if (currentLen == 0) {
2825                  UnicodeSet s;
2826                  s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2827                  if (URX_VAL(op) != 0) {
2828                      s.complement();
2829                  }
2830                  fRXPat->fInitialChars->addAll(s);
2831                  numInitialStrings += 2;
2832             }
2833             currentLen = safeIncrement(currentLen, 1);
2834             atStart = FALSE;
2835             break;
2836 
2837 
2838         case URX_BACKSLASH_H:
2839             // Horiz white space
2840             if (currentLen == 0) {
2841                 UnicodeSet s;
2842                 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2843                 s.add((UChar32)9);   // Tab
2844                 if (URX_VAL(op) != 0) {
2845                     s.complement();
2846                 }
2847                 fRXPat->fInitialChars->addAll(s);
2848                 numInitialStrings += 2;
2849             }
2850             currentLen = safeIncrement(currentLen, 1);
2851             atStart = FALSE;
2852             break;
2853 
2854 
2855         case URX_BACKSLASH_R:       // Any line ending sequence
2856         case URX_BACKSLASH_V:       // Any line ending code point, with optional negation
2857             if (currentLen == 0) {
2858                 UnicodeSet s;
2859                 s.add((UChar32)0x0a, (UChar32)0x0d);  // add range
2860                 s.add((UChar32)0x85);
2861                 s.add((UChar32)0x2028, (UChar32)0x2029);
2862                 if (URX_VAL(op) != 0) {
2863                      // Complement option applies to URX_BACKSLASH_V only.
2864                      s.complement();
2865                 }
2866                 fRXPat->fInitialChars->addAll(s);
2867                 numInitialStrings += 2;
2868             }
2869             currentLen = safeIncrement(currentLen, 1);
2870             atStart = FALSE;
2871             break;
2872 
2873 
2874 
2875         case URX_ONECHAR_I:
2876             // Case Insensitive Single Character.
2877             if (currentLen == 0) {
2878                 UChar32  c = URX_VAL(op);
2879                 if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2880                     UnicodeSet starters(c, c);
2881                     starters.closeOver(USET_CASE_INSENSITIVE);
2882                     // findCaseInsensitiveStarters(c, &starters);
2883                     //   For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2884                     //   The expanded folding can't match the pattern.
2885                     fRXPat->fInitialChars->addAll(starters);
2886                 } else {
2887                     // Char has no case variants.  Just add it as-is to the
2888                     //   set of possible starting chars.
2889                     fRXPat->fInitialChars->add(c);
2890                 }
2891                 numInitialStrings += 2;
2892             }
2893             currentLen = safeIncrement(currentLen, 1);
2894             atStart = FALSE;
2895             break;
2896 
2897 
2898         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
2899         case URX_DOTANY_ALL:    // . matches one or two.
2900         case URX_DOTANY:
2901         case URX_DOTANY_UNIX:
2902             if (currentLen == 0) {
2903                 // These constructs are all bad news when they appear at the start
2904                 //   of a match.  Any character can begin the match.
2905                 fRXPat->fInitialChars->clear();
2906                 fRXPat->fInitialChars->complement();
2907                 numInitialStrings += 2;
2908             }
2909             currentLen = safeIncrement(currentLen, 1);
2910             atStart = FALSE;
2911             break;
2912 
2913 
2914         case URX_JMPX:
2915             loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
2916             U_FALLTHROUGH;
2917         case URX_JMP:
2918             {
2919                 int32_t  jmpDest = URX_VAL(op);
2920                 if (jmpDest < loc) {
2921                     // Loop of some kind.  Can safely ignore, the worst that will happen
2922                     //  is that we understate the true minimum length
2923                     currentLen = forwardedLength.elementAti(loc+1);
2924 
2925                 } else {
2926                     // Forward jump.  Propagate the current min length to the target loc of the jump.
2927                     U_ASSERT(jmpDest <= end+1);
2928                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
2929                         forwardedLength.setElementAt(currentLen, jmpDest);
2930                     }
2931                 }
2932             }
2933             atStart = FALSE;
2934             break;
2935 
2936         case URX_JMP_SAV:
2937         case URX_JMP_SAV_X:
2938             // Combo of state save to the next loc, + jmp backwards.
2939             //   Net effect on min. length computation is nothing.
2940             atStart = FALSE;
2941             break;
2942 
2943         case URX_BACKTRACK:
2944             // Fails are kind of like a branch, except that the min length was
2945             //   propagated already, by the state save.
2946             currentLen = forwardedLength.elementAti(loc+1);
2947             atStart = FALSE;
2948             break;
2949 
2950 
2951         case URX_STATE_SAVE:
2952             {
2953                 // State Save, for forward jumps, propagate the current minimum.
2954                 //             of the state save.
2955                 int32_t  jmpDest = URX_VAL(op);
2956                 if (jmpDest > loc) {
2957                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
2958                         forwardedLength.setElementAt(currentLen, jmpDest);
2959                     }
2960                 }
2961             }
2962             atStart = FALSE;
2963             break;
2964 
2965 
2966 
2967 
2968         case URX_STRING:
2969             {
2970                 loc++;
2971                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2972                 int32_t stringLen   = URX_VAL(stringLenOp);
2973                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
2974                 U_ASSERT(stringLenOp >= 2);
2975                 if (currentLen == 0) {
2976                     // Add the starting character of this string to the set of possible starting
2977                     //   characters for this pattern.
2978                     int32_t stringStartIdx = URX_VAL(op);
2979                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
2980                     fRXPat->fInitialChars->add(c);
2981 
2982                     // Remember this string.  After the entire pattern has been checked,
2983                     //  if nothing else is identified that can start a match, we'll use it.
2984                     numInitialStrings++;
2985                     fRXPat->fInitialStringIdx = stringStartIdx;
2986                     fRXPat->fInitialStringLen = stringLen;
2987                 }
2988 
2989                 currentLen = safeIncrement(currentLen, stringLen);
2990                 atStart = FALSE;
2991             }
2992             break;
2993 
2994         case URX_STRING_I:
2995             {
2996                 // Case-insensitive string.  Unlike exact-match strings, we won't
2997                 //   attempt a string search for possible match positions.  But we
2998                 //   do update the set of possible starting characters.
2999                 loc++;
3000                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3001                 int32_t stringLen   = URX_VAL(stringLenOp);
3002                 U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3003                 U_ASSERT(stringLenOp >= 2);
3004                 if (currentLen == 0) {
3005                     // Add the starting character of this string to the set of possible starting
3006                     //   characters for this pattern.
3007                     int32_t stringStartIdx = URX_VAL(op);
3008                     UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
3009                     UnicodeSet s;
3010                     findCaseInsensitiveStarters(c, &s);
3011                     fRXPat->fInitialChars->addAll(s);
3012                     numInitialStrings += 2;  // Matching on an initial string not possible.
3013                 }
3014                 currentLen = safeIncrement(currentLen, stringLen);
3015                 atStart = FALSE;
3016             }
3017             break;
3018 
3019         case URX_CTR_INIT:
3020         case URX_CTR_INIT_NG:
3021             {
3022                 // Loop Init Ops.  These don't change the min length, but they are 4 word ops
3023                 //   so location must be updated accordingly.
3024                 // Loop Init Ops.
3025                 //   If the min loop count == 0
3026                 //      move loc forwards to the end of the loop, skipping over the body.
3027                 //   If the min count is > 0,
3028                 //      continue normal processing of the body of the loop.
3029                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3030                         loopEndLoc   = URX_VAL(loopEndLoc);
3031                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3032                 if (minLoopCount == 0) {
3033                     // Min Loop Count of 0, treat like a forward branch and
3034                     //   move the current minimum length up to the target
3035                     //   (end of loop) location.
3036                     U_ASSERT(loopEndLoc <= end+1);
3037                     if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3038                         forwardedLength.setElementAt(currentLen, loopEndLoc);
3039                     }
3040                 }
3041                 loc+=3;  // Skips over operands of CTR_INIT
3042             }
3043             atStart = FALSE;
3044             break;
3045 
3046 
3047         case URX_CTR_LOOP:
3048         case URX_CTR_LOOP_NG:
3049             // Loop ops.
3050             //  The jump is conditional, backwards only.
3051             atStart = FALSE;
3052             break;
3053 
3054         case URX_LOOP_C:
3055             // More loop ops.  These state-save to themselves.
3056             //   don't change the minimum match
3057             atStart = FALSE;
3058             break;
3059 
3060 
3061         case URX_LA_START:
3062         case URX_LB_START:
3063             {
3064                 // Look-around.  Scan forward until the matching look-ahead end,
3065                 //   without processing the look-around block.  This is overly pessimistic.
3066 
3067                 // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
3068                 //   lookahead contains two LA_END instructions, so count goes up by two
3069                 //   for each LA_START.
3070                 int32_t  depth = (opType == URX_LA_START? 2: 1);
3071                 for (;;) {
3072                     loc++;
3073                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3074                     if (URX_TYPE(op) == URX_LA_START) {
3075                         depth+=2;
3076                     }
3077                     if (URX_TYPE(op) == URX_LB_START) {
3078                         depth++;
3079                     }
3080                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3081                         depth--;
3082                         if (depth == 0) {
3083                             break;
3084                         }
3085                     }
3086                     if (URX_TYPE(op) == URX_STATE_SAVE) {
3087                         // Need this because neg lookahead blocks will FAIL to outside
3088                         //   of the block.
3089                         int32_t  jmpDest = URX_VAL(op);
3090                         if (jmpDest > loc) {
3091                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
3092                                 forwardedLength.setElementAt(currentLen, jmpDest);
3093                             }
3094                         }
3095                     }
3096                     U_ASSERT(loc <= end);
3097                 }
3098             }
3099             break;
3100 
3101         case URX_LA_END:
3102         case URX_LB_CONT:
3103         case URX_LB_END:
3104         case URX_LBN_CONT:
3105         case URX_LBN_END:
3106             U_ASSERT(FALSE);     // Shouldn't get here.  These ops should be
3107                                  //  consumed by the scan in URX_LA_START and LB_START
3108 
3109             break;
3110 
3111         default:
3112             U_ASSERT(FALSE);
3113             }
3114 
3115         }
3116 
3117 
3118     // We have finished walking through the ops.  Check whether some forward jump
3119     //   propagated a shorter length to location end+1.
3120     if (forwardedLength.elementAti(end+1) < currentLen) {
3121         currentLen = forwardedLength.elementAti(end+1);
3122     }
3123 
3124 
3125     fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3126 
3127 
3128     // Sort out what we should check for when looking for candidate match start positions.
3129     // In order of preference,
3130     //     1.   Start of input text buffer.
3131     //     2.   A literal string.
3132     //     3.   Start of line in multi-line mode.
3133     //     4.   A single literal character.
3134     //     5.   A character from a set of characters.
3135     //
3136     if (fRXPat->fStartType == START_START) {
3137         // Match only at the start of an input text string.
3138         //    start type is already set.  We're done.
3139     } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3140         // Match beginning only with a literal string.
3141         UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3142         U_ASSERT(fRXPat->fInitialChars->contains(c));
3143         fRXPat->fStartType   = START_STRING;
3144         fRXPat->fInitialChar = c;
3145     } else if (fRXPat->fStartType == START_LINE) {
3146         // Match at start of line in Multi-Line mode.
3147         // Nothing to do here; everything is already set.
3148     } else if (fRXPat->fMinMatchLen == 0) {
3149         // Zero length match possible.  We could start anywhere.
3150         fRXPat->fStartType = START_NO_INFO;
3151     } else if (fRXPat->fInitialChars->size() == 1) {
3152         // All matches begin with the same char.
3153         fRXPat->fStartType   = START_CHAR;
3154         fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3155         U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
3156     } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == FALSE &&
3157         fRXPat->fMinMatchLen > 0) {
3158         // Matches start with a set of character smaller than the set of all chars.
3159         fRXPat->fStartType = START_SET;
3160     } else {
3161         // Matches can start with anything
3162         fRXPat->fStartType = START_NO_INFO;
3163     }
3164 
3165     return;
3166 }
3167 
3168 
3169 
3170 //------------------------------------------------------------------------------
3171 //
3172 //   minMatchLength    Calculate the length of the shortest string that could
3173 //                     match the specified pattern.
3174 //                     Length is in 16 bit code units, not code points.
3175 //
3176 //                     The calculated length may not be exact.  The returned
3177 //                     value may be shorter than the actual minimum; it must
3178 //                     never be longer.
3179 //
3180 //                     start and end are the range of p-code operations to be
3181 //                     examined.  The endpoints are included in the range.
3182 //
3183 //------------------------------------------------------------------------------
minMatchLength(int32_t start,int32_t end)3184 int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
3185     if (U_FAILURE(*fStatus)) {
3186         return 0;
3187     }
3188 
3189     U_ASSERT(start <= end);
3190     U_ASSERT(end < fRXPat->fCompiledPat->size());
3191 
3192 
3193     int32_t    loc;
3194     int32_t    op;
3195     int32_t    opType;
3196     int32_t    currentLen = 0;
3197 
3198 
3199     // forwardedLength is a vector holding minimum-match-length values that
3200     //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
3201     //   It must be one longer than the pattern being checked because some  ops
3202     //   will jmp to a end-of-block+1 location from within a block, and we must
3203     //   count those when checking the block.
3204     UVector32  forwardedLength(end+2, *fStatus);
3205     forwardedLength.setSize(end+2);
3206     for (loc=start; loc<=end+1; loc++) {
3207         forwardedLength.setElementAt(INT32_MAX, loc);
3208     }
3209 
3210     for (loc = start; loc<=end; loc++) {
3211         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3212         opType = URX_TYPE(op);
3213 
3214         // The loop is advancing linearly through the pattern.
3215         // If the op we are now at was the destination of a branch in the pattern,
3216         // and that path has a shorter minimum length than the current accumulated value,
3217         // replace the current accumulated value.
3218         // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
3219                                                                //   no-match-possible cases.
3220         if (forwardedLength.elementAti(loc) < currentLen) {
3221             currentLen = forwardedLength.elementAti(loc);
3222             U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3223         }
3224 
3225         switch (opType) {
3226             // Ops that don't change the total length matched
3227         case URX_RESERVED_OP:
3228         case URX_END:
3229         case URX_STRING_LEN:
3230         case URX_NOP:
3231         case URX_START_CAPTURE:
3232         case URX_END_CAPTURE:
3233         case URX_BACKSLASH_B:
3234         case URX_BACKSLASH_BU:
3235         case URX_BACKSLASH_G:
3236         case URX_BACKSLASH_Z:
3237         case URX_CARET:
3238         case URX_DOLLAR:
3239         case URX_DOLLAR_M:
3240         case URX_DOLLAR_D:
3241         case URX_DOLLAR_MD:
3242         case URX_RELOC_OPRND:
3243         case URX_STO_INP_LOC:
3244         case URX_CARET_M:
3245         case URX_CARET_M_UNIX:
3246         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3247         case URX_BACKREF_I:
3248 
3249         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3250         case URX_LD_SP:
3251 
3252         case URX_JMP_SAV:
3253         case URX_JMP_SAV_X:
3254             break;
3255 
3256 
3257             // Ops that match a minimum of one character (one or two 16 bit code units.)
3258             //
3259         case URX_ONECHAR:
3260         case URX_STATIC_SETREF:
3261         case URX_STAT_SETREF_N:
3262         case URX_SETREF:
3263         case URX_BACKSLASH_D:
3264         case URX_BACKSLASH_H:
3265         case URX_BACKSLASH_R:
3266         case URX_BACKSLASH_V:
3267         case URX_ONECHAR_I:
3268         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
3269         case URX_DOTANY_ALL:    // . matches one or two.
3270         case URX_DOTANY:
3271         case URX_DOTANY_UNIX:
3272             currentLen = safeIncrement(currentLen, 1);
3273             break;
3274 
3275 
3276         case URX_JMPX:
3277             loc++;              // URX_JMPX has an extra operand, ignored here,
3278                                 //   otherwise processed identically to URX_JMP.
3279             U_FALLTHROUGH;
3280         case URX_JMP:
3281             {
3282                 int32_t  jmpDest = URX_VAL(op);
3283                 if (jmpDest < loc) {
3284                     // Loop of some kind.  Can safely ignore, the worst that will happen
3285                     //  is that we understate the true minimum length
3286                     currentLen = forwardedLength.elementAti(loc+1);
3287                 } else {
3288                     // Forward jump.  Propagate the current min length to the target loc of the jump.
3289                     U_ASSERT(jmpDest <= end+1);
3290                     if (forwardedLength.elementAti(jmpDest) > currentLen) {
3291                         forwardedLength.setElementAt(currentLen, jmpDest);
3292                     }
3293                 }
3294             }
3295             break;
3296 
3297         case URX_BACKTRACK:
3298             {
3299                 // Back-tracks are kind of like a branch, except that the min length was
3300                 //   propagated already, by the state save.
3301                 currentLen = forwardedLength.elementAti(loc+1);
3302             }
3303             break;
3304 
3305 
3306         case URX_STATE_SAVE:
3307             {
3308                 // State Save, for forward jumps, propagate the current minimum.
3309                 //             of the state save.
3310                 int32_t  jmpDest = URX_VAL(op);
3311                 if (jmpDest > loc) {
3312                     if (currentLen < forwardedLength.elementAti(jmpDest)) {
3313                         forwardedLength.setElementAt(currentLen, jmpDest);
3314                     }
3315                 }
3316             }
3317             break;
3318 
3319 
3320         case URX_STRING:
3321             {
3322                 loc++;
3323                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3324                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3325             }
3326             break;
3327 
3328 
3329         case URX_STRING_I:
3330             {
3331                 loc++;
3332                 // TODO: with full case folding, matching input text may be shorter than
3333                 //       the string we have here.  More smarts could put some bounds on it.
3334                 //       Assume a min length of one for now.  A min length of zero causes
3335                 //        optimization failures for a pattern like "string"+
3336                 // currentLen += URX_VAL(stringLenOp);
3337                 currentLen = safeIncrement(currentLen, 1);
3338             }
3339             break;
3340 
3341         case URX_CTR_INIT:
3342         case URX_CTR_INIT_NG:
3343             {
3344                 // Loop Init Ops.
3345                 //   If the min loop count == 0
3346                 //      move loc forwards to the end of the loop, skipping over the body.
3347                 //   If the min count is > 0,
3348                 //      continue normal processing of the body of the loop.
3349                 int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3350                         loopEndLoc   = URX_VAL(loopEndLoc);
3351                 int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3352                 if (minLoopCount == 0) {
3353                     loc = loopEndLoc;
3354                 } else {
3355                     loc+=3;  // Skips over operands of CTR_INIT
3356                 }
3357             }
3358             break;
3359 
3360 
3361         case URX_CTR_LOOP:
3362         case URX_CTR_LOOP_NG:
3363             // Loop ops.
3364             //  The jump is conditional, backwards only.
3365             break;
3366 
3367         case URX_LOOP_SR_I:
3368         case URX_LOOP_DOT_I:
3369         case URX_LOOP_C:
3370             // More loop ops.  These state-save to themselves.
3371             //   don't change the minimum match - could match nothing at all.
3372             break;
3373 
3374 
3375         case URX_LA_START:
3376         case URX_LB_START:
3377             {
3378                 // Look-around.  Scan forward until the matching look-ahead end,
3379                 //   without processing the look-around block.  This is overly pessimistic for look-ahead,
3380                 //   it assumes that the look-ahead match might be zero-length.
3381                 //   TODO:  Positive lookahead could recursively do the block, then continue
3382                 //          with the longer of the block or the value coming in.  Ticket 6060
3383                 int32_t  depth = (opType == URX_LA_START? 2: 1);;
3384                 for (;;) {
3385                     loc++;
3386                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3387                     if (URX_TYPE(op) == URX_LA_START) {
3388                         // The boilerplate for look-ahead includes two LA_END insturctions,
3389                         //    Depth will be decremented by each one when it is seen.
3390                         depth += 2;
3391                     }
3392                     if (URX_TYPE(op) == URX_LB_START) {
3393                         depth++;
3394                     }
3395                     if (URX_TYPE(op) == URX_LA_END) {
3396                         depth--;
3397                         if (depth == 0) {
3398                             break;
3399                         }
3400                     }
3401                     if (URX_TYPE(op)==URX_LBN_END) {
3402                         depth--;
3403                         if (depth == 0) {
3404                             break;
3405                         }
3406                     }
3407                     if (URX_TYPE(op) == URX_STATE_SAVE) {
3408                         // Need this because neg lookahead blocks will FAIL to outside
3409                         //   of the block.
3410                         int32_t  jmpDest = URX_VAL(op);
3411                         if (jmpDest > loc) {
3412                             if (currentLen < forwardedLength.elementAti(jmpDest)) {
3413                                 forwardedLength.setElementAt(currentLen, jmpDest);
3414                             }
3415                         }
3416                     }
3417                     U_ASSERT(loc <= end);
3418                 }
3419             }
3420             break;
3421 
3422         case URX_LA_END:
3423         case URX_LB_CONT:
3424         case URX_LB_END:
3425         case URX_LBN_CONT:
3426         case URX_LBN_END:
3427             // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3428             //   range being sized, which happens when measuring size of look-behind blocks.
3429             break;
3430 
3431         default:
3432             U_ASSERT(FALSE);
3433             }
3434 
3435         }
3436 
3437     // We have finished walking through the ops.  Check whether some forward jump
3438     //   propagated a shorter length to location end+1.
3439     if (forwardedLength.elementAti(end+1) < currentLen) {
3440         currentLen = forwardedLength.elementAti(end+1);
3441         U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3442     }
3443 
3444     return currentLen;
3445 }
3446 
3447 //------------------------------------------------------------------------------
3448 //
3449 //   maxMatchLength    Calculate the length of the longest string that could
3450 //                     match the specified pattern.
3451 //                     Length is in 16 bit code units, not code points.
3452 //
3453 //                     The calculated length may not be exact.  The returned
3454 //                     value may be longer than the actual maximum; it must
3455 //                     never be shorter.
3456 //
3457 //------------------------------------------------------------------------------
maxMatchLength(int32_t start,int32_t end)3458 int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3459     if (U_FAILURE(*fStatus)) {
3460         return 0;
3461     }
3462     U_ASSERT(start <= end);
3463     U_ASSERT(end < fRXPat->fCompiledPat->size());
3464 
3465 
3466     int32_t    loc;
3467     int32_t    op;
3468     int32_t    opType;
3469     int32_t    currentLen = 0;
3470     UVector32  forwardedLength(end+1, *fStatus);
3471     forwardedLength.setSize(end+1);
3472 
3473     for (loc=start; loc<=end; loc++) {
3474         forwardedLength.setElementAt(0, loc);
3475     }
3476 
3477     for (loc = start; loc<=end; loc++) {
3478         op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3479         opType = URX_TYPE(op);
3480 
3481         // The loop is advancing linearly through the pattern.
3482         // If the op we are now at was the destination of a branch in the pattern,
3483         // and that path has a longer maximum length than the current accumulated value,
3484         // replace the current accumulated value.
3485         if (forwardedLength.elementAti(loc) > currentLen) {
3486             currentLen = forwardedLength.elementAti(loc);
3487         }
3488 
3489         switch (opType) {
3490             // Ops that don't change the total length matched
3491         case URX_RESERVED_OP:
3492         case URX_END:
3493         case URX_STRING_LEN:
3494         case URX_NOP:
3495         case URX_START_CAPTURE:
3496         case URX_END_CAPTURE:
3497         case URX_BACKSLASH_B:
3498         case URX_BACKSLASH_BU:
3499         case URX_BACKSLASH_G:
3500         case URX_BACKSLASH_Z:
3501         case URX_CARET:
3502         case URX_DOLLAR:
3503         case URX_DOLLAR_M:
3504         case URX_DOLLAR_D:
3505         case URX_DOLLAR_MD:
3506         case URX_RELOC_OPRND:
3507         case URX_STO_INP_LOC:
3508         case URX_CARET_M:
3509         case URX_CARET_M_UNIX:
3510 
3511         case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3512         case URX_LD_SP:
3513 
3514         case URX_LB_END:
3515         case URX_LB_CONT:
3516         case URX_LBN_CONT:
3517         case URX_LBN_END:
3518             break;
3519 
3520 
3521             // Ops that increase that cause an unbounded increase in the length
3522             //   of a matched string, or that increase it a hard to characterize way.
3523             //   Call the max length unbounded, and stop further checking.
3524         case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3525         case URX_BACKREF_I:
3526         case URX_BACKSLASH_X:   // Grahpeme Cluster.  Minimum is 1, max unbounded.
3527             currentLen = INT32_MAX;
3528             break;
3529 
3530 
3531             // Ops that match a max of one character (possibly two 16 bit code units.)
3532             //
3533         case URX_STATIC_SETREF:
3534         case URX_STAT_SETREF_N:
3535         case URX_SETREF:
3536         case URX_BACKSLASH_D:
3537         case URX_BACKSLASH_H:
3538         case URX_BACKSLASH_R:
3539         case URX_BACKSLASH_V:
3540         case URX_ONECHAR_I:
3541         case URX_DOTANY_ALL:
3542         case URX_DOTANY:
3543         case URX_DOTANY_UNIX:
3544             currentLen = safeIncrement(currentLen, 2);
3545             break;
3546 
3547             // Single literal character.  Increase current max length by one or two,
3548             //       depending on whether the char is in the supplementary range.
3549         case URX_ONECHAR:
3550             currentLen = safeIncrement(currentLen, 1);
3551             if (URX_VAL(op) > 0x10000) {
3552                 currentLen = safeIncrement(currentLen, 1);
3553             }
3554             break;
3555 
3556             // Jumps.
3557             //
3558         case URX_JMP:
3559         case URX_JMPX:
3560         case URX_JMP_SAV:
3561         case URX_JMP_SAV_X:
3562             {
3563                 int32_t  jmpDest = URX_VAL(op);
3564                 if (jmpDest < loc) {
3565                     // Loop of some kind.  Max match length is unbounded.
3566                     currentLen = INT32_MAX;
3567                 } else {
3568                     // Forward jump.  Propagate the current min length to the target loc of the jump.
3569                     if (forwardedLength.elementAti(jmpDest) < currentLen) {
3570                         forwardedLength.setElementAt(currentLen, jmpDest);
3571                     }
3572                     currentLen = 0;
3573                 }
3574             }
3575             break;
3576 
3577         case URX_BACKTRACK:
3578             // back-tracks are kind of like a branch, except that the max length was
3579             //   propagated already, by the state save.
3580             currentLen = forwardedLength.elementAti(loc+1);
3581             break;
3582 
3583 
3584         case URX_STATE_SAVE:
3585             {
3586                 // State Save, for forward jumps, propagate the current minimum.
3587                 //               of the state save.
3588                 //             For backwards jumps, they create a loop, maximum
3589                 //               match length is unbounded.
3590                 int32_t  jmpDest = URX_VAL(op);
3591                 if (jmpDest > loc) {
3592                     if (currentLen > forwardedLength.elementAti(jmpDest)) {
3593                         forwardedLength.setElementAt(currentLen, jmpDest);
3594                     }
3595                 } else {
3596                     currentLen = INT32_MAX;
3597                 }
3598             }
3599             break;
3600 
3601 
3602 
3603 
3604         case URX_STRING:
3605             {
3606                 loc++;
3607                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3608                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3609                 break;
3610             }
3611 
3612         case URX_STRING_I:
3613             // TODO:  This code assumes that any user string that matches will be no longer
3614             //        than our compiled string, with case insensitive matching.
3615             //        Our compiled string has been case-folded already.
3616             //
3617             //        Any matching user string will have no more code points than our
3618             //        compiled (folded) string.  Folding may add code points, but
3619             //        not remove them.
3620             //
3621             //        There is a potential problem if a supplemental code point
3622             //        case-folds to a BMP code point.  In this case our compiled string
3623             //        could be shorter (in code units) than a matching user string.
3624             //
3625             //        At this time (Unicode 6.1) there are no such characters, and this case
3626             //        is not being handled.  A test, intltest regex/Bug9283, will fail if
3627             //        any problematic characters are added to Unicode.
3628             //
3629             //        If this happens, we can make a set of the BMP chars that the
3630             //        troublesome supplementals fold to, scan our string, and bump the
3631             //        currentLen one extra for each that is found.
3632             //
3633             {
3634                 loc++;
3635                 int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3636                 currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3637             }
3638             break;
3639 
3640         case URX_CTR_INIT:
3641         case URX_CTR_INIT_NG:
3642             // For Loops, recursively call this function on the pattern for the loop body,
3643             //   then multiply the result by the maximum loop count.
3644             {
3645                 int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3646                 if (loopEndLoc == loc+4) {
3647                     // Loop has an empty body. No affect on max match length.
3648                     // Continue processing with code after the loop end.
3649                     loc = loopEndLoc;
3650                     break;
3651                 }
3652 
3653                 int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3654                 if (maxLoopCount == -1) {
3655                     // Unbounded Loop. No upper bound on match length.
3656                     currentLen = INT32_MAX;
3657                     break;
3658                 }
3659 
3660                 U_ASSERT(loopEndLoc >= loc+4);
3661                 int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
3662                 int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount;
3663                 if (updatedLen >= INT32_MAX) {
3664                     currentLen = INT32_MAX;
3665                     break;
3666                 }
3667                 currentLen = (int32_t)updatedLen;
3668                 loc = loopEndLoc;
3669                 break;
3670             }
3671 
3672         case URX_CTR_LOOP:
3673         case URX_CTR_LOOP_NG:
3674             // These opcodes will be skipped over by code for URX_CRT_INIT.
3675             // We shouldn't encounter them here.
3676             U_ASSERT(FALSE);
3677             break;
3678 
3679         case URX_LOOP_SR_I:
3680         case URX_LOOP_DOT_I:
3681         case URX_LOOP_C:
3682             // For anything to do with loops, make the match length unbounded.
3683             currentLen = INT32_MAX;
3684             break;
3685 
3686 
3687 
3688         case URX_LA_START:
3689         case URX_LA_END:
3690             // Look-ahead.  Just ignore, treat the look-ahead block as if
3691             // it were normal pattern.  Gives a too-long match length,
3692             //  but good enough for now.
3693             break;
3694 
3695             // End of look-ahead ops should always be consumed by the processing at
3696             //  the URX_LA_START op.
3697             // U_ASSERT(FALSE);
3698             // break;
3699 
3700         case URX_LB_START:
3701             {
3702                 // Look-behind.  Scan forward until the matching look-around end,
3703                 //   without processing the look-behind block.
3704                 int32_t  depth = 0;
3705                 for (;;) {
3706                     loc++;
3707                     op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3708                     if (URX_TYPE(op) == URX_LA_START || URX_TYPE(op) == URX_LB_START) {
3709                         depth++;
3710                     }
3711                     if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3712                         if (depth == 0) {
3713                             break;
3714                         }
3715                         depth--;
3716                     }
3717                     U_ASSERT(loc < end);
3718                 }
3719             }
3720             break;
3721 
3722         default:
3723             U_ASSERT(FALSE);
3724         }
3725 
3726 
3727         if (currentLen == INT32_MAX) {
3728             //  The maximum length is unbounded.
3729             //  Stop further processing of the pattern.
3730             break;
3731         }
3732 
3733     }
3734     return currentLen;
3735 
3736 }
3737 
3738 
3739 //------------------------------------------------------------------------------
3740 //
3741 //   stripNOPs    Remove any NOP operations from the compiled pattern code.
3742 //                Extra NOPs are inserted for some constructs during the initial
3743 //                code generation to provide locations that may be patched later.
3744 //                Many end up unneeded, and are removed by this function.
3745 //
3746 //                In order to minimize the number of passes through the pattern,
3747 //                back-reference fixup is also performed here (adjusting
3748 //                back-reference operands to point to the correct frame offsets).
3749 //
3750 //------------------------------------------------------------------------------
stripNOPs()3751 void RegexCompile::stripNOPs() {
3752 
3753     if (U_FAILURE(*fStatus)) {
3754         return;
3755     }
3756 
3757     int32_t    end = fRXPat->fCompiledPat->size();
3758     UVector32  deltas(end, *fStatus);
3759 
3760     // Make a first pass over the code, computing the amount that things
3761     //   will be offset at each location in the original code.
3762     int32_t   loc;
3763     int32_t   d = 0;
3764     for (loc=0; loc<end; loc++) {
3765         deltas.addElement(d, *fStatus);
3766         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3767         if (URX_TYPE(op) == URX_NOP) {
3768             d++;
3769         }
3770     }
3771 
3772     UnicodeString caseStringBuffer;
3773 
3774     // Make a second pass over the code, removing the NOPs by moving following
3775     //  code up, and patching operands that refer to code locations that
3776     //  are being moved.  The array of offsets from the first step is used
3777     //  to compute the new operand values.
3778     int32_t src;
3779     int32_t dst = 0;
3780     for (src=0; src<end; src++) {
3781         int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3782         int32_t opType = URX_TYPE(op);
3783         switch (opType) {
3784         case URX_NOP:
3785             break;
3786 
3787         case URX_STATE_SAVE:
3788         case URX_JMP:
3789         case URX_CTR_LOOP:
3790         case URX_CTR_LOOP_NG:
3791         case URX_RELOC_OPRND:
3792         case URX_JMPX:
3793         case URX_JMP_SAV:
3794         case URX_JMP_SAV_X:
3795             // These are instructions with operands that refer to code locations.
3796             {
3797                 int32_t  operandAddress = URX_VAL(op);
3798                 U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3799                 int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3800                 op = buildOp(opType, fixedOperandAddress);
3801                 fRXPat->fCompiledPat->setElementAt(op, dst);
3802                 dst++;
3803                 break;
3804             }
3805 
3806         case URX_BACKREF:
3807         case URX_BACKREF_I:
3808             {
3809                 int32_t where = URX_VAL(op);
3810                 if (where > fRXPat->fGroupMap->size()) {
3811                     error(U_REGEX_INVALID_BACK_REF);
3812                     break;
3813                 }
3814                 where = fRXPat->fGroupMap->elementAti(where-1);
3815                 op    = buildOp(opType, where);
3816                 fRXPat->fCompiledPat->setElementAt(op, dst);
3817                 dst++;
3818 
3819                 fRXPat->fNeedsAltInput = TRUE;
3820                 break;
3821             }
3822         case URX_RESERVED_OP:
3823         case URX_RESERVED_OP_N:
3824         case URX_BACKTRACK:
3825         case URX_END:
3826         case URX_ONECHAR:
3827         case URX_STRING:
3828         case URX_STRING_LEN:
3829         case URX_START_CAPTURE:
3830         case URX_END_CAPTURE:
3831         case URX_STATIC_SETREF:
3832         case URX_STAT_SETREF_N:
3833         case URX_SETREF:
3834         case URX_DOTANY:
3835         case URX_FAIL:
3836         case URX_BACKSLASH_B:
3837         case URX_BACKSLASH_BU:
3838         case URX_BACKSLASH_G:
3839         case URX_BACKSLASH_X:
3840         case URX_BACKSLASH_Z:
3841         case URX_DOTANY_ALL:
3842         case URX_BACKSLASH_D:
3843         case URX_CARET:
3844         case URX_DOLLAR:
3845         case URX_CTR_INIT:
3846         case URX_CTR_INIT_NG:
3847         case URX_DOTANY_UNIX:
3848         case URX_STO_SP:
3849         case URX_LD_SP:
3850         case URX_STO_INP_LOC:
3851         case URX_LA_START:
3852         case URX_LA_END:
3853         case URX_ONECHAR_I:
3854         case URX_STRING_I:
3855         case URX_DOLLAR_M:
3856         case URX_CARET_M:
3857         case URX_CARET_M_UNIX:
3858         case URX_LB_START:
3859         case URX_LB_CONT:
3860         case URX_LB_END:
3861         case URX_LBN_CONT:
3862         case URX_LBN_END:
3863         case URX_LOOP_SR_I:
3864         case URX_LOOP_DOT_I:
3865         case URX_LOOP_C:
3866         case URX_DOLLAR_D:
3867         case URX_DOLLAR_MD:
3868         case URX_BACKSLASH_H:
3869         case URX_BACKSLASH_R:
3870         case URX_BACKSLASH_V:
3871             // These instructions are unaltered by the relocation.
3872             fRXPat->fCompiledPat->setElementAt(op, dst);
3873             dst++;
3874             break;
3875 
3876         default:
3877             // Some op is unaccounted for.
3878             U_ASSERT(FALSE);
3879             error(U_REGEX_INTERNAL_ERROR);
3880         }
3881     }
3882 
3883     fRXPat->fCompiledPat->setSize(dst);
3884 }
3885 
3886 
3887 
3888 
3889 //------------------------------------------------------------------------------
3890 //
3891 //  Error         Report a rule parse error.
3892 //                Only report it if no previous error has been recorded.
3893 //
3894 //------------------------------------------------------------------------------
error(UErrorCode e)3895 void RegexCompile::error(UErrorCode e) {
3896     if (U_SUCCESS(*fStatus) || e == U_MEMORY_ALLOCATION_ERROR) {
3897         *fStatus = e;
3898         // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3899         // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3900         // int64_t. If the values of the latter are out of range for the former,
3901         // set them to the appropriate "field not supported" values.
3902         if (fLineNum > 0x7FFFFFFF) {
3903             fParseErr->line   = 0;
3904             fParseErr->offset = -1;
3905         } else if (fCharNum > 0x7FFFFFFF) {
3906             fParseErr->line   = (int32_t)fLineNum;
3907             fParseErr->offset = -1;
3908         } else {
3909             fParseErr->line   = (int32_t)fLineNum;
3910             fParseErr->offset = (int32_t)fCharNum;
3911         }
3912 
3913         UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3914 
3915         // Fill in the context.
3916         //   Note: extractBetween() pins supplied indicies to the string bounds.
3917         uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
3918         uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3919         utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3920         utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3921     }
3922 }
3923 
3924 
3925 //
3926 //  Assorted Unicode character constants.
3927 //     Numeric because there is no portable way to enter them as literals.
3928 //     (Think EBCDIC).
3929 //
3930 static const UChar      chCR        = 0x0d;      // New lines, for terminating comments.
3931 static const UChar      chLF        = 0x0a;      // Line Feed
3932 static const UChar      chPound     = 0x23;      // '#', introduces a comment.
3933 static const UChar      chDigit0    = 0x30;      // '0'
3934 static const UChar      chDigit7    = 0x37;      // '9'
3935 static const UChar      chColon     = 0x3A;      // ':'
3936 static const UChar      chE         = 0x45;      // 'E'
3937 static const UChar      chQ         = 0x51;      // 'Q'
3938 //static const UChar      chN         = 0x4E;      // 'N'
3939 static const UChar      chP         = 0x50;      // 'P'
3940 static const UChar      chBackSlash = 0x5c;      // '\'  introduces a char escape
3941 //static const UChar      chLBracket  = 0x5b;      // '['
3942 static const UChar      chRBracket  = 0x5d;      // ']'
3943 static const UChar      chUp        = 0x5e;      // '^'
3944 static const UChar      chLowerP    = 0x70;
3945 static const UChar      chLBrace    = 0x7b;      // '{'
3946 static const UChar      chRBrace    = 0x7d;      // '}'
3947 static const UChar      chNEL       = 0x85;      //    NEL newline variant
3948 static const UChar      chLS        = 0x2028;    //    Unicode Line Separator
3949 
3950 
3951 //------------------------------------------------------------------------------
3952 //
3953 //  nextCharLL    Low Level Next Char from the regex pattern.
3954 //                Get a char from the string, keep track of input position
3955 //                     for error reporting.
3956 //
3957 //------------------------------------------------------------------------------
nextCharLL()3958 UChar32  RegexCompile::nextCharLL() {
3959     UChar32       ch;
3960 
3961     if (fPeekChar != -1) {
3962         ch = fPeekChar;
3963         fPeekChar = -1;
3964         return ch;
3965     }
3966 
3967     // assume we're already in the right place
3968     ch = UTEXT_NEXT32(fRXPat->fPattern);
3969     if (ch == U_SENTINEL) {
3970         return ch;
3971     }
3972 
3973     if (ch == chCR ||
3974         ch == chNEL ||
3975         ch == chLS   ||
3976         (ch == chLF && fLastChar != chCR)) {
3977         // Character is starting a new line.  Bump up the line number, and
3978         //  reset the column to 0.
3979         fLineNum++;
3980         fCharNum=0;
3981     }
3982     else {
3983         // Character is not starting a new line.  Except in the case of a
3984         //   LF following a CR, increment the column position.
3985         if (ch != chLF) {
3986             fCharNum++;
3987         }
3988     }
3989     fLastChar = ch;
3990     return ch;
3991 }
3992 
3993 //------------------------------------------------------------------------------
3994 //
3995 //   peekCharLL    Low Level Character Scanning, sneak a peek at the next
3996 //                 character without actually getting it.
3997 //
3998 //------------------------------------------------------------------------------
peekCharLL()3999 UChar32  RegexCompile::peekCharLL() {
4000     if (fPeekChar == -1) {
4001         fPeekChar = nextCharLL();
4002     }
4003     return fPeekChar;
4004 }
4005 
4006 
4007 //------------------------------------------------------------------------------
4008 //
4009 //   nextChar     for pattern scanning.  At this level, we handle stripping
4010 //                out comments and processing some backslash character escapes.
4011 //                The rest of the pattern grammar is handled at the next level up.
4012 //
4013 //------------------------------------------------------------------------------
nextChar(RegexPatternChar & c)4014 void RegexCompile::nextChar(RegexPatternChar &c) {
4015 
4016     fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4017     c.fChar    = nextCharLL();
4018     c.fQuoted  = FALSE;
4019 
4020     if (fQuoteMode) {
4021         c.fQuoted = TRUE;
4022         if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
4023             c.fChar == (UChar32)-1) {
4024             fQuoteMode = FALSE;  //  Exit quote mode,
4025             nextCharLL();        // discard the E
4026             nextChar(c);         // recurse to get the real next char
4027         }
4028     }
4029     else if (fInBackslashQuote) {
4030         // The current character immediately follows a '\'
4031         // Don't check for any further escapes, just return it as-is.
4032         // Don't set c.fQuoted, because that would prevent the state machine from
4033         //    dispatching on the character.
4034         fInBackslashQuote = FALSE;
4035     }
4036     else
4037     {
4038         // We are not in a \Q quoted region \E of the source.
4039         //
4040         if (fModeFlags & UREGEX_COMMENTS) {
4041             //
4042             // We are in free-spacing and comments mode.
4043             //  Scan through any white space and comments, until we
4044             //  reach a significant character or the end of inut.
4045             for (;;) {
4046                 if (c.fChar == (UChar32)-1) {
4047                     break;     // End of Input
4048                 }
4049                 if  (c.fChar == chPound && fEOLComments == TRUE) {
4050                     // Start of a comment.  Consume the rest of it, until EOF or a new line
4051                     for (;;) {
4052                         c.fChar = nextCharLL();
4053                         if (c.fChar == (UChar32)-1 ||  // EOF
4054                             c.fChar == chCR        ||
4055                             c.fChar == chLF        ||
4056                             c.fChar == chNEL       ||
4057                             c.fChar == chLS)       {
4058                             break;
4059                         }
4060                     }
4061                 }
4062                 // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
4063                 if (PatternProps::isWhiteSpace(c.fChar) == FALSE) {
4064                     break;
4065                 }
4066                 c.fChar = nextCharLL();
4067             }
4068         }
4069 
4070         //
4071         //  check for backslash escaped characters.
4072         //
4073         if (c.fChar == chBackSlash) {
4074             int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4075             if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4076                 //
4077                 // A '\' sequence that is handled by ICU's standard unescapeAt function.
4078                 //   Includes \uxxxx, \n, \r, many others.
4079                 //   Return the single equivalent character.
4080                 //
4081                 nextCharLL();                 // get & discard the peeked char.
4082                 c.fQuoted = TRUE;
4083 
4084                 if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4085                     int32_t endIndex = (int32_t)pos;
4086                     c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4087 
4088                     if (endIndex == pos) {
4089                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4090                     }
4091                     fCharNum += endIndex - pos;
4092                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4093                 } else {
4094                     int32_t offset = 0;
4095                     struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4096 
4097                     UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4098                     c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4099 
4100                     if (offset == 0) {
4101                         error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4102                     } else if (context.lastOffset == offset) {
4103                         UTEXT_PREVIOUS32(fRXPat->fPattern);
4104                     } else if (context.lastOffset != offset-1) {
4105                         utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
4106                     }
4107                     fCharNum += offset;
4108                 }
4109             }
4110             else if (peekCharLL() == chDigit0) {
4111                 //  Octal Escape, using Java Regexp Conventions
4112                 //    which are \0 followed by 1-3 octal digits.
4113                 //    Different from ICU Unescape handling of Octal, which does not
4114                 //    require the leading 0.
4115                 //  Java also has the convention of only consuming 2 octal digits if
4116                 //    the three digit number would be > 0xff
4117                 //
4118                 c.fChar = 0;
4119                 nextCharLL();    // Consume the initial 0.
4120                 int index;
4121                 for (index=0; index<3; index++) {
4122                     int32_t ch = peekCharLL();
4123                     if (ch<chDigit0 || ch>chDigit7) {
4124                         if (index==0) {
4125                            // \0 is not followed by any octal digits.
4126                            error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4127                         }
4128                         break;
4129                     }
4130                     c.fChar <<= 3;
4131                     c.fChar += ch&7;
4132                     if (c.fChar <= 255) {
4133                         nextCharLL();
4134                     } else {
4135                         // The last digit made the number too big.  Forget we saw it.
4136                         c.fChar >>= 3;
4137                     }
4138                 }
4139                 c.fQuoted = TRUE;
4140             }
4141             else if (peekCharLL() == chQ) {
4142                 //  "\Q"  enter quote mode, which will continue until "\E"
4143                 fQuoteMode = TRUE;
4144                 nextCharLL();       // discard the 'Q'.
4145                 nextChar(c);        // recurse to get the real next char.
4146             }
4147             else
4148             {
4149                 // We are in a '\' escape that will be handled by the state table scanner.
4150                 // Just return the backslash, but remember that the following char is to
4151                 //  be taken literally.
4152                 fInBackslashQuote = TRUE;
4153             }
4154         }
4155     }
4156 
4157     // re-enable # to end-of-line comments, in case they were disabled.
4158     // They are disabled by the parser upon seeing '(?', but this lasts for
4159     //  the fetching of the next character only.
4160     fEOLComments = TRUE;
4161 
4162     // putc(c.fChar, stdout);
4163 }
4164 
4165 
4166 
4167 //------------------------------------------------------------------------------
4168 //
4169 //  scanNamedChar
4170 //            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4171 //
4172 //             The scan position will be at the 'N'.  On return
4173 //             the scan position should be just after the '}'
4174 //
4175 //             Return the UChar32
4176 //
4177 //------------------------------------------------------------------------------
scanNamedChar()4178 UChar32  RegexCompile::scanNamedChar() {
4179     if (U_FAILURE(*fStatus)) {
4180         return 0;
4181     }
4182 
4183     nextChar(fC);
4184     if (fC.fChar != chLBrace) {
4185         error(U_REGEX_PROPERTY_SYNTAX);
4186         return 0;
4187     }
4188 
4189     UnicodeString  charName;
4190     for (;;) {
4191         nextChar(fC);
4192         if (fC.fChar == chRBrace) {
4193             break;
4194         }
4195         if (fC.fChar == -1) {
4196             error(U_REGEX_PROPERTY_SYNTAX);
4197             return 0;
4198         }
4199         charName.append(fC.fChar);
4200     }
4201 
4202     char name[100];
4203     if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
4204          (uint32_t)charName.length()>=sizeof(name)) {
4205         // All Unicode character names have only invariant characters.
4206         // The API to get a character, given a name, accepts only char *, forcing us to convert,
4207         //   which requires this error check
4208         error(U_REGEX_PROPERTY_SYNTAX);
4209         return 0;
4210     }
4211     charName.extract(0, charName.length(), name, sizeof(name), US_INV);
4212 
4213     UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4214     if (U_FAILURE(*fStatus)) {
4215         error(U_REGEX_PROPERTY_SYNTAX);
4216     }
4217 
4218     nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
4219     return theChar;
4220 }
4221 
4222 //------------------------------------------------------------------------------
4223 //
4224 //  scanProp   Construct a UnicodeSet from the text at the current scan
4225 //             position, which will be of the form \p{whaterver}
4226 //
4227 //             The scan position will be at the 'p' or 'P'.  On return
4228 //             the scan position should be just after the '}'
4229 //
4230 //             Return a UnicodeSet, constructed from the \P pattern,
4231 //             or NULL if the pattern is invalid.
4232 //
4233 //------------------------------------------------------------------------------
scanProp()4234 UnicodeSet *RegexCompile::scanProp() {
4235     UnicodeSet    *uset = NULL;
4236 
4237     if (U_FAILURE(*fStatus)) {
4238         return NULL;
4239     }
4240     (void)chLowerP;   // Suppress compiler unused variable warning.
4241     U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4242     UBool negated = (fC.fChar == chP);
4243 
4244     UnicodeString propertyName;
4245     nextChar(fC);
4246     if (fC.fChar != chLBrace) {
4247         error(U_REGEX_PROPERTY_SYNTAX);
4248         return NULL;
4249     }
4250     for (;;) {
4251         nextChar(fC);
4252         if (fC.fChar == chRBrace) {
4253             break;
4254         }
4255         if (fC.fChar == -1) {
4256             // Hit the end of the input string without finding the closing '}'
4257             error(U_REGEX_PROPERTY_SYNTAX);
4258             return NULL;
4259         }
4260         propertyName.append(fC.fChar);
4261     }
4262     uset = createSetForProperty(propertyName, negated);
4263     nextChar(fC);    // Move input scan to position following the closing '}'
4264     return uset;
4265 }
4266 
4267 //------------------------------------------------------------------------------
4268 //
4269 //  scanPosixProp   Construct a UnicodeSet from the text at the current scan
4270 //             position, which is expected be of the form [:property expression:]
4271 //
4272 //             The scan position will be at the opening ':'.  On return
4273 //             the scan position must be on the closing ']'
4274 //
4275 //             Return a UnicodeSet constructed from the pattern,
4276 //             or NULL if this is not a valid POSIX-style set expression.
4277 //             If not a property expression, restore the initial scan position
4278 //                (to the opening ':')
4279 //
4280 //               Note:  the opening '[:' is not sufficient to guarantee that
4281 //                      this is a [:property:] expression.
4282 //                      [:'+=,] is a perfectly good ordinary set expression that
4283 //                              happens to include ':' as one of its characters.
4284 //
4285 //------------------------------------------------------------------------------
scanPosixProp()4286 UnicodeSet *RegexCompile::scanPosixProp() {
4287     UnicodeSet    *uset = NULL;
4288 
4289     if (U_FAILURE(*fStatus)) {
4290         return NULL;
4291     }
4292 
4293     U_ASSERT(fC.fChar == chColon);
4294 
4295     // Save the scanner state.
4296     // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
4297     int64_t     savedScanIndex        = fScanIndex;
4298     int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4299     UBool       savedQuoteMode        = fQuoteMode;
4300     UBool       savedInBackslashQuote = fInBackslashQuote;
4301     UBool       savedEOLComments      = fEOLComments;
4302     int64_t     savedLineNum          = fLineNum;
4303     int64_t     savedCharNum          = fCharNum;
4304     UChar32     savedLastChar         = fLastChar;
4305     UChar32     savedPeekChar         = fPeekChar;
4306     RegexPatternChar savedfC          = fC;
4307 
4308     // Scan for a closing ].   A little tricky because there are some perverse
4309     //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
4310     //   ending on the second closing ].
4311 
4312     UnicodeString propName;
4313     UBool         negated  = FALSE;
4314 
4315     // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
4316     nextChar(fC);
4317     if (fC.fChar == chUp) {
4318        negated = TRUE;
4319        nextChar(fC);
4320     }
4321 
4322     // Scan for the closing ":]", collecting the property name along the way.
4323     UBool  sawPropSetTerminator = FALSE;
4324     for (;;) {
4325         propName.append(fC.fChar);
4326         nextChar(fC);
4327         if (fC.fQuoted || fC.fChar == -1) {
4328             // Escaped characters or end of input - either says this isn't a [:Property:]
4329             break;
4330         }
4331         if (fC.fChar == chColon) {
4332             nextChar(fC);
4333             if (fC.fChar == chRBracket) {
4334                 sawPropSetTerminator = TRUE;
4335             }
4336             break;
4337         }
4338     }
4339 
4340     if (sawPropSetTerminator) {
4341         uset = createSetForProperty(propName, negated);
4342     }
4343     else
4344     {
4345         // No closing ":]".
4346         //  Restore the original scan position.
4347         //  The main scanner will retry the input as a normal set expression,
4348         //    not a [:Property:] expression.
4349         fScanIndex        = savedScanIndex;
4350         fQuoteMode        = savedQuoteMode;
4351         fInBackslashQuote = savedInBackslashQuote;
4352         fEOLComments      = savedEOLComments;
4353         fLineNum          = savedLineNum;
4354         fCharNum          = savedCharNum;
4355         fLastChar         = savedLastChar;
4356         fPeekChar         = savedPeekChar;
4357         fC                = savedfC;
4358         UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4359     }
4360     return uset;
4361 }
4362 
addIdentifierIgnorable(UnicodeSet * set,UErrorCode & ec)4363 static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4364     set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4365     addCategory(set, U_GC_CF_MASK, ec);
4366 }
4367 
4368 //
4369 //  Create a Unicode Set from a Unicode Property expression.
4370 //     This is common code underlying both \p{...} ane [:...:] expressions.
4371 //     Includes trying the Java "properties" that aren't supported as
4372 //     normal ICU UnicodeSet properties
4373 //
createSetForProperty(const UnicodeString & propName,UBool negated)4374 UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4375 
4376     if (U_FAILURE(*fStatus)) {
4377         return nullptr;
4378     }
4379     LocalPointer<UnicodeSet> set;
4380     UErrorCode status = U_ZERO_ERROR;
4381 
4382     do {      // non-loop, exists to allow breaks from the block.
4383         //
4384         //  First try the property as we received it
4385         //
4386         UnicodeString   setExpr;
4387         uint32_t        usetFlags = 0;
4388         setExpr.append(u"[\\p{", -1);
4389         setExpr.append(propName);
4390         setExpr.append(u"}]", -1);
4391         if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4392             usetFlags |= USET_CASE_INSENSITIVE;
4393         }
4394         set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, NULL, status), status);
4395         if (U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) {
4396             break;
4397         }
4398 
4399         //
4400         //  The incoming property wasn't directly recognized by ICU.
4401 
4402         //  Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet.
4403         //     Java accepts 'word' with mixed case.
4404         //     Java accepts 'all' only in all lower case.
4405 
4406         status = U_ZERO_ERROR;
4407         if (propName.caseCompare(u"word", -1, 0) == 0) {
4408             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])), status);
4409             break;
4410         }
4411         if (propName.compare(u"all", -1) == 0) {
4412             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status);
4413             break;
4414         }
4415 
4416 
4417         //    Do Java InBlock expressions
4418         //
4419         UnicodeString mPropName = propName;
4420         if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) {
4421             status = U_ZERO_ERROR;
4422             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4423             if (U_FAILURE(status)) {
4424                 break;
4425             }
4426             UnicodeString blockName(mPropName, 2);  // Property with the leading "In" removed.
4427             set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status);
4428             break;
4429         }
4430 
4431         //  Check for the Java form "IsBooleanPropertyValue", which we will recast
4432         //  as "BooleanPropertyValue". The property value can be either a
4433         //  a General Category or a Script Name.
4434 
4435         if (propName.startsWith(u"Is", 2) && propName.length()>=3) {
4436             mPropName.remove(0, 2);      // Strip the "Is"
4437             if (mPropName.indexOf(u'=') >= 0) {
4438                 // Reject any "Is..." property expression containing an '=', that is,
4439                 // any non-binary property expression.
4440                 status = U_REGEX_PROPERTY_SYNTAX;
4441                 break;
4442             }
4443 
4444             if (mPropName.caseCompare(u"assigned", -1, 0) == 0) {
4445                 mPropName.setTo(u"unassigned", -1);
4446                 negated = !negated;
4447             } else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) {
4448                 mPropName.setTo(u"Titlecase_Letter", -1);
4449             }
4450 
4451             mPropName.insert(0, u"[\\p{", -1);
4452             mPropName.append(u"}]", -1);
4453             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status);
4454 
4455             if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4456                 set->closeOver(USET_CASE_INSENSITIVE);
4457             }
4458             break;
4459 
4460         }
4461 
4462         if (propName.startsWith(u"java", -1)) {
4463             status = U_ZERO_ERROR;
4464             set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4465             if (U_FAILURE(status)) {
4466                 break;
4467             }
4468             //
4469             //  Try the various Java specific properties.
4470             //   These all begin with "java"
4471             //
4472             if (propName.compare(u"javaDefined", -1) == 0) {
4473                 addCategory(set.getAlias(), U_GC_CN_MASK, status);
4474                 set->complement();
4475             }
4476             else if (propName.compare(u"javaDigit", -1) == 0) {
4477                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4478             }
4479             else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) {
4480                 addIdentifierIgnorable(set.getAlias(), status);
4481             }
4482             else if (propName.compare(u"javaISOControl", -1) == 0) {
4483                 set->add(0, 0x1F).add(0x7F, 0x9F);
4484             }
4485             else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) {
4486                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4487                 addCategory(set.getAlias(), U_GC_SC_MASK, status);
4488                 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4489                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4490                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4491                 addCategory(set.getAlias(), U_GC_MC_MASK, status);
4492                 addCategory(set.getAlias(), U_GC_MN_MASK, status);
4493                 addIdentifierIgnorable(set.getAlias(), status);
4494             }
4495             else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) {
4496                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4497                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4498                 addCategory(set.getAlias(), U_GC_SC_MASK, status);
4499                 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4500             }
4501             else if (propName.compare(u"javaLetter", -1) == 0) {
4502                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4503             }
4504             else if (propName.compare(u"javaLetterOrDigit", -1) == 0) {
4505                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4506                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4507             }
4508             else if (propName.compare(u"javaLowerCase", -1) == 0) {
4509                 addCategory(set.getAlias(), U_GC_LL_MASK, status);
4510             }
4511             else if (propName.compare(u"javaMirrored", -1) == 0) {
4512                 set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status);
4513             }
4514             else if (propName.compare(u"javaSpaceChar", -1) == 0) {
4515                 addCategory(set.getAlias(), U_GC_Z_MASK, status);
4516             }
4517             else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) {
4518                 set->add(0x10000, UnicodeSet::MAX_VALUE);
4519             }
4520             else if (propName.compare(u"javaTitleCase", -1) == 0) {
4521                 addCategory(set.getAlias(), U_GC_LT_MASK, status);
4522             }
4523             else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) {
4524                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4525                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4526             }
4527             else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) {
4528                 addCategory(set.getAlias(), U_GC_L_MASK, status);
4529                 addCategory(set.getAlias(), U_GC_PC_MASK, status);
4530                 addCategory(set.getAlias(), U_GC_ND_MASK, status);
4531                 addCategory(set.getAlias(), U_GC_NL_MASK, status);
4532                 addCategory(set.getAlias(), U_GC_MC_MASK, status);
4533                 addCategory(set.getAlias(), U_GC_MN_MASK, status);
4534                 addIdentifierIgnorable(set.getAlias(), status);
4535             }
4536             else if (propName.compare(u"javaUpperCase", -1) == 0) {
4537                 addCategory(set.getAlias(), U_GC_LU_MASK, status);
4538             }
4539             else if (propName.compare(u"javaValidCodePoint", -1) == 0) {
4540                 set->add(0, UnicodeSet::MAX_VALUE);
4541             }
4542             else if (propName.compare(u"javaWhitespace", -1) == 0) {
4543                 addCategory(set.getAlias(), U_GC_Z_MASK, status);
4544                 set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4545                 set->add(9, 0x0d).add(0x1c, 0x1f);
4546             } else {
4547                 status = U_REGEX_PROPERTY_SYNTAX;
4548             }
4549 
4550             if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4551                 set->closeOver(USET_CASE_INSENSITIVE);
4552             }
4553             break;
4554         }
4555 
4556         // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
4557         // extensions matched it.
4558         status = U_REGEX_PROPERTY_SYNTAX;
4559     } while (false);   // End of do loop block. Code above breaks out of the block on success or hard failure.
4560 
4561     if (U_SUCCESS(status)) {
4562         U_ASSERT(set.isValid());
4563         if (negated) {
4564             set->complement();
4565         }
4566         return set.orphan();
4567     } else {
4568         if (status == U_ILLEGAL_ARGUMENT_ERROR) {
4569             status = U_REGEX_PROPERTY_SYNTAX;
4570         }
4571         error(status);
4572         return nullptr;
4573     }
4574 }
4575 
4576 
4577 //
4578 //  SetEval   Part of the evaluation of [set expressions].
4579 //            Perform any pending (stacked) operations with precedence
4580 //            equal or greater to that of the next operator encountered
4581 //            in the expression.
4582 //
setEval(int32_t nextOp)4583 void RegexCompile::setEval(int32_t nextOp) {
4584     UnicodeSet *rightOperand = NULL;
4585     UnicodeSet *leftOperand  = NULL;
4586     for (;;) {
4587         U_ASSERT(fSetOpStack.empty()==FALSE);
4588         int32_t pendingSetOperation = fSetOpStack.peeki();
4589         if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4590             break;
4591         }
4592         fSetOpStack.popi();
4593         U_ASSERT(fSetStack.empty() == FALSE);
4594         rightOperand = (UnicodeSet *)fSetStack.peek();
4595         switch (pendingSetOperation) {
4596             case setNegation:
4597                 rightOperand->complement();
4598                 break;
4599             case setCaseClose:
4600                 // TODO: need a simple close function.  Ticket 6065
4601                 rightOperand->closeOver(USET_CASE_INSENSITIVE);
4602                 rightOperand->removeAllStrings();
4603                 break;
4604             case setDifference1:
4605             case setDifference2:
4606                 fSetStack.pop();
4607                 leftOperand = (UnicodeSet *)fSetStack.peek();
4608                 leftOperand->removeAll(*rightOperand);
4609                 delete rightOperand;
4610                 break;
4611             case setIntersection1:
4612             case setIntersection2:
4613                 fSetStack.pop();
4614                 leftOperand = (UnicodeSet *)fSetStack.peek();
4615                 leftOperand->retainAll(*rightOperand);
4616                 delete rightOperand;
4617                 break;
4618             case setUnion:
4619                 fSetStack.pop();
4620                 leftOperand = (UnicodeSet *)fSetStack.peek();
4621                 leftOperand->addAll(*rightOperand);
4622                 delete rightOperand;
4623                 break;
4624             default:
4625                 U_ASSERT(FALSE);
4626                 break;
4627             }
4628         }
4629     }
4630 
setPushOp(int32_t op)4631 void RegexCompile::setPushOp(int32_t op) {
4632     setEval(op);
4633     fSetOpStack.push(op, *fStatus);
4634     fSetStack.push(new UnicodeSet(), *fStatus);
4635 }
4636 
4637 U_NAMESPACE_END
4638 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
4639 
4640