1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/regexp-parser.h"
6 
7 #include "src/char-predicates-inl.h"
8 #include "src/factory.h"
9 #include "src/isolate.h"
10 #include "src/objects-inl.h"
11 #include "src/regexp/jsregexp.h"
12 #include "src/utils.h"
13 
14 namespace v8 {
15 namespace internal {
16 
RegExpParser(FlatStringReader * in,Handle<String> * error,bool multiline,bool unicode,Isolate * isolate,Zone * zone)17 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
18                            bool multiline, bool unicode, Isolate* isolate,
19                            Zone* zone)
20     : isolate_(isolate),
21       zone_(zone),
22       error_(error),
23       captures_(NULL),
24       in_(in),
25       current_(kEndMarker),
26       next_pos_(0),
27       captures_started_(0),
28       capture_count_(0),
29       has_more_(true),
30       multiline_(multiline),
31       unicode_(unicode),
32       simple_(false),
33       contains_anchor_(false),
34       is_scanned_for_captures_(false),
35       failed_(false) {
36   Advance();
37 }
38 
39 
Next()40 uc32 RegExpParser::Next() {
41   if (has_next()) {
42     return in()->Get(next_pos_);
43   } else {
44     return kEndMarker;
45   }
46 }
47 
48 
Advance()49 void RegExpParser::Advance() {
50   if (next_pos_ < in()->length()) {
51     StackLimitCheck check(isolate());
52     if (check.HasOverflowed()) {
53       ReportError(CStrVector(Isolate::kStackOverflowMessage));
54     } else if (zone()->excess_allocation()) {
55       ReportError(CStrVector("Regular expression too large"));
56     } else {
57       current_ = in()->Get(next_pos_);
58       next_pos_++;
59       // Read the whole surrogate pair in case of unicode flag, if possible.
60       if (unicode_ && next_pos_ < in()->length() &&
61           unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) {
62         uc16 trail = in()->Get(next_pos_);
63         if (unibrow::Utf16::IsTrailSurrogate(trail)) {
64           current_ = unibrow::Utf16::CombineSurrogatePair(
65               static_cast<uc16>(current_), trail);
66           next_pos_++;
67         }
68       }
69     }
70   } else {
71     current_ = kEndMarker;
72     // Advance so that position() points to 1-after-the-last-character. This is
73     // important so that Reset() to this position works correctly.
74     next_pos_ = in()->length() + 1;
75     has_more_ = false;
76   }
77 }
78 
79 
Reset(int pos)80 void RegExpParser::Reset(int pos) {
81   next_pos_ = pos;
82   has_more_ = (pos < in()->length());
83   Advance();
84 }
85 
86 
Advance(int dist)87 void RegExpParser::Advance(int dist) {
88   next_pos_ += dist - 1;
89   Advance();
90 }
91 
92 
simple()93 bool RegExpParser::simple() { return simple_; }
94 
95 
IsSyntaxCharacter(uc32 c)96 bool RegExpParser::IsSyntaxCharacter(uc32 c) {
97   return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' ||
98          c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' ||
99          c == '{' || c == '}' || c == '|';
100 }
101 
102 
ReportError(Vector<const char> message)103 RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
104   failed_ = true;
105   *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();
106   // Zip to the end to make sure the no more input is read.
107   current_ = kEndMarker;
108   next_pos_ = in()->length();
109   return NULL;
110 }
111 
112 
113 #define CHECK_FAILED /**/); \
114   if (failed_) return NULL; \
115   ((void)0
116 
117 
118 // Pattern ::
119 //   Disjunction
ParsePattern()120 RegExpTree* RegExpParser::ParsePattern() {
121   RegExpTree* result = ParseDisjunction(CHECK_FAILED);
122   DCHECK(!has_more());
123   // If the result of parsing is a literal string atom, and it has the
124   // same length as the input, then the atom is identical to the input.
125   if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
126     simple_ = true;
127   }
128   return result;
129 }
130 
131 
132 // Disjunction ::
133 //   Alternative
134 //   Alternative | Disjunction
135 // Alternative ::
136 //   [empty]
137 //   Term Alternative
138 // Term ::
139 //   Assertion
140 //   Atom
141 //   Atom Quantifier
ParseDisjunction()142 RegExpTree* RegExpParser::ParseDisjunction() {
143   // Used to store current state while parsing subexpressions.
144   RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
145                                   zone());
146   RegExpParserState* state = &initial_state;
147   // Cache the builder in a local variable for quick access.
148   RegExpBuilder* builder = initial_state.builder();
149   while (true) {
150     switch (current()) {
151       case kEndMarker:
152         if (state->IsSubexpression()) {
153           // Inside a parenthesized group when hitting end of input.
154           ReportError(CStrVector("Unterminated group") CHECK_FAILED);
155         }
156         DCHECK_EQ(INITIAL, state->group_type());
157         // Parsing completed successfully.
158         return builder->ToRegExp();
159       case ')': {
160         if (!state->IsSubexpression()) {
161           ReportError(CStrVector("Unmatched ')'") CHECK_FAILED);
162         }
163         DCHECK_NE(INITIAL, state->group_type());
164 
165         Advance();
166         // End disjunction parsing and convert builder content to new single
167         // regexp atom.
168         RegExpTree* body = builder->ToRegExp();
169 
170         int end_capture_index = captures_started();
171 
172         int capture_index = state->capture_index();
173         SubexpressionType group_type = state->group_type();
174 
175         // Build result of subexpression.
176         if (group_type == CAPTURE) {
177           RegExpCapture* capture = GetCapture(capture_index);
178           capture->set_body(body);
179           body = capture;
180         } else if (group_type != GROUPING) {
181           DCHECK(group_type == POSITIVE_LOOKAROUND ||
182                  group_type == NEGATIVE_LOOKAROUND);
183           bool is_positive = (group_type == POSITIVE_LOOKAROUND);
184           body = new (zone()) RegExpLookaround(
185               body, is_positive, end_capture_index - capture_index,
186               capture_index, state->lookaround_type());
187         }
188 
189         // Restore previous state.
190         state = state->previous_state();
191         builder = state->builder();
192 
193         builder->AddAtom(body);
194         // For compatability with JSC and ES3, we allow quantifiers after
195         // lookaheads, and break in all cases.
196         break;
197       }
198       case '|': {
199         Advance();
200         builder->NewAlternative();
201         continue;
202       }
203       case '*':
204       case '+':
205       case '?':
206         return ReportError(CStrVector("Nothing to repeat"));
207       case '^': {
208         Advance();
209         if (multiline_) {
210           builder->AddAssertion(
211               new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
212         } else {
213           builder->AddAssertion(
214               new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
215           set_contains_anchor();
216         }
217         continue;
218       }
219       case '$': {
220         Advance();
221         RegExpAssertion::AssertionType assertion_type =
222             multiline_ ? RegExpAssertion::END_OF_LINE
223                        : RegExpAssertion::END_OF_INPUT;
224         builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
225         continue;
226       }
227       case '.': {
228         Advance();
229         // everything except \x0a, \x0d, \u2028 and \u2029
230         ZoneList<CharacterRange>* ranges =
231             new (zone()) ZoneList<CharacterRange>(2, zone());
232         CharacterRange::AddClassEscape('.', ranges, zone());
233         RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);
234         builder->AddAtom(atom);
235         break;
236       }
237       case '(': {
238         SubexpressionType subexpr_type = CAPTURE;
239         RegExpLookaround::Type lookaround_type = state->lookaround_type();
240         Advance();
241         if (current() == '?') {
242           switch (Next()) {
243             case ':':
244               subexpr_type = GROUPING;
245               break;
246             case '=':
247               lookaround_type = RegExpLookaround::LOOKAHEAD;
248               subexpr_type = POSITIVE_LOOKAROUND;
249               break;
250             case '!':
251               lookaround_type = RegExpLookaround::LOOKAHEAD;
252               subexpr_type = NEGATIVE_LOOKAROUND;
253               break;
254             case '<':
255               if (FLAG_harmony_regexp_lookbehind) {
256                 Advance();
257                 lookaround_type = RegExpLookaround::LOOKBEHIND;
258                 if (Next() == '=') {
259                   subexpr_type = POSITIVE_LOOKAROUND;
260                   break;
261                 } else if (Next() == '!') {
262                   subexpr_type = NEGATIVE_LOOKAROUND;
263                   break;
264                 }
265               }
266             // Fall through.
267             default:
268               ReportError(CStrVector("Invalid group") CHECK_FAILED);
269               break;
270           }
271           Advance(2);
272         } else {
273           if (captures_started_ >= kMaxCaptures) {
274             ReportError(CStrVector("Too many captures") CHECK_FAILED);
275           }
276           captures_started_++;
277         }
278         // Store current state and begin new disjunction parsing.
279         state = new (zone()) RegExpParserState(
280             state, subexpr_type, lookaround_type, captures_started_, zone());
281         builder = state->builder();
282         continue;
283       }
284       case '[': {
285         RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);
286         builder->AddAtom(atom);
287         break;
288       }
289       // Atom ::
290       //   \ AtomEscape
291       case '\\':
292         switch (Next()) {
293           case kEndMarker:
294             return ReportError(CStrVector("\\ at end of pattern"));
295           case 'b':
296             Advance(2);
297             builder->AddAssertion(
298                 new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
299             continue;
300           case 'B':
301             Advance(2);
302             builder->AddAssertion(
303                 new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
304             continue;
305           // AtomEscape ::
306           //   CharacterClassEscape
307           //
308           // CharacterClassEscape :: one of
309           //   d D s S w W
310           case 'd':
311           case 'D':
312           case 's':
313           case 'S':
314           case 'w':
315           case 'W': {
316             uc32 c = Next();
317             Advance(2);
318             ZoneList<CharacterRange>* ranges =
319                 new (zone()) ZoneList<CharacterRange>(2, zone());
320             CharacterRange::AddClassEscape(c, ranges, zone());
321             RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);
322             builder->AddAtom(atom);
323             break;
324           }
325           case '1':
326           case '2':
327           case '3':
328           case '4':
329           case '5':
330           case '6':
331           case '7':
332           case '8':
333           case '9': {
334             int index = 0;
335             if (ParseBackReferenceIndex(&index)) {
336               if (state->IsInsideCaptureGroup(index)) {
337                 // The back reference is inside the capture group it refers to.
338                 // Nothing can possibly have been captured yet, so we use empty
339                 // instead. This ensures that, when checking a back reference,
340                 // the capture registers of the referenced capture are either
341                 // both set or both cleared.
342                 builder->AddEmpty();
343               } else {
344                 RegExpCapture* capture = GetCapture(index);
345                 RegExpTree* atom = new (zone()) RegExpBackReference(capture);
346                 builder->AddAtom(atom);
347               }
348               break;
349             }
350             uc32 first_digit = Next();
351             if (first_digit == '8' || first_digit == '9') {
352               // If the 'u' flag is present, only syntax characters can be
353               // escaped,
354               // no other identity escapes are allowed. If the 'u' flag is not
355               // present, all identity escapes are allowed.
356               if (!unicode_) {
357                 builder->AddCharacter(first_digit);
358                 Advance(2);
359               } else {
360                 return ReportError(CStrVector("Invalid escape"));
361               }
362               break;
363             }
364           }
365           // FALLTHROUGH
366           case '0': {
367             Advance();
368             uc32 octal = ParseOctalLiteral();
369             builder->AddCharacter(octal);
370             break;
371           }
372           // ControlEscape :: one of
373           //   f n r t v
374           case 'f':
375             Advance(2);
376             builder->AddCharacter('\f');
377             break;
378           case 'n':
379             Advance(2);
380             builder->AddCharacter('\n');
381             break;
382           case 'r':
383             Advance(2);
384             builder->AddCharacter('\r');
385             break;
386           case 't':
387             Advance(2);
388             builder->AddCharacter('\t');
389             break;
390           case 'v':
391             Advance(2);
392             builder->AddCharacter('\v');
393             break;
394           case 'c': {
395             Advance();
396             uc32 controlLetter = Next();
397             // Special case if it is an ASCII letter.
398             // Convert lower case letters to uppercase.
399             uc32 letter = controlLetter & ~('a' ^ 'A');
400             if (letter < 'A' || 'Z' < letter) {
401               // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
402               // This is outside the specification. We match JSC in
403               // reading the backslash as a literal character instead
404               // of as starting an escape.
405               builder->AddCharacter('\\');
406             } else {
407               Advance(2);
408               builder->AddCharacter(controlLetter & 0x1f);
409             }
410             break;
411           }
412           case 'x': {
413             Advance(2);
414             uc32 value;
415             if (ParseHexEscape(2, &value)) {
416               builder->AddCharacter(value);
417             } else if (!unicode_) {
418               builder->AddCharacter('x');
419             } else {
420               // If the 'u' flag is present, invalid escapes are not treated as
421               // identity escapes.
422               return ReportError(CStrVector("Invalid escape"));
423             }
424             break;
425           }
426           case 'u': {
427             Advance(2);
428             uc32 value;
429             if (ParseUnicodeEscape(&value)) {
430               builder->AddUnicodeCharacter(value);
431             } else if (!unicode_) {
432               builder->AddCharacter('u');
433             } else {
434               // If the 'u' flag is present, invalid escapes are not treated as
435               // identity escapes.
436               return ReportError(CStrVector("Invalid unicode escape"));
437             }
438             break;
439           }
440           default:
441             Advance();
442             // If the 'u' flag is present, only syntax characters can be
443             // escaped, no
444             // other identity escapes are allowed. If the 'u' flag is not
445             // present,
446             // all identity escapes are allowed.
447             if (!unicode_ || IsSyntaxCharacter(current())) {
448               builder->AddCharacter(current());
449               Advance();
450             } else {
451               return ReportError(CStrVector("Invalid escape"));
452             }
453             break;
454         }
455         break;
456       case '{': {
457         int dummy;
458         if (ParseIntervalQuantifier(&dummy, &dummy)) {
459           ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);
460         }
461         // fallthrough
462       }
463       default:
464         builder->AddUnicodeCharacter(current());
465         Advance();
466         break;
467     }  // end switch(current())
468 
469     int min;
470     int max;
471     switch (current()) {
472       // QuantifierPrefix ::
473       //   *
474       //   +
475       //   ?
476       //   {
477       case '*':
478         min = 0;
479         max = RegExpTree::kInfinity;
480         Advance();
481         break;
482       case '+':
483         min = 1;
484         max = RegExpTree::kInfinity;
485         Advance();
486         break;
487       case '?':
488         min = 0;
489         max = 1;
490         Advance();
491         break;
492       case '{':
493         if (ParseIntervalQuantifier(&min, &max)) {
494           if (max < min) {
495             ReportError(CStrVector("numbers out of order in {} quantifier.")
496                             CHECK_FAILED);
497           }
498           break;
499         } else {
500           continue;
501         }
502       default:
503         continue;
504     }
505     RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
506     if (current() == '?') {
507       quantifier_type = RegExpQuantifier::NON_GREEDY;
508       Advance();
509     } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
510       // FLAG_regexp_possessive_quantifier is a debug-only flag.
511       quantifier_type = RegExpQuantifier::POSSESSIVE;
512       Advance();
513     }
514     builder->AddQuantifierToAtom(min, max, quantifier_type);
515   }
516 }
517 
518 
519 #ifdef DEBUG
520 // Currently only used in an DCHECK.
IsSpecialClassEscape(uc32 c)521 static bool IsSpecialClassEscape(uc32 c) {
522   switch (c) {
523     case 'd':
524     case 'D':
525     case 's':
526     case 'S':
527     case 'w':
528     case 'W':
529       return true;
530     default:
531       return false;
532   }
533 }
534 #endif
535 
536 
537 // In order to know whether an escape is a backreference or not we have to scan
538 // the entire regexp and find the number of capturing parentheses.  However we
539 // don't want to scan the regexp twice unless it is necessary.  This mini-parser
540 // is called when needed.  It can see the difference between capturing and
541 // noncapturing parentheses and can skip character classes and backslash-escaped
542 // characters.
ScanForCaptures()543 void RegExpParser::ScanForCaptures() {
544   // Start with captures started previous to current position
545   int capture_count = captures_started();
546   // Add count of captures after this position.
547   int n;
548   while ((n = current()) != kEndMarker) {
549     Advance();
550     switch (n) {
551       case '\\':
552         Advance();
553         break;
554       case '[': {
555         int c;
556         while ((c = current()) != kEndMarker) {
557           Advance();
558           if (c == '\\') {
559             Advance();
560           } else {
561             if (c == ']') break;
562           }
563         }
564         break;
565       }
566       case '(':
567         if (current() != '?') capture_count++;
568         break;
569     }
570   }
571   capture_count_ = capture_count;
572   is_scanned_for_captures_ = true;
573 }
574 
575 
ParseBackReferenceIndex(int * index_out)576 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
577   DCHECK_EQ('\\', current());
578   DCHECK('1' <= Next() && Next() <= '9');
579   // Try to parse a decimal literal that is no greater than the total number
580   // of left capturing parentheses in the input.
581   int start = position();
582   int value = Next() - '0';
583   Advance(2);
584   while (true) {
585     uc32 c = current();
586     if (IsDecimalDigit(c)) {
587       value = 10 * value + (c - '0');
588       if (value > kMaxCaptures) {
589         Reset(start);
590         return false;
591       }
592       Advance();
593     } else {
594       break;
595     }
596   }
597   if (value > captures_started()) {
598     if (!is_scanned_for_captures_) {
599       int saved_position = position();
600       ScanForCaptures();
601       Reset(saved_position);
602     }
603     if (value > capture_count_) {
604       Reset(start);
605       return false;
606     }
607   }
608   *index_out = value;
609   return true;
610 }
611 
612 
GetCapture(int index)613 RegExpCapture* RegExpParser::GetCapture(int index) {
614   // The index for the capture groups are one-based. Its index in the list is
615   // zero-based.
616   int know_captures =
617       is_scanned_for_captures_ ? capture_count_ : captures_started_;
618   DCHECK(index <= know_captures);
619   if (captures_ == NULL) {
620     captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
621   }
622   while (captures_->length() < know_captures) {
623     captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
624   }
625   return captures_->at(index - 1);
626 }
627 
628 
IsInsideCaptureGroup(int index)629 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
630   for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
631     if (s->group_type() != CAPTURE) continue;
632     // Return true if we found the matching capture index.
633     if (index == s->capture_index()) return true;
634     // Abort if index is larger than what has been parsed up till this state.
635     if (index > s->capture_index()) return false;
636   }
637   return false;
638 }
639 
640 
641 // QuantifierPrefix ::
642 //   { DecimalDigits }
643 //   { DecimalDigits , }
644 //   { DecimalDigits , DecimalDigits }
645 //
646 // Returns true if parsing succeeds, and set the min_out and max_out
647 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
ParseIntervalQuantifier(int * min_out,int * max_out)648 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
649   DCHECK_EQ(current(), '{');
650   int start = position();
651   Advance();
652   int min = 0;
653   if (!IsDecimalDigit(current())) {
654     Reset(start);
655     return false;
656   }
657   while (IsDecimalDigit(current())) {
658     int next = current() - '0';
659     if (min > (RegExpTree::kInfinity - next) / 10) {
660       // Overflow. Skip past remaining decimal digits and return -1.
661       do {
662         Advance();
663       } while (IsDecimalDigit(current()));
664       min = RegExpTree::kInfinity;
665       break;
666     }
667     min = 10 * min + next;
668     Advance();
669   }
670   int max = 0;
671   if (current() == '}') {
672     max = min;
673     Advance();
674   } else if (current() == ',') {
675     Advance();
676     if (current() == '}') {
677       max = RegExpTree::kInfinity;
678       Advance();
679     } else {
680       while (IsDecimalDigit(current())) {
681         int next = current() - '0';
682         if (max > (RegExpTree::kInfinity - next) / 10) {
683           do {
684             Advance();
685           } while (IsDecimalDigit(current()));
686           max = RegExpTree::kInfinity;
687           break;
688         }
689         max = 10 * max + next;
690         Advance();
691       }
692       if (current() != '}') {
693         Reset(start);
694         return false;
695       }
696       Advance();
697     }
698   } else {
699     Reset(start);
700     return false;
701   }
702   *min_out = min;
703   *max_out = max;
704   return true;
705 }
706 
707 
ParseOctalLiteral()708 uc32 RegExpParser::ParseOctalLiteral() {
709   DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
710   // For compatibility with some other browsers (not all), we parse
711   // up to three octal digits with a value below 256.
712   uc32 value = current() - '0';
713   Advance();
714   if ('0' <= current() && current() <= '7') {
715     value = value * 8 + current() - '0';
716     Advance();
717     if (value < 32 && '0' <= current() && current() <= '7') {
718       value = value * 8 + current() - '0';
719       Advance();
720     }
721   }
722   return value;
723 }
724 
725 
ParseHexEscape(int length,uc32 * value)726 bool RegExpParser::ParseHexEscape(int length, uc32* value) {
727   int start = position();
728   uc32 val = 0;
729   for (int i = 0; i < length; ++i) {
730     uc32 c = current();
731     int d = HexValue(c);
732     if (d < 0) {
733       Reset(start);
734       return false;
735     }
736     val = val * 16 + d;
737     Advance();
738   }
739   *value = val;
740   return true;
741 }
742 
743 
ParseUnicodeEscape(uc32 * value)744 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
745   // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
746   // allowed). In the latter case, the number of hex digits between { } is
747   // arbitrary. \ and u have already been read.
748   if (current() == '{' && unicode_) {
749     int start = position();
750     Advance();
751     if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
752       if (current() == '}') {
753         Advance();
754         return true;
755       }
756     }
757     Reset(start);
758     return false;
759   }
760   // \u but no {, or \u{...} escapes not allowed.
761   return ParseHexEscape(4, value);
762 }
763 
764 
ParseUnlimitedLengthHexNumber(int max_value,uc32 * value)765 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
766   uc32 x = 0;
767   int d = HexValue(current());
768   if (d < 0) {
769     return false;
770   }
771   while (d >= 0) {
772     x = x * 16 + d;
773     if (x > max_value) {
774       return false;
775     }
776     Advance();
777     d = HexValue(current());
778   }
779   *value = x;
780   return true;
781 }
782 
783 
ParseClassCharacterEscape()784 uc32 RegExpParser::ParseClassCharacterEscape() {
785   DCHECK(current() == '\\');
786   DCHECK(has_next() && !IsSpecialClassEscape(Next()));
787   Advance();
788   switch (current()) {
789     case 'b':
790       Advance();
791       return '\b';
792     // ControlEscape :: one of
793     //   f n r t v
794     case 'f':
795       Advance();
796       return '\f';
797     case 'n':
798       Advance();
799       return '\n';
800     case 'r':
801       Advance();
802       return '\r';
803     case 't':
804       Advance();
805       return '\t';
806     case 'v':
807       Advance();
808       return '\v';
809     case 'c': {
810       uc32 controlLetter = Next();
811       uc32 letter = controlLetter & ~('A' ^ 'a');
812       // For compatibility with JSC, inside a character class
813       // we also accept digits and underscore as control characters.
814       if ((controlLetter >= '0' && controlLetter <= '9') ||
815           controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) {
816         Advance(2);
817         // Control letters mapped to ASCII control characters in the range
818         // 0x00-0x1f.
819         return controlLetter & 0x1f;
820       }
821       // We match JSC in reading the backslash as a literal
822       // character instead of as starting an escape.
823       return '\\';
824     }
825     case '0':
826     case '1':
827     case '2':
828     case '3':
829     case '4':
830     case '5':
831     case '6':
832     case '7':
833       // For compatibility, we interpret a decimal escape that isn't
834       // a back reference (and therefore either \0 or not valid according
835       // to the specification) as a 1..3 digit octal character code.
836       return ParseOctalLiteral();
837     case 'x': {
838       Advance();
839       uc32 value;
840       if (ParseHexEscape(2, &value)) {
841         return value;
842       }
843       if (!unicode_) {
844         // If \x is not followed by a two-digit hexadecimal, treat it
845         // as an identity escape.
846         return 'x';
847       }
848       // If the 'u' flag is present, invalid escapes are not treated as
849       // identity escapes.
850       ReportError(CStrVector("Invalid escape"));
851       return 0;
852     }
853     case 'u': {
854       Advance();
855       uc32 value;
856       if (ParseUnicodeEscape(&value)) {
857         return value;
858       }
859       if (!unicode_) {
860         return 'u';
861       }
862       // If the 'u' flag is present, invalid escapes are not treated as
863       // identity escapes.
864       ReportError(CStrVector("Invalid unicode escape"));
865       return 0;
866     }
867     default: {
868       uc32 result = current();
869       // If the 'u' flag is present, only syntax characters can be escaped, no
870       // other identity escapes are allowed. If the 'u' flag is not present, all
871       // identity escapes are allowed.
872       if (!unicode_ || IsSyntaxCharacter(result)) {
873         Advance();
874         return result;
875       }
876       ReportError(CStrVector("Invalid escape"));
877       return 0;
878     }
879   }
880   return 0;
881 }
882 
883 
ParseClassAtom(uc16 * char_class)884 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
885   DCHECK_EQ(0, *char_class);
886   uc32 first = current();
887   if (first == '\\') {
888     switch (Next()) {
889       case 'w':
890       case 'W':
891       case 'd':
892       case 'D':
893       case 's':
894       case 'S': {
895         *char_class = Next();
896         Advance(2);
897         return CharacterRange::Singleton(0);  // Return dummy value.
898       }
899       case kEndMarker:
900         return ReportError(CStrVector("\\ at end of pattern"));
901       default:
902         uc32 c = ParseClassCharacterEscape(CHECK_FAILED);
903         return CharacterRange::Singleton(c);
904     }
905   } else {
906     Advance();
907     return CharacterRange::Singleton(first);
908   }
909 }
910 
911 
912 static const uc16 kNoCharClass = 0;
913 
914 // Adds range or pre-defined character class to character ranges.
915 // If char_class is not kInvalidClass, it's interpreted as a class
916 // escape (i.e., 's' means whitespace, from '\s').
AddRangeOrEscape(ZoneList<CharacterRange> * ranges,uc16 char_class,CharacterRange range,Zone * zone)917 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
918                                     uc16 char_class, CharacterRange range,
919                                     Zone* zone) {
920   if (char_class != kNoCharClass) {
921     CharacterRange::AddClassEscape(char_class, ranges, zone);
922   } else {
923     ranges->Add(range, zone);
924   }
925 }
926 
927 
ParseCharacterClass()928 RegExpTree* RegExpParser::ParseCharacterClass() {
929   static const char* kUnterminated = "Unterminated character class";
930   static const char* kRangeOutOfOrder = "Range out of order in character class";
931 
932   DCHECK_EQ(current(), '[');
933   Advance();
934   bool is_negated = false;
935   if (current() == '^') {
936     is_negated = true;
937     Advance();
938   }
939   ZoneList<CharacterRange>* ranges =
940       new (zone()) ZoneList<CharacterRange>(2, zone());
941   while (has_more() && current() != ']') {
942     uc16 char_class = kNoCharClass;
943     CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
944     if (current() == '-') {
945       Advance();
946       if (current() == kEndMarker) {
947         // If we reach the end we break out of the loop and let the
948         // following code report an error.
949         break;
950       } else if (current() == ']') {
951         AddRangeOrEscape(ranges, char_class, first, zone());
952         ranges->Add(CharacterRange::Singleton('-'), zone());
953         break;
954       }
955       uc16 char_class_2 = kNoCharClass;
956       CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
957       if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
958         // Either end is an escaped character class. Treat the '-' verbatim.
959         AddRangeOrEscape(ranges, char_class, first, zone());
960         ranges->Add(CharacterRange::Singleton('-'), zone());
961         AddRangeOrEscape(ranges, char_class_2, next, zone());
962         continue;
963       }
964       if (first.from() > next.to()) {
965         return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);
966       }
967       ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
968     } else {
969       AddRangeOrEscape(ranges, char_class, first, zone());
970     }
971   }
972   if (!has_more()) {
973     return ReportError(CStrVector(kUnterminated) CHECK_FAILED);
974   }
975   Advance();
976   if (ranges->length() == 0) {
977     ranges->Add(CharacterRange::Everything(), zone());
978     is_negated = !is_negated;
979   }
980   return new (zone()) RegExpCharacterClass(ranges, is_negated);
981 }
982 
983 
984 #undef CHECK_FAILED
985 
986 
ParseRegExp(Isolate * isolate,Zone * zone,FlatStringReader * input,bool multiline,bool unicode,RegExpCompileData * result)987 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
988                                FlatStringReader* input, bool multiline,
989                                bool unicode, RegExpCompileData* result) {
990   DCHECK(result != NULL);
991   RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone);
992   RegExpTree* tree = parser.ParsePattern();
993   if (parser.failed()) {
994     DCHECK(tree == NULL);
995     DCHECK(!result->error.is_null());
996   } else {
997     DCHECK(tree != NULL);
998     DCHECK(result->error.is_null());
999     if (FLAG_trace_regexp_parser) {
1000       OFStream os(stdout);
1001       tree->Print(os, zone);
1002       os << "\n";
1003     }
1004     result->tree = tree;
1005     int capture_count = parser.captures_started();
1006     result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1007     result->contains_anchor = parser.contains_anchor();
1008     result->capture_count = capture_count;
1009   }
1010   return !parser.failed();
1011 }
1012 
1013 
RegExpBuilder(Zone * zone)1014 RegExpBuilder::RegExpBuilder(Zone* zone)
1015     : zone_(zone),
1016       pending_empty_(false),
1017       characters_(NULL),
1018       terms_(),
1019       alternatives_()
1020 #ifdef DEBUG
1021       ,
1022       last_added_(ADD_NONE)
1023 #endif
1024 {
1025 }
1026 
1027 
FlushCharacters()1028 void RegExpBuilder::FlushCharacters() {
1029   pending_empty_ = false;
1030   if (characters_ != NULL) {
1031     RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
1032     characters_ = NULL;
1033     text_.Add(atom, zone());
1034     LAST(ADD_ATOM);
1035   }
1036 }
1037 
1038 
FlushText()1039 void RegExpBuilder::FlushText() {
1040   FlushCharacters();
1041   int num_text = text_.length();
1042   if (num_text == 0) {
1043     return;
1044   } else if (num_text == 1) {
1045     terms_.Add(text_.last(), zone());
1046   } else {
1047     RegExpText* text = new (zone()) RegExpText(zone());
1048     for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1049     terms_.Add(text, zone());
1050   }
1051   text_.Clear();
1052 }
1053 
1054 
AddCharacter(uc16 c)1055 void RegExpBuilder::AddCharacter(uc16 c) {
1056   pending_empty_ = false;
1057   if (characters_ == NULL) {
1058     characters_ = new (zone()) ZoneList<uc16>(4, zone());
1059   }
1060   characters_->Add(c, zone());
1061   LAST(ADD_CHAR);
1062 }
1063 
1064 
AddUnicodeCharacter(uc32 c)1065 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1066   if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
1067     ZoneList<uc16> surrogate_pair(2, zone());
1068     surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone());
1069     surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone());
1070     RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
1071     AddAtom(atom);
1072   } else {
1073     AddCharacter(static_cast<uc16>(c));
1074   }
1075 }
1076 
1077 
AddEmpty()1078 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1079 
1080 
AddAtom(RegExpTree * term)1081 void RegExpBuilder::AddAtom(RegExpTree* term) {
1082   if (term->IsEmpty()) {
1083     AddEmpty();
1084     return;
1085   }
1086   if (term->IsTextElement()) {
1087     FlushCharacters();
1088     text_.Add(term, zone());
1089   } else {
1090     FlushText();
1091     terms_.Add(term, zone());
1092   }
1093   LAST(ADD_ATOM);
1094 }
1095 
1096 
AddAssertion(RegExpTree * assert)1097 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1098   FlushText();
1099   terms_.Add(assert, zone());
1100   LAST(ADD_ASSERT);
1101 }
1102 
1103 
NewAlternative()1104 void RegExpBuilder::NewAlternative() { FlushTerms(); }
1105 
1106 
FlushTerms()1107 void RegExpBuilder::FlushTerms() {
1108   FlushText();
1109   int num_terms = terms_.length();
1110   RegExpTree* alternative;
1111   if (num_terms == 0) {
1112     alternative = new (zone()) RegExpEmpty();
1113   } else if (num_terms == 1) {
1114     alternative = terms_.last();
1115   } else {
1116     alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
1117   }
1118   alternatives_.Add(alternative, zone());
1119   terms_.Clear();
1120   LAST(ADD_NONE);
1121 }
1122 
1123 
ToRegExp()1124 RegExpTree* RegExpBuilder::ToRegExp() {
1125   FlushTerms();
1126   int num_alternatives = alternatives_.length();
1127   if (num_alternatives == 0) return new (zone()) RegExpEmpty();
1128   if (num_alternatives == 1) return alternatives_.last();
1129   return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
1130 }
1131 
1132 
AddQuantifierToAtom(int min,int max,RegExpQuantifier::QuantifierType quantifier_type)1133 void RegExpBuilder::AddQuantifierToAtom(
1134     int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
1135   if (pending_empty_) {
1136     pending_empty_ = false;
1137     return;
1138   }
1139   RegExpTree* atom;
1140   if (characters_ != NULL) {
1141     DCHECK(last_added_ == ADD_CHAR);
1142     // Last atom was character.
1143     Vector<const uc16> char_vector = characters_->ToConstVector();
1144     int num_chars = char_vector.length();
1145     if (num_chars > 1) {
1146       Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
1147       text_.Add(new (zone()) RegExpAtom(prefix), zone());
1148       char_vector = char_vector.SubVector(num_chars - 1, num_chars);
1149     }
1150     characters_ = NULL;
1151     atom = new (zone()) RegExpAtom(char_vector);
1152     FlushText();
1153   } else if (text_.length() > 0) {
1154     DCHECK(last_added_ == ADD_ATOM);
1155     atom = text_.RemoveLast();
1156     FlushText();
1157   } else if (terms_.length() > 0) {
1158     DCHECK(last_added_ == ADD_ATOM);
1159     atom = terms_.RemoveLast();
1160     if (atom->max_match() == 0) {
1161       // Guaranteed to only match an empty string.
1162       LAST(ADD_TERM);
1163       if (min == 0) {
1164         return;
1165       }
1166       terms_.Add(atom, zone());
1167       return;
1168     }
1169   } else {
1170     // Only call immediately after adding an atom or character!
1171     UNREACHABLE();
1172     return;
1173   }
1174   terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1175              zone());
1176   LAST(ADD_TERM);
1177 }
1178 
1179 }  // namespace internal
1180 }  // namespace v8
1181