1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/regexp-parser.h"
6 
7 #include "src/char-predicates-inl.h"
8 #include "src/factory.h"
9 #include "src/isolate.h"
10 #include "src/objects-inl.h"
11 #include "src/ostreams.h"
12 #include "src/regexp/jsregexp.h"
13 #include "src/utils.h"
14 
15 #ifdef V8_I18N_SUPPORT
16 #include "unicode/uset.h"
17 #endif  // V8_I18N_SUPPORT
18 
19 namespace v8 {
20 namespace internal {
21 
RegExpParser(FlatStringReader * in,Handle<String> * error,JSRegExp::Flags flags,Isolate * isolate,Zone * zone)22 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
23                            JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
24     : isolate_(isolate),
25       zone_(zone),
26       error_(error),
27       captures_(NULL),
28       named_captures_(NULL),
29       named_back_references_(NULL),
30       in_(in),
31       current_(kEndMarker),
32       ignore_case_(flags & JSRegExp::kIgnoreCase),
33       multiline_(flags & JSRegExp::kMultiline),
34       unicode_(flags & JSRegExp::kUnicode),
35       next_pos_(0),
36       captures_started_(0),
37       capture_count_(0),
38       has_more_(true),
39       simple_(false),
40       contains_anchor_(false),
41       is_scanned_for_captures_(false),
42       failed_(false) {
43   Advance();
44 }
45 
46 template <bool update_position>
ReadNext()47 inline uc32 RegExpParser::ReadNext() {
48   int position = next_pos_;
49   uc32 c0 = in()->Get(position);
50   position++;
51   // Read the whole surrogate pair in case of unicode flag, if possible.
52   if (unicode() && position < in()->length() &&
53       unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
54     uc16 c1 = in()->Get(position);
55     if (unibrow::Utf16::IsTrailSurrogate(c1)) {
56       c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
57       position++;
58     }
59   }
60   if (update_position) next_pos_ = position;
61   return c0;
62 }
63 
64 
Next()65 uc32 RegExpParser::Next() {
66   if (has_next()) {
67     return ReadNext<false>();
68   } else {
69     return kEndMarker;
70   }
71 }
72 
73 
Advance()74 void RegExpParser::Advance() {
75   if (has_next()) {
76     StackLimitCheck check(isolate());
77     if (check.HasOverflowed()) {
78       ReportError(CStrVector(
79           MessageTemplate::TemplateString(MessageTemplate::kStackOverflow)));
80     } else if (zone()->excess_allocation()) {
81       ReportError(CStrVector("Regular expression too large"));
82     } else {
83       current_ = ReadNext<true>();
84     }
85   } else {
86     current_ = kEndMarker;
87     // Advance so that position() points to 1-after-the-last-character. This is
88     // important so that Reset() to this position works correctly.
89     next_pos_ = in()->length() + 1;
90     has_more_ = false;
91   }
92 }
93 
94 
Reset(int pos)95 void RegExpParser::Reset(int pos) {
96   next_pos_ = pos;
97   has_more_ = (pos < in()->length());
98   Advance();
99 }
100 
101 
Advance(int dist)102 void RegExpParser::Advance(int dist) {
103   next_pos_ += dist - 1;
104   Advance();
105 }
106 
107 
simple()108 bool RegExpParser::simple() { return simple_; }
109 
IsSyntaxCharacterOrSlash(uc32 c)110 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
111   switch (c) {
112     case '^':
113     case '$':
114     case '\\':
115     case '.':
116     case '*':
117     case '+':
118     case '?':
119     case '(':
120     case ')':
121     case '[':
122     case ']':
123     case '{':
124     case '}':
125     case '|':
126     case '/':
127       return true;
128     default:
129       break;
130   }
131   return false;
132 }
133 
134 
ReportError(Vector<const char> message)135 RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
136   if (failed_) return NULL;  // Do not overwrite any existing error.
137   failed_ = true;
138   *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();
139   // Zip to the end to make sure the no more input is read.
140   current_ = kEndMarker;
141   next_pos_ = in()->length();
142   return NULL;
143 }
144 
145 
146 #define CHECK_FAILED /**/); \
147   if (failed_) return NULL; \
148   ((void)0
149 
150 
151 // Pattern ::
152 //   Disjunction
ParsePattern()153 RegExpTree* RegExpParser::ParsePattern() {
154   RegExpTree* result = ParseDisjunction(CHECK_FAILED);
155   PatchNamedBackReferences(CHECK_FAILED);
156   DCHECK(!has_more());
157   // If the result of parsing is a literal string atom, and it has the
158   // same length as the input, then the atom is identical to the input.
159   if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
160     simple_ = true;
161   }
162   return result;
163 }
164 
165 
166 // Disjunction ::
167 //   Alternative
168 //   Alternative | Disjunction
169 // Alternative ::
170 //   [empty]
171 //   Term Alternative
172 // Term ::
173 //   Assertion
174 //   Atom
175 //   Atom Quantifier
ParseDisjunction()176 RegExpTree* RegExpParser::ParseDisjunction() {
177   // Used to store current state while parsing subexpressions.
178   RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
179                                   nullptr, ignore_case(), unicode(), zone());
180   RegExpParserState* state = &initial_state;
181   // Cache the builder in a local variable for quick access.
182   RegExpBuilder* builder = initial_state.builder();
183   while (true) {
184     switch (current()) {
185       case kEndMarker:
186         if (state->IsSubexpression()) {
187           // Inside a parenthesized group when hitting end of input.
188           return ReportError(CStrVector("Unterminated group"));
189         }
190         DCHECK_EQ(INITIAL, state->group_type());
191         // Parsing completed successfully.
192         return builder->ToRegExp();
193       case ')': {
194         if (!state->IsSubexpression()) {
195           return ReportError(CStrVector("Unmatched ')'"));
196         }
197         DCHECK_NE(INITIAL, state->group_type());
198 
199         Advance();
200         // End disjunction parsing and convert builder content to new single
201         // regexp atom.
202         RegExpTree* body = builder->ToRegExp();
203 
204         int end_capture_index = captures_started();
205 
206         int capture_index = state->capture_index();
207         SubexpressionType group_type = state->group_type();
208 
209         // Build result of subexpression.
210         if (group_type == CAPTURE) {
211           if (state->IsNamedCapture()) {
212             CreateNamedCaptureAtIndex(state->capture_name(),
213                                       capture_index CHECK_FAILED);
214           }
215           RegExpCapture* capture = GetCapture(capture_index);
216           capture->set_body(body);
217           body = capture;
218         } else if (group_type != GROUPING) {
219           DCHECK(group_type == POSITIVE_LOOKAROUND ||
220                  group_type == NEGATIVE_LOOKAROUND);
221           bool is_positive = (group_type == POSITIVE_LOOKAROUND);
222           body = new (zone()) RegExpLookaround(
223               body, is_positive, end_capture_index - capture_index,
224               capture_index, state->lookaround_type());
225         }
226 
227         // Restore previous state.
228         state = state->previous_state();
229         builder = state->builder();
230 
231         builder->AddAtom(body);
232         // For compatability with JSC and ES3, we allow quantifiers after
233         // lookaheads, and break in all cases.
234         break;
235       }
236       case '|': {
237         Advance();
238         builder->NewAlternative();
239         continue;
240       }
241       case '*':
242       case '+':
243       case '?':
244         return ReportError(CStrVector("Nothing to repeat"));
245       case '^': {
246         Advance();
247         if (multiline()) {
248           builder->AddAssertion(
249               new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
250         } else {
251           builder->AddAssertion(
252               new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
253           set_contains_anchor();
254         }
255         continue;
256       }
257       case '$': {
258         Advance();
259         RegExpAssertion::AssertionType assertion_type =
260             multiline() ? RegExpAssertion::END_OF_LINE
261                         : RegExpAssertion::END_OF_INPUT;
262         builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
263         continue;
264       }
265       case '.': {
266         Advance();
267         // everything except \x0a, \x0d, \u2028 and \u2029
268         ZoneList<CharacterRange>* ranges =
269             new (zone()) ZoneList<CharacterRange>(2, zone());
270         CharacterRange::AddClassEscape('.', ranges, zone());
271         RegExpCharacterClass* cc =
272             new (zone()) RegExpCharacterClass(ranges, false);
273         builder->AddCharacterClass(cc);
274         break;
275       }
276       case '(': {
277         SubexpressionType subexpr_type = CAPTURE;
278         RegExpLookaround::Type lookaround_type = state->lookaround_type();
279         bool is_named_capture = false;
280         Advance();
281         if (current() == '?') {
282           switch (Next()) {
283             case ':':
284               subexpr_type = GROUPING;
285               Advance(2);
286               break;
287             case '=':
288               lookaround_type = RegExpLookaround::LOOKAHEAD;
289               subexpr_type = POSITIVE_LOOKAROUND;
290               Advance(2);
291               break;
292             case '!':
293               lookaround_type = RegExpLookaround::LOOKAHEAD;
294               subexpr_type = NEGATIVE_LOOKAROUND;
295               Advance(2);
296               break;
297             case '<':
298               Advance();
299               if (FLAG_harmony_regexp_lookbehind) {
300                 if (Next() == '=') {
301                   subexpr_type = POSITIVE_LOOKAROUND;
302                   lookaround_type = RegExpLookaround::LOOKBEHIND;
303                   Advance(2);
304                   break;
305                 } else if (Next() == '!') {
306                   subexpr_type = NEGATIVE_LOOKAROUND;
307                   lookaround_type = RegExpLookaround::LOOKBEHIND;
308                   Advance(2);
309                   break;
310                 }
311               }
312               if (FLAG_harmony_regexp_named_captures && unicode()) {
313                 is_named_capture = true;
314                 Advance();
315                 break;
316               }
317             // Fall through.
318             default:
319               return ReportError(CStrVector("Invalid group"));
320           }
321         }
322 
323         const ZoneVector<uc16>* capture_name = nullptr;
324         if (subexpr_type == CAPTURE) {
325           if (captures_started_ >= kMaxCaptures) {
326             return ReportError(CStrVector("Too many captures"));
327           }
328           captures_started_++;
329 
330           if (is_named_capture) {
331             capture_name = ParseCaptureGroupName(CHECK_FAILED);
332           }
333         }
334         // Store current state and begin new disjunction parsing.
335         state = new (zone()) RegExpParserState(
336             state, subexpr_type, lookaround_type, captures_started_,
337             capture_name, ignore_case(), unicode(), zone());
338         builder = state->builder();
339         continue;
340       }
341       case '[': {
342         RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
343         builder->AddCharacterClass(cc->AsCharacterClass());
344         break;
345       }
346       // Atom ::
347       //   \ AtomEscape
348       case '\\':
349         switch (Next()) {
350           case kEndMarker:
351             return ReportError(CStrVector("\\ at end of pattern"));
352           case 'b':
353             Advance(2);
354             builder->AddAssertion(
355                 new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
356             continue;
357           case 'B':
358             Advance(2);
359             builder->AddAssertion(
360                 new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
361             continue;
362           // AtomEscape ::
363           //   CharacterClassEscape
364           //
365           // CharacterClassEscape :: one of
366           //   d D s S w W
367           case 'd':
368           case 'D':
369           case 's':
370           case 'S':
371           case 'w':
372           case 'W': {
373             uc32 c = Next();
374             Advance(2);
375             ZoneList<CharacterRange>* ranges =
376                 new (zone()) ZoneList<CharacterRange>(2, zone());
377             CharacterRange::AddClassEscape(c, ranges, zone());
378             RegExpCharacterClass* cc =
379                 new (zone()) RegExpCharacterClass(ranges, false);
380             builder->AddCharacterClass(cc);
381             break;
382           }
383           case 'p':
384           case 'P': {
385             uc32 p = Next();
386             Advance(2);
387             if (unicode()) {
388               if (FLAG_harmony_regexp_property) {
389                 ZoneList<CharacterRange>* ranges =
390                     new (zone()) ZoneList<CharacterRange>(2, zone());
391                 if (!ParsePropertyClass(ranges, p == 'P')) {
392                   return ReportError(CStrVector("Invalid property name"));
393                 }
394                 RegExpCharacterClass* cc =
395                     new (zone()) RegExpCharacterClass(ranges, false);
396                 builder->AddCharacterClass(cc);
397               } else {
398                 // With /u, no identity escapes except for syntax characters
399                 // are allowed. Otherwise, all identity escapes are allowed.
400                 return ReportError(CStrVector("Invalid escape"));
401               }
402             } else {
403               builder->AddCharacter(p);
404             }
405             break;
406           }
407           case '1':
408           case '2':
409           case '3':
410           case '4':
411           case '5':
412           case '6':
413           case '7':
414           case '8':
415           case '9': {
416             int index = 0;
417             bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
418             if (is_backref) {
419               if (state->IsInsideCaptureGroup(index)) {
420                 // The back reference is inside the capture group it refers to.
421                 // Nothing can possibly have been captured yet, so we use empty
422                 // instead. This ensures that, when checking a back reference,
423                 // the capture registers of the referenced capture are either
424                 // both set or both cleared.
425                 builder->AddEmpty();
426               } else {
427                 RegExpCapture* capture = GetCapture(index);
428                 RegExpTree* atom = new (zone()) RegExpBackReference(capture);
429                 builder->AddAtom(atom);
430               }
431               break;
432             }
433             // With /u, no identity escapes except for syntax characters
434             // are allowed. Otherwise, all identity escapes are allowed.
435             if (unicode()) {
436               return ReportError(CStrVector("Invalid escape"));
437             }
438             uc32 first_digit = Next();
439             if (first_digit == '8' || first_digit == '9') {
440               builder->AddCharacter(first_digit);
441               Advance(2);
442               break;
443             }
444           }
445           // Fall through.
446           case '0': {
447             Advance();
448             if (unicode() && Next() >= '0' && Next() <= '9') {
449               // With /u, decimal escape with leading 0 are not parsed as octal.
450               return ReportError(CStrVector("Invalid decimal escape"));
451             }
452             uc32 octal = ParseOctalLiteral();
453             builder->AddCharacter(octal);
454             break;
455           }
456           // ControlEscape :: one of
457           //   f n r t v
458           case 'f':
459             Advance(2);
460             builder->AddCharacter('\f');
461             break;
462           case 'n':
463             Advance(2);
464             builder->AddCharacter('\n');
465             break;
466           case 'r':
467             Advance(2);
468             builder->AddCharacter('\r');
469             break;
470           case 't':
471             Advance(2);
472             builder->AddCharacter('\t');
473             break;
474           case 'v':
475             Advance(2);
476             builder->AddCharacter('\v');
477             break;
478           case 'c': {
479             Advance();
480             uc32 controlLetter = Next();
481             // Special case if it is an ASCII letter.
482             // Convert lower case letters to uppercase.
483             uc32 letter = controlLetter & ~('a' ^ 'A');
484             if (letter < 'A' || 'Z' < letter) {
485               // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
486               // This is outside the specification. We match JSC in
487               // reading the backslash as a literal character instead
488               // of as starting an escape.
489               if (unicode()) {
490                 // With /u, invalid escapes are not treated as identity escapes.
491                 return ReportError(CStrVector("Invalid unicode escape"));
492               }
493               builder->AddCharacter('\\');
494             } else {
495               Advance(2);
496               builder->AddCharacter(controlLetter & 0x1f);
497             }
498             break;
499           }
500           case 'x': {
501             Advance(2);
502             uc32 value;
503             if (ParseHexEscape(2, &value)) {
504               builder->AddCharacter(value);
505             } else if (!unicode()) {
506               builder->AddCharacter('x');
507             } else {
508               // With /u, invalid escapes are not treated as identity escapes.
509               return ReportError(CStrVector("Invalid escape"));
510             }
511             break;
512           }
513           case 'u': {
514             Advance(2);
515             uc32 value;
516             if (ParseUnicodeEscape(&value)) {
517               builder->AddEscapedUnicodeCharacter(value);
518             } else if (!unicode()) {
519               builder->AddCharacter('u');
520             } else {
521               // With /u, invalid escapes are not treated as identity escapes.
522               return ReportError(CStrVector("Invalid unicode escape"));
523             }
524             break;
525           }
526           case 'k':
527             if (FLAG_harmony_regexp_named_captures && unicode()) {
528               Advance(2);
529               ParseNamedBackReference(builder, state CHECK_FAILED);
530               break;
531             }
532           // Fall through.
533           default:
534             Advance();
535             // With /u, no identity escapes except for syntax characters
536             // are allowed. Otherwise, all identity escapes are allowed.
537             if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
538               builder->AddCharacter(current());
539               Advance();
540             } else {
541               return ReportError(CStrVector("Invalid escape"));
542             }
543             break;
544         }
545         break;
546       case '{': {
547         int dummy;
548         bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
549         if (parsed) return ReportError(CStrVector("Nothing to repeat"));
550         // Fall through.
551       }
552       case '}':
553       case ']':
554         if (unicode()) {
555           return ReportError(CStrVector("Lone quantifier brackets"));
556         }
557       // Fall through.
558       default:
559         builder->AddUnicodeCharacter(current());
560         Advance();
561         break;
562     }  // end switch(current())
563 
564     int min;
565     int max;
566     switch (current()) {
567       // QuantifierPrefix ::
568       //   *
569       //   +
570       //   ?
571       //   {
572       case '*':
573         min = 0;
574         max = RegExpTree::kInfinity;
575         Advance();
576         break;
577       case '+':
578         min = 1;
579         max = RegExpTree::kInfinity;
580         Advance();
581         break;
582       case '?':
583         min = 0;
584         max = 1;
585         Advance();
586         break;
587       case '{':
588         if (ParseIntervalQuantifier(&min, &max)) {
589           if (max < min) {
590             return ReportError(
591                 CStrVector("numbers out of order in {} quantifier"));
592           }
593           break;
594         } else if (unicode()) {
595           // With /u, incomplete quantifiers are not allowed.
596           return ReportError(CStrVector("Incomplete quantifier"));
597         }
598         continue;
599       default:
600         continue;
601     }
602     RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
603     if (current() == '?') {
604       quantifier_type = RegExpQuantifier::NON_GREEDY;
605       Advance();
606     } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
607       // FLAG_regexp_possessive_quantifier is a debug-only flag.
608       quantifier_type = RegExpQuantifier::POSSESSIVE;
609       Advance();
610     }
611     if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
612       return ReportError(CStrVector("Invalid quantifier"));
613     }
614   }
615 }
616 
617 
618 #ifdef DEBUG
619 // Currently only used in an DCHECK.
IsSpecialClassEscape(uc32 c)620 static bool IsSpecialClassEscape(uc32 c) {
621   switch (c) {
622     case 'd':
623     case 'D':
624     case 's':
625     case 'S':
626     case 'w':
627     case 'W':
628       return true;
629     default:
630       return false;
631   }
632 }
633 #endif
634 
635 
636 // In order to know whether an escape is a backreference or not we have to scan
637 // the entire regexp and find the number of capturing parentheses.  However we
638 // don't want to scan the regexp twice unless it is necessary.  This mini-parser
639 // is called when needed.  It can see the difference between capturing and
640 // noncapturing parentheses and can skip character classes and backslash-escaped
641 // characters.
ScanForCaptures()642 void RegExpParser::ScanForCaptures() {
643   // Start with captures started previous to current position
644   int capture_count = captures_started();
645   // Add count of captures after this position.
646   int n;
647   while ((n = current()) != kEndMarker) {
648     Advance();
649     switch (n) {
650       case '\\':
651         Advance();
652         break;
653       case '[': {
654         int c;
655         while ((c = current()) != kEndMarker) {
656           Advance();
657           if (c == '\\') {
658             Advance();
659           } else {
660             if (c == ']') break;
661           }
662         }
663         break;
664       }
665       case '(':
666         if (current() != '?') capture_count++;
667         break;
668     }
669   }
670   capture_count_ = capture_count;
671   is_scanned_for_captures_ = true;
672 }
673 
674 
ParseBackReferenceIndex(int * index_out)675 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
676   DCHECK_EQ('\\', current());
677   DCHECK('1' <= Next() && Next() <= '9');
678   // Try to parse a decimal literal that is no greater than the total number
679   // of left capturing parentheses in the input.
680   int start = position();
681   int value = Next() - '0';
682   Advance(2);
683   while (true) {
684     uc32 c = current();
685     if (IsDecimalDigit(c)) {
686       value = 10 * value + (c - '0');
687       if (value > kMaxCaptures) {
688         Reset(start);
689         return false;
690       }
691       Advance();
692     } else {
693       break;
694     }
695   }
696   if (value > captures_started()) {
697     if (!is_scanned_for_captures_) {
698       int saved_position = position();
699       ScanForCaptures();
700       Reset(saved_position);
701     }
702     if (value > capture_count_) {
703       Reset(start);
704       return false;
705     }
706   }
707   *index_out = value;
708   return true;
709 }
710 
push_code_unit(ZoneVector<uc16> * v,uint32_t code_unit)711 static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
712   if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
713     v->push_back(code_unit);
714   } else {
715     v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
716     v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
717   }
718 }
719 
ParseCaptureGroupName()720 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
721   DCHECK(FLAG_harmony_regexp_named_captures);
722   DCHECK(unicode());
723 
724   ZoneVector<uc16>* name =
725       new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
726 
727   bool at_start = true;
728   while (true) {
729     uc32 c = current();
730     Advance();
731 
732     // Convert unicode escapes.
733     if (c == '\\' && current() == 'u') {
734       Advance();
735       if (!ParseUnicodeEscape(&c)) {
736         ReportError(CStrVector("Invalid Unicode escape sequence"));
737         return nullptr;
738       }
739     }
740 
741     if (at_start) {
742       if (!IdentifierStart::Is(c)) {
743         ReportError(CStrVector("Invalid capture group name"));
744         return nullptr;
745       }
746       push_code_unit(name, c);
747       at_start = false;
748     } else {
749       if (c == '>') {
750         break;
751       } else if (IdentifierPart::Is(c)) {
752         push_code_unit(name, c);
753       } else {
754         ReportError(CStrVector("Invalid capture group name"));
755         return nullptr;
756       }
757     }
758   }
759 
760   return name;
761 }
762 
CreateNamedCaptureAtIndex(const ZoneVector<uc16> * name,int index)763 bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
764                                              int index) {
765   DCHECK(FLAG_harmony_regexp_named_captures);
766   DCHECK(unicode());
767   DCHECK(0 < index && index <= captures_started_);
768   DCHECK_NOT_NULL(name);
769 
770   if (named_captures_ == nullptr) {
771     named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
772   } else {
773     // Check for duplicates and bail if we find any.
774     for (const auto& named_capture : *named_captures_) {
775       if (*named_capture->name() == *name) {
776         ReportError(CStrVector("Duplicate capture group name"));
777         return false;
778       }
779     }
780   }
781 
782   RegExpCapture* capture = GetCapture(index);
783   DCHECK(capture->name() == nullptr);
784 
785   capture->set_name(name);
786   named_captures_->Add(capture, zone());
787 
788   return true;
789 }
790 
ParseNamedBackReference(RegExpBuilder * builder,RegExpParserState * state)791 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
792                                            RegExpParserState* state) {
793   // The parser is assumed to be on the '<' in \k<name>.
794   if (current() != '<') {
795     ReportError(CStrVector("Invalid named reference"));
796     return false;
797   }
798 
799   Advance();
800   const ZoneVector<uc16>* name = ParseCaptureGroupName();
801   if (name == nullptr) {
802     return false;
803   }
804 
805   if (state->IsInsideCaptureGroup(name)) {
806     builder->AddEmpty();
807   } else {
808     RegExpBackReference* atom = new (zone()) RegExpBackReference();
809     atom->set_name(name);
810 
811     builder->AddAtom(atom);
812 
813     if (named_back_references_ == nullptr) {
814       named_back_references_ =
815           new (zone()) ZoneList<RegExpBackReference*>(1, zone());
816     }
817     named_back_references_->Add(atom, zone());
818   }
819 
820   return true;
821 }
822 
PatchNamedBackReferences()823 void RegExpParser::PatchNamedBackReferences() {
824   if (named_back_references_ == nullptr) return;
825 
826   if (named_captures_ == nullptr) {
827     ReportError(CStrVector("Invalid named capture referenced"));
828     return;
829   }
830 
831   // Look up and patch the actual capture for each named back reference.
832   // TODO(jgruber): O(n^2), optimize if necessary.
833 
834   for (int i = 0; i < named_back_references_->length(); i++) {
835     RegExpBackReference* ref = named_back_references_->at(i);
836 
837     int index = -1;
838     for (const auto& capture : *named_captures_) {
839       if (*capture->name() == *ref->name()) {
840         index = capture->index();
841         break;
842       }
843     }
844 
845     if (index == -1) {
846       ReportError(CStrVector("Invalid named capture referenced"));
847       return;
848     }
849 
850     ref->set_capture(GetCapture(index));
851   }
852 }
853 
GetCapture(int index)854 RegExpCapture* RegExpParser::GetCapture(int index) {
855   // The index for the capture groups are one-based. Its index in the list is
856   // zero-based.
857   int know_captures =
858       is_scanned_for_captures_ ? capture_count_ : captures_started_;
859   DCHECK(index <= know_captures);
860   if (captures_ == NULL) {
861     captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
862   }
863   while (captures_->length() < know_captures) {
864     captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
865   }
866   return captures_->at(index - 1);
867 }
868 
CreateCaptureNameMap()869 Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
870   if (named_captures_ == nullptr || named_captures_->is_empty())
871     return Handle<FixedArray>();
872 
873   Factory* factory = isolate()->factory();
874 
875   int len = named_captures_->length() * 2;
876   Handle<FixedArray> array = factory->NewFixedArray(len);
877 
878   for (int i = 0; i < named_captures_->length(); i++) {
879     RegExpCapture* capture = named_captures_->at(i);
880     MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name());
881     array->set(i * 2, *name.ToHandleChecked());
882     array->set(i * 2 + 1, Smi::FromInt(capture->index()));
883   }
884 
885   return array;
886 }
887 
IsInsideCaptureGroup(int index)888 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
889   for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
890     if (s->group_type() != CAPTURE) continue;
891     // Return true if we found the matching capture index.
892     if (index == s->capture_index()) return true;
893     // Abort if index is larger than what has been parsed up till this state.
894     if (index > s->capture_index()) return false;
895   }
896   return false;
897 }
898 
IsInsideCaptureGroup(const ZoneVector<uc16> * name)899 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
900     const ZoneVector<uc16>* name) {
901   DCHECK_NOT_NULL(name);
902   for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
903     if (s->capture_name() == nullptr) continue;
904     if (*s->capture_name() == *name) return true;
905   }
906   return false;
907 }
908 
909 // QuantifierPrefix ::
910 //   { DecimalDigits }
911 //   { DecimalDigits , }
912 //   { DecimalDigits , DecimalDigits }
913 //
914 // Returns true if parsing succeeds, and set the min_out and max_out
915 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
ParseIntervalQuantifier(int * min_out,int * max_out)916 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
917   DCHECK_EQ(current(), '{');
918   int start = position();
919   Advance();
920   int min = 0;
921   if (!IsDecimalDigit(current())) {
922     Reset(start);
923     return false;
924   }
925   while (IsDecimalDigit(current())) {
926     int next = current() - '0';
927     if (min > (RegExpTree::kInfinity - next) / 10) {
928       // Overflow. Skip past remaining decimal digits and return -1.
929       do {
930         Advance();
931       } while (IsDecimalDigit(current()));
932       min = RegExpTree::kInfinity;
933       break;
934     }
935     min = 10 * min + next;
936     Advance();
937   }
938   int max = 0;
939   if (current() == '}') {
940     max = min;
941     Advance();
942   } else if (current() == ',') {
943     Advance();
944     if (current() == '}') {
945       max = RegExpTree::kInfinity;
946       Advance();
947     } else {
948       while (IsDecimalDigit(current())) {
949         int next = current() - '0';
950         if (max > (RegExpTree::kInfinity - next) / 10) {
951           do {
952             Advance();
953           } while (IsDecimalDigit(current()));
954           max = RegExpTree::kInfinity;
955           break;
956         }
957         max = 10 * max + next;
958         Advance();
959       }
960       if (current() != '}') {
961         Reset(start);
962         return false;
963       }
964       Advance();
965     }
966   } else {
967     Reset(start);
968     return false;
969   }
970   *min_out = min;
971   *max_out = max;
972   return true;
973 }
974 
975 
ParseOctalLiteral()976 uc32 RegExpParser::ParseOctalLiteral() {
977   DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
978   // For compatibility with some other browsers (not all), we parse
979   // up to three octal digits with a value below 256.
980   uc32 value = current() - '0';
981   Advance();
982   if ('0' <= current() && current() <= '7') {
983     value = value * 8 + current() - '0';
984     Advance();
985     if (value < 32 && '0' <= current() && current() <= '7') {
986       value = value * 8 + current() - '0';
987       Advance();
988     }
989   }
990   return value;
991 }
992 
993 
ParseHexEscape(int length,uc32 * value)994 bool RegExpParser::ParseHexEscape(int length, uc32* value) {
995   int start = position();
996   uc32 val = 0;
997   for (int i = 0; i < length; ++i) {
998     uc32 c = current();
999     int d = HexValue(c);
1000     if (d < 0) {
1001       Reset(start);
1002       return false;
1003     }
1004     val = val * 16 + d;
1005     Advance();
1006   }
1007   *value = val;
1008   return true;
1009 }
1010 
1011 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uc32 * value)1012 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
1013   // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1014   // allowed). In the latter case, the number of hex digits between { } is
1015   // arbitrary. \ and u have already been read.
1016   if (current() == '{' && unicode()) {
1017     int start = position();
1018     Advance();
1019     if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
1020       if (current() == '}') {
1021         Advance();
1022         return true;
1023       }
1024     }
1025     Reset(start);
1026     return false;
1027   }
1028   // \u but no {, or \u{...} escapes not allowed.
1029   bool result = ParseHexEscape(4, value);
1030   if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
1031       current() == '\\') {
1032     // Attempt to read trail surrogate.
1033     int start = position();
1034     if (Next() == 'u') {
1035       Advance(2);
1036       uc32 trail;
1037       if (ParseHexEscape(4, &trail) &&
1038           unibrow::Utf16::IsTrailSurrogate(trail)) {
1039         *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
1040                                                       static_cast<uc16>(trail));
1041         return true;
1042       }
1043     }
1044     Reset(start);
1045   }
1046   return result;
1047 }
1048 
1049 #ifdef V8_I18N_SUPPORT
1050 
1051 namespace {
1052 
IsExactPropertyAlias(const char * property_name,UProperty property)1053 bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1054   const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1055   if (short_name != NULL && strcmp(property_name, short_name) == 0) return true;
1056   for (int i = 0;; i++) {
1057     const char* long_name = u_getPropertyName(
1058         property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1059     if (long_name == NULL) break;
1060     if (strcmp(property_name, long_name) == 0) return true;
1061   }
1062   return false;
1063 }
1064 
IsExactPropertyValueAlias(const char * property_value_name,UProperty property,int32_t property_value)1065 bool IsExactPropertyValueAlias(const char* property_value_name,
1066                                UProperty property, int32_t property_value) {
1067   const char* short_name =
1068       u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1069   if (short_name != NULL && strcmp(property_value_name, short_name) == 0) {
1070     return true;
1071   }
1072   for (int i = 0;; i++) {
1073     const char* long_name = u_getPropertyValueName(
1074         property, property_value,
1075         static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1076     if (long_name == NULL) break;
1077     if (strcmp(property_value_name, long_name) == 0) return true;
1078   }
1079   return false;
1080 }
1081 
LookupPropertyValueName(UProperty property,const char * property_value_name,bool negate,ZoneList<CharacterRange> * result,Zone * zone)1082 bool LookupPropertyValueName(UProperty property,
1083                              const char* property_value_name, bool negate,
1084                              ZoneList<CharacterRange>* result, Zone* zone) {
1085   int32_t property_value =
1086       u_getPropertyValueEnum(property, property_value_name);
1087   if (property_value == UCHAR_INVALID_CODE) return false;
1088 
1089   // We require the property name to match exactly to one of the property value
1090   // aliases. However, u_getPropertyValueEnum uses loose matching.
1091   if (!IsExactPropertyValueAlias(property_value_name, property,
1092                                  property_value)) {
1093     return false;
1094   }
1095 
1096   USet* set = uset_openEmpty();
1097   UErrorCode ec = U_ZERO_ERROR;
1098   uset_applyIntPropertyValue(set, property, property_value, &ec);
1099   bool success = ec == U_ZERO_ERROR && !uset_isEmpty(set);
1100 
1101   if (success) {
1102     uset_removeAllStrings(set);
1103     if (negate) uset_complement(set);
1104     int item_count = uset_getItemCount(set);
1105     int item_result = 0;
1106     for (int i = 0; i < item_count; i++) {
1107       uc32 start = 0;
1108       uc32 end = 0;
1109       item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
1110       result->Add(CharacterRange::Range(start, end), zone);
1111     }
1112     DCHECK_EQ(U_ZERO_ERROR, ec);
1113     DCHECK_EQ(0, item_result);
1114   }
1115   uset_close(set);
1116   return success;
1117 }
1118 
1119 template <size_t N>
NameEquals(const char * name,const char (& literal)[N])1120 inline bool NameEquals(const char* name, const char (&literal)[N]) {
1121   return strncmp(name, literal, N + 1) == 0;
1122 }
1123 
LookupSpecialPropertyValueName(const char * name,ZoneList<CharacterRange> * result,bool negate,Zone * zone)1124 bool LookupSpecialPropertyValueName(const char* name,
1125                                     ZoneList<CharacterRange>* result,
1126                                     bool negate, Zone* zone) {
1127   if (NameEquals(name, "Any")) {
1128     if (!negate) result->Add(CharacterRange::Everything(), zone);
1129   } else if (NameEquals(name, "ASCII")) {
1130     result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
1131                        : CharacterRange::Range(0x0, 0x7f),
1132                 zone);
1133   } else if (NameEquals(name, "Assigned")) {
1134     return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
1135                                    !negate, result, zone);
1136   } else {
1137     return false;
1138   }
1139   return true;
1140 }
1141 
1142 }  // anonymous namespace
1143 
ParsePropertyClass(ZoneList<CharacterRange> * result,bool negate)1144 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
1145                                       bool negate) {
1146   // Parse the property class as follows:
1147   // - In \p{name}, 'name' is interpreted
1148   //   - either as a general category property value name.
1149   //   - or as a binary property name.
1150   // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
1151   //   and 'value' is interpreted as one of the available property value names.
1152   // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
1153   // - Loose matching is not applied.
1154   List<char> first_part;
1155   List<char> second_part;
1156   if (current() == '{') {
1157     // Parse \p{[PropertyName=]PropertyNameValue}
1158     for (Advance(); current() != '}' && current() != '='; Advance()) {
1159       if (!has_next()) return false;
1160       first_part.Add(static_cast<char>(current()));
1161     }
1162     if (current() == '=') {
1163       for (Advance(); current() != '}'; Advance()) {
1164         if (!has_next()) return false;
1165         second_part.Add(static_cast<char>(current()));
1166       }
1167       second_part.Add(0);  // null-terminate string.
1168     }
1169   } else {
1170     return false;
1171   }
1172   Advance();
1173   first_part.Add(0);  // null-terminate string.
1174 
1175   if (second_part.is_empty()) {
1176     // First attempt to interpret as general category property value name.
1177     const char* name = first_part.ToConstVector().start();
1178     if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1179                                 result, zone())) {
1180       return true;
1181     }
1182     // Interpret "Any", "ASCII", and "Assigned".
1183     if (LookupSpecialPropertyValueName(name, result, negate, zone())) {
1184       return true;
1185     }
1186     // Then attempt to interpret as binary property name with value name 'Y'.
1187     UProperty property = u_getPropertyEnum(name);
1188     if (property < UCHAR_BINARY_START) return false;
1189     if (property >= UCHAR_BINARY_LIMIT) return false;
1190     if (!IsExactPropertyAlias(name, property)) return false;
1191     return LookupPropertyValueName(property, negate ? "N" : "Y", false, result,
1192                                    zone());
1193   } else {
1194     // Both property name and value name are specified. Attempt to interpret
1195     // the property name as enumerated property.
1196     const char* property_name = first_part.ToConstVector().start();
1197     const char* value_name = second_part.ToConstVector().start();
1198     UProperty property = u_getPropertyEnum(property_name);
1199     if (property < UCHAR_INT_START) return false;
1200     if (property >= UCHAR_INT_LIMIT) return false;
1201     if (!IsExactPropertyAlias(property_name, property)) return false;
1202     return LookupPropertyValueName(property, value_name, negate, result,
1203                                    zone());
1204   }
1205 }
1206 
1207 #else  // V8_I18N_SUPPORT
1208 
ParsePropertyClass(ZoneList<CharacterRange> * result,bool negate)1209 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
1210                                       bool negate) {
1211   return false;
1212 }
1213 
1214 #endif  // V8_I18N_SUPPORT
1215 
ParseUnlimitedLengthHexNumber(int max_value,uc32 * value)1216 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
1217   uc32 x = 0;
1218   int d = HexValue(current());
1219   if (d < 0) {
1220     return false;
1221   }
1222   while (d >= 0) {
1223     x = x * 16 + d;
1224     if (x > max_value) {
1225       return false;
1226     }
1227     Advance();
1228     d = HexValue(current());
1229   }
1230   *value = x;
1231   return true;
1232 }
1233 
1234 
ParseClassCharacterEscape()1235 uc32 RegExpParser::ParseClassCharacterEscape() {
1236   DCHECK(current() == '\\');
1237   DCHECK(has_next() && !IsSpecialClassEscape(Next()));
1238   Advance();
1239   switch (current()) {
1240     case 'b':
1241       Advance();
1242       return '\b';
1243     // ControlEscape :: one of
1244     //   f n r t v
1245     case 'f':
1246       Advance();
1247       return '\f';
1248     case 'n':
1249       Advance();
1250       return '\n';
1251     case 'r':
1252       Advance();
1253       return '\r';
1254     case 't':
1255       Advance();
1256       return '\t';
1257     case 'v':
1258       Advance();
1259       return '\v';
1260     case 'c': {
1261       uc32 controlLetter = Next();
1262       uc32 letter = controlLetter & ~('A' ^ 'a');
1263       // For compatibility with JSC, inside a character class. We also accept
1264       // digits and underscore as control characters, unless with /u.
1265       if (letter >= 'A' && letter <= 'Z') {
1266         Advance(2);
1267         // Control letters mapped to ASCII control characters in the range
1268         // 0x00-0x1f.
1269         return controlLetter & 0x1f;
1270       }
1271       if (unicode()) {
1272         // With /u, invalid escapes are not treated as identity escapes.
1273         ReportError(CStrVector("Invalid class escape"));
1274         return 0;
1275       }
1276       if ((controlLetter >= '0' && controlLetter <= '9') ||
1277           controlLetter == '_') {
1278         Advance(2);
1279         return controlLetter & 0x1f;
1280       }
1281       // We match JSC in reading the backslash as a literal
1282       // character instead of as starting an escape.
1283       return '\\';
1284     }
1285     case '0':
1286       // With /u, \0 is interpreted as NUL if not followed by another digit.
1287       if (unicode() && !(Next() >= '0' && Next() <= '9')) {
1288         Advance();
1289         return 0;
1290       }
1291     // Fall through.
1292     case '1':
1293     case '2':
1294     case '3':
1295     case '4':
1296     case '5':
1297     case '6':
1298     case '7':
1299       // For compatibility, we interpret a decimal escape that isn't
1300       // a back reference (and therefore either \0 or not valid according
1301       // to the specification) as a 1..3 digit octal character code.
1302       if (unicode()) {
1303         // With /u, decimal escape is not interpreted as octal character code.
1304         ReportError(CStrVector("Invalid class escape"));
1305         return 0;
1306       }
1307       return ParseOctalLiteral();
1308     case 'x': {
1309       Advance();
1310       uc32 value;
1311       if (ParseHexEscape(2, &value)) return value;
1312       if (unicode()) {
1313         // With /u, invalid escapes are not treated as identity escapes.
1314         ReportError(CStrVector("Invalid escape"));
1315         return 0;
1316       }
1317       // If \x is not followed by a two-digit hexadecimal, treat it
1318       // as an identity escape.
1319       return 'x';
1320     }
1321     case 'u': {
1322       Advance();
1323       uc32 value;
1324       if (ParseUnicodeEscape(&value)) return value;
1325       if (unicode()) {
1326         // With /u, invalid escapes are not treated as identity escapes.
1327         ReportError(CStrVector("Invalid unicode escape"));
1328         return 0;
1329       }
1330       // If \u is not followed by a two-digit hexadecimal, treat it
1331       // as an identity escape.
1332       return 'u';
1333     }
1334     default: {
1335       uc32 result = current();
1336       // With /u, no identity escapes except for syntax characters and '-' are
1337       // allowed. Otherwise, all identity escapes are allowed.
1338       if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
1339         Advance();
1340         return result;
1341       }
1342       ReportError(CStrVector("Invalid escape"));
1343       return 0;
1344     }
1345   }
1346   return 0;
1347 }
1348 
1349 
ParseClassAtom(uc16 * char_class)1350 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
1351   DCHECK_EQ(0, *char_class);
1352   uc32 first = current();
1353   if (first == '\\') {
1354     switch (Next()) {
1355       case 'w':
1356       case 'W':
1357       case 'd':
1358       case 'D':
1359       case 's':
1360       case 'S': {
1361         *char_class = Next();
1362         Advance(2);
1363         return CharacterRange::Singleton(0);  // Return dummy value.
1364       }
1365       case kEndMarker:
1366         return ReportError(CStrVector("\\ at end of pattern"));
1367       default:
1368         first = ParseClassCharacterEscape(CHECK_FAILED);
1369     }
1370   } else {
1371     Advance();
1372   }
1373 
1374   return CharacterRange::Singleton(first);
1375 }
1376 
1377 static const uc16 kNoCharClass = 0;
1378 
1379 // Adds range or pre-defined character class to character ranges.
1380 // If char_class is not kInvalidClass, it's interpreted as a class
1381 // escape (i.e., 's' means whitespace, from '\s').
AddRangeOrEscape(ZoneList<CharacterRange> * ranges,uc16 char_class,CharacterRange range,Zone * zone)1382 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
1383                                     uc16 char_class, CharacterRange range,
1384                                     Zone* zone) {
1385   if (char_class != kNoCharClass) {
1386     CharacterRange::AddClassEscape(char_class, ranges, zone);
1387   } else {
1388     ranges->Add(range, zone);
1389   }
1390 }
1391 
ParseClassProperty(ZoneList<CharacterRange> * ranges)1392 bool RegExpParser::ParseClassProperty(ZoneList<CharacterRange>* ranges) {
1393   if (!FLAG_harmony_regexp_property) return false;
1394   if (!unicode()) return false;
1395   if (current() != '\\') return false;
1396   uc32 next = Next();
1397   bool parse_success = false;
1398   if (next == 'p') {
1399     Advance(2);
1400     parse_success = ParsePropertyClass(ranges, false);
1401   } else if (next == 'P') {
1402     Advance(2);
1403     parse_success = ParsePropertyClass(ranges, true);
1404   } else {
1405     return false;
1406   }
1407   if (!parse_success)
1408     ReportError(CStrVector("Invalid property name in character class"));
1409   return parse_success;
1410 }
1411 
ParseCharacterClass()1412 RegExpTree* RegExpParser::ParseCharacterClass() {
1413   static const char* kUnterminated = "Unterminated character class";
1414   static const char* kRangeInvalid = "Invalid character class";
1415   static const char* kRangeOutOfOrder = "Range out of order in character class";
1416 
1417   DCHECK_EQ(current(), '[');
1418   Advance();
1419   bool is_negated = false;
1420   if (current() == '^') {
1421     is_negated = true;
1422     Advance();
1423   }
1424   ZoneList<CharacterRange>* ranges =
1425       new (zone()) ZoneList<CharacterRange>(2, zone());
1426   while (has_more() && current() != ']') {
1427     bool parsed_property = ParseClassProperty(ranges CHECK_FAILED);
1428     if (parsed_property) continue;
1429     uc16 char_class = kNoCharClass;
1430     CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
1431     if (current() == '-') {
1432       Advance();
1433       if (current() == kEndMarker) {
1434         // If we reach the end we break out of the loop and let the
1435         // following code report an error.
1436         break;
1437       } else if (current() == ']') {
1438         AddRangeOrEscape(ranges, char_class, first, zone());
1439         ranges->Add(CharacterRange::Singleton('-'), zone());
1440         break;
1441       }
1442       uc16 char_class_2 = kNoCharClass;
1443       CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
1444       if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
1445         // Either end is an escaped character class. Treat the '-' verbatim.
1446         if (unicode()) {
1447           // ES2015 21.2.2.15.1 step 1.
1448           return ReportError(CStrVector(kRangeInvalid));
1449         }
1450         AddRangeOrEscape(ranges, char_class, first, zone());
1451         ranges->Add(CharacterRange::Singleton('-'), zone());
1452         AddRangeOrEscape(ranges, char_class_2, next, zone());
1453         continue;
1454       }
1455       // ES2015 21.2.2.15.1 step 6.
1456       if (first.from() > next.to()) {
1457         return ReportError(CStrVector(kRangeOutOfOrder));
1458       }
1459       ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
1460     } else {
1461       AddRangeOrEscape(ranges, char_class, first, zone());
1462     }
1463   }
1464   if (!has_more()) {
1465     return ReportError(CStrVector(kUnterminated));
1466   }
1467   Advance();
1468   if (ranges->length() == 0) {
1469     ranges->Add(CharacterRange::Everything(), zone());
1470     is_negated = !is_negated;
1471   }
1472   return new (zone()) RegExpCharacterClass(ranges, is_negated);
1473 }
1474 
1475 
1476 #undef CHECK_FAILED
1477 
1478 
ParseRegExp(Isolate * isolate,Zone * zone,FlatStringReader * input,JSRegExp::Flags flags,RegExpCompileData * result)1479 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
1480                                FlatStringReader* input, JSRegExp::Flags flags,
1481                                RegExpCompileData* result) {
1482   DCHECK(result != NULL);
1483   RegExpParser parser(input, &result->error, flags, isolate, zone);
1484   RegExpTree* tree = parser.ParsePattern();
1485   if (parser.failed()) {
1486     DCHECK(tree == NULL);
1487     DCHECK(!result->error.is_null());
1488   } else {
1489     DCHECK(tree != NULL);
1490     DCHECK(result->error.is_null());
1491     if (FLAG_trace_regexp_parser) {
1492       OFStream os(stdout);
1493       tree->Print(os, zone);
1494       os << "\n";
1495     }
1496     result->tree = tree;
1497     int capture_count = parser.captures_started();
1498     result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1499     result->contains_anchor = parser.contains_anchor();
1500     result->capture_name_map = parser.CreateCaptureNameMap();
1501     result->capture_count = capture_count;
1502   }
1503   return !parser.failed();
1504 }
1505 
RegExpBuilder(Zone * zone,bool ignore_case,bool unicode)1506 RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode)
1507     : zone_(zone),
1508       pending_empty_(false),
1509       ignore_case_(ignore_case),
1510       unicode_(unicode),
1511       characters_(NULL),
1512       pending_surrogate_(kNoPendingSurrogate),
1513       terms_(),
1514       alternatives_()
1515 #ifdef DEBUG
1516       ,
1517       last_added_(ADD_NONE)
1518 #endif
1519 {
1520 }
1521 
1522 
AddLeadSurrogate(uc16 lead_surrogate)1523 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
1524   DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1525   FlushPendingSurrogate();
1526   // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
1527   pending_surrogate_ = lead_surrogate;
1528 }
1529 
1530 
AddTrailSurrogate(uc16 trail_surrogate)1531 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1532   DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1533   if (pending_surrogate_ != kNoPendingSurrogate) {
1534     uc16 lead_surrogate = pending_surrogate_;
1535     pending_surrogate_ = kNoPendingSurrogate;
1536     DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1537     uc32 combined =
1538         unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
1539     if (NeedsDesugaringForIgnoreCase(combined)) {
1540       AddCharacterClassForDesugaring(combined);
1541     } else {
1542       ZoneList<uc16> surrogate_pair(2, zone());
1543       surrogate_pair.Add(lead_surrogate, zone());
1544       surrogate_pair.Add(trail_surrogate, zone());
1545       RegExpAtom* atom =
1546           new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
1547       AddAtom(atom);
1548     }
1549   } else {
1550     pending_surrogate_ = trail_surrogate;
1551     FlushPendingSurrogate();
1552   }
1553 }
1554 
1555 
FlushPendingSurrogate()1556 void RegExpBuilder::FlushPendingSurrogate() {
1557   if (pending_surrogate_ != kNoPendingSurrogate) {
1558     DCHECK(unicode());
1559     uc32 c = pending_surrogate_;
1560     pending_surrogate_ = kNoPendingSurrogate;
1561     AddCharacterClassForDesugaring(c);
1562   }
1563 }
1564 
1565 
FlushCharacters()1566 void RegExpBuilder::FlushCharacters() {
1567   FlushPendingSurrogate();
1568   pending_empty_ = false;
1569   if (characters_ != NULL) {
1570     RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
1571     characters_ = NULL;
1572     text_.Add(atom, zone());
1573     LAST(ADD_ATOM);
1574   }
1575 }
1576 
1577 
FlushText()1578 void RegExpBuilder::FlushText() {
1579   FlushCharacters();
1580   int num_text = text_.length();
1581   if (num_text == 0) {
1582     return;
1583   } else if (num_text == 1) {
1584     terms_.Add(text_.last(), zone());
1585   } else {
1586     RegExpText* text = new (zone()) RegExpText(zone());
1587     for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1588     terms_.Add(text, zone());
1589   }
1590   text_.Clear();
1591 }
1592 
1593 
AddCharacter(uc16 c)1594 void RegExpBuilder::AddCharacter(uc16 c) {
1595   FlushPendingSurrogate();
1596   pending_empty_ = false;
1597   if (NeedsDesugaringForIgnoreCase(c)) {
1598     AddCharacterClassForDesugaring(c);
1599   } else {
1600     if (characters_ == NULL) {
1601       characters_ = new (zone()) ZoneList<uc16>(4, zone());
1602     }
1603     characters_->Add(c, zone());
1604     LAST(ADD_CHAR);
1605   }
1606 }
1607 
1608 
AddUnicodeCharacter(uc32 c)1609 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1610   if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
1611     DCHECK(unicode());
1612     AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
1613     AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1614   } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1615     AddLeadSurrogate(c);
1616   } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1617     AddTrailSurrogate(c);
1618   } else {
1619     AddCharacter(static_cast<uc16>(c));
1620   }
1621 }
1622 
AddEscapedUnicodeCharacter(uc32 character)1623 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1624   // A lead or trail surrogate parsed via escape sequence will not
1625   // pair up with any preceding lead or following trail surrogate.
1626   FlushPendingSurrogate();
1627   AddUnicodeCharacter(character);
1628   FlushPendingSurrogate();
1629 }
1630 
AddEmpty()1631 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1632 
1633 
AddCharacterClass(RegExpCharacterClass * cc)1634 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1635   if (NeedsDesugaringForUnicode(cc)) {
1636     // With /u, character class needs to be desugared, so it
1637     // must be a standalone term instead of being part of a RegExpText.
1638     AddTerm(cc);
1639   } else {
1640     AddAtom(cc);
1641   }
1642 }
1643 
AddCharacterClassForDesugaring(uc32 c)1644 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
1645   AddTerm(new (zone()) RegExpCharacterClass(
1646       CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
1647 }
1648 
1649 
AddAtom(RegExpTree * term)1650 void RegExpBuilder::AddAtom(RegExpTree* term) {
1651   if (term->IsEmpty()) {
1652     AddEmpty();
1653     return;
1654   }
1655   if (term->IsTextElement()) {
1656     FlushCharacters();
1657     text_.Add(term, zone());
1658   } else {
1659     FlushText();
1660     terms_.Add(term, zone());
1661   }
1662   LAST(ADD_ATOM);
1663 }
1664 
1665 
AddTerm(RegExpTree * term)1666 void RegExpBuilder::AddTerm(RegExpTree* term) {
1667   FlushText();
1668   terms_.Add(term, zone());
1669   LAST(ADD_ATOM);
1670 }
1671 
1672 
AddAssertion(RegExpTree * assert)1673 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1674   FlushText();
1675   terms_.Add(assert, zone());
1676   LAST(ADD_ASSERT);
1677 }
1678 
1679 
NewAlternative()1680 void RegExpBuilder::NewAlternative() { FlushTerms(); }
1681 
1682 
FlushTerms()1683 void RegExpBuilder::FlushTerms() {
1684   FlushText();
1685   int num_terms = terms_.length();
1686   RegExpTree* alternative;
1687   if (num_terms == 0) {
1688     alternative = new (zone()) RegExpEmpty();
1689   } else if (num_terms == 1) {
1690     alternative = terms_.last();
1691   } else {
1692     alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
1693   }
1694   alternatives_.Add(alternative, zone());
1695   terms_.Clear();
1696   LAST(ADD_NONE);
1697 }
1698 
1699 
NeedsDesugaringForUnicode(RegExpCharacterClass * cc)1700 bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
1701   if (!unicode()) return false;
1702   // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
1703   // necessarily mean that we need to desugar. It's probably nicer to have a
1704   // separate pass to figure out unicode desugarings.
1705   if (ignore_case()) return true;
1706   ZoneList<CharacterRange>* ranges = cc->ranges(zone());
1707   CharacterRange::Canonicalize(ranges);
1708   for (int i = ranges->length() - 1; i >= 0; i--) {
1709     uc32 from = ranges->at(i).from();
1710     uc32 to = ranges->at(i).to();
1711     // Check for non-BMP characters.
1712     if (to >= kNonBmpStart) return true;
1713     // Check for lone surrogates.
1714     if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
1715   }
1716   return false;
1717 }
1718 
1719 
NeedsDesugaringForIgnoreCase(uc32 c)1720 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
1721 #ifdef V8_I18N_SUPPORT
1722   if (unicode() && ignore_case()) {
1723     USet* set = uset_open(c, c);
1724     uset_closeOver(set, USET_CASE_INSENSITIVE);
1725     uset_removeAllStrings(set);
1726     bool result = uset_size(set) > 1;
1727     uset_close(set);
1728     return result;
1729   }
1730   // In the case where ICU is not included, we act as if the unicode flag is
1731   // not set, and do not desugar.
1732 #endif  // V8_I18N_SUPPORT
1733   return false;
1734 }
1735 
1736 
ToRegExp()1737 RegExpTree* RegExpBuilder::ToRegExp() {
1738   FlushTerms();
1739   int num_alternatives = alternatives_.length();
1740   if (num_alternatives == 0) return new (zone()) RegExpEmpty();
1741   if (num_alternatives == 1) return alternatives_.last();
1742   return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
1743 }
1744 
AddQuantifierToAtom(int min,int max,RegExpQuantifier::QuantifierType quantifier_type)1745 bool RegExpBuilder::AddQuantifierToAtom(
1746     int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
1747   FlushPendingSurrogate();
1748   if (pending_empty_) {
1749     pending_empty_ = false;
1750     return true;
1751   }
1752   RegExpTree* atom;
1753   if (characters_ != NULL) {
1754     DCHECK(last_added_ == ADD_CHAR);
1755     // Last atom was character.
1756     Vector<const uc16> char_vector = characters_->ToConstVector();
1757     int num_chars = char_vector.length();
1758     if (num_chars > 1) {
1759       Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
1760       text_.Add(new (zone()) RegExpAtom(prefix), zone());
1761       char_vector = char_vector.SubVector(num_chars - 1, num_chars);
1762     }
1763     characters_ = NULL;
1764     atom = new (zone()) RegExpAtom(char_vector);
1765     FlushText();
1766   } else if (text_.length() > 0) {
1767     DCHECK(last_added_ == ADD_ATOM);
1768     atom = text_.RemoveLast();
1769     FlushText();
1770   } else if (terms_.length() > 0) {
1771     DCHECK(last_added_ == ADD_ATOM);
1772     atom = terms_.RemoveLast();
1773     // With /u, lookarounds are not quantifiable.
1774     if (unicode() && atom->IsLookaround()) return false;
1775     if (atom->max_match() == 0) {
1776       // Guaranteed to only match an empty string.
1777       LAST(ADD_TERM);
1778       if (min == 0) {
1779         return true;
1780       }
1781       terms_.Add(atom, zone());
1782       return true;
1783     }
1784   } else {
1785     // Only call immediately after adding an atom or character!
1786     UNREACHABLE();
1787     return false;
1788   }
1789   terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1790              zone());
1791   LAST(ADD_TERM);
1792   return true;
1793 }
1794 
1795 }  // namespace internal
1796 }  // namespace v8
1797