1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/regexp/regexp-parser.h"
6
7 #include "src/char-predicates-inl.h"
8 #include "src/factory.h"
9 #include "src/isolate.h"
10 #include "src/objects-inl.h"
11 #include "src/ostreams.h"
12 #include "src/regexp/jsregexp.h"
13 #include "src/utils.h"
14
15 #ifdef V8_I18N_SUPPORT
16 #include "unicode/uset.h"
17 #endif // V8_I18N_SUPPORT
18
19 namespace v8 {
20 namespace internal {
21
RegExpParser(FlatStringReader * in,Handle<String> * error,JSRegExp::Flags flags,Isolate * isolate,Zone * zone)22 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
23 JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
24 : isolate_(isolate),
25 zone_(zone),
26 error_(error),
27 captures_(NULL),
28 named_captures_(NULL),
29 named_back_references_(NULL),
30 in_(in),
31 current_(kEndMarker),
32 ignore_case_(flags & JSRegExp::kIgnoreCase),
33 multiline_(flags & JSRegExp::kMultiline),
34 unicode_(flags & JSRegExp::kUnicode),
35 next_pos_(0),
36 captures_started_(0),
37 capture_count_(0),
38 has_more_(true),
39 simple_(false),
40 contains_anchor_(false),
41 is_scanned_for_captures_(false),
42 failed_(false) {
43 Advance();
44 }
45
46 template <bool update_position>
ReadNext()47 inline uc32 RegExpParser::ReadNext() {
48 int position = next_pos_;
49 uc32 c0 = in()->Get(position);
50 position++;
51 // Read the whole surrogate pair in case of unicode flag, if possible.
52 if (unicode() && position < in()->length() &&
53 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
54 uc16 c1 = in()->Get(position);
55 if (unibrow::Utf16::IsTrailSurrogate(c1)) {
56 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
57 position++;
58 }
59 }
60 if (update_position) next_pos_ = position;
61 return c0;
62 }
63
64
Next()65 uc32 RegExpParser::Next() {
66 if (has_next()) {
67 return ReadNext<false>();
68 } else {
69 return kEndMarker;
70 }
71 }
72
73
Advance()74 void RegExpParser::Advance() {
75 if (has_next()) {
76 StackLimitCheck check(isolate());
77 if (check.HasOverflowed()) {
78 ReportError(CStrVector(
79 MessageTemplate::TemplateString(MessageTemplate::kStackOverflow)));
80 } else if (zone()->excess_allocation()) {
81 ReportError(CStrVector("Regular expression too large"));
82 } else {
83 current_ = ReadNext<true>();
84 }
85 } else {
86 current_ = kEndMarker;
87 // Advance so that position() points to 1-after-the-last-character. This is
88 // important so that Reset() to this position works correctly.
89 next_pos_ = in()->length() + 1;
90 has_more_ = false;
91 }
92 }
93
94
Reset(int pos)95 void RegExpParser::Reset(int pos) {
96 next_pos_ = pos;
97 has_more_ = (pos < in()->length());
98 Advance();
99 }
100
101
Advance(int dist)102 void RegExpParser::Advance(int dist) {
103 next_pos_ += dist - 1;
104 Advance();
105 }
106
107
simple()108 bool RegExpParser::simple() { return simple_; }
109
IsSyntaxCharacterOrSlash(uc32 c)110 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
111 switch (c) {
112 case '^':
113 case '$':
114 case '\\':
115 case '.':
116 case '*':
117 case '+':
118 case '?':
119 case '(':
120 case ')':
121 case '[':
122 case ']':
123 case '{':
124 case '}':
125 case '|':
126 case '/':
127 return true;
128 default:
129 break;
130 }
131 return false;
132 }
133
134
ReportError(Vector<const char> message)135 RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
136 if (failed_) return NULL; // Do not overwrite any existing error.
137 failed_ = true;
138 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();
139 // Zip to the end to make sure the no more input is read.
140 current_ = kEndMarker;
141 next_pos_ = in()->length();
142 return NULL;
143 }
144
145
146 #define CHECK_FAILED /**/); \
147 if (failed_) return NULL; \
148 ((void)0
149
150
151 // Pattern ::
152 // Disjunction
ParsePattern()153 RegExpTree* RegExpParser::ParsePattern() {
154 RegExpTree* result = ParseDisjunction(CHECK_FAILED);
155 PatchNamedBackReferences(CHECK_FAILED);
156 DCHECK(!has_more());
157 // If the result of parsing is a literal string atom, and it has the
158 // same length as the input, then the atom is identical to the input.
159 if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {
160 simple_ = true;
161 }
162 return result;
163 }
164
165
166 // Disjunction ::
167 // Alternative
168 // Alternative | Disjunction
169 // Alternative ::
170 // [empty]
171 // Term Alternative
172 // Term ::
173 // Assertion
174 // Atom
175 // Atom Quantifier
ParseDisjunction()176 RegExpTree* RegExpParser::ParseDisjunction() {
177 // Used to store current state while parsing subexpressions.
178 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
179 nullptr, ignore_case(), unicode(), zone());
180 RegExpParserState* state = &initial_state;
181 // Cache the builder in a local variable for quick access.
182 RegExpBuilder* builder = initial_state.builder();
183 while (true) {
184 switch (current()) {
185 case kEndMarker:
186 if (state->IsSubexpression()) {
187 // Inside a parenthesized group when hitting end of input.
188 return ReportError(CStrVector("Unterminated group"));
189 }
190 DCHECK_EQ(INITIAL, state->group_type());
191 // Parsing completed successfully.
192 return builder->ToRegExp();
193 case ')': {
194 if (!state->IsSubexpression()) {
195 return ReportError(CStrVector("Unmatched ')'"));
196 }
197 DCHECK_NE(INITIAL, state->group_type());
198
199 Advance();
200 // End disjunction parsing and convert builder content to new single
201 // regexp atom.
202 RegExpTree* body = builder->ToRegExp();
203
204 int end_capture_index = captures_started();
205
206 int capture_index = state->capture_index();
207 SubexpressionType group_type = state->group_type();
208
209 // Build result of subexpression.
210 if (group_type == CAPTURE) {
211 if (state->IsNamedCapture()) {
212 CreateNamedCaptureAtIndex(state->capture_name(),
213 capture_index CHECK_FAILED);
214 }
215 RegExpCapture* capture = GetCapture(capture_index);
216 capture->set_body(body);
217 body = capture;
218 } else if (group_type != GROUPING) {
219 DCHECK(group_type == POSITIVE_LOOKAROUND ||
220 group_type == NEGATIVE_LOOKAROUND);
221 bool is_positive = (group_type == POSITIVE_LOOKAROUND);
222 body = new (zone()) RegExpLookaround(
223 body, is_positive, end_capture_index - capture_index,
224 capture_index, state->lookaround_type());
225 }
226
227 // Restore previous state.
228 state = state->previous_state();
229 builder = state->builder();
230
231 builder->AddAtom(body);
232 // For compatability with JSC and ES3, we allow quantifiers after
233 // lookaheads, and break in all cases.
234 break;
235 }
236 case '|': {
237 Advance();
238 builder->NewAlternative();
239 continue;
240 }
241 case '*':
242 case '+':
243 case '?':
244 return ReportError(CStrVector("Nothing to repeat"));
245 case '^': {
246 Advance();
247 if (multiline()) {
248 builder->AddAssertion(
249 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
250 } else {
251 builder->AddAssertion(
252 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
253 set_contains_anchor();
254 }
255 continue;
256 }
257 case '$': {
258 Advance();
259 RegExpAssertion::AssertionType assertion_type =
260 multiline() ? RegExpAssertion::END_OF_LINE
261 : RegExpAssertion::END_OF_INPUT;
262 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
263 continue;
264 }
265 case '.': {
266 Advance();
267 // everything except \x0a, \x0d, \u2028 and \u2029
268 ZoneList<CharacterRange>* ranges =
269 new (zone()) ZoneList<CharacterRange>(2, zone());
270 CharacterRange::AddClassEscape('.', ranges, zone());
271 RegExpCharacterClass* cc =
272 new (zone()) RegExpCharacterClass(ranges, false);
273 builder->AddCharacterClass(cc);
274 break;
275 }
276 case '(': {
277 SubexpressionType subexpr_type = CAPTURE;
278 RegExpLookaround::Type lookaround_type = state->lookaround_type();
279 bool is_named_capture = false;
280 Advance();
281 if (current() == '?') {
282 switch (Next()) {
283 case ':':
284 subexpr_type = GROUPING;
285 Advance(2);
286 break;
287 case '=':
288 lookaround_type = RegExpLookaround::LOOKAHEAD;
289 subexpr_type = POSITIVE_LOOKAROUND;
290 Advance(2);
291 break;
292 case '!':
293 lookaround_type = RegExpLookaround::LOOKAHEAD;
294 subexpr_type = NEGATIVE_LOOKAROUND;
295 Advance(2);
296 break;
297 case '<':
298 Advance();
299 if (FLAG_harmony_regexp_lookbehind) {
300 if (Next() == '=') {
301 subexpr_type = POSITIVE_LOOKAROUND;
302 lookaround_type = RegExpLookaround::LOOKBEHIND;
303 Advance(2);
304 break;
305 } else if (Next() == '!') {
306 subexpr_type = NEGATIVE_LOOKAROUND;
307 lookaround_type = RegExpLookaround::LOOKBEHIND;
308 Advance(2);
309 break;
310 }
311 }
312 if (FLAG_harmony_regexp_named_captures && unicode()) {
313 is_named_capture = true;
314 Advance();
315 break;
316 }
317 // Fall through.
318 default:
319 return ReportError(CStrVector("Invalid group"));
320 }
321 }
322
323 const ZoneVector<uc16>* capture_name = nullptr;
324 if (subexpr_type == CAPTURE) {
325 if (captures_started_ >= kMaxCaptures) {
326 return ReportError(CStrVector("Too many captures"));
327 }
328 captures_started_++;
329
330 if (is_named_capture) {
331 capture_name = ParseCaptureGroupName(CHECK_FAILED);
332 }
333 }
334 // Store current state and begin new disjunction parsing.
335 state = new (zone()) RegExpParserState(
336 state, subexpr_type, lookaround_type, captures_started_,
337 capture_name, ignore_case(), unicode(), zone());
338 builder = state->builder();
339 continue;
340 }
341 case '[': {
342 RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
343 builder->AddCharacterClass(cc->AsCharacterClass());
344 break;
345 }
346 // Atom ::
347 // \ AtomEscape
348 case '\\':
349 switch (Next()) {
350 case kEndMarker:
351 return ReportError(CStrVector("\\ at end of pattern"));
352 case 'b':
353 Advance(2);
354 builder->AddAssertion(
355 new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));
356 continue;
357 case 'B':
358 Advance(2);
359 builder->AddAssertion(
360 new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));
361 continue;
362 // AtomEscape ::
363 // CharacterClassEscape
364 //
365 // CharacterClassEscape :: one of
366 // d D s S w W
367 case 'd':
368 case 'D':
369 case 's':
370 case 'S':
371 case 'w':
372 case 'W': {
373 uc32 c = Next();
374 Advance(2);
375 ZoneList<CharacterRange>* ranges =
376 new (zone()) ZoneList<CharacterRange>(2, zone());
377 CharacterRange::AddClassEscape(c, ranges, zone());
378 RegExpCharacterClass* cc =
379 new (zone()) RegExpCharacterClass(ranges, false);
380 builder->AddCharacterClass(cc);
381 break;
382 }
383 case 'p':
384 case 'P': {
385 uc32 p = Next();
386 Advance(2);
387 if (unicode()) {
388 if (FLAG_harmony_regexp_property) {
389 ZoneList<CharacterRange>* ranges =
390 new (zone()) ZoneList<CharacterRange>(2, zone());
391 if (!ParsePropertyClass(ranges, p == 'P')) {
392 return ReportError(CStrVector("Invalid property name"));
393 }
394 RegExpCharacterClass* cc =
395 new (zone()) RegExpCharacterClass(ranges, false);
396 builder->AddCharacterClass(cc);
397 } else {
398 // With /u, no identity escapes except for syntax characters
399 // are allowed. Otherwise, all identity escapes are allowed.
400 return ReportError(CStrVector("Invalid escape"));
401 }
402 } else {
403 builder->AddCharacter(p);
404 }
405 break;
406 }
407 case '1':
408 case '2':
409 case '3':
410 case '4':
411 case '5':
412 case '6':
413 case '7':
414 case '8':
415 case '9': {
416 int index = 0;
417 bool is_backref = ParseBackReferenceIndex(&index CHECK_FAILED);
418 if (is_backref) {
419 if (state->IsInsideCaptureGroup(index)) {
420 // The back reference is inside the capture group it refers to.
421 // Nothing can possibly have been captured yet, so we use empty
422 // instead. This ensures that, when checking a back reference,
423 // the capture registers of the referenced capture are either
424 // both set or both cleared.
425 builder->AddEmpty();
426 } else {
427 RegExpCapture* capture = GetCapture(index);
428 RegExpTree* atom = new (zone()) RegExpBackReference(capture);
429 builder->AddAtom(atom);
430 }
431 break;
432 }
433 // With /u, no identity escapes except for syntax characters
434 // are allowed. Otherwise, all identity escapes are allowed.
435 if (unicode()) {
436 return ReportError(CStrVector("Invalid escape"));
437 }
438 uc32 first_digit = Next();
439 if (first_digit == '8' || first_digit == '9') {
440 builder->AddCharacter(first_digit);
441 Advance(2);
442 break;
443 }
444 }
445 // Fall through.
446 case '0': {
447 Advance();
448 if (unicode() && Next() >= '0' && Next() <= '9') {
449 // With /u, decimal escape with leading 0 are not parsed as octal.
450 return ReportError(CStrVector("Invalid decimal escape"));
451 }
452 uc32 octal = ParseOctalLiteral();
453 builder->AddCharacter(octal);
454 break;
455 }
456 // ControlEscape :: one of
457 // f n r t v
458 case 'f':
459 Advance(2);
460 builder->AddCharacter('\f');
461 break;
462 case 'n':
463 Advance(2);
464 builder->AddCharacter('\n');
465 break;
466 case 'r':
467 Advance(2);
468 builder->AddCharacter('\r');
469 break;
470 case 't':
471 Advance(2);
472 builder->AddCharacter('\t');
473 break;
474 case 'v':
475 Advance(2);
476 builder->AddCharacter('\v');
477 break;
478 case 'c': {
479 Advance();
480 uc32 controlLetter = Next();
481 // Special case if it is an ASCII letter.
482 // Convert lower case letters to uppercase.
483 uc32 letter = controlLetter & ~('a' ^ 'A');
484 if (letter < 'A' || 'Z' < letter) {
485 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
486 // This is outside the specification. We match JSC in
487 // reading the backslash as a literal character instead
488 // of as starting an escape.
489 if (unicode()) {
490 // With /u, invalid escapes are not treated as identity escapes.
491 return ReportError(CStrVector("Invalid unicode escape"));
492 }
493 builder->AddCharacter('\\');
494 } else {
495 Advance(2);
496 builder->AddCharacter(controlLetter & 0x1f);
497 }
498 break;
499 }
500 case 'x': {
501 Advance(2);
502 uc32 value;
503 if (ParseHexEscape(2, &value)) {
504 builder->AddCharacter(value);
505 } else if (!unicode()) {
506 builder->AddCharacter('x');
507 } else {
508 // With /u, invalid escapes are not treated as identity escapes.
509 return ReportError(CStrVector("Invalid escape"));
510 }
511 break;
512 }
513 case 'u': {
514 Advance(2);
515 uc32 value;
516 if (ParseUnicodeEscape(&value)) {
517 builder->AddEscapedUnicodeCharacter(value);
518 } else if (!unicode()) {
519 builder->AddCharacter('u');
520 } else {
521 // With /u, invalid escapes are not treated as identity escapes.
522 return ReportError(CStrVector("Invalid unicode escape"));
523 }
524 break;
525 }
526 case 'k':
527 if (FLAG_harmony_regexp_named_captures && unicode()) {
528 Advance(2);
529 ParseNamedBackReference(builder, state CHECK_FAILED);
530 break;
531 }
532 // Fall through.
533 default:
534 Advance();
535 // With /u, no identity escapes except for syntax characters
536 // are allowed. Otherwise, all identity escapes are allowed.
537 if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
538 builder->AddCharacter(current());
539 Advance();
540 } else {
541 return ReportError(CStrVector("Invalid escape"));
542 }
543 break;
544 }
545 break;
546 case '{': {
547 int dummy;
548 bool parsed = ParseIntervalQuantifier(&dummy, &dummy CHECK_FAILED);
549 if (parsed) return ReportError(CStrVector("Nothing to repeat"));
550 // Fall through.
551 }
552 case '}':
553 case ']':
554 if (unicode()) {
555 return ReportError(CStrVector("Lone quantifier brackets"));
556 }
557 // Fall through.
558 default:
559 builder->AddUnicodeCharacter(current());
560 Advance();
561 break;
562 } // end switch(current())
563
564 int min;
565 int max;
566 switch (current()) {
567 // QuantifierPrefix ::
568 // *
569 // +
570 // ?
571 // {
572 case '*':
573 min = 0;
574 max = RegExpTree::kInfinity;
575 Advance();
576 break;
577 case '+':
578 min = 1;
579 max = RegExpTree::kInfinity;
580 Advance();
581 break;
582 case '?':
583 min = 0;
584 max = 1;
585 Advance();
586 break;
587 case '{':
588 if (ParseIntervalQuantifier(&min, &max)) {
589 if (max < min) {
590 return ReportError(
591 CStrVector("numbers out of order in {} quantifier"));
592 }
593 break;
594 } else if (unicode()) {
595 // With /u, incomplete quantifiers are not allowed.
596 return ReportError(CStrVector("Incomplete quantifier"));
597 }
598 continue;
599 default:
600 continue;
601 }
602 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
603 if (current() == '?') {
604 quantifier_type = RegExpQuantifier::NON_GREEDY;
605 Advance();
606 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
607 // FLAG_regexp_possessive_quantifier is a debug-only flag.
608 quantifier_type = RegExpQuantifier::POSSESSIVE;
609 Advance();
610 }
611 if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
612 return ReportError(CStrVector("Invalid quantifier"));
613 }
614 }
615 }
616
617
618 #ifdef DEBUG
619 // Currently only used in an DCHECK.
IsSpecialClassEscape(uc32 c)620 static bool IsSpecialClassEscape(uc32 c) {
621 switch (c) {
622 case 'd':
623 case 'D':
624 case 's':
625 case 'S':
626 case 'w':
627 case 'W':
628 return true;
629 default:
630 return false;
631 }
632 }
633 #endif
634
635
636 // In order to know whether an escape is a backreference or not we have to scan
637 // the entire regexp and find the number of capturing parentheses. However we
638 // don't want to scan the regexp twice unless it is necessary. This mini-parser
639 // is called when needed. It can see the difference between capturing and
640 // noncapturing parentheses and can skip character classes and backslash-escaped
641 // characters.
ScanForCaptures()642 void RegExpParser::ScanForCaptures() {
643 // Start with captures started previous to current position
644 int capture_count = captures_started();
645 // Add count of captures after this position.
646 int n;
647 while ((n = current()) != kEndMarker) {
648 Advance();
649 switch (n) {
650 case '\\':
651 Advance();
652 break;
653 case '[': {
654 int c;
655 while ((c = current()) != kEndMarker) {
656 Advance();
657 if (c == '\\') {
658 Advance();
659 } else {
660 if (c == ']') break;
661 }
662 }
663 break;
664 }
665 case '(':
666 if (current() != '?') capture_count++;
667 break;
668 }
669 }
670 capture_count_ = capture_count;
671 is_scanned_for_captures_ = true;
672 }
673
674
ParseBackReferenceIndex(int * index_out)675 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {
676 DCHECK_EQ('\\', current());
677 DCHECK('1' <= Next() && Next() <= '9');
678 // Try to parse a decimal literal that is no greater than the total number
679 // of left capturing parentheses in the input.
680 int start = position();
681 int value = Next() - '0';
682 Advance(2);
683 while (true) {
684 uc32 c = current();
685 if (IsDecimalDigit(c)) {
686 value = 10 * value + (c - '0');
687 if (value > kMaxCaptures) {
688 Reset(start);
689 return false;
690 }
691 Advance();
692 } else {
693 break;
694 }
695 }
696 if (value > captures_started()) {
697 if (!is_scanned_for_captures_) {
698 int saved_position = position();
699 ScanForCaptures();
700 Reset(saved_position);
701 }
702 if (value > capture_count_) {
703 Reset(start);
704 return false;
705 }
706 }
707 *index_out = value;
708 return true;
709 }
710
push_code_unit(ZoneVector<uc16> * v,uint32_t code_unit)711 static void push_code_unit(ZoneVector<uc16>* v, uint32_t code_unit) {
712 if (code_unit <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
713 v->push_back(code_unit);
714 } else {
715 v->push_back(unibrow::Utf16::LeadSurrogate(code_unit));
716 v->push_back(unibrow::Utf16::TrailSurrogate(code_unit));
717 }
718 }
719
ParseCaptureGroupName()720 const ZoneVector<uc16>* RegExpParser::ParseCaptureGroupName() {
721 DCHECK(FLAG_harmony_regexp_named_captures);
722 DCHECK(unicode());
723
724 ZoneVector<uc16>* name =
725 new (zone()->New(sizeof(ZoneVector<uc16>))) ZoneVector<uc16>(zone());
726
727 bool at_start = true;
728 while (true) {
729 uc32 c = current();
730 Advance();
731
732 // Convert unicode escapes.
733 if (c == '\\' && current() == 'u') {
734 Advance();
735 if (!ParseUnicodeEscape(&c)) {
736 ReportError(CStrVector("Invalid Unicode escape sequence"));
737 return nullptr;
738 }
739 }
740
741 if (at_start) {
742 if (!IdentifierStart::Is(c)) {
743 ReportError(CStrVector("Invalid capture group name"));
744 return nullptr;
745 }
746 push_code_unit(name, c);
747 at_start = false;
748 } else {
749 if (c == '>') {
750 break;
751 } else if (IdentifierPart::Is(c)) {
752 push_code_unit(name, c);
753 } else {
754 ReportError(CStrVector("Invalid capture group name"));
755 return nullptr;
756 }
757 }
758 }
759
760 return name;
761 }
762
CreateNamedCaptureAtIndex(const ZoneVector<uc16> * name,int index)763 bool RegExpParser::CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name,
764 int index) {
765 DCHECK(FLAG_harmony_regexp_named_captures);
766 DCHECK(unicode());
767 DCHECK(0 < index && index <= captures_started_);
768 DCHECK_NOT_NULL(name);
769
770 if (named_captures_ == nullptr) {
771 named_captures_ = new (zone()) ZoneList<RegExpCapture*>(1, zone());
772 } else {
773 // Check for duplicates and bail if we find any.
774 for (const auto& named_capture : *named_captures_) {
775 if (*named_capture->name() == *name) {
776 ReportError(CStrVector("Duplicate capture group name"));
777 return false;
778 }
779 }
780 }
781
782 RegExpCapture* capture = GetCapture(index);
783 DCHECK(capture->name() == nullptr);
784
785 capture->set_name(name);
786 named_captures_->Add(capture, zone());
787
788 return true;
789 }
790
ParseNamedBackReference(RegExpBuilder * builder,RegExpParserState * state)791 bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder,
792 RegExpParserState* state) {
793 // The parser is assumed to be on the '<' in \k<name>.
794 if (current() != '<') {
795 ReportError(CStrVector("Invalid named reference"));
796 return false;
797 }
798
799 Advance();
800 const ZoneVector<uc16>* name = ParseCaptureGroupName();
801 if (name == nullptr) {
802 return false;
803 }
804
805 if (state->IsInsideCaptureGroup(name)) {
806 builder->AddEmpty();
807 } else {
808 RegExpBackReference* atom = new (zone()) RegExpBackReference();
809 atom->set_name(name);
810
811 builder->AddAtom(atom);
812
813 if (named_back_references_ == nullptr) {
814 named_back_references_ =
815 new (zone()) ZoneList<RegExpBackReference*>(1, zone());
816 }
817 named_back_references_->Add(atom, zone());
818 }
819
820 return true;
821 }
822
PatchNamedBackReferences()823 void RegExpParser::PatchNamedBackReferences() {
824 if (named_back_references_ == nullptr) return;
825
826 if (named_captures_ == nullptr) {
827 ReportError(CStrVector("Invalid named capture referenced"));
828 return;
829 }
830
831 // Look up and patch the actual capture for each named back reference.
832 // TODO(jgruber): O(n^2), optimize if necessary.
833
834 for (int i = 0; i < named_back_references_->length(); i++) {
835 RegExpBackReference* ref = named_back_references_->at(i);
836
837 int index = -1;
838 for (const auto& capture : *named_captures_) {
839 if (*capture->name() == *ref->name()) {
840 index = capture->index();
841 break;
842 }
843 }
844
845 if (index == -1) {
846 ReportError(CStrVector("Invalid named capture referenced"));
847 return;
848 }
849
850 ref->set_capture(GetCapture(index));
851 }
852 }
853
GetCapture(int index)854 RegExpCapture* RegExpParser::GetCapture(int index) {
855 // The index for the capture groups are one-based. Its index in the list is
856 // zero-based.
857 int know_captures =
858 is_scanned_for_captures_ ? capture_count_ : captures_started_;
859 DCHECK(index <= know_captures);
860 if (captures_ == NULL) {
861 captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());
862 }
863 while (captures_->length() < know_captures) {
864 captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());
865 }
866 return captures_->at(index - 1);
867 }
868
CreateCaptureNameMap()869 Handle<FixedArray> RegExpParser::CreateCaptureNameMap() {
870 if (named_captures_ == nullptr || named_captures_->is_empty())
871 return Handle<FixedArray>();
872
873 Factory* factory = isolate()->factory();
874
875 int len = named_captures_->length() * 2;
876 Handle<FixedArray> array = factory->NewFixedArray(len);
877
878 for (int i = 0; i < named_captures_->length(); i++) {
879 RegExpCapture* capture = named_captures_->at(i);
880 MaybeHandle<String> name = factory->NewStringFromTwoByte(capture->name());
881 array->set(i * 2, *name.ToHandleChecked());
882 array->set(i * 2 + 1, Smi::FromInt(capture->index()));
883 }
884
885 return array;
886 }
887
IsInsideCaptureGroup(int index)888 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {
889 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
890 if (s->group_type() != CAPTURE) continue;
891 // Return true if we found the matching capture index.
892 if (index == s->capture_index()) return true;
893 // Abort if index is larger than what has been parsed up till this state.
894 if (index > s->capture_index()) return false;
895 }
896 return false;
897 }
898
IsInsideCaptureGroup(const ZoneVector<uc16> * name)899 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(
900 const ZoneVector<uc16>* name) {
901 DCHECK_NOT_NULL(name);
902 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {
903 if (s->capture_name() == nullptr) continue;
904 if (*s->capture_name() == *name) return true;
905 }
906 return false;
907 }
908
909 // QuantifierPrefix ::
910 // { DecimalDigits }
911 // { DecimalDigits , }
912 // { DecimalDigits , DecimalDigits }
913 //
914 // Returns true if parsing succeeds, and set the min_out and max_out
915 // values. Values are truncated to RegExpTree::kInfinity if they overflow.
ParseIntervalQuantifier(int * min_out,int * max_out)916 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {
917 DCHECK_EQ(current(), '{');
918 int start = position();
919 Advance();
920 int min = 0;
921 if (!IsDecimalDigit(current())) {
922 Reset(start);
923 return false;
924 }
925 while (IsDecimalDigit(current())) {
926 int next = current() - '0';
927 if (min > (RegExpTree::kInfinity - next) / 10) {
928 // Overflow. Skip past remaining decimal digits and return -1.
929 do {
930 Advance();
931 } while (IsDecimalDigit(current()));
932 min = RegExpTree::kInfinity;
933 break;
934 }
935 min = 10 * min + next;
936 Advance();
937 }
938 int max = 0;
939 if (current() == '}') {
940 max = min;
941 Advance();
942 } else if (current() == ',') {
943 Advance();
944 if (current() == '}') {
945 max = RegExpTree::kInfinity;
946 Advance();
947 } else {
948 while (IsDecimalDigit(current())) {
949 int next = current() - '0';
950 if (max > (RegExpTree::kInfinity - next) / 10) {
951 do {
952 Advance();
953 } while (IsDecimalDigit(current()));
954 max = RegExpTree::kInfinity;
955 break;
956 }
957 max = 10 * max + next;
958 Advance();
959 }
960 if (current() != '}') {
961 Reset(start);
962 return false;
963 }
964 Advance();
965 }
966 } else {
967 Reset(start);
968 return false;
969 }
970 *min_out = min;
971 *max_out = max;
972 return true;
973 }
974
975
ParseOctalLiteral()976 uc32 RegExpParser::ParseOctalLiteral() {
977 DCHECK(('0' <= current() && current() <= '7') || current() == kEndMarker);
978 // For compatibility with some other browsers (not all), we parse
979 // up to three octal digits with a value below 256.
980 uc32 value = current() - '0';
981 Advance();
982 if ('0' <= current() && current() <= '7') {
983 value = value * 8 + current() - '0';
984 Advance();
985 if (value < 32 && '0' <= current() && current() <= '7') {
986 value = value * 8 + current() - '0';
987 Advance();
988 }
989 }
990 return value;
991 }
992
993
ParseHexEscape(int length,uc32 * value)994 bool RegExpParser::ParseHexEscape(int length, uc32* value) {
995 int start = position();
996 uc32 val = 0;
997 for (int i = 0; i < length; ++i) {
998 uc32 c = current();
999 int d = HexValue(c);
1000 if (d < 0) {
1001 Reset(start);
1002 return false;
1003 }
1004 val = val * 16 + d;
1005 Advance();
1006 }
1007 *value = val;
1008 return true;
1009 }
1010
1011 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uc32 * value)1012 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
1013 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
1014 // allowed). In the latter case, the number of hex digits between { } is
1015 // arbitrary. \ and u have already been read.
1016 if (current() == '{' && unicode()) {
1017 int start = position();
1018 Advance();
1019 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
1020 if (current() == '}') {
1021 Advance();
1022 return true;
1023 }
1024 }
1025 Reset(start);
1026 return false;
1027 }
1028 // \u but no {, or \u{...} escapes not allowed.
1029 bool result = ParseHexEscape(4, value);
1030 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
1031 current() == '\\') {
1032 // Attempt to read trail surrogate.
1033 int start = position();
1034 if (Next() == 'u') {
1035 Advance(2);
1036 uc32 trail;
1037 if (ParseHexEscape(4, &trail) &&
1038 unibrow::Utf16::IsTrailSurrogate(trail)) {
1039 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
1040 static_cast<uc16>(trail));
1041 return true;
1042 }
1043 }
1044 Reset(start);
1045 }
1046 return result;
1047 }
1048
1049 #ifdef V8_I18N_SUPPORT
1050
1051 namespace {
1052
IsExactPropertyAlias(const char * property_name,UProperty property)1053 bool IsExactPropertyAlias(const char* property_name, UProperty property) {
1054 const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1055 if (short_name != NULL && strcmp(property_name, short_name) == 0) return true;
1056 for (int i = 0;; i++) {
1057 const char* long_name = u_getPropertyName(
1058 property, static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1059 if (long_name == NULL) break;
1060 if (strcmp(property_name, long_name) == 0) return true;
1061 }
1062 return false;
1063 }
1064
IsExactPropertyValueAlias(const char * property_value_name,UProperty property,int32_t property_value)1065 bool IsExactPropertyValueAlias(const char* property_value_name,
1066 UProperty property, int32_t property_value) {
1067 const char* short_name =
1068 u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1069 if (short_name != NULL && strcmp(property_value_name, short_name) == 0) {
1070 return true;
1071 }
1072 for (int i = 0;; i++) {
1073 const char* long_name = u_getPropertyValueName(
1074 property, property_value,
1075 static_cast<UPropertyNameChoice>(U_LONG_PROPERTY_NAME + i));
1076 if (long_name == NULL) break;
1077 if (strcmp(property_value_name, long_name) == 0) return true;
1078 }
1079 return false;
1080 }
1081
LookupPropertyValueName(UProperty property,const char * property_value_name,bool negate,ZoneList<CharacterRange> * result,Zone * zone)1082 bool LookupPropertyValueName(UProperty property,
1083 const char* property_value_name, bool negate,
1084 ZoneList<CharacterRange>* result, Zone* zone) {
1085 int32_t property_value =
1086 u_getPropertyValueEnum(property, property_value_name);
1087 if (property_value == UCHAR_INVALID_CODE) return false;
1088
1089 // We require the property name to match exactly to one of the property value
1090 // aliases. However, u_getPropertyValueEnum uses loose matching.
1091 if (!IsExactPropertyValueAlias(property_value_name, property,
1092 property_value)) {
1093 return false;
1094 }
1095
1096 USet* set = uset_openEmpty();
1097 UErrorCode ec = U_ZERO_ERROR;
1098 uset_applyIntPropertyValue(set, property, property_value, &ec);
1099 bool success = ec == U_ZERO_ERROR && !uset_isEmpty(set);
1100
1101 if (success) {
1102 uset_removeAllStrings(set);
1103 if (negate) uset_complement(set);
1104 int item_count = uset_getItemCount(set);
1105 int item_result = 0;
1106 for (int i = 0; i < item_count; i++) {
1107 uc32 start = 0;
1108 uc32 end = 0;
1109 item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
1110 result->Add(CharacterRange::Range(start, end), zone);
1111 }
1112 DCHECK_EQ(U_ZERO_ERROR, ec);
1113 DCHECK_EQ(0, item_result);
1114 }
1115 uset_close(set);
1116 return success;
1117 }
1118
1119 template <size_t N>
NameEquals(const char * name,const char (& literal)[N])1120 inline bool NameEquals(const char* name, const char (&literal)[N]) {
1121 return strncmp(name, literal, N + 1) == 0;
1122 }
1123
LookupSpecialPropertyValueName(const char * name,ZoneList<CharacterRange> * result,bool negate,Zone * zone)1124 bool LookupSpecialPropertyValueName(const char* name,
1125 ZoneList<CharacterRange>* result,
1126 bool negate, Zone* zone) {
1127 if (NameEquals(name, "Any")) {
1128 if (!negate) result->Add(CharacterRange::Everything(), zone);
1129 } else if (NameEquals(name, "ASCII")) {
1130 result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
1131 : CharacterRange::Range(0x0, 0x7f),
1132 zone);
1133 } else if (NameEquals(name, "Assigned")) {
1134 return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY, "Unassigned",
1135 !negate, result, zone);
1136 } else {
1137 return false;
1138 }
1139 return true;
1140 }
1141
1142 } // anonymous namespace
1143
ParsePropertyClass(ZoneList<CharacterRange> * result,bool negate)1144 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
1145 bool negate) {
1146 // Parse the property class as follows:
1147 // - In \p{name}, 'name' is interpreted
1148 // - either as a general category property value name.
1149 // - or as a binary property name.
1150 // - In \p{name=value}, 'name' is interpreted as an enumerated property name,
1151 // and 'value' is interpreted as one of the available property value names.
1152 // - Aliases in PropertyAlias.txt and PropertyValueAlias.txt can be used.
1153 // - Loose matching is not applied.
1154 List<char> first_part;
1155 List<char> second_part;
1156 if (current() == '{') {
1157 // Parse \p{[PropertyName=]PropertyNameValue}
1158 for (Advance(); current() != '}' && current() != '='; Advance()) {
1159 if (!has_next()) return false;
1160 first_part.Add(static_cast<char>(current()));
1161 }
1162 if (current() == '=') {
1163 for (Advance(); current() != '}'; Advance()) {
1164 if (!has_next()) return false;
1165 second_part.Add(static_cast<char>(current()));
1166 }
1167 second_part.Add(0); // null-terminate string.
1168 }
1169 } else {
1170 return false;
1171 }
1172 Advance();
1173 first_part.Add(0); // null-terminate string.
1174
1175 if (second_part.is_empty()) {
1176 // First attempt to interpret as general category property value name.
1177 const char* name = first_part.ToConstVector().start();
1178 if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
1179 result, zone())) {
1180 return true;
1181 }
1182 // Interpret "Any", "ASCII", and "Assigned".
1183 if (LookupSpecialPropertyValueName(name, result, negate, zone())) {
1184 return true;
1185 }
1186 // Then attempt to interpret as binary property name with value name 'Y'.
1187 UProperty property = u_getPropertyEnum(name);
1188 if (property < UCHAR_BINARY_START) return false;
1189 if (property >= UCHAR_BINARY_LIMIT) return false;
1190 if (!IsExactPropertyAlias(name, property)) return false;
1191 return LookupPropertyValueName(property, negate ? "N" : "Y", false, result,
1192 zone());
1193 } else {
1194 // Both property name and value name are specified. Attempt to interpret
1195 // the property name as enumerated property.
1196 const char* property_name = first_part.ToConstVector().start();
1197 const char* value_name = second_part.ToConstVector().start();
1198 UProperty property = u_getPropertyEnum(property_name);
1199 if (property < UCHAR_INT_START) return false;
1200 if (property >= UCHAR_INT_LIMIT) return false;
1201 if (!IsExactPropertyAlias(property_name, property)) return false;
1202 return LookupPropertyValueName(property, value_name, negate, result,
1203 zone());
1204 }
1205 }
1206
1207 #else // V8_I18N_SUPPORT
1208
ParsePropertyClass(ZoneList<CharacterRange> * result,bool negate)1209 bool RegExpParser::ParsePropertyClass(ZoneList<CharacterRange>* result,
1210 bool negate) {
1211 return false;
1212 }
1213
1214 #endif // V8_I18N_SUPPORT
1215
ParseUnlimitedLengthHexNumber(int max_value,uc32 * value)1216 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
1217 uc32 x = 0;
1218 int d = HexValue(current());
1219 if (d < 0) {
1220 return false;
1221 }
1222 while (d >= 0) {
1223 x = x * 16 + d;
1224 if (x > max_value) {
1225 return false;
1226 }
1227 Advance();
1228 d = HexValue(current());
1229 }
1230 *value = x;
1231 return true;
1232 }
1233
1234
ParseClassCharacterEscape()1235 uc32 RegExpParser::ParseClassCharacterEscape() {
1236 DCHECK(current() == '\\');
1237 DCHECK(has_next() && !IsSpecialClassEscape(Next()));
1238 Advance();
1239 switch (current()) {
1240 case 'b':
1241 Advance();
1242 return '\b';
1243 // ControlEscape :: one of
1244 // f n r t v
1245 case 'f':
1246 Advance();
1247 return '\f';
1248 case 'n':
1249 Advance();
1250 return '\n';
1251 case 'r':
1252 Advance();
1253 return '\r';
1254 case 't':
1255 Advance();
1256 return '\t';
1257 case 'v':
1258 Advance();
1259 return '\v';
1260 case 'c': {
1261 uc32 controlLetter = Next();
1262 uc32 letter = controlLetter & ~('A' ^ 'a');
1263 // For compatibility with JSC, inside a character class. We also accept
1264 // digits and underscore as control characters, unless with /u.
1265 if (letter >= 'A' && letter <= 'Z') {
1266 Advance(2);
1267 // Control letters mapped to ASCII control characters in the range
1268 // 0x00-0x1f.
1269 return controlLetter & 0x1f;
1270 }
1271 if (unicode()) {
1272 // With /u, invalid escapes are not treated as identity escapes.
1273 ReportError(CStrVector("Invalid class escape"));
1274 return 0;
1275 }
1276 if ((controlLetter >= '0' && controlLetter <= '9') ||
1277 controlLetter == '_') {
1278 Advance(2);
1279 return controlLetter & 0x1f;
1280 }
1281 // We match JSC in reading the backslash as a literal
1282 // character instead of as starting an escape.
1283 return '\\';
1284 }
1285 case '0':
1286 // With /u, \0 is interpreted as NUL if not followed by another digit.
1287 if (unicode() && !(Next() >= '0' && Next() <= '9')) {
1288 Advance();
1289 return 0;
1290 }
1291 // Fall through.
1292 case '1':
1293 case '2':
1294 case '3':
1295 case '4':
1296 case '5':
1297 case '6':
1298 case '7':
1299 // For compatibility, we interpret a decimal escape that isn't
1300 // a back reference (and therefore either \0 or not valid according
1301 // to the specification) as a 1..3 digit octal character code.
1302 if (unicode()) {
1303 // With /u, decimal escape is not interpreted as octal character code.
1304 ReportError(CStrVector("Invalid class escape"));
1305 return 0;
1306 }
1307 return ParseOctalLiteral();
1308 case 'x': {
1309 Advance();
1310 uc32 value;
1311 if (ParseHexEscape(2, &value)) return value;
1312 if (unicode()) {
1313 // With /u, invalid escapes are not treated as identity escapes.
1314 ReportError(CStrVector("Invalid escape"));
1315 return 0;
1316 }
1317 // If \x is not followed by a two-digit hexadecimal, treat it
1318 // as an identity escape.
1319 return 'x';
1320 }
1321 case 'u': {
1322 Advance();
1323 uc32 value;
1324 if (ParseUnicodeEscape(&value)) return value;
1325 if (unicode()) {
1326 // With /u, invalid escapes are not treated as identity escapes.
1327 ReportError(CStrVector("Invalid unicode escape"));
1328 return 0;
1329 }
1330 // If \u is not followed by a two-digit hexadecimal, treat it
1331 // as an identity escape.
1332 return 'u';
1333 }
1334 default: {
1335 uc32 result = current();
1336 // With /u, no identity escapes except for syntax characters and '-' are
1337 // allowed. Otherwise, all identity escapes are allowed.
1338 if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
1339 Advance();
1340 return result;
1341 }
1342 ReportError(CStrVector("Invalid escape"));
1343 return 0;
1344 }
1345 }
1346 return 0;
1347 }
1348
1349
ParseClassAtom(uc16 * char_class)1350 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
1351 DCHECK_EQ(0, *char_class);
1352 uc32 first = current();
1353 if (first == '\\') {
1354 switch (Next()) {
1355 case 'w':
1356 case 'W':
1357 case 'd':
1358 case 'D':
1359 case 's':
1360 case 'S': {
1361 *char_class = Next();
1362 Advance(2);
1363 return CharacterRange::Singleton(0); // Return dummy value.
1364 }
1365 case kEndMarker:
1366 return ReportError(CStrVector("\\ at end of pattern"));
1367 default:
1368 first = ParseClassCharacterEscape(CHECK_FAILED);
1369 }
1370 } else {
1371 Advance();
1372 }
1373
1374 return CharacterRange::Singleton(first);
1375 }
1376
1377 static const uc16 kNoCharClass = 0;
1378
1379 // Adds range or pre-defined character class to character ranges.
1380 // If char_class is not kInvalidClass, it's interpreted as a class
1381 // escape (i.e., 's' means whitespace, from '\s').
AddRangeOrEscape(ZoneList<CharacterRange> * ranges,uc16 char_class,CharacterRange range,Zone * zone)1382 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
1383 uc16 char_class, CharacterRange range,
1384 Zone* zone) {
1385 if (char_class != kNoCharClass) {
1386 CharacterRange::AddClassEscape(char_class, ranges, zone);
1387 } else {
1388 ranges->Add(range, zone);
1389 }
1390 }
1391
ParseClassProperty(ZoneList<CharacterRange> * ranges)1392 bool RegExpParser::ParseClassProperty(ZoneList<CharacterRange>* ranges) {
1393 if (!FLAG_harmony_regexp_property) return false;
1394 if (!unicode()) return false;
1395 if (current() != '\\') return false;
1396 uc32 next = Next();
1397 bool parse_success = false;
1398 if (next == 'p') {
1399 Advance(2);
1400 parse_success = ParsePropertyClass(ranges, false);
1401 } else if (next == 'P') {
1402 Advance(2);
1403 parse_success = ParsePropertyClass(ranges, true);
1404 } else {
1405 return false;
1406 }
1407 if (!parse_success)
1408 ReportError(CStrVector("Invalid property name in character class"));
1409 return parse_success;
1410 }
1411
ParseCharacterClass()1412 RegExpTree* RegExpParser::ParseCharacterClass() {
1413 static const char* kUnterminated = "Unterminated character class";
1414 static const char* kRangeInvalid = "Invalid character class";
1415 static const char* kRangeOutOfOrder = "Range out of order in character class";
1416
1417 DCHECK_EQ(current(), '[');
1418 Advance();
1419 bool is_negated = false;
1420 if (current() == '^') {
1421 is_negated = true;
1422 Advance();
1423 }
1424 ZoneList<CharacterRange>* ranges =
1425 new (zone()) ZoneList<CharacterRange>(2, zone());
1426 while (has_more() && current() != ']') {
1427 bool parsed_property = ParseClassProperty(ranges CHECK_FAILED);
1428 if (parsed_property) continue;
1429 uc16 char_class = kNoCharClass;
1430 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
1431 if (current() == '-') {
1432 Advance();
1433 if (current() == kEndMarker) {
1434 // If we reach the end we break out of the loop and let the
1435 // following code report an error.
1436 break;
1437 } else if (current() == ']') {
1438 AddRangeOrEscape(ranges, char_class, first, zone());
1439 ranges->Add(CharacterRange::Singleton('-'), zone());
1440 break;
1441 }
1442 uc16 char_class_2 = kNoCharClass;
1443 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
1444 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
1445 // Either end is an escaped character class. Treat the '-' verbatim.
1446 if (unicode()) {
1447 // ES2015 21.2.2.15.1 step 1.
1448 return ReportError(CStrVector(kRangeInvalid));
1449 }
1450 AddRangeOrEscape(ranges, char_class, first, zone());
1451 ranges->Add(CharacterRange::Singleton('-'), zone());
1452 AddRangeOrEscape(ranges, char_class_2, next, zone());
1453 continue;
1454 }
1455 // ES2015 21.2.2.15.1 step 6.
1456 if (first.from() > next.to()) {
1457 return ReportError(CStrVector(kRangeOutOfOrder));
1458 }
1459 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
1460 } else {
1461 AddRangeOrEscape(ranges, char_class, first, zone());
1462 }
1463 }
1464 if (!has_more()) {
1465 return ReportError(CStrVector(kUnterminated));
1466 }
1467 Advance();
1468 if (ranges->length() == 0) {
1469 ranges->Add(CharacterRange::Everything(), zone());
1470 is_negated = !is_negated;
1471 }
1472 return new (zone()) RegExpCharacterClass(ranges, is_negated);
1473 }
1474
1475
1476 #undef CHECK_FAILED
1477
1478
ParseRegExp(Isolate * isolate,Zone * zone,FlatStringReader * input,JSRegExp::Flags flags,RegExpCompileData * result)1479 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
1480 FlatStringReader* input, JSRegExp::Flags flags,
1481 RegExpCompileData* result) {
1482 DCHECK(result != NULL);
1483 RegExpParser parser(input, &result->error, flags, isolate, zone);
1484 RegExpTree* tree = parser.ParsePattern();
1485 if (parser.failed()) {
1486 DCHECK(tree == NULL);
1487 DCHECK(!result->error.is_null());
1488 } else {
1489 DCHECK(tree != NULL);
1490 DCHECK(result->error.is_null());
1491 if (FLAG_trace_regexp_parser) {
1492 OFStream os(stdout);
1493 tree->Print(os, zone);
1494 os << "\n";
1495 }
1496 result->tree = tree;
1497 int capture_count = parser.captures_started();
1498 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1499 result->contains_anchor = parser.contains_anchor();
1500 result->capture_name_map = parser.CreateCaptureNameMap();
1501 result->capture_count = capture_count;
1502 }
1503 return !parser.failed();
1504 }
1505
RegExpBuilder(Zone * zone,bool ignore_case,bool unicode)1506 RegExpBuilder::RegExpBuilder(Zone* zone, bool ignore_case, bool unicode)
1507 : zone_(zone),
1508 pending_empty_(false),
1509 ignore_case_(ignore_case),
1510 unicode_(unicode),
1511 characters_(NULL),
1512 pending_surrogate_(kNoPendingSurrogate),
1513 terms_(),
1514 alternatives_()
1515 #ifdef DEBUG
1516 ,
1517 last_added_(ADD_NONE)
1518 #endif
1519 {
1520 }
1521
1522
AddLeadSurrogate(uc16 lead_surrogate)1523 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
1524 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1525 FlushPendingSurrogate();
1526 // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
1527 pending_surrogate_ = lead_surrogate;
1528 }
1529
1530
AddTrailSurrogate(uc16 trail_surrogate)1531 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1532 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1533 if (pending_surrogate_ != kNoPendingSurrogate) {
1534 uc16 lead_surrogate = pending_surrogate_;
1535 pending_surrogate_ = kNoPendingSurrogate;
1536 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1537 uc32 combined =
1538 unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
1539 if (NeedsDesugaringForIgnoreCase(combined)) {
1540 AddCharacterClassForDesugaring(combined);
1541 } else {
1542 ZoneList<uc16> surrogate_pair(2, zone());
1543 surrogate_pair.Add(lead_surrogate, zone());
1544 surrogate_pair.Add(trail_surrogate, zone());
1545 RegExpAtom* atom =
1546 new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
1547 AddAtom(atom);
1548 }
1549 } else {
1550 pending_surrogate_ = trail_surrogate;
1551 FlushPendingSurrogate();
1552 }
1553 }
1554
1555
FlushPendingSurrogate()1556 void RegExpBuilder::FlushPendingSurrogate() {
1557 if (pending_surrogate_ != kNoPendingSurrogate) {
1558 DCHECK(unicode());
1559 uc32 c = pending_surrogate_;
1560 pending_surrogate_ = kNoPendingSurrogate;
1561 AddCharacterClassForDesugaring(c);
1562 }
1563 }
1564
1565
FlushCharacters()1566 void RegExpBuilder::FlushCharacters() {
1567 FlushPendingSurrogate();
1568 pending_empty_ = false;
1569 if (characters_ != NULL) {
1570 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
1571 characters_ = NULL;
1572 text_.Add(atom, zone());
1573 LAST(ADD_ATOM);
1574 }
1575 }
1576
1577
FlushText()1578 void RegExpBuilder::FlushText() {
1579 FlushCharacters();
1580 int num_text = text_.length();
1581 if (num_text == 0) {
1582 return;
1583 } else if (num_text == 1) {
1584 terms_.Add(text_.last(), zone());
1585 } else {
1586 RegExpText* text = new (zone()) RegExpText(zone());
1587 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1588 terms_.Add(text, zone());
1589 }
1590 text_.Clear();
1591 }
1592
1593
AddCharacter(uc16 c)1594 void RegExpBuilder::AddCharacter(uc16 c) {
1595 FlushPendingSurrogate();
1596 pending_empty_ = false;
1597 if (NeedsDesugaringForIgnoreCase(c)) {
1598 AddCharacterClassForDesugaring(c);
1599 } else {
1600 if (characters_ == NULL) {
1601 characters_ = new (zone()) ZoneList<uc16>(4, zone());
1602 }
1603 characters_->Add(c, zone());
1604 LAST(ADD_CHAR);
1605 }
1606 }
1607
1608
AddUnicodeCharacter(uc32 c)1609 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1610 if (c > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
1611 DCHECK(unicode());
1612 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
1613 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1614 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1615 AddLeadSurrogate(c);
1616 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1617 AddTrailSurrogate(c);
1618 } else {
1619 AddCharacter(static_cast<uc16>(c));
1620 }
1621 }
1622
AddEscapedUnicodeCharacter(uc32 character)1623 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1624 // A lead or trail surrogate parsed via escape sequence will not
1625 // pair up with any preceding lead or following trail surrogate.
1626 FlushPendingSurrogate();
1627 AddUnicodeCharacter(character);
1628 FlushPendingSurrogate();
1629 }
1630
AddEmpty()1631 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1632
1633
AddCharacterClass(RegExpCharacterClass * cc)1634 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1635 if (NeedsDesugaringForUnicode(cc)) {
1636 // With /u, character class needs to be desugared, so it
1637 // must be a standalone term instead of being part of a RegExpText.
1638 AddTerm(cc);
1639 } else {
1640 AddAtom(cc);
1641 }
1642 }
1643
AddCharacterClassForDesugaring(uc32 c)1644 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
1645 AddTerm(new (zone()) RegExpCharacterClass(
1646 CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
1647 }
1648
1649
AddAtom(RegExpTree * term)1650 void RegExpBuilder::AddAtom(RegExpTree* term) {
1651 if (term->IsEmpty()) {
1652 AddEmpty();
1653 return;
1654 }
1655 if (term->IsTextElement()) {
1656 FlushCharacters();
1657 text_.Add(term, zone());
1658 } else {
1659 FlushText();
1660 terms_.Add(term, zone());
1661 }
1662 LAST(ADD_ATOM);
1663 }
1664
1665
AddTerm(RegExpTree * term)1666 void RegExpBuilder::AddTerm(RegExpTree* term) {
1667 FlushText();
1668 terms_.Add(term, zone());
1669 LAST(ADD_ATOM);
1670 }
1671
1672
AddAssertion(RegExpTree * assert)1673 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1674 FlushText();
1675 terms_.Add(assert, zone());
1676 LAST(ADD_ASSERT);
1677 }
1678
1679
NewAlternative()1680 void RegExpBuilder::NewAlternative() { FlushTerms(); }
1681
1682
FlushTerms()1683 void RegExpBuilder::FlushTerms() {
1684 FlushText();
1685 int num_terms = terms_.length();
1686 RegExpTree* alternative;
1687 if (num_terms == 0) {
1688 alternative = new (zone()) RegExpEmpty();
1689 } else if (num_terms == 1) {
1690 alternative = terms_.last();
1691 } else {
1692 alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
1693 }
1694 alternatives_.Add(alternative, zone());
1695 terms_.Clear();
1696 LAST(ADD_NONE);
1697 }
1698
1699
NeedsDesugaringForUnicode(RegExpCharacterClass * cc)1700 bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
1701 if (!unicode()) return false;
1702 // TODO(yangguo): we could be smarter than this. Case-insensitivity does not
1703 // necessarily mean that we need to desugar. It's probably nicer to have a
1704 // separate pass to figure out unicode desugarings.
1705 if (ignore_case()) return true;
1706 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
1707 CharacterRange::Canonicalize(ranges);
1708 for (int i = ranges->length() - 1; i >= 0; i--) {
1709 uc32 from = ranges->at(i).from();
1710 uc32 to = ranges->at(i).to();
1711 // Check for non-BMP characters.
1712 if (to >= kNonBmpStart) return true;
1713 // Check for lone surrogates.
1714 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
1715 }
1716 return false;
1717 }
1718
1719
NeedsDesugaringForIgnoreCase(uc32 c)1720 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
1721 #ifdef V8_I18N_SUPPORT
1722 if (unicode() && ignore_case()) {
1723 USet* set = uset_open(c, c);
1724 uset_closeOver(set, USET_CASE_INSENSITIVE);
1725 uset_removeAllStrings(set);
1726 bool result = uset_size(set) > 1;
1727 uset_close(set);
1728 return result;
1729 }
1730 // In the case where ICU is not included, we act as if the unicode flag is
1731 // not set, and do not desugar.
1732 #endif // V8_I18N_SUPPORT
1733 return false;
1734 }
1735
1736
ToRegExp()1737 RegExpTree* RegExpBuilder::ToRegExp() {
1738 FlushTerms();
1739 int num_alternatives = alternatives_.length();
1740 if (num_alternatives == 0) return new (zone()) RegExpEmpty();
1741 if (num_alternatives == 1) return alternatives_.last();
1742 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
1743 }
1744
AddQuantifierToAtom(int min,int max,RegExpQuantifier::QuantifierType quantifier_type)1745 bool RegExpBuilder::AddQuantifierToAtom(
1746 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
1747 FlushPendingSurrogate();
1748 if (pending_empty_) {
1749 pending_empty_ = false;
1750 return true;
1751 }
1752 RegExpTree* atom;
1753 if (characters_ != NULL) {
1754 DCHECK(last_added_ == ADD_CHAR);
1755 // Last atom was character.
1756 Vector<const uc16> char_vector = characters_->ToConstVector();
1757 int num_chars = char_vector.length();
1758 if (num_chars > 1) {
1759 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
1760 text_.Add(new (zone()) RegExpAtom(prefix), zone());
1761 char_vector = char_vector.SubVector(num_chars - 1, num_chars);
1762 }
1763 characters_ = NULL;
1764 atom = new (zone()) RegExpAtom(char_vector);
1765 FlushText();
1766 } else if (text_.length() > 0) {
1767 DCHECK(last_added_ == ADD_ATOM);
1768 atom = text_.RemoveLast();
1769 FlushText();
1770 } else if (terms_.length() > 0) {
1771 DCHECK(last_added_ == ADD_ATOM);
1772 atom = terms_.RemoveLast();
1773 // With /u, lookarounds are not quantifiable.
1774 if (unicode() && atom->IsLookaround()) return false;
1775 if (atom->max_match() == 0) {
1776 // Guaranteed to only match an empty string.
1777 LAST(ADD_TERM);
1778 if (min == 0) {
1779 return true;
1780 }
1781 terms_.Add(atom, zone());
1782 return true;
1783 }
1784 } else {
1785 // Only call immediately after adding an atom or character!
1786 UNREACHABLE();
1787 return false;
1788 }
1789 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1790 zone());
1791 LAST(ADD_TERM);
1792 return true;
1793 }
1794
1795 } // namespace internal
1796 } // namespace v8
1797