1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_REGEXP_REGEXP_PARSER_H_
6 #define V8_REGEXP_REGEXP_PARSER_H_
7 
8 #include "src/objects.h"
9 #include "src/objects/js-regexp.h"
10 #include "src/regexp/regexp-ast.h"
11 #include "src/zone/zone.h"
12 
13 namespace v8 {
14 namespace internal {
15 
16 struct RegExpCompileData;
17 
18 // A BufferedZoneList is an automatically growing list, just like (and backed
19 // by) a ZoneList, that is optimized for the case of adding and removing
20 // a single element. The last element added is stored outside the backing list,
21 // and if no more than one element is ever added, the ZoneList isn't even
22 // allocated.
23 // Elements must not be nullptr pointers.
24 template <typename T, int initial_size>
25 class BufferedZoneList {
26  public:
BufferedZoneList()27   BufferedZoneList() : list_(nullptr), last_(nullptr) {}
28 
29   // Adds element at end of list. This element is buffered and can
30   // be read using last() or removed using RemoveLast until a new Add or until
31   // RemoveLast or GetList has been called.
Add(T * value,Zone * zone)32   void Add(T* value, Zone* zone) {
33     if (last_ != nullptr) {
34       if (list_ == nullptr) {
35         list_ = new (zone) ZoneList<T*>(initial_size, zone);
36       }
37       list_->Add(last_, zone);
38     }
39     last_ = value;
40   }
41 
last()42   T* last() {
43     DCHECK(last_ != nullptr);
44     return last_;
45   }
46 
RemoveLast()47   T* RemoveLast() {
48     DCHECK(last_ != nullptr);
49     T* result = last_;
50     if ((list_ != nullptr) && (list_->length() > 0))
51       last_ = list_->RemoveLast();
52     else
53       last_ = nullptr;
54     return result;
55   }
56 
Get(int i)57   T* Get(int i) {
58     DCHECK((0 <= i) && (i < length()));
59     if (list_ == nullptr) {
60       DCHECK_EQ(0, i);
61       return last_;
62     } else {
63       if (i == list_->length()) {
64         DCHECK(last_ != nullptr);
65         return last_;
66       } else {
67         return list_->at(i);
68       }
69     }
70   }
71 
Clear()72   void Clear() {
73     list_ = nullptr;
74     last_ = nullptr;
75   }
76 
length()77   int length() {
78     int length = (list_ == nullptr) ? 0 : list_->length();
79     return length + ((last_ == nullptr) ? 0 : 1);
80   }
81 
GetList(Zone * zone)82   ZoneList<T*>* GetList(Zone* zone) {
83     if (list_ == nullptr) {
84       list_ = new (zone) ZoneList<T*>(initial_size, zone);
85     }
86     if (last_ != nullptr) {
87       list_->Add(last_, zone);
88       last_ = nullptr;
89     }
90     return list_;
91   }
92 
93  private:
94   ZoneList<T*>* list_;
95   T* last_;
96 };
97 
98 
99 // Accumulates RegExp atoms and assertions into lists of terms and alternatives.
100 class RegExpBuilder : public ZoneObject {
101  public:
102   RegExpBuilder(Zone* zone, JSRegExp::Flags flags);
103   void AddCharacter(uc16 character);
104   void AddUnicodeCharacter(uc32 character);
105   void AddEscapedUnicodeCharacter(uc32 character);
106   // "Adds" an empty expression. Does nothing except consume a
107   // following quantifier
108   void AddEmpty();
109   void AddCharacterClass(RegExpCharacterClass* cc);
110   void AddCharacterClassForDesugaring(uc32 c);
111   void AddAtom(RegExpTree* tree);
112   void AddTerm(RegExpTree* tree);
113   void AddAssertion(RegExpTree* tree);
114   void NewAlternative();  // '|'
115   bool AddQuantifierToAtom(int min, int max,
116                            RegExpQuantifier::QuantifierType type);
117   void FlushText();
118   RegExpTree* ToRegExp();
flags()119   JSRegExp::Flags flags() const { return flags_; }
set_flags(JSRegExp::Flags flags)120   void set_flags(JSRegExp::Flags flags) { flags_ = flags; }
121 
ignore_case()122   bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
multiline()123   bool multiline() const { return (flags_ & JSRegExp::kMultiline) != 0; }
dotall()124   bool dotall() const { return (flags_ & JSRegExp::kDotAll) != 0; }
125 
126  private:
127   static const uc16 kNoPendingSurrogate = 0;
128   void AddLeadSurrogate(uc16 lead_surrogate);
129   void AddTrailSurrogate(uc16 trail_surrogate);
130   void FlushPendingSurrogate();
131   void FlushCharacters();
132   void FlushTerms();
133   bool NeedsDesugaringForUnicode(RegExpCharacterClass* cc);
134   bool NeedsDesugaringForIgnoreCase(uc32 c);
zone()135   Zone* zone() const { return zone_; }
unicode()136   bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
137 
138   Zone* zone_;
139   bool pending_empty_;
140   JSRegExp::Flags flags_;
141   ZoneList<uc16>* characters_;
142   uc16 pending_surrogate_;
143   BufferedZoneList<RegExpTree, 2> terms_;
144   BufferedZoneList<RegExpTree, 2> text_;
145   BufferedZoneList<RegExpTree, 2> alternatives_;
146 #ifdef DEBUG
147   enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
148 #define LAST(x) last_added_ = x;
149 #else
150 #define LAST(x)
151 #endif
152 };
153 
154 
155 class RegExpParser BASE_EMBEDDED {
156  public:
157   RegExpParser(FlatStringReader* in, Handle<String>* error,
158                JSRegExp::Flags flags, Isolate* isolate, Zone* zone);
159 
160   static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
161                           JSRegExp::Flags flags, RegExpCompileData* result);
162 
163   RegExpTree* ParsePattern();
164   RegExpTree* ParseDisjunction();
165   RegExpTree* ParseGroup();
166 
167   // Parses a {...,...} quantifier and stores the range in the given
168   // out parameters.
169   bool ParseIntervalQuantifier(int* min_out, int* max_out);
170 
171   // Parses and returns a single escaped character.  The character
172   // must not be 'b' or 'B' since they are usually handle specially.
173   uc32 ParseClassCharacterEscape();
174 
175   // Checks whether the following is a length-digit hexadecimal number,
176   // and sets the value if it is.
177   bool ParseHexEscape(int length, uc32* value);
178   bool ParseUnicodeEscape(uc32* value);
179   bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
180   bool ParsePropertyClass(ZoneList<CharacterRange>* result, bool negate);
181   RegExpTree* ParseCharacterClass(const RegExpBuilder* state);
182 
183   uc32 ParseOctalLiteral();
184 
185   // Tries to parse the input as a back reference.  If successful it
186   // stores the result in the output parameter and returns true.  If
187   // it fails it will push back the characters read so the same characters
188   // can be reparsed.
189   bool ParseBackReferenceIndex(int* index_out);
190 
191   // Parse inside a class. Either add escaped class to the range, or return
192   // false and pass parsed single character through |char_out|.
193   void ParseClassEscape(ZoneList<CharacterRange>* ranges, Zone* zone,
194                         bool add_unicode_case_equivalents, uc32* char_out,
195                         bool* is_class_escape);
196 
197   char ParseClassEscape();
198 
199   RegExpTree* ReportError(Vector<const char> message);
200   void Advance();
201   void Advance(int dist);
202   void Reset(int pos);
203 
204   // Reports whether the pattern might be used as a literal search string.
205   // Only use if the result of the parse is a single atom node.
206   bool simple();
contains_anchor()207   bool contains_anchor() { return contains_anchor_; }
set_contains_anchor()208   void set_contains_anchor() { contains_anchor_ = true; }
captures_started()209   int captures_started() { return captures_started_; }
position()210   int position() { return next_pos_ - 1; }
failed()211   bool failed() { return failed_; }
212   // The Unicode flag can't be changed using in-regexp syntax, so it's OK to
213   // just read the initial flag value here.
unicode()214   bool unicode() const { return (top_level_flags_ & JSRegExp::kUnicode) != 0; }
215 
216   static bool IsSyntaxCharacterOrSlash(uc32 c);
217 
218   static const int kMaxCaptures = 1 << 16;
219   static const uc32 kEndMarker = (1 << 21);
220 
221  private:
222   enum SubexpressionType {
223     INITIAL,
224     CAPTURE,  // All positive values represent captures.
225     POSITIVE_LOOKAROUND,
226     NEGATIVE_LOOKAROUND,
227     GROUPING
228   };
229 
230   class RegExpParserState : public ZoneObject {
231    public:
232     // Push a state on the stack.
RegExpParserState(RegExpParserState * previous_state,SubexpressionType group_type,RegExpLookaround::Type lookaround_type,int disjunction_capture_index,const ZoneVector<uc16> * capture_name,JSRegExp::Flags flags,Zone * zone)233     RegExpParserState(RegExpParserState* previous_state,
234                       SubexpressionType group_type,
235                       RegExpLookaround::Type lookaround_type,
236                       int disjunction_capture_index,
237                       const ZoneVector<uc16>* capture_name,
238                       JSRegExp::Flags flags, Zone* zone)
239         : previous_state_(previous_state),
240           builder_(new (zone) RegExpBuilder(zone, flags)),
241           group_type_(group_type),
242           lookaround_type_(lookaround_type),
243           disjunction_capture_index_(disjunction_capture_index),
244           capture_name_(capture_name) {}
245     // Parser state of containing expression, if any.
previous_state()246     RegExpParserState* previous_state() const { return previous_state_; }
IsSubexpression()247     bool IsSubexpression() { return previous_state_ != nullptr; }
248     // RegExpBuilder building this regexp's AST.
builder()249     RegExpBuilder* builder() const { return builder_; }
250     // Type of regexp being parsed (parenthesized group or entire regexp).
group_type()251     SubexpressionType group_type() const { return group_type_; }
252     // Lookahead or Lookbehind.
lookaround_type()253     RegExpLookaround::Type lookaround_type() const { return lookaround_type_; }
254     // Index in captures array of first capture in this sub-expression, if any.
255     // Also the capture index of this sub-expression itself, if group_type
256     // is CAPTURE.
capture_index()257     int capture_index() const { return disjunction_capture_index_; }
258     // The name of the current sub-expression, if group_type is CAPTURE. Only
259     // used for named captures.
capture_name()260     const ZoneVector<uc16>* capture_name() const { return capture_name_; }
261 
IsNamedCapture()262     bool IsNamedCapture() const { return capture_name_ != nullptr; }
263 
264     // Check whether the parser is inside a capture group with the given index.
265     bool IsInsideCaptureGroup(int index);
266     // Check whether the parser is inside a capture group with the given name.
267     bool IsInsideCaptureGroup(const ZoneVector<uc16>* name);
268 
269    private:
270     // Linked list implementation of stack of states.
271     RegExpParserState* const previous_state_;
272     // Builder for the stored disjunction.
273     RegExpBuilder* const builder_;
274     // Stored disjunction type (capture, look-ahead or grouping), if any.
275     const SubexpressionType group_type_;
276     // Stored read direction.
277     const RegExpLookaround::Type lookaround_type_;
278     // Stored disjunction's capture index (if any).
279     const int disjunction_capture_index_;
280     // Stored capture name (if any).
281     const ZoneVector<uc16>* const capture_name_;
282   };
283 
284   // Return the 1-indexed RegExpCapture object, allocate if necessary.
285   RegExpCapture* GetCapture(int index);
286 
287   // Creates a new named capture at the specified index. Must be called exactly
288   // once for each named capture. Fails if a capture with the same name is
289   // encountered.
290   bool CreateNamedCaptureAtIndex(const ZoneVector<uc16>* name, int index);
291 
292   // Parses the name of a capture group (?<name>pattern). The name must adhere
293   // to IdentifierName in the ECMAScript standard.
294   const ZoneVector<uc16>* ParseCaptureGroupName();
295 
296   bool ParseNamedBackReference(RegExpBuilder* builder,
297                                RegExpParserState* state);
298   RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
299 
300   // After the initial parsing pass, patch corresponding RegExpCapture objects
301   // into all RegExpBackReferences. This is done after initial parsing in order
302   // to avoid complicating cases in which references comes before the capture.
303   void PatchNamedBackReferences();
304 
305   Handle<FixedArray> CreateCaptureNameMap();
306 
307   // Returns true iff the pattern contains named captures. May call
308   // ScanForCaptures to look ahead at the remaining pattern.
309   bool HasNamedCaptures();
310 
isolate()311   Isolate* isolate() { return isolate_; }
zone()312   Zone* zone() const { return zone_; }
313 
current()314   uc32 current() { return current_; }
has_more()315   bool has_more() { return has_more_; }
has_next()316   bool has_next() { return next_pos_ < in()->length(); }
317   uc32 Next();
318   template <bool update_position>
319   uc32 ReadNext();
in()320   FlatStringReader* in() { return in_; }
321   void ScanForCaptures();
322 
323   Isolate* isolate_;
324   Zone* zone_;
325   Handle<String>* error_;
326   ZoneList<RegExpCapture*>* captures_;
327   ZoneList<RegExpCapture*>* named_captures_;
328   ZoneList<RegExpBackReference*>* named_back_references_;
329   FlatStringReader* in_;
330   uc32 current_;
331   // These are the flags specified outside the regexp syntax ie after the
332   // terminating '/' or in the second argument to the constructor.  The current
333   // flags are stored on the RegExpBuilder.
334   JSRegExp::Flags top_level_flags_;
335   int next_pos_;
336   int captures_started_;
337   int capture_count_;  // Only valid after we have scanned for captures.
338   bool has_more_;
339   bool simple_;
340   bool contains_anchor_;
341   bool is_scanned_for_captures_;
342   bool has_named_captures_;  // Only valid after we have scanned for captures.
343   bool failed_;
344 };
345 
346 }  // namespace internal
347 }  // namespace v8
348 
349 #endif  // V8_REGEXP_REGEXP_PARSER_H_
350