1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_REGEXP_REGEXP_PARSER_H_
6 #define V8_REGEXP_REGEXP_PARSER_H_
7 
8 #include "src/objects.h"
9 #include "src/regexp/regexp-ast.h"
10 #include "src/zone.h"
11 
12 namespace v8 {
13 namespace internal {
14 
15 struct RegExpCompileData;
16 
17 
18 // A BufferedZoneList is an automatically growing list, just like (and backed
19 // by) a ZoneList, that is optimized for the case of adding and removing
20 // a single element. The last element added is stored outside the backing list,
21 // and if no more than one element is ever added, the ZoneList isn't even
22 // allocated.
23 // Elements must not be NULL pointers.
24 template <typename T, int initial_size>
25 class BufferedZoneList {
26  public:
BufferedZoneList()27   BufferedZoneList() : list_(NULL), last_(NULL) {}
28 
29   // Adds element at end of list. This element is buffered and can
30   // be read using last() or removed using RemoveLast until a new Add or until
31   // RemoveLast or GetList has been called.
Add(T * value,Zone * zone)32   void Add(T* value, Zone* zone) {
33     if (last_ != NULL) {
34       if (list_ == NULL) {
35         list_ = new (zone) ZoneList<T*>(initial_size, zone);
36       }
37       list_->Add(last_, zone);
38     }
39     last_ = value;
40   }
41 
last()42   T* last() {
43     DCHECK(last_ != NULL);
44     return last_;
45   }
46 
RemoveLast()47   T* RemoveLast() {
48     DCHECK(last_ != NULL);
49     T* result = last_;
50     if ((list_ != NULL) && (list_->length() > 0))
51       last_ = list_->RemoveLast();
52     else
53       last_ = NULL;
54     return result;
55   }
56 
Get(int i)57   T* Get(int i) {
58     DCHECK((0 <= i) && (i < length()));
59     if (list_ == NULL) {
60       DCHECK_EQ(0, i);
61       return last_;
62     } else {
63       if (i == list_->length()) {
64         DCHECK(last_ != NULL);
65         return last_;
66       } else {
67         return list_->at(i);
68       }
69     }
70   }
71 
Clear()72   void Clear() {
73     list_ = NULL;
74     last_ = NULL;
75   }
76 
length()77   int length() {
78     int length = (list_ == NULL) ? 0 : list_->length();
79     return length + ((last_ == NULL) ? 0 : 1);
80   }
81 
GetList(Zone * zone)82   ZoneList<T*>* GetList(Zone* zone) {
83     if (list_ == NULL) {
84       list_ = new (zone) ZoneList<T*>(initial_size, zone);
85     }
86     if (last_ != NULL) {
87       list_->Add(last_, zone);
88       last_ = NULL;
89     }
90     return list_;
91   }
92 
93  private:
94   ZoneList<T*>* list_;
95   T* last_;
96 };
97 
98 
99 // Accumulates RegExp atoms and assertions into lists of terms and alternatives.
100 class RegExpBuilder : public ZoneObject {
101  public:
102   explicit RegExpBuilder(Zone* zone);
103   void AddCharacter(uc16 character);
104   void AddUnicodeCharacter(uc32 character);
105   // "Adds" an empty expression. Does nothing except consume a
106   // following quantifier
107   void AddEmpty();
108   void AddAtom(RegExpTree* tree);
109   void AddAssertion(RegExpTree* tree);
110   void NewAlternative();  // '|'
111   void AddQuantifierToAtom(int min, int max,
112                            RegExpQuantifier::QuantifierType type);
113   RegExpTree* ToRegExp();
114 
115  private:
116   void FlushCharacters();
117   void FlushText();
118   void FlushTerms();
zone()119   Zone* zone() const { return zone_; }
120 
121   Zone* zone_;
122   bool pending_empty_;
123   ZoneList<uc16>* characters_;
124   BufferedZoneList<RegExpTree, 2> terms_;
125   BufferedZoneList<RegExpTree, 2> text_;
126   BufferedZoneList<RegExpTree, 2> alternatives_;
127 #ifdef DEBUG
128   enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_;
129 #define LAST(x) last_added_ = x;
130 #else
131 #define LAST(x)
132 #endif
133 };
134 
135 
136 class RegExpParser BASE_EMBEDDED {
137  public:
138   RegExpParser(FlatStringReader* in, Handle<String>* error, bool multiline_mode,
139                bool unicode, Isolate* isolate, Zone* zone);
140 
141   static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input,
142                           bool multiline, bool unicode,
143                           RegExpCompileData* result);
144 
145   RegExpTree* ParsePattern();
146   RegExpTree* ParseDisjunction();
147   RegExpTree* ParseGroup();
148   RegExpTree* ParseCharacterClass();
149 
150   // Parses a {...,...} quantifier and stores the range in the given
151   // out parameters.
152   bool ParseIntervalQuantifier(int* min_out, int* max_out);
153 
154   // Parses and returns a single escaped character.  The character
155   // must not be 'b' or 'B' since they are usually handle specially.
156   uc32 ParseClassCharacterEscape();
157 
158   // Checks whether the following is a length-digit hexadecimal number,
159   // and sets the value if it is.
160   bool ParseHexEscape(int length, uc32* value);
161   bool ParseUnicodeEscape(uc32* value);
162   bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value);
163 
164   uc32 ParseOctalLiteral();
165 
166   // Tries to parse the input as a back reference.  If successful it
167   // stores the result in the output parameter and returns true.  If
168   // it fails it will push back the characters read so the same characters
169   // can be reparsed.
170   bool ParseBackReferenceIndex(int* index_out);
171 
172   CharacterRange ParseClassAtom(uc16* char_class);
173   RegExpTree* ReportError(Vector<const char> message);
174   void Advance();
175   void Advance(int dist);
176   void Reset(int pos);
177 
178   // Reports whether the pattern might be used as a literal search string.
179   // Only use if the result of the parse is a single atom node.
180   bool simple();
contains_anchor()181   bool contains_anchor() { return contains_anchor_; }
set_contains_anchor()182   void set_contains_anchor() { contains_anchor_ = true; }
captures_started()183   int captures_started() { return captures_started_; }
position()184   int position() { return next_pos_ - 1; }
failed()185   bool failed() { return failed_; }
186 
187   static bool IsSyntaxCharacter(uc32 c);
188 
189   static const int kMaxCaptures = 1 << 16;
190   static const uc32 kEndMarker = (1 << 21);
191 
192  private:
193   enum SubexpressionType {
194     INITIAL,
195     CAPTURE,  // All positive values represent captures.
196     POSITIVE_LOOKAROUND,
197     NEGATIVE_LOOKAROUND,
198     GROUPING
199   };
200 
201   class RegExpParserState : public ZoneObject {
202    public:
RegExpParserState(RegExpParserState * previous_state,SubexpressionType group_type,RegExpLookaround::Type lookaround_type,int disjunction_capture_index,Zone * zone)203     RegExpParserState(RegExpParserState* previous_state,
204                       SubexpressionType group_type,
205                       RegExpLookaround::Type lookaround_type,
206                       int disjunction_capture_index, Zone* zone)
207         : previous_state_(previous_state),
208           builder_(new (zone) RegExpBuilder(zone)),
209           group_type_(group_type),
210           lookaround_type_(lookaround_type),
211           disjunction_capture_index_(disjunction_capture_index) {}
212     // Parser state of containing expression, if any.
previous_state()213     RegExpParserState* previous_state() { return previous_state_; }
IsSubexpression()214     bool IsSubexpression() { return previous_state_ != NULL; }
215     // RegExpBuilder building this regexp's AST.
builder()216     RegExpBuilder* builder() { return builder_; }
217     // Type of regexp being parsed (parenthesized group or entire regexp).
group_type()218     SubexpressionType group_type() { return group_type_; }
219     // Lookahead or Lookbehind.
lookaround_type()220     RegExpLookaround::Type lookaround_type() { return lookaround_type_; }
221     // Index in captures array of first capture in this sub-expression, if any.
222     // Also the capture index of this sub-expression itself, if group_type
223     // is CAPTURE.
capture_index()224     int capture_index() { return disjunction_capture_index_; }
225 
226     // Check whether the parser is inside a capture group with the given index.
227     bool IsInsideCaptureGroup(int index);
228 
229    private:
230     // Linked list implementation of stack of states.
231     RegExpParserState* previous_state_;
232     // Builder for the stored disjunction.
233     RegExpBuilder* builder_;
234     // Stored disjunction type (capture, look-ahead or grouping), if any.
235     SubexpressionType group_type_;
236     // Stored read direction.
237     RegExpLookaround::Type lookaround_type_;
238     // Stored disjunction's capture index (if any).
239     int disjunction_capture_index_;
240   };
241 
242   // Return the 1-indexed RegExpCapture object, allocate if necessary.
243   RegExpCapture* GetCapture(int index);
244 
isolate()245   Isolate* isolate() { return isolate_; }
zone()246   Zone* zone() const { return zone_; }
247 
current()248   uc32 current() { return current_; }
has_more()249   bool has_more() { return has_more_; }
has_next()250   bool has_next() { return next_pos_ < in()->length(); }
251   uc32 Next();
in()252   FlatStringReader* in() { return in_; }
253   void ScanForCaptures();
254 
255   Isolate* isolate_;
256   Zone* zone_;
257   Handle<String>* error_;
258   ZoneList<RegExpCapture*>* captures_;
259   FlatStringReader* in_;
260   uc32 current_;
261   int next_pos_;
262   int captures_started_;
263   // The capture count is only valid after we have scanned for captures.
264   int capture_count_;
265   bool has_more_;
266   bool multiline_;
267   bool unicode_;
268   bool simple_;
269   bool contains_anchor_;
270   bool is_scanned_for_captures_;
271   bool failed_;
272 };
273 
274 }  // namespace internal
275 }  // namespace v8
276 
277 #endif  // V8_REGEXP_REGEXP_PARSER_H_
278