1 // Copyright 2016 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_REGEXP_REGEXP_PARSER_H_ 6 #define V8_REGEXP_REGEXP_PARSER_H_ 7 8 #include "src/objects.h" 9 #include "src/regexp/regexp-ast.h" 10 #include "src/zone.h" 11 12 namespace v8 { 13 namespace internal { 14 15 struct RegExpCompileData; 16 17 18 // A BufferedZoneList is an automatically growing list, just like (and backed 19 // by) a ZoneList, that is optimized for the case of adding and removing 20 // a single element. The last element added is stored outside the backing list, 21 // and if no more than one element is ever added, the ZoneList isn't even 22 // allocated. 23 // Elements must not be NULL pointers. 24 template <typename T, int initial_size> 25 class BufferedZoneList { 26 public: BufferedZoneList()27 BufferedZoneList() : list_(NULL), last_(NULL) {} 28 29 // Adds element at end of list. This element is buffered and can 30 // be read using last() or removed using RemoveLast until a new Add or until 31 // RemoveLast or GetList has been called. Add(T * value,Zone * zone)32 void Add(T* value, Zone* zone) { 33 if (last_ != NULL) { 34 if (list_ == NULL) { 35 list_ = new (zone) ZoneList<T*>(initial_size, zone); 36 } 37 list_->Add(last_, zone); 38 } 39 last_ = value; 40 } 41 last()42 T* last() { 43 DCHECK(last_ != NULL); 44 return last_; 45 } 46 RemoveLast()47 T* RemoveLast() { 48 DCHECK(last_ != NULL); 49 T* result = last_; 50 if ((list_ != NULL) && (list_->length() > 0)) 51 last_ = list_->RemoveLast(); 52 else 53 last_ = NULL; 54 return result; 55 } 56 Get(int i)57 T* Get(int i) { 58 DCHECK((0 <= i) && (i < length())); 59 if (list_ == NULL) { 60 DCHECK_EQ(0, i); 61 return last_; 62 } else { 63 if (i == list_->length()) { 64 DCHECK(last_ != NULL); 65 return last_; 66 } else { 67 return list_->at(i); 68 } 69 } 70 } 71 Clear()72 void Clear() { 73 list_ = NULL; 74 last_ = NULL; 75 } 76 length()77 int length() { 78 int length = (list_ == NULL) ? 0 : list_->length(); 79 return length + ((last_ == NULL) ? 0 : 1); 80 } 81 GetList(Zone * zone)82 ZoneList<T*>* GetList(Zone* zone) { 83 if (list_ == NULL) { 84 list_ = new (zone) ZoneList<T*>(initial_size, zone); 85 } 86 if (last_ != NULL) { 87 list_->Add(last_, zone); 88 last_ = NULL; 89 } 90 return list_; 91 } 92 93 private: 94 ZoneList<T*>* list_; 95 T* last_; 96 }; 97 98 99 // Accumulates RegExp atoms and assertions into lists of terms and alternatives. 100 class RegExpBuilder : public ZoneObject { 101 public: 102 explicit RegExpBuilder(Zone* zone); 103 void AddCharacter(uc16 character); 104 void AddUnicodeCharacter(uc32 character); 105 // "Adds" an empty expression. Does nothing except consume a 106 // following quantifier 107 void AddEmpty(); 108 void AddAtom(RegExpTree* tree); 109 void AddAssertion(RegExpTree* tree); 110 void NewAlternative(); // '|' 111 void AddQuantifierToAtom(int min, int max, 112 RegExpQuantifier::QuantifierType type); 113 RegExpTree* ToRegExp(); 114 115 private: 116 void FlushCharacters(); 117 void FlushText(); 118 void FlushTerms(); zone()119 Zone* zone() const { return zone_; } 120 121 Zone* zone_; 122 bool pending_empty_; 123 ZoneList<uc16>* characters_; 124 BufferedZoneList<RegExpTree, 2> terms_; 125 BufferedZoneList<RegExpTree, 2> text_; 126 BufferedZoneList<RegExpTree, 2> alternatives_; 127 #ifdef DEBUG 128 enum { ADD_NONE, ADD_CHAR, ADD_TERM, ADD_ASSERT, ADD_ATOM } last_added_; 129 #define LAST(x) last_added_ = x; 130 #else 131 #define LAST(x) 132 #endif 133 }; 134 135 136 class RegExpParser BASE_EMBEDDED { 137 public: 138 RegExpParser(FlatStringReader* in, Handle<String>* error, bool multiline_mode, 139 bool unicode, Isolate* isolate, Zone* zone); 140 141 static bool ParseRegExp(Isolate* isolate, Zone* zone, FlatStringReader* input, 142 bool multiline, bool unicode, 143 RegExpCompileData* result); 144 145 RegExpTree* ParsePattern(); 146 RegExpTree* ParseDisjunction(); 147 RegExpTree* ParseGroup(); 148 RegExpTree* ParseCharacterClass(); 149 150 // Parses a {...,...} quantifier and stores the range in the given 151 // out parameters. 152 bool ParseIntervalQuantifier(int* min_out, int* max_out); 153 154 // Parses and returns a single escaped character. The character 155 // must not be 'b' or 'B' since they are usually handle specially. 156 uc32 ParseClassCharacterEscape(); 157 158 // Checks whether the following is a length-digit hexadecimal number, 159 // and sets the value if it is. 160 bool ParseHexEscape(int length, uc32* value); 161 bool ParseUnicodeEscape(uc32* value); 162 bool ParseUnlimitedLengthHexNumber(int max_value, uc32* value); 163 164 uc32 ParseOctalLiteral(); 165 166 // Tries to parse the input as a back reference. If successful it 167 // stores the result in the output parameter and returns true. If 168 // it fails it will push back the characters read so the same characters 169 // can be reparsed. 170 bool ParseBackReferenceIndex(int* index_out); 171 172 CharacterRange ParseClassAtom(uc16* char_class); 173 RegExpTree* ReportError(Vector<const char> message); 174 void Advance(); 175 void Advance(int dist); 176 void Reset(int pos); 177 178 // Reports whether the pattern might be used as a literal search string. 179 // Only use if the result of the parse is a single atom node. 180 bool simple(); contains_anchor()181 bool contains_anchor() { return contains_anchor_; } set_contains_anchor()182 void set_contains_anchor() { contains_anchor_ = true; } captures_started()183 int captures_started() { return captures_started_; } position()184 int position() { return next_pos_ - 1; } failed()185 bool failed() { return failed_; } 186 187 static bool IsSyntaxCharacter(uc32 c); 188 189 static const int kMaxCaptures = 1 << 16; 190 static const uc32 kEndMarker = (1 << 21); 191 192 private: 193 enum SubexpressionType { 194 INITIAL, 195 CAPTURE, // All positive values represent captures. 196 POSITIVE_LOOKAROUND, 197 NEGATIVE_LOOKAROUND, 198 GROUPING 199 }; 200 201 class RegExpParserState : public ZoneObject { 202 public: RegExpParserState(RegExpParserState * previous_state,SubexpressionType group_type,RegExpLookaround::Type lookaround_type,int disjunction_capture_index,Zone * zone)203 RegExpParserState(RegExpParserState* previous_state, 204 SubexpressionType group_type, 205 RegExpLookaround::Type lookaround_type, 206 int disjunction_capture_index, Zone* zone) 207 : previous_state_(previous_state), 208 builder_(new (zone) RegExpBuilder(zone)), 209 group_type_(group_type), 210 lookaround_type_(lookaround_type), 211 disjunction_capture_index_(disjunction_capture_index) {} 212 // Parser state of containing expression, if any. previous_state()213 RegExpParserState* previous_state() { return previous_state_; } IsSubexpression()214 bool IsSubexpression() { return previous_state_ != NULL; } 215 // RegExpBuilder building this regexp's AST. builder()216 RegExpBuilder* builder() { return builder_; } 217 // Type of regexp being parsed (parenthesized group or entire regexp). group_type()218 SubexpressionType group_type() { return group_type_; } 219 // Lookahead or Lookbehind. lookaround_type()220 RegExpLookaround::Type lookaround_type() { return lookaround_type_; } 221 // Index in captures array of first capture in this sub-expression, if any. 222 // Also the capture index of this sub-expression itself, if group_type 223 // is CAPTURE. capture_index()224 int capture_index() { return disjunction_capture_index_; } 225 226 // Check whether the parser is inside a capture group with the given index. 227 bool IsInsideCaptureGroup(int index); 228 229 private: 230 // Linked list implementation of stack of states. 231 RegExpParserState* previous_state_; 232 // Builder for the stored disjunction. 233 RegExpBuilder* builder_; 234 // Stored disjunction type (capture, look-ahead or grouping), if any. 235 SubexpressionType group_type_; 236 // Stored read direction. 237 RegExpLookaround::Type lookaround_type_; 238 // Stored disjunction's capture index (if any). 239 int disjunction_capture_index_; 240 }; 241 242 // Return the 1-indexed RegExpCapture object, allocate if necessary. 243 RegExpCapture* GetCapture(int index); 244 isolate()245 Isolate* isolate() { return isolate_; } zone()246 Zone* zone() const { return zone_; } 247 current()248 uc32 current() { return current_; } has_more()249 bool has_more() { return has_more_; } has_next()250 bool has_next() { return next_pos_ < in()->length(); } 251 uc32 Next(); in()252 FlatStringReader* in() { return in_; } 253 void ScanForCaptures(); 254 255 Isolate* isolate_; 256 Zone* zone_; 257 Handle<String>* error_; 258 ZoneList<RegExpCapture*>* captures_; 259 FlatStringReader* in_; 260 uc32 current_; 261 int next_pos_; 262 int captures_started_; 263 // The capture count is only valid after we have scanned for captures. 264 int capture_count_; 265 bool has_more_; 266 bool multiline_; 267 bool unicode_; 268 bool simple_; 269 bool contains_anchor_; 270 bool is_scanned_for_captures_; 271 bool failed_; 272 }; 273 274 } // namespace internal 275 } // namespace v8 276 277 #endif // V8_REGEXP_REGEXP_PARSER_H_ 278