1 // Copyright 2016 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_REGEXP_REGEXP_AST_H_ 6 #define V8_REGEXP_REGEXP_AST_H_ 7 8 #include "src/objects.h" 9 #include "src/utils.h" 10 #include "src/zone/zone-containers.h" 11 #include "src/zone/zone.h" 12 13 namespace v8 { 14 namespace internal { 15 16 #define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \ 17 VISIT(Disjunction) \ 18 VISIT(Alternative) \ 19 VISIT(Assertion) \ 20 VISIT(CharacterClass) \ 21 VISIT(Atom) \ 22 VISIT(Quantifier) \ 23 VISIT(Capture) \ 24 VISIT(Lookaround) \ 25 VISIT(BackReference) \ 26 VISIT(Empty) \ 27 VISIT(Text) 28 29 30 #define FORWARD_DECLARE(Name) class RegExp##Name; 31 FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE) 32 #undef FORWARD_DECLARE 33 34 class RegExpCompiler; 35 class RegExpNode; 36 class RegExpTree; 37 38 39 class RegExpVisitor BASE_EMBEDDED { 40 public: ~RegExpVisitor()41 virtual ~RegExpVisitor() {} 42 #define MAKE_CASE(Name) \ 43 virtual void* Visit##Name(RegExp##Name*, void* data) = 0; 44 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE) 45 #undef MAKE_CASE 46 }; 47 48 49 // A simple closed interval. 50 class Interval { 51 public: Interval()52 Interval() : from_(kNone), to_(kNone) {} Interval(int from,int to)53 Interval(int from, int to) : from_(from), to_(to) {} Union(Interval that)54 Interval Union(Interval that) { 55 if (that.from_ == kNone) 56 return *this; 57 else if (from_ == kNone) 58 return that; 59 else 60 return Interval(Min(from_, that.from_), Max(to_, that.to_)); 61 } Contains(int value)62 bool Contains(int value) { return (from_ <= value) && (value <= to_); } is_empty()63 bool is_empty() { return from_ == kNone; } from()64 int from() const { return from_; } to()65 int to() const { return to_; } Empty()66 static Interval Empty() { return Interval(); } 67 static const int kNone = -1; 68 69 private: 70 int from_; 71 int to_; 72 }; 73 74 75 // Represents code units in the range from from_ to to_, both ends are 76 // inclusive. 77 class CharacterRange { 78 public: CharacterRange()79 CharacterRange() : from_(0), to_(0) {} 80 // For compatibility with the CHECK_OK macro CharacterRange(void * null)81 CharacterRange(void* null) { DCHECK_NULL(null); } // NOLINT 82 static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges, 83 Zone* zone); 84 static Vector<const int> GetWordBounds(); Singleton(uc32 value)85 static inline CharacterRange Singleton(uc32 value) { 86 return CharacterRange(value, value); 87 } Range(uc32 from,uc32 to)88 static inline CharacterRange Range(uc32 from, uc32 to) { 89 DCHECK(0 <= from && to <= String::kMaxCodePoint); 90 DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to)); 91 return CharacterRange(from, to); 92 } Everything()93 static inline CharacterRange Everything() { 94 return CharacterRange(0, String::kMaxCodePoint); 95 } List(Zone * zone,CharacterRange range)96 static inline ZoneList<CharacterRange>* List(Zone* zone, 97 CharacterRange range) { 98 ZoneList<CharacterRange>* list = 99 new (zone) ZoneList<CharacterRange>(1, zone); 100 list->Add(range, zone); 101 return list; 102 } Contains(uc32 i)103 bool Contains(uc32 i) { return from_ <= i && i <= to_; } from()104 uc32 from() const { return from_; } set_from(uc32 value)105 void set_from(uc32 value) { from_ = value; } to()106 uc32 to() const { return to_; } set_to(uc32 value)107 void set_to(uc32 value) { to_ = value; } is_valid()108 bool is_valid() { return from_ <= to_; } IsEverything(uc16 max)109 bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; } IsSingleton()110 bool IsSingleton() { return (from_ == to_); } 111 static void AddCaseEquivalents(Isolate* isolate, Zone* zone, 112 ZoneList<CharacterRange>* ranges, 113 bool is_one_byte); 114 // Whether a range list is in canonical form: Ranges ordered by from value, 115 // and ranges non-overlapping and non-adjacent. 116 static bool IsCanonical(ZoneList<CharacterRange>* ranges); 117 // Convert range list to canonical form. The characters covered by the ranges 118 // will still be the same, but no character is in more than one range, and 119 // adjacent ranges are merged. The resulting list may be shorter than the 120 // original, but cannot be longer. 121 static void Canonicalize(ZoneList<CharacterRange>* ranges); 122 // Negate the contents of a character range in canonical form. 123 static void Negate(ZoneList<CharacterRange>* src, 124 ZoneList<CharacterRange>* dst, Zone* zone); 125 static const int kStartMarker = (1 << 24); 126 static const int kPayloadMask = (1 << 24) - 1; 127 128 private: CharacterRange(uc32 from,uc32 to)129 CharacterRange(uc32 from, uc32 to) : from_(from), to_(to) {} 130 131 uc32 from_; 132 uc32 to_; 133 }; 134 135 136 class CharacterSet final BASE_EMBEDDED { 137 public: CharacterSet(uc16 standard_set_type)138 explicit CharacterSet(uc16 standard_set_type) 139 : ranges_(NULL), standard_set_type_(standard_set_type) {} CharacterSet(ZoneList<CharacterRange> * ranges)140 explicit CharacterSet(ZoneList<CharacterRange>* ranges) 141 : ranges_(ranges), standard_set_type_(0) {} 142 ZoneList<CharacterRange>* ranges(Zone* zone); standard_set_type()143 uc16 standard_set_type() { return standard_set_type_; } set_standard_set_type(uc16 special_set_type)144 void set_standard_set_type(uc16 special_set_type) { 145 standard_set_type_ = special_set_type; 146 } is_standard()147 bool is_standard() { return standard_set_type_ != 0; } 148 void Canonicalize(); 149 150 private: 151 ZoneList<CharacterRange>* ranges_; 152 // If non-zero, the value represents a standard set (e.g., all whitespace 153 // characters) without having to expand the ranges. 154 uc16 standard_set_type_; 155 }; 156 157 158 class TextElement final BASE_EMBEDDED { 159 public: 160 enum TextType { ATOM, CHAR_CLASS }; 161 162 static TextElement Atom(RegExpAtom* atom); 163 static TextElement CharClass(RegExpCharacterClass* char_class); 164 cp_offset()165 int cp_offset() const { return cp_offset_; } set_cp_offset(int cp_offset)166 void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; } 167 int length() const; 168 text_type()169 TextType text_type() const { return text_type_; } 170 tree()171 RegExpTree* tree() const { return tree_; } 172 atom()173 RegExpAtom* atom() const { 174 DCHECK(text_type() == ATOM); 175 return reinterpret_cast<RegExpAtom*>(tree()); 176 } 177 char_class()178 RegExpCharacterClass* char_class() const { 179 DCHECK(text_type() == CHAR_CLASS); 180 return reinterpret_cast<RegExpCharacterClass*>(tree()); 181 } 182 183 private: TextElement(TextType text_type,RegExpTree * tree)184 TextElement(TextType text_type, RegExpTree* tree) 185 : cp_offset_(-1), text_type_(text_type), tree_(tree) {} 186 187 int cp_offset_; 188 TextType text_type_; 189 RegExpTree* tree_; 190 }; 191 192 193 class RegExpTree : public ZoneObject { 194 public: 195 static const int kInfinity = kMaxInt; ~RegExpTree()196 virtual ~RegExpTree() {} 197 virtual void* Accept(RegExpVisitor* visitor, void* data) = 0; 198 virtual RegExpNode* ToNode(RegExpCompiler* compiler, 199 RegExpNode* on_success) = 0; IsTextElement()200 virtual bool IsTextElement() { return false; } IsAnchoredAtStart()201 virtual bool IsAnchoredAtStart() { return false; } IsAnchoredAtEnd()202 virtual bool IsAnchoredAtEnd() { return false; } 203 virtual int min_match() = 0; 204 virtual int max_match() = 0; 205 // Returns the interval of registers used for captures within this 206 // expression. CaptureRegisters()207 virtual Interval CaptureRegisters() { return Interval::Empty(); } 208 virtual void AppendToText(RegExpText* text, Zone* zone); 209 std::ostream& Print(std::ostream& os, Zone* zone); // NOLINT 210 #define MAKE_ASTYPE(Name) \ 211 virtual RegExp##Name* As##Name(); \ 212 virtual bool Is##Name(); 213 FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE) 214 #undef MAKE_ASTYPE 215 }; 216 217 218 class RegExpDisjunction final : public RegExpTree { 219 public: 220 explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives); 221 void* Accept(RegExpVisitor* visitor, void* data) override; 222 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 223 RegExpDisjunction* AsDisjunction() override; 224 Interval CaptureRegisters() override; 225 bool IsDisjunction() override; 226 bool IsAnchoredAtStart() override; 227 bool IsAnchoredAtEnd() override; min_match()228 int min_match() override { return min_match_; } max_match()229 int max_match() override { return max_match_; } alternatives()230 ZoneList<RegExpTree*>* alternatives() { return alternatives_; } 231 232 private: 233 bool SortConsecutiveAtoms(RegExpCompiler* compiler); 234 void RationalizeConsecutiveAtoms(RegExpCompiler* compiler); 235 void FixSingleCharacterDisjunctions(RegExpCompiler* compiler); 236 ZoneList<RegExpTree*>* alternatives_; 237 int min_match_; 238 int max_match_; 239 }; 240 241 242 class RegExpAlternative final : public RegExpTree { 243 public: 244 explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes); 245 void* Accept(RegExpVisitor* visitor, void* data) override; 246 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 247 RegExpAlternative* AsAlternative() override; 248 Interval CaptureRegisters() override; 249 bool IsAlternative() override; 250 bool IsAnchoredAtStart() override; 251 bool IsAnchoredAtEnd() override; min_match()252 int min_match() override { return min_match_; } max_match()253 int max_match() override { return max_match_; } nodes()254 ZoneList<RegExpTree*>* nodes() { return nodes_; } 255 256 private: 257 ZoneList<RegExpTree*>* nodes_; 258 int min_match_; 259 int max_match_; 260 }; 261 262 263 class RegExpAssertion final : public RegExpTree { 264 public: 265 enum AssertionType { 266 START_OF_LINE, 267 START_OF_INPUT, 268 END_OF_LINE, 269 END_OF_INPUT, 270 BOUNDARY, 271 NON_BOUNDARY 272 }; RegExpAssertion(AssertionType type)273 explicit RegExpAssertion(AssertionType type) : assertion_type_(type) {} 274 void* Accept(RegExpVisitor* visitor, void* data) override; 275 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 276 RegExpAssertion* AsAssertion() override; 277 bool IsAssertion() override; 278 bool IsAnchoredAtStart() override; 279 bool IsAnchoredAtEnd() override; min_match()280 int min_match() override { return 0; } max_match()281 int max_match() override { return 0; } assertion_type()282 AssertionType assertion_type() { return assertion_type_; } 283 284 private: 285 AssertionType assertion_type_; 286 }; 287 288 289 class RegExpCharacterClass final : public RegExpTree { 290 public: RegExpCharacterClass(ZoneList<CharacterRange> * ranges,bool is_negated)291 RegExpCharacterClass(ZoneList<CharacterRange>* ranges, bool is_negated) 292 : set_(ranges), is_negated_(is_negated) {} RegExpCharacterClass(uc16 type)293 explicit RegExpCharacterClass(uc16 type) : set_(type), is_negated_(false) {} 294 void* Accept(RegExpVisitor* visitor, void* data) override; 295 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 296 RegExpCharacterClass* AsCharacterClass() override; 297 bool IsCharacterClass() override; IsTextElement()298 bool IsTextElement() override { return true; } min_match()299 int min_match() override { return 1; } 300 // The character class may match two code units for unicode regexps. 301 // TODO(yangguo): we should split this class for usage in TextElement, and 302 // make max_match() dependent on the character class content. max_match()303 int max_match() override { return 2; } 304 void AppendToText(RegExpText* text, Zone* zone) override; character_set()305 CharacterSet character_set() { return set_; } 306 // TODO(lrn): Remove need for complex version if is_standard that 307 // recognizes a mangled standard set and just do { return set_.is_special(); } 308 bool is_standard(Zone* zone); 309 // Returns a value representing the standard character set if is_standard() 310 // returns true. 311 // Currently used values are: 312 // s : unicode whitespace 313 // S : unicode non-whitespace 314 // w : ASCII word character (digit, letter, underscore) 315 // W : non-ASCII word character 316 // d : ASCII digit 317 // D : non-ASCII digit 318 // . : non-newline 319 // * : All characters, for advancing unanchored regexp standard_type()320 uc16 standard_type() { return set_.standard_set_type(); } ranges(Zone * zone)321 ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); } is_negated()322 bool is_negated() { return is_negated_; } 323 324 private: 325 CharacterSet set_; 326 bool is_negated_; 327 }; 328 329 330 class RegExpAtom final : public RegExpTree { 331 public: RegExpAtom(Vector<const uc16> data)332 explicit RegExpAtom(Vector<const uc16> data) : data_(data) {} 333 void* Accept(RegExpVisitor* visitor, void* data) override; 334 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 335 RegExpAtom* AsAtom() override; 336 bool IsAtom() override; IsTextElement()337 bool IsTextElement() override { return true; } min_match()338 int min_match() override { return data_.length(); } max_match()339 int max_match() override { return data_.length(); } 340 void AppendToText(RegExpText* text, Zone* zone) override; data()341 Vector<const uc16> data() { return data_; } length()342 int length() { return data_.length(); } 343 344 private: 345 Vector<const uc16> data_; 346 }; 347 348 349 class RegExpText final : public RegExpTree { 350 public: RegExpText(Zone * zone)351 explicit RegExpText(Zone* zone) : elements_(2, zone), length_(0) {} 352 void* Accept(RegExpVisitor* visitor, void* data) override; 353 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 354 RegExpText* AsText() override; 355 bool IsText() override; IsTextElement()356 bool IsTextElement() override { return true; } min_match()357 int min_match() override { return length_; } max_match()358 int max_match() override { return length_; } 359 void AppendToText(RegExpText* text, Zone* zone) override; AddElement(TextElement elm,Zone * zone)360 void AddElement(TextElement elm, Zone* zone) { 361 elements_.Add(elm, zone); 362 length_ += elm.length(); 363 } elements()364 ZoneList<TextElement>* elements() { return &elements_; } 365 366 private: 367 ZoneList<TextElement> elements_; 368 int length_; 369 }; 370 371 372 class RegExpQuantifier final : public RegExpTree { 373 public: 374 enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE }; RegExpQuantifier(int min,int max,QuantifierType type,RegExpTree * body)375 RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body) 376 : body_(body), 377 min_(min), 378 max_(max), 379 min_match_(min * body->min_match()), 380 quantifier_type_(type) { 381 if (max > 0 && body->max_match() > kInfinity / max) { 382 max_match_ = kInfinity; 383 } else { 384 max_match_ = max * body->max_match(); 385 } 386 } 387 void* Accept(RegExpVisitor* visitor, void* data) override; 388 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 389 static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body, 390 RegExpCompiler* compiler, RegExpNode* on_success, 391 bool not_at_start = false); 392 RegExpQuantifier* AsQuantifier() override; 393 Interval CaptureRegisters() override; 394 bool IsQuantifier() override; min_match()395 int min_match() override { return min_match_; } max_match()396 int max_match() override { return max_match_; } min()397 int min() { return min_; } max()398 int max() { return max_; } is_possessive()399 bool is_possessive() { return quantifier_type_ == POSSESSIVE; } is_non_greedy()400 bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; } is_greedy()401 bool is_greedy() { return quantifier_type_ == GREEDY; } body()402 RegExpTree* body() { return body_; } 403 404 private: 405 RegExpTree* body_; 406 int min_; 407 int max_; 408 int min_match_; 409 int max_match_; 410 QuantifierType quantifier_type_; 411 }; 412 413 414 class RegExpCapture final : public RegExpTree { 415 public: RegExpCapture(int index)416 explicit RegExpCapture(int index) 417 : body_(NULL), index_(index), name_(nullptr) {} 418 void* Accept(RegExpVisitor* visitor, void* data) override; 419 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 420 static RegExpNode* ToNode(RegExpTree* body, int index, 421 RegExpCompiler* compiler, RegExpNode* on_success); 422 RegExpCapture* AsCapture() override; 423 bool IsAnchoredAtStart() override; 424 bool IsAnchoredAtEnd() override; 425 Interval CaptureRegisters() override; 426 bool IsCapture() override; min_match()427 int min_match() override { return body_->min_match(); } max_match()428 int max_match() override { return body_->max_match(); } body()429 RegExpTree* body() { return body_; } set_body(RegExpTree * body)430 void set_body(RegExpTree* body) { body_ = body; } index()431 int index() { return index_; } name()432 const ZoneVector<uc16>* name() const { return name_; } set_name(const ZoneVector<uc16> * name)433 void set_name(const ZoneVector<uc16>* name) { name_ = name; } StartRegister(int index)434 static int StartRegister(int index) { return index * 2; } EndRegister(int index)435 static int EndRegister(int index) { return index * 2 + 1; } 436 437 private: 438 RegExpTree* body_; 439 int index_; 440 const ZoneVector<uc16>* name_; 441 }; 442 443 444 class RegExpLookaround final : public RegExpTree { 445 public: 446 enum Type { LOOKAHEAD, LOOKBEHIND }; 447 RegExpLookaround(RegExpTree * body,bool is_positive,int capture_count,int capture_from,Type type)448 RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count, 449 int capture_from, Type type) 450 : body_(body), 451 is_positive_(is_positive), 452 capture_count_(capture_count), 453 capture_from_(capture_from), 454 type_(type) {} 455 456 void* Accept(RegExpVisitor* visitor, void* data) override; 457 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 458 RegExpLookaround* AsLookaround() override; 459 Interval CaptureRegisters() override; 460 bool IsLookaround() override; 461 bool IsAnchoredAtStart() override; min_match()462 int min_match() override { return 0; } max_match()463 int max_match() override { return 0; } body()464 RegExpTree* body() { return body_; } is_positive()465 bool is_positive() { return is_positive_; } capture_count()466 int capture_count() { return capture_count_; } capture_from()467 int capture_from() { return capture_from_; } type()468 Type type() { return type_; } 469 470 class Builder { 471 public: 472 Builder(bool is_positive, RegExpNode* on_success, 473 int stack_pointer_register, int position_register, 474 int capture_register_count = 0, int capture_register_start = 0); on_match_success()475 RegExpNode* on_match_success() { return on_match_success_; } 476 RegExpNode* ForMatch(RegExpNode* match); 477 478 private: 479 bool is_positive_; 480 RegExpNode* on_match_success_; 481 RegExpNode* on_success_; 482 int stack_pointer_register_; 483 int position_register_; 484 }; 485 486 private: 487 RegExpTree* body_; 488 bool is_positive_; 489 int capture_count_; 490 int capture_from_; 491 Type type_; 492 }; 493 494 495 class RegExpBackReference final : public RegExpTree { 496 public: RegExpBackReference()497 RegExpBackReference() : capture_(nullptr), name_(nullptr) {} RegExpBackReference(RegExpCapture * capture)498 explicit RegExpBackReference(RegExpCapture* capture) 499 : capture_(capture), name_(nullptr) {} 500 void* Accept(RegExpVisitor* visitor, void* data) override; 501 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 502 RegExpBackReference* AsBackReference() override; 503 bool IsBackReference() override; min_match()504 int min_match() override { return 0; } 505 // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite 506 // recursion, we give up. Ignorance is bliss. max_match()507 int max_match() override { return kInfinity; } index()508 int index() { return capture_->index(); } capture()509 RegExpCapture* capture() { return capture_; } set_capture(RegExpCapture * capture)510 void set_capture(RegExpCapture* capture) { capture_ = capture; } name()511 const ZoneVector<uc16>* name() const { return name_; } set_name(const ZoneVector<uc16> * name)512 void set_name(const ZoneVector<uc16>* name) { name_ = name; } 513 514 private: 515 RegExpCapture* capture_; 516 const ZoneVector<uc16>* name_; 517 }; 518 519 520 class RegExpEmpty final : public RegExpTree { 521 public: RegExpEmpty()522 RegExpEmpty() {} 523 void* Accept(RegExpVisitor* visitor, void* data) override; 524 RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override; 525 RegExpEmpty* AsEmpty() override; 526 bool IsEmpty() override; min_match()527 int min_match() override { return 0; } max_match()528 int max_match() override { return 0; } 529 }; 530 531 } // namespace internal 532 } // namespace v8 533 534 #endif // V8_REGEXP_REGEXP_AST_H_ 535