1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_REGEXP_REGEXP_AST_H_
6 #define V8_REGEXP_REGEXP_AST_H_
7 
8 #include "src/objects.h"
9 #include "src/utils.h"
10 #include "src/zone/zone-containers.h"
11 #include "src/zone/zone.h"
12 
13 namespace v8 {
14 namespace internal {
15 
16 #define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \
17   VISIT(Disjunction)                      \
18   VISIT(Alternative)                      \
19   VISIT(Assertion)                        \
20   VISIT(CharacterClass)                   \
21   VISIT(Atom)                             \
22   VISIT(Quantifier)                       \
23   VISIT(Capture)                          \
24   VISIT(Lookaround)                       \
25   VISIT(BackReference)                    \
26   VISIT(Empty)                            \
27   VISIT(Text)
28 
29 
30 #define FORWARD_DECLARE(Name) class RegExp##Name;
31 FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE)
32 #undef FORWARD_DECLARE
33 
34 class RegExpCompiler;
35 class RegExpNode;
36 class RegExpTree;
37 
38 
39 class RegExpVisitor BASE_EMBEDDED {
40  public:
~RegExpVisitor()41   virtual ~RegExpVisitor() {}
42 #define MAKE_CASE(Name) \
43   virtual void* Visit##Name(RegExp##Name*, void* data) = 0;
44   FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
45 #undef MAKE_CASE
46 };
47 
48 
49 // A simple closed interval.
50 class Interval {
51  public:
Interval()52   Interval() : from_(kNone), to_(kNone) {}
Interval(int from,int to)53   Interval(int from, int to) : from_(from), to_(to) {}
Union(Interval that)54   Interval Union(Interval that) {
55     if (that.from_ == kNone)
56       return *this;
57     else if (from_ == kNone)
58       return that;
59     else
60       return Interval(Min(from_, that.from_), Max(to_, that.to_));
61   }
Contains(int value)62   bool Contains(int value) { return (from_ <= value) && (value <= to_); }
is_empty()63   bool is_empty() { return from_ == kNone; }
from()64   int from() const { return from_; }
to()65   int to() const { return to_; }
Empty()66   static Interval Empty() { return Interval(); }
67   static const int kNone = -1;
68 
69  private:
70   int from_;
71   int to_;
72 };
73 
74 
75 // Represents code units in the range from from_ to to_, both ends are
76 // inclusive.
77 class CharacterRange {
78  public:
CharacterRange()79   CharacterRange() : from_(0), to_(0) {}
80   // For compatibility with the CHECK_OK macro
CharacterRange(void * null)81   CharacterRange(void* null) { DCHECK_NULL(null); }  // NOLINT
82   static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
83                              Zone* zone);
84   static Vector<const int> GetWordBounds();
Singleton(uc32 value)85   static inline CharacterRange Singleton(uc32 value) {
86     return CharacterRange(value, value);
87   }
Range(uc32 from,uc32 to)88   static inline CharacterRange Range(uc32 from, uc32 to) {
89     DCHECK(0 <= from && to <= String::kMaxCodePoint);
90     DCHECK(static_cast<uint32_t>(from) <= static_cast<uint32_t>(to));
91     return CharacterRange(from, to);
92   }
Everything()93   static inline CharacterRange Everything() {
94     return CharacterRange(0, String::kMaxCodePoint);
95   }
List(Zone * zone,CharacterRange range)96   static inline ZoneList<CharacterRange>* List(Zone* zone,
97                                                CharacterRange range) {
98     ZoneList<CharacterRange>* list =
99         new (zone) ZoneList<CharacterRange>(1, zone);
100     list->Add(range, zone);
101     return list;
102   }
Contains(uc32 i)103   bool Contains(uc32 i) { return from_ <= i && i <= to_; }
from()104   uc32 from() const { return from_; }
set_from(uc32 value)105   void set_from(uc32 value) { from_ = value; }
to()106   uc32 to() const { return to_; }
set_to(uc32 value)107   void set_to(uc32 value) { to_ = value; }
is_valid()108   bool is_valid() { return from_ <= to_; }
IsEverything(uc16 max)109   bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; }
IsSingleton()110   bool IsSingleton() { return (from_ == to_); }
111   static void AddCaseEquivalents(Isolate* isolate, Zone* zone,
112                                  ZoneList<CharacterRange>* ranges,
113                                  bool is_one_byte);
114   // Whether a range list is in canonical form: Ranges ordered by from value,
115   // and ranges non-overlapping and non-adjacent.
116   static bool IsCanonical(ZoneList<CharacterRange>* ranges);
117   // Convert range list to canonical form. The characters covered by the ranges
118   // will still be the same, but no character is in more than one range, and
119   // adjacent ranges are merged. The resulting list may be shorter than the
120   // original, but cannot be longer.
121   static void Canonicalize(ZoneList<CharacterRange>* ranges);
122   // Negate the contents of a character range in canonical form.
123   static void Negate(ZoneList<CharacterRange>* src,
124                      ZoneList<CharacterRange>* dst, Zone* zone);
125   static const int kStartMarker = (1 << 24);
126   static const int kPayloadMask = (1 << 24) - 1;
127 
128  private:
CharacterRange(uc32 from,uc32 to)129   CharacterRange(uc32 from, uc32 to) : from_(from), to_(to) {}
130 
131   uc32 from_;
132   uc32 to_;
133 };
134 
135 
136 class CharacterSet final BASE_EMBEDDED {
137  public:
CharacterSet(uc16 standard_set_type)138   explicit CharacterSet(uc16 standard_set_type)
139       : ranges_(NULL), standard_set_type_(standard_set_type) {}
CharacterSet(ZoneList<CharacterRange> * ranges)140   explicit CharacterSet(ZoneList<CharacterRange>* ranges)
141       : ranges_(ranges), standard_set_type_(0) {}
142   ZoneList<CharacterRange>* ranges(Zone* zone);
standard_set_type()143   uc16 standard_set_type() { return standard_set_type_; }
set_standard_set_type(uc16 special_set_type)144   void set_standard_set_type(uc16 special_set_type) {
145     standard_set_type_ = special_set_type;
146   }
is_standard()147   bool is_standard() { return standard_set_type_ != 0; }
148   void Canonicalize();
149 
150  private:
151   ZoneList<CharacterRange>* ranges_;
152   // If non-zero, the value represents a standard set (e.g., all whitespace
153   // characters) without having to expand the ranges.
154   uc16 standard_set_type_;
155 };
156 
157 
158 class TextElement final BASE_EMBEDDED {
159  public:
160   enum TextType { ATOM, CHAR_CLASS };
161 
162   static TextElement Atom(RegExpAtom* atom);
163   static TextElement CharClass(RegExpCharacterClass* char_class);
164 
cp_offset()165   int cp_offset() const { return cp_offset_; }
set_cp_offset(int cp_offset)166   void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
167   int length() const;
168 
text_type()169   TextType text_type() const { return text_type_; }
170 
tree()171   RegExpTree* tree() const { return tree_; }
172 
atom()173   RegExpAtom* atom() const {
174     DCHECK(text_type() == ATOM);
175     return reinterpret_cast<RegExpAtom*>(tree());
176   }
177 
char_class()178   RegExpCharacterClass* char_class() const {
179     DCHECK(text_type() == CHAR_CLASS);
180     return reinterpret_cast<RegExpCharacterClass*>(tree());
181   }
182 
183  private:
TextElement(TextType text_type,RegExpTree * tree)184   TextElement(TextType text_type, RegExpTree* tree)
185       : cp_offset_(-1), text_type_(text_type), tree_(tree) {}
186 
187   int cp_offset_;
188   TextType text_type_;
189   RegExpTree* tree_;
190 };
191 
192 
193 class RegExpTree : public ZoneObject {
194  public:
195   static const int kInfinity = kMaxInt;
~RegExpTree()196   virtual ~RegExpTree() {}
197   virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
198   virtual RegExpNode* ToNode(RegExpCompiler* compiler,
199                              RegExpNode* on_success) = 0;
IsTextElement()200   virtual bool IsTextElement() { return false; }
IsAnchoredAtStart()201   virtual bool IsAnchoredAtStart() { return false; }
IsAnchoredAtEnd()202   virtual bool IsAnchoredAtEnd() { return false; }
203   virtual int min_match() = 0;
204   virtual int max_match() = 0;
205   // Returns the interval of registers used for captures within this
206   // expression.
CaptureRegisters()207   virtual Interval CaptureRegisters() { return Interval::Empty(); }
208   virtual void AppendToText(RegExpText* text, Zone* zone);
209   std::ostream& Print(std::ostream& os, Zone* zone);  // NOLINT
210 #define MAKE_ASTYPE(Name)           \
211   virtual RegExp##Name* As##Name(); \
212   virtual bool Is##Name();
213   FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
214 #undef MAKE_ASTYPE
215 };
216 
217 
218 class RegExpDisjunction final : public RegExpTree {
219  public:
220   explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives);
221   void* Accept(RegExpVisitor* visitor, void* data) override;
222   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
223   RegExpDisjunction* AsDisjunction() override;
224   Interval CaptureRegisters() override;
225   bool IsDisjunction() override;
226   bool IsAnchoredAtStart() override;
227   bool IsAnchoredAtEnd() override;
min_match()228   int min_match() override { return min_match_; }
max_match()229   int max_match() override { return max_match_; }
alternatives()230   ZoneList<RegExpTree*>* alternatives() { return alternatives_; }
231 
232  private:
233   bool SortConsecutiveAtoms(RegExpCompiler* compiler);
234   void RationalizeConsecutiveAtoms(RegExpCompiler* compiler);
235   void FixSingleCharacterDisjunctions(RegExpCompiler* compiler);
236   ZoneList<RegExpTree*>* alternatives_;
237   int min_match_;
238   int max_match_;
239 };
240 
241 
242 class RegExpAlternative final : public RegExpTree {
243  public:
244   explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes);
245   void* Accept(RegExpVisitor* visitor, void* data) override;
246   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
247   RegExpAlternative* AsAlternative() override;
248   Interval CaptureRegisters() override;
249   bool IsAlternative() override;
250   bool IsAnchoredAtStart() override;
251   bool IsAnchoredAtEnd() override;
min_match()252   int min_match() override { return min_match_; }
max_match()253   int max_match() override { return max_match_; }
nodes()254   ZoneList<RegExpTree*>* nodes() { return nodes_; }
255 
256  private:
257   ZoneList<RegExpTree*>* nodes_;
258   int min_match_;
259   int max_match_;
260 };
261 
262 
263 class RegExpAssertion final : public RegExpTree {
264  public:
265   enum AssertionType {
266     START_OF_LINE,
267     START_OF_INPUT,
268     END_OF_LINE,
269     END_OF_INPUT,
270     BOUNDARY,
271     NON_BOUNDARY
272   };
RegExpAssertion(AssertionType type)273   explicit RegExpAssertion(AssertionType type) : assertion_type_(type) {}
274   void* Accept(RegExpVisitor* visitor, void* data) override;
275   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
276   RegExpAssertion* AsAssertion() override;
277   bool IsAssertion() override;
278   bool IsAnchoredAtStart() override;
279   bool IsAnchoredAtEnd() override;
min_match()280   int min_match() override { return 0; }
max_match()281   int max_match() override { return 0; }
assertion_type()282   AssertionType assertion_type() { return assertion_type_; }
283 
284  private:
285   AssertionType assertion_type_;
286 };
287 
288 
289 class RegExpCharacterClass final : public RegExpTree {
290  public:
RegExpCharacterClass(ZoneList<CharacterRange> * ranges,bool is_negated)291   RegExpCharacterClass(ZoneList<CharacterRange>* ranges, bool is_negated)
292       : set_(ranges), is_negated_(is_negated) {}
RegExpCharacterClass(uc16 type)293   explicit RegExpCharacterClass(uc16 type) : set_(type), is_negated_(false) {}
294   void* Accept(RegExpVisitor* visitor, void* data) override;
295   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
296   RegExpCharacterClass* AsCharacterClass() override;
297   bool IsCharacterClass() override;
IsTextElement()298   bool IsTextElement() override { return true; }
min_match()299   int min_match() override { return 1; }
300   // The character class may match two code units for unicode regexps.
301   // TODO(yangguo): we should split this class for usage in TextElement, and
302   //                make max_match() dependent on the character class content.
max_match()303   int max_match() override { return 2; }
304   void AppendToText(RegExpText* text, Zone* zone) override;
character_set()305   CharacterSet character_set() { return set_; }
306   // TODO(lrn): Remove need for complex version if is_standard that
307   // recognizes a mangled standard set and just do { return set_.is_special(); }
308   bool is_standard(Zone* zone);
309   // Returns a value representing the standard character set if is_standard()
310   // returns true.
311   // Currently used values are:
312   // s : unicode whitespace
313   // S : unicode non-whitespace
314   // w : ASCII word character (digit, letter, underscore)
315   // W : non-ASCII word character
316   // d : ASCII digit
317   // D : non-ASCII digit
318   // . : non-newline
319   // * : All characters, for advancing unanchored regexp
standard_type()320   uc16 standard_type() { return set_.standard_set_type(); }
ranges(Zone * zone)321   ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
is_negated()322   bool is_negated() { return is_negated_; }
323 
324  private:
325   CharacterSet set_;
326   bool is_negated_;
327 };
328 
329 
330 class RegExpAtom final : public RegExpTree {
331  public:
RegExpAtom(Vector<const uc16> data)332   explicit RegExpAtom(Vector<const uc16> data) : data_(data) {}
333   void* Accept(RegExpVisitor* visitor, void* data) override;
334   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
335   RegExpAtom* AsAtom() override;
336   bool IsAtom() override;
IsTextElement()337   bool IsTextElement() override { return true; }
min_match()338   int min_match() override { return data_.length(); }
max_match()339   int max_match() override { return data_.length(); }
340   void AppendToText(RegExpText* text, Zone* zone) override;
data()341   Vector<const uc16> data() { return data_; }
length()342   int length() { return data_.length(); }
343 
344  private:
345   Vector<const uc16> data_;
346 };
347 
348 
349 class RegExpText final : public RegExpTree {
350  public:
RegExpText(Zone * zone)351   explicit RegExpText(Zone* zone) : elements_(2, zone), length_(0) {}
352   void* Accept(RegExpVisitor* visitor, void* data) override;
353   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
354   RegExpText* AsText() override;
355   bool IsText() override;
IsTextElement()356   bool IsTextElement() override { return true; }
min_match()357   int min_match() override { return length_; }
max_match()358   int max_match() override { return length_; }
359   void AppendToText(RegExpText* text, Zone* zone) override;
AddElement(TextElement elm,Zone * zone)360   void AddElement(TextElement elm, Zone* zone) {
361     elements_.Add(elm, zone);
362     length_ += elm.length();
363   }
elements()364   ZoneList<TextElement>* elements() { return &elements_; }
365 
366  private:
367   ZoneList<TextElement> elements_;
368   int length_;
369 };
370 
371 
372 class RegExpQuantifier final : public RegExpTree {
373  public:
374   enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE };
RegExpQuantifier(int min,int max,QuantifierType type,RegExpTree * body)375   RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body)
376       : body_(body),
377         min_(min),
378         max_(max),
379         min_match_(min * body->min_match()),
380         quantifier_type_(type) {
381     if (max > 0 && body->max_match() > kInfinity / max) {
382       max_match_ = kInfinity;
383     } else {
384       max_match_ = max * body->max_match();
385     }
386   }
387   void* Accept(RegExpVisitor* visitor, void* data) override;
388   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
389   static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body,
390                             RegExpCompiler* compiler, RegExpNode* on_success,
391                             bool not_at_start = false);
392   RegExpQuantifier* AsQuantifier() override;
393   Interval CaptureRegisters() override;
394   bool IsQuantifier() override;
min_match()395   int min_match() override { return min_match_; }
max_match()396   int max_match() override { return max_match_; }
min()397   int min() { return min_; }
max()398   int max() { return max_; }
is_possessive()399   bool is_possessive() { return quantifier_type_ == POSSESSIVE; }
is_non_greedy()400   bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; }
is_greedy()401   bool is_greedy() { return quantifier_type_ == GREEDY; }
body()402   RegExpTree* body() { return body_; }
403 
404  private:
405   RegExpTree* body_;
406   int min_;
407   int max_;
408   int min_match_;
409   int max_match_;
410   QuantifierType quantifier_type_;
411 };
412 
413 
414 class RegExpCapture final : public RegExpTree {
415  public:
RegExpCapture(int index)416   explicit RegExpCapture(int index)
417       : body_(NULL), index_(index), name_(nullptr) {}
418   void* Accept(RegExpVisitor* visitor, void* data) override;
419   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
420   static RegExpNode* ToNode(RegExpTree* body, int index,
421                             RegExpCompiler* compiler, RegExpNode* on_success);
422   RegExpCapture* AsCapture() override;
423   bool IsAnchoredAtStart() override;
424   bool IsAnchoredAtEnd() override;
425   Interval CaptureRegisters() override;
426   bool IsCapture() override;
min_match()427   int min_match() override { return body_->min_match(); }
max_match()428   int max_match() override { return body_->max_match(); }
body()429   RegExpTree* body() { return body_; }
set_body(RegExpTree * body)430   void set_body(RegExpTree* body) { body_ = body; }
index()431   int index() { return index_; }
name()432   const ZoneVector<uc16>* name() const { return name_; }
set_name(const ZoneVector<uc16> * name)433   void set_name(const ZoneVector<uc16>* name) { name_ = name; }
StartRegister(int index)434   static int StartRegister(int index) { return index * 2; }
EndRegister(int index)435   static int EndRegister(int index) { return index * 2 + 1; }
436 
437  private:
438   RegExpTree* body_;
439   int index_;
440   const ZoneVector<uc16>* name_;
441 };
442 
443 
444 class RegExpLookaround final : public RegExpTree {
445  public:
446   enum Type { LOOKAHEAD, LOOKBEHIND };
447 
RegExpLookaround(RegExpTree * body,bool is_positive,int capture_count,int capture_from,Type type)448   RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count,
449                    int capture_from, Type type)
450       : body_(body),
451         is_positive_(is_positive),
452         capture_count_(capture_count),
453         capture_from_(capture_from),
454         type_(type) {}
455 
456   void* Accept(RegExpVisitor* visitor, void* data) override;
457   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
458   RegExpLookaround* AsLookaround() override;
459   Interval CaptureRegisters() override;
460   bool IsLookaround() override;
461   bool IsAnchoredAtStart() override;
min_match()462   int min_match() override { return 0; }
max_match()463   int max_match() override { return 0; }
body()464   RegExpTree* body() { return body_; }
is_positive()465   bool is_positive() { return is_positive_; }
capture_count()466   int capture_count() { return capture_count_; }
capture_from()467   int capture_from() { return capture_from_; }
type()468   Type type() { return type_; }
469 
470   class Builder {
471    public:
472     Builder(bool is_positive, RegExpNode* on_success,
473             int stack_pointer_register, int position_register,
474             int capture_register_count = 0, int capture_register_start = 0);
on_match_success()475     RegExpNode* on_match_success() { return on_match_success_; }
476     RegExpNode* ForMatch(RegExpNode* match);
477 
478    private:
479     bool is_positive_;
480     RegExpNode* on_match_success_;
481     RegExpNode* on_success_;
482     int stack_pointer_register_;
483     int position_register_;
484   };
485 
486  private:
487   RegExpTree* body_;
488   bool is_positive_;
489   int capture_count_;
490   int capture_from_;
491   Type type_;
492 };
493 
494 
495 class RegExpBackReference final : public RegExpTree {
496  public:
RegExpBackReference()497   RegExpBackReference() : capture_(nullptr), name_(nullptr) {}
RegExpBackReference(RegExpCapture * capture)498   explicit RegExpBackReference(RegExpCapture* capture)
499       : capture_(capture), name_(nullptr) {}
500   void* Accept(RegExpVisitor* visitor, void* data) override;
501   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
502   RegExpBackReference* AsBackReference() override;
503   bool IsBackReference() override;
min_match()504   int min_match() override { return 0; }
505   // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite
506   // recursion, we give up. Ignorance is bliss.
max_match()507   int max_match() override { return kInfinity; }
index()508   int index() { return capture_->index(); }
capture()509   RegExpCapture* capture() { return capture_; }
set_capture(RegExpCapture * capture)510   void set_capture(RegExpCapture* capture) { capture_ = capture; }
name()511   const ZoneVector<uc16>* name() const { return name_; }
set_name(const ZoneVector<uc16> * name)512   void set_name(const ZoneVector<uc16>* name) { name_ = name; }
513 
514  private:
515   RegExpCapture* capture_;
516   const ZoneVector<uc16>* name_;
517 };
518 
519 
520 class RegExpEmpty final : public RegExpTree {
521  public:
RegExpEmpty()522   RegExpEmpty() {}
523   void* Accept(RegExpVisitor* visitor, void* data) override;
524   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
525   RegExpEmpty* AsEmpty() override;
526   bool IsEmpty() override;
min_match()527   int min_match() override { return 0; }
max_match()528   int max_match() override { return 0; }
529 };
530 
531 }  // namespace internal
532 }  // namespace v8
533 
534 #endif  // V8_REGEXP_REGEXP_AST_H_
535