1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_REGEXP_REGEXP_AST_H_
6 #define V8_REGEXP_REGEXP_AST_H_
7 
8 #include "src/utils.h"
9 #include "src/zone.h"
10 
11 namespace v8 {
12 namespace internal {
13 
14 #define FOR_EACH_REG_EXP_TREE_TYPE(VISIT) \
15   VISIT(Disjunction)                      \
16   VISIT(Alternative)                      \
17   VISIT(Assertion)                        \
18   VISIT(CharacterClass)                   \
19   VISIT(Atom)                             \
20   VISIT(Quantifier)                       \
21   VISIT(Capture)                          \
22   VISIT(Lookaround)                       \
23   VISIT(BackReference)                    \
24   VISIT(Empty)                            \
25   VISIT(Text)
26 
27 
28 #define FORWARD_DECLARE(Name) class RegExp##Name;
29 FOR_EACH_REG_EXP_TREE_TYPE(FORWARD_DECLARE)
30 #undef FORWARD_DECLARE
31 
32 class RegExpCompiler;
33 class RegExpNode;
34 class RegExpTree;
35 
36 
37 class RegExpVisitor BASE_EMBEDDED {
38  public:
~RegExpVisitor()39   virtual ~RegExpVisitor() {}
40 #define MAKE_CASE(Name) \
41   virtual void* Visit##Name(RegExp##Name*, void* data) = 0;
42   FOR_EACH_REG_EXP_TREE_TYPE(MAKE_CASE)
43 #undef MAKE_CASE
44 };
45 
46 
47 // A simple closed interval.
48 class Interval {
49  public:
Interval()50   Interval() : from_(kNone), to_(kNone) {}
Interval(int from,int to)51   Interval(int from, int to) : from_(from), to_(to) {}
Union(Interval that)52   Interval Union(Interval that) {
53     if (that.from_ == kNone)
54       return *this;
55     else if (from_ == kNone)
56       return that;
57     else
58       return Interval(Min(from_, that.from_), Max(to_, that.to_));
59   }
Contains(int value)60   bool Contains(int value) { return (from_ <= value) && (value <= to_); }
is_empty()61   bool is_empty() { return from_ == kNone; }
from()62   int from() const { return from_; }
to()63   int to() const { return to_; }
Empty()64   static Interval Empty() { return Interval(); }
65   static const int kNone = -1;
66 
67  private:
68   int from_;
69   int to_;
70 };
71 
72 
73 // Represents code units in the range from from_ to to_, both ends are
74 // inclusive.
75 class CharacterRange {
76  public:
CharacterRange()77   CharacterRange() : from_(0), to_(0) {}
78   // For compatibility with the CHECK_OK macro
CharacterRange(void * null)79   CharacterRange(void* null) { DCHECK_NULL(null); }  // NOLINT
CharacterRange(uc16 from,uc16 to)80   CharacterRange(uc16 from, uc16 to) : from_(from), to_(to) {}
81   static void AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
82                              Zone* zone);
83   static Vector<const int> GetWordBounds();
Singleton(uc16 value)84   static inline CharacterRange Singleton(uc16 value) {
85     return CharacterRange(value, value);
86   }
Range(uc16 from,uc16 to)87   static inline CharacterRange Range(uc16 from, uc16 to) {
88     DCHECK(from <= to);
89     return CharacterRange(from, to);
90   }
Everything()91   static inline CharacterRange Everything() {
92     return CharacterRange(0, 0xFFFF);
93   }
Contains(uc16 i)94   bool Contains(uc16 i) { return from_ <= i && i <= to_; }
from()95   uc16 from() const { return from_; }
set_from(uc16 value)96   void set_from(uc16 value) { from_ = value; }
to()97   uc16 to() const { return to_; }
set_to(uc16 value)98   void set_to(uc16 value) { to_ = value; }
is_valid()99   bool is_valid() { return from_ <= to_; }
IsEverything(uc16 max)100   bool IsEverything(uc16 max) { return from_ == 0 && to_ >= max; }
IsSingleton()101   bool IsSingleton() { return (from_ == to_); }
102   void AddCaseEquivalents(Isolate* isolate, Zone* zone,
103                           ZoneList<CharacterRange>* ranges, bool is_one_byte);
104   static void Split(ZoneList<CharacterRange>* base, Vector<const int> overlay,
105                     ZoneList<CharacterRange>** included,
106                     ZoneList<CharacterRange>** excluded, Zone* zone);
107   // Whether a range list is in canonical form: Ranges ordered by from value,
108   // and ranges non-overlapping and non-adjacent.
109   static bool IsCanonical(ZoneList<CharacterRange>* ranges);
110   // Convert range list to canonical form. The characters covered by the ranges
111   // will still be the same, but no character is in more than one range, and
112   // adjacent ranges are merged. The resulting list may be shorter than the
113   // original, but cannot be longer.
114   static void Canonicalize(ZoneList<CharacterRange>* ranges);
115   // Negate the contents of a character range in canonical form.
116   static void Negate(ZoneList<CharacterRange>* src,
117                      ZoneList<CharacterRange>* dst, Zone* zone);
118   static const int kStartMarker = (1 << 24);
119   static const int kPayloadMask = (1 << 24) - 1;
120 
121  private:
122   uc16 from_;
123   uc16 to_;
124 };
125 
126 
127 class CharacterSet final BASE_EMBEDDED {
128  public:
CharacterSet(uc16 standard_set_type)129   explicit CharacterSet(uc16 standard_set_type)
130       : ranges_(NULL), standard_set_type_(standard_set_type) {}
CharacterSet(ZoneList<CharacterRange> * ranges)131   explicit CharacterSet(ZoneList<CharacterRange>* ranges)
132       : ranges_(ranges), standard_set_type_(0) {}
133   ZoneList<CharacterRange>* ranges(Zone* zone);
standard_set_type()134   uc16 standard_set_type() { return standard_set_type_; }
set_standard_set_type(uc16 special_set_type)135   void set_standard_set_type(uc16 special_set_type) {
136     standard_set_type_ = special_set_type;
137   }
is_standard()138   bool is_standard() { return standard_set_type_ != 0; }
139   void Canonicalize();
140 
141  private:
142   ZoneList<CharacterRange>* ranges_;
143   // If non-zero, the value represents a standard set (e.g., all whitespace
144   // characters) without having to expand the ranges.
145   uc16 standard_set_type_;
146 };
147 
148 
149 class TextElement final BASE_EMBEDDED {
150  public:
151   enum TextType { ATOM, CHAR_CLASS };
152 
153   static TextElement Atom(RegExpAtom* atom);
154   static TextElement CharClass(RegExpCharacterClass* char_class);
155 
cp_offset()156   int cp_offset() const { return cp_offset_; }
set_cp_offset(int cp_offset)157   void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
158   int length() const;
159 
text_type()160   TextType text_type() const { return text_type_; }
161 
tree()162   RegExpTree* tree() const { return tree_; }
163 
atom()164   RegExpAtom* atom() const {
165     DCHECK(text_type() == ATOM);
166     return reinterpret_cast<RegExpAtom*>(tree());
167   }
168 
char_class()169   RegExpCharacterClass* char_class() const {
170     DCHECK(text_type() == CHAR_CLASS);
171     return reinterpret_cast<RegExpCharacterClass*>(tree());
172   }
173 
174  private:
TextElement(TextType text_type,RegExpTree * tree)175   TextElement(TextType text_type, RegExpTree* tree)
176       : cp_offset_(-1), text_type_(text_type), tree_(tree) {}
177 
178   int cp_offset_;
179   TextType text_type_;
180   RegExpTree* tree_;
181 };
182 
183 
184 class RegExpTree : public ZoneObject {
185  public:
186   static const int kInfinity = kMaxInt;
~RegExpTree()187   virtual ~RegExpTree() {}
188   virtual void* Accept(RegExpVisitor* visitor, void* data) = 0;
189   virtual RegExpNode* ToNode(RegExpCompiler* compiler,
190                              RegExpNode* on_success) = 0;
IsTextElement()191   virtual bool IsTextElement() { return false; }
IsAnchoredAtStart()192   virtual bool IsAnchoredAtStart() { return false; }
IsAnchoredAtEnd()193   virtual bool IsAnchoredAtEnd() { return false; }
194   virtual int min_match() = 0;
195   virtual int max_match() = 0;
196   // Returns the interval of registers used for captures within this
197   // expression.
CaptureRegisters()198   virtual Interval CaptureRegisters() { return Interval::Empty(); }
199   virtual void AppendToText(RegExpText* text, Zone* zone);
200   std::ostream& Print(std::ostream& os, Zone* zone);  // NOLINT
201 #define MAKE_ASTYPE(Name)           \
202   virtual RegExp##Name* As##Name(); \
203   virtual bool Is##Name();
204   FOR_EACH_REG_EXP_TREE_TYPE(MAKE_ASTYPE)
205 #undef MAKE_ASTYPE
206 };
207 
208 
209 class RegExpDisjunction final : public RegExpTree {
210  public:
211   explicit RegExpDisjunction(ZoneList<RegExpTree*>* alternatives);
212   void* Accept(RegExpVisitor* visitor, void* data) override;
213   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
214   RegExpDisjunction* AsDisjunction() override;
215   Interval CaptureRegisters() override;
216   bool IsDisjunction() override;
217   bool IsAnchoredAtStart() override;
218   bool IsAnchoredAtEnd() override;
min_match()219   int min_match() override { return min_match_; }
max_match()220   int max_match() override { return max_match_; }
alternatives()221   ZoneList<RegExpTree*>* alternatives() { return alternatives_; }
222 
223  private:
224   bool SortConsecutiveAtoms(RegExpCompiler* compiler);
225   void RationalizeConsecutiveAtoms(RegExpCompiler* compiler);
226   void FixSingleCharacterDisjunctions(RegExpCompiler* compiler);
227   ZoneList<RegExpTree*>* alternatives_;
228   int min_match_;
229   int max_match_;
230 };
231 
232 
233 class RegExpAlternative final : public RegExpTree {
234  public:
235   explicit RegExpAlternative(ZoneList<RegExpTree*>* nodes);
236   void* Accept(RegExpVisitor* visitor, void* data) override;
237   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
238   RegExpAlternative* AsAlternative() override;
239   Interval CaptureRegisters() override;
240   bool IsAlternative() override;
241   bool IsAnchoredAtStart() override;
242   bool IsAnchoredAtEnd() override;
min_match()243   int min_match() override { return min_match_; }
max_match()244   int max_match() override { return max_match_; }
nodes()245   ZoneList<RegExpTree*>* nodes() { return nodes_; }
246 
247  private:
248   ZoneList<RegExpTree*>* nodes_;
249   int min_match_;
250   int max_match_;
251 };
252 
253 
254 class RegExpAssertion final : public RegExpTree {
255  public:
256   enum AssertionType {
257     START_OF_LINE,
258     START_OF_INPUT,
259     END_OF_LINE,
260     END_OF_INPUT,
261     BOUNDARY,
262     NON_BOUNDARY
263   };
RegExpAssertion(AssertionType type)264   explicit RegExpAssertion(AssertionType type) : assertion_type_(type) {}
265   void* Accept(RegExpVisitor* visitor, void* data) override;
266   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
267   RegExpAssertion* AsAssertion() override;
268   bool IsAssertion() override;
269   bool IsAnchoredAtStart() override;
270   bool IsAnchoredAtEnd() override;
min_match()271   int min_match() override { return 0; }
max_match()272   int max_match() override { return 0; }
assertion_type()273   AssertionType assertion_type() { return assertion_type_; }
274 
275  private:
276   AssertionType assertion_type_;
277 };
278 
279 
280 class RegExpCharacterClass final : public RegExpTree {
281  public:
RegExpCharacterClass(ZoneList<CharacterRange> * ranges,bool is_negated)282   RegExpCharacterClass(ZoneList<CharacterRange>* ranges, bool is_negated)
283       : set_(ranges), is_negated_(is_negated) {}
RegExpCharacterClass(uc16 type)284   explicit RegExpCharacterClass(uc16 type) : set_(type), is_negated_(false) {}
285   void* Accept(RegExpVisitor* visitor, void* data) override;
286   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
287   RegExpCharacterClass* AsCharacterClass() override;
288   bool IsCharacterClass() override;
IsTextElement()289   bool IsTextElement() override { return true; }
min_match()290   int min_match() override { return 1; }
max_match()291   int max_match() override { return 1; }
292   void AppendToText(RegExpText* text, Zone* zone) override;
character_set()293   CharacterSet character_set() { return set_; }
294   // TODO(lrn): Remove need for complex version if is_standard that
295   // recognizes a mangled standard set and just do { return set_.is_special(); }
296   bool is_standard(Zone* zone);
297   // Returns a value representing the standard character set if is_standard()
298   // returns true.
299   // Currently used values are:
300   // s : unicode whitespace
301   // S : unicode non-whitespace
302   // w : ASCII word character (digit, letter, underscore)
303   // W : non-ASCII word character
304   // d : ASCII digit
305   // D : non-ASCII digit
306   // . : non-unicode non-newline
307   // * : All characters
standard_type()308   uc16 standard_type() { return set_.standard_set_type(); }
ranges(Zone * zone)309   ZoneList<CharacterRange>* ranges(Zone* zone) { return set_.ranges(zone); }
is_negated()310   bool is_negated() { return is_negated_; }
311 
312  private:
313   CharacterSet set_;
314   bool is_negated_;
315 };
316 
317 
318 class RegExpAtom final : public RegExpTree {
319  public:
RegExpAtom(Vector<const uc16> data)320   explicit RegExpAtom(Vector<const uc16> data) : data_(data) {}
321   void* Accept(RegExpVisitor* visitor, void* data) override;
322   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
323   RegExpAtom* AsAtom() override;
324   bool IsAtom() override;
IsTextElement()325   bool IsTextElement() override { return true; }
min_match()326   int min_match() override { return data_.length(); }
max_match()327   int max_match() override { return data_.length(); }
328   void AppendToText(RegExpText* text, Zone* zone) override;
data()329   Vector<const uc16> data() { return data_; }
length()330   int length() { return data_.length(); }
331 
332  private:
333   Vector<const uc16> data_;
334 };
335 
336 
337 class RegExpText final : public RegExpTree {
338  public:
RegExpText(Zone * zone)339   explicit RegExpText(Zone* zone) : elements_(2, zone), length_(0) {}
340   void* Accept(RegExpVisitor* visitor, void* data) override;
341   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
342   RegExpText* AsText() override;
343   bool IsText() override;
IsTextElement()344   bool IsTextElement() override { return true; }
min_match()345   int min_match() override { return length_; }
max_match()346   int max_match() override { return length_; }
347   void AppendToText(RegExpText* text, Zone* zone) override;
AddElement(TextElement elm,Zone * zone)348   void AddElement(TextElement elm, Zone* zone) {
349     elements_.Add(elm, zone);
350     length_ += elm.length();
351   }
elements()352   ZoneList<TextElement>* elements() { return &elements_; }
353 
354  private:
355   ZoneList<TextElement> elements_;
356   int length_;
357 };
358 
359 
360 class RegExpQuantifier final : public RegExpTree {
361  public:
362   enum QuantifierType { GREEDY, NON_GREEDY, POSSESSIVE };
RegExpQuantifier(int min,int max,QuantifierType type,RegExpTree * body)363   RegExpQuantifier(int min, int max, QuantifierType type, RegExpTree* body)
364       : body_(body),
365         min_(min),
366         max_(max),
367         min_match_(min * body->min_match()),
368         quantifier_type_(type) {
369     if (max > 0 && body->max_match() > kInfinity / max) {
370       max_match_ = kInfinity;
371     } else {
372       max_match_ = max * body->max_match();
373     }
374   }
375   void* Accept(RegExpVisitor* visitor, void* data) override;
376   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
377   static RegExpNode* ToNode(int min, int max, bool is_greedy, RegExpTree* body,
378                             RegExpCompiler* compiler, RegExpNode* on_success,
379                             bool not_at_start = false);
380   RegExpQuantifier* AsQuantifier() override;
381   Interval CaptureRegisters() override;
382   bool IsQuantifier() override;
min_match()383   int min_match() override { return min_match_; }
max_match()384   int max_match() override { return max_match_; }
min()385   int min() { return min_; }
max()386   int max() { return max_; }
is_possessive()387   bool is_possessive() { return quantifier_type_ == POSSESSIVE; }
is_non_greedy()388   bool is_non_greedy() { return quantifier_type_ == NON_GREEDY; }
is_greedy()389   bool is_greedy() { return quantifier_type_ == GREEDY; }
body()390   RegExpTree* body() { return body_; }
391 
392  private:
393   RegExpTree* body_;
394   int min_;
395   int max_;
396   int min_match_;
397   int max_match_;
398   QuantifierType quantifier_type_;
399 };
400 
401 
402 class RegExpCapture final : public RegExpTree {
403  public:
RegExpCapture(int index)404   explicit RegExpCapture(int index) : body_(NULL), index_(index) {}
405   void* Accept(RegExpVisitor* visitor, void* data) override;
406   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
407   static RegExpNode* ToNode(RegExpTree* body, int index,
408                             RegExpCompiler* compiler, RegExpNode* on_success);
409   RegExpCapture* AsCapture() override;
410   bool IsAnchoredAtStart() override;
411   bool IsAnchoredAtEnd() override;
412   Interval CaptureRegisters() override;
413   bool IsCapture() override;
min_match()414   int min_match() override { return body_->min_match(); }
max_match()415   int max_match() override { return body_->max_match(); }
body()416   RegExpTree* body() { return body_; }
set_body(RegExpTree * body)417   void set_body(RegExpTree* body) { body_ = body; }
index()418   int index() { return index_; }
StartRegister(int index)419   static int StartRegister(int index) { return index * 2; }
EndRegister(int index)420   static int EndRegister(int index) { return index * 2 + 1; }
421 
422  private:
423   RegExpTree* body_;
424   int index_;
425 };
426 
427 
428 class RegExpLookaround final : public RegExpTree {
429  public:
430   enum Type { LOOKAHEAD, LOOKBEHIND };
431 
RegExpLookaround(RegExpTree * body,bool is_positive,int capture_count,int capture_from,Type type)432   RegExpLookaround(RegExpTree* body, bool is_positive, int capture_count,
433                    int capture_from, Type type)
434       : body_(body),
435         is_positive_(is_positive),
436         capture_count_(capture_count),
437         capture_from_(capture_from),
438         type_(type) {}
439 
440   void* Accept(RegExpVisitor* visitor, void* data) override;
441   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
442   RegExpLookaround* AsLookaround() override;
443   Interval CaptureRegisters() override;
444   bool IsLookaround() override;
445   bool IsAnchoredAtStart() override;
min_match()446   int min_match() override { return 0; }
max_match()447   int max_match() override { return 0; }
body()448   RegExpTree* body() { return body_; }
is_positive()449   bool is_positive() { return is_positive_; }
capture_count()450   int capture_count() { return capture_count_; }
capture_from()451   int capture_from() { return capture_from_; }
type()452   Type type() { return type_; }
453 
454  private:
455   RegExpTree* body_;
456   bool is_positive_;
457   int capture_count_;
458   int capture_from_;
459   Type type_;
460 };
461 
462 
463 class RegExpBackReference final : public RegExpTree {
464  public:
RegExpBackReference(RegExpCapture * capture)465   explicit RegExpBackReference(RegExpCapture* capture) : capture_(capture) {}
466   void* Accept(RegExpVisitor* visitor, void* data) override;
467   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
468   RegExpBackReference* AsBackReference() override;
469   bool IsBackReference() override;
min_match()470   int min_match() override { return 0; }
471   // The back reference may be recursive, e.g. /(\2)(\1)/. To avoid infinite
472   // recursion, we give up. Ignorance is bliss.
max_match()473   int max_match() override { return kInfinity; }
index()474   int index() { return capture_->index(); }
capture()475   RegExpCapture* capture() { return capture_; }
476 
477  private:
478   RegExpCapture* capture_;
479 };
480 
481 
482 class RegExpEmpty final : public RegExpTree {
483  public:
RegExpEmpty()484   RegExpEmpty() {}
485   void* Accept(RegExpVisitor* visitor, void* data) override;
486   RegExpNode* ToNode(RegExpCompiler* compiler, RegExpNode* on_success) override;
487   RegExpEmpty* AsEmpty() override;
488   bool IsEmpty() override;
min_match()489   int min_match() override { return 0; }
max_match()490   int max_match() override { return 0; }
491 };
492 
493 }  // namespace internal
494 }  // namespace v8
495 
496 #endif  // V8_REGEXP_REGEXP_AST_H_
497