1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_REGEXP_JSREGEXP_H_
6 #define V8_REGEXP_JSREGEXP_H_
7 
8 #include "src/allocation.h"
9 #include "src/assembler.h"
10 #include "src/isolate.h"
11 #include "src/objects/js-regexp.h"
12 #include "src/regexp/regexp-ast.h"
13 #include "src/regexp/regexp-macro-assembler.h"
14 
15 namespace v8 {
16 namespace internal {
17 
18 class NodeVisitor;
19 class RegExpCompiler;
20 class RegExpMacroAssembler;
21 class RegExpNode;
22 class RegExpTree;
23 class BoyerMooreLookahead;
24 
IgnoreCase(JSRegExp::Flags flags)25 inline bool IgnoreCase(JSRegExp::Flags flags) {
26   return (flags & JSRegExp::kIgnoreCase) != 0;
27 }
28 
IsUnicode(JSRegExp::Flags flags)29 inline bool IsUnicode(JSRegExp::Flags flags) {
30   return (flags & JSRegExp::kUnicode) != 0;
31 }
32 
IsSticky(JSRegExp::Flags flags)33 inline bool IsSticky(JSRegExp::Flags flags) {
34   return (flags & JSRegExp::kSticky) != 0;
35 }
36 
IsGlobal(JSRegExp::Flags flags)37 inline bool IsGlobal(JSRegExp::Flags flags) {
38   return (flags & JSRegExp::kGlobal) != 0;
39 }
40 
DotAll(JSRegExp::Flags flags)41 inline bool DotAll(JSRegExp::Flags flags) {
42   return (flags & JSRegExp::kDotAll) != 0;
43 }
44 
Multiline(JSRegExp::Flags flags)45 inline bool Multiline(JSRegExp::Flags flags) {
46   return (flags & JSRegExp::kMultiline) != 0;
47 }
48 
NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags)49 inline bool NeedsUnicodeCaseEquivalents(JSRegExp::Flags flags) {
50   // Both unicode and ignore_case flags are set. We need to use ICU to find
51   // the closure over case equivalents.
52   return IsUnicode(flags) && IgnoreCase(flags);
53 }
54 
55 class RegExpImpl {
56  public:
57   // Whether V8 is compiled with native regexp support or not.
UsesNativeRegExp()58   static bool UsesNativeRegExp() {
59 #ifdef V8_INTERPRETED_REGEXP
60     return false;
61 #else
62     return true;
63 #endif
64   }
65 
66   // Returns a string representation of a regular expression.
67   // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
68   // This function calls the garbage collector if necessary.
69   static Handle<String> ToString(Handle<Object> value);
70 
71   // Parses the RegExp pattern and prepares the JSRegExp object with
72   // generic data and choice of implementation - as well as what
73   // the implementation wants to store in the data field.
74   // Returns false if compilation fails.
75   V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Compile(
76       Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
77       JSRegExp::Flags flags);
78 
79   // See ECMA-262 section 15.10.6.2.
80   // This function calls the garbage collector if necessary.
81   V8_EXPORT_PRIVATE V8_WARN_UNUSED_RESULT static MaybeHandle<Object> Exec(
82       Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
83       int index, Handle<RegExpMatchInfo> last_match_info);
84 
85   // Prepares a JSRegExp object with Irregexp-specific data.
86   static void IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
87                                  Handle<String> pattern, JSRegExp::Flags flags,
88                                  int capture_register_count);
89 
90   static void AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
91                           Handle<String> pattern, JSRegExp::Flags flags,
92                           Handle<String> match_pattern);
93 
94   static int AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
95                          Handle<String> subject, int index, int32_t* output,
96                          int output_size);
97 
98   static Handle<Object> AtomExec(Isolate* isolate, Handle<JSRegExp> regexp,
99                                  Handle<String> subject, int index,
100                                  Handle<RegExpMatchInfo> last_match_info);
101 
102   enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 };
103 
104   // Prepare a RegExp for being executed one or more times (using
105   // IrregexpExecOnce) on the subject.
106   // This ensures that the regexp is compiled for the subject, and that
107   // the subject is flat.
108   // Returns the number of integer spaces required by IrregexpExecOnce
109   // as its "registers" argument.  If the regexp cannot be compiled,
110   // an exception is set as pending, and this function returns negative.
111   static int IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
112                              Handle<String> subject);
113 
114   // Execute a regular expression on the subject, starting from index.
115   // If matching succeeds, return the number of matches.  This can be larger
116   // than one in the case of global regular expressions.
117   // The captures and subcaptures are stored into the registers vector.
118   // If matching fails, returns RE_FAILURE.
119   // If execution fails, sets a pending exception and returns RE_EXCEPTION.
120   static int IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
121                              Handle<String> subject, int index, int32_t* output,
122                              int output_size);
123 
124   // Execute an Irregexp bytecode pattern.
125   // On a successful match, the result is a JSArray containing
126   // captured positions.  On a failure, the result is the null value.
127   // Returns an empty handle in case of an exception.
128   V8_WARN_UNUSED_RESULT static MaybeHandle<Object> IrregexpExec(
129       Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
130       int index, Handle<RegExpMatchInfo> last_match_info);
131 
132   // Set last match info.  If match is nullptr, then setting captures is
133   // omitted.
134   static Handle<RegExpMatchInfo> SetLastMatchInfo(
135       Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
136       Handle<String> subject, int capture_count, int32_t* match);
137 
138   class GlobalCache {
139    public:
140     GlobalCache(Handle<JSRegExp> regexp,
141                 Handle<String> subject,
142                 Isolate* isolate);
143 
144     V8_INLINE ~GlobalCache();
145 
146     // Fetch the next entry in the cache for global regexp match results.
147     // This does not set the last match info.  Upon failure, nullptr is
148     // returned. The cause can be checked with Result().  The previous result is
149     // still in available in memory when a failure happens.
150     V8_INLINE int32_t* FetchNext();
151 
152     V8_INLINE int32_t* LastSuccessfulMatch();
153 
HasException()154     V8_INLINE bool HasException() { return num_matches_ < 0; }
155 
156    private:
157     int AdvanceZeroLength(int last_index);
158 
159     int num_matches_;
160     int max_matches_;
161     int current_match_index_;
162     int registers_per_match_;
163     // Pointer to the last set of captures.
164     int32_t* register_array_;
165     int register_array_size_;
166     Handle<JSRegExp> regexp_;
167     Handle<String> subject_;
168     Isolate* isolate_;
169   };
170 
171   // For acting on the JSRegExp data FixedArray.
172   static int IrregexpMaxRegisterCount(FixedArray* re);
173   static void SetIrregexpMaxRegisterCount(FixedArray* re, int value);
174   static void SetIrregexpCaptureNameMap(FixedArray* re,
175                                         Handle<FixedArray> value);
176   static int IrregexpNumberOfCaptures(FixedArray* re);
177   static int IrregexpNumberOfRegisters(FixedArray* re);
178   static ByteArray* IrregexpByteCode(FixedArray* re, bool is_one_byte);
179   static Code* IrregexpNativeCode(FixedArray* re, bool is_one_byte);
180 
181   // Limit the space regexps take up on the heap.  In order to limit this we
182   // would like to keep track of the amount of regexp code on the heap.  This
183   // is not tracked, however.  As a conservative approximation we track the
184   // total regexp code compiled including code that has subsequently been freed
185   // and the total executable memory at any point.
186   static const size_t kRegExpExecutableMemoryLimit = 16 * MB;
187   static const size_t kRegExpCompiledLimit = 1 * MB;
188   static const int kRegExpTooLargeToOptimize = 20 * KB;
189 
190  private:
191   static bool CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
192                               Handle<String> sample_subject, bool is_one_byte);
193   static inline bool EnsureCompiledIrregexp(Isolate* isolate,
194                                             Handle<JSRegExp> re,
195                                             Handle<String> sample_subject,
196                                             bool is_one_byte);
197 };
198 
199 
200 // Represents the location of one element relative to the intersection of
201 // two sets. Corresponds to the four areas of a Venn diagram.
202 enum ElementInSetsRelation {
203   kInsideNone = 0,
204   kInsideFirst = 1,
205   kInsideSecond = 2,
206   kInsideBoth = 3
207 };
208 
209 
210 // A set of unsigned integers that behaves especially well on small
211 // integers (< 32).  May do zone-allocation.
212 class OutSet: public ZoneObject {
213  public:
OutSet()214   OutSet() : first_(0), remaining_(nullptr), successors_(nullptr) {}
215   OutSet* Extend(unsigned value, Zone* zone);
216   bool Get(unsigned value) const;
217   static const unsigned kFirstLimit = 32;
218 
219  private:
220   // Destructively set a value in this set.  In most cases you want
221   // to use Extend instead to ensure that only one instance exists
222   // that contains the same values.
223   void Set(unsigned value, Zone* zone);
224 
225   // The successors are a list of sets that contain the same values
226   // as this set and the one more value that is not present in this
227   // set.
successors(Zone * zone)228   ZoneList<OutSet*>* successors(Zone* zone) { return successors_; }
229 
OutSet(uint32_t first,ZoneList<unsigned> * remaining)230   OutSet(uint32_t first, ZoneList<unsigned>* remaining)
231       : first_(first), remaining_(remaining), successors_(nullptr) {}
232   uint32_t first_;
233   ZoneList<unsigned>* remaining_;
234   ZoneList<OutSet*>* successors_;
235   friend class Trace;
236 };
237 
238 
239 // A mapping from integers, specified as ranges, to a set of integers.
240 // Used for mapping character ranges to choices.
241 class DispatchTable : public ZoneObject {
242  public:
DispatchTable(Zone * zone)243   explicit DispatchTable(Zone* zone) : tree_(zone) { }
244 
245   class Entry {
246    public:
Entry()247     Entry() : from_(0), to_(0), out_set_(nullptr) {}
Entry(uc32 from,uc32 to,OutSet * out_set)248     Entry(uc32 from, uc32 to, OutSet* out_set)
249         : from_(from), to_(to), out_set_(out_set) {
250       DCHECK(from <= to);
251     }
from()252     uc32 from() { return from_; }
to()253     uc32 to() { return to_; }
set_to(uc32 value)254     void set_to(uc32 value) { to_ = value; }
AddValue(int value,Zone * zone)255     void AddValue(int value, Zone* zone) {
256       out_set_ = out_set_->Extend(value, zone);
257     }
out_set()258     OutSet* out_set() { return out_set_; }
259    private:
260     uc32 from_;
261     uc32 to_;
262     OutSet* out_set_;
263   };
264 
265   class Config {
266    public:
267     typedef uc32 Key;
268     typedef Entry Value;
269     static const uc32 kNoKey;
NoValue()270     static const Entry NoValue() { return Value(); }
Compare(uc32 a,uc32 b)271     static inline int Compare(uc32 a, uc32 b) {
272       if (a == b)
273         return 0;
274       else if (a < b)
275         return -1;
276       else
277         return 1;
278     }
279   };
280 
281   void AddRange(CharacterRange range, int value, Zone* zone);
282   OutSet* Get(uc32 value);
283   void Dump();
284 
285   template <typename Callback>
ForEach(Callback * callback)286   void ForEach(Callback* callback) {
287     return tree()->ForEach(callback);
288   }
289 
290  private:
291   // There can't be a static empty set since it allocates its
292   // successors in a zone and caches them.
empty()293   OutSet* empty() { return &empty_; }
294   OutSet empty_;
tree()295   ZoneSplayTree<Config>* tree() { return &tree_; }
296   ZoneSplayTree<Config> tree_;
297 };
298 
299 
300 // Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
301 class UnicodeRangeSplitter {
302  public:
303   UnicodeRangeSplitter(Zone* zone, ZoneList<CharacterRange>* base);
304   void Call(uc32 from, DispatchTable::Entry entry);
305 
bmp()306   ZoneList<CharacterRange>* bmp() { return bmp_; }
lead_surrogates()307   ZoneList<CharacterRange>* lead_surrogates() { return lead_surrogates_; }
trail_surrogates()308   ZoneList<CharacterRange>* trail_surrogates() { return trail_surrogates_; }
non_bmp()309   ZoneList<CharacterRange>* non_bmp() const { return non_bmp_; }
310 
311  private:
312   static const int kBase = 0;
313   // Separate ranges into
314   static const int kBmpCodePoints = 1;
315   static const int kLeadSurrogates = 2;
316   static const int kTrailSurrogates = 3;
317   static const int kNonBmpCodePoints = 4;
318 
319   Zone* zone_;
320   DispatchTable table_;
321   ZoneList<CharacterRange>* bmp_;
322   ZoneList<CharacterRange>* lead_surrogates_;
323   ZoneList<CharacterRange>* trail_surrogates_;
324   ZoneList<CharacterRange>* non_bmp_;
325 };
326 
327 
328 #define FOR_EACH_NODE_TYPE(VISIT)                                    \
329   VISIT(End)                                                         \
330   VISIT(Action)                                                      \
331   VISIT(Choice)                                                      \
332   VISIT(BackReference)                                               \
333   VISIT(Assertion)                                                   \
334   VISIT(Text)
335 
336 
337 class Trace;
338 struct PreloadState;
339 class GreedyLoopState;
340 class AlternativeGenerationList;
341 
342 struct NodeInfo {
NodeInfoNodeInfo343   NodeInfo()
344       : being_analyzed(false),
345         been_analyzed(false),
346         follows_word_interest(false),
347         follows_newline_interest(false),
348         follows_start_interest(false),
349         at_end(false),
350         visited(false),
351         replacement_calculated(false) { }
352 
353   // Returns true if the interests and assumptions of this node
354   // matches the given one.
MatchesNodeInfo355   bool Matches(NodeInfo* that) {
356     return (at_end == that->at_end) &&
357            (follows_word_interest == that->follows_word_interest) &&
358            (follows_newline_interest == that->follows_newline_interest) &&
359            (follows_start_interest == that->follows_start_interest);
360   }
361 
362   // Updates the interests of this node given the interests of the
363   // node preceding it.
AddFromPrecedingNodeInfo364   void AddFromPreceding(NodeInfo* that) {
365     at_end |= that->at_end;
366     follows_word_interest |= that->follows_word_interest;
367     follows_newline_interest |= that->follows_newline_interest;
368     follows_start_interest |= that->follows_start_interest;
369   }
370 
HasLookbehindNodeInfo371   bool HasLookbehind() {
372     return follows_word_interest ||
373            follows_newline_interest ||
374            follows_start_interest;
375   }
376 
377   // Sets the interests of this node to include the interests of the
378   // following node.
AddFromFollowingNodeInfo379   void AddFromFollowing(NodeInfo* that) {
380     follows_word_interest |= that->follows_word_interest;
381     follows_newline_interest |= that->follows_newline_interest;
382     follows_start_interest |= that->follows_start_interest;
383   }
384 
ResetCompilationStateNodeInfo385   void ResetCompilationState() {
386     being_analyzed = false;
387     been_analyzed = false;
388   }
389 
390   bool being_analyzed: 1;
391   bool been_analyzed: 1;
392 
393   // These bits are set of this node has to know what the preceding
394   // character was.
395   bool follows_word_interest: 1;
396   bool follows_newline_interest: 1;
397   bool follows_start_interest: 1;
398 
399   bool at_end: 1;
400   bool visited: 1;
401   bool replacement_calculated: 1;
402 };
403 
404 
405 // Details of a quick mask-compare check that can look ahead in the
406 // input stream.
407 class QuickCheckDetails {
408  public:
QuickCheckDetails()409   QuickCheckDetails()
410       : characters_(0),
411         mask_(0),
412         value_(0),
413         cannot_match_(false) { }
QuickCheckDetails(int characters)414   explicit QuickCheckDetails(int characters)
415       : characters_(characters),
416         mask_(0),
417         value_(0),
418         cannot_match_(false) { }
419   bool Rationalize(bool one_byte);
420   // Merge in the information from another branch of an alternation.
421   void Merge(QuickCheckDetails* other, int from_index);
422   // Advance the current position by some amount.
423   void Advance(int by, bool one_byte);
424   void Clear();
cannot_match()425   bool cannot_match() { return cannot_match_; }
set_cannot_match()426   void set_cannot_match() { cannot_match_ = true; }
427   struct Position {
PositionPosition428     Position() : mask(0), value(0), determines_perfectly(false) { }
429     uc16 mask;
430     uc16 value;
431     bool determines_perfectly;
432   };
characters()433   int characters() { return characters_; }
set_characters(int characters)434   void set_characters(int characters) { characters_ = characters; }
positions(int index)435   Position* positions(int index) {
436     DCHECK_LE(0, index);
437     DCHECK_GT(characters_, index);
438     return positions_ + index;
439   }
mask()440   uint32_t mask() { return mask_; }
value()441   uint32_t value() { return value_; }
442 
443  private:
444   // How many characters do we have quick check information from.  This is
445   // the same for all branches of a choice node.
446   int characters_;
447   Position positions_[4];
448   // These values are the condensate of the above array after Rationalize().
449   uint32_t mask_;
450   uint32_t value_;
451   // If set to true, there is no way this quick check can match at all.
452   // E.g., if it requires to be at the start of the input, and isn't.
453   bool cannot_match_;
454 };
455 
456 
457 extern int kUninitializedRegExpNodePlaceHolder;
458 
459 
460 class RegExpNode: public ZoneObject {
461  public:
RegExpNode(Zone * zone)462   explicit RegExpNode(Zone* zone)
463       : replacement_(nullptr),
464         on_work_list_(false),
465         trace_count_(0),
466         zone_(zone) {
467     bm_info_[0] = bm_info_[1] = nullptr;
468   }
469   virtual ~RegExpNode();
470   virtual void Accept(NodeVisitor* visitor) = 0;
471   // Generates a goto to this node or actually generates the code at this point.
472   virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;
473   // How many characters must this node consume at a minimum in order to
474   // succeed.  If we have found at least 'still_to_find' characters that
475   // must be consumed there is no need to ask any following nodes whether
476   // they are sure to eat any more characters.  The not_at_start argument is
477   // used to indicate that we know we are not at the start of the input.  In
478   // this case anchored branches will always fail and can be ignored when
479   // determining how many characters are consumed on success.
480   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start) = 0;
481   // Emits some quick code that checks whether the preloaded characters match.
482   // Falls through on certain failure, jumps to the label on possible success.
483   // If the node cannot make a quick check it does nothing and returns false.
484   bool EmitQuickCheck(RegExpCompiler* compiler,
485                       Trace* bounds_check_trace,
486                       Trace* trace,
487                       bool preload_has_checked_bounds,
488                       Label* on_possible_success,
489                       QuickCheckDetails* details_return,
490                       bool fall_through_on_failure);
491   // For a given number of characters this returns a mask and a value.  The
492   // next n characters are anded with the mask and compared with the value.
493   // A comparison failure indicates the node cannot match the next n characters.
494   // A comparison success indicates the node may match.
495   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
496                                     RegExpCompiler* compiler,
497                                     int characters_filled_in,
498                                     bool not_at_start) = 0;
499   static const int kNodeIsTooComplexForGreedyLoops = kMinInt;
GreedyLoopTextLength()500   virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
501   // Only returns the successor for a text node of length 1 that matches any
502   // character and that has no guards on it.
GetSuccessorOfOmnivorousTextNode(RegExpCompiler * compiler)503   virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
504       RegExpCompiler* compiler) {
505     return nullptr;
506   }
507 
508   // Collects information on the possible code units (mod 128) that can match if
509   // we look forward.  This is used for a Boyer-Moore-like string searching
510   // implementation.  TODO(erikcorry):  This should share more code with
511   // EatsAtLeast, GetQuickCheckDetails.  The budget argument is used to limit
512   // the number of nodes we are willing to look at in order to create this data.
513   static const int kRecursionBudget = 200;
514   bool KeepRecursing(RegExpCompiler* compiler);
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)515   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
516                             BoyerMooreLookahead* bm, bool not_at_start) {
517     UNREACHABLE();
518   }
519 
520   // If we know that the input is one-byte then there are some nodes that can
521   // never match.  This method returns a node that can be substituted for
522   // itself, or nullptr if the node can never match.
FilterOneByte(int depth)523   virtual RegExpNode* FilterOneByte(int depth) { return this; }
524   // Helper for FilterOneByte.
replacement()525   RegExpNode* replacement() {
526     DCHECK(info()->replacement_calculated);
527     return replacement_;
528   }
set_replacement(RegExpNode * replacement)529   RegExpNode* set_replacement(RegExpNode* replacement) {
530     info()->replacement_calculated = true;
531     replacement_ =  replacement;
532     return replacement;  // For convenience.
533   }
534 
535   // We want to avoid recalculating the lookahead info, so we store it on the
536   // node.  Only info that is for this node is stored.  We can tell that the
537   // info is for this node when offset == 0, so the information is calculated
538   // relative to this node.
SaveBMInfo(BoyerMooreLookahead * bm,bool not_at_start,int offset)539   void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
540     if (offset == 0) set_bm_info(not_at_start, bm);
541   }
542 
label()543   Label* label() { return &label_; }
544   // If non-generic code is generated for a node (i.e. the node is not at the
545   // start of the trace) then it cannot be reused.  This variable sets a limit
546   // on how often we allow that to happen before we insist on starting a new
547   // trace and generating generic code for a node that can be reused by flushing
548   // the deferred actions in the current trace and generating a goto.
549   static const int kMaxCopiesCodeGenerated = 10;
550 
on_work_list()551   bool on_work_list() { return on_work_list_; }
set_on_work_list(bool value)552   void set_on_work_list(bool value) { on_work_list_ = value; }
553 
info()554   NodeInfo* info() { return &info_; }
555 
bm_info(bool not_at_start)556   BoyerMooreLookahead* bm_info(bool not_at_start) {
557     return bm_info_[not_at_start ? 1 : 0];
558   }
559 
zone()560   Zone* zone() const { return zone_; }
561 
562  protected:
563   enum LimitResult { DONE, CONTINUE };
564   RegExpNode* replacement_;
565 
566   LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
567 
set_bm_info(bool not_at_start,BoyerMooreLookahead * bm)568   void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
569     bm_info_[not_at_start ? 1 : 0] = bm;
570   }
571 
572  private:
573   static const int kFirstCharBudget = 10;
574   Label label_;
575   bool on_work_list_;
576   NodeInfo info_;
577   // This variable keeps track of how many times code has been generated for
578   // this node (in different traces).  We don't keep track of where the
579   // generated code is located unless the code is generated at the start of
580   // a trace, in which case it is generic and can be reused by flushing the
581   // deferred operations in the current trace and generating a goto.
582   int trace_count_;
583   BoyerMooreLookahead* bm_info_[2];
584 
585   Zone* zone_;
586 };
587 
588 
589 class SeqRegExpNode: public RegExpNode {
590  public:
SeqRegExpNode(RegExpNode * on_success)591   explicit SeqRegExpNode(RegExpNode* on_success)
592       : RegExpNode(on_success->zone()), on_success_(on_success) { }
on_success()593   RegExpNode* on_success() { return on_success_; }
set_on_success(RegExpNode * node)594   void set_on_success(RegExpNode* node) { on_success_ = node; }
595   virtual RegExpNode* FilterOneByte(int depth);
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)596   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
597                             BoyerMooreLookahead* bm, bool not_at_start) {
598     on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
599     if (offset == 0) set_bm_info(not_at_start, bm);
600   }
601 
602  protected:
603   RegExpNode* FilterSuccessor(int depth);
604 
605  private:
606   RegExpNode* on_success_;
607 };
608 
609 
610 class ActionNode: public SeqRegExpNode {
611  public:
612   enum ActionType {
613     SET_REGISTER,
614     INCREMENT_REGISTER,
615     STORE_POSITION,
616     BEGIN_SUBMATCH,
617     POSITIVE_SUBMATCH_SUCCESS,
618     EMPTY_MATCH_CHECK,
619     CLEAR_CAPTURES
620   };
621   static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success);
622   static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
623   static ActionNode* StorePosition(int reg,
624                                    bool is_capture,
625                                    RegExpNode* on_success);
626   static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success);
627   static ActionNode* BeginSubmatch(int stack_pointer_reg,
628                                    int position_reg,
629                                    RegExpNode* on_success);
630   static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg,
631                                              int restore_reg,
632                                              int clear_capture_count,
633                                              int clear_capture_from,
634                                              RegExpNode* on_success);
635   static ActionNode* EmptyMatchCheck(int start_register,
636                                      int repetition_register,
637                                      int repetition_limit,
638                                      RegExpNode* on_success);
639   virtual void Accept(NodeVisitor* visitor);
640   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
641   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)642   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
643                                     RegExpCompiler* compiler,
644                                     int filled_in,
645                                     bool not_at_start) {
646     return on_success()->GetQuickCheckDetails(
647         details, compiler, filled_in, not_at_start);
648   }
649   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
650                             BoyerMooreLookahead* bm, bool not_at_start);
action_type()651   ActionType action_type() { return action_type_; }
652   // TODO(erikcorry): We should allow some action nodes in greedy loops.
GreedyLoopTextLength()653   virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
654 
655  private:
656   union {
657     struct {
658       int reg;
659       int value;
660     } u_store_register;
661     struct {
662       int reg;
663     } u_increment_register;
664     struct {
665       int reg;
666       bool is_capture;
667     } u_position_register;
668     struct {
669       int stack_pointer_register;
670       int current_position_register;
671       int clear_register_count;
672       int clear_register_from;
673     } u_submatch;
674     struct {
675       int start_register;
676       int repetition_register;
677       int repetition_limit;
678     } u_empty_match_check;
679     struct {
680       int range_from;
681       int range_to;
682     } u_clear_captures;
683   } data_;
ActionNode(ActionType action_type,RegExpNode * on_success)684   ActionNode(ActionType action_type, RegExpNode* on_success)
685       : SeqRegExpNode(on_success),
686         action_type_(action_type) { }
687   ActionType action_type_;
688   friend class DotPrinter;
689 };
690 
691 
692 class TextNode: public SeqRegExpNode {
693  public:
TextNode(ZoneList<TextElement> * elms,bool read_backward,RegExpNode * on_success)694   TextNode(ZoneList<TextElement>* elms, bool read_backward,
695            RegExpNode* on_success)
696       : SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {}
TextNode(RegExpCharacterClass * that,bool read_backward,RegExpNode * on_success)697   TextNode(RegExpCharacterClass* that, bool read_backward,
698            RegExpNode* on_success)
699       : SeqRegExpNode(on_success),
700         elms_(new (zone()) ZoneList<TextElement>(1, zone())),
701         read_backward_(read_backward) {
702     elms_->Add(TextElement::CharClass(that), zone());
703   }
704   // Create TextNode for a single character class for the given ranges.
705   static TextNode* CreateForCharacterRanges(Zone* zone,
706                                             ZoneList<CharacterRange>* ranges,
707                                             bool read_backward,
708                                             RegExpNode* on_success,
709                                             JSRegExp::Flags flags);
710   // Create TextNode for a surrogate pair with a range given for the
711   // lead and the trail surrogate each.
712   static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
713                                           CharacterRange trail,
714                                           bool read_backward,
715                                           RegExpNode* on_success,
716                                           JSRegExp::Flags flags);
717   virtual void Accept(NodeVisitor* visitor);
718   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
719   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
720   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
721                                     RegExpCompiler* compiler,
722                                     int characters_filled_in,
723                                     bool not_at_start);
elements()724   ZoneList<TextElement>* elements() { return elms_; }
read_backward()725   bool read_backward() { return read_backward_; }
726   void MakeCaseIndependent(Isolate* isolate, bool is_one_byte);
727   virtual int GreedyLoopTextLength();
728   virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
729       RegExpCompiler* compiler);
730   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
731                             BoyerMooreLookahead* bm, bool not_at_start);
732   void CalculateOffsets();
733   virtual RegExpNode* FilterOneByte(int depth);
734 
735  private:
736   enum TextEmitPassType {
737     NON_LATIN1_MATCH,            // Check for characters that can't match.
738     SIMPLE_CHARACTER_MATCH,      // Case-dependent single character check.
739     NON_LETTER_CHARACTER_MATCH,  // Check characters that have no case equivs.
740     CASE_CHARACTER_MATCH,        // Case-independent single character check.
741     CHARACTER_CLASS_MATCH        // Character class.
742   };
743   static bool SkipPass(TextEmitPassType pass, bool ignore_case);
744   static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
745   static const int kLastPass = CHARACTER_CLASS_MATCH;
746   void TextEmitPass(RegExpCompiler* compiler,
747                     TextEmitPassType pass,
748                     bool preloaded,
749                     Trace* trace,
750                     bool first_element_checked,
751                     int* checked_up_to);
752   int Length();
753   ZoneList<TextElement>* elms_;
754   bool read_backward_;
755 };
756 
757 
758 class AssertionNode: public SeqRegExpNode {
759  public:
760   enum AssertionType {
761     AT_END,
762     AT_START,
763     AT_BOUNDARY,
764     AT_NON_BOUNDARY,
765     AFTER_NEWLINE
766   };
AtEnd(RegExpNode * on_success)767   static AssertionNode* AtEnd(RegExpNode* on_success) {
768     return new(on_success->zone()) AssertionNode(AT_END, on_success);
769   }
AtStart(RegExpNode * on_success)770   static AssertionNode* AtStart(RegExpNode* on_success) {
771     return new(on_success->zone()) AssertionNode(AT_START, on_success);
772   }
AtBoundary(RegExpNode * on_success)773   static AssertionNode* AtBoundary(RegExpNode* on_success) {
774     return new(on_success->zone()) AssertionNode(AT_BOUNDARY, on_success);
775   }
AtNonBoundary(RegExpNode * on_success)776   static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
777     return new(on_success->zone()) AssertionNode(AT_NON_BOUNDARY, on_success);
778   }
AfterNewline(RegExpNode * on_success)779   static AssertionNode* AfterNewline(RegExpNode* on_success) {
780     return new(on_success->zone()) AssertionNode(AFTER_NEWLINE, on_success);
781   }
782   virtual void Accept(NodeVisitor* visitor);
783   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
784   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
785   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
786                                     RegExpCompiler* compiler,
787                                     int filled_in,
788                                     bool not_at_start);
789   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
790                             BoyerMooreLookahead* bm, bool not_at_start);
assertion_type()791   AssertionType assertion_type() { return assertion_type_; }
792 
793  private:
794   void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
795   enum IfPrevious { kIsNonWord, kIsWord };
796   void BacktrackIfPrevious(RegExpCompiler* compiler,
797                            Trace* trace,
798                            IfPrevious backtrack_if_previous);
AssertionNode(AssertionType t,RegExpNode * on_success)799   AssertionNode(AssertionType t, RegExpNode* on_success)
800       : SeqRegExpNode(on_success), assertion_type_(t) { }
801   AssertionType assertion_type_;
802 };
803 
804 
805 class BackReferenceNode: public SeqRegExpNode {
806  public:
BackReferenceNode(int start_reg,int end_reg,JSRegExp::Flags flags,bool read_backward,RegExpNode * on_success)807   BackReferenceNode(int start_reg, int end_reg, JSRegExp::Flags flags,
808                     bool read_backward, RegExpNode* on_success)
809       : SeqRegExpNode(on_success),
810         start_reg_(start_reg),
811         end_reg_(end_reg),
812         flags_(flags),
813         read_backward_(read_backward) {}
814   virtual void Accept(NodeVisitor* visitor);
start_register()815   int start_register() { return start_reg_; }
end_register()816   int end_register() { return end_reg_; }
read_backward()817   bool read_backward() { return read_backward_; }
818   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
819   virtual int EatsAtLeast(int still_to_find,
820                           int recursion_depth,
821                           bool not_at_start);
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)822   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
823                                     RegExpCompiler* compiler,
824                                     int characters_filled_in,
825                                     bool not_at_start) {
826     return;
827   }
828   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
829                             BoyerMooreLookahead* bm, bool not_at_start);
830 
831  private:
832   int start_reg_;
833   int end_reg_;
834   JSRegExp::Flags flags_;
835   bool read_backward_;
836 };
837 
838 
839 class EndNode: public RegExpNode {
840  public:
841   enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
EndNode(Action action,Zone * zone)842   EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {}
843   virtual void Accept(NodeVisitor* visitor);
844   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
EatsAtLeast(int still_to_find,int recursion_depth,bool not_at_start)845   virtual int EatsAtLeast(int still_to_find,
846                           int recursion_depth,
847                           bool not_at_start) { return 0; }
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)848   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
849                                     RegExpCompiler* compiler,
850                                     int characters_filled_in,
851                                     bool not_at_start) {
852     // Returning 0 from EatsAtLeast should ensure we never get here.
853     UNREACHABLE();
854   }
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)855   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
856                             BoyerMooreLookahead* bm, bool not_at_start) {
857     // Returning 0 from EatsAtLeast should ensure we never get here.
858     UNREACHABLE();
859   }
860 
861  private:
862   Action action_;
863 };
864 
865 
866 class NegativeSubmatchSuccess: public EndNode {
867  public:
NegativeSubmatchSuccess(int stack_pointer_reg,int position_reg,int clear_capture_count,int clear_capture_start,Zone * zone)868   NegativeSubmatchSuccess(int stack_pointer_reg,
869                           int position_reg,
870                           int clear_capture_count,
871                           int clear_capture_start,
872                           Zone* zone)
873       : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone),
874         stack_pointer_register_(stack_pointer_reg),
875         current_position_register_(position_reg),
876         clear_capture_count_(clear_capture_count),
877         clear_capture_start_(clear_capture_start) { }
878   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
879 
880  private:
881   int stack_pointer_register_;
882   int current_position_register_;
883   int clear_capture_count_;
884   int clear_capture_start_;
885 };
886 
887 
888 class Guard: public ZoneObject {
889  public:
890   enum Relation { LT, GEQ };
Guard(int reg,Relation op,int value)891   Guard(int reg, Relation op, int value)
892       : reg_(reg),
893         op_(op),
894         value_(value) { }
reg()895   int reg() { return reg_; }
op()896   Relation op() { return op_; }
value()897   int value() { return value_; }
898 
899  private:
900   int reg_;
901   Relation op_;
902   int value_;
903 };
904 
905 
906 class GuardedAlternative {
907  public:
GuardedAlternative(RegExpNode * node)908   explicit GuardedAlternative(RegExpNode* node)
909       : node_(node), guards_(nullptr) {}
910   void AddGuard(Guard* guard, Zone* zone);
node()911   RegExpNode* node() { return node_; }
set_node(RegExpNode * node)912   void set_node(RegExpNode* node) { node_ = node; }
guards()913   ZoneList<Guard*>* guards() { return guards_; }
914 
915  private:
916   RegExpNode* node_;
917   ZoneList<Guard*>* guards_;
918 };
919 
920 
921 class AlternativeGeneration;
922 
923 
924 class ChoiceNode: public RegExpNode {
925  public:
ChoiceNode(int expected_size,Zone * zone)926   explicit ChoiceNode(int expected_size, Zone* zone)
927       : RegExpNode(zone),
928         alternatives_(new (zone)
929                           ZoneList<GuardedAlternative>(expected_size, zone)),
930         table_(nullptr),
931         not_at_start_(false),
932         being_calculated_(false) {}
933   virtual void Accept(NodeVisitor* visitor);
AddAlternative(GuardedAlternative node)934   void AddAlternative(GuardedAlternative node) {
935     alternatives()->Add(node, zone());
936   }
alternatives()937   ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
938   DispatchTable* GetTable(bool ignore_case);
939   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
940   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
941   int EatsAtLeastHelper(int still_to_find,
942                         int budget,
943                         RegExpNode* ignore_this_node,
944                         bool not_at_start);
945   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
946                                     RegExpCompiler* compiler,
947                                     int characters_filled_in,
948                                     bool not_at_start);
949   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
950                             BoyerMooreLookahead* bm, bool not_at_start);
951 
being_calculated()952   bool being_calculated() { return being_calculated_; }
not_at_start()953   bool not_at_start() { return not_at_start_; }
set_not_at_start()954   void set_not_at_start() { not_at_start_ = true; }
set_being_calculated(bool b)955   void set_being_calculated(bool b) { being_calculated_ = b; }
try_to_emit_quick_check_for_alternative(bool is_first)956   virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
957     return true;
958   }
959   virtual RegExpNode* FilterOneByte(int depth);
read_backward()960   virtual bool read_backward() { return false; }
961 
962  protected:
963   int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
964   ZoneList<GuardedAlternative>* alternatives_;
965 
966  private:
967   friend class DispatchTableConstructor;
968   friend class Analysis;
969   void GenerateGuard(RegExpMacroAssembler* macro_assembler,
970                      Guard* guard,
971                      Trace* trace);
972   int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
973   void EmitOutOfLineContinuation(RegExpCompiler* compiler,
974                                  Trace* trace,
975                                  GuardedAlternative alternative,
976                                  AlternativeGeneration* alt_gen,
977                                  int preload_characters,
978                                  bool next_expects_preload);
979   void SetUpPreLoad(RegExpCompiler* compiler,
980                     Trace* current_trace,
981                     PreloadState* preloads);
982   void AssertGuardsMentionRegisters(Trace* trace);
983   int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace);
984   Trace* EmitGreedyLoop(RegExpCompiler* compiler,
985                         Trace* trace,
986                         AlternativeGenerationList* alt_gens,
987                         PreloadState* preloads,
988                         GreedyLoopState* greedy_loop_state,
989                         int text_length);
990   void EmitChoices(RegExpCompiler* compiler,
991                    AlternativeGenerationList* alt_gens,
992                    int first_choice,
993                    Trace* trace,
994                    PreloadState* preloads);
995   DispatchTable* table_;
996   // If true, this node is never checked at the start of the input.
997   // Allows a new trace to start with at_start() set to false.
998   bool not_at_start_;
999   bool being_calculated_;
1000 };
1001 
1002 
1003 class NegativeLookaroundChoiceNode : public ChoiceNode {
1004  public:
NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,GuardedAlternative then_do_this,Zone * zone)1005   explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,
1006                                         GuardedAlternative then_do_this,
1007                                         Zone* zone)
1008       : ChoiceNode(2, zone) {
1009     AddAlternative(this_must_fail);
1010     AddAlternative(then_do_this);
1011   }
1012   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
1013   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1014                                     RegExpCompiler* compiler,
1015                                     int characters_filled_in,
1016                                     bool not_at_start);
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)1017   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
1018                             BoyerMooreLookahead* bm, bool not_at_start) {
1019     alternatives_->at(1).node()->FillInBMInfo(isolate, offset, budget - 1, bm,
1020                                               not_at_start);
1021     if (offset == 0) set_bm_info(not_at_start, bm);
1022   }
1023   // For a negative lookahead we don't emit the quick check for the
1024   // alternative that is expected to fail.  This is because quick check code
1025   // starts by loading enough characters for the alternative that takes fewest
1026   // characters, but on a negative lookahead the negative branch did not take
1027   // part in that calculation (EatsAtLeast) so the assumptions don't hold.
try_to_emit_quick_check_for_alternative(bool is_first)1028   virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
1029     return !is_first;
1030   }
1031   virtual RegExpNode* FilterOneByte(int depth);
1032 };
1033 
1034 
1035 class LoopChoiceNode: public ChoiceNode {
1036  public:
LoopChoiceNode(bool body_can_be_zero_length,bool read_backward,Zone * zone)1037   LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, Zone* zone)
1038       : ChoiceNode(2, zone),
1039         loop_node_(nullptr),
1040         continue_node_(nullptr),
1041         body_can_be_zero_length_(body_can_be_zero_length),
1042         read_backward_(read_backward) {}
1043   void AddLoopAlternative(GuardedAlternative alt);
1044   void AddContinueAlternative(GuardedAlternative alt);
1045   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
1046   virtual int EatsAtLeast(int still_to_find,  int budget, bool not_at_start);
1047   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1048                                     RegExpCompiler* compiler,
1049                                     int characters_filled_in,
1050                                     bool not_at_start);
1051   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
1052                             BoyerMooreLookahead* bm, bool not_at_start);
loop_node()1053   RegExpNode* loop_node() { return loop_node_; }
continue_node()1054   RegExpNode* continue_node() { return continue_node_; }
body_can_be_zero_length()1055   bool body_can_be_zero_length() { return body_can_be_zero_length_; }
read_backward()1056   virtual bool read_backward() { return read_backward_; }
1057   virtual void Accept(NodeVisitor* visitor);
1058   virtual RegExpNode* FilterOneByte(int depth);
1059 
1060  private:
1061   // AddAlternative is made private for loop nodes because alternatives
1062   // should not be added freely, we need to keep track of which node
1063   // goes back to the node itself.
AddAlternative(GuardedAlternative node)1064   void AddAlternative(GuardedAlternative node) {
1065     ChoiceNode::AddAlternative(node);
1066   }
1067 
1068   RegExpNode* loop_node_;
1069   RegExpNode* continue_node_;
1070   bool body_can_be_zero_length_;
1071   bool read_backward_;
1072 };
1073 
1074 
1075 // Improve the speed that we scan for an initial point where a non-anchored
1076 // regexp can match by using a Boyer-Moore-like table. This is done by
1077 // identifying non-greedy non-capturing loops in the nodes that eat any
1078 // character one at a time.  For example in the middle of the regexp
1079 // /foo[\s\S]*?bar/ we find such a loop.  There is also such a loop implicitly
1080 // inserted at the start of any non-anchored regexp.
1081 //
1082 // When we have found such a loop we look ahead in the nodes to find the set of
1083 // characters that can come at given distances. For example for the regexp
1084 // /.?foo/ we know that there are at least 3 characters ahead of us, and the
1085 // sets of characters that can occur are [any, [f, o], [o]]. We find a range in
1086 // the lookahead info where the set of characters is reasonably constrained. In
1087 // our example this is from index 1 to 2 (0 is not constrained). We can now
1088 // look 3 characters ahead and if we don't find one of [f, o] (the union of
1089 // [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
1090 //
1091 // For Unicode input strings we do the same, but modulo 128.
1092 //
1093 // We also look at the first string fed to the regexp and use that to get a hint
1094 // of the character frequencies in the inputs. This affects the assessment of
1095 // whether the set of characters is 'reasonably constrained'.
1096 //
1097 // We also have another lookahead mechanism (called quick check in the code),
1098 // which uses a wide load of multiple characters followed by a mask and compare
1099 // to determine whether a match is possible at this point.
1100 enum ContainedInLattice {
1101   kNotYet = 0,
1102   kLatticeIn = 1,
1103   kLatticeOut = 2,
1104   kLatticeUnknown = 3  // Can also mean both in and out.
1105 };
1106 
1107 
Combine(ContainedInLattice a,ContainedInLattice b)1108 inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
1109   return static_cast<ContainedInLattice>(a | b);
1110 }
1111 
1112 
1113 ContainedInLattice AddRange(ContainedInLattice a,
1114                             const int* ranges,
1115                             int ranges_size,
1116                             Interval new_range);
1117 
1118 
1119 class BoyerMoorePositionInfo : public ZoneObject {
1120  public:
BoyerMoorePositionInfo(Zone * zone)1121   explicit BoyerMoorePositionInfo(Zone* zone)
1122       : map_(new(zone) ZoneList<bool>(kMapSize, zone)),
1123         map_count_(0),
1124         w_(kNotYet),
1125         s_(kNotYet),
1126         d_(kNotYet),
1127         surrogate_(kNotYet) {
1128      for (int i = 0; i < kMapSize; i++) {
1129        map_->Add(false, zone);
1130      }
1131   }
1132 
at(int i)1133   bool& at(int i) { return map_->at(i); }
1134 
1135   static const int kMapSize = 128;
1136   static const int kMask = kMapSize - 1;
1137 
map_count()1138   int map_count() const { return map_count_; }
1139 
1140   void Set(int character);
1141   void SetInterval(const Interval& interval);
1142   void SetAll();
is_non_word()1143   bool is_non_word() { return w_ == kLatticeOut; }
is_word()1144   bool is_word() { return w_ == kLatticeIn; }
1145 
1146  private:
1147   ZoneList<bool>* map_;
1148   int map_count_;  // Number of set bits in the map.
1149   ContainedInLattice w_;  // The \w character class.
1150   ContainedInLattice s_;  // The \s character class.
1151   ContainedInLattice d_;  // The \d character class.
1152   ContainedInLattice surrogate_;  // Surrogate UTF-16 code units.
1153 };
1154 
1155 
1156 class BoyerMooreLookahead : public ZoneObject {
1157  public:
1158   BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
1159 
length()1160   int length() { return length_; }
max_char()1161   int max_char() { return max_char_; }
compiler()1162   RegExpCompiler* compiler() { return compiler_; }
1163 
Count(int map_number)1164   int Count(int map_number) {
1165     return bitmaps_->at(map_number)->map_count();
1166   }
1167 
at(int i)1168   BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
1169 
Set(int map_number,int character)1170   void Set(int map_number, int character) {
1171     if (character > max_char_) return;
1172     BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1173     info->Set(character);
1174   }
1175 
SetInterval(int map_number,const Interval & interval)1176   void SetInterval(int map_number, const Interval& interval) {
1177     if (interval.from() > max_char_) return;
1178     BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1179     if (interval.to() > max_char_) {
1180       info->SetInterval(Interval(interval.from(), max_char_));
1181     } else {
1182       info->SetInterval(interval);
1183     }
1184   }
1185 
SetAll(int map_number)1186   void SetAll(int map_number) {
1187     bitmaps_->at(map_number)->SetAll();
1188   }
1189 
SetRest(int from_map)1190   void SetRest(int from_map) {
1191     for (int i = from_map; i < length_; i++) SetAll(i);
1192   }
1193   void EmitSkipInstructions(RegExpMacroAssembler* masm);
1194 
1195  private:
1196   // This is the value obtained by EatsAtLeast.  If we do not have at least this
1197   // many characters left in the sample string then the match is bound to fail.
1198   // Therefore it is OK to read a character this far ahead of the current match
1199   // point.
1200   int length_;
1201   RegExpCompiler* compiler_;
1202   // 0xff for Latin1, 0xffff for UTF-16.
1203   int max_char_;
1204   ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
1205 
1206   int GetSkipTable(int min_lookahead,
1207                    int max_lookahead,
1208                    Handle<ByteArray> boolean_skip_table);
1209   bool FindWorthwhileInterval(int* from, int* to);
1210   int FindBestInterval(
1211     int max_number_of_chars, int old_biggest_points, int* from, int* to);
1212 };
1213 
1214 
1215 // There are many ways to generate code for a node.  This class encapsulates
1216 // the current way we should be generating.  In other words it encapsulates
1217 // the current state of the code generator.  The effect of this is that we
1218 // generate code for paths that the matcher can take through the regular
1219 // expression.  A given node in the regexp can be code-generated several times
1220 // as it can be part of several traces.  For example for the regexp:
1221 // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
1222 // of the foo-bar-baz trace and once as part of the foo-ip-baz trace.  The code
1223 // to match foo is generated only once (the traces have a common prefix).  The
1224 // code to store the capture is deferred and generated (twice) after the places
1225 // where baz has been matched.
1226 class Trace {
1227  public:
1228   // A value for a property that is either known to be true, know to be false,
1229   // or not known.
1230   enum TriBool {
1231     UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1
1232   };
1233 
1234   class DeferredAction {
1235    public:
DeferredAction(ActionNode::ActionType action_type,int reg)1236     DeferredAction(ActionNode::ActionType action_type, int reg)
1237         : action_type_(action_type), reg_(reg), next_(nullptr) {}
next()1238     DeferredAction* next() { return next_; }
1239     bool Mentions(int reg);
reg()1240     int reg() { return reg_; }
action_type()1241     ActionNode::ActionType action_type() { return action_type_; }
1242    private:
1243     ActionNode::ActionType action_type_;
1244     int reg_;
1245     DeferredAction* next_;
1246     friend class Trace;
1247   };
1248 
1249   class DeferredCapture : public DeferredAction {
1250    public:
DeferredCapture(int reg,bool is_capture,Trace * trace)1251     DeferredCapture(int reg, bool is_capture, Trace* trace)
1252         : DeferredAction(ActionNode::STORE_POSITION, reg),
1253           cp_offset_(trace->cp_offset()),
1254           is_capture_(is_capture) { }
cp_offset()1255     int cp_offset() { return cp_offset_; }
is_capture()1256     bool is_capture() { return is_capture_; }
1257    private:
1258     int cp_offset_;
1259     bool is_capture_;
set_cp_offset(int cp_offset)1260     void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
1261   };
1262 
1263   class DeferredSetRegister : public DeferredAction {
1264    public:
DeferredSetRegister(int reg,int value)1265     DeferredSetRegister(int reg, int value)
1266         : DeferredAction(ActionNode::SET_REGISTER, reg),
1267           value_(value) { }
value()1268     int value() { return value_; }
1269    private:
1270     int value_;
1271   };
1272 
1273   class DeferredClearCaptures : public DeferredAction {
1274    public:
DeferredClearCaptures(Interval range)1275     explicit DeferredClearCaptures(Interval range)
1276         : DeferredAction(ActionNode::CLEAR_CAPTURES, -1),
1277           range_(range) { }
range()1278     Interval range() { return range_; }
1279    private:
1280     Interval range_;
1281   };
1282 
1283   class DeferredIncrementRegister : public DeferredAction {
1284    public:
DeferredIncrementRegister(int reg)1285     explicit DeferredIncrementRegister(int reg)
1286         : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) { }
1287   };
1288 
Trace()1289   Trace()
1290       : cp_offset_(0),
1291         actions_(nullptr),
1292         backtrack_(nullptr),
1293         stop_node_(nullptr),
1294         loop_label_(nullptr),
1295         characters_preloaded_(0),
1296         bound_checked_up_to_(0),
1297         flush_budget_(100),
1298         at_start_(UNKNOWN) {}
1299 
1300   // End the trace.  This involves flushing the deferred actions in the trace
1301   // and pushing a backtrack location onto the backtrack stack.  Once this is
1302   // done we can start a new trace or go to one that has already been
1303   // generated.
1304   void Flush(RegExpCompiler* compiler, RegExpNode* successor);
cp_offset()1305   int cp_offset() { return cp_offset_; }
actions()1306   DeferredAction* actions() { return actions_; }
1307   // A trivial trace is one that has no deferred actions or other state that
1308   // affects the assumptions used when generating code.  There is no recorded
1309   // backtrack location in a trivial trace, so with a trivial trace we will
1310   // generate code that, on a failure to match, gets the backtrack location
1311   // from the backtrack stack rather than using a direct jump instruction.  We
1312   // always start code generation with a trivial trace and non-trivial traces
1313   // are created as we emit code for nodes or add to the list of deferred
1314   // actions in the trace.  The location of the code generated for a node using
1315   // a trivial trace is recorded in a label in the node so that gotos can be
1316   // generated to that code.
is_trivial()1317   bool is_trivial() {
1318     return backtrack_ == nullptr && actions_ == nullptr && cp_offset_ == 0 &&
1319            characters_preloaded_ == 0 && bound_checked_up_to_ == 0 &&
1320            quick_check_performed_.characters() == 0 && at_start_ == UNKNOWN;
1321   }
at_start()1322   TriBool at_start() { return at_start_; }
set_at_start(TriBool at_start)1323   void set_at_start(TriBool at_start) { at_start_ = at_start; }
backtrack()1324   Label* backtrack() { return backtrack_; }
loop_label()1325   Label* loop_label() { return loop_label_; }
stop_node()1326   RegExpNode* stop_node() { return stop_node_; }
characters_preloaded()1327   int characters_preloaded() { return characters_preloaded_; }
bound_checked_up_to()1328   int bound_checked_up_to() { return bound_checked_up_to_; }
flush_budget()1329   int flush_budget() { return flush_budget_; }
quick_check_performed()1330   QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
1331   bool mentions_reg(int reg);
1332   // Returns true if a deferred position store exists to the specified
1333   // register and stores the offset in the out-parameter.  Otherwise
1334   // returns false.
1335   bool GetStoredPosition(int reg, int* cp_offset);
1336   // These set methods and AdvanceCurrentPositionInTrace should be used only on
1337   // new traces - the intention is that traces are immutable after creation.
add_action(DeferredAction * new_action)1338   void add_action(DeferredAction* new_action) {
1339     DCHECK(new_action->next_ == nullptr);
1340     new_action->next_ = actions_;
1341     actions_ = new_action;
1342   }
set_backtrack(Label * backtrack)1343   void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
set_stop_node(RegExpNode * node)1344   void set_stop_node(RegExpNode* node) { stop_node_ = node; }
set_loop_label(Label * label)1345   void set_loop_label(Label* label) { loop_label_ = label; }
set_characters_preloaded(int count)1346   void set_characters_preloaded(int count) { characters_preloaded_ = count; }
set_bound_checked_up_to(int to)1347   void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
set_flush_budget(int to)1348   void set_flush_budget(int to) { flush_budget_ = to; }
set_quick_check_performed(QuickCheckDetails * d)1349   void set_quick_check_performed(QuickCheckDetails* d) {
1350     quick_check_performed_ = *d;
1351   }
1352   void InvalidateCurrentCharacter();
1353   void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
1354 
1355  private:
1356   int FindAffectedRegisters(OutSet* affected_registers, Zone* zone);
1357   void PerformDeferredActions(RegExpMacroAssembler* macro,
1358                               int max_register,
1359                               const OutSet& affected_registers,
1360                               OutSet* registers_to_pop,
1361                               OutSet* registers_to_clear,
1362                               Zone* zone);
1363   void RestoreAffectedRegisters(RegExpMacroAssembler* macro,
1364                                 int max_register,
1365                                 const OutSet& registers_to_pop,
1366                                 const OutSet& registers_to_clear);
1367   int cp_offset_;
1368   DeferredAction* actions_;
1369   Label* backtrack_;
1370   RegExpNode* stop_node_;
1371   Label* loop_label_;
1372   int characters_preloaded_;
1373   int bound_checked_up_to_;
1374   QuickCheckDetails quick_check_performed_;
1375   int flush_budget_;
1376   TriBool at_start_;
1377 };
1378 
1379 
1380 class GreedyLoopState {
1381  public:
1382   explicit GreedyLoopState(bool not_at_start);
1383 
label()1384   Label* label() { return &label_; }
counter_backtrack_trace()1385   Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }
1386 
1387  private:
1388   Label label_;
1389   Trace counter_backtrack_trace_;
1390 };
1391 
1392 
1393 struct PreloadState {
1394   static const int kEatsAtLeastNotYetInitialized = -1;
1395   bool preload_is_current_;
1396   bool preload_has_checked_bounds_;
1397   int preload_characters_;
1398   int eats_at_least_;
initPreloadState1399   void init() {
1400     eats_at_least_ = kEatsAtLeastNotYetInitialized;
1401   }
1402 };
1403 
1404 
1405 class NodeVisitor {
1406  public:
~NodeVisitor()1407   virtual ~NodeVisitor() { }
1408 #define DECLARE_VISIT(Type)                                          \
1409   virtual void Visit##Type(Type##Node* that) = 0;
FOR_EACH_NODE_TYPE(DECLARE_VISIT)1410 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1411 #undef DECLARE_VISIT
1412   virtual void VisitLoopChoice(LoopChoiceNode* that) { VisitChoice(that); }
1413 };
1414 
1415 
1416 // Node visitor used to add the start set of the alternatives to the
1417 // dispatch table of a choice node.
1418 class DispatchTableConstructor: public NodeVisitor {
1419  public:
DispatchTableConstructor(DispatchTable * table,bool ignore_case,Zone * zone)1420   DispatchTableConstructor(DispatchTable* table, bool ignore_case,
1421                            Zone* zone)
1422       : table_(table),
1423         choice_index_(-1),
1424         ignore_case_(ignore_case),
1425         zone_(zone) { }
1426 
1427   void BuildTable(ChoiceNode* node);
1428 
AddRange(CharacterRange range)1429   void AddRange(CharacterRange range) {
1430     table()->AddRange(range, choice_index_, zone_);
1431   }
1432 
1433   void AddInverse(ZoneList<CharacterRange>* ranges);
1434 
1435 #define DECLARE_VISIT(Type)                                          \
1436   virtual void Visit##Type(Type##Node* that);
FOR_EACH_NODE_TYPE(DECLARE_VISIT)1437 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1438 #undef DECLARE_VISIT
1439 
1440   DispatchTable* table() { return table_; }
set_choice_index(int value)1441   void set_choice_index(int value) { choice_index_ = value; }
1442 
1443  protected:
1444   DispatchTable* table_;
1445   int choice_index_;
1446   bool ignore_case_;
1447   Zone* zone_;
1448 };
1449 
1450 
1451 // Assertion propagation moves information about assertions such as
1452 // \b to the affected nodes.  For instance, in /.\b./ information must
1453 // be propagated to the first '.' that whatever follows needs to know
1454 // if it matched a word or a non-word, and to the second '.' that it
1455 // has to check if it succeeds a word or non-word.  In this case the
1456 // result will be something like:
1457 //
1458 //   +-------+        +------------+
1459 //   |   .   |        |      .     |
1460 //   +-------+  --->  +------------+
1461 //   | word? |        | check word |
1462 //   +-------+        +------------+
1463 class Analysis: public NodeVisitor {
1464  public:
Analysis(Isolate * isolate,bool is_one_byte)1465   Analysis(Isolate* isolate, bool is_one_byte)
1466       : isolate_(isolate), is_one_byte_(is_one_byte), error_message_(nullptr) {}
1467   void EnsureAnalyzed(RegExpNode* node);
1468 
1469 #define DECLARE_VISIT(Type)                                          \
1470   virtual void Visit##Type(Type##Node* that);
1471 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1472 #undef DECLARE_VISIT
1473   virtual void VisitLoopChoice(LoopChoiceNode* that);
1474 
has_failed()1475   bool has_failed() { return error_message_ != nullptr; }
error_message()1476   const char* error_message() {
1477     DCHECK(error_message_ != nullptr);
1478     return error_message_;
1479   }
fail(const char * error_message)1480   void fail(const char* error_message) {
1481     error_message_ = error_message;
1482   }
1483 
isolate()1484   Isolate* isolate() const { return isolate_; }
1485 
1486  private:
1487   Isolate* isolate_;
1488   bool is_one_byte_;
1489   const char* error_message_;
1490 
1491   DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
1492 };
1493 
1494 
1495 struct RegExpCompileData {
RegExpCompileDataRegExpCompileData1496   RegExpCompileData()
1497       : tree(nullptr),
1498         node(nullptr),
1499         simple(true),
1500         contains_anchor(false),
1501         capture_count(0) {}
1502   RegExpTree* tree;
1503   RegExpNode* node;
1504   bool simple;
1505   bool contains_anchor;
1506   Handle<FixedArray> capture_name_map;
1507   Handle<String> error;
1508   int capture_count;
1509 };
1510 
1511 
1512 class RegExpEngine: public AllStatic {
1513  public:
1514   struct CompilationResult {
CompilationResultCompilationResult1515     CompilationResult(Isolate* isolate, const char* error_message)
1516         : error_message(error_message),
1517           code(ReadOnlyRoots(isolate).the_hole_value()),
1518           num_registers(0) {}
CompilationResultCompilationResult1519     CompilationResult(Object* code, int registers)
1520         : error_message(nullptr), code(code), num_registers(registers) {}
1521     const char* error_message;
1522     Object* code;
1523     int num_registers;
1524   };
1525 
1526   static CompilationResult Compile(Isolate* isolate, Zone* zone,
1527                                    RegExpCompileData* input,
1528                                    JSRegExp::Flags flags,
1529                                    Handle<String> pattern,
1530                                    Handle<String> sample_subject,
1531                                    bool is_one_byte);
1532 
1533   static bool TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern);
1534 
1535   static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);
1536 };
1537 
1538 
1539 class RegExpResultsCache : public AllStatic {
1540  public:
1541   enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS };
1542 
1543   // Attempt to retrieve a cached result.  On failure, 0 is returned as a Smi.
1544   // On success, the returned result is guaranteed to be a COW-array.
1545   static Object* Lookup(Heap* heap, String* key_string, Object* key_pattern,
1546                         FixedArray** last_match_out, ResultsCacheType type);
1547   // Attempt to add value_array to the cache specified by type.  On success,
1548   // value_array is turned into a COW-array.
1549   static void Enter(Isolate* isolate, Handle<String> key_string,
1550                     Handle<Object> key_pattern, Handle<FixedArray> value_array,
1551                     Handle<FixedArray> last_match_cache, ResultsCacheType type);
1552   static void Clear(FixedArray* cache);
1553   static const int kRegExpResultsCacheSize = 0x100;
1554 
1555  private:
1556   static const int kArrayEntriesPerCacheEntry = 4;
1557   static const int kStringOffset = 0;
1558   static const int kPatternOffset = 1;
1559   static const int kArrayOffset = 2;
1560   static const int kLastMatchOffset = 3;
1561 };
1562 
1563 }  // namespace internal
1564 }  // namespace v8
1565 
1566 #endif  // V8_REGEXP_JSREGEXP_H_
1567