1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_REGEXP_JSREGEXP_H_
6 #define V8_REGEXP_JSREGEXP_H_
7 
8 #include "src/allocation.h"
9 #include "src/assembler.h"
10 #include "src/regexp/regexp-ast.h"
11 #include "src/regexp/regexp-macro-assembler.h"
12 
13 namespace v8 {
14 namespace internal {
15 
16 class NodeVisitor;
17 class RegExpCompiler;
18 class RegExpMacroAssembler;
19 class RegExpNode;
20 class RegExpTree;
21 class BoyerMooreLookahead;
22 
23 class RegExpImpl {
24  public:
25   // Whether V8 is compiled with native regexp support or not.
UsesNativeRegExp()26   static bool UsesNativeRegExp() {
27 #ifdef V8_INTERPRETED_REGEXP
28     return false;
29 #else
30     return true;
31 #endif
32   }
33 
34   // Returns a string representation of a regular expression.
35   // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
36   // This function calls the garbage collector if necessary.
37   static Handle<String> ToString(Handle<Object> value);
38 
39   // Parses the RegExp pattern and prepares the JSRegExp object with
40   // generic data and choice of implementation - as well as what
41   // the implementation wants to store in the data field.
42   // Returns false if compilation fails.
43   MUST_USE_RESULT static MaybeHandle<Object> Compile(Handle<JSRegExp> re,
44                                                      Handle<String> pattern,
45                                                      JSRegExp::Flags flags);
46 
47   // See ECMA-262 section 15.10.6.2.
48   // This function calls the garbage collector if necessary.
49   V8_EXPORT_PRIVATE MUST_USE_RESULT static MaybeHandle<Object> Exec(
50       Handle<JSRegExp> regexp, Handle<String> subject, int index,
51       Handle<RegExpMatchInfo> last_match_info);
52 
53   // Prepares a JSRegExp object with Irregexp-specific data.
54   static void IrregexpInitialize(Handle<JSRegExp> re,
55                                  Handle<String> pattern,
56                                  JSRegExp::Flags flags,
57                                  int capture_register_count);
58 
59 
60   static void AtomCompile(Handle<JSRegExp> re,
61                           Handle<String> pattern,
62                           JSRegExp::Flags flags,
63                           Handle<String> match_pattern);
64 
65 
66   static int AtomExecRaw(Handle<JSRegExp> regexp,
67                          Handle<String> subject,
68                          int index,
69                          int32_t* output,
70                          int output_size);
71 
72   static Handle<Object> AtomExec(Handle<JSRegExp> regexp,
73                                  Handle<String> subject, int index,
74                                  Handle<RegExpMatchInfo> last_match_info);
75 
76   enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 };
77 
78   // Prepare a RegExp for being executed one or more times (using
79   // IrregexpExecOnce) on the subject.
80   // This ensures that the regexp is compiled for the subject, and that
81   // the subject is flat.
82   // Returns the number of integer spaces required by IrregexpExecOnce
83   // as its "registers" argument.  If the regexp cannot be compiled,
84   // an exception is set as pending, and this function returns negative.
85   static int IrregexpPrepare(Handle<JSRegExp> regexp,
86                              Handle<String> subject);
87 
88   // Execute a regular expression on the subject, starting from index.
89   // If matching succeeds, return the number of matches.  This can be larger
90   // than one in the case of global regular expressions.
91   // The captures and subcaptures are stored into the registers vector.
92   // If matching fails, returns RE_FAILURE.
93   // If execution fails, sets a pending exception and returns RE_EXCEPTION.
94   static int IrregexpExecRaw(Handle<JSRegExp> regexp,
95                              Handle<String> subject,
96                              int index,
97                              int32_t* output,
98                              int output_size);
99 
100   // Execute an Irregexp bytecode pattern.
101   // On a successful match, the result is a JSArray containing
102   // captured positions.  On a failure, the result is the null value.
103   // Returns an empty handle in case of an exception.
104   MUST_USE_RESULT static MaybeHandle<Object> IrregexpExec(
105       Handle<JSRegExp> regexp, Handle<String> subject, int index,
106       Handle<RegExpMatchInfo> last_match_info);
107 
108   // Set last match info.  If match is NULL, then setting captures is omitted.
109   static Handle<RegExpMatchInfo> SetLastMatchInfo(
110       Handle<RegExpMatchInfo> last_match_info, Handle<String> subject,
111       int capture_count, int32_t* match);
112 
113   class GlobalCache {
114    public:
115     GlobalCache(Handle<JSRegExp> regexp,
116                 Handle<String> subject,
117                 Isolate* isolate);
118 
119     INLINE(~GlobalCache());
120 
121     // Fetch the next entry in the cache for global regexp match results.
122     // This does not set the last match info.  Upon failure, NULL is returned.
123     // The cause can be checked with Result().  The previous
124     // result is still in available in memory when a failure happens.
125     INLINE(int32_t* FetchNext());
126 
127     INLINE(int32_t* LastSuccessfulMatch());
128 
INLINE(bool HasException ())129     INLINE(bool HasException()) { return num_matches_ < 0; }
130 
131    private:
132     int AdvanceZeroLength(int last_index);
133 
134     int num_matches_;
135     int max_matches_;
136     int current_match_index_;
137     int registers_per_match_;
138     // Pointer to the last set of captures.
139     int32_t* register_array_;
140     int register_array_size_;
141     Handle<JSRegExp> regexp_;
142     Handle<String> subject_;
143   };
144 
145   // For acting on the JSRegExp data FixedArray.
146   static int IrregexpMaxRegisterCount(FixedArray* re);
147   static void SetIrregexpMaxRegisterCount(FixedArray* re, int value);
148   static void SetIrregexpCaptureNameMap(FixedArray* re,
149                                         Handle<FixedArray> value);
150   static int IrregexpNumberOfCaptures(FixedArray* re);
151   static int IrregexpNumberOfRegisters(FixedArray* re);
152   static ByteArray* IrregexpByteCode(FixedArray* re, bool is_one_byte);
153   static Code* IrregexpNativeCode(FixedArray* re, bool is_one_byte);
154 
155   // Limit the space regexps take up on the heap.  In order to limit this we
156   // would like to keep track of the amount of regexp code on the heap.  This
157   // is not tracked, however.  As a conservative approximation we track the
158   // total regexp code compiled including code that has subsequently been freed
159   // and the total executable memory at any point.
160   static const size_t kRegExpExecutableMemoryLimit = 16 * MB;
161   static const int kRegExpCompiledLimit = 1 * MB;
162   static const int kRegExpTooLargeToOptimize = 20 * KB;
163 
164  private:
165   static bool CompileIrregexp(Handle<JSRegExp> re,
166                               Handle<String> sample_subject, bool is_one_byte);
167   static inline bool EnsureCompiledIrregexp(Handle<JSRegExp> re,
168                                             Handle<String> sample_subject,
169                                             bool is_one_byte);
170 };
171 
172 
173 // Represents the location of one element relative to the intersection of
174 // two sets. Corresponds to the four areas of a Venn diagram.
175 enum ElementInSetsRelation {
176   kInsideNone = 0,
177   kInsideFirst = 1,
178   kInsideSecond = 2,
179   kInsideBoth = 3
180 };
181 
182 
183 // A set of unsigned integers that behaves especially well on small
184 // integers (< 32).  May do zone-allocation.
185 class OutSet: public ZoneObject {
186  public:
OutSet()187   OutSet() : first_(0), remaining_(NULL), successors_(NULL) { }
188   OutSet* Extend(unsigned value, Zone* zone);
189   bool Get(unsigned value) const;
190   static const unsigned kFirstLimit = 32;
191 
192  private:
193   // Destructively set a value in this set.  In most cases you want
194   // to use Extend instead to ensure that only one instance exists
195   // that contains the same values.
196   void Set(unsigned value, Zone* zone);
197 
198   // The successors are a list of sets that contain the same values
199   // as this set and the one more value that is not present in this
200   // set.
successors(Zone * zone)201   ZoneList<OutSet*>* successors(Zone* zone) { return successors_; }
202 
OutSet(uint32_t first,ZoneList<unsigned> * remaining)203   OutSet(uint32_t first, ZoneList<unsigned>* remaining)
204       : first_(first), remaining_(remaining), successors_(NULL) { }
205   uint32_t first_;
206   ZoneList<unsigned>* remaining_;
207   ZoneList<OutSet*>* successors_;
208   friend class Trace;
209 };
210 
211 
212 // A mapping from integers, specified as ranges, to a set of integers.
213 // Used for mapping character ranges to choices.
214 class DispatchTable : public ZoneObject {
215  public:
DispatchTable(Zone * zone)216   explicit DispatchTable(Zone* zone) : tree_(zone) { }
217 
218   class Entry {
219    public:
Entry()220     Entry() : from_(0), to_(0), out_set_(NULL) { }
Entry(uc32 from,uc32 to,OutSet * out_set)221     Entry(uc32 from, uc32 to, OutSet* out_set)
222         : from_(from), to_(to), out_set_(out_set) {
223       DCHECK(from <= to);
224     }
from()225     uc32 from() { return from_; }
to()226     uc32 to() { return to_; }
set_to(uc32 value)227     void set_to(uc32 value) { to_ = value; }
AddValue(int value,Zone * zone)228     void AddValue(int value, Zone* zone) {
229       out_set_ = out_set_->Extend(value, zone);
230     }
out_set()231     OutSet* out_set() { return out_set_; }
232    private:
233     uc32 from_;
234     uc32 to_;
235     OutSet* out_set_;
236   };
237 
238   class Config {
239    public:
240     typedef uc32 Key;
241     typedef Entry Value;
242     static const uc32 kNoKey;
NoValue()243     static const Entry NoValue() { return Value(); }
Compare(uc32 a,uc32 b)244     static inline int Compare(uc32 a, uc32 b) {
245       if (a == b)
246         return 0;
247       else if (a < b)
248         return -1;
249       else
250         return 1;
251     }
252   };
253 
254   void AddRange(CharacterRange range, int value, Zone* zone);
255   OutSet* Get(uc32 value);
256   void Dump();
257 
258   template <typename Callback>
ForEach(Callback * callback)259   void ForEach(Callback* callback) {
260     return tree()->ForEach(callback);
261   }
262 
263  private:
264   // There can't be a static empty set since it allocates its
265   // successors in a zone and caches them.
empty()266   OutSet* empty() { return &empty_; }
267   OutSet empty_;
tree()268   ZoneSplayTree<Config>* tree() { return &tree_; }
269   ZoneSplayTree<Config> tree_;
270 };
271 
272 
273 // Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
274 class UnicodeRangeSplitter {
275  public:
276   UnicodeRangeSplitter(Zone* zone, ZoneList<CharacterRange>* base);
277   void Call(uc32 from, DispatchTable::Entry entry);
278 
bmp()279   ZoneList<CharacterRange>* bmp() { return bmp_; }
lead_surrogates()280   ZoneList<CharacterRange>* lead_surrogates() { return lead_surrogates_; }
trail_surrogates()281   ZoneList<CharacterRange>* trail_surrogates() { return trail_surrogates_; }
non_bmp()282   ZoneList<CharacterRange>* non_bmp() const { return non_bmp_; }
283 
284  private:
285   static const int kBase = 0;
286   // Separate ranges into
287   static const int kBmpCodePoints = 1;
288   static const int kLeadSurrogates = 2;
289   static const int kTrailSurrogates = 3;
290   static const int kNonBmpCodePoints = 4;
291 
292   Zone* zone_;
293   DispatchTable table_;
294   ZoneList<CharacterRange>* bmp_;
295   ZoneList<CharacterRange>* lead_surrogates_;
296   ZoneList<CharacterRange>* trail_surrogates_;
297   ZoneList<CharacterRange>* non_bmp_;
298 };
299 
300 
301 #define FOR_EACH_NODE_TYPE(VISIT)                                    \
302   VISIT(End)                                                         \
303   VISIT(Action)                                                      \
304   VISIT(Choice)                                                      \
305   VISIT(BackReference)                                               \
306   VISIT(Assertion)                                                   \
307   VISIT(Text)
308 
309 
310 class Trace;
311 struct PreloadState;
312 class GreedyLoopState;
313 class AlternativeGenerationList;
314 
315 struct NodeInfo {
NodeInfoNodeInfo316   NodeInfo()
317       : being_analyzed(false),
318         been_analyzed(false),
319         follows_word_interest(false),
320         follows_newline_interest(false),
321         follows_start_interest(false),
322         at_end(false),
323         visited(false),
324         replacement_calculated(false) { }
325 
326   // Returns true if the interests and assumptions of this node
327   // matches the given one.
MatchesNodeInfo328   bool Matches(NodeInfo* that) {
329     return (at_end == that->at_end) &&
330            (follows_word_interest == that->follows_word_interest) &&
331            (follows_newline_interest == that->follows_newline_interest) &&
332            (follows_start_interest == that->follows_start_interest);
333   }
334 
335   // Updates the interests of this node given the interests of the
336   // node preceding it.
AddFromPrecedingNodeInfo337   void AddFromPreceding(NodeInfo* that) {
338     at_end |= that->at_end;
339     follows_word_interest |= that->follows_word_interest;
340     follows_newline_interest |= that->follows_newline_interest;
341     follows_start_interest |= that->follows_start_interest;
342   }
343 
HasLookbehindNodeInfo344   bool HasLookbehind() {
345     return follows_word_interest ||
346            follows_newline_interest ||
347            follows_start_interest;
348   }
349 
350   // Sets the interests of this node to include the interests of the
351   // following node.
AddFromFollowingNodeInfo352   void AddFromFollowing(NodeInfo* that) {
353     follows_word_interest |= that->follows_word_interest;
354     follows_newline_interest |= that->follows_newline_interest;
355     follows_start_interest |= that->follows_start_interest;
356   }
357 
ResetCompilationStateNodeInfo358   void ResetCompilationState() {
359     being_analyzed = false;
360     been_analyzed = false;
361   }
362 
363   bool being_analyzed: 1;
364   bool been_analyzed: 1;
365 
366   // These bits are set of this node has to know what the preceding
367   // character was.
368   bool follows_word_interest: 1;
369   bool follows_newline_interest: 1;
370   bool follows_start_interest: 1;
371 
372   bool at_end: 1;
373   bool visited: 1;
374   bool replacement_calculated: 1;
375 };
376 
377 
378 // Details of a quick mask-compare check that can look ahead in the
379 // input stream.
380 class QuickCheckDetails {
381  public:
QuickCheckDetails()382   QuickCheckDetails()
383       : characters_(0),
384         mask_(0),
385         value_(0),
386         cannot_match_(false) { }
QuickCheckDetails(int characters)387   explicit QuickCheckDetails(int characters)
388       : characters_(characters),
389         mask_(0),
390         value_(0),
391         cannot_match_(false) { }
392   bool Rationalize(bool one_byte);
393   // Merge in the information from another branch of an alternation.
394   void Merge(QuickCheckDetails* other, int from_index);
395   // Advance the current position by some amount.
396   void Advance(int by, bool one_byte);
397   void Clear();
cannot_match()398   bool cannot_match() { return cannot_match_; }
set_cannot_match()399   void set_cannot_match() { cannot_match_ = true; }
400   struct Position {
PositionPosition401     Position() : mask(0), value(0), determines_perfectly(false) { }
402     uc16 mask;
403     uc16 value;
404     bool determines_perfectly;
405   };
characters()406   int characters() { return characters_; }
set_characters(int characters)407   void set_characters(int characters) { characters_ = characters; }
positions(int index)408   Position* positions(int index) {
409     DCHECK(index >= 0);
410     DCHECK(index < characters_);
411     return positions_ + index;
412   }
mask()413   uint32_t mask() { return mask_; }
value()414   uint32_t value() { return value_; }
415 
416  private:
417   // How many characters do we have quick check information from.  This is
418   // the same for all branches of a choice node.
419   int characters_;
420   Position positions_[4];
421   // These values are the condensate of the above array after Rationalize().
422   uint32_t mask_;
423   uint32_t value_;
424   // If set to true, there is no way this quick check can match at all.
425   // E.g., if it requires to be at the start of the input, and isn't.
426   bool cannot_match_;
427 };
428 
429 
430 extern int kUninitializedRegExpNodePlaceHolder;
431 
432 
433 class RegExpNode: public ZoneObject {
434  public:
RegExpNode(Zone * zone)435   explicit RegExpNode(Zone* zone)
436       : replacement_(NULL), on_work_list_(false), trace_count_(0), zone_(zone) {
437     bm_info_[0] = bm_info_[1] = NULL;
438   }
439   virtual ~RegExpNode();
440   virtual void Accept(NodeVisitor* visitor) = 0;
441   // Generates a goto to this node or actually generates the code at this point.
442   virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;
443   // How many characters must this node consume at a minimum in order to
444   // succeed.  If we have found at least 'still_to_find' characters that
445   // must be consumed there is no need to ask any following nodes whether
446   // they are sure to eat any more characters.  The not_at_start argument is
447   // used to indicate that we know we are not at the start of the input.  In
448   // this case anchored branches will always fail and can be ignored when
449   // determining how many characters are consumed on success.
450   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start) = 0;
451   // Emits some quick code that checks whether the preloaded characters match.
452   // Falls through on certain failure, jumps to the label on possible success.
453   // If the node cannot make a quick check it does nothing and returns false.
454   bool EmitQuickCheck(RegExpCompiler* compiler,
455                       Trace* bounds_check_trace,
456                       Trace* trace,
457                       bool preload_has_checked_bounds,
458                       Label* on_possible_success,
459                       QuickCheckDetails* details_return,
460                       bool fall_through_on_failure);
461   // For a given number of characters this returns a mask and a value.  The
462   // next n characters are anded with the mask and compared with the value.
463   // A comparison failure indicates the node cannot match the next n characters.
464   // A comparison success indicates the node may match.
465   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
466                                     RegExpCompiler* compiler,
467                                     int characters_filled_in,
468                                     bool not_at_start) = 0;
469   static const int kNodeIsTooComplexForGreedyLoops = kMinInt;
GreedyLoopTextLength()470   virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
471   // Only returns the successor for a text node of length 1 that matches any
472   // character and that has no guards on it.
GetSuccessorOfOmnivorousTextNode(RegExpCompiler * compiler)473   virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
474       RegExpCompiler* compiler) {
475     return NULL;
476   }
477 
478   // Collects information on the possible code units (mod 128) that can match if
479   // we look forward.  This is used for a Boyer-Moore-like string searching
480   // implementation.  TODO(erikcorry):  This should share more code with
481   // EatsAtLeast, GetQuickCheckDetails.  The budget argument is used to limit
482   // the number of nodes we are willing to look at in order to create this data.
483   static const int kRecursionBudget = 200;
484   bool KeepRecursing(RegExpCompiler* compiler);
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)485   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
486                             BoyerMooreLookahead* bm, bool not_at_start) {
487     UNREACHABLE();
488   }
489 
490   // If we know that the input is one-byte then there are some nodes that can
491   // never match.  This method returns a node that can be substituted for
492   // itself, or NULL if the node can never match.
FilterOneByte(int depth,bool ignore_case)493   virtual RegExpNode* FilterOneByte(int depth, bool ignore_case) {
494     return this;
495   }
496   // Helper for FilterOneByte.
replacement()497   RegExpNode* replacement() {
498     DCHECK(info()->replacement_calculated);
499     return replacement_;
500   }
set_replacement(RegExpNode * replacement)501   RegExpNode* set_replacement(RegExpNode* replacement) {
502     info()->replacement_calculated = true;
503     replacement_ =  replacement;
504     return replacement;  // For convenience.
505   }
506 
507   // We want to avoid recalculating the lookahead info, so we store it on the
508   // node.  Only info that is for this node is stored.  We can tell that the
509   // info is for this node when offset == 0, so the information is calculated
510   // relative to this node.
SaveBMInfo(BoyerMooreLookahead * bm,bool not_at_start,int offset)511   void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
512     if (offset == 0) set_bm_info(not_at_start, bm);
513   }
514 
label()515   Label* label() { return &label_; }
516   // If non-generic code is generated for a node (i.e. the node is not at the
517   // start of the trace) then it cannot be reused.  This variable sets a limit
518   // on how often we allow that to happen before we insist on starting a new
519   // trace and generating generic code for a node that can be reused by flushing
520   // the deferred actions in the current trace and generating a goto.
521   static const int kMaxCopiesCodeGenerated = 10;
522 
on_work_list()523   bool on_work_list() { return on_work_list_; }
set_on_work_list(bool value)524   void set_on_work_list(bool value) { on_work_list_ = value; }
525 
info()526   NodeInfo* info() { return &info_; }
527 
bm_info(bool not_at_start)528   BoyerMooreLookahead* bm_info(bool not_at_start) {
529     return bm_info_[not_at_start ? 1 : 0];
530   }
531 
zone()532   Zone* zone() const { return zone_; }
533 
534  protected:
535   enum LimitResult { DONE, CONTINUE };
536   RegExpNode* replacement_;
537 
538   LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
539 
set_bm_info(bool not_at_start,BoyerMooreLookahead * bm)540   void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
541     bm_info_[not_at_start ? 1 : 0] = bm;
542   }
543 
544  private:
545   static const int kFirstCharBudget = 10;
546   Label label_;
547   bool on_work_list_;
548   NodeInfo info_;
549   // This variable keeps track of how many times code has been generated for
550   // this node (in different traces).  We don't keep track of where the
551   // generated code is located unless the code is generated at the start of
552   // a trace, in which case it is generic and can be reused by flushing the
553   // deferred operations in the current trace and generating a goto.
554   int trace_count_;
555   BoyerMooreLookahead* bm_info_[2];
556 
557   Zone* zone_;
558 };
559 
560 
561 class SeqRegExpNode: public RegExpNode {
562  public:
SeqRegExpNode(RegExpNode * on_success)563   explicit SeqRegExpNode(RegExpNode* on_success)
564       : RegExpNode(on_success->zone()), on_success_(on_success) { }
on_success()565   RegExpNode* on_success() { return on_success_; }
set_on_success(RegExpNode * node)566   void set_on_success(RegExpNode* node) { on_success_ = node; }
567   virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)568   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
569                             BoyerMooreLookahead* bm, bool not_at_start) {
570     on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
571     if (offset == 0) set_bm_info(not_at_start, bm);
572   }
573 
574  protected:
575   RegExpNode* FilterSuccessor(int depth, bool ignore_case);
576 
577  private:
578   RegExpNode* on_success_;
579 };
580 
581 
582 class ActionNode: public SeqRegExpNode {
583  public:
584   enum ActionType {
585     SET_REGISTER,
586     INCREMENT_REGISTER,
587     STORE_POSITION,
588     BEGIN_SUBMATCH,
589     POSITIVE_SUBMATCH_SUCCESS,
590     EMPTY_MATCH_CHECK,
591     CLEAR_CAPTURES
592   };
593   static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success);
594   static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
595   static ActionNode* StorePosition(int reg,
596                                    bool is_capture,
597                                    RegExpNode* on_success);
598   static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success);
599   static ActionNode* BeginSubmatch(int stack_pointer_reg,
600                                    int position_reg,
601                                    RegExpNode* on_success);
602   static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg,
603                                              int restore_reg,
604                                              int clear_capture_count,
605                                              int clear_capture_from,
606                                              RegExpNode* on_success);
607   static ActionNode* EmptyMatchCheck(int start_register,
608                                      int repetition_register,
609                                      int repetition_limit,
610                                      RegExpNode* on_success);
611   virtual void Accept(NodeVisitor* visitor);
612   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
613   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)614   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
615                                     RegExpCompiler* compiler,
616                                     int filled_in,
617                                     bool not_at_start) {
618     return on_success()->GetQuickCheckDetails(
619         details, compiler, filled_in, not_at_start);
620   }
621   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
622                             BoyerMooreLookahead* bm, bool not_at_start);
action_type()623   ActionType action_type() { return action_type_; }
624   // TODO(erikcorry): We should allow some action nodes in greedy loops.
GreedyLoopTextLength()625   virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
626 
627  private:
628   union {
629     struct {
630       int reg;
631       int value;
632     } u_store_register;
633     struct {
634       int reg;
635     } u_increment_register;
636     struct {
637       int reg;
638       bool is_capture;
639     } u_position_register;
640     struct {
641       int stack_pointer_register;
642       int current_position_register;
643       int clear_register_count;
644       int clear_register_from;
645     } u_submatch;
646     struct {
647       int start_register;
648       int repetition_register;
649       int repetition_limit;
650     } u_empty_match_check;
651     struct {
652       int range_from;
653       int range_to;
654     } u_clear_captures;
655   } data_;
ActionNode(ActionType action_type,RegExpNode * on_success)656   ActionNode(ActionType action_type, RegExpNode* on_success)
657       : SeqRegExpNode(on_success),
658         action_type_(action_type) { }
659   ActionType action_type_;
660   friend class DotPrinter;
661 };
662 
663 
664 class TextNode: public SeqRegExpNode {
665  public:
TextNode(ZoneList<TextElement> * elms,bool read_backward,RegExpNode * on_success)666   TextNode(ZoneList<TextElement>* elms, bool read_backward,
667            RegExpNode* on_success)
668       : SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {}
TextNode(RegExpCharacterClass * that,bool read_backward,RegExpNode * on_success)669   TextNode(RegExpCharacterClass* that, bool read_backward,
670            RegExpNode* on_success)
671       : SeqRegExpNode(on_success),
672         elms_(new (zone()) ZoneList<TextElement>(1, zone())),
673         read_backward_(read_backward) {
674     elms_->Add(TextElement::CharClass(that), zone());
675   }
676   // Create TextNode for a single character class for the given ranges.
677   static TextNode* CreateForCharacterRanges(Zone* zone,
678                                             ZoneList<CharacterRange>* ranges,
679                                             bool read_backward,
680                                             RegExpNode* on_success);
681   // Create TextNode for a surrogate pair with a range given for the
682   // lead and the trail surrogate each.
683   static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
684                                           CharacterRange trail,
685                                           bool read_backward,
686                                           RegExpNode* on_success);
687   virtual void Accept(NodeVisitor* visitor);
688   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
689   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
690   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
691                                     RegExpCompiler* compiler,
692                                     int characters_filled_in,
693                                     bool not_at_start);
elements()694   ZoneList<TextElement>* elements() { return elms_; }
read_backward()695   bool read_backward() { return read_backward_; }
696   void MakeCaseIndependent(Isolate* isolate, bool is_one_byte);
697   virtual int GreedyLoopTextLength();
698   virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
699       RegExpCompiler* compiler);
700   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
701                             BoyerMooreLookahead* bm, bool not_at_start);
702   void CalculateOffsets();
703   virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
704 
705  private:
706   enum TextEmitPassType {
707     NON_LATIN1_MATCH,            // Check for characters that can't match.
708     SIMPLE_CHARACTER_MATCH,      // Case-dependent single character check.
709     NON_LETTER_CHARACTER_MATCH,  // Check characters that have no case equivs.
710     CASE_CHARACTER_MATCH,        // Case-independent single character check.
711     CHARACTER_CLASS_MATCH        // Character class.
712   };
713   static bool SkipPass(int pass, bool ignore_case);
714   static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
715   static const int kLastPass = CHARACTER_CLASS_MATCH;
716   void TextEmitPass(RegExpCompiler* compiler,
717                     TextEmitPassType pass,
718                     bool preloaded,
719                     Trace* trace,
720                     bool first_element_checked,
721                     int* checked_up_to);
722   int Length();
723   ZoneList<TextElement>* elms_;
724   bool read_backward_;
725 };
726 
727 
728 class AssertionNode: public SeqRegExpNode {
729  public:
730   enum AssertionType {
731     AT_END,
732     AT_START,
733     AT_BOUNDARY,
734     AT_NON_BOUNDARY,
735     AFTER_NEWLINE
736   };
AtEnd(RegExpNode * on_success)737   static AssertionNode* AtEnd(RegExpNode* on_success) {
738     return new(on_success->zone()) AssertionNode(AT_END, on_success);
739   }
AtStart(RegExpNode * on_success)740   static AssertionNode* AtStart(RegExpNode* on_success) {
741     return new(on_success->zone()) AssertionNode(AT_START, on_success);
742   }
AtBoundary(RegExpNode * on_success)743   static AssertionNode* AtBoundary(RegExpNode* on_success) {
744     return new(on_success->zone()) AssertionNode(AT_BOUNDARY, on_success);
745   }
AtNonBoundary(RegExpNode * on_success)746   static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
747     return new(on_success->zone()) AssertionNode(AT_NON_BOUNDARY, on_success);
748   }
AfterNewline(RegExpNode * on_success)749   static AssertionNode* AfterNewline(RegExpNode* on_success) {
750     return new(on_success->zone()) AssertionNode(AFTER_NEWLINE, on_success);
751   }
752   virtual void Accept(NodeVisitor* visitor);
753   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
754   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
755   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
756                                     RegExpCompiler* compiler,
757                                     int filled_in,
758                                     bool not_at_start);
759   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
760                             BoyerMooreLookahead* bm, bool not_at_start);
assertion_type()761   AssertionType assertion_type() { return assertion_type_; }
762 
763  private:
764   void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
765   enum IfPrevious { kIsNonWord, kIsWord };
766   void BacktrackIfPrevious(RegExpCompiler* compiler,
767                            Trace* trace,
768                            IfPrevious backtrack_if_previous);
AssertionNode(AssertionType t,RegExpNode * on_success)769   AssertionNode(AssertionType t, RegExpNode* on_success)
770       : SeqRegExpNode(on_success), assertion_type_(t) { }
771   AssertionType assertion_type_;
772 };
773 
774 
775 class BackReferenceNode: public SeqRegExpNode {
776  public:
BackReferenceNode(int start_reg,int end_reg,bool read_backward,RegExpNode * on_success)777   BackReferenceNode(int start_reg, int end_reg, bool read_backward,
778                     RegExpNode* on_success)
779       : SeqRegExpNode(on_success),
780         start_reg_(start_reg),
781         end_reg_(end_reg),
782         read_backward_(read_backward) {}
783   virtual void Accept(NodeVisitor* visitor);
start_register()784   int start_register() { return start_reg_; }
end_register()785   int end_register() { return end_reg_; }
read_backward()786   bool read_backward() { return read_backward_; }
787   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
788   virtual int EatsAtLeast(int still_to_find,
789                           int recursion_depth,
790                           bool not_at_start);
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)791   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
792                                     RegExpCompiler* compiler,
793                                     int characters_filled_in,
794                                     bool not_at_start) {
795     return;
796   }
797   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
798                             BoyerMooreLookahead* bm, bool not_at_start);
799 
800  private:
801   int start_reg_;
802   int end_reg_;
803   bool read_backward_;
804 };
805 
806 
807 class EndNode: public RegExpNode {
808  public:
809   enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
EndNode(Action action,Zone * zone)810   EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {}
811   virtual void Accept(NodeVisitor* visitor);
812   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
EatsAtLeast(int still_to_find,int recursion_depth,bool not_at_start)813   virtual int EatsAtLeast(int still_to_find,
814                           int recursion_depth,
815                           bool not_at_start) { return 0; }
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)816   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
817                                     RegExpCompiler* compiler,
818                                     int characters_filled_in,
819                                     bool not_at_start) {
820     // Returning 0 from EatsAtLeast should ensure we never get here.
821     UNREACHABLE();
822   }
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)823   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
824                             BoyerMooreLookahead* bm, bool not_at_start) {
825     // Returning 0 from EatsAtLeast should ensure we never get here.
826     UNREACHABLE();
827   }
828 
829  private:
830   Action action_;
831 };
832 
833 
834 class NegativeSubmatchSuccess: public EndNode {
835  public:
NegativeSubmatchSuccess(int stack_pointer_reg,int position_reg,int clear_capture_count,int clear_capture_start,Zone * zone)836   NegativeSubmatchSuccess(int stack_pointer_reg,
837                           int position_reg,
838                           int clear_capture_count,
839                           int clear_capture_start,
840                           Zone* zone)
841       : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone),
842         stack_pointer_register_(stack_pointer_reg),
843         current_position_register_(position_reg),
844         clear_capture_count_(clear_capture_count),
845         clear_capture_start_(clear_capture_start) { }
846   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
847 
848  private:
849   int stack_pointer_register_;
850   int current_position_register_;
851   int clear_capture_count_;
852   int clear_capture_start_;
853 };
854 
855 
856 class Guard: public ZoneObject {
857  public:
858   enum Relation { LT, GEQ };
Guard(int reg,Relation op,int value)859   Guard(int reg, Relation op, int value)
860       : reg_(reg),
861         op_(op),
862         value_(value) { }
reg()863   int reg() { return reg_; }
op()864   Relation op() { return op_; }
value()865   int value() { return value_; }
866 
867  private:
868   int reg_;
869   Relation op_;
870   int value_;
871 };
872 
873 
874 class GuardedAlternative {
875  public:
GuardedAlternative(RegExpNode * node)876   explicit GuardedAlternative(RegExpNode* node) : node_(node), guards_(NULL) { }
877   void AddGuard(Guard* guard, Zone* zone);
node()878   RegExpNode* node() { return node_; }
set_node(RegExpNode * node)879   void set_node(RegExpNode* node) { node_ = node; }
guards()880   ZoneList<Guard*>* guards() { return guards_; }
881 
882  private:
883   RegExpNode* node_;
884   ZoneList<Guard*>* guards_;
885 };
886 
887 
888 class AlternativeGeneration;
889 
890 
891 class ChoiceNode: public RegExpNode {
892  public:
ChoiceNode(int expected_size,Zone * zone)893   explicit ChoiceNode(int expected_size, Zone* zone)
894       : RegExpNode(zone),
895         alternatives_(new(zone)
896                       ZoneList<GuardedAlternative>(expected_size, zone)),
897         table_(NULL),
898         not_at_start_(false),
899         being_calculated_(false) { }
900   virtual void Accept(NodeVisitor* visitor);
AddAlternative(GuardedAlternative node)901   void AddAlternative(GuardedAlternative node) {
902     alternatives()->Add(node, zone());
903   }
alternatives()904   ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
905   DispatchTable* GetTable(bool ignore_case);
906   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
907   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
908   int EatsAtLeastHelper(int still_to_find,
909                         int budget,
910                         RegExpNode* ignore_this_node,
911                         bool not_at_start);
912   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
913                                     RegExpCompiler* compiler,
914                                     int characters_filled_in,
915                                     bool not_at_start);
916   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
917                             BoyerMooreLookahead* bm, bool not_at_start);
918 
being_calculated()919   bool being_calculated() { return being_calculated_; }
not_at_start()920   bool not_at_start() { return not_at_start_; }
set_not_at_start()921   void set_not_at_start() { not_at_start_ = true; }
set_being_calculated(bool b)922   void set_being_calculated(bool b) { being_calculated_ = b; }
try_to_emit_quick_check_for_alternative(bool is_first)923   virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
924     return true;
925   }
926   virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
read_backward()927   virtual bool read_backward() { return false; }
928 
929  protected:
930   int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
931   ZoneList<GuardedAlternative>* alternatives_;
932 
933  private:
934   friend class DispatchTableConstructor;
935   friend class Analysis;
936   void GenerateGuard(RegExpMacroAssembler* macro_assembler,
937                      Guard* guard,
938                      Trace* trace);
939   int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
940   void EmitOutOfLineContinuation(RegExpCompiler* compiler,
941                                  Trace* trace,
942                                  GuardedAlternative alternative,
943                                  AlternativeGeneration* alt_gen,
944                                  int preload_characters,
945                                  bool next_expects_preload);
946   void SetUpPreLoad(RegExpCompiler* compiler,
947                     Trace* current_trace,
948                     PreloadState* preloads);
949   void AssertGuardsMentionRegisters(Trace* trace);
950   int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace);
951   Trace* EmitGreedyLoop(RegExpCompiler* compiler,
952                         Trace* trace,
953                         AlternativeGenerationList* alt_gens,
954                         PreloadState* preloads,
955                         GreedyLoopState* greedy_loop_state,
956                         int text_length);
957   void EmitChoices(RegExpCompiler* compiler,
958                    AlternativeGenerationList* alt_gens,
959                    int first_choice,
960                    Trace* trace,
961                    PreloadState* preloads);
962   DispatchTable* table_;
963   // If true, this node is never checked at the start of the input.
964   // Allows a new trace to start with at_start() set to false.
965   bool not_at_start_;
966   bool being_calculated_;
967 };
968 
969 
970 class NegativeLookaroundChoiceNode : public ChoiceNode {
971  public:
NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,GuardedAlternative then_do_this,Zone * zone)972   explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,
973                                         GuardedAlternative then_do_this,
974                                         Zone* zone)
975       : ChoiceNode(2, zone) {
976     AddAlternative(this_must_fail);
977     AddAlternative(then_do_this);
978   }
979   virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
980   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
981                                     RegExpCompiler* compiler,
982                                     int characters_filled_in,
983                                     bool not_at_start);
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)984   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
985                             BoyerMooreLookahead* bm, bool not_at_start) {
986     alternatives_->at(1).node()->FillInBMInfo(isolate, offset, budget - 1, bm,
987                                               not_at_start);
988     if (offset == 0) set_bm_info(not_at_start, bm);
989   }
990   // For a negative lookahead we don't emit the quick check for the
991   // alternative that is expected to fail.  This is because quick check code
992   // starts by loading enough characters for the alternative that takes fewest
993   // characters, but on a negative lookahead the negative branch did not take
994   // part in that calculation (EatsAtLeast) so the assumptions don't hold.
try_to_emit_quick_check_for_alternative(bool is_first)995   virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
996     return !is_first;
997   }
998   virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
999 };
1000 
1001 
1002 class LoopChoiceNode: public ChoiceNode {
1003  public:
LoopChoiceNode(bool body_can_be_zero_length,bool read_backward,Zone * zone)1004   LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, Zone* zone)
1005       : ChoiceNode(2, zone),
1006         loop_node_(NULL),
1007         continue_node_(NULL),
1008         body_can_be_zero_length_(body_can_be_zero_length),
1009         read_backward_(read_backward) {}
1010   void AddLoopAlternative(GuardedAlternative alt);
1011   void AddContinueAlternative(GuardedAlternative alt);
1012   virtual void Emit(RegExpCompiler* compiler, Trace* trace);
1013   virtual int EatsAtLeast(int still_to_find,  int budget, bool not_at_start);
1014   virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1015                                     RegExpCompiler* compiler,
1016                                     int characters_filled_in,
1017                                     bool not_at_start);
1018   virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
1019                             BoyerMooreLookahead* bm, bool not_at_start);
loop_node()1020   RegExpNode* loop_node() { return loop_node_; }
continue_node()1021   RegExpNode* continue_node() { return continue_node_; }
body_can_be_zero_length()1022   bool body_can_be_zero_length() { return body_can_be_zero_length_; }
read_backward()1023   virtual bool read_backward() { return read_backward_; }
1024   virtual void Accept(NodeVisitor* visitor);
1025   virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
1026 
1027  private:
1028   // AddAlternative is made private for loop nodes because alternatives
1029   // should not be added freely, we need to keep track of which node
1030   // goes back to the node itself.
AddAlternative(GuardedAlternative node)1031   void AddAlternative(GuardedAlternative node) {
1032     ChoiceNode::AddAlternative(node);
1033   }
1034 
1035   RegExpNode* loop_node_;
1036   RegExpNode* continue_node_;
1037   bool body_can_be_zero_length_;
1038   bool read_backward_;
1039 };
1040 
1041 
1042 // Improve the speed that we scan for an initial point where a non-anchored
1043 // regexp can match by using a Boyer-Moore-like table. This is done by
1044 // identifying non-greedy non-capturing loops in the nodes that eat any
1045 // character one at a time.  For example in the middle of the regexp
1046 // /foo[\s\S]*?bar/ we find such a loop.  There is also such a loop implicitly
1047 // inserted at the start of any non-anchored regexp.
1048 //
1049 // When we have found such a loop we look ahead in the nodes to find the set of
1050 // characters that can come at given distances. For example for the regexp
1051 // /.?foo/ we know that there are at least 3 characters ahead of us, and the
1052 // sets of characters that can occur are [any, [f, o], [o]]. We find a range in
1053 // the lookahead info where the set of characters is reasonably constrained. In
1054 // our example this is from index 1 to 2 (0 is not constrained). We can now
1055 // look 3 characters ahead and if we don't find one of [f, o] (the union of
1056 // [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
1057 //
1058 // For Unicode input strings we do the same, but modulo 128.
1059 //
1060 // We also look at the first string fed to the regexp and use that to get a hint
1061 // of the character frequencies in the inputs. This affects the assessment of
1062 // whether the set of characters is 'reasonably constrained'.
1063 //
1064 // We also have another lookahead mechanism (called quick check in the code),
1065 // which uses a wide load of multiple characters followed by a mask and compare
1066 // to determine whether a match is possible at this point.
1067 enum ContainedInLattice {
1068   kNotYet = 0,
1069   kLatticeIn = 1,
1070   kLatticeOut = 2,
1071   kLatticeUnknown = 3  // Can also mean both in and out.
1072 };
1073 
1074 
Combine(ContainedInLattice a,ContainedInLattice b)1075 inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
1076   return static_cast<ContainedInLattice>(a | b);
1077 }
1078 
1079 
1080 ContainedInLattice AddRange(ContainedInLattice a,
1081                             const int* ranges,
1082                             int ranges_size,
1083                             Interval new_range);
1084 
1085 
1086 class BoyerMoorePositionInfo : public ZoneObject {
1087  public:
BoyerMoorePositionInfo(Zone * zone)1088   explicit BoyerMoorePositionInfo(Zone* zone)
1089       : map_(new(zone) ZoneList<bool>(kMapSize, zone)),
1090         map_count_(0),
1091         w_(kNotYet),
1092         s_(kNotYet),
1093         d_(kNotYet),
1094         surrogate_(kNotYet) {
1095      for (int i = 0; i < kMapSize; i++) {
1096        map_->Add(false, zone);
1097      }
1098   }
1099 
at(int i)1100   bool& at(int i) { return map_->at(i); }
1101 
1102   static const int kMapSize = 128;
1103   static const int kMask = kMapSize - 1;
1104 
map_count()1105   int map_count() const { return map_count_; }
1106 
1107   void Set(int character);
1108   void SetInterval(const Interval& interval);
1109   void SetAll();
is_non_word()1110   bool is_non_word() { return w_ == kLatticeOut; }
is_word()1111   bool is_word() { return w_ == kLatticeIn; }
1112 
1113  private:
1114   ZoneList<bool>* map_;
1115   int map_count_;  // Number of set bits in the map.
1116   ContainedInLattice w_;  // The \w character class.
1117   ContainedInLattice s_;  // The \s character class.
1118   ContainedInLattice d_;  // The \d character class.
1119   ContainedInLattice surrogate_;  // Surrogate UTF-16 code units.
1120 };
1121 
1122 
1123 class BoyerMooreLookahead : public ZoneObject {
1124  public:
1125   BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
1126 
length()1127   int length() { return length_; }
max_char()1128   int max_char() { return max_char_; }
compiler()1129   RegExpCompiler* compiler() { return compiler_; }
1130 
Count(int map_number)1131   int Count(int map_number) {
1132     return bitmaps_->at(map_number)->map_count();
1133   }
1134 
at(int i)1135   BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
1136 
Set(int map_number,int character)1137   void Set(int map_number, int character) {
1138     if (character > max_char_) return;
1139     BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1140     info->Set(character);
1141   }
1142 
SetInterval(int map_number,const Interval & interval)1143   void SetInterval(int map_number, const Interval& interval) {
1144     if (interval.from() > max_char_) return;
1145     BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1146     if (interval.to() > max_char_) {
1147       info->SetInterval(Interval(interval.from(), max_char_));
1148     } else {
1149       info->SetInterval(interval);
1150     }
1151   }
1152 
SetAll(int map_number)1153   void SetAll(int map_number) {
1154     bitmaps_->at(map_number)->SetAll();
1155   }
1156 
SetRest(int from_map)1157   void SetRest(int from_map) {
1158     for (int i = from_map; i < length_; i++) SetAll(i);
1159   }
1160   void EmitSkipInstructions(RegExpMacroAssembler* masm);
1161 
1162  private:
1163   // This is the value obtained by EatsAtLeast.  If we do not have at least this
1164   // many characters left in the sample string then the match is bound to fail.
1165   // Therefore it is OK to read a character this far ahead of the current match
1166   // point.
1167   int length_;
1168   RegExpCompiler* compiler_;
1169   // 0xff for Latin1, 0xffff for UTF-16.
1170   int max_char_;
1171   ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
1172 
1173   int GetSkipTable(int min_lookahead,
1174                    int max_lookahead,
1175                    Handle<ByteArray> boolean_skip_table);
1176   bool FindWorthwhileInterval(int* from, int* to);
1177   int FindBestInterval(
1178     int max_number_of_chars, int old_biggest_points, int* from, int* to);
1179 };
1180 
1181 
1182 // There are many ways to generate code for a node.  This class encapsulates
1183 // the current way we should be generating.  In other words it encapsulates
1184 // the current state of the code generator.  The effect of this is that we
1185 // generate code for paths that the matcher can take through the regular
1186 // expression.  A given node in the regexp can be code-generated several times
1187 // as it can be part of several traces.  For example for the regexp:
1188 // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
1189 // of the foo-bar-baz trace and once as part of the foo-ip-baz trace.  The code
1190 // to match foo is generated only once (the traces have a common prefix).  The
1191 // code to store the capture is deferred and generated (twice) after the places
1192 // where baz has been matched.
1193 class Trace {
1194  public:
1195   // A value for a property that is either known to be true, know to be false,
1196   // or not known.
1197   enum TriBool {
1198     UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1
1199   };
1200 
1201   class DeferredAction {
1202    public:
DeferredAction(ActionNode::ActionType action_type,int reg)1203     DeferredAction(ActionNode::ActionType action_type, int reg)
1204         : action_type_(action_type), reg_(reg), next_(NULL) { }
next()1205     DeferredAction* next() { return next_; }
1206     bool Mentions(int reg);
reg()1207     int reg() { return reg_; }
action_type()1208     ActionNode::ActionType action_type() { return action_type_; }
1209    private:
1210     ActionNode::ActionType action_type_;
1211     int reg_;
1212     DeferredAction* next_;
1213     friend class Trace;
1214   };
1215 
1216   class DeferredCapture : public DeferredAction {
1217    public:
DeferredCapture(int reg,bool is_capture,Trace * trace)1218     DeferredCapture(int reg, bool is_capture, Trace* trace)
1219         : DeferredAction(ActionNode::STORE_POSITION, reg),
1220           cp_offset_(trace->cp_offset()),
1221           is_capture_(is_capture) { }
cp_offset()1222     int cp_offset() { return cp_offset_; }
is_capture()1223     bool is_capture() { return is_capture_; }
1224    private:
1225     int cp_offset_;
1226     bool is_capture_;
set_cp_offset(int cp_offset)1227     void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
1228   };
1229 
1230   class DeferredSetRegister : public DeferredAction {
1231    public:
DeferredSetRegister(int reg,int value)1232     DeferredSetRegister(int reg, int value)
1233         : DeferredAction(ActionNode::SET_REGISTER, reg),
1234           value_(value) { }
value()1235     int value() { return value_; }
1236    private:
1237     int value_;
1238   };
1239 
1240   class DeferredClearCaptures : public DeferredAction {
1241    public:
DeferredClearCaptures(Interval range)1242     explicit DeferredClearCaptures(Interval range)
1243         : DeferredAction(ActionNode::CLEAR_CAPTURES, -1),
1244           range_(range) { }
range()1245     Interval range() { return range_; }
1246    private:
1247     Interval range_;
1248   };
1249 
1250   class DeferredIncrementRegister : public DeferredAction {
1251    public:
DeferredIncrementRegister(int reg)1252     explicit DeferredIncrementRegister(int reg)
1253         : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) { }
1254   };
1255 
Trace()1256   Trace()
1257       : cp_offset_(0),
1258         actions_(NULL),
1259         backtrack_(NULL),
1260         stop_node_(NULL),
1261         loop_label_(NULL),
1262         characters_preloaded_(0),
1263         bound_checked_up_to_(0),
1264         flush_budget_(100),
1265         at_start_(UNKNOWN) { }
1266 
1267   // End the trace.  This involves flushing the deferred actions in the trace
1268   // and pushing a backtrack location onto the backtrack stack.  Once this is
1269   // done we can start a new trace or go to one that has already been
1270   // generated.
1271   void Flush(RegExpCompiler* compiler, RegExpNode* successor);
cp_offset()1272   int cp_offset() { return cp_offset_; }
actions()1273   DeferredAction* actions() { return actions_; }
1274   // A trivial trace is one that has no deferred actions or other state that
1275   // affects the assumptions used when generating code.  There is no recorded
1276   // backtrack location in a trivial trace, so with a trivial trace we will
1277   // generate code that, on a failure to match, gets the backtrack location
1278   // from the backtrack stack rather than using a direct jump instruction.  We
1279   // always start code generation with a trivial trace and non-trivial traces
1280   // are created as we emit code for nodes or add to the list of deferred
1281   // actions in the trace.  The location of the code generated for a node using
1282   // a trivial trace is recorded in a label in the node so that gotos can be
1283   // generated to that code.
is_trivial()1284   bool is_trivial() {
1285     return backtrack_ == NULL &&
1286            actions_ == NULL &&
1287            cp_offset_ == 0 &&
1288            characters_preloaded_ == 0 &&
1289            bound_checked_up_to_ == 0 &&
1290            quick_check_performed_.characters() == 0 &&
1291            at_start_ == UNKNOWN;
1292   }
at_start()1293   TriBool at_start() { return at_start_; }
set_at_start(TriBool at_start)1294   void set_at_start(TriBool at_start) { at_start_ = at_start; }
backtrack()1295   Label* backtrack() { return backtrack_; }
loop_label()1296   Label* loop_label() { return loop_label_; }
stop_node()1297   RegExpNode* stop_node() { return stop_node_; }
characters_preloaded()1298   int characters_preloaded() { return characters_preloaded_; }
bound_checked_up_to()1299   int bound_checked_up_to() { return bound_checked_up_to_; }
flush_budget()1300   int flush_budget() { return flush_budget_; }
quick_check_performed()1301   QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
1302   bool mentions_reg(int reg);
1303   // Returns true if a deferred position store exists to the specified
1304   // register and stores the offset in the out-parameter.  Otherwise
1305   // returns false.
1306   bool GetStoredPosition(int reg, int* cp_offset);
1307   // These set methods and AdvanceCurrentPositionInTrace should be used only on
1308   // new traces - the intention is that traces are immutable after creation.
add_action(DeferredAction * new_action)1309   void add_action(DeferredAction* new_action) {
1310     DCHECK(new_action->next_ == NULL);
1311     new_action->next_ = actions_;
1312     actions_ = new_action;
1313   }
set_backtrack(Label * backtrack)1314   void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
set_stop_node(RegExpNode * node)1315   void set_stop_node(RegExpNode* node) { stop_node_ = node; }
set_loop_label(Label * label)1316   void set_loop_label(Label* label) { loop_label_ = label; }
set_characters_preloaded(int count)1317   void set_characters_preloaded(int count) { characters_preloaded_ = count; }
set_bound_checked_up_to(int to)1318   void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
set_flush_budget(int to)1319   void set_flush_budget(int to) { flush_budget_ = to; }
set_quick_check_performed(QuickCheckDetails * d)1320   void set_quick_check_performed(QuickCheckDetails* d) {
1321     quick_check_performed_ = *d;
1322   }
1323   void InvalidateCurrentCharacter();
1324   void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
1325 
1326  private:
1327   int FindAffectedRegisters(OutSet* affected_registers, Zone* zone);
1328   void PerformDeferredActions(RegExpMacroAssembler* macro,
1329                               int max_register,
1330                               const OutSet& affected_registers,
1331                               OutSet* registers_to_pop,
1332                               OutSet* registers_to_clear,
1333                               Zone* zone);
1334   void RestoreAffectedRegisters(RegExpMacroAssembler* macro,
1335                                 int max_register,
1336                                 const OutSet& registers_to_pop,
1337                                 const OutSet& registers_to_clear);
1338   int cp_offset_;
1339   DeferredAction* actions_;
1340   Label* backtrack_;
1341   RegExpNode* stop_node_;
1342   Label* loop_label_;
1343   int characters_preloaded_;
1344   int bound_checked_up_to_;
1345   QuickCheckDetails quick_check_performed_;
1346   int flush_budget_;
1347   TriBool at_start_;
1348 };
1349 
1350 
1351 class GreedyLoopState {
1352  public:
1353   explicit GreedyLoopState(bool not_at_start);
1354 
label()1355   Label* label() { return &label_; }
counter_backtrack_trace()1356   Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }
1357 
1358  private:
1359   Label label_;
1360   Trace counter_backtrack_trace_;
1361 };
1362 
1363 
1364 struct PreloadState {
1365   static const int kEatsAtLeastNotYetInitialized = -1;
1366   bool preload_is_current_;
1367   bool preload_has_checked_bounds_;
1368   int preload_characters_;
1369   int eats_at_least_;
initPreloadState1370   void init() {
1371     eats_at_least_ = kEatsAtLeastNotYetInitialized;
1372   }
1373 };
1374 
1375 
1376 class NodeVisitor {
1377  public:
~NodeVisitor()1378   virtual ~NodeVisitor() { }
1379 #define DECLARE_VISIT(Type)                                          \
1380   virtual void Visit##Type(Type##Node* that) = 0;
FOR_EACH_NODE_TYPE(DECLARE_VISIT)1381 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1382 #undef DECLARE_VISIT
1383   virtual void VisitLoopChoice(LoopChoiceNode* that) { VisitChoice(that); }
1384 };
1385 
1386 
1387 // Node visitor used to add the start set of the alternatives to the
1388 // dispatch table of a choice node.
1389 class DispatchTableConstructor: public NodeVisitor {
1390  public:
DispatchTableConstructor(DispatchTable * table,bool ignore_case,Zone * zone)1391   DispatchTableConstructor(DispatchTable* table, bool ignore_case,
1392                            Zone* zone)
1393       : table_(table),
1394         choice_index_(-1),
1395         ignore_case_(ignore_case),
1396         zone_(zone) { }
1397 
1398   void BuildTable(ChoiceNode* node);
1399 
AddRange(CharacterRange range)1400   void AddRange(CharacterRange range) {
1401     table()->AddRange(range, choice_index_, zone_);
1402   }
1403 
1404   void AddInverse(ZoneList<CharacterRange>* ranges);
1405 
1406 #define DECLARE_VISIT(Type)                                          \
1407   virtual void Visit##Type(Type##Node* that);
FOR_EACH_NODE_TYPE(DECLARE_VISIT)1408 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1409 #undef DECLARE_VISIT
1410 
1411   DispatchTable* table() { return table_; }
set_choice_index(int value)1412   void set_choice_index(int value) { choice_index_ = value; }
1413 
1414  protected:
1415   DispatchTable* table_;
1416   int choice_index_;
1417   bool ignore_case_;
1418   Zone* zone_;
1419 };
1420 
1421 
1422 // Assertion propagation moves information about assertions such as
1423 // \b to the affected nodes.  For instance, in /.\b./ information must
1424 // be propagated to the first '.' that whatever follows needs to know
1425 // if it matched a word or a non-word, and to the second '.' that it
1426 // has to check if it succeeds a word or non-word.  In this case the
1427 // result will be something like:
1428 //
1429 //   +-------+        +------------+
1430 //   |   .   |        |      .     |
1431 //   +-------+  --->  +------------+
1432 //   | word? |        | check word |
1433 //   +-------+        +------------+
1434 class Analysis: public NodeVisitor {
1435  public:
Analysis(Isolate * isolate,JSRegExp::Flags flags,bool is_one_byte)1436   Analysis(Isolate* isolate, JSRegExp::Flags flags, bool is_one_byte)
1437       : isolate_(isolate),
1438         flags_(flags),
1439         is_one_byte_(is_one_byte),
1440         error_message_(NULL) {}
1441   void EnsureAnalyzed(RegExpNode* node);
1442 
1443 #define DECLARE_VISIT(Type)                                          \
1444   virtual void Visit##Type(Type##Node* that);
1445 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1446 #undef DECLARE_VISIT
1447   virtual void VisitLoopChoice(LoopChoiceNode* that);
1448 
has_failed()1449   bool has_failed() { return error_message_ != NULL; }
error_message()1450   const char* error_message() {
1451     DCHECK(error_message_ != NULL);
1452     return error_message_;
1453   }
fail(const char * error_message)1454   void fail(const char* error_message) {
1455     error_message_ = error_message;
1456   }
1457 
isolate()1458   Isolate* isolate() const { return isolate_; }
1459 
ignore_case()1460   bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
unicode()1461   bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
1462 
1463  private:
1464   Isolate* isolate_;
1465   JSRegExp::Flags flags_;
1466   bool is_one_byte_;
1467   const char* error_message_;
1468 
1469   DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
1470 };
1471 
1472 
1473 struct RegExpCompileData {
RegExpCompileDataRegExpCompileData1474   RegExpCompileData()
1475     : tree(NULL),
1476       node(NULL),
1477       simple(true),
1478       contains_anchor(false),
1479       capture_count(0) { }
1480   RegExpTree* tree;
1481   RegExpNode* node;
1482   bool simple;
1483   bool contains_anchor;
1484   Handle<FixedArray> capture_name_map;
1485   Handle<String> error;
1486   int capture_count;
1487 };
1488 
1489 
1490 class RegExpEngine: public AllStatic {
1491  public:
1492   struct CompilationResult {
CompilationResultCompilationResult1493     CompilationResult(Isolate* isolate, const char* error_message)
1494         : error_message(error_message),
1495           code(isolate->heap()->the_hole_value()),
1496           num_registers(0) {}
CompilationResultCompilationResult1497     CompilationResult(Object* code, int registers)
1498         : error_message(NULL), code(code), num_registers(registers) {}
1499     const char* error_message;
1500     Object* code;
1501     int num_registers;
1502   };
1503 
1504   static CompilationResult Compile(Isolate* isolate, Zone* zone,
1505                                    RegExpCompileData* input,
1506                                    JSRegExp::Flags flags,
1507                                    Handle<String> pattern,
1508                                    Handle<String> sample_subject,
1509                                    bool is_one_byte);
1510 
1511   static bool TooMuchRegExpCode(Handle<String> pattern);
1512 
1513   static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);
1514 };
1515 
1516 
1517 class RegExpResultsCache : public AllStatic {
1518  public:
1519   enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS };
1520 
1521   // Attempt to retrieve a cached result.  On failure, 0 is returned as a Smi.
1522   // On success, the returned result is guaranteed to be a COW-array.
1523   static Object* Lookup(Heap* heap, String* key_string, Object* key_pattern,
1524                         FixedArray** last_match_out, ResultsCacheType type);
1525   // Attempt to add value_array to the cache specified by type.  On success,
1526   // value_array is turned into a COW-array.
1527   static void Enter(Isolate* isolate, Handle<String> key_string,
1528                     Handle<Object> key_pattern, Handle<FixedArray> value_array,
1529                     Handle<FixedArray> last_match_cache, ResultsCacheType type);
1530   static void Clear(FixedArray* cache);
1531   static const int kRegExpResultsCacheSize = 0x100;
1532 
1533  private:
1534   static const int kArrayEntriesPerCacheEntry = 4;
1535   static const int kStringOffset = 0;
1536   static const int kPatternOffset = 1;
1537   static const int kArrayOffset = 2;
1538   static const int kLastMatchOffset = 3;
1539 };
1540 
1541 }  // namespace internal
1542 }  // namespace v8
1543 
1544 #endif  // V8_REGEXP_JSREGEXP_H_
1545