1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef V8_REGEXP_JSREGEXP_H_
6 #define V8_REGEXP_JSREGEXP_H_
7
8 #include "src/allocation.h"
9 #include "src/assembler.h"
10 #include "src/regexp/regexp-ast.h"
11 #include "src/regexp/regexp-macro-assembler.h"
12
13 namespace v8 {
14 namespace internal {
15
16 class NodeVisitor;
17 class RegExpCompiler;
18 class RegExpMacroAssembler;
19 class RegExpNode;
20 class RegExpTree;
21 class BoyerMooreLookahead;
22
23 class RegExpImpl {
24 public:
25 // Whether V8 is compiled with native regexp support or not.
UsesNativeRegExp()26 static bool UsesNativeRegExp() {
27 #ifdef V8_INTERPRETED_REGEXP
28 return false;
29 #else
30 return true;
31 #endif
32 }
33
34 // Returns a string representation of a regular expression.
35 // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
36 // This function calls the garbage collector if necessary.
37 static Handle<String> ToString(Handle<Object> value);
38
39 // Parses the RegExp pattern and prepares the JSRegExp object with
40 // generic data and choice of implementation - as well as what
41 // the implementation wants to store in the data field.
42 // Returns false if compilation fails.
43 MUST_USE_RESULT static MaybeHandle<Object> Compile(Handle<JSRegExp> re,
44 Handle<String> pattern,
45 JSRegExp::Flags flags);
46
47 // See ECMA-262 section 15.10.6.2.
48 // This function calls the garbage collector if necessary.
49 V8_EXPORT_PRIVATE MUST_USE_RESULT static MaybeHandle<Object> Exec(
50 Handle<JSRegExp> regexp, Handle<String> subject, int index,
51 Handle<RegExpMatchInfo> last_match_info);
52
53 // Prepares a JSRegExp object with Irregexp-specific data.
54 static void IrregexpInitialize(Handle<JSRegExp> re,
55 Handle<String> pattern,
56 JSRegExp::Flags flags,
57 int capture_register_count);
58
59
60 static void AtomCompile(Handle<JSRegExp> re,
61 Handle<String> pattern,
62 JSRegExp::Flags flags,
63 Handle<String> match_pattern);
64
65
66 static int AtomExecRaw(Handle<JSRegExp> regexp,
67 Handle<String> subject,
68 int index,
69 int32_t* output,
70 int output_size);
71
72 static Handle<Object> AtomExec(Handle<JSRegExp> regexp,
73 Handle<String> subject, int index,
74 Handle<RegExpMatchInfo> last_match_info);
75
76 enum IrregexpResult { RE_FAILURE = 0, RE_SUCCESS = 1, RE_EXCEPTION = -1 };
77
78 // Prepare a RegExp for being executed one or more times (using
79 // IrregexpExecOnce) on the subject.
80 // This ensures that the regexp is compiled for the subject, and that
81 // the subject is flat.
82 // Returns the number of integer spaces required by IrregexpExecOnce
83 // as its "registers" argument. If the regexp cannot be compiled,
84 // an exception is set as pending, and this function returns negative.
85 static int IrregexpPrepare(Handle<JSRegExp> regexp,
86 Handle<String> subject);
87
88 // Execute a regular expression on the subject, starting from index.
89 // If matching succeeds, return the number of matches. This can be larger
90 // than one in the case of global regular expressions.
91 // The captures and subcaptures are stored into the registers vector.
92 // If matching fails, returns RE_FAILURE.
93 // If execution fails, sets a pending exception and returns RE_EXCEPTION.
94 static int IrregexpExecRaw(Handle<JSRegExp> regexp,
95 Handle<String> subject,
96 int index,
97 int32_t* output,
98 int output_size);
99
100 // Execute an Irregexp bytecode pattern.
101 // On a successful match, the result is a JSArray containing
102 // captured positions. On a failure, the result is the null value.
103 // Returns an empty handle in case of an exception.
104 MUST_USE_RESULT static MaybeHandle<Object> IrregexpExec(
105 Handle<JSRegExp> regexp, Handle<String> subject, int index,
106 Handle<RegExpMatchInfo> last_match_info);
107
108 // Set last match info. If match is NULL, then setting captures is omitted.
109 static Handle<RegExpMatchInfo> SetLastMatchInfo(
110 Handle<RegExpMatchInfo> last_match_info, Handle<String> subject,
111 int capture_count, int32_t* match);
112
113 class GlobalCache {
114 public:
115 GlobalCache(Handle<JSRegExp> regexp,
116 Handle<String> subject,
117 Isolate* isolate);
118
119 INLINE(~GlobalCache());
120
121 // Fetch the next entry in the cache for global regexp match results.
122 // This does not set the last match info. Upon failure, NULL is returned.
123 // The cause can be checked with Result(). The previous
124 // result is still in available in memory when a failure happens.
125 INLINE(int32_t* FetchNext());
126
127 INLINE(int32_t* LastSuccessfulMatch());
128
INLINE(bool HasException ())129 INLINE(bool HasException()) { return num_matches_ < 0; }
130
131 private:
132 int AdvanceZeroLength(int last_index);
133
134 int num_matches_;
135 int max_matches_;
136 int current_match_index_;
137 int registers_per_match_;
138 // Pointer to the last set of captures.
139 int32_t* register_array_;
140 int register_array_size_;
141 Handle<JSRegExp> regexp_;
142 Handle<String> subject_;
143 };
144
145 // For acting on the JSRegExp data FixedArray.
146 static int IrregexpMaxRegisterCount(FixedArray* re);
147 static void SetIrregexpMaxRegisterCount(FixedArray* re, int value);
148 static void SetIrregexpCaptureNameMap(FixedArray* re,
149 Handle<FixedArray> value);
150 static int IrregexpNumberOfCaptures(FixedArray* re);
151 static int IrregexpNumberOfRegisters(FixedArray* re);
152 static ByteArray* IrregexpByteCode(FixedArray* re, bool is_one_byte);
153 static Code* IrregexpNativeCode(FixedArray* re, bool is_one_byte);
154
155 // Limit the space regexps take up on the heap. In order to limit this we
156 // would like to keep track of the amount of regexp code on the heap. This
157 // is not tracked, however. As a conservative approximation we track the
158 // total regexp code compiled including code that has subsequently been freed
159 // and the total executable memory at any point.
160 static const size_t kRegExpExecutableMemoryLimit = 16 * MB;
161 static const int kRegExpCompiledLimit = 1 * MB;
162 static const int kRegExpTooLargeToOptimize = 20 * KB;
163
164 private:
165 static bool CompileIrregexp(Handle<JSRegExp> re,
166 Handle<String> sample_subject, bool is_one_byte);
167 static inline bool EnsureCompiledIrregexp(Handle<JSRegExp> re,
168 Handle<String> sample_subject,
169 bool is_one_byte);
170 };
171
172
173 // Represents the location of one element relative to the intersection of
174 // two sets. Corresponds to the four areas of a Venn diagram.
175 enum ElementInSetsRelation {
176 kInsideNone = 0,
177 kInsideFirst = 1,
178 kInsideSecond = 2,
179 kInsideBoth = 3
180 };
181
182
183 // A set of unsigned integers that behaves especially well on small
184 // integers (< 32). May do zone-allocation.
185 class OutSet: public ZoneObject {
186 public:
OutSet()187 OutSet() : first_(0), remaining_(NULL), successors_(NULL) { }
188 OutSet* Extend(unsigned value, Zone* zone);
189 bool Get(unsigned value) const;
190 static const unsigned kFirstLimit = 32;
191
192 private:
193 // Destructively set a value in this set. In most cases you want
194 // to use Extend instead to ensure that only one instance exists
195 // that contains the same values.
196 void Set(unsigned value, Zone* zone);
197
198 // The successors are a list of sets that contain the same values
199 // as this set and the one more value that is not present in this
200 // set.
successors(Zone * zone)201 ZoneList<OutSet*>* successors(Zone* zone) { return successors_; }
202
OutSet(uint32_t first,ZoneList<unsigned> * remaining)203 OutSet(uint32_t first, ZoneList<unsigned>* remaining)
204 : first_(first), remaining_(remaining), successors_(NULL) { }
205 uint32_t first_;
206 ZoneList<unsigned>* remaining_;
207 ZoneList<OutSet*>* successors_;
208 friend class Trace;
209 };
210
211
212 // A mapping from integers, specified as ranges, to a set of integers.
213 // Used for mapping character ranges to choices.
214 class DispatchTable : public ZoneObject {
215 public:
DispatchTable(Zone * zone)216 explicit DispatchTable(Zone* zone) : tree_(zone) { }
217
218 class Entry {
219 public:
Entry()220 Entry() : from_(0), to_(0), out_set_(NULL) { }
Entry(uc32 from,uc32 to,OutSet * out_set)221 Entry(uc32 from, uc32 to, OutSet* out_set)
222 : from_(from), to_(to), out_set_(out_set) {
223 DCHECK(from <= to);
224 }
from()225 uc32 from() { return from_; }
to()226 uc32 to() { return to_; }
set_to(uc32 value)227 void set_to(uc32 value) { to_ = value; }
AddValue(int value,Zone * zone)228 void AddValue(int value, Zone* zone) {
229 out_set_ = out_set_->Extend(value, zone);
230 }
out_set()231 OutSet* out_set() { return out_set_; }
232 private:
233 uc32 from_;
234 uc32 to_;
235 OutSet* out_set_;
236 };
237
238 class Config {
239 public:
240 typedef uc32 Key;
241 typedef Entry Value;
242 static const uc32 kNoKey;
NoValue()243 static const Entry NoValue() { return Value(); }
Compare(uc32 a,uc32 b)244 static inline int Compare(uc32 a, uc32 b) {
245 if (a == b)
246 return 0;
247 else if (a < b)
248 return -1;
249 else
250 return 1;
251 }
252 };
253
254 void AddRange(CharacterRange range, int value, Zone* zone);
255 OutSet* Get(uc32 value);
256 void Dump();
257
258 template <typename Callback>
ForEach(Callback * callback)259 void ForEach(Callback* callback) {
260 return tree()->ForEach(callback);
261 }
262
263 private:
264 // There can't be a static empty set since it allocates its
265 // successors in a zone and caches them.
empty()266 OutSet* empty() { return &empty_; }
267 OutSet empty_;
tree()268 ZoneSplayTree<Config>* tree() { return &tree_; }
269 ZoneSplayTree<Config> tree_;
270 };
271
272
273 // Categorizes character ranges into BMP, non-BMP, lead, and trail surrogates.
274 class UnicodeRangeSplitter {
275 public:
276 UnicodeRangeSplitter(Zone* zone, ZoneList<CharacterRange>* base);
277 void Call(uc32 from, DispatchTable::Entry entry);
278
bmp()279 ZoneList<CharacterRange>* bmp() { return bmp_; }
lead_surrogates()280 ZoneList<CharacterRange>* lead_surrogates() { return lead_surrogates_; }
trail_surrogates()281 ZoneList<CharacterRange>* trail_surrogates() { return trail_surrogates_; }
non_bmp()282 ZoneList<CharacterRange>* non_bmp() const { return non_bmp_; }
283
284 private:
285 static const int kBase = 0;
286 // Separate ranges into
287 static const int kBmpCodePoints = 1;
288 static const int kLeadSurrogates = 2;
289 static const int kTrailSurrogates = 3;
290 static const int kNonBmpCodePoints = 4;
291
292 Zone* zone_;
293 DispatchTable table_;
294 ZoneList<CharacterRange>* bmp_;
295 ZoneList<CharacterRange>* lead_surrogates_;
296 ZoneList<CharacterRange>* trail_surrogates_;
297 ZoneList<CharacterRange>* non_bmp_;
298 };
299
300
301 #define FOR_EACH_NODE_TYPE(VISIT) \
302 VISIT(End) \
303 VISIT(Action) \
304 VISIT(Choice) \
305 VISIT(BackReference) \
306 VISIT(Assertion) \
307 VISIT(Text)
308
309
310 class Trace;
311 struct PreloadState;
312 class GreedyLoopState;
313 class AlternativeGenerationList;
314
315 struct NodeInfo {
NodeInfoNodeInfo316 NodeInfo()
317 : being_analyzed(false),
318 been_analyzed(false),
319 follows_word_interest(false),
320 follows_newline_interest(false),
321 follows_start_interest(false),
322 at_end(false),
323 visited(false),
324 replacement_calculated(false) { }
325
326 // Returns true if the interests and assumptions of this node
327 // matches the given one.
MatchesNodeInfo328 bool Matches(NodeInfo* that) {
329 return (at_end == that->at_end) &&
330 (follows_word_interest == that->follows_word_interest) &&
331 (follows_newline_interest == that->follows_newline_interest) &&
332 (follows_start_interest == that->follows_start_interest);
333 }
334
335 // Updates the interests of this node given the interests of the
336 // node preceding it.
AddFromPrecedingNodeInfo337 void AddFromPreceding(NodeInfo* that) {
338 at_end |= that->at_end;
339 follows_word_interest |= that->follows_word_interest;
340 follows_newline_interest |= that->follows_newline_interest;
341 follows_start_interest |= that->follows_start_interest;
342 }
343
HasLookbehindNodeInfo344 bool HasLookbehind() {
345 return follows_word_interest ||
346 follows_newline_interest ||
347 follows_start_interest;
348 }
349
350 // Sets the interests of this node to include the interests of the
351 // following node.
AddFromFollowingNodeInfo352 void AddFromFollowing(NodeInfo* that) {
353 follows_word_interest |= that->follows_word_interest;
354 follows_newline_interest |= that->follows_newline_interest;
355 follows_start_interest |= that->follows_start_interest;
356 }
357
ResetCompilationStateNodeInfo358 void ResetCompilationState() {
359 being_analyzed = false;
360 been_analyzed = false;
361 }
362
363 bool being_analyzed: 1;
364 bool been_analyzed: 1;
365
366 // These bits are set of this node has to know what the preceding
367 // character was.
368 bool follows_word_interest: 1;
369 bool follows_newline_interest: 1;
370 bool follows_start_interest: 1;
371
372 bool at_end: 1;
373 bool visited: 1;
374 bool replacement_calculated: 1;
375 };
376
377
378 // Details of a quick mask-compare check that can look ahead in the
379 // input stream.
380 class QuickCheckDetails {
381 public:
QuickCheckDetails()382 QuickCheckDetails()
383 : characters_(0),
384 mask_(0),
385 value_(0),
386 cannot_match_(false) { }
QuickCheckDetails(int characters)387 explicit QuickCheckDetails(int characters)
388 : characters_(characters),
389 mask_(0),
390 value_(0),
391 cannot_match_(false) { }
392 bool Rationalize(bool one_byte);
393 // Merge in the information from another branch of an alternation.
394 void Merge(QuickCheckDetails* other, int from_index);
395 // Advance the current position by some amount.
396 void Advance(int by, bool one_byte);
397 void Clear();
cannot_match()398 bool cannot_match() { return cannot_match_; }
set_cannot_match()399 void set_cannot_match() { cannot_match_ = true; }
400 struct Position {
PositionPosition401 Position() : mask(0), value(0), determines_perfectly(false) { }
402 uc16 mask;
403 uc16 value;
404 bool determines_perfectly;
405 };
characters()406 int characters() { return characters_; }
set_characters(int characters)407 void set_characters(int characters) { characters_ = characters; }
positions(int index)408 Position* positions(int index) {
409 DCHECK(index >= 0);
410 DCHECK(index < characters_);
411 return positions_ + index;
412 }
mask()413 uint32_t mask() { return mask_; }
value()414 uint32_t value() { return value_; }
415
416 private:
417 // How many characters do we have quick check information from. This is
418 // the same for all branches of a choice node.
419 int characters_;
420 Position positions_[4];
421 // These values are the condensate of the above array after Rationalize().
422 uint32_t mask_;
423 uint32_t value_;
424 // If set to true, there is no way this quick check can match at all.
425 // E.g., if it requires to be at the start of the input, and isn't.
426 bool cannot_match_;
427 };
428
429
430 extern int kUninitializedRegExpNodePlaceHolder;
431
432
433 class RegExpNode: public ZoneObject {
434 public:
RegExpNode(Zone * zone)435 explicit RegExpNode(Zone* zone)
436 : replacement_(NULL), on_work_list_(false), trace_count_(0), zone_(zone) {
437 bm_info_[0] = bm_info_[1] = NULL;
438 }
439 virtual ~RegExpNode();
440 virtual void Accept(NodeVisitor* visitor) = 0;
441 // Generates a goto to this node or actually generates the code at this point.
442 virtual void Emit(RegExpCompiler* compiler, Trace* trace) = 0;
443 // How many characters must this node consume at a minimum in order to
444 // succeed. If we have found at least 'still_to_find' characters that
445 // must be consumed there is no need to ask any following nodes whether
446 // they are sure to eat any more characters. The not_at_start argument is
447 // used to indicate that we know we are not at the start of the input. In
448 // this case anchored branches will always fail and can be ignored when
449 // determining how many characters are consumed on success.
450 virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start) = 0;
451 // Emits some quick code that checks whether the preloaded characters match.
452 // Falls through on certain failure, jumps to the label on possible success.
453 // If the node cannot make a quick check it does nothing and returns false.
454 bool EmitQuickCheck(RegExpCompiler* compiler,
455 Trace* bounds_check_trace,
456 Trace* trace,
457 bool preload_has_checked_bounds,
458 Label* on_possible_success,
459 QuickCheckDetails* details_return,
460 bool fall_through_on_failure);
461 // For a given number of characters this returns a mask and a value. The
462 // next n characters are anded with the mask and compared with the value.
463 // A comparison failure indicates the node cannot match the next n characters.
464 // A comparison success indicates the node may match.
465 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
466 RegExpCompiler* compiler,
467 int characters_filled_in,
468 bool not_at_start) = 0;
469 static const int kNodeIsTooComplexForGreedyLoops = kMinInt;
GreedyLoopTextLength()470 virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
471 // Only returns the successor for a text node of length 1 that matches any
472 // character and that has no guards on it.
GetSuccessorOfOmnivorousTextNode(RegExpCompiler * compiler)473 virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
474 RegExpCompiler* compiler) {
475 return NULL;
476 }
477
478 // Collects information on the possible code units (mod 128) that can match if
479 // we look forward. This is used for a Boyer-Moore-like string searching
480 // implementation. TODO(erikcorry): This should share more code with
481 // EatsAtLeast, GetQuickCheckDetails. The budget argument is used to limit
482 // the number of nodes we are willing to look at in order to create this data.
483 static const int kRecursionBudget = 200;
484 bool KeepRecursing(RegExpCompiler* compiler);
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)485 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
486 BoyerMooreLookahead* bm, bool not_at_start) {
487 UNREACHABLE();
488 }
489
490 // If we know that the input is one-byte then there are some nodes that can
491 // never match. This method returns a node that can be substituted for
492 // itself, or NULL if the node can never match.
FilterOneByte(int depth,bool ignore_case)493 virtual RegExpNode* FilterOneByte(int depth, bool ignore_case) {
494 return this;
495 }
496 // Helper for FilterOneByte.
replacement()497 RegExpNode* replacement() {
498 DCHECK(info()->replacement_calculated);
499 return replacement_;
500 }
set_replacement(RegExpNode * replacement)501 RegExpNode* set_replacement(RegExpNode* replacement) {
502 info()->replacement_calculated = true;
503 replacement_ = replacement;
504 return replacement; // For convenience.
505 }
506
507 // We want to avoid recalculating the lookahead info, so we store it on the
508 // node. Only info that is for this node is stored. We can tell that the
509 // info is for this node when offset == 0, so the information is calculated
510 // relative to this node.
SaveBMInfo(BoyerMooreLookahead * bm,bool not_at_start,int offset)511 void SaveBMInfo(BoyerMooreLookahead* bm, bool not_at_start, int offset) {
512 if (offset == 0) set_bm_info(not_at_start, bm);
513 }
514
label()515 Label* label() { return &label_; }
516 // If non-generic code is generated for a node (i.e. the node is not at the
517 // start of the trace) then it cannot be reused. This variable sets a limit
518 // on how often we allow that to happen before we insist on starting a new
519 // trace and generating generic code for a node that can be reused by flushing
520 // the deferred actions in the current trace and generating a goto.
521 static const int kMaxCopiesCodeGenerated = 10;
522
on_work_list()523 bool on_work_list() { return on_work_list_; }
set_on_work_list(bool value)524 void set_on_work_list(bool value) { on_work_list_ = value; }
525
info()526 NodeInfo* info() { return &info_; }
527
bm_info(bool not_at_start)528 BoyerMooreLookahead* bm_info(bool not_at_start) {
529 return bm_info_[not_at_start ? 1 : 0];
530 }
531
zone()532 Zone* zone() const { return zone_; }
533
534 protected:
535 enum LimitResult { DONE, CONTINUE };
536 RegExpNode* replacement_;
537
538 LimitResult LimitVersions(RegExpCompiler* compiler, Trace* trace);
539
set_bm_info(bool not_at_start,BoyerMooreLookahead * bm)540 void set_bm_info(bool not_at_start, BoyerMooreLookahead* bm) {
541 bm_info_[not_at_start ? 1 : 0] = bm;
542 }
543
544 private:
545 static const int kFirstCharBudget = 10;
546 Label label_;
547 bool on_work_list_;
548 NodeInfo info_;
549 // This variable keeps track of how many times code has been generated for
550 // this node (in different traces). We don't keep track of where the
551 // generated code is located unless the code is generated at the start of
552 // a trace, in which case it is generic and can be reused by flushing the
553 // deferred operations in the current trace and generating a goto.
554 int trace_count_;
555 BoyerMooreLookahead* bm_info_[2];
556
557 Zone* zone_;
558 };
559
560
561 class SeqRegExpNode: public RegExpNode {
562 public:
SeqRegExpNode(RegExpNode * on_success)563 explicit SeqRegExpNode(RegExpNode* on_success)
564 : RegExpNode(on_success->zone()), on_success_(on_success) { }
on_success()565 RegExpNode* on_success() { return on_success_; }
set_on_success(RegExpNode * node)566 void set_on_success(RegExpNode* node) { on_success_ = node; }
567 virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)568 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
569 BoyerMooreLookahead* bm, bool not_at_start) {
570 on_success_->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
571 if (offset == 0) set_bm_info(not_at_start, bm);
572 }
573
574 protected:
575 RegExpNode* FilterSuccessor(int depth, bool ignore_case);
576
577 private:
578 RegExpNode* on_success_;
579 };
580
581
582 class ActionNode: public SeqRegExpNode {
583 public:
584 enum ActionType {
585 SET_REGISTER,
586 INCREMENT_REGISTER,
587 STORE_POSITION,
588 BEGIN_SUBMATCH,
589 POSITIVE_SUBMATCH_SUCCESS,
590 EMPTY_MATCH_CHECK,
591 CLEAR_CAPTURES
592 };
593 static ActionNode* SetRegister(int reg, int val, RegExpNode* on_success);
594 static ActionNode* IncrementRegister(int reg, RegExpNode* on_success);
595 static ActionNode* StorePosition(int reg,
596 bool is_capture,
597 RegExpNode* on_success);
598 static ActionNode* ClearCaptures(Interval range, RegExpNode* on_success);
599 static ActionNode* BeginSubmatch(int stack_pointer_reg,
600 int position_reg,
601 RegExpNode* on_success);
602 static ActionNode* PositiveSubmatchSuccess(int stack_pointer_reg,
603 int restore_reg,
604 int clear_capture_count,
605 int clear_capture_from,
606 RegExpNode* on_success);
607 static ActionNode* EmptyMatchCheck(int start_register,
608 int repetition_register,
609 int repetition_limit,
610 RegExpNode* on_success);
611 virtual void Accept(NodeVisitor* visitor);
612 virtual void Emit(RegExpCompiler* compiler, Trace* trace);
613 virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)614 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
615 RegExpCompiler* compiler,
616 int filled_in,
617 bool not_at_start) {
618 return on_success()->GetQuickCheckDetails(
619 details, compiler, filled_in, not_at_start);
620 }
621 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
622 BoyerMooreLookahead* bm, bool not_at_start);
action_type()623 ActionType action_type() { return action_type_; }
624 // TODO(erikcorry): We should allow some action nodes in greedy loops.
GreedyLoopTextLength()625 virtual int GreedyLoopTextLength() { return kNodeIsTooComplexForGreedyLoops; }
626
627 private:
628 union {
629 struct {
630 int reg;
631 int value;
632 } u_store_register;
633 struct {
634 int reg;
635 } u_increment_register;
636 struct {
637 int reg;
638 bool is_capture;
639 } u_position_register;
640 struct {
641 int stack_pointer_register;
642 int current_position_register;
643 int clear_register_count;
644 int clear_register_from;
645 } u_submatch;
646 struct {
647 int start_register;
648 int repetition_register;
649 int repetition_limit;
650 } u_empty_match_check;
651 struct {
652 int range_from;
653 int range_to;
654 } u_clear_captures;
655 } data_;
ActionNode(ActionType action_type,RegExpNode * on_success)656 ActionNode(ActionType action_type, RegExpNode* on_success)
657 : SeqRegExpNode(on_success),
658 action_type_(action_type) { }
659 ActionType action_type_;
660 friend class DotPrinter;
661 };
662
663
664 class TextNode: public SeqRegExpNode {
665 public:
TextNode(ZoneList<TextElement> * elms,bool read_backward,RegExpNode * on_success)666 TextNode(ZoneList<TextElement>* elms, bool read_backward,
667 RegExpNode* on_success)
668 : SeqRegExpNode(on_success), elms_(elms), read_backward_(read_backward) {}
TextNode(RegExpCharacterClass * that,bool read_backward,RegExpNode * on_success)669 TextNode(RegExpCharacterClass* that, bool read_backward,
670 RegExpNode* on_success)
671 : SeqRegExpNode(on_success),
672 elms_(new (zone()) ZoneList<TextElement>(1, zone())),
673 read_backward_(read_backward) {
674 elms_->Add(TextElement::CharClass(that), zone());
675 }
676 // Create TextNode for a single character class for the given ranges.
677 static TextNode* CreateForCharacterRanges(Zone* zone,
678 ZoneList<CharacterRange>* ranges,
679 bool read_backward,
680 RegExpNode* on_success);
681 // Create TextNode for a surrogate pair with a range given for the
682 // lead and the trail surrogate each.
683 static TextNode* CreateForSurrogatePair(Zone* zone, CharacterRange lead,
684 CharacterRange trail,
685 bool read_backward,
686 RegExpNode* on_success);
687 virtual void Accept(NodeVisitor* visitor);
688 virtual void Emit(RegExpCompiler* compiler, Trace* trace);
689 virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
690 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
691 RegExpCompiler* compiler,
692 int characters_filled_in,
693 bool not_at_start);
elements()694 ZoneList<TextElement>* elements() { return elms_; }
read_backward()695 bool read_backward() { return read_backward_; }
696 void MakeCaseIndependent(Isolate* isolate, bool is_one_byte);
697 virtual int GreedyLoopTextLength();
698 virtual RegExpNode* GetSuccessorOfOmnivorousTextNode(
699 RegExpCompiler* compiler);
700 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
701 BoyerMooreLookahead* bm, bool not_at_start);
702 void CalculateOffsets();
703 virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
704
705 private:
706 enum TextEmitPassType {
707 NON_LATIN1_MATCH, // Check for characters that can't match.
708 SIMPLE_CHARACTER_MATCH, // Case-dependent single character check.
709 NON_LETTER_CHARACTER_MATCH, // Check characters that have no case equivs.
710 CASE_CHARACTER_MATCH, // Case-independent single character check.
711 CHARACTER_CLASS_MATCH // Character class.
712 };
713 static bool SkipPass(int pass, bool ignore_case);
714 static const int kFirstRealPass = SIMPLE_CHARACTER_MATCH;
715 static const int kLastPass = CHARACTER_CLASS_MATCH;
716 void TextEmitPass(RegExpCompiler* compiler,
717 TextEmitPassType pass,
718 bool preloaded,
719 Trace* trace,
720 bool first_element_checked,
721 int* checked_up_to);
722 int Length();
723 ZoneList<TextElement>* elms_;
724 bool read_backward_;
725 };
726
727
728 class AssertionNode: public SeqRegExpNode {
729 public:
730 enum AssertionType {
731 AT_END,
732 AT_START,
733 AT_BOUNDARY,
734 AT_NON_BOUNDARY,
735 AFTER_NEWLINE
736 };
AtEnd(RegExpNode * on_success)737 static AssertionNode* AtEnd(RegExpNode* on_success) {
738 return new(on_success->zone()) AssertionNode(AT_END, on_success);
739 }
AtStart(RegExpNode * on_success)740 static AssertionNode* AtStart(RegExpNode* on_success) {
741 return new(on_success->zone()) AssertionNode(AT_START, on_success);
742 }
AtBoundary(RegExpNode * on_success)743 static AssertionNode* AtBoundary(RegExpNode* on_success) {
744 return new(on_success->zone()) AssertionNode(AT_BOUNDARY, on_success);
745 }
AtNonBoundary(RegExpNode * on_success)746 static AssertionNode* AtNonBoundary(RegExpNode* on_success) {
747 return new(on_success->zone()) AssertionNode(AT_NON_BOUNDARY, on_success);
748 }
AfterNewline(RegExpNode * on_success)749 static AssertionNode* AfterNewline(RegExpNode* on_success) {
750 return new(on_success->zone()) AssertionNode(AFTER_NEWLINE, on_success);
751 }
752 virtual void Accept(NodeVisitor* visitor);
753 virtual void Emit(RegExpCompiler* compiler, Trace* trace);
754 virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
755 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
756 RegExpCompiler* compiler,
757 int filled_in,
758 bool not_at_start);
759 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
760 BoyerMooreLookahead* bm, bool not_at_start);
assertion_type()761 AssertionType assertion_type() { return assertion_type_; }
762
763 private:
764 void EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace);
765 enum IfPrevious { kIsNonWord, kIsWord };
766 void BacktrackIfPrevious(RegExpCompiler* compiler,
767 Trace* trace,
768 IfPrevious backtrack_if_previous);
AssertionNode(AssertionType t,RegExpNode * on_success)769 AssertionNode(AssertionType t, RegExpNode* on_success)
770 : SeqRegExpNode(on_success), assertion_type_(t) { }
771 AssertionType assertion_type_;
772 };
773
774
775 class BackReferenceNode: public SeqRegExpNode {
776 public:
BackReferenceNode(int start_reg,int end_reg,bool read_backward,RegExpNode * on_success)777 BackReferenceNode(int start_reg, int end_reg, bool read_backward,
778 RegExpNode* on_success)
779 : SeqRegExpNode(on_success),
780 start_reg_(start_reg),
781 end_reg_(end_reg),
782 read_backward_(read_backward) {}
783 virtual void Accept(NodeVisitor* visitor);
start_register()784 int start_register() { return start_reg_; }
end_register()785 int end_register() { return end_reg_; }
read_backward()786 bool read_backward() { return read_backward_; }
787 virtual void Emit(RegExpCompiler* compiler, Trace* trace);
788 virtual int EatsAtLeast(int still_to_find,
789 int recursion_depth,
790 bool not_at_start);
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)791 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
792 RegExpCompiler* compiler,
793 int characters_filled_in,
794 bool not_at_start) {
795 return;
796 }
797 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
798 BoyerMooreLookahead* bm, bool not_at_start);
799
800 private:
801 int start_reg_;
802 int end_reg_;
803 bool read_backward_;
804 };
805
806
807 class EndNode: public RegExpNode {
808 public:
809 enum Action { ACCEPT, BACKTRACK, NEGATIVE_SUBMATCH_SUCCESS };
EndNode(Action action,Zone * zone)810 EndNode(Action action, Zone* zone) : RegExpNode(zone), action_(action) {}
811 virtual void Accept(NodeVisitor* visitor);
812 virtual void Emit(RegExpCompiler* compiler, Trace* trace);
EatsAtLeast(int still_to_find,int recursion_depth,bool not_at_start)813 virtual int EatsAtLeast(int still_to_find,
814 int recursion_depth,
815 bool not_at_start) { return 0; }
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)816 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
817 RegExpCompiler* compiler,
818 int characters_filled_in,
819 bool not_at_start) {
820 // Returning 0 from EatsAtLeast should ensure we never get here.
821 UNREACHABLE();
822 }
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)823 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
824 BoyerMooreLookahead* bm, bool not_at_start) {
825 // Returning 0 from EatsAtLeast should ensure we never get here.
826 UNREACHABLE();
827 }
828
829 private:
830 Action action_;
831 };
832
833
834 class NegativeSubmatchSuccess: public EndNode {
835 public:
NegativeSubmatchSuccess(int stack_pointer_reg,int position_reg,int clear_capture_count,int clear_capture_start,Zone * zone)836 NegativeSubmatchSuccess(int stack_pointer_reg,
837 int position_reg,
838 int clear_capture_count,
839 int clear_capture_start,
840 Zone* zone)
841 : EndNode(NEGATIVE_SUBMATCH_SUCCESS, zone),
842 stack_pointer_register_(stack_pointer_reg),
843 current_position_register_(position_reg),
844 clear_capture_count_(clear_capture_count),
845 clear_capture_start_(clear_capture_start) { }
846 virtual void Emit(RegExpCompiler* compiler, Trace* trace);
847
848 private:
849 int stack_pointer_register_;
850 int current_position_register_;
851 int clear_capture_count_;
852 int clear_capture_start_;
853 };
854
855
856 class Guard: public ZoneObject {
857 public:
858 enum Relation { LT, GEQ };
Guard(int reg,Relation op,int value)859 Guard(int reg, Relation op, int value)
860 : reg_(reg),
861 op_(op),
862 value_(value) { }
reg()863 int reg() { return reg_; }
op()864 Relation op() { return op_; }
value()865 int value() { return value_; }
866
867 private:
868 int reg_;
869 Relation op_;
870 int value_;
871 };
872
873
874 class GuardedAlternative {
875 public:
GuardedAlternative(RegExpNode * node)876 explicit GuardedAlternative(RegExpNode* node) : node_(node), guards_(NULL) { }
877 void AddGuard(Guard* guard, Zone* zone);
node()878 RegExpNode* node() { return node_; }
set_node(RegExpNode * node)879 void set_node(RegExpNode* node) { node_ = node; }
guards()880 ZoneList<Guard*>* guards() { return guards_; }
881
882 private:
883 RegExpNode* node_;
884 ZoneList<Guard*>* guards_;
885 };
886
887
888 class AlternativeGeneration;
889
890
891 class ChoiceNode: public RegExpNode {
892 public:
ChoiceNode(int expected_size,Zone * zone)893 explicit ChoiceNode(int expected_size, Zone* zone)
894 : RegExpNode(zone),
895 alternatives_(new(zone)
896 ZoneList<GuardedAlternative>(expected_size, zone)),
897 table_(NULL),
898 not_at_start_(false),
899 being_calculated_(false) { }
900 virtual void Accept(NodeVisitor* visitor);
AddAlternative(GuardedAlternative node)901 void AddAlternative(GuardedAlternative node) {
902 alternatives()->Add(node, zone());
903 }
alternatives()904 ZoneList<GuardedAlternative>* alternatives() { return alternatives_; }
905 DispatchTable* GetTable(bool ignore_case);
906 virtual void Emit(RegExpCompiler* compiler, Trace* trace);
907 virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
908 int EatsAtLeastHelper(int still_to_find,
909 int budget,
910 RegExpNode* ignore_this_node,
911 bool not_at_start);
912 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
913 RegExpCompiler* compiler,
914 int characters_filled_in,
915 bool not_at_start);
916 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
917 BoyerMooreLookahead* bm, bool not_at_start);
918
being_calculated()919 bool being_calculated() { return being_calculated_; }
not_at_start()920 bool not_at_start() { return not_at_start_; }
set_not_at_start()921 void set_not_at_start() { not_at_start_ = true; }
set_being_calculated(bool b)922 void set_being_calculated(bool b) { being_calculated_ = b; }
try_to_emit_quick_check_for_alternative(bool is_first)923 virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
924 return true;
925 }
926 virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
read_backward()927 virtual bool read_backward() { return false; }
928
929 protected:
930 int GreedyLoopTextLengthForAlternative(GuardedAlternative* alternative);
931 ZoneList<GuardedAlternative>* alternatives_;
932
933 private:
934 friend class DispatchTableConstructor;
935 friend class Analysis;
936 void GenerateGuard(RegExpMacroAssembler* macro_assembler,
937 Guard* guard,
938 Trace* trace);
939 int CalculatePreloadCharacters(RegExpCompiler* compiler, int eats_at_least);
940 void EmitOutOfLineContinuation(RegExpCompiler* compiler,
941 Trace* trace,
942 GuardedAlternative alternative,
943 AlternativeGeneration* alt_gen,
944 int preload_characters,
945 bool next_expects_preload);
946 void SetUpPreLoad(RegExpCompiler* compiler,
947 Trace* current_trace,
948 PreloadState* preloads);
949 void AssertGuardsMentionRegisters(Trace* trace);
950 int EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler, Trace* trace);
951 Trace* EmitGreedyLoop(RegExpCompiler* compiler,
952 Trace* trace,
953 AlternativeGenerationList* alt_gens,
954 PreloadState* preloads,
955 GreedyLoopState* greedy_loop_state,
956 int text_length);
957 void EmitChoices(RegExpCompiler* compiler,
958 AlternativeGenerationList* alt_gens,
959 int first_choice,
960 Trace* trace,
961 PreloadState* preloads);
962 DispatchTable* table_;
963 // If true, this node is never checked at the start of the input.
964 // Allows a new trace to start with at_start() set to false.
965 bool not_at_start_;
966 bool being_calculated_;
967 };
968
969
970 class NegativeLookaroundChoiceNode : public ChoiceNode {
971 public:
NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,GuardedAlternative then_do_this,Zone * zone)972 explicit NegativeLookaroundChoiceNode(GuardedAlternative this_must_fail,
973 GuardedAlternative then_do_this,
974 Zone* zone)
975 : ChoiceNode(2, zone) {
976 AddAlternative(this_must_fail);
977 AddAlternative(then_do_this);
978 }
979 virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
980 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
981 RegExpCompiler* compiler,
982 int characters_filled_in,
983 bool not_at_start);
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)984 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
985 BoyerMooreLookahead* bm, bool not_at_start) {
986 alternatives_->at(1).node()->FillInBMInfo(isolate, offset, budget - 1, bm,
987 not_at_start);
988 if (offset == 0) set_bm_info(not_at_start, bm);
989 }
990 // For a negative lookahead we don't emit the quick check for the
991 // alternative that is expected to fail. This is because quick check code
992 // starts by loading enough characters for the alternative that takes fewest
993 // characters, but on a negative lookahead the negative branch did not take
994 // part in that calculation (EatsAtLeast) so the assumptions don't hold.
try_to_emit_quick_check_for_alternative(bool is_first)995 virtual bool try_to_emit_quick_check_for_alternative(bool is_first) {
996 return !is_first;
997 }
998 virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
999 };
1000
1001
1002 class LoopChoiceNode: public ChoiceNode {
1003 public:
LoopChoiceNode(bool body_can_be_zero_length,bool read_backward,Zone * zone)1004 LoopChoiceNode(bool body_can_be_zero_length, bool read_backward, Zone* zone)
1005 : ChoiceNode(2, zone),
1006 loop_node_(NULL),
1007 continue_node_(NULL),
1008 body_can_be_zero_length_(body_can_be_zero_length),
1009 read_backward_(read_backward) {}
1010 void AddLoopAlternative(GuardedAlternative alt);
1011 void AddContinueAlternative(GuardedAlternative alt);
1012 virtual void Emit(RegExpCompiler* compiler, Trace* trace);
1013 virtual int EatsAtLeast(int still_to_find, int budget, bool not_at_start);
1014 virtual void GetQuickCheckDetails(QuickCheckDetails* details,
1015 RegExpCompiler* compiler,
1016 int characters_filled_in,
1017 bool not_at_start);
1018 virtual void FillInBMInfo(Isolate* isolate, int offset, int budget,
1019 BoyerMooreLookahead* bm, bool not_at_start);
loop_node()1020 RegExpNode* loop_node() { return loop_node_; }
continue_node()1021 RegExpNode* continue_node() { return continue_node_; }
body_can_be_zero_length()1022 bool body_can_be_zero_length() { return body_can_be_zero_length_; }
read_backward()1023 virtual bool read_backward() { return read_backward_; }
1024 virtual void Accept(NodeVisitor* visitor);
1025 virtual RegExpNode* FilterOneByte(int depth, bool ignore_case);
1026
1027 private:
1028 // AddAlternative is made private for loop nodes because alternatives
1029 // should not be added freely, we need to keep track of which node
1030 // goes back to the node itself.
AddAlternative(GuardedAlternative node)1031 void AddAlternative(GuardedAlternative node) {
1032 ChoiceNode::AddAlternative(node);
1033 }
1034
1035 RegExpNode* loop_node_;
1036 RegExpNode* continue_node_;
1037 bool body_can_be_zero_length_;
1038 bool read_backward_;
1039 };
1040
1041
1042 // Improve the speed that we scan for an initial point where a non-anchored
1043 // regexp can match by using a Boyer-Moore-like table. This is done by
1044 // identifying non-greedy non-capturing loops in the nodes that eat any
1045 // character one at a time. For example in the middle of the regexp
1046 // /foo[\s\S]*?bar/ we find such a loop. There is also such a loop implicitly
1047 // inserted at the start of any non-anchored regexp.
1048 //
1049 // When we have found such a loop we look ahead in the nodes to find the set of
1050 // characters that can come at given distances. For example for the regexp
1051 // /.?foo/ we know that there are at least 3 characters ahead of us, and the
1052 // sets of characters that can occur are [any, [f, o], [o]]. We find a range in
1053 // the lookahead info where the set of characters is reasonably constrained. In
1054 // our example this is from index 1 to 2 (0 is not constrained). We can now
1055 // look 3 characters ahead and if we don't find one of [f, o] (the union of
1056 // [f, o] and [o]) then we can skip forwards by the range size (in this case 2).
1057 //
1058 // For Unicode input strings we do the same, but modulo 128.
1059 //
1060 // We also look at the first string fed to the regexp and use that to get a hint
1061 // of the character frequencies in the inputs. This affects the assessment of
1062 // whether the set of characters is 'reasonably constrained'.
1063 //
1064 // We also have another lookahead mechanism (called quick check in the code),
1065 // which uses a wide load of multiple characters followed by a mask and compare
1066 // to determine whether a match is possible at this point.
1067 enum ContainedInLattice {
1068 kNotYet = 0,
1069 kLatticeIn = 1,
1070 kLatticeOut = 2,
1071 kLatticeUnknown = 3 // Can also mean both in and out.
1072 };
1073
1074
Combine(ContainedInLattice a,ContainedInLattice b)1075 inline ContainedInLattice Combine(ContainedInLattice a, ContainedInLattice b) {
1076 return static_cast<ContainedInLattice>(a | b);
1077 }
1078
1079
1080 ContainedInLattice AddRange(ContainedInLattice a,
1081 const int* ranges,
1082 int ranges_size,
1083 Interval new_range);
1084
1085
1086 class BoyerMoorePositionInfo : public ZoneObject {
1087 public:
BoyerMoorePositionInfo(Zone * zone)1088 explicit BoyerMoorePositionInfo(Zone* zone)
1089 : map_(new(zone) ZoneList<bool>(kMapSize, zone)),
1090 map_count_(0),
1091 w_(kNotYet),
1092 s_(kNotYet),
1093 d_(kNotYet),
1094 surrogate_(kNotYet) {
1095 for (int i = 0; i < kMapSize; i++) {
1096 map_->Add(false, zone);
1097 }
1098 }
1099
at(int i)1100 bool& at(int i) { return map_->at(i); }
1101
1102 static const int kMapSize = 128;
1103 static const int kMask = kMapSize - 1;
1104
map_count()1105 int map_count() const { return map_count_; }
1106
1107 void Set(int character);
1108 void SetInterval(const Interval& interval);
1109 void SetAll();
is_non_word()1110 bool is_non_word() { return w_ == kLatticeOut; }
is_word()1111 bool is_word() { return w_ == kLatticeIn; }
1112
1113 private:
1114 ZoneList<bool>* map_;
1115 int map_count_; // Number of set bits in the map.
1116 ContainedInLattice w_; // The \w character class.
1117 ContainedInLattice s_; // The \s character class.
1118 ContainedInLattice d_; // The \d character class.
1119 ContainedInLattice surrogate_; // Surrogate UTF-16 code units.
1120 };
1121
1122
1123 class BoyerMooreLookahead : public ZoneObject {
1124 public:
1125 BoyerMooreLookahead(int length, RegExpCompiler* compiler, Zone* zone);
1126
length()1127 int length() { return length_; }
max_char()1128 int max_char() { return max_char_; }
compiler()1129 RegExpCompiler* compiler() { return compiler_; }
1130
Count(int map_number)1131 int Count(int map_number) {
1132 return bitmaps_->at(map_number)->map_count();
1133 }
1134
at(int i)1135 BoyerMoorePositionInfo* at(int i) { return bitmaps_->at(i); }
1136
Set(int map_number,int character)1137 void Set(int map_number, int character) {
1138 if (character > max_char_) return;
1139 BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1140 info->Set(character);
1141 }
1142
SetInterval(int map_number,const Interval & interval)1143 void SetInterval(int map_number, const Interval& interval) {
1144 if (interval.from() > max_char_) return;
1145 BoyerMoorePositionInfo* info = bitmaps_->at(map_number);
1146 if (interval.to() > max_char_) {
1147 info->SetInterval(Interval(interval.from(), max_char_));
1148 } else {
1149 info->SetInterval(interval);
1150 }
1151 }
1152
SetAll(int map_number)1153 void SetAll(int map_number) {
1154 bitmaps_->at(map_number)->SetAll();
1155 }
1156
SetRest(int from_map)1157 void SetRest(int from_map) {
1158 for (int i = from_map; i < length_; i++) SetAll(i);
1159 }
1160 void EmitSkipInstructions(RegExpMacroAssembler* masm);
1161
1162 private:
1163 // This is the value obtained by EatsAtLeast. If we do not have at least this
1164 // many characters left in the sample string then the match is bound to fail.
1165 // Therefore it is OK to read a character this far ahead of the current match
1166 // point.
1167 int length_;
1168 RegExpCompiler* compiler_;
1169 // 0xff for Latin1, 0xffff for UTF-16.
1170 int max_char_;
1171 ZoneList<BoyerMoorePositionInfo*>* bitmaps_;
1172
1173 int GetSkipTable(int min_lookahead,
1174 int max_lookahead,
1175 Handle<ByteArray> boolean_skip_table);
1176 bool FindWorthwhileInterval(int* from, int* to);
1177 int FindBestInterval(
1178 int max_number_of_chars, int old_biggest_points, int* from, int* to);
1179 };
1180
1181
1182 // There are many ways to generate code for a node. This class encapsulates
1183 // the current way we should be generating. In other words it encapsulates
1184 // the current state of the code generator. The effect of this is that we
1185 // generate code for paths that the matcher can take through the regular
1186 // expression. A given node in the regexp can be code-generated several times
1187 // as it can be part of several traces. For example for the regexp:
1188 // /foo(bar|ip)baz/ the code to match baz will be generated twice, once as part
1189 // of the foo-bar-baz trace and once as part of the foo-ip-baz trace. The code
1190 // to match foo is generated only once (the traces have a common prefix). The
1191 // code to store the capture is deferred and generated (twice) after the places
1192 // where baz has been matched.
1193 class Trace {
1194 public:
1195 // A value for a property that is either known to be true, know to be false,
1196 // or not known.
1197 enum TriBool {
1198 UNKNOWN = -1, FALSE_VALUE = 0, TRUE_VALUE = 1
1199 };
1200
1201 class DeferredAction {
1202 public:
DeferredAction(ActionNode::ActionType action_type,int reg)1203 DeferredAction(ActionNode::ActionType action_type, int reg)
1204 : action_type_(action_type), reg_(reg), next_(NULL) { }
next()1205 DeferredAction* next() { return next_; }
1206 bool Mentions(int reg);
reg()1207 int reg() { return reg_; }
action_type()1208 ActionNode::ActionType action_type() { return action_type_; }
1209 private:
1210 ActionNode::ActionType action_type_;
1211 int reg_;
1212 DeferredAction* next_;
1213 friend class Trace;
1214 };
1215
1216 class DeferredCapture : public DeferredAction {
1217 public:
DeferredCapture(int reg,bool is_capture,Trace * trace)1218 DeferredCapture(int reg, bool is_capture, Trace* trace)
1219 : DeferredAction(ActionNode::STORE_POSITION, reg),
1220 cp_offset_(trace->cp_offset()),
1221 is_capture_(is_capture) { }
cp_offset()1222 int cp_offset() { return cp_offset_; }
is_capture()1223 bool is_capture() { return is_capture_; }
1224 private:
1225 int cp_offset_;
1226 bool is_capture_;
set_cp_offset(int cp_offset)1227 void set_cp_offset(int cp_offset) { cp_offset_ = cp_offset; }
1228 };
1229
1230 class DeferredSetRegister : public DeferredAction {
1231 public:
DeferredSetRegister(int reg,int value)1232 DeferredSetRegister(int reg, int value)
1233 : DeferredAction(ActionNode::SET_REGISTER, reg),
1234 value_(value) { }
value()1235 int value() { return value_; }
1236 private:
1237 int value_;
1238 };
1239
1240 class DeferredClearCaptures : public DeferredAction {
1241 public:
DeferredClearCaptures(Interval range)1242 explicit DeferredClearCaptures(Interval range)
1243 : DeferredAction(ActionNode::CLEAR_CAPTURES, -1),
1244 range_(range) { }
range()1245 Interval range() { return range_; }
1246 private:
1247 Interval range_;
1248 };
1249
1250 class DeferredIncrementRegister : public DeferredAction {
1251 public:
DeferredIncrementRegister(int reg)1252 explicit DeferredIncrementRegister(int reg)
1253 : DeferredAction(ActionNode::INCREMENT_REGISTER, reg) { }
1254 };
1255
Trace()1256 Trace()
1257 : cp_offset_(0),
1258 actions_(NULL),
1259 backtrack_(NULL),
1260 stop_node_(NULL),
1261 loop_label_(NULL),
1262 characters_preloaded_(0),
1263 bound_checked_up_to_(0),
1264 flush_budget_(100),
1265 at_start_(UNKNOWN) { }
1266
1267 // End the trace. This involves flushing the deferred actions in the trace
1268 // and pushing a backtrack location onto the backtrack stack. Once this is
1269 // done we can start a new trace or go to one that has already been
1270 // generated.
1271 void Flush(RegExpCompiler* compiler, RegExpNode* successor);
cp_offset()1272 int cp_offset() { return cp_offset_; }
actions()1273 DeferredAction* actions() { return actions_; }
1274 // A trivial trace is one that has no deferred actions or other state that
1275 // affects the assumptions used when generating code. There is no recorded
1276 // backtrack location in a trivial trace, so with a trivial trace we will
1277 // generate code that, on a failure to match, gets the backtrack location
1278 // from the backtrack stack rather than using a direct jump instruction. We
1279 // always start code generation with a trivial trace and non-trivial traces
1280 // are created as we emit code for nodes or add to the list of deferred
1281 // actions in the trace. The location of the code generated for a node using
1282 // a trivial trace is recorded in a label in the node so that gotos can be
1283 // generated to that code.
is_trivial()1284 bool is_trivial() {
1285 return backtrack_ == NULL &&
1286 actions_ == NULL &&
1287 cp_offset_ == 0 &&
1288 characters_preloaded_ == 0 &&
1289 bound_checked_up_to_ == 0 &&
1290 quick_check_performed_.characters() == 0 &&
1291 at_start_ == UNKNOWN;
1292 }
at_start()1293 TriBool at_start() { return at_start_; }
set_at_start(TriBool at_start)1294 void set_at_start(TriBool at_start) { at_start_ = at_start; }
backtrack()1295 Label* backtrack() { return backtrack_; }
loop_label()1296 Label* loop_label() { return loop_label_; }
stop_node()1297 RegExpNode* stop_node() { return stop_node_; }
characters_preloaded()1298 int characters_preloaded() { return characters_preloaded_; }
bound_checked_up_to()1299 int bound_checked_up_to() { return bound_checked_up_to_; }
flush_budget()1300 int flush_budget() { return flush_budget_; }
quick_check_performed()1301 QuickCheckDetails* quick_check_performed() { return &quick_check_performed_; }
1302 bool mentions_reg(int reg);
1303 // Returns true if a deferred position store exists to the specified
1304 // register and stores the offset in the out-parameter. Otherwise
1305 // returns false.
1306 bool GetStoredPosition(int reg, int* cp_offset);
1307 // These set methods and AdvanceCurrentPositionInTrace should be used only on
1308 // new traces - the intention is that traces are immutable after creation.
add_action(DeferredAction * new_action)1309 void add_action(DeferredAction* new_action) {
1310 DCHECK(new_action->next_ == NULL);
1311 new_action->next_ = actions_;
1312 actions_ = new_action;
1313 }
set_backtrack(Label * backtrack)1314 void set_backtrack(Label* backtrack) { backtrack_ = backtrack; }
set_stop_node(RegExpNode * node)1315 void set_stop_node(RegExpNode* node) { stop_node_ = node; }
set_loop_label(Label * label)1316 void set_loop_label(Label* label) { loop_label_ = label; }
set_characters_preloaded(int count)1317 void set_characters_preloaded(int count) { characters_preloaded_ = count; }
set_bound_checked_up_to(int to)1318 void set_bound_checked_up_to(int to) { bound_checked_up_to_ = to; }
set_flush_budget(int to)1319 void set_flush_budget(int to) { flush_budget_ = to; }
set_quick_check_performed(QuickCheckDetails * d)1320 void set_quick_check_performed(QuickCheckDetails* d) {
1321 quick_check_performed_ = *d;
1322 }
1323 void InvalidateCurrentCharacter();
1324 void AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler);
1325
1326 private:
1327 int FindAffectedRegisters(OutSet* affected_registers, Zone* zone);
1328 void PerformDeferredActions(RegExpMacroAssembler* macro,
1329 int max_register,
1330 const OutSet& affected_registers,
1331 OutSet* registers_to_pop,
1332 OutSet* registers_to_clear,
1333 Zone* zone);
1334 void RestoreAffectedRegisters(RegExpMacroAssembler* macro,
1335 int max_register,
1336 const OutSet& registers_to_pop,
1337 const OutSet& registers_to_clear);
1338 int cp_offset_;
1339 DeferredAction* actions_;
1340 Label* backtrack_;
1341 RegExpNode* stop_node_;
1342 Label* loop_label_;
1343 int characters_preloaded_;
1344 int bound_checked_up_to_;
1345 QuickCheckDetails quick_check_performed_;
1346 int flush_budget_;
1347 TriBool at_start_;
1348 };
1349
1350
1351 class GreedyLoopState {
1352 public:
1353 explicit GreedyLoopState(bool not_at_start);
1354
label()1355 Label* label() { return &label_; }
counter_backtrack_trace()1356 Trace* counter_backtrack_trace() { return &counter_backtrack_trace_; }
1357
1358 private:
1359 Label label_;
1360 Trace counter_backtrack_trace_;
1361 };
1362
1363
1364 struct PreloadState {
1365 static const int kEatsAtLeastNotYetInitialized = -1;
1366 bool preload_is_current_;
1367 bool preload_has_checked_bounds_;
1368 int preload_characters_;
1369 int eats_at_least_;
initPreloadState1370 void init() {
1371 eats_at_least_ = kEatsAtLeastNotYetInitialized;
1372 }
1373 };
1374
1375
1376 class NodeVisitor {
1377 public:
~NodeVisitor()1378 virtual ~NodeVisitor() { }
1379 #define DECLARE_VISIT(Type) \
1380 virtual void Visit##Type(Type##Node* that) = 0;
FOR_EACH_NODE_TYPE(DECLARE_VISIT)1381 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1382 #undef DECLARE_VISIT
1383 virtual void VisitLoopChoice(LoopChoiceNode* that) { VisitChoice(that); }
1384 };
1385
1386
1387 // Node visitor used to add the start set of the alternatives to the
1388 // dispatch table of a choice node.
1389 class DispatchTableConstructor: public NodeVisitor {
1390 public:
DispatchTableConstructor(DispatchTable * table,bool ignore_case,Zone * zone)1391 DispatchTableConstructor(DispatchTable* table, bool ignore_case,
1392 Zone* zone)
1393 : table_(table),
1394 choice_index_(-1),
1395 ignore_case_(ignore_case),
1396 zone_(zone) { }
1397
1398 void BuildTable(ChoiceNode* node);
1399
AddRange(CharacterRange range)1400 void AddRange(CharacterRange range) {
1401 table()->AddRange(range, choice_index_, zone_);
1402 }
1403
1404 void AddInverse(ZoneList<CharacterRange>* ranges);
1405
1406 #define DECLARE_VISIT(Type) \
1407 virtual void Visit##Type(Type##Node* that);
FOR_EACH_NODE_TYPE(DECLARE_VISIT)1408 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1409 #undef DECLARE_VISIT
1410
1411 DispatchTable* table() { return table_; }
set_choice_index(int value)1412 void set_choice_index(int value) { choice_index_ = value; }
1413
1414 protected:
1415 DispatchTable* table_;
1416 int choice_index_;
1417 bool ignore_case_;
1418 Zone* zone_;
1419 };
1420
1421
1422 // Assertion propagation moves information about assertions such as
1423 // \b to the affected nodes. For instance, in /.\b./ information must
1424 // be propagated to the first '.' that whatever follows needs to know
1425 // if it matched a word or a non-word, and to the second '.' that it
1426 // has to check if it succeeds a word or non-word. In this case the
1427 // result will be something like:
1428 //
1429 // +-------+ +------------+
1430 // | . | | . |
1431 // +-------+ ---> +------------+
1432 // | word? | | check word |
1433 // +-------+ +------------+
1434 class Analysis: public NodeVisitor {
1435 public:
Analysis(Isolate * isolate,JSRegExp::Flags flags,bool is_one_byte)1436 Analysis(Isolate* isolate, JSRegExp::Flags flags, bool is_one_byte)
1437 : isolate_(isolate),
1438 flags_(flags),
1439 is_one_byte_(is_one_byte),
1440 error_message_(NULL) {}
1441 void EnsureAnalyzed(RegExpNode* node);
1442
1443 #define DECLARE_VISIT(Type) \
1444 virtual void Visit##Type(Type##Node* that);
1445 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
1446 #undef DECLARE_VISIT
1447 virtual void VisitLoopChoice(LoopChoiceNode* that);
1448
has_failed()1449 bool has_failed() { return error_message_ != NULL; }
error_message()1450 const char* error_message() {
1451 DCHECK(error_message_ != NULL);
1452 return error_message_;
1453 }
fail(const char * error_message)1454 void fail(const char* error_message) {
1455 error_message_ = error_message;
1456 }
1457
isolate()1458 Isolate* isolate() const { return isolate_; }
1459
ignore_case()1460 bool ignore_case() const { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
unicode()1461 bool unicode() const { return (flags_ & JSRegExp::kUnicode) != 0; }
1462
1463 private:
1464 Isolate* isolate_;
1465 JSRegExp::Flags flags_;
1466 bool is_one_byte_;
1467 const char* error_message_;
1468
1469 DISALLOW_IMPLICIT_CONSTRUCTORS(Analysis);
1470 };
1471
1472
1473 struct RegExpCompileData {
RegExpCompileDataRegExpCompileData1474 RegExpCompileData()
1475 : tree(NULL),
1476 node(NULL),
1477 simple(true),
1478 contains_anchor(false),
1479 capture_count(0) { }
1480 RegExpTree* tree;
1481 RegExpNode* node;
1482 bool simple;
1483 bool contains_anchor;
1484 Handle<FixedArray> capture_name_map;
1485 Handle<String> error;
1486 int capture_count;
1487 };
1488
1489
1490 class RegExpEngine: public AllStatic {
1491 public:
1492 struct CompilationResult {
CompilationResultCompilationResult1493 CompilationResult(Isolate* isolate, const char* error_message)
1494 : error_message(error_message),
1495 code(isolate->heap()->the_hole_value()),
1496 num_registers(0) {}
CompilationResultCompilationResult1497 CompilationResult(Object* code, int registers)
1498 : error_message(NULL), code(code), num_registers(registers) {}
1499 const char* error_message;
1500 Object* code;
1501 int num_registers;
1502 };
1503
1504 static CompilationResult Compile(Isolate* isolate, Zone* zone,
1505 RegExpCompileData* input,
1506 JSRegExp::Flags flags,
1507 Handle<String> pattern,
1508 Handle<String> sample_subject,
1509 bool is_one_byte);
1510
1511 static bool TooMuchRegExpCode(Handle<String> pattern);
1512
1513 static void DotPrint(const char* label, RegExpNode* node, bool ignore_case);
1514 };
1515
1516
1517 class RegExpResultsCache : public AllStatic {
1518 public:
1519 enum ResultsCacheType { REGEXP_MULTIPLE_INDICES, STRING_SPLIT_SUBSTRINGS };
1520
1521 // Attempt to retrieve a cached result. On failure, 0 is returned as a Smi.
1522 // On success, the returned result is guaranteed to be a COW-array.
1523 static Object* Lookup(Heap* heap, String* key_string, Object* key_pattern,
1524 FixedArray** last_match_out, ResultsCacheType type);
1525 // Attempt to add value_array to the cache specified by type. On success,
1526 // value_array is turned into a COW-array.
1527 static void Enter(Isolate* isolate, Handle<String> key_string,
1528 Handle<Object> key_pattern, Handle<FixedArray> value_array,
1529 Handle<FixedArray> last_match_cache, ResultsCacheType type);
1530 static void Clear(FixedArray* cache);
1531 static const int kRegExpResultsCacheSize = 0x100;
1532
1533 private:
1534 static const int kArrayEntriesPerCacheEntry = 4;
1535 static const int kStringOffset = 0;
1536 static const int kPatternOffset = 1;
1537 static const int kArrayOffset = 2;
1538 static const int kLastMatchOffset = 3;
1539 };
1540
1541 } // namespace internal
1542 } // namespace v8
1543
1544 #endif // V8_REGEXP_JSREGEXP_H_
1545