1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/jsregexp.h"
6 
7 #include "src/ast/ast.h"
8 #include "src/base/platform/platform.h"
9 #include "src/compilation-cache.h"
10 #include "src/compiler.h"
11 #include "src/execution.h"
12 #include "src/factory.h"
13 #include "src/isolate-inl.h"
14 #include "src/messages.h"
15 #include "src/ostreams.h"
16 #include "src/regexp/interpreter-irregexp.h"
17 #include "src/regexp/jsregexp-inl.h"
18 #include "src/regexp/regexp-macro-assembler.h"
19 #include "src/regexp/regexp-macro-assembler-irregexp.h"
20 #include "src/regexp/regexp-macro-assembler-tracer.h"
21 #include "src/regexp/regexp-parser.h"
22 #include "src/regexp/regexp-stack.h"
23 #include "src/runtime/runtime.h"
24 #include "src/splay-tree-inl.h"
25 #include "src/string-search.h"
26 #include "src/unicode-decoder.h"
27 
28 #ifndef V8_INTERPRETED_REGEXP
29 #if V8_TARGET_ARCH_IA32
30 #include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
31 #elif V8_TARGET_ARCH_X64
32 #include "src/regexp/x64/regexp-macro-assembler-x64.h"
33 #elif V8_TARGET_ARCH_ARM64
34 #include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
35 #elif V8_TARGET_ARCH_ARM
36 #include "src/regexp/arm/regexp-macro-assembler-arm.h"
37 #elif V8_TARGET_ARCH_PPC
38 #include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
39 #elif V8_TARGET_ARCH_MIPS
40 #include "src/regexp/mips/regexp-macro-assembler-mips.h"
41 #elif V8_TARGET_ARCH_MIPS64
42 #include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
43 #elif V8_TARGET_ARCH_X87
44 #include "src/regexp/x87/regexp-macro-assembler-x87.h"
45 #else
46 #error Unsupported target architecture.
47 #endif
48 #endif
49 
50 
51 namespace v8 {
52 namespace internal {
53 
54 MUST_USE_RESULT
ThrowRegExpException(Handle<JSRegExp> re,Handle<String> pattern,Handle<String> error_text)55 static inline MaybeHandle<Object> ThrowRegExpException(
56     Handle<JSRegExp> re, Handle<String> pattern, Handle<String> error_text) {
57   Isolate* isolate = re->GetIsolate();
58   THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
59                                           pattern, error_text),
60                   Object);
61 }
62 
63 
ThrowRegExpException(Handle<JSRegExp> re,Handle<String> error_text)64 inline void ThrowRegExpException(Handle<JSRegExp> re,
65                                  Handle<String> error_text) {
66   USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text));
67 }
68 
69 
AddRange(ContainedInLattice containment,const int * ranges,int ranges_length,Interval new_range)70 ContainedInLattice AddRange(ContainedInLattice containment,
71                             const int* ranges,
72                             int ranges_length,
73                             Interval new_range) {
74   DCHECK((ranges_length & 1) == 1);
75   DCHECK(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1);
76   if (containment == kLatticeUnknown) return containment;
77   bool inside = false;
78   int last = 0;
79   for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
80     // Consider the range from last to ranges[i].
81     // We haven't got to the new range yet.
82     if (ranges[i] <= new_range.from()) continue;
83     // New range is wholly inside last-ranges[i].  Note that new_range.to() is
84     // inclusive, but the values in ranges are not.
85     if (last <= new_range.from() && new_range.to() < ranges[i]) {
86       return Combine(containment, inside ? kLatticeIn : kLatticeOut);
87     }
88     return kLatticeUnknown;
89   }
90   return containment;
91 }
92 
93 
94 // More makes code generation slower, less makes V8 benchmark score lower.
95 const int kMaxLookaheadForBoyerMoore = 8;
96 // In a 3-character pattern you can maximally step forwards 3 characters
97 // at a time, which is not always enough to pay for the extra logic.
98 const int kPatternTooShortForBoyerMoore = 2;
99 
100 
101 // Identifies the sort of regexps where the regexp engine is faster
102 // than the code used for atom matches.
HasFewDifferentCharacters(Handle<String> pattern)103 static bool HasFewDifferentCharacters(Handle<String> pattern) {
104   int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
105   if (length <= kPatternTooShortForBoyerMoore) return false;
106   const int kMod = 128;
107   bool character_found[kMod];
108   int different = 0;
109   memset(&character_found[0], 0, sizeof(character_found));
110   for (int i = 0; i < length; i++) {
111     int ch = (pattern->Get(i) & (kMod - 1));
112     if (!character_found[ch]) {
113       character_found[ch] = true;
114       different++;
115       // We declare a regexp low-alphabet if it has at least 3 times as many
116       // characters as it has different characters.
117       if (different * 3 > length) return false;
118     }
119   }
120   return true;
121 }
122 
123 
124 // Generic RegExp methods. Dispatches to implementation specific methods.
125 
126 
Compile(Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags)127 MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
128                                         Handle<String> pattern,
129                                         JSRegExp::Flags flags) {
130   Isolate* isolate = re->GetIsolate();
131   Zone zone;
132   CompilationCache* compilation_cache = isolate->compilation_cache();
133   MaybeHandle<FixedArray> maybe_cached =
134       compilation_cache->LookupRegExp(pattern, flags);
135   Handle<FixedArray> cached;
136   bool in_cache = maybe_cached.ToHandle(&cached);
137   LOG(isolate, RegExpCompileEvent(re, in_cache));
138 
139   Handle<Object> result;
140   if (in_cache) {
141     re->set_data(*cached);
142     return re;
143   }
144   pattern = String::Flatten(pattern);
145   PostponeInterruptsScope postpone(isolate);
146   RegExpCompileData parse_result;
147   FlatStringReader reader(isolate, pattern);
148   if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader,
149                                  flags & JSRegExp::kMultiline,
150                                  flags & JSRegExp::kUnicode, &parse_result)) {
151     // Throw an exception if we fail to parse the pattern.
152     return ThrowRegExpException(re, pattern, parse_result.error);
153   }
154 
155   bool has_been_compiled = false;
156 
157   if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&
158       !(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) {
159     // Parse-tree is a single atom that is equal to the pattern.
160     AtomCompile(re, pattern, flags, pattern);
161     has_been_compiled = true;
162   } else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) &&
163              !(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) {
164     RegExpAtom* atom = parse_result.tree->AsAtom();
165     Vector<const uc16> atom_pattern = atom->data();
166     Handle<String> atom_string;
167     ASSIGN_RETURN_ON_EXCEPTION(
168         isolate, atom_string,
169         isolate->factory()->NewStringFromTwoByte(atom_pattern),
170         Object);
171     if (!HasFewDifferentCharacters(atom_string)) {
172       AtomCompile(re, pattern, flags, atom_string);
173       has_been_compiled = true;
174     }
175   }
176   if (!has_been_compiled) {
177     IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
178   }
179   DCHECK(re->data()->IsFixedArray());
180   // Compilation succeeded so the data is set on the regexp
181   // and we can store it in the cache.
182   Handle<FixedArray> data(FixedArray::cast(re->data()));
183   compilation_cache->PutRegExp(pattern, flags, data);
184 
185   return re;
186 }
187 
188 
Exec(Handle<JSRegExp> regexp,Handle<String> subject,int index,Handle<JSArray> last_match_info)189 MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
190                                      Handle<String> subject,
191                                      int index,
192                                      Handle<JSArray> last_match_info) {
193   switch (regexp->TypeTag()) {
194     case JSRegExp::ATOM:
195       return AtomExec(regexp, subject, index, last_match_info);
196     case JSRegExp::IRREGEXP: {
197       return IrregexpExec(regexp, subject, index, last_match_info);
198     }
199     default:
200       UNREACHABLE();
201       return MaybeHandle<Object>();
202   }
203 }
204 
205 
206 // RegExp Atom implementation: Simple string search using indexOf.
207 
208 
AtomCompile(Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,Handle<String> match_pattern)209 void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
210                              Handle<String> pattern,
211                              JSRegExp::Flags flags,
212                              Handle<String> match_pattern) {
213   re->GetIsolate()->factory()->SetRegExpAtomData(re,
214                                                  JSRegExp::ATOM,
215                                                  pattern,
216                                                  flags,
217                                                  match_pattern);
218 }
219 
220 
SetAtomLastCapture(FixedArray * array,String * subject,int from,int to)221 static void SetAtomLastCapture(FixedArray* array,
222                                String* subject,
223                                int from,
224                                int to) {
225   SealHandleScope shs(array->GetIsolate());
226   RegExpImpl::SetLastCaptureCount(array, 2);
227   RegExpImpl::SetLastSubject(array, subject);
228   RegExpImpl::SetLastInput(array, subject);
229   RegExpImpl::SetCapture(array, 0, from);
230   RegExpImpl::SetCapture(array, 1, to);
231 }
232 
233 
AtomExecRaw(Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)234 int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
235                             Handle<String> subject,
236                             int index,
237                             int32_t* output,
238                             int output_size) {
239   Isolate* isolate = regexp->GetIsolate();
240 
241   DCHECK(0 <= index);
242   DCHECK(index <= subject->length());
243 
244   subject = String::Flatten(subject);
245   DisallowHeapAllocation no_gc;  // ensure vectors stay valid
246 
247   String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
248   int needle_len = needle->length();
249   DCHECK(needle->IsFlat());
250   DCHECK_LT(0, needle_len);
251 
252   if (index + needle_len > subject->length()) {
253     return RegExpImpl::RE_FAILURE;
254   }
255 
256   for (int i = 0; i < output_size; i += 2) {
257     String::FlatContent needle_content = needle->GetFlatContent();
258     String::FlatContent subject_content = subject->GetFlatContent();
259     DCHECK(needle_content.IsFlat());
260     DCHECK(subject_content.IsFlat());
261     // dispatch on type of strings
262     index =
263         (needle_content.IsOneByte()
264              ? (subject_content.IsOneByte()
265                     ? SearchString(isolate, subject_content.ToOneByteVector(),
266                                    needle_content.ToOneByteVector(), index)
267                     : SearchString(isolate, subject_content.ToUC16Vector(),
268                                    needle_content.ToOneByteVector(), index))
269              : (subject_content.IsOneByte()
270                     ? SearchString(isolate, subject_content.ToOneByteVector(),
271                                    needle_content.ToUC16Vector(), index)
272                     : SearchString(isolate, subject_content.ToUC16Vector(),
273                                    needle_content.ToUC16Vector(), index)));
274     if (index == -1) {
275       return i / 2;  // Return number of matches.
276     } else {
277       output[i] = index;
278       output[i+1] = index + needle_len;
279       index += needle_len;
280     }
281   }
282   return output_size / 2;
283 }
284 
285 
AtomExec(Handle<JSRegExp> re,Handle<String> subject,int index,Handle<JSArray> last_match_info)286 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
287                                     Handle<String> subject,
288                                     int index,
289                                     Handle<JSArray> last_match_info) {
290   Isolate* isolate = re->GetIsolate();
291 
292   static const int kNumRegisters = 2;
293   STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
294   int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
295 
296   int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
297 
298   if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
299 
300   DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
301   SealHandleScope shs(isolate);
302   FixedArray* array = FixedArray::cast(last_match_info->elements());
303   SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]);
304   return last_match_info;
305 }
306 
307 
308 // Irregexp implementation.
309 
310 // Ensures that the regexp object contains a compiled version of the
311 // source for either one-byte or two-byte subject strings.
312 // If the compiled version doesn't already exist, it is compiled
313 // from the source pattern.
314 // If compilation fails, an exception is thrown and this function
315 // returns false.
EnsureCompiledIrregexp(Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)316 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re,
317                                         Handle<String> sample_subject,
318                                         bool is_one_byte) {
319   Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
320 #ifdef V8_INTERPRETED_REGEXP
321   if (compiled_code->IsByteArray()) return true;
322 #else  // V8_INTERPRETED_REGEXP (RegExp native code)
323   if (compiled_code->IsCode()) return true;
324 #endif
325   // We could potentially have marked this as flushable, but have kept
326   // a saved version if we did not flush it yet.
327   Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
328   if (saved_code->IsCode()) {
329     // Reinstate the code in the original place.
330     re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code);
331     DCHECK(compiled_code->IsSmi());
332     return true;
333   }
334   return CompileIrregexp(re, sample_subject, is_one_byte);
335 }
336 
337 
CompileIrregexp(Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)338 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
339                                  Handle<String> sample_subject,
340                                  bool is_one_byte) {
341   // Compile the RegExp.
342   Isolate* isolate = re->GetIsolate();
343   Zone zone;
344   PostponeInterruptsScope postpone(isolate);
345   // If we had a compilation error the last time this is saved at the
346   // saved code index.
347   Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
348   // When arriving here entry can only be a smi, either representing an
349   // uncompiled regexp, a previous compilation error, or code that has
350   // been flushed.
351   DCHECK(entry->IsSmi());
352   int entry_value = Smi::cast(entry)->value();
353   DCHECK(entry_value == JSRegExp::kUninitializedValue ||
354          entry_value == JSRegExp::kCompilationErrorValue ||
355          (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
356 
357   if (entry_value == JSRegExp::kCompilationErrorValue) {
358     // A previous compilation failed and threw an error which we store in
359     // the saved code index (we store the error message, not the actual
360     // error). Recreate the error object and throw it.
361     Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
362     DCHECK(error_string->IsString());
363     Handle<String> error_message(String::cast(error_string));
364     ThrowRegExpException(re, error_message);
365     return false;
366   }
367 
368   JSRegExp::Flags flags = re->GetFlags();
369 
370   Handle<String> pattern(re->Pattern());
371   pattern = String::Flatten(pattern);
372   RegExpCompileData compile_data;
373   FlatStringReader reader(isolate, pattern);
374   if (!RegExpParser::ParseRegExp(isolate, &zone, &reader,
375                                  flags & JSRegExp::kMultiline,
376                                  flags & JSRegExp::kUnicode, &compile_data)) {
377     // Throw an exception if we fail to parse the pattern.
378     // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
379     USE(ThrowRegExpException(re, pattern, compile_data.error));
380     return false;
381   }
382   RegExpEngine::CompilationResult result = RegExpEngine::Compile(
383       isolate, &zone, &compile_data, flags & JSRegExp::kIgnoreCase,
384       flags & JSRegExp::kGlobal, flags & JSRegExp::kMultiline,
385       flags & JSRegExp::kSticky, pattern, sample_subject, is_one_byte);
386   if (result.error_message != NULL) {
387     // Unable to compile regexp.
388     Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
389         CStrVector(result.error_message)).ToHandleChecked();
390     ThrowRegExpException(re, error_message);
391     return false;
392   }
393 
394   Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
395   data->set(JSRegExp::code_index(is_one_byte), result.code);
396   int register_max = IrregexpMaxRegisterCount(*data);
397   if (result.num_registers > register_max) {
398     SetIrregexpMaxRegisterCount(*data, result.num_registers);
399   }
400 
401   return true;
402 }
403 
404 
IrregexpMaxRegisterCount(FixedArray * re)405 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
406   return Smi::cast(
407       re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
408 }
409 
410 
SetIrregexpMaxRegisterCount(FixedArray * re,int value)411 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
412   re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
413 }
414 
415 
IrregexpNumberOfCaptures(FixedArray * re)416 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
417   return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
418 }
419 
420 
IrregexpNumberOfRegisters(FixedArray * re)421 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
422   return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
423 }
424 
425 
IrregexpByteCode(FixedArray * re,bool is_one_byte)426 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
427   return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
428 }
429 
430 
IrregexpNativeCode(FixedArray * re,bool is_one_byte)431 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
432   return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
433 }
434 
435 
IrregexpInitialize(Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,int capture_count)436 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
437                                     Handle<String> pattern,
438                                     JSRegExp::Flags flags,
439                                     int capture_count) {
440   // Initialize compiled code entries to null.
441   re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
442                                                      JSRegExp::IRREGEXP,
443                                                      pattern,
444                                                      flags,
445                                                      capture_count);
446 }
447 
448 
IrregexpPrepare(Handle<JSRegExp> regexp,Handle<String> subject)449 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
450                                 Handle<String> subject) {
451   subject = String::Flatten(subject);
452 
453   // Check representation of the underlying storage.
454   bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
455   if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1;
456 
457 #ifdef V8_INTERPRETED_REGEXP
458   // Byte-code regexp needs space allocated for all its registers.
459   // The result captures are copied to the start of the registers array
460   // if the match succeeds.  This way those registers are not clobbered
461   // when we set the last match info from last successful match.
462   return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
463          (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
464 #else  // V8_INTERPRETED_REGEXP
465   // Native regexp only needs room to output captures. Registers are handled
466   // internally.
467   return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
468 #endif  // V8_INTERPRETED_REGEXP
469 }
470 
471 
IrregexpExecRaw(Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)472 int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
473                                 Handle<String> subject,
474                                 int index,
475                                 int32_t* output,
476                                 int output_size) {
477   Isolate* isolate = regexp->GetIsolate();
478 
479   Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
480 
481   DCHECK(index >= 0);
482   DCHECK(index <= subject->length());
483   DCHECK(subject->IsFlat());
484 
485   bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
486 
487 #ifndef V8_INTERPRETED_REGEXP
488   DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
489   do {
490     EnsureCompiledIrregexp(regexp, subject, is_one_byte);
491     Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
492     // The stack is used to allocate registers for the compiled regexp code.
493     // This means that in case of failure, the output registers array is left
494     // untouched and contains the capture results from the previous successful
495     // match.  We can use that to set the last match info lazily.
496     NativeRegExpMacroAssembler::Result res =
497         NativeRegExpMacroAssembler::Match(code,
498                                           subject,
499                                           output,
500                                           output_size,
501                                           index,
502                                           isolate);
503     if (res != NativeRegExpMacroAssembler::RETRY) {
504       DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
505              isolate->has_pending_exception());
506       STATIC_ASSERT(
507           static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
508       STATIC_ASSERT(
509           static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
510       STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
511                     == RE_EXCEPTION);
512       return static_cast<IrregexpResult>(res);
513     }
514     // If result is RETRY, the string has changed representation, and we
515     // must restart from scratch.
516     // In this case, it means we must make sure we are prepared to handle
517     // the, potentially, different subject (the string can switch between
518     // being internal and external, and even between being Latin1 and UC16,
519     // but the characters are always the same).
520     IrregexpPrepare(regexp, subject);
521     is_one_byte = subject->IsOneByteRepresentationUnderneath();
522   } while (true);
523   UNREACHABLE();
524   return RE_EXCEPTION;
525 #else  // V8_INTERPRETED_REGEXP
526 
527   DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
528   // We must have done EnsureCompiledIrregexp, so we can get the number of
529   // registers.
530   int number_of_capture_registers =
531       (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
532   int32_t* raw_output = &output[number_of_capture_registers];
533   // We do not touch the actual capture result registers until we know there
534   // has been a match so that we can use those capture results to set the
535   // last match info.
536   for (int i = number_of_capture_registers - 1; i >= 0; i--) {
537     raw_output[i] = -1;
538   }
539   Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
540                                isolate);
541 
542   IrregexpResult result = IrregexpInterpreter::Match(isolate,
543                                                      byte_codes,
544                                                      subject,
545                                                      raw_output,
546                                                      index);
547   if (result == RE_SUCCESS) {
548     // Copy capture results to the start of the registers array.
549     MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
550   }
551   if (result == RE_EXCEPTION) {
552     DCHECK(!isolate->has_pending_exception());
553     isolate->StackOverflow();
554   }
555   return result;
556 #endif  // V8_INTERPRETED_REGEXP
557 }
558 
559 
IrregexpExec(Handle<JSRegExp> regexp,Handle<String> subject,int previous_index,Handle<JSArray> last_match_info)560 MaybeHandle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
561                                              Handle<String> subject,
562                                              int previous_index,
563                                              Handle<JSArray> last_match_info) {
564   Isolate* isolate = regexp->GetIsolate();
565   DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
566 
567   // Prepare space for the return values.
568 #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
569   if (FLAG_trace_regexp_bytecodes) {
570     String* pattern = regexp->Pattern();
571     PrintF("\n\nRegexp match:   /%s/\n\n", pattern->ToCString().get());
572     PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
573   }
574 #endif
575   int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
576   if (required_registers < 0) {
577     // Compiling failed with an exception.
578     DCHECK(isolate->has_pending_exception());
579     return MaybeHandle<Object>();
580   }
581 
582   int32_t* output_registers = NULL;
583   if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
584     output_registers = NewArray<int32_t>(required_registers);
585   }
586   base::SmartArrayPointer<int32_t> auto_release(output_registers);
587   if (output_registers == NULL) {
588     output_registers = isolate->jsregexp_static_offsets_vector();
589   }
590 
591   int res = RegExpImpl::IrregexpExecRaw(
592       regexp, subject, previous_index, output_registers, required_registers);
593   if (res == RE_SUCCESS) {
594     int capture_count =
595         IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
596     return SetLastMatchInfo(
597         last_match_info, subject, capture_count, output_registers);
598   }
599   if (res == RE_EXCEPTION) {
600     DCHECK(isolate->has_pending_exception());
601     return MaybeHandle<Object>();
602   }
603   DCHECK(res == RE_FAILURE);
604   return isolate->factory()->null_value();
605 }
606 
607 
EnsureSize(Handle<JSArray> array,uint32_t minimum_size)608 static void EnsureSize(Handle<JSArray> array, uint32_t minimum_size) {
609   if (static_cast<uint32_t>(array->elements()->length()) < minimum_size) {
610     JSArray::SetLength(array, minimum_size);
611   }
612 }
613 
614 
SetLastMatchInfo(Handle<JSArray> last_match_info,Handle<String> subject,int capture_count,int32_t * match)615 Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info,
616                                              Handle<String> subject,
617                                              int capture_count,
618                                              int32_t* match) {
619   DCHECK(last_match_info->HasFastObjectElements());
620   int capture_register_count = (capture_count + 1) * 2;
621   EnsureSize(last_match_info, capture_register_count + kLastMatchOverhead);
622   DisallowHeapAllocation no_allocation;
623   FixedArray* array = FixedArray::cast(last_match_info->elements());
624   if (match != NULL) {
625     for (int i = 0; i < capture_register_count; i += 2) {
626       SetCapture(array, i, match[i]);
627       SetCapture(array, i + 1, match[i + 1]);
628     }
629   }
630   SetLastCaptureCount(array, capture_register_count);
631   SetLastSubject(array, *subject);
632   SetLastInput(array, *subject);
633   return last_match_info;
634 }
635 
636 
GlobalCache(Handle<JSRegExp> regexp,Handle<String> subject,bool is_global,Isolate * isolate)637 RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
638                                      Handle<String> subject,
639                                      bool is_global,
640                                      Isolate* isolate)
641   : register_array_(NULL),
642     register_array_size_(0),
643     regexp_(regexp),
644     subject_(subject) {
645 #ifdef V8_INTERPRETED_REGEXP
646   bool interpreted = true;
647 #else
648   bool interpreted = false;
649 #endif  // V8_INTERPRETED_REGEXP
650 
651   if (regexp_->TypeTag() == JSRegExp::ATOM) {
652     static const int kAtomRegistersPerMatch = 2;
653     registers_per_match_ = kAtomRegistersPerMatch;
654     // There is no distinction between interpreted and native for atom regexps.
655     interpreted = false;
656   } else {
657     registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
658     if (registers_per_match_ < 0) {
659       num_matches_ = -1;  // Signal exception.
660       return;
661     }
662   }
663 
664   if (is_global && !interpreted) {
665     register_array_size_ =
666         Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
667     max_matches_ = register_array_size_ / registers_per_match_;
668   } else {
669     // Global loop in interpreted regexp is not implemented.  We choose
670     // the size of the offsets vector so that it can only store one match.
671     register_array_size_ = registers_per_match_;
672     max_matches_ = 1;
673   }
674 
675   if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
676     register_array_ = NewArray<int32_t>(register_array_size_);
677   } else {
678     register_array_ = isolate->jsregexp_static_offsets_vector();
679   }
680 
681   // Set state so that fetching the results the first time triggers a call
682   // to the compiled regexp.
683   current_match_index_ = max_matches_ - 1;
684   num_matches_ = max_matches_;
685   DCHECK(registers_per_match_ >= 2);  // Each match has at least one capture.
686   DCHECK_GE(register_array_size_, registers_per_match_);
687   int32_t* last_match =
688       &register_array_[current_match_index_ * registers_per_match_];
689   last_match[0] = -1;
690   last_match[1] = 0;
691 }
692 
693 
694 // -------------------------------------------------------------------
695 // Implementation of the Irregexp regular expression engine.
696 //
697 // The Irregexp regular expression engine is intended to be a complete
698 // implementation of ECMAScript regular expressions.  It generates either
699 // bytecodes or native code.
700 
701 //   The Irregexp regexp engine is structured in three steps.
702 //   1) The parser generates an abstract syntax tree.  See ast.cc.
703 //   2) From the AST a node network is created.  The nodes are all
704 //      subclasses of RegExpNode.  The nodes represent states when
705 //      executing a regular expression.  Several optimizations are
706 //      performed on the node network.
707 //   3) From the nodes we generate either byte codes or native code
708 //      that can actually execute the regular expression (perform
709 //      the search).  The code generation step is described in more
710 //      detail below.
711 
712 // Code generation.
713 //
714 //   The nodes are divided into four main categories.
715 //   * Choice nodes
716 //        These represent places where the regular expression can
717 //        match in more than one way.  For example on entry to an
718 //        alternation (foo|bar) or a repetition (*, +, ? or {}).
719 //   * Action nodes
720 //        These represent places where some action should be
721 //        performed.  Examples include recording the current position
722 //        in the input string to a register (in order to implement
723 //        captures) or other actions on register for example in order
724 //        to implement the counters needed for {} repetitions.
725 //   * Matching nodes
726 //        These attempt to match some element part of the input string.
727 //        Examples of elements include character classes, plain strings
728 //        or back references.
729 //   * End nodes
730 //        These are used to implement the actions required on finding
731 //        a successful match or failing to find a match.
732 //
733 //   The code generated (whether as byte codes or native code) maintains
734 //   some state as it runs.  This consists of the following elements:
735 //
736 //   * The capture registers.  Used for string captures.
737 //   * Other registers.  Used for counters etc.
738 //   * The current position.
739 //   * The stack of backtracking information.  Used when a matching node
740 //     fails to find a match and needs to try an alternative.
741 //
742 // Conceptual regular expression execution model:
743 //
744 //   There is a simple conceptual model of regular expression execution
745 //   which will be presented first.  The actual code generated is a more
746 //   efficient simulation of the simple conceptual model:
747 //
748 //   * Choice nodes are implemented as follows:
749 //     For each choice except the last {
750 //       push current position
751 //       push backtrack code location
752 //       <generate code to test for choice>
753 //       backtrack code location:
754 //       pop current position
755 //     }
756 //     <generate code to test for last choice>
757 //
758 //   * Actions nodes are generated as follows
759 //     <push affected registers on backtrack stack>
760 //     <generate code to perform action>
761 //     push backtrack code location
762 //     <generate code to test for following nodes>
763 //     backtrack code location:
764 //     <pop affected registers to restore their state>
765 //     <pop backtrack location from stack and go to it>
766 //
767 //   * Matching nodes are generated as follows:
768 //     if input string matches at current position
769 //       update current position
770 //       <generate code to test for following nodes>
771 //     else
772 //       <pop backtrack location from stack and go to it>
773 //
774 //   Thus it can be seen that the current position is saved and restored
775 //   by the choice nodes, whereas the registers are saved and restored by
776 //   by the action nodes that manipulate them.
777 //
778 //   The other interesting aspect of this model is that nodes are generated
779 //   at the point where they are needed by a recursive call to Emit().  If
780 //   the node has already been code generated then the Emit() call will
781 //   generate a jump to the previously generated code instead.  In order to
782 //   limit recursion it is possible for the Emit() function to put the node
783 //   on a work list for later generation and instead generate a jump.  The
784 //   destination of the jump is resolved later when the code is generated.
785 //
786 // Actual regular expression code generation.
787 //
788 //   Code generation is actually more complicated than the above.  In order
789 //   to improve the efficiency of the generated code some optimizations are
790 //   performed
791 //
792 //   * Choice nodes have 1-character lookahead.
793 //     A choice node looks at the following character and eliminates some of
794 //     the choices immediately based on that character.  This is not yet
795 //     implemented.
796 //   * Simple greedy loops store reduced backtracking information.
797 //     A quantifier like /.*foo/m will greedily match the whole input.  It will
798 //     then need to backtrack to a point where it can match "foo".  The naive
799 //     implementation of this would push each character position onto the
800 //     backtracking stack, then pop them off one by one.  This would use space
801 //     proportional to the length of the input string.  However since the "."
802 //     can only match in one way and always has a constant length (in this case
803 //     of 1) it suffices to store the current position on the top of the stack
804 //     once.  Matching now becomes merely incrementing the current position and
805 //     backtracking becomes decrementing the current position and checking the
806 //     result against the stored current position.  This is faster and saves
807 //     space.
808 //   * The current state is virtualized.
809 //     This is used to defer expensive operations until it is clear that they
810 //     are needed and to generate code for a node more than once, allowing
811 //     specialized an efficient versions of the code to be created. This is
812 //     explained in the section below.
813 //
814 // Execution state virtualization.
815 //
816 //   Instead of emitting code, nodes that manipulate the state can record their
817 //   manipulation in an object called the Trace.  The Trace object can record a
818 //   current position offset, an optional backtrack code location on the top of
819 //   the virtualized backtrack stack and some register changes.  When a node is
820 //   to be emitted it can flush the Trace or update it.  Flushing the Trace
821 //   will emit code to bring the actual state into line with the virtual state.
822 //   Avoiding flushing the state can postpone some work (e.g. updates of capture
823 //   registers).  Postponing work can save time when executing the regular
824 //   expression since it may be found that the work never has to be done as a
825 //   failure to match can occur.  In addition it is much faster to jump to a
826 //   known backtrack code location than it is to pop an unknown backtrack
827 //   location from the stack and jump there.
828 //
829 //   The virtual state found in the Trace affects code generation.  For example
830 //   the virtual state contains the difference between the actual current
831 //   position and the virtual current position, and matching code needs to use
832 //   this offset to attempt a match in the correct location of the input
833 //   string.  Therefore code generated for a non-trivial trace is specialized
834 //   to that trace.  The code generator therefore has the ability to generate
835 //   code for each node several times.  In order to limit the size of the
836 //   generated code there is an arbitrary limit on how many specialized sets of
837 //   code may be generated for a given node.  If the limit is reached, the
838 //   trace is flushed and a generic version of the code for a node is emitted.
839 //   This is subsequently used for that node.  The code emitted for non-generic
840 //   trace is not recorded in the node and so it cannot currently be reused in
841 //   the event that code generation is requested for an identical trace.
842 
843 
AppendToText(RegExpText * text,Zone * zone)844 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
845   UNREACHABLE();
846 }
847 
848 
AppendToText(RegExpText * text,Zone * zone)849 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
850   text->AddElement(TextElement::Atom(this), zone);
851 }
852 
853 
AppendToText(RegExpText * text,Zone * zone)854 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
855   text->AddElement(TextElement::CharClass(this), zone);
856 }
857 
858 
AppendToText(RegExpText * text,Zone * zone)859 void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
860   for (int i = 0; i < elements()->length(); i++)
861     text->AddElement(elements()->at(i), zone);
862 }
863 
864 
Atom(RegExpAtom * atom)865 TextElement TextElement::Atom(RegExpAtom* atom) {
866   return TextElement(ATOM, atom);
867 }
868 
869 
CharClass(RegExpCharacterClass * char_class)870 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
871   return TextElement(CHAR_CLASS, char_class);
872 }
873 
874 
length() const875 int TextElement::length() const {
876   switch (text_type()) {
877     case ATOM:
878       return atom()->length();
879 
880     case CHAR_CLASS:
881       return 1;
882   }
883   UNREACHABLE();
884   return 0;
885 }
886 
887 
GetTable(bool ignore_case)888 DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
889   if (table_ == NULL) {
890     table_ = new(zone()) DispatchTable(zone());
891     DispatchTableConstructor cons(table_, ignore_case, zone());
892     cons.BuildTable(this);
893   }
894   return table_;
895 }
896 
897 
898 class FrequencyCollator {
899  public:
FrequencyCollator()900   FrequencyCollator() : total_samples_(0) {
901     for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
902       frequencies_[i] = CharacterFrequency(i);
903     }
904   }
905 
CountCharacter(int character)906   void CountCharacter(int character) {
907     int index = (character & RegExpMacroAssembler::kTableMask);
908     frequencies_[index].Increment();
909     total_samples_++;
910   }
911 
912   // Does not measure in percent, but rather per-128 (the table size from the
913   // regexp macro assembler).
Frequency(int in_character)914   int Frequency(int in_character) {
915     DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
916     if (total_samples_ < 1) return 1;  // Division by zero.
917     int freq_in_per128 =
918         (frequencies_[in_character].counter() * 128) / total_samples_;
919     return freq_in_per128;
920   }
921 
922  private:
923   class CharacterFrequency {
924    public:
CharacterFrequency()925     CharacterFrequency() : counter_(0), character_(-1) { }
CharacterFrequency(int character)926     explicit CharacterFrequency(int character)
927         : counter_(0), character_(character) { }
928 
Increment()929     void Increment() { counter_++; }
counter()930     int counter() { return counter_; }
character()931     int character() { return character_; }
932 
933    private:
934     int counter_;
935     int character_;
936   };
937 
938 
939  private:
940   CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
941   int total_samples_;
942 };
943 
944 
945 class RegExpCompiler {
946  public:
947   RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
948                  bool ignore_case, bool is_one_byte);
949 
AllocateRegister()950   int AllocateRegister() {
951     if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
952       reg_exp_too_big_ = true;
953       return next_register_;
954     }
955     return next_register_++;
956   }
957 
958   RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
959                                            RegExpNode* start,
960                                            int capture_count,
961                                            Handle<String> pattern);
962 
AddWork(RegExpNode * node)963   inline void AddWork(RegExpNode* node) {
964     if (!node->on_work_list() && !node->label()->is_bound()) {
965       node->set_on_work_list(true);
966       work_list_->Add(node);
967     }
968   }
969 
970   static const int kImplementationOffset = 0;
971   static const int kNumberOfRegistersOffset = 0;
972   static const int kCodeOffset = 1;
973 
macro_assembler()974   RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
accept()975   EndNode* accept() { return accept_; }
976 
977   static const int kMaxRecursion = 100;
recursion_depth()978   inline int recursion_depth() { return recursion_depth_; }
IncrementRecursionDepth()979   inline void IncrementRecursionDepth() { recursion_depth_++; }
DecrementRecursionDepth()980   inline void DecrementRecursionDepth() { recursion_depth_--; }
981 
SetRegExpTooBig()982   void SetRegExpTooBig() { reg_exp_too_big_ = true; }
983 
ignore_case()984   inline bool ignore_case() { return ignore_case_; }
one_byte()985   inline bool one_byte() { return one_byte_; }
optimize()986   inline bool optimize() { return optimize_; }
set_optimize(bool value)987   inline void set_optimize(bool value) { optimize_ = value; }
limiting_recursion()988   inline bool limiting_recursion() { return limiting_recursion_; }
set_limiting_recursion(bool value)989   inline void set_limiting_recursion(bool value) {
990     limiting_recursion_ = value;
991   }
read_backward()992   bool read_backward() { return read_backward_; }
set_read_backward(bool value)993   void set_read_backward(bool value) { read_backward_ = value; }
frequency_collator()994   FrequencyCollator* frequency_collator() { return &frequency_collator_; }
995 
current_expansion_factor()996   int current_expansion_factor() { return current_expansion_factor_; }
set_current_expansion_factor(int value)997   void set_current_expansion_factor(int value) {
998     current_expansion_factor_ = value;
999   }
1000 
isolate() const1001   Isolate* isolate() const { return isolate_; }
zone() const1002   Zone* zone() const { return zone_; }
1003 
1004   static const int kNoRegister = -1;
1005 
1006  private:
1007   EndNode* accept_;
1008   int next_register_;
1009   List<RegExpNode*>* work_list_;
1010   int recursion_depth_;
1011   RegExpMacroAssembler* macro_assembler_;
1012   bool ignore_case_;
1013   bool one_byte_;
1014   bool reg_exp_too_big_;
1015   bool limiting_recursion_;
1016   bool optimize_;
1017   bool read_backward_;
1018   int current_expansion_factor_;
1019   FrequencyCollator frequency_collator_;
1020   Isolate* isolate_;
1021   Zone* zone_;
1022 };
1023 
1024 
1025 class RecursionCheck {
1026  public:
RecursionCheck(RegExpCompiler * compiler)1027   explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1028     compiler->IncrementRecursionDepth();
1029   }
~RecursionCheck()1030   ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1031  private:
1032   RegExpCompiler* compiler_;
1033 };
1034 
1035 
IrregexpRegExpTooBig(Isolate * isolate)1036 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1037   return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1038 }
1039 
1040 
1041 // Attempts to compile the regexp using an Irregexp code generator.  Returns
1042 // a fixed array or a null handle depending on whether it succeeded.
RegExpCompiler(Isolate * isolate,Zone * zone,int capture_count,bool ignore_case,bool one_byte)1043 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
1044                                bool ignore_case, bool one_byte)
1045     : next_register_(2 * (capture_count + 1)),
1046       work_list_(NULL),
1047       recursion_depth_(0),
1048       ignore_case_(ignore_case),
1049       one_byte_(one_byte),
1050       reg_exp_too_big_(false),
1051       limiting_recursion_(false),
1052       optimize_(FLAG_regexp_optimization),
1053       read_backward_(false),
1054       current_expansion_factor_(1),
1055       frequency_collator_(),
1056       isolate_(isolate),
1057       zone_(zone) {
1058   accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1059   DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
1060 }
1061 
1062 
Assemble(RegExpMacroAssembler * macro_assembler,RegExpNode * start,int capture_count,Handle<String> pattern)1063 RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1064     RegExpMacroAssembler* macro_assembler,
1065     RegExpNode* start,
1066     int capture_count,
1067     Handle<String> pattern) {
1068   Heap* heap = pattern->GetHeap();
1069 
1070 #ifdef DEBUG
1071   if (FLAG_trace_regexp_assembler)
1072     macro_assembler_ =
1073         new RegExpMacroAssemblerTracer(isolate(), macro_assembler);
1074   else
1075 #endif
1076     macro_assembler_ = macro_assembler;
1077 
1078   List <RegExpNode*> work_list(0);
1079   work_list_ = &work_list;
1080   Label fail;
1081   macro_assembler_->PushBacktrack(&fail);
1082   Trace new_trace;
1083   start->Emit(this, &new_trace);
1084   macro_assembler_->Bind(&fail);
1085   macro_assembler_->Fail();
1086   while (!work_list.is_empty()) {
1087     RegExpNode* node = work_list.RemoveLast();
1088     node->set_on_work_list(false);
1089     if (!node->label()->is_bound()) node->Emit(this, &new_trace);
1090   }
1091   if (reg_exp_too_big_) {
1092     macro_assembler_->AbortedCodeGeneration();
1093     return IrregexpRegExpTooBig(isolate_);
1094   }
1095 
1096   Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1097   heap->IncreaseTotalRegexpCodeGenerated(code->Size());
1098   work_list_ = NULL;
1099 #ifdef ENABLE_DISASSEMBLER
1100   if (FLAG_print_code) {
1101     CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer());
1102     OFStream os(trace_scope.file());
1103     Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
1104   }
1105 #endif
1106 #ifdef DEBUG
1107   if (FLAG_trace_regexp_assembler) {
1108     delete macro_assembler_;
1109   }
1110 #endif
1111   return RegExpEngine::CompilationResult(*code, next_register_);
1112 }
1113 
1114 
Mentions(int that)1115 bool Trace::DeferredAction::Mentions(int that) {
1116   if (action_type() == ActionNode::CLEAR_CAPTURES) {
1117     Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1118     return range.Contains(that);
1119   } else {
1120     return reg() == that;
1121   }
1122 }
1123 
1124 
mentions_reg(int reg)1125 bool Trace::mentions_reg(int reg) {
1126   for (DeferredAction* action = actions_;
1127        action != NULL;
1128        action = action->next()) {
1129     if (action->Mentions(reg))
1130       return true;
1131   }
1132   return false;
1133 }
1134 
1135 
GetStoredPosition(int reg,int * cp_offset)1136 bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1137   DCHECK_EQ(0, *cp_offset);
1138   for (DeferredAction* action = actions_;
1139        action != NULL;
1140        action = action->next()) {
1141     if (action->Mentions(reg)) {
1142       if (action->action_type() == ActionNode::STORE_POSITION) {
1143         *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1144         return true;
1145       } else {
1146         return false;
1147       }
1148     }
1149   }
1150   return false;
1151 }
1152 
1153 
FindAffectedRegisters(OutSet * affected_registers,Zone * zone)1154 int Trace::FindAffectedRegisters(OutSet* affected_registers,
1155                                  Zone* zone) {
1156   int max_register = RegExpCompiler::kNoRegister;
1157   for (DeferredAction* action = actions_;
1158        action != NULL;
1159        action = action->next()) {
1160     if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1161       Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1162       for (int i = range.from(); i <= range.to(); i++)
1163         affected_registers->Set(i, zone);
1164       if (range.to() > max_register) max_register = range.to();
1165     } else {
1166       affected_registers->Set(action->reg(), zone);
1167       if (action->reg() > max_register) max_register = action->reg();
1168     }
1169   }
1170   return max_register;
1171 }
1172 
1173 
RestoreAffectedRegisters(RegExpMacroAssembler * assembler,int max_register,const OutSet & registers_to_pop,const OutSet & registers_to_clear)1174 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1175                                      int max_register,
1176                                      const OutSet& registers_to_pop,
1177                                      const OutSet& registers_to_clear) {
1178   for (int reg = max_register; reg >= 0; reg--) {
1179     if (registers_to_pop.Get(reg)) {
1180       assembler->PopRegister(reg);
1181     } else if (registers_to_clear.Get(reg)) {
1182       int clear_to = reg;
1183       while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1184         reg--;
1185       }
1186       assembler->ClearRegisters(reg, clear_to);
1187     }
1188   }
1189 }
1190 
1191 
PerformDeferredActions(RegExpMacroAssembler * assembler,int max_register,const OutSet & affected_registers,OutSet * registers_to_pop,OutSet * registers_to_clear,Zone * zone)1192 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1193                                    int max_register,
1194                                    const OutSet& affected_registers,
1195                                    OutSet* registers_to_pop,
1196                                    OutSet* registers_to_clear,
1197                                    Zone* zone) {
1198   // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1199   const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1200 
1201   // Count pushes performed to force a stack limit check occasionally.
1202   int pushes = 0;
1203 
1204   for (int reg = 0; reg <= max_register; reg++) {
1205     if (!affected_registers.Get(reg)) {
1206       continue;
1207     }
1208 
1209     // The chronologically first deferred action in the trace
1210     // is used to infer the action needed to restore a register
1211     // to its previous state (or not, if it's safe to ignore it).
1212     enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1213     DeferredActionUndoType undo_action = IGNORE;
1214 
1215     int value = 0;
1216     bool absolute = false;
1217     bool clear = false;
1218     static const int kNoStore = kMinInt;
1219     int store_position = kNoStore;
1220     // This is a little tricky because we are scanning the actions in reverse
1221     // historical order (newest first).
1222     for (DeferredAction* action = actions_;
1223          action != NULL;
1224          action = action->next()) {
1225       if (action->Mentions(reg)) {
1226         switch (action->action_type()) {
1227           case ActionNode::SET_REGISTER: {
1228             Trace::DeferredSetRegister* psr =
1229                 static_cast<Trace::DeferredSetRegister*>(action);
1230             if (!absolute) {
1231               value += psr->value();
1232               absolute = true;
1233             }
1234             // SET_REGISTER is currently only used for newly introduced loop
1235             // counters. They can have a significant previous value if they
1236             // occour in a loop. TODO(lrn): Propagate this information, so
1237             // we can set undo_action to IGNORE if we know there is no value to
1238             // restore.
1239             undo_action = RESTORE;
1240             DCHECK_EQ(store_position, kNoStore);
1241             DCHECK(!clear);
1242             break;
1243           }
1244           case ActionNode::INCREMENT_REGISTER:
1245             if (!absolute) {
1246               value++;
1247             }
1248             DCHECK_EQ(store_position, kNoStore);
1249             DCHECK(!clear);
1250             undo_action = RESTORE;
1251             break;
1252           case ActionNode::STORE_POSITION: {
1253             Trace::DeferredCapture* pc =
1254                 static_cast<Trace::DeferredCapture*>(action);
1255             if (!clear && store_position == kNoStore) {
1256               store_position = pc->cp_offset();
1257             }
1258 
1259             // For captures we know that stores and clears alternate.
1260             // Other register, are never cleared, and if the occur
1261             // inside a loop, they might be assigned more than once.
1262             if (reg <= 1) {
1263               // Registers zero and one, aka "capture zero", is
1264               // always set correctly if we succeed. There is no
1265               // need to undo a setting on backtrack, because we
1266               // will set it again or fail.
1267               undo_action = IGNORE;
1268             } else {
1269               undo_action = pc->is_capture() ? CLEAR : RESTORE;
1270             }
1271             DCHECK(!absolute);
1272             DCHECK_EQ(value, 0);
1273             break;
1274           }
1275           case ActionNode::CLEAR_CAPTURES: {
1276             // Since we're scanning in reverse order, if we've already
1277             // set the position we have to ignore historically earlier
1278             // clearing operations.
1279             if (store_position == kNoStore) {
1280               clear = true;
1281             }
1282             undo_action = RESTORE;
1283             DCHECK(!absolute);
1284             DCHECK_EQ(value, 0);
1285             break;
1286           }
1287           default:
1288             UNREACHABLE();
1289             break;
1290         }
1291       }
1292     }
1293     // Prepare for the undo-action (e.g., push if it's going to be popped).
1294     if (undo_action == RESTORE) {
1295       pushes++;
1296       RegExpMacroAssembler::StackCheckFlag stack_check =
1297           RegExpMacroAssembler::kNoStackLimitCheck;
1298       if (pushes == push_limit) {
1299         stack_check = RegExpMacroAssembler::kCheckStackLimit;
1300         pushes = 0;
1301       }
1302 
1303       assembler->PushRegister(reg, stack_check);
1304       registers_to_pop->Set(reg, zone);
1305     } else if (undo_action == CLEAR) {
1306       registers_to_clear->Set(reg, zone);
1307     }
1308     // Perform the chronologically last action (or accumulated increment)
1309     // for the register.
1310     if (store_position != kNoStore) {
1311       assembler->WriteCurrentPositionToRegister(reg, store_position);
1312     } else if (clear) {
1313       assembler->ClearRegisters(reg, reg);
1314     } else if (absolute) {
1315       assembler->SetRegister(reg, value);
1316     } else if (value != 0) {
1317       assembler->AdvanceRegister(reg, value);
1318     }
1319   }
1320 }
1321 
1322 
1323 // This is called as we come into a loop choice node and some other tricky
1324 // nodes.  It normalizes the state of the code generator to ensure we can
1325 // generate generic code.
Flush(RegExpCompiler * compiler,RegExpNode * successor)1326 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1327   RegExpMacroAssembler* assembler = compiler->macro_assembler();
1328 
1329   DCHECK(!is_trivial());
1330 
1331   if (actions_ == NULL && backtrack() == NULL) {
1332     // Here we just have some deferred cp advances to fix and we are back to
1333     // a normal situation.  We may also have to forget some information gained
1334     // through a quick check that was already performed.
1335     if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1336     // Create a new trivial state and generate the node with that.
1337     Trace new_state;
1338     successor->Emit(compiler, &new_state);
1339     return;
1340   }
1341 
1342   // Generate deferred actions here along with code to undo them again.
1343   OutSet affected_registers;
1344 
1345   if (backtrack() != NULL) {
1346     // Here we have a concrete backtrack location.  These are set up by choice
1347     // nodes and so they indicate that we have a deferred save of the current
1348     // position which we may need to emit here.
1349     assembler->PushCurrentPosition();
1350   }
1351 
1352   int max_register = FindAffectedRegisters(&affected_registers,
1353                                            compiler->zone());
1354   OutSet registers_to_pop;
1355   OutSet registers_to_clear;
1356   PerformDeferredActions(assembler,
1357                          max_register,
1358                          affected_registers,
1359                          &registers_to_pop,
1360                          &registers_to_clear,
1361                          compiler->zone());
1362   if (cp_offset_ != 0) {
1363     assembler->AdvanceCurrentPosition(cp_offset_);
1364   }
1365 
1366   // Create a new trivial state and generate the node with that.
1367   Label undo;
1368   assembler->PushBacktrack(&undo);
1369   if (successor->KeepRecursing(compiler)) {
1370     Trace new_state;
1371     successor->Emit(compiler, &new_state);
1372   } else {
1373     compiler->AddWork(successor);
1374     assembler->GoTo(successor->label());
1375   }
1376 
1377   // On backtrack we need to restore state.
1378   assembler->Bind(&undo);
1379   RestoreAffectedRegisters(assembler,
1380                            max_register,
1381                            registers_to_pop,
1382                            registers_to_clear);
1383   if (backtrack() == NULL) {
1384     assembler->Backtrack();
1385   } else {
1386     assembler->PopCurrentPosition();
1387     assembler->GoTo(backtrack());
1388   }
1389 }
1390 
1391 
Emit(RegExpCompiler * compiler,Trace * trace)1392 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1393   RegExpMacroAssembler* assembler = compiler->macro_assembler();
1394 
1395   // Omit flushing the trace. We discard the entire stack frame anyway.
1396 
1397   if (!label()->is_bound()) {
1398     // We are completely independent of the trace, since we ignore it,
1399     // so this code can be used as the generic version.
1400     assembler->Bind(label());
1401   }
1402 
1403   // Throw away everything on the backtrack stack since the start
1404   // of the negative submatch and restore the character position.
1405   assembler->ReadCurrentPositionFromRegister(current_position_register_);
1406   assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1407   if (clear_capture_count_ > 0) {
1408     // Clear any captures that might have been performed during the success
1409     // of the body of the negative look-ahead.
1410     int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1411     assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1412   }
1413   // Now that we have unwound the stack we find at the top of the stack the
1414   // backtrack that the BeginSubmatch node got.
1415   assembler->Backtrack();
1416 }
1417 
1418 
Emit(RegExpCompiler * compiler,Trace * trace)1419 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1420   if (!trace->is_trivial()) {
1421     trace->Flush(compiler, this);
1422     return;
1423   }
1424   RegExpMacroAssembler* assembler = compiler->macro_assembler();
1425   if (!label()->is_bound()) {
1426     assembler->Bind(label());
1427   }
1428   switch (action_) {
1429     case ACCEPT:
1430       assembler->Succeed();
1431       return;
1432     case BACKTRACK:
1433       assembler->GoTo(trace->backtrack());
1434       return;
1435     case NEGATIVE_SUBMATCH_SUCCESS:
1436       // This case is handled in a different virtual method.
1437       UNREACHABLE();
1438   }
1439   UNIMPLEMENTED();
1440 }
1441 
1442 
AddGuard(Guard * guard,Zone * zone)1443 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
1444   if (guards_ == NULL)
1445     guards_ = new(zone) ZoneList<Guard*>(1, zone);
1446   guards_->Add(guard, zone);
1447 }
1448 
1449 
SetRegister(int reg,int val,RegExpNode * on_success)1450 ActionNode* ActionNode::SetRegister(int reg,
1451                                     int val,
1452                                     RegExpNode* on_success) {
1453   ActionNode* result =
1454       new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1455   result->data_.u_store_register.reg = reg;
1456   result->data_.u_store_register.value = val;
1457   return result;
1458 }
1459 
1460 
IncrementRegister(int reg,RegExpNode * on_success)1461 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1462   ActionNode* result =
1463       new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1464   result->data_.u_increment_register.reg = reg;
1465   return result;
1466 }
1467 
1468 
StorePosition(int reg,bool is_capture,RegExpNode * on_success)1469 ActionNode* ActionNode::StorePosition(int reg,
1470                                       bool is_capture,
1471                                       RegExpNode* on_success) {
1472   ActionNode* result =
1473       new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1474   result->data_.u_position_register.reg = reg;
1475   result->data_.u_position_register.is_capture = is_capture;
1476   return result;
1477 }
1478 
1479 
ClearCaptures(Interval range,RegExpNode * on_success)1480 ActionNode* ActionNode::ClearCaptures(Interval range,
1481                                       RegExpNode* on_success) {
1482   ActionNode* result =
1483       new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1484   result->data_.u_clear_captures.range_from = range.from();
1485   result->data_.u_clear_captures.range_to = range.to();
1486   return result;
1487 }
1488 
1489 
BeginSubmatch(int stack_reg,int position_reg,RegExpNode * on_success)1490 ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1491                                       int position_reg,
1492                                       RegExpNode* on_success) {
1493   ActionNode* result =
1494       new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1495   result->data_.u_submatch.stack_pointer_register = stack_reg;
1496   result->data_.u_submatch.current_position_register = position_reg;
1497   return result;
1498 }
1499 
1500 
PositiveSubmatchSuccess(int stack_reg,int position_reg,int clear_register_count,int clear_register_from,RegExpNode * on_success)1501 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1502                                                 int position_reg,
1503                                                 int clear_register_count,
1504                                                 int clear_register_from,
1505                                                 RegExpNode* on_success) {
1506   ActionNode* result =
1507       new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1508   result->data_.u_submatch.stack_pointer_register = stack_reg;
1509   result->data_.u_submatch.current_position_register = position_reg;
1510   result->data_.u_submatch.clear_register_count = clear_register_count;
1511   result->data_.u_submatch.clear_register_from = clear_register_from;
1512   return result;
1513 }
1514 
1515 
EmptyMatchCheck(int start_register,int repetition_register,int repetition_limit,RegExpNode * on_success)1516 ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1517                                         int repetition_register,
1518                                         int repetition_limit,
1519                                         RegExpNode* on_success) {
1520   ActionNode* result =
1521       new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1522   result->data_.u_empty_match_check.start_register = start_register;
1523   result->data_.u_empty_match_check.repetition_register = repetition_register;
1524   result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1525   return result;
1526 }
1527 
1528 
1529 #define DEFINE_ACCEPT(Type)                                          \
1530   void Type##Node::Accept(NodeVisitor* visitor) {                    \
1531     visitor->Visit##Type(this);                                      \
1532   }
FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)1533 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1534 #undef DEFINE_ACCEPT
1535 
1536 
1537 void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1538   visitor->VisitLoopChoice(this);
1539 }
1540 
1541 
1542 // -------------------------------------------------------------------
1543 // Emit code.
1544 
1545 
GenerateGuard(RegExpMacroAssembler * macro_assembler,Guard * guard,Trace * trace)1546 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1547                                Guard* guard,
1548                                Trace* trace) {
1549   switch (guard->op()) {
1550     case Guard::LT:
1551       DCHECK(!trace->mentions_reg(guard->reg()));
1552       macro_assembler->IfRegisterGE(guard->reg(),
1553                                     guard->value(),
1554                                     trace->backtrack());
1555       break;
1556     case Guard::GEQ:
1557       DCHECK(!trace->mentions_reg(guard->reg()));
1558       macro_assembler->IfRegisterLT(guard->reg(),
1559                                     guard->value(),
1560                                     trace->backtrack());
1561       break;
1562   }
1563 }
1564 
1565 
1566 // Returns the number of characters in the equivalence class, omitting those
1567 // that cannot occur in the source string because it is Latin1.
GetCaseIndependentLetters(Isolate * isolate,uc16 character,bool one_byte_subject,unibrow::uchar * letters)1568 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1569                                      bool one_byte_subject,
1570                                      unibrow::uchar* letters) {
1571   int length =
1572       isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1573   // Unibrow returns 0 or 1 for characters where case independence is
1574   // trivial.
1575   if (length == 0) {
1576     letters[0] = character;
1577     length = 1;
1578   }
1579 
1580   if (one_byte_subject) {
1581     int new_length = 0;
1582     for (int i = 0; i < length; i++) {
1583       if (letters[i] <= String::kMaxOneByteCharCode) {
1584         letters[new_length++] = letters[i];
1585       }
1586     }
1587     length = new_length;
1588   }
1589 
1590   return length;
1591 }
1592 
1593 
EmitSimpleCharacter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1594 static inline bool EmitSimpleCharacter(Isolate* isolate,
1595                                        RegExpCompiler* compiler,
1596                                        uc16 c,
1597                                        Label* on_failure,
1598                                        int cp_offset,
1599                                        bool check,
1600                                        bool preloaded) {
1601   RegExpMacroAssembler* assembler = compiler->macro_assembler();
1602   bool bound_checked = false;
1603   if (!preloaded) {
1604     assembler->LoadCurrentCharacter(
1605         cp_offset,
1606         on_failure,
1607         check);
1608     bound_checked = true;
1609   }
1610   assembler->CheckNotCharacter(c, on_failure);
1611   return bound_checked;
1612 }
1613 
1614 
1615 // Only emits non-letters (things that don't have case).  Only used for case
1616 // independent matches.
EmitAtomNonLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1617 static inline bool EmitAtomNonLetter(Isolate* isolate,
1618                                      RegExpCompiler* compiler,
1619                                      uc16 c,
1620                                      Label* on_failure,
1621                                      int cp_offset,
1622                                      bool check,
1623                                      bool preloaded) {
1624   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1625   bool one_byte = compiler->one_byte();
1626   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1627   int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1628   if (length < 1) {
1629     // This can't match.  Must be an one-byte subject and a non-one-byte
1630     // character.  We do not need to do anything since the one-byte pass
1631     // already handled this.
1632     return false;  // Bounds not checked.
1633   }
1634   bool checked = false;
1635   // We handle the length > 1 case in a later pass.
1636   if (length == 1) {
1637     if (one_byte && c > String::kMaxOneByteCharCodeU) {
1638       // Can't match - see above.
1639       return false;  // Bounds not checked.
1640     }
1641     if (!preloaded) {
1642       macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1643       checked = check;
1644     }
1645     macro_assembler->CheckNotCharacter(c, on_failure);
1646   }
1647   return checked;
1648 }
1649 
1650 
ShortCutEmitCharacterPair(RegExpMacroAssembler * macro_assembler,bool one_byte,uc16 c1,uc16 c2,Label * on_failure)1651 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1652                                       bool one_byte, uc16 c1, uc16 c2,
1653                                       Label* on_failure) {
1654   uc16 char_mask;
1655   if (one_byte) {
1656     char_mask = String::kMaxOneByteCharCode;
1657   } else {
1658     char_mask = String::kMaxUtf16CodeUnit;
1659   }
1660   uc16 exor = c1 ^ c2;
1661   // Check whether exor has only one bit set.
1662   if (((exor - 1) & exor) == 0) {
1663     // If c1 and c2 differ only by one bit.
1664     // Ecma262UnCanonicalize always gives the highest number last.
1665     DCHECK(c2 > c1);
1666     uc16 mask = char_mask ^ exor;
1667     macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1668     return true;
1669   }
1670   DCHECK(c2 > c1);
1671   uc16 diff = c2 - c1;
1672   if (((diff - 1) & diff) == 0 && c1 >= diff) {
1673     // If the characters differ by 2^n but don't differ by one bit then
1674     // subtract the difference from the found character, then do the or
1675     // trick.  We avoid the theoretical case where negative numbers are
1676     // involved in order to simplify code generation.
1677     uc16 mask = char_mask ^ diff;
1678     macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1679                                                     diff,
1680                                                     mask,
1681                                                     on_failure);
1682     return true;
1683   }
1684   return false;
1685 }
1686 
1687 
1688 typedef bool EmitCharacterFunction(Isolate* isolate,
1689                                    RegExpCompiler* compiler,
1690                                    uc16 c,
1691                                    Label* on_failure,
1692                                    int cp_offset,
1693                                    bool check,
1694                                    bool preloaded);
1695 
1696 // Only emits letters (things that have case).  Only used for case independent
1697 // matches.
EmitAtomLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1698 static inline bool EmitAtomLetter(Isolate* isolate,
1699                                   RegExpCompiler* compiler,
1700                                   uc16 c,
1701                                   Label* on_failure,
1702                                   int cp_offset,
1703                                   bool check,
1704                                   bool preloaded) {
1705   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1706   bool one_byte = compiler->one_byte();
1707   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1708   int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1709   if (length <= 1) return false;
1710   // We may not need to check against the end of the input string
1711   // if this character lies before a character that matched.
1712   if (!preloaded) {
1713     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1714   }
1715   Label ok;
1716   DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1717   switch (length) {
1718     case 2: {
1719       if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1720                                     chars[1], on_failure)) {
1721       } else {
1722         macro_assembler->CheckCharacter(chars[0], &ok);
1723         macro_assembler->CheckNotCharacter(chars[1], on_failure);
1724         macro_assembler->Bind(&ok);
1725       }
1726       break;
1727     }
1728     case 4:
1729       macro_assembler->CheckCharacter(chars[3], &ok);
1730       // Fall through!
1731     case 3:
1732       macro_assembler->CheckCharacter(chars[0], &ok);
1733       macro_assembler->CheckCharacter(chars[1], &ok);
1734       macro_assembler->CheckNotCharacter(chars[2], on_failure);
1735       macro_assembler->Bind(&ok);
1736       break;
1737     default:
1738       UNREACHABLE();
1739       break;
1740   }
1741   return true;
1742 }
1743 
1744 
EmitBoundaryTest(RegExpMacroAssembler * masm,int border,Label * fall_through,Label * above_or_equal,Label * below)1745 static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1746                              int border,
1747                              Label* fall_through,
1748                              Label* above_or_equal,
1749                              Label* below) {
1750   if (below != fall_through) {
1751     masm->CheckCharacterLT(border, below);
1752     if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1753   } else {
1754     masm->CheckCharacterGT(border - 1, above_or_equal);
1755   }
1756 }
1757 
1758 
EmitDoubleBoundaryTest(RegExpMacroAssembler * masm,int first,int last,Label * fall_through,Label * in_range,Label * out_of_range)1759 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1760                                    int first,
1761                                    int last,
1762                                    Label* fall_through,
1763                                    Label* in_range,
1764                                    Label* out_of_range) {
1765   if (in_range == fall_through) {
1766     if (first == last) {
1767       masm->CheckNotCharacter(first, out_of_range);
1768     } else {
1769       masm->CheckCharacterNotInRange(first, last, out_of_range);
1770     }
1771   } else {
1772     if (first == last) {
1773       masm->CheckCharacter(first, in_range);
1774     } else {
1775       masm->CheckCharacterInRange(first, last, in_range);
1776     }
1777     if (out_of_range != fall_through) masm->GoTo(out_of_range);
1778   }
1779 }
1780 
1781 
1782 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1783 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
EmitUseLookupTable(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,int min_char,Label * fall_through,Label * even_label,Label * odd_label)1784 static void EmitUseLookupTable(
1785     RegExpMacroAssembler* masm,
1786     ZoneList<int>* ranges,
1787     int start_index,
1788     int end_index,
1789     int min_char,
1790     Label* fall_through,
1791     Label* even_label,
1792     Label* odd_label) {
1793   static const int kSize = RegExpMacroAssembler::kTableSize;
1794   static const int kMask = RegExpMacroAssembler::kTableMask;
1795 
1796   int base = (min_char & ~kMask);
1797   USE(base);
1798 
1799   // Assert that everything is on one kTableSize page.
1800   for (int i = start_index; i <= end_index; i++) {
1801     DCHECK_EQ(ranges->at(i) & ~kMask, base);
1802   }
1803   DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1804 
1805   char templ[kSize];
1806   Label* on_bit_set;
1807   Label* on_bit_clear;
1808   int bit;
1809   if (even_label == fall_through) {
1810     on_bit_set = odd_label;
1811     on_bit_clear = even_label;
1812     bit = 1;
1813   } else {
1814     on_bit_set = even_label;
1815     on_bit_clear = odd_label;
1816     bit = 0;
1817   }
1818   for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1819     templ[i] = bit;
1820   }
1821   int j = 0;
1822   bit ^= 1;
1823   for (int i = start_index; i < end_index; i++) {
1824     for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1825       templ[j] = bit;
1826     }
1827     bit ^= 1;
1828   }
1829   for (int i = j; i < kSize; i++) {
1830     templ[i] = bit;
1831   }
1832   Factory* factory = masm->isolate()->factory();
1833   // TODO(erikcorry): Cache these.
1834   Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
1835   for (int i = 0; i < kSize; i++) {
1836     ba->set(i, templ[i]);
1837   }
1838   masm->CheckBitInTable(ba, on_bit_set);
1839   if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1840 }
1841 
1842 
CutOutRange(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,int cut_index,Label * even_label,Label * odd_label)1843 static void CutOutRange(RegExpMacroAssembler* masm,
1844                         ZoneList<int>* ranges,
1845                         int start_index,
1846                         int end_index,
1847                         int cut_index,
1848                         Label* even_label,
1849                         Label* odd_label) {
1850   bool odd = (((cut_index - start_index) & 1) == 1);
1851   Label* in_range_label = odd ? odd_label : even_label;
1852   Label dummy;
1853   EmitDoubleBoundaryTest(masm,
1854                          ranges->at(cut_index),
1855                          ranges->at(cut_index + 1) - 1,
1856                          &dummy,
1857                          in_range_label,
1858                          &dummy);
1859   DCHECK(!dummy.is_linked());
1860   // Cut out the single range by rewriting the array.  This creates a new
1861   // range that is a merger of the two ranges on either side of the one we
1862   // are cutting out.  The oddity of the labels is preserved.
1863   for (int j = cut_index; j > start_index; j--) {
1864     ranges->at(j) = ranges->at(j - 1);
1865   }
1866   for (int j = cut_index + 1; j < end_index; j++) {
1867     ranges->at(j) = ranges->at(j + 1);
1868   }
1869 }
1870 
1871 
1872 // Unicode case.  Split the search space into kSize spaces that are handled
1873 // with recursion.
SplitSearchSpace(ZoneList<int> * ranges,int start_index,int end_index,int * new_start_index,int * new_end_index,int * border)1874 static void SplitSearchSpace(ZoneList<int>* ranges,
1875                              int start_index,
1876                              int end_index,
1877                              int* new_start_index,
1878                              int* new_end_index,
1879                              int* border) {
1880   static const int kSize = RegExpMacroAssembler::kTableSize;
1881   static const int kMask = RegExpMacroAssembler::kTableMask;
1882 
1883   int first = ranges->at(start_index);
1884   int last = ranges->at(end_index) - 1;
1885 
1886   *new_start_index = start_index;
1887   *border = (ranges->at(start_index) & ~kMask) + kSize;
1888   while (*new_start_index < end_index) {
1889     if (ranges->at(*new_start_index) > *border) break;
1890     (*new_start_index)++;
1891   }
1892   // new_start_index is the index of the first edge that is beyond the
1893   // current kSize space.
1894 
1895   // For very large search spaces we do a binary chop search of the non-Latin1
1896   // space instead of just going to the end of the current kSize space.  The
1897   // heuristics are complicated a little by the fact that any 128-character
1898   // encoding space can be quickly tested with a table lookup, so we don't
1899   // wish to do binary chop search at a smaller granularity than that.  A
1900   // 128-character space can take up a lot of space in the ranges array if,
1901   // for example, we only want to match every second character (eg. the lower
1902   // case characters on some Unicode pages).
1903   int binary_chop_index = (end_index + start_index) / 2;
1904   // The first test ensures that we get to the code that handles the Latin1
1905   // range with a single not-taken branch, speeding up this important
1906   // character range (even non-Latin1 charset-based text has spaces and
1907   // punctuation).
1908   if (*border - 1 > String::kMaxOneByteCharCode &&  // Latin1 case.
1909       end_index - start_index > (*new_start_index - start_index) * 2 &&
1910       last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1911       ranges->at(binary_chop_index) >= first + 2 * kSize) {
1912     int scan_forward_for_section_border = binary_chop_index;;
1913     int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1914 
1915     while (scan_forward_for_section_border < end_index) {
1916       if (ranges->at(scan_forward_for_section_border) > new_border) {
1917         *new_start_index = scan_forward_for_section_border;
1918         *border = new_border;
1919         break;
1920       }
1921       scan_forward_for_section_border++;
1922     }
1923   }
1924 
1925   DCHECK(*new_start_index > start_index);
1926   *new_end_index = *new_start_index - 1;
1927   if (ranges->at(*new_end_index) == *border) {
1928     (*new_end_index)--;
1929   }
1930   if (*border >= ranges->at(end_index)) {
1931     *border = ranges->at(end_index);
1932     *new_start_index = end_index;  // Won't be used.
1933     *new_end_index = end_index - 1;
1934   }
1935 }
1936 
1937 
1938 // Gets a series of segment boundaries representing a character class.  If the
1939 // character is in the range between an even and an odd boundary (counting from
1940 // start_index) then go to even_label, otherwise go to odd_label.  We already
1941 // know that the character is in the range of min_char to max_char inclusive.
1942 // Either label can be NULL indicating backtracking.  Either label can also be
1943 // equal to the fall_through label.
GenerateBranches(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,uc16 min_char,uc16 max_char,Label * fall_through,Label * even_label,Label * odd_label)1944 static void GenerateBranches(RegExpMacroAssembler* masm,
1945                              ZoneList<int>* ranges,
1946                              int start_index,
1947                              int end_index,
1948                              uc16 min_char,
1949                              uc16 max_char,
1950                              Label* fall_through,
1951                              Label* even_label,
1952                              Label* odd_label) {
1953   int first = ranges->at(start_index);
1954   int last = ranges->at(end_index) - 1;
1955 
1956   DCHECK_LT(min_char, first);
1957 
1958   // Just need to test if the character is before or on-or-after
1959   // a particular character.
1960   if (start_index == end_index) {
1961     EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1962     return;
1963   }
1964 
1965   // Another almost trivial case:  There is one interval in the middle that is
1966   // different from the end intervals.
1967   if (start_index + 1 == end_index) {
1968     EmitDoubleBoundaryTest(
1969         masm, first, last, fall_through, even_label, odd_label);
1970     return;
1971   }
1972 
1973   // It's not worth using table lookup if there are very few intervals in the
1974   // character class.
1975   if (end_index - start_index <= 6) {
1976     // It is faster to test for individual characters, so we look for those
1977     // first, then try arbitrary ranges in the second round.
1978     static int kNoCutIndex = -1;
1979     int cut = kNoCutIndex;
1980     for (int i = start_index; i < end_index; i++) {
1981       if (ranges->at(i) == ranges->at(i + 1) - 1) {
1982         cut = i;
1983         break;
1984       }
1985     }
1986     if (cut == kNoCutIndex) cut = start_index;
1987     CutOutRange(
1988         masm, ranges, start_index, end_index, cut, even_label, odd_label);
1989     DCHECK_GE(end_index - start_index, 2);
1990     GenerateBranches(masm,
1991                      ranges,
1992                      start_index + 1,
1993                      end_index - 1,
1994                      min_char,
1995                      max_char,
1996                      fall_through,
1997                      even_label,
1998                      odd_label);
1999     return;
2000   }
2001 
2002   // If there are a lot of intervals in the regexp, then we will use tables to
2003   // determine whether the character is inside or outside the character class.
2004   static const int kBits = RegExpMacroAssembler::kTableSizeBits;
2005 
2006   if ((max_char >> kBits) == (min_char >> kBits)) {
2007     EmitUseLookupTable(masm,
2008                        ranges,
2009                        start_index,
2010                        end_index,
2011                        min_char,
2012                        fall_through,
2013                        even_label,
2014                        odd_label);
2015     return;
2016   }
2017 
2018   if ((min_char >> kBits) != (first >> kBits)) {
2019     masm->CheckCharacterLT(first, odd_label);
2020     GenerateBranches(masm,
2021                      ranges,
2022                      start_index + 1,
2023                      end_index,
2024                      first,
2025                      max_char,
2026                      fall_through,
2027                      odd_label,
2028                      even_label);
2029     return;
2030   }
2031 
2032   int new_start_index = 0;
2033   int new_end_index = 0;
2034   int border = 0;
2035 
2036   SplitSearchSpace(ranges,
2037                    start_index,
2038                    end_index,
2039                    &new_start_index,
2040                    &new_end_index,
2041                    &border);
2042 
2043   Label handle_rest;
2044   Label* above = &handle_rest;
2045   if (border == last + 1) {
2046     // We didn't find any section that started after the limit, so everything
2047     // above the border is one of the terminal labels.
2048     above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2049     DCHECK(new_end_index == end_index - 1);
2050   }
2051 
2052   DCHECK_LE(start_index, new_end_index);
2053   DCHECK_LE(new_start_index, end_index);
2054   DCHECK_LT(start_index, new_start_index);
2055   DCHECK_LT(new_end_index, end_index);
2056   DCHECK(new_end_index + 1 == new_start_index ||
2057          (new_end_index + 2 == new_start_index &&
2058           border == ranges->at(new_end_index + 1)));
2059   DCHECK_LT(min_char, border - 1);
2060   DCHECK_LT(border, max_char);
2061   DCHECK_LT(ranges->at(new_end_index), border);
2062   DCHECK(border < ranges->at(new_start_index) ||
2063          (border == ranges->at(new_start_index) &&
2064           new_start_index == end_index &&
2065           new_end_index == end_index - 1 &&
2066           border == last + 1));
2067   DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2068 
2069   masm->CheckCharacterGT(border - 1, above);
2070   Label dummy;
2071   GenerateBranches(masm,
2072                    ranges,
2073                    start_index,
2074                    new_end_index,
2075                    min_char,
2076                    border - 1,
2077                    &dummy,
2078                    even_label,
2079                    odd_label);
2080   if (handle_rest.is_linked()) {
2081     masm->Bind(&handle_rest);
2082     bool flip = (new_start_index & 1) != (start_index & 1);
2083     GenerateBranches(masm,
2084                      ranges,
2085                      new_start_index,
2086                      end_index,
2087                      border,
2088                      max_char,
2089                      &dummy,
2090                      flip ? odd_label : even_label,
2091                      flip ? even_label : odd_label);
2092   }
2093 }
2094 
2095 
EmitCharClass(RegExpMacroAssembler * macro_assembler,RegExpCharacterClass * cc,bool one_byte,Label * on_failure,int cp_offset,bool check_offset,bool preloaded,Zone * zone)2096 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2097                           RegExpCharacterClass* cc, bool one_byte,
2098                           Label* on_failure, int cp_offset, bool check_offset,
2099                           bool preloaded, Zone* zone) {
2100   ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2101   if (!CharacterRange::IsCanonical(ranges)) {
2102     CharacterRange::Canonicalize(ranges);
2103   }
2104 
2105   int max_char;
2106   if (one_byte) {
2107     max_char = String::kMaxOneByteCharCode;
2108   } else {
2109     max_char = String::kMaxUtf16CodeUnit;
2110   }
2111 
2112   int range_count = ranges->length();
2113 
2114   int last_valid_range = range_count - 1;
2115   while (last_valid_range >= 0) {
2116     CharacterRange& range = ranges->at(last_valid_range);
2117     if (range.from() <= max_char) {
2118       break;
2119     }
2120     last_valid_range--;
2121   }
2122 
2123   if (last_valid_range < 0) {
2124     if (!cc->is_negated()) {
2125       macro_assembler->GoTo(on_failure);
2126     }
2127     if (check_offset) {
2128       macro_assembler->CheckPosition(cp_offset, on_failure);
2129     }
2130     return;
2131   }
2132 
2133   if (last_valid_range == 0 &&
2134       ranges->at(0).IsEverything(max_char)) {
2135     if (cc->is_negated()) {
2136       macro_assembler->GoTo(on_failure);
2137     } else {
2138       // This is a common case hit by non-anchored expressions.
2139       if (check_offset) {
2140         macro_assembler->CheckPosition(cp_offset, on_failure);
2141       }
2142     }
2143     return;
2144   }
2145   if (last_valid_range == 0 &&
2146       !cc->is_negated() &&
2147       ranges->at(0).IsEverything(max_char)) {
2148     // This is a common case hit by non-anchored expressions.
2149     if (check_offset) {
2150       macro_assembler->CheckPosition(cp_offset, on_failure);
2151     }
2152     return;
2153   }
2154 
2155   if (!preloaded) {
2156     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2157   }
2158 
2159   if (cc->is_standard(zone) &&
2160         macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2161                                                     on_failure)) {
2162       return;
2163   }
2164 
2165 
2166   // A new list with ascending entries.  Each entry is a code unit
2167   // where there is a boundary between code units that are part of
2168   // the class and code units that are not.  Normally we insert an
2169   // entry at zero which goes to the failure label, but if there
2170   // was already one there we fall through for success on that entry.
2171   // Subsequent entries have alternating meaning (success/failure).
2172   ZoneList<int>* range_boundaries =
2173       new(zone) ZoneList<int>(last_valid_range, zone);
2174 
2175   bool zeroth_entry_is_failure = !cc->is_negated();
2176 
2177   for (int i = 0; i <= last_valid_range; i++) {
2178     CharacterRange& range = ranges->at(i);
2179     if (range.from() == 0) {
2180       DCHECK_EQ(i, 0);
2181       zeroth_entry_is_failure = !zeroth_entry_is_failure;
2182     } else {
2183       range_boundaries->Add(range.from(), zone);
2184     }
2185     range_boundaries->Add(range.to() + 1, zone);
2186   }
2187   int end_index = range_boundaries->length() - 1;
2188   if (range_boundaries->at(end_index) > max_char) {
2189     end_index--;
2190   }
2191 
2192   Label fall_through;
2193   GenerateBranches(macro_assembler,
2194                    range_boundaries,
2195                    0,  // start_index.
2196                    end_index,
2197                    0,  // min_char.
2198                    max_char,
2199                    &fall_through,
2200                    zeroth_entry_is_failure ? &fall_through : on_failure,
2201                    zeroth_entry_is_failure ? on_failure : &fall_through);
2202   macro_assembler->Bind(&fall_through);
2203 }
2204 
2205 
~RegExpNode()2206 RegExpNode::~RegExpNode() {
2207 }
2208 
2209 
LimitVersions(RegExpCompiler * compiler,Trace * trace)2210 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2211                                                   Trace* trace) {
2212   // If we are generating a greedy loop then don't stop and don't reuse code.
2213   if (trace->stop_node() != NULL) {
2214     return CONTINUE;
2215   }
2216 
2217   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2218   if (trace->is_trivial()) {
2219     if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
2220       // If a generic version is already scheduled to be generated or we have
2221       // recursed too deeply then just generate a jump to that code.
2222       macro_assembler->GoTo(&label_);
2223       // This will queue it up for generation of a generic version if it hasn't
2224       // already been queued.
2225       compiler->AddWork(this);
2226       return DONE;
2227     }
2228     // Generate generic version of the node and bind the label for later use.
2229     macro_assembler->Bind(&label_);
2230     return CONTINUE;
2231   }
2232 
2233   // We are being asked to make a non-generic version.  Keep track of how many
2234   // non-generic versions we generate so as not to overdo it.
2235   trace_count_++;
2236   if (KeepRecursing(compiler) && compiler->optimize() &&
2237       trace_count_ < kMaxCopiesCodeGenerated) {
2238     return CONTINUE;
2239   }
2240 
2241   // If we get here code has been generated for this node too many times or
2242   // recursion is too deep.  Time to switch to a generic version.  The code for
2243   // generic versions above can handle deep recursion properly.
2244   bool was_limiting = compiler->limiting_recursion();
2245   compiler->set_limiting_recursion(true);
2246   trace->Flush(compiler, this);
2247   compiler->set_limiting_recursion(was_limiting);
2248   return DONE;
2249 }
2250 
2251 
KeepRecursing(RegExpCompiler * compiler)2252 bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
2253   return !compiler->limiting_recursion() &&
2254          compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
2255 }
2256 
2257 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2258 int ActionNode::EatsAtLeast(int still_to_find,
2259                             int budget,
2260                             bool not_at_start) {
2261   if (budget <= 0) return 0;
2262   if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0;  // Rewinds input!
2263   return on_success()->EatsAtLeast(still_to_find,
2264                                    budget - 1,
2265                                    not_at_start);
2266 }
2267 
2268 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2269 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2270                               BoyerMooreLookahead* bm, bool not_at_start) {
2271   if (action_type_ == BEGIN_SUBMATCH) {
2272     bm->SetRest(offset);
2273   } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2274     on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2275   }
2276   SaveBMInfo(bm, not_at_start, offset);
2277 }
2278 
2279 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2280 int AssertionNode::EatsAtLeast(int still_to_find,
2281                                int budget,
2282                                bool not_at_start) {
2283   if (budget <= 0) return 0;
2284   // If we know we are not at the start and we are asked "how many characters
2285   // will you match if you succeed?" then we can answer anything since false
2286   // implies false.  So lets just return the max answer (still_to_find) since
2287   // that won't prevent us from preloading a lot of characters for the other
2288   // branches in the node graph.
2289   if (assertion_type() == AT_START && not_at_start) return still_to_find;
2290   return on_success()->EatsAtLeast(still_to_find,
2291                                    budget - 1,
2292                                    not_at_start);
2293 }
2294 
2295 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2296 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2297                                  BoyerMooreLookahead* bm, bool not_at_start) {
2298   // Match the behaviour of EatsAtLeast on this node.
2299   if (assertion_type() == AT_START && not_at_start) return;
2300   on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2301   SaveBMInfo(bm, not_at_start, offset);
2302 }
2303 
2304 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2305 int BackReferenceNode::EatsAtLeast(int still_to_find,
2306                                    int budget,
2307                                    bool not_at_start) {
2308   if (read_backward()) return 0;
2309   if (budget <= 0) return 0;
2310   return on_success()->EatsAtLeast(still_to_find,
2311                                    budget - 1,
2312                                    not_at_start);
2313 }
2314 
2315 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2316 int TextNode::EatsAtLeast(int still_to_find,
2317                           int budget,
2318                           bool not_at_start) {
2319   if (read_backward()) return 0;
2320   int answer = Length();
2321   if (answer >= still_to_find) return answer;
2322   if (budget <= 0) return answer;
2323   // We are not at start after this node so we set the last argument to 'true'.
2324   return answer + on_success()->EatsAtLeast(still_to_find - answer,
2325                                             budget - 1,
2326                                             true);
2327 }
2328 
2329 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2330 int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
2331                                               bool not_at_start) {
2332   if (budget <= 0) return 0;
2333   // Alternative 0 is the negative lookahead, alternative 1 is what comes
2334   // afterwards.
2335   RegExpNode* node = alternatives_->at(1).node();
2336   return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2337 }
2338 
2339 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)2340 void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
2341     QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
2342     bool not_at_start) {
2343   // Alternative 0 is the negative lookahead, alternative 1 is what comes
2344   // afterwards.
2345   RegExpNode* node = alternatives_->at(1).node();
2346   return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2347 }
2348 
2349 
EatsAtLeastHelper(int still_to_find,int budget,RegExpNode * ignore_this_node,bool not_at_start)2350 int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2351                                   int budget,
2352                                   RegExpNode* ignore_this_node,
2353                                   bool not_at_start) {
2354   if (budget <= 0) return 0;
2355   int min = 100;
2356   int choice_count = alternatives_->length();
2357   budget = (budget - 1) / choice_count;
2358   for (int i = 0; i < choice_count; i++) {
2359     RegExpNode* node = alternatives_->at(i).node();
2360     if (node == ignore_this_node) continue;
2361     int node_eats_at_least =
2362         node->EatsAtLeast(still_to_find, budget, not_at_start);
2363     if (node_eats_at_least < min) min = node_eats_at_least;
2364     if (min == 0) return 0;
2365   }
2366   return min;
2367 }
2368 
2369 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2370 int LoopChoiceNode::EatsAtLeast(int still_to_find,
2371                                 int budget,
2372                                 bool not_at_start) {
2373   return EatsAtLeastHelper(still_to_find,
2374                            budget - 1,
2375                            loop_node_,
2376                            not_at_start);
2377 }
2378 
2379 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2380 int ChoiceNode::EatsAtLeast(int still_to_find,
2381                             int budget,
2382                             bool not_at_start) {
2383   return EatsAtLeastHelper(still_to_find,
2384                            budget,
2385                            NULL,
2386                            not_at_start);
2387 }
2388 
2389 
2390 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
SmearBitsRight(uint32_t v)2391 static inline uint32_t SmearBitsRight(uint32_t v) {
2392   v |= v >> 1;
2393   v |= v >> 2;
2394   v |= v >> 4;
2395   v |= v >> 8;
2396   v |= v >> 16;
2397   return v;
2398 }
2399 
2400 
Rationalize(bool asc)2401 bool QuickCheckDetails::Rationalize(bool asc) {
2402   bool found_useful_op = false;
2403   uint32_t char_mask;
2404   if (asc) {
2405     char_mask = String::kMaxOneByteCharCode;
2406   } else {
2407     char_mask = String::kMaxUtf16CodeUnit;
2408   }
2409   mask_ = 0;
2410   value_ = 0;
2411   int char_shift = 0;
2412   for (int i = 0; i < characters_; i++) {
2413     Position* pos = &positions_[i];
2414     if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2415       found_useful_op = true;
2416     }
2417     mask_ |= (pos->mask & char_mask) << char_shift;
2418     value_ |= (pos->value & char_mask) << char_shift;
2419     char_shift += asc ? 8 : 16;
2420   }
2421   return found_useful_op;
2422 }
2423 
2424 
EmitQuickCheck(RegExpCompiler * compiler,Trace * bounds_check_trace,Trace * trace,bool preload_has_checked_bounds,Label * on_possible_success,QuickCheckDetails * details,bool fall_through_on_failure)2425 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
2426                                 Trace* bounds_check_trace,
2427                                 Trace* trace,
2428                                 bool preload_has_checked_bounds,
2429                                 Label* on_possible_success,
2430                                 QuickCheckDetails* details,
2431                                 bool fall_through_on_failure) {
2432   if (details->characters() == 0) return false;
2433   GetQuickCheckDetails(
2434       details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2435   if (details->cannot_match()) return false;
2436   if (!details->Rationalize(compiler->one_byte())) return false;
2437   DCHECK(details->characters() == 1 ||
2438          compiler->macro_assembler()->CanReadUnaligned());
2439   uint32_t mask = details->mask();
2440   uint32_t value = details->value();
2441 
2442   RegExpMacroAssembler* assembler = compiler->macro_assembler();
2443 
2444   if (trace->characters_preloaded() != details->characters()) {
2445     DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
2446     // We are attempting to preload the minimum number of characters
2447     // any choice would eat, so if the bounds check fails, then none of the
2448     // choices can succeed, so we can just immediately backtrack, rather
2449     // than go to the next choice.
2450     assembler->LoadCurrentCharacter(trace->cp_offset(),
2451                                     bounds_check_trace->backtrack(),
2452                                     !preload_has_checked_bounds,
2453                                     details->characters());
2454   }
2455 
2456 
2457   bool need_mask = true;
2458 
2459   if (details->characters() == 1) {
2460     // If number of characters preloaded is 1 then we used a byte or 16 bit
2461     // load so the value is already masked down.
2462     uint32_t char_mask;
2463     if (compiler->one_byte()) {
2464       char_mask = String::kMaxOneByteCharCode;
2465     } else {
2466       char_mask = String::kMaxUtf16CodeUnit;
2467     }
2468     if ((mask & char_mask) == char_mask) need_mask = false;
2469     mask &= char_mask;
2470   } else {
2471     // For 2-character preloads in one-byte mode or 1-character preloads in
2472     // two-byte mode we also use a 16 bit load with zero extend.
2473     if (details->characters() == 2 && compiler->one_byte()) {
2474       if ((mask & 0xffff) == 0xffff) need_mask = false;
2475     } else if (details->characters() == 1 && !compiler->one_byte()) {
2476       if ((mask & 0xffff) == 0xffff) need_mask = false;
2477     } else {
2478       if (mask == 0xffffffff) need_mask = false;
2479     }
2480   }
2481 
2482   if (fall_through_on_failure) {
2483     if (need_mask) {
2484       assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2485     } else {
2486       assembler->CheckCharacter(value, on_possible_success);
2487     }
2488   } else {
2489     if (need_mask) {
2490       assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2491     } else {
2492       assembler->CheckNotCharacter(value, trace->backtrack());
2493     }
2494   }
2495   return true;
2496 }
2497 
2498 
2499 // Here is the meat of GetQuickCheckDetails (see also the comment on the
2500 // super-class in the .h file).
2501 //
2502 // We iterate along the text object, building up for each character a
2503 // mask and value that can be used to test for a quick failure to match.
2504 // The masks and values for the positions will be combined into a single
2505 // machine word for the current character width in order to be used in
2506 // generating a quick check.
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2507 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2508                                     RegExpCompiler* compiler,
2509                                     int characters_filled_in,
2510                                     bool not_at_start) {
2511   // Do not collect any quick check details if the text node reads backward,
2512   // since it reads in the opposite direction than we use for quick checks.
2513   if (read_backward()) return;
2514   Isolate* isolate = compiler->macro_assembler()->isolate();
2515   DCHECK(characters_filled_in < details->characters());
2516   int characters = details->characters();
2517   int char_mask;
2518   if (compiler->one_byte()) {
2519     char_mask = String::kMaxOneByteCharCode;
2520   } else {
2521     char_mask = String::kMaxUtf16CodeUnit;
2522   }
2523   for (int k = 0; k < elements()->length(); k++) {
2524     TextElement elm = elements()->at(k);
2525     if (elm.text_type() == TextElement::ATOM) {
2526       Vector<const uc16> quarks = elm.atom()->data();
2527       for (int i = 0; i < characters && i < quarks.length(); i++) {
2528         QuickCheckDetails::Position* pos =
2529             details->positions(characters_filled_in);
2530         uc16 c = quarks[i];
2531         if (compiler->ignore_case()) {
2532           unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2533           int length = GetCaseIndependentLetters(isolate, c,
2534                                                  compiler->one_byte(), chars);
2535           if (length == 0) {
2536             // This can happen because all case variants are non-Latin1, but we
2537             // know the input is Latin1.
2538             details->set_cannot_match();
2539             pos->determines_perfectly = false;
2540             return;
2541           }
2542           if (length == 1) {
2543             // This letter has no case equivalents, so it's nice and simple
2544             // and the mask-compare will determine definitely whether we have
2545             // a match at this character position.
2546             pos->mask = char_mask;
2547             pos->value = c;
2548             pos->determines_perfectly = true;
2549           } else {
2550             uint32_t common_bits = char_mask;
2551             uint32_t bits = chars[0];
2552             for (int j = 1; j < length; j++) {
2553               uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2554               common_bits ^= differing_bits;
2555               bits &= common_bits;
2556             }
2557             // If length is 2 and common bits has only one zero in it then
2558             // our mask and compare instruction will determine definitely
2559             // whether we have a match at this character position.  Otherwise
2560             // it can only be an approximate check.
2561             uint32_t one_zero = (common_bits | ~char_mask);
2562             if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2563               pos->determines_perfectly = true;
2564             }
2565             pos->mask = common_bits;
2566             pos->value = bits;
2567           }
2568         } else {
2569           // Don't ignore case.  Nice simple case where the mask-compare will
2570           // determine definitely whether we have a match at this character
2571           // position.
2572           if (c > char_mask) {
2573             details->set_cannot_match();
2574             pos->determines_perfectly = false;
2575             return;
2576           }
2577           pos->mask = char_mask;
2578           pos->value = c;
2579           pos->determines_perfectly = true;
2580         }
2581         characters_filled_in++;
2582         DCHECK(characters_filled_in <= details->characters());
2583         if (characters_filled_in == details->characters()) {
2584           return;
2585         }
2586       }
2587     } else {
2588       QuickCheckDetails::Position* pos =
2589           details->positions(characters_filled_in);
2590       RegExpCharacterClass* tree = elm.char_class();
2591       ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2592       if (tree->is_negated()) {
2593         // A quick check uses multi-character mask and compare.  There is no
2594         // useful way to incorporate a negative char class into this scheme
2595         // so we just conservatively create a mask and value that will always
2596         // succeed.
2597         pos->mask = 0;
2598         pos->value = 0;
2599       } else {
2600         int first_range = 0;
2601         while (ranges->at(first_range).from() > char_mask) {
2602           first_range++;
2603           if (first_range == ranges->length()) {
2604             details->set_cannot_match();
2605             pos->determines_perfectly = false;
2606             return;
2607           }
2608         }
2609         CharacterRange range = ranges->at(first_range);
2610         uc16 from = range.from();
2611         uc16 to = range.to();
2612         if (to > char_mask) {
2613           to = char_mask;
2614         }
2615         uint32_t differing_bits = (from ^ to);
2616         // A mask and compare is only perfect if the differing bits form a
2617         // number like 00011111 with one single block of trailing 1s.
2618         if ((differing_bits & (differing_bits + 1)) == 0 &&
2619              from + differing_bits == to) {
2620           pos->determines_perfectly = true;
2621         }
2622         uint32_t common_bits = ~SmearBitsRight(differing_bits);
2623         uint32_t bits = (from & common_bits);
2624         for (int i = first_range + 1; i < ranges->length(); i++) {
2625           CharacterRange range = ranges->at(i);
2626           uc16 from = range.from();
2627           uc16 to = range.to();
2628           if (from > char_mask) continue;
2629           if (to > char_mask) to = char_mask;
2630           // Here we are combining more ranges into the mask and compare
2631           // value.  With each new range the mask becomes more sparse and
2632           // so the chances of a false positive rise.  A character class
2633           // with multiple ranges is assumed never to be equivalent to a
2634           // mask and compare operation.
2635           pos->determines_perfectly = false;
2636           uint32_t new_common_bits = (from ^ to);
2637           new_common_bits = ~SmearBitsRight(new_common_bits);
2638           common_bits &= new_common_bits;
2639           bits &= new_common_bits;
2640           uint32_t differing_bits = (from & common_bits) ^ bits;
2641           common_bits ^= differing_bits;
2642           bits &= common_bits;
2643         }
2644         pos->mask = common_bits;
2645         pos->value = bits;
2646       }
2647       characters_filled_in++;
2648       DCHECK(characters_filled_in <= details->characters());
2649       if (characters_filled_in == details->characters()) {
2650         return;
2651       }
2652     }
2653   }
2654   DCHECK(characters_filled_in != details->characters());
2655   if (!details->cannot_match()) {
2656     on_success()-> GetQuickCheckDetails(details,
2657                                         compiler,
2658                                         characters_filled_in,
2659                                         true);
2660   }
2661 }
2662 
2663 
Clear()2664 void QuickCheckDetails::Clear() {
2665   for (int i = 0; i < characters_; i++) {
2666     positions_[i].mask = 0;
2667     positions_[i].value = 0;
2668     positions_[i].determines_perfectly = false;
2669   }
2670   characters_ = 0;
2671 }
2672 
2673 
Advance(int by,bool one_byte)2674 void QuickCheckDetails::Advance(int by, bool one_byte) {
2675   if (by >= characters_ || by < 0) {
2676     DCHECK_IMPLIES(by < 0, characters_ == 0);
2677     Clear();
2678     return;
2679   }
2680   DCHECK_LE(characters_ - by, 4);
2681   DCHECK_LE(characters_, 4);
2682   for (int i = 0; i < characters_ - by; i++) {
2683     positions_[i] = positions_[by + i];
2684   }
2685   for (int i = characters_ - by; i < characters_; i++) {
2686     positions_[i].mask = 0;
2687     positions_[i].value = 0;
2688     positions_[i].determines_perfectly = false;
2689   }
2690   characters_ -= by;
2691   // We could change mask_ and value_ here but we would never advance unless
2692   // they had already been used in a check and they won't be used again because
2693   // it would gain us nothing.  So there's no point.
2694 }
2695 
2696 
Merge(QuickCheckDetails * other,int from_index)2697 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2698   DCHECK(characters_ == other->characters_);
2699   if (other->cannot_match_) {
2700     return;
2701   }
2702   if (cannot_match_) {
2703     *this = *other;
2704     return;
2705   }
2706   for (int i = from_index; i < characters_; i++) {
2707     QuickCheckDetails::Position* pos = positions(i);
2708     QuickCheckDetails::Position* other_pos = other->positions(i);
2709     if (pos->mask != other_pos->mask ||
2710         pos->value != other_pos->value ||
2711         !other_pos->determines_perfectly) {
2712       // Our mask-compare operation will be approximate unless we have the
2713       // exact same operation on both sides of the alternation.
2714       pos->determines_perfectly = false;
2715     }
2716     pos->mask &= other_pos->mask;
2717     pos->value &= pos->mask;
2718     other_pos->value &= pos->mask;
2719     uc16 differing_bits = (pos->value ^ other_pos->value);
2720     pos->mask &= ~differing_bits;
2721     pos->value &= pos->mask;
2722   }
2723 }
2724 
2725 
2726 class VisitMarker {
2727  public:
VisitMarker(NodeInfo * info)2728   explicit VisitMarker(NodeInfo* info) : info_(info) {
2729     DCHECK(!info->visited);
2730     info->visited = true;
2731   }
~VisitMarker()2732   ~VisitMarker() {
2733     info_->visited = false;
2734   }
2735  private:
2736   NodeInfo* info_;
2737 };
2738 
2739 
FilterOneByte(int depth,bool ignore_case)2740 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
2741   if (info()->replacement_calculated) return replacement();
2742   if (depth < 0) return this;
2743   DCHECK(!info()->visited);
2744   VisitMarker marker(info());
2745   return FilterSuccessor(depth - 1, ignore_case);
2746 }
2747 
2748 
FilterSuccessor(int depth,bool ignore_case)2749 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
2750   RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
2751   if (next == NULL) return set_replacement(NULL);
2752   on_success_ = next;
2753   return set_replacement(this);
2754 }
2755 
2756 
2757 // We need to check for the following characters: 0x39c 0x3bc 0x178.
RangeContainsLatin1Equivalents(CharacterRange range)2758 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2759   // TODO(dcarney): this could be a lot more efficient.
2760   return range.Contains(0x39c) ||
2761       range.Contains(0x3bc) || range.Contains(0x178);
2762 }
2763 
2764 
RangesContainLatin1Equivalents(ZoneList<CharacterRange> * ranges)2765 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2766   for (int i = 0; i < ranges->length(); i++) {
2767     // TODO(dcarney): this could be a lot more efficient.
2768     if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2769   }
2770   return false;
2771 }
2772 
2773 
FilterOneByte(int depth,bool ignore_case)2774 RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
2775   if (info()->replacement_calculated) return replacement();
2776   if (depth < 0) return this;
2777   DCHECK(!info()->visited);
2778   VisitMarker marker(info());
2779   int element_count = elements()->length();
2780   for (int i = 0; i < element_count; i++) {
2781     TextElement elm = elements()->at(i);
2782     if (elm.text_type() == TextElement::ATOM) {
2783       Vector<const uc16> quarks = elm.atom()->data();
2784       for (int j = 0; j < quarks.length(); j++) {
2785         uint16_t c = quarks[j];
2786         if (c <= String::kMaxOneByteCharCode) continue;
2787         if (!ignore_case) return set_replacement(NULL);
2788         // Here, we need to check for characters whose upper and lower cases
2789         // are outside the Latin-1 range.
2790         uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
2791         // Character is outside Latin-1 completely
2792         if (converted == 0) return set_replacement(NULL);
2793         // Convert quark to Latin-1 in place.
2794         uint16_t* copy = const_cast<uint16_t*>(quarks.start());
2795         copy[j] = converted;
2796       }
2797     } else {
2798       DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2799       RegExpCharacterClass* cc = elm.char_class();
2800       ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2801       if (!CharacterRange::IsCanonical(ranges)) {
2802         CharacterRange::Canonicalize(ranges);
2803       }
2804       // Now they are in order so we only need to look at the first.
2805       int range_count = ranges->length();
2806       if (cc->is_negated()) {
2807         if (range_count != 0 &&
2808             ranges->at(0).from() == 0 &&
2809             ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2810           // This will be handled in a later filter.
2811           if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2812           return set_replacement(NULL);
2813         }
2814       } else {
2815         if (range_count == 0 ||
2816             ranges->at(0).from() > String::kMaxOneByteCharCode) {
2817           // This will be handled in a later filter.
2818           if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2819           return set_replacement(NULL);
2820         }
2821       }
2822     }
2823   }
2824   return FilterSuccessor(depth - 1, ignore_case);
2825 }
2826 
2827 
FilterOneByte(int depth,bool ignore_case)2828 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2829   if (info()->replacement_calculated) return replacement();
2830   if (depth < 0) return this;
2831   if (info()->visited) return this;
2832   {
2833     VisitMarker marker(info());
2834 
2835     RegExpNode* continue_replacement =
2836         continue_node_->FilterOneByte(depth - 1, ignore_case);
2837     // If we can't continue after the loop then there is no sense in doing the
2838     // loop.
2839     if (continue_replacement == NULL) return set_replacement(NULL);
2840   }
2841 
2842   return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
2843 }
2844 
2845 
FilterOneByte(int depth,bool ignore_case)2846 RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2847   if (info()->replacement_calculated) return replacement();
2848   if (depth < 0) return this;
2849   if (info()->visited) return this;
2850   VisitMarker marker(info());
2851   int choice_count = alternatives_->length();
2852 
2853   for (int i = 0; i < choice_count; i++) {
2854     GuardedAlternative alternative = alternatives_->at(i);
2855     if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2856       set_replacement(this);
2857       return this;
2858     }
2859   }
2860 
2861   int surviving = 0;
2862   RegExpNode* survivor = NULL;
2863   for (int i = 0; i < choice_count; i++) {
2864     GuardedAlternative alternative = alternatives_->at(i);
2865     RegExpNode* replacement =
2866         alternative.node()->FilterOneByte(depth - 1, ignore_case);
2867     DCHECK(replacement != this);  // No missing EMPTY_MATCH_CHECK.
2868     if (replacement != NULL) {
2869       alternatives_->at(i).set_node(replacement);
2870       surviving++;
2871       survivor = replacement;
2872     }
2873   }
2874   if (surviving < 2) return set_replacement(survivor);
2875 
2876   set_replacement(this);
2877   if (surviving == choice_count) {
2878     return this;
2879   }
2880   // Only some of the nodes survived the filtering.  We need to rebuild the
2881   // alternatives list.
2882   ZoneList<GuardedAlternative>* new_alternatives =
2883       new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2884   for (int i = 0; i < choice_count; i++) {
2885     RegExpNode* replacement =
2886         alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
2887     if (replacement != NULL) {
2888       alternatives_->at(i).set_node(replacement);
2889       new_alternatives->Add(alternatives_->at(i), zone());
2890     }
2891   }
2892   alternatives_ = new_alternatives;
2893   return this;
2894 }
2895 
2896 
FilterOneByte(int depth,bool ignore_case)2897 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
2898                                                         bool ignore_case) {
2899   if (info()->replacement_calculated) return replacement();
2900   if (depth < 0) return this;
2901   if (info()->visited) return this;
2902   VisitMarker marker(info());
2903   // Alternative 0 is the negative lookahead, alternative 1 is what comes
2904   // afterwards.
2905   RegExpNode* node = alternatives_->at(1).node();
2906   RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
2907   if (replacement == NULL) return set_replacement(NULL);
2908   alternatives_->at(1).set_node(replacement);
2909 
2910   RegExpNode* neg_node = alternatives_->at(0).node();
2911   RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
2912   // If the negative lookahead is always going to fail then
2913   // we don't need to check it.
2914   if (neg_replacement == NULL) return set_replacement(replacement);
2915   alternatives_->at(0).set_node(neg_replacement);
2916   return set_replacement(this);
2917 }
2918 
2919 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2920 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2921                                           RegExpCompiler* compiler,
2922                                           int characters_filled_in,
2923                                           bool not_at_start) {
2924   if (body_can_be_zero_length_ || info()->visited) return;
2925   VisitMarker marker(info());
2926   return ChoiceNode::GetQuickCheckDetails(details,
2927                                           compiler,
2928                                           characters_filled_in,
2929                                           not_at_start);
2930 }
2931 
2932 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2933 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2934                                   BoyerMooreLookahead* bm, bool not_at_start) {
2935   if (body_can_be_zero_length_ || budget <= 0) {
2936     bm->SetRest(offset);
2937     SaveBMInfo(bm, not_at_start, offset);
2938     return;
2939   }
2940   ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2941   SaveBMInfo(bm, not_at_start, offset);
2942 }
2943 
2944 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2945 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2946                                       RegExpCompiler* compiler,
2947                                       int characters_filled_in,
2948                                       bool not_at_start) {
2949   not_at_start = (not_at_start || not_at_start_);
2950   int choice_count = alternatives_->length();
2951   DCHECK(choice_count > 0);
2952   alternatives_->at(0).node()->GetQuickCheckDetails(details,
2953                                                     compiler,
2954                                                     characters_filled_in,
2955                                                     not_at_start);
2956   for (int i = 1; i < choice_count; i++) {
2957     QuickCheckDetails new_details(details->characters());
2958     RegExpNode* node = alternatives_->at(i).node();
2959     node->GetQuickCheckDetails(&new_details, compiler,
2960                                characters_filled_in,
2961                                not_at_start);
2962     // Here we merge the quick match details of the two branches.
2963     details->Merge(&new_details, characters_filled_in);
2964   }
2965 }
2966 
2967 
2968 // Check for [0-9A-Z_a-z].
EmitWordCheck(RegExpMacroAssembler * assembler,Label * word,Label * non_word,bool fall_through_on_word)2969 static void EmitWordCheck(RegExpMacroAssembler* assembler,
2970                           Label* word,
2971                           Label* non_word,
2972                           bool fall_through_on_word) {
2973   if (assembler->CheckSpecialCharacterClass(
2974           fall_through_on_word ? 'w' : 'W',
2975           fall_through_on_word ? non_word : word)) {
2976     // Optimized implementation available.
2977     return;
2978   }
2979   assembler->CheckCharacterGT('z', non_word);
2980   assembler->CheckCharacterLT('0', non_word);
2981   assembler->CheckCharacterGT('a' - 1, word);
2982   assembler->CheckCharacterLT('9' + 1, word);
2983   assembler->CheckCharacterLT('A', non_word);
2984   assembler->CheckCharacterLT('Z' + 1, word);
2985   if (fall_through_on_word) {
2986     assembler->CheckNotCharacter('_', non_word);
2987   } else {
2988     assembler->CheckCharacter('_', word);
2989   }
2990 }
2991 
2992 
2993 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
2994 // that matches newline or the start of input).
EmitHat(RegExpCompiler * compiler,RegExpNode * on_success,Trace * trace)2995 static void EmitHat(RegExpCompiler* compiler,
2996                     RegExpNode* on_success,
2997                     Trace* trace) {
2998   RegExpMacroAssembler* assembler = compiler->macro_assembler();
2999   // We will be loading the previous character into the current character
3000   // register.
3001   Trace new_trace(*trace);
3002   new_trace.InvalidateCurrentCharacter();
3003 
3004   Label ok;
3005   if (new_trace.cp_offset() == 0) {
3006     // The start of input counts as a newline in this context, so skip to
3007     // ok if we are at the start.
3008     assembler->CheckAtStart(&ok);
3009   }
3010   // We already checked that we are not at the start of input so it must be
3011   // OK to load the previous character.
3012   assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
3013                                   new_trace.backtrack(),
3014                                   false);
3015   if (!assembler->CheckSpecialCharacterClass('n',
3016                                              new_trace.backtrack())) {
3017     // Newline means \n, \r, 0x2028 or 0x2029.
3018     if (!compiler->one_byte()) {
3019       assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
3020     }
3021     assembler->CheckCharacter('\n', &ok);
3022     assembler->CheckNotCharacter('\r', new_trace.backtrack());
3023   }
3024   assembler->Bind(&ok);
3025   on_success->Emit(compiler, &new_trace);
3026 }
3027 
3028 
3029 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
EmitBoundaryCheck(RegExpCompiler * compiler,Trace * trace)3030 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
3031   RegExpMacroAssembler* assembler = compiler->macro_assembler();
3032   Isolate* isolate = assembler->isolate();
3033   Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3034   bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
3035   BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3036   if (lookahead == NULL) {
3037     int eats_at_least =
3038         Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3039                                                     kRecursionBudget,
3040                                                     not_at_start));
3041     if (eats_at_least >= 1) {
3042       BoyerMooreLookahead* bm =
3043           new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
3044       FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
3045       if (bm->at(0)->is_non_word())
3046         next_is_word_character = Trace::FALSE_VALUE;
3047       if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3048     }
3049   } else {
3050     if (lookahead->at(0)->is_non_word())
3051       next_is_word_character = Trace::FALSE_VALUE;
3052     if (lookahead->at(0)->is_word())
3053       next_is_word_character = Trace::TRUE_VALUE;
3054   }
3055   bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3056   if (next_is_word_character == Trace::UNKNOWN) {
3057     Label before_non_word;
3058     Label before_word;
3059     if (trace->characters_preloaded() != 1) {
3060       assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3061     }
3062     // Fall through on non-word.
3063     EmitWordCheck(assembler, &before_word, &before_non_word, false);
3064     // Next character is not a word character.
3065     assembler->Bind(&before_non_word);
3066     Label ok;
3067     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3068     assembler->GoTo(&ok);
3069 
3070     assembler->Bind(&before_word);
3071     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3072     assembler->Bind(&ok);
3073   } else if (next_is_word_character == Trace::TRUE_VALUE) {
3074     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3075   } else {
3076     DCHECK(next_is_word_character == Trace::FALSE_VALUE);
3077     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3078   }
3079 }
3080 
3081 
BacktrackIfPrevious(RegExpCompiler * compiler,Trace * trace,AssertionNode::IfPrevious backtrack_if_previous)3082 void AssertionNode::BacktrackIfPrevious(
3083     RegExpCompiler* compiler,
3084     Trace* trace,
3085     AssertionNode::IfPrevious backtrack_if_previous) {
3086   RegExpMacroAssembler* assembler = compiler->macro_assembler();
3087   Trace new_trace(*trace);
3088   new_trace.InvalidateCurrentCharacter();
3089 
3090   Label fall_through, dummy;
3091 
3092   Label* non_word = backtrack_if_previous == kIsNonWord ?
3093                     new_trace.backtrack() :
3094                     &fall_through;
3095   Label* word = backtrack_if_previous == kIsNonWord ?
3096                 &fall_through :
3097                 new_trace.backtrack();
3098 
3099   if (new_trace.cp_offset() == 0) {
3100     // The start of input counts as a non-word character, so the question is
3101     // decided if we are at the start.
3102     assembler->CheckAtStart(non_word);
3103   }
3104   // We already checked that we are not at the start of input so it must be
3105   // OK to load the previous character.
3106   assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3107   EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3108 
3109   assembler->Bind(&fall_through);
3110   on_success()->Emit(compiler, &new_trace);
3111 }
3112 
3113 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)3114 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3115                                          RegExpCompiler* compiler,
3116                                          int filled_in,
3117                                          bool not_at_start) {
3118   if (assertion_type_ == AT_START && not_at_start) {
3119     details->set_cannot_match();
3120     return;
3121   }
3122   return on_success()->GetQuickCheckDetails(details,
3123                                             compiler,
3124                                             filled_in,
3125                                             not_at_start);
3126 }
3127 
3128 
Emit(RegExpCompiler * compiler,Trace * trace)3129 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3130   RegExpMacroAssembler* assembler = compiler->macro_assembler();
3131   switch (assertion_type_) {
3132     case AT_END: {
3133       Label ok;
3134       assembler->CheckPosition(trace->cp_offset(), &ok);
3135       assembler->GoTo(trace->backtrack());
3136       assembler->Bind(&ok);
3137       break;
3138     }
3139     case AT_START: {
3140       if (trace->at_start() == Trace::FALSE_VALUE) {
3141         assembler->GoTo(trace->backtrack());
3142         return;
3143       }
3144       if (trace->at_start() == Trace::UNKNOWN) {
3145         assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
3146         Trace at_start_trace = *trace;
3147         at_start_trace.set_at_start(Trace::TRUE_VALUE);
3148         on_success()->Emit(compiler, &at_start_trace);
3149         return;
3150       }
3151     }
3152     break;
3153     case AFTER_NEWLINE:
3154       EmitHat(compiler, on_success(), trace);
3155       return;
3156     case AT_BOUNDARY:
3157     case AT_NON_BOUNDARY: {
3158       EmitBoundaryCheck(compiler, trace);
3159       return;
3160     }
3161   }
3162   on_success()->Emit(compiler, trace);
3163 }
3164 
3165 
DeterminedAlready(QuickCheckDetails * quick_check,int offset)3166 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3167   if (quick_check == NULL) return false;
3168   if (offset >= quick_check->characters()) return false;
3169   return quick_check->positions(offset)->determines_perfectly;
3170 }
3171 
3172 
UpdateBoundsCheck(int index,int * checked_up_to)3173 static void UpdateBoundsCheck(int index, int* checked_up_to) {
3174   if (index > *checked_up_to) {
3175     *checked_up_to = index;
3176   }
3177 }
3178 
3179 
3180 // We call this repeatedly to generate code for each pass over the text node.
3181 // The passes are in increasing order of difficulty because we hope one
3182 // of the first passes will fail in which case we are saved the work of the
3183 // later passes.  for example for the case independent regexp /%[asdfghjkl]a/
3184 // we will check the '%' in the first pass, the case independent 'a' in the
3185 // second pass and the character class in the last pass.
3186 //
3187 // The passes are done from right to left, so for example to test for /bar/
3188 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
3189 // and then a 'b' with offset 0.  This means we can avoid the end-of-input
3190 // bounds check most of the time.  In the example we only need to check for
3191 // end-of-input when loading the putative 'r'.
3192 //
3193 // A slight complication involves the fact that the first character may already
3194 // be fetched into a register by the previous node.  In this case we want to
3195 // do the test for that character first.  We do this in separate passes.  The
3196 // 'preloaded' argument indicates that we are doing such a 'pass'.  If such a
3197 // pass has been performed then subsequent passes will have true in
3198 // first_element_checked to indicate that that character does not need to be
3199 // checked again.
3200 //
3201 // In addition to all this we are passed a Trace, which can
3202 // contain an AlternativeGeneration object.  In this AlternativeGeneration
3203 // object we can see details of any quick check that was already passed in
3204 // order to get to the code we are now generating.  The quick check can involve
3205 // loading characters, which means we do not need to recheck the bounds
3206 // up to the limit the quick check already checked.  In addition the quick
3207 // check can have involved a mask and compare operation which may simplify
3208 // or obviate the need for further checks at some character positions.
TextEmitPass(RegExpCompiler * compiler,TextEmitPassType pass,bool preloaded,Trace * trace,bool first_element_checked,int * checked_up_to)3209 void TextNode::TextEmitPass(RegExpCompiler* compiler,
3210                             TextEmitPassType pass,
3211                             bool preloaded,
3212                             Trace* trace,
3213                             bool first_element_checked,
3214                             int* checked_up_to) {
3215   RegExpMacroAssembler* assembler = compiler->macro_assembler();
3216   Isolate* isolate = assembler->isolate();
3217   bool one_byte = compiler->one_byte();
3218   Label* backtrack = trace->backtrack();
3219   QuickCheckDetails* quick_check = trace->quick_check_performed();
3220   int element_count = elements()->length();
3221   int backward_offset = read_backward() ? -Length() : 0;
3222   for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3223     TextElement elm = elements()->at(i);
3224     int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
3225     if (elm.text_type() == TextElement::ATOM) {
3226       Vector<const uc16> quarks = elm.atom()->data();
3227       for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3228         if (first_element_checked && i == 0 && j == 0) continue;
3229         if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3230         EmitCharacterFunction* emit_function = NULL;
3231         switch (pass) {
3232           case NON_LATIN1_MATCH:
3233             DCHECK(one_byte);
3234             if (quarks[j] > String::kMaxOneByteCharCode) {
3235               assembler->GoTo(backtrack);
3236               return;
3237             }
3238             break;
3239           case NON_LETTER_CHARACTER_MATCH:
3240             emit_function = &EmitAtomNonLetter;
3241             break;
3242           case SIMPLE_CHARACTER_MATCH:
3243             emit_function = &EmitSimpleCharacter;
3244             break;
3245           case CASE_CHARACTER_MATCH:
3246             emit_function = &EmitAtomLetter;
3247             break;
3248           default:
3249             break;
3250         }
3251         if (emit_function != NULL) {
3252           bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
3253           bool bound_checked =
3254               emit_function(isolate, compiler, quarks[j], backtrack,
3255                             cp_offset + j, bounds_check, preloaded);
3256           if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3257         }
3258       }
3259     } else {
3260       DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
3261       if (pass == CHARACTER_CLASS_MATCH) {
3262         if (first_element_checked && i == 0) continue;
3263         if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3264         RegExpCharacterClass* cc = elm.char_class();
3265         bool bounds_check = *checked_up_to < cp_offset || read_backward();
3266         EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
3267                       bounds_check, preloaded, zone());
3268         UpdateBoundsCheck(cp_offset, checked_up_to);
3269       }
3270     }
3271   }
3272 }
3273 
3274 
Length()3275 int TextNode::Length() {
3276   TextElement elm = elements()->last();
3277   DCHECK(elm.cp_offset() >= 0);
3278   return elm.cp_offset() + elm.length();
3279 }
3280 
3281 
SkipPass(int int_pass,bool ignore_case)3282 bool TextNode::SkipPass(int int_pass, bool ignore_case) {
3283   TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3284   if (ignore_case) {
3285     return pass == SIMPLE_CHARACTER_MATCH;
3286   } else {
3287     return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3288   }
3289 }
3290 
3291 
3292 // This generates the code to match a text node.  A text node can contain
3293 // straight character sequences (possibly to be matched in a case-independent
3294 // way) and character classes.  For efficiency we do not do this in a single
3295 // pass from left to right.  Instead we pass over the text node several times,
3296 // emitting code for some character positions every time.  See the comment on
3297 // TextEmitPass for details.
Emit(RegExpCompiler * compiler,Trace * trace)3298 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3299   LimitResult limit_result = LimitVersions(compiler, trace);
3300   if (limit_result == DONE) return;
3301   DCHECK(limit_result == CONTINUE);
3302 
3303   if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3304     compiler->SetRegExpTooBig();
3305     return;
3306   }
3307 
3308   if (compiler->one_byte()) {
3309     int dummy = 0;
3310     TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3311   }
3312 
3313   bool first_elt_done = false;
3314   int bound_checked_to = trace->cp_offset() - 1;
3315   bound_checked_to += trace->bound_checked_up_to();
3316 
3317   // If a character is preloaded into the current character register then
3318   // check that now.
3319   if (trace->characters_preloaded() == 1) {
3320     for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3321       if (!SkipPass(pass, compiler->ignore_case())) {
3322         TextEmitPass(compiler,
3323                      static_cast<TextEmitPassType>(pass),
3324                      true,
3325                      trace,
3326                      false,
3327                      &bound_checked_to);
3328       }
3329     }
3330     first_elt_done = true;
3331   }
3332 
3333   for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3334     if (!SkipPass(pass, compiler->ignore_case())) {
3335       TextEmitPass(compiler,
3336                    static_cast<TextEmitPassType>(pass),
3337                    false,
3338                    trace,
3339                    first_elt_done,
3340                    &bound_checked_to);
3341     }
3342   }
3343 
3344   Trace successor_trace(*trace);
3345   // If we advance backward, we may end up at the start.
3346   successor_trace.AdvanceCurrentPositionInTrace(
3347       read_backward() ? -Length() : Length(), compiler);
3348   successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
3349                                                : Trace::FALSE_VALUE);
3350   RecursionCheck rc(compiler);
3351   on_success()->Emit(compiler, &successor_trace);
3352 }
3353 
3354 
InvalidateCurrentCharacter()3355 void Trace::InvalidateCurrentCharacter() {
3356   characters_preloaded_ = 0;
3357 }
3358 
3359 
AdvanceCurrentPositionInTrace(int by,RegExpCompiler * compiler)3360 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
3361   // We don't have an instruction for shifting the current character register
3362   // down or for using a shifted value for anything so lets just forget that
3363   // we preloaded any characters into it.
3364   characters_preloaded_ = 0;
3365   // Adjust the offsets of the quick check performed information.  This
3366   // information is used to find out what we already determined about the
3367   // characters by means of mask and compare.
3368   quick_check_performed_.Advance(by, compiler->one_byte());
3369   cp_offset_ += by;
3370   if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3371     compiler->SetRegExpTooBig();
3372     cp_offset_ = 0;
3373   }
3374   bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3375 }
3376 
3377 
MakeCaseIndependent(Isolate * isolate,bool is_one_byte)3378 void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
3379   int element_count = elements()->length();
3380   for (int i = 0; i < element_count; i++) {
3381     TextElement elm = elements()->at(i);
3382     if (elm.text_type() == TextElement::CHAR_CLASS) {
3383       RegExpCharacterClass* cc = elm.char_class();
3384       // None of the standard character classes is different in the case
3385       // independent case and it slows us down if we don't know that.
3386       if (cc->is_standard(zone())) continue;
3387       ZoneList<CharacterRange>* ranges = cc->ranges(zone());
3388       int range_count = ranges->length();
3389       for (int j = 0; j < range_count; j++) {
3390         ranges->at(j).AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
3391       }
3392     }
3393   }
3394 }
3395 
3396 
GreedyLoopTextLength()3397 int TextNode::GreedyLoopTextLength() { return Length(); }
3398 
3399 
GetSuccessorOfOmnivorousTextNode(RegExpCompiler * compiler)3400 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3401     RegExpCompiler* compiler) {
3402   if (read_backward()) return NULL;
3403   if (elements()->length() != 1) return NULL;
3404   TextElement elm = elements()->at(0);
3405   if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
3406   RegExpCharacterClass* node = elm.char_class();
3407   ZoneList<CharacterRange>* ranges = node->ranges(zone());
3408   if (!CharacterRange::IsCanonical(ranges)) {
3409     CharacterRange::Canonicalize(ranges);
3410   }
3411   if (node->is_negated()) {
3412     return ranges->length() == 0 ? on_success() : NULL;
3413   }
3414   if (ranges->length() != 1) return NULL;
3415   uint32_t max_char;
3416   if (compiler->one_byte()) {
3417     max_char = String::kMaxOneByteCharCode;
3418   } else {
3419     max_char = String::kMaxUtf16CodeUnit;
3420   }
3421   return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
3422 }
3423 
3424 
3425 // Finds the fixed match length of a sequence of nodes that goes from
3426 // this alternative and back to this choice node.  If there are variable
3427 // length nodes or other complications in the way then return a sentinel
3428 // value indicating that a greedy loop cannot be constructed.
GreedyLoopTextLengthForAlternative(GuardedAlternative * alternative)3429 int ChoiceNode::GreedyLoopTextLengthForAlternative(
3430     GuardedAlternative* alternative) {
3431   int length = 0;
3432   RegExpNode* node = alternative->node();
3433   // Later we will generate code for all these text nodes using recursion
3434   // so we have to limit the max number.
3435   int recursion_depth = 0;
3436   while (node != this) {
3437     if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3438       return kNodeIsTooComplexForGreedyLoops;
3439     }
3440     int node_length = node->GreedyLoopTextLength();
3441     if (node_length == kNodeIsTooComplexForGreedyLoops) {
3442       return kNodeIsTooComplexForGreedyLoops;
3443     }
3444     length += node_length;
3445     SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3446     node = seq_node->on_success();
3447   }
3448   return read_backward() ? -length : length;
3449 }
3450 
3451 
AddLoopAlternative(GuardedAlternative alt)3452 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3453   DCHECK_NULL(loop_node_);
3454   AddAlternative(alt);
3455   loop_node_ = alt.node();
3456 }
3457 
3458 
AddContinueAlternative(GuardedAlternative alt)3459 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3460   DCHECK_NULL(continue_node_);
3461   AddAlternative(alt);
3462   continue_node_ = alt.node();
3463 }
3464 
3465 
Emit(RegExpCompiler * compiler,Trace * trace)3466 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3467   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3468   if (trace->stop_node() == this) {
3469     // Back edge of greedy optimized loop node graph.
3470     int text_length =
3471         GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3472     DCHECK(text_length != kNodeIsTooComplexForGreedyLoops);
3473     // Update the counter-based backtracking info on the stack.  This is an
3474     // optimization for greedy loops (see below).
3475     DCHECK(trace->cp_offset() == text_length);
3476     macro_assembler->AdvanceCurrentPosition(text_length);
3477     macro_assembler->GoTo(trace->loop_label());
3478     return;
3479   }
3480   DCHECK_NULL(trace->stop_node());
3481   if (!trace->is_trivial()) {
3482     trace->Flush(compiler, this);
3483     return;
3484   }
3485   ChoiceNode::Emit(compiler, trace);
3486 }
3487 
3488 
CalculatePreloadCharacters(RegExpCompiler * compiler,int eats_at_least)3489 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3490                                            int eats_at_least) {
3491   int preload_characters = Min(4, eats_at_least);
3492   if (compiler->macro_assembler()->CanReadUnaligned()) {
3493     bool one_byte = compiler->one_byte();
3494     if (one_byte) {
3495       if (preload_characters > 4) preload_characters = 4;
3496       // We can't preload 3 characters because there is no machine instruction
3497       // to do that.  We can't just load 4 because we could be reading
3498       // beyond the end of the string, which could cause a memory fault.
3499       if (preload_characters == 3) preload_characters = 2;
3500     } else {
3501       if (preload_characters > 2) preload_characters = 2;
3502     }
3503   } else {
3504     if (preload_characters > 1) preload_characters = 1;
3505   }
3506   return preload_characters;
3507 }
3508 
3509 
3510 // This class is used when generating the alternatives in a choice node.  It
3511 // records the way the alternative is being code generated.
3512 class AlternativeGeneration: public Malloced {
3513  public:
AlternativeGeneration()3514   AlternativeGeneration()
3515       : possible_success(),
3516         expects_preload(false),
3517         after(),
3518         quick_check_details() { }
3519   Label possible_success;
3520   bool expects_preload;
3521   Label after;
3522   QuickCheckDetails quick_check_details;
3523 };
3524 
3525 
3526 // Creates a list of AlternativeGenerations.  If the list has a reasonable
3527 // size then it is on the stack, otherwise the excess is on the heap.
3528 class AlternativeGenerationList {
3529  public:
AlternativeGenerationList(int count,Zone * zone)3530   AlternativeGenerationList(int count, Zone* zone)
3531       : alt_gens_(count, zone) {
3532     for (int i = 0; i < count && i < kAFew; i++) {
3533       alt_gens_.Add(a_few_alt_gens_ + i, zone);
3534     }
3535     for (int i = kAFew; i < count; i++) {
3536       alt_gens_.Add(new AlternativeGeneration(), zone);
3537     }
3538   }
~AlternativeGenerationList()3539   ~AlternativeGenerationList() {
3540     for (int i = kAFew; i < alt_gens_.length(); i++) {
3541       delete alt_gens_[i];
3542       alt_gens_[i] = NULL;
3543     }
3544   }
3545 
at(int i)3546   AlternativeGeneration* at(int i) {
3547     return alt_gens_[i];
3548   }
3549 
3550  private:
3551   static const int kAFew = 10;
3552   ZoneList<AlternativeGeneration*> alt_gens_;
3553   AlternativeGeneration a_few_alt_gens_[kAFew];
3554 };
3555 
3556 
3557 // The '2' variant is has inclusive from and exclusive to.
3558 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3559 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
3560 static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
3561     0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,
3562     0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
3563     0xFEFF, 0xFF00, 0x10000 };
3564 static const int kSpaceRangeCount = arraysize(kSpaceRanges);
3565 
3566 static const int kWordRanges[] = {
3567     '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
3568 static const int kWordRangeCount = arraysize(kWordRanges);
3569 static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 };
3570 static const int kDigitRangeCount = arraysize(kDigitRanges);
3571 static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 };
3572 static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
3573 static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E,
3574     0x2028, 0x202A, 0x10000 };
3575 static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
3576 
3577 
Set(int character)3578 void BoyerMoorePositionInfo::Set(int character) {
3579   SetInterval(Interval(character, character));
3580 }
3581 
3582 
SetInterval(const Interval & interval)3583 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3584   s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3585   w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3586   d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3587   surrogate_ =
3588       AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3589   if (interval.to() - interval.from() >= kMapSize - 1) {
3590     if (map_count_ != kMapSize) {
3591       map_count_ = kMapSize;
3592       for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3593     }
3594     return;
3595   }
3596   for (int i = interval.from(); i <= interval.to(); i++) {
3597     int mod_character = (i & kMask);
3598     if (!map_->at(mod_character)) {
3599       map_count_++;
3600       map_->at(mod_character) = true;
3601     }
3602     if (map_count_ == kMapSize) return;
3603   }
3604 }
3605 
3606 
SetAll()3607 void BoyerMoorePositionInfo::SetAll() {
3608   s_ = w_ = d_ = kLatticeUnknown;
3609   if (map_count_ != kMapSize) {
3610     map_count_ = kMapSize;
3611     for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3612   }
3613 }
3614 
3615 
BoyerMooreLookahead(int length,RegExpCompiler * compiler,Zone * zone)3616 BoyerMooreLookahead::BoyerMooreLookahead(
3617     int length, RegExpCompiler* compiler, Zone* zone)
3618     : length_(length),
3619       compiler_(compiler) {
3620   if (compiler->one_byte()) {
3621     max_char_ = String::kMaxOneByteCharCode;
3622   } else {
3623     max_char_ = String::kMaxUtf16CodeUnit;
3624   }
3625   bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3626   for (int i = 0; i < length; i++) {
3627     bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3628   }
3629 }
3630 
3631 
3632 // Find the longest range of lookahead that has the fewest number of different
3633 // characters that can occur at a given position.  Since we are optimizing two
3634 // different parameters at once this is a tradeoff.
FindWorthwhileInterval(int * from,int * to)3635 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3636   int biggest_points = 0;
3637   // If more than 32 characters out of 128 can occur it is unlikely that we can
3638   // be lucky enough to step forwards much of the time.
3639   const int kMaxMax = 32;
3640   for (int max_number_of_chars = 4;
3641        max_number_of_chars < kMaxMax;
3642        max_number_of_chars *= 2) {
3643     biggest_points =
3644         FindBestInterval(max_number_of_chars, biggest_points, from, to);
3645   }
3646   if (biggest_points == 0) return false;
3647   return true;
3648 }
3649 
3650 
3651 // Find the highest-points range between 0 and length_ where the character
3652 // information is not too vague.  'Too vague' means that there are more than
3653 // max_number_of_chars that can occur at this position.  Calculates the number
3654 // of points as the product of width-of-the-range and
3655 // probability-of-finding-one-of-the-characters, where the probability is
3656 // calculated using the frequency distribution of the sample subject string.
FindBestInterval(int max_number_of_chars,int old_biggest_points,int * from,int * to)3657 int BoyerMooreLookahead::FindBestInterval(
3658     int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3659   int biggest_points = old_biggest_points;
3660   static const int kSize = RegExpMacroAssembler::kTableSize;
3661   for (int i = 0; i < length_; ) {
3662     while (i < length_ && Count(i) > max_number_of_chars) i++;
3663     if (i == length_) break;
3664     int remembered_from = i;
3665     bool union_map[kSize];
3666     for (int j = 0; j < kSize; j++) union_map[j] = false;
3667     while (i < length_ && Count(i) <= max_number_of_chars) {
3668       BoyerMoorePositionInfo* map = bitmaps_->at(i);
3669       for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3670       i++;
3671     }
3672     int frequency = 0;
3673     for (int j = 0; j < kSize; j++) {
3674       if (union_map[j]) {
3675         // Add 1 to the frequency to give a small per-character boost for
3676         // the cases where our sampling is not good enough and many
3677         // characters have a frequency of zero.  This means the frequency
3678         // can theoretically be up to 2*kSize though we treat it mostly as
3679         // a fraction of kSize.
3680         frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3681       }
3682     }
3683     // We use the probability of skipping times the distance we are skipping to
3684     // judge the effectiveness of this.  Actually we have a cut-off:  By
3685     // dividing by 2 we switch off the skipping if the probability of skipping
3686     // is less than 50%.  This is because the multibyte mask-and-compare
3687     // skipping in quickcheck is more likely to do well on this case.
3688     bool in_quickcheck_range =
3689         ((i - remembered_from < 4) ||
3690          (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3691     // Called 'probability' but it is only a rough estimate and can actually
3692     // be outside the 0-kSize range.
3693     int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3694     int points = (i - remembered_from) * probability;
3695     if (points > biggest_points) {
3696       *from = remembered_from;
3697       *to = i - 1;
3698       biggest_points = points;
3699     }
3700   }
3701   return biggest_points;
3702 }
3703 
3704 
3705 // Take all the characters that will not prevent a successful match if they
3706 // occur in the subject string in the range between min_lookahead and
3707 // max_lookahead (inclusive) measured from the current position.  If the
3708 // character at max_lookahead offset is not one of these characters, then we
3709 // can safely skip forwards by the number of characters in the range.
GetSkipTable(int min_lookahead,int max_lookahead,Handle<ByteArray> boolean_skip_table)3710 int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3711                                       int max_lookahead,
3712                                       Handle<ByteArray> boolean_skip_table) {
3713   const int kSize = RegExpMacroAssembler::kTableSize;
3714 
3715   const int kSkipArrayEntry = 0;
3716   const int kDontSkipArrayEntry = 1;
3717 
3718   for (int i = 0; i < kSize; i++) {
3719     boolean_skip_table->set(i, kSkipArrayEntry);
3720   }
3721   int skip = max_lookahead + 1 - min_lookahead;
3722 
3723   for (int i = max_lookahead; i >= min_lookahead; i--) {
3724     BoyerMoorePositionInfo* map = bitmaps_->at(i);
3725     for (int j = 0; j < kSize; j++) {
3726       if (map->at(j)) {
3727         boolean_skip_table->set(j, kDontSkipArrayEntry);
3728       }
3729     }
3730   }
3731 
3732   return skip;
3733 }
3734 
3735 
3736 // See comment above on the implementation of GetSkipTable.
EmitSkipInstructions(RegExpMacroAssembler * masm)3737 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3738   const int kSize = RegExpMacroAssembler::kTableSize;
3739 
3740   int min_lookahead = 0;
3741   int max_lookahead = 0;
3742 
3743   if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3744 
3745   bool found_single_character = false;
3746   int single_character = 0;
3747   for (int i = max_lookahead; i >= min_lookahead; i--) {
3748     BoyerMoorePositionInfo* map = bitmaps_->at(i);
3749     if (map->map_count() > 1 ||
3750         (found_single_character && map->map_count() != 0)) {
3751       found_single_character = false;
3752       break;
3753     }
3754     for (int j = 0; j < kSize; j++) {
3755       if (map->at(j)) {
3756         found_single_character = true;
3757         single_character = j;
3758         break;
3759       }
3760     }
3761   }
3762 
3763   int lookahead_width = max_lookahead + 1 - min_lookahead;
3764 
3765   if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3766     // The mask-compare can probably handle this better.
3767     return;
3768   }
3769 
3770   if (found_single_character) {
3771     Label cont, again;
3772     masm->Bind(&again);
3773     masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3774     if (max_char_ > kSize) {
3775       masm->CheckCharacterAfterAnd(single_character,
3776                                    RegExpMacroAssembler::kTableMask,
3777                                    &cont);
3778     } else {
3779       masm->CheckCharacter(single_character, &cont);
3780     }
3781     masm->AdvanceCurrentPosition(lookahead_width);
3782     masm->GoTo(&again);
3783     masm->Bind(&cont);
3784     return;
3785   }
3786 
3787   Factory* factory = masm->isolate()->factory();
3788   Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
3789   int skip_distance = GetSkipTable(
3790       min_lookahead, max_lookahead, boolean_skip_table);
3791   DCHECK(skip_distance != 0);
3792 
3793   Label cont, again;
3794   masm->Bind(&again);
3795   masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3796   masm->CheckBitInTable(boolean_skip_table, &cont);
3797   masm->AdvanceCurrentPosition(skip_distance);
3798   masm->GoTo(&again);
3799   masm->Bind(&cont);
3800 }
3801 
3802 
3803 /* Code generation for choice nodes.
3804  *
3805  * We generate quick checks that do a mask and compare to eliminate a
3806  * choice.  If the quick check succeeds then it jumps to the continuation to
3807  * do slow checks and check subsequent nodes.  If it fails (the common case)
3808  * it falls through to the next choice.
3809  *
3810  * Here is the desired flow graph.  Nodes directly below each other imply
3811  * fallthrough.  Alternatives 1 and 2 have quick checks.  Alternative
3812  * 3 doesn't have a quick check so we have to call the slow check.
3813  * Nodes are marked Qn for quick checks and Sn for slow checks.  The entire
3814  * regexp continuation is generated directly after the Sn node, up to the
3815  * next GoTo if we decide to reuse some already generated code.  Some
3816  * nodes expect preload_characters to be preloaded into the current
3817  * character register.  R nodes do this preloading.  Vertices are marked
3818  * F for failures and S for success (possible success in the case of quick
3819  * nodes).  L, V, < and > are used as arrow heads.
3820  *
3821  * ----------> R
3822  *             |
3823  *             V
3824  *            Q1 -----> S1
3825  *             |   S   /
3826  *            F|      /
3827  *             |    F/
3828  *             |    /
3829  *             |   R
3830  *             |  /
3831  *             V L
3832  *            Q2 -----> S2
3833  *             |   S   /
3834  *            F|      /
3835  *             |    F/
3836  *             |    /
3837  *             |   R
3838  *             |  /
3839  *             V L
3840  *            S3
3841  *             |
3842  *            F|
3843  *             |
3844  *             R
3845  *             |
3846  * backtrack   V
3847  * <----------Q4
3848  *   \    F    |
3849  *    \        |S
3850  *     \   F   V
3851  *      \-----S4
3852  *
3853  * For greedy loops we push the current position, then generate the code that
3854  * eats the input specially in EmitGreedyLoop.  The other choice (the
3855  * continuation) is generated by the normal code in EmitChoices, and steps back
3856  * in the input to the starting position when it fails to match.  The loop code
3857  * looks like this (U is the unwind code that steps back in the greedy loop).
3858  *
3859  *              _____
3860  *             /     \
3861  *             V     |
3862  * ----------> S1    |
3863  *            /|     |
3864  *           / |S    |
3865  *         F/  \_____/
3866  *         /
3867  *        |<-----
3868  *        |      \
3869  *        V       |S
3870  *        Q2 ---> U----->backtrack
3871  *        |  F   /
3872  *       S|     /
3873  *        V  F /
3874  *        S2--/
3875  */
3876 
GreedyLoopState(bool not_at_start)3877 GreedyLoopState::GreedyLoopState(bool not_at_start) {
3878   counter_backtrack_trace_.set_backtrack(&label_);
3879   if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3880 }
3881 
3882 
AssertGuardsMentionRegisters(Trace * trace)3883 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3884 #ifdef DEBUG
3885   int choice_count = alternatives_->length();
3886   for (int i = 0; i < choice_count - 1; i++) {
3887     GuardedAlternative alternative = alternatives_->at(i);
3888     ZoneList<Guard*>* guards = alternative.guards();
3889     int guard_count = (guards == NULL) ? 0 : guards->length();
3890     for (int j = 0; j < guard_count; j++) {
3891       DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3892     }
3893   }
3894 #endif
3895 }
3896 
3897 
SetUpPreLoad(RegExpCompiler * compiler,Trace * current_trace,PreloadState * state)3898 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3899                               Trace* current_trace,
3900                               PreloadState* state) {
3901     if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3902       // Save some time by looking at most one machine word ahead.
3903       state->eats_at_least_ =
3904           EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3905                       current_trace->at_start() == Trace::FALSE_VALUE);
3906     }
3907     state->preload_characters_ =
3908         CalculatePreloadCharacters(compiler, state->eats_at_least_);
3909 
3910     state->preload_is_current_ =
3911         (current_trace->characters_preloaded() == state->preload_characters_);
3912     state->preload_has_checked_bounds_ = state->preload_is_current_;
3913 }
3914 
3915 
Emit(RegExpCompiler * compiler,Trace * trace)3916 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3917   int choice_count = alternatives_->length();
3918 
3919   AssertGuardsMentionRegisters(trace);
3920 
3921   LimitResult limit_result = LimitVersions(compiler, trace);
3922   if (limit_result == DONE) return;
3923   DCHECK(limit_result == CONTINUE);
3924 
3925   // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3926   // other choice nodes we only flush if we are out of code size budget.
3927   if (trace->flush_budget() == 0 && trace->actions() != NULL) {
3928     trace->Flush(compiler, this);
3929     return;
3930   }
3931 
3932   RecursionCheck rc(compiler);
3933 
3934   PreloadState preload;
3935   preload.init();
3936   GreedyLoopState greedy_loop_state(not_at_start());
3937 
3938   int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3939   AlternativeGenerationList alt_gens(choice_count, zone());
3940 
3941   if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3942     trace = EmitGreedyLoop(compiler,
3943                            trace,
3944                            &alt_gens,
3945                            &preload,
3946                            &greedy_loop_state,
3947                            text_length);
3948   } else {
3949     // TODO(erikcorry): Delete this.  We don't need this label, but it makes us
3950     // match the traces produced pre-cleanup.
3951     Label second_choice;
3952     compiler->macro_assembler()->Bind(&second_choice);
3953 
3954     preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3955 
3956     EmitChoices(compiler,
3957                 &alt_gens,
3958                 0,
3959                 trace,
3960                 &preload);
3961   }
3962 
3963   // At this point we need to generate slow checks for the alternatives where
3964   // the quick check was inlined.  We can recognize these because the associated
3965   // label was bound.
3966   int new_flush_budget = trace->flush_budget() / choice_count;
3967   for (int i = 0; i < choice_count; i++) {
3968     AlternativeGeneration* alt_gen = alt_gens.at(i);
3969     Trace new_trace(*trace);
3970     // If there are actions to be flushed we have to limit how many times
3971     // they are flushed.  Take the budget of the parent trace and distribute
3972     // it fairly amongst the children.
3973     if (new_trace.actions() != NULL) {
3974       new_trace.set_flush_budget(new_flush_budget);
3975     }
3976     bool next_expects_preload =
3977         i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3978     EmitOutOfLineContinuation(compiler,
3979                               &new_trace,
3980                               alternatives_->at(i),
3981                               alt_gen,
3982                               preload.preload_characters_,
3983                               next_expects_preload);
3984   }
3985 }
3986 
3987 
EmitGreedyLoop(RegExpCompiler * compiler,Trace * trace,AlternativeGenerationList * alt_gens,PreloadState * preload,GreedyLoopState * greedy_loop_state,int text_length)3988 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
3989                                   Trace* trace,
3990                                   AlternativeGenerationList* alt_gens,
3991                                   PreloadState* preload,
3992                                   GreedyLoopState* greedy_loop_state,
3993                                   int text_length) {
3994   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3995   // Here we have special handling for greedy loops containing only text nodes
3996   // and other simple nodes.  These are handled by pushing the current
3997   // position on the stack and then incrementing the current position each
3998   // time around the switch.  On backtrack we decrement the current position
3999   // and check it against the pushed value.  This avoids pushing backtrack
4000   // information for each iteration of the loop, which could take up a lot of
4001   // space.
4002   DCHECK(trace->stop_node() == NULL);
4003   macro_assembler->PushCurrentPosition();
4004   Label greedy_match_failed;
4005   Trace greedy_match_trace;
4006   if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
4007   greedy_match_trace.set_backtrack(&greedy_match_failed);
4008   Label loop_label;
4009   macro_assembler->Bind(&loop_label);
4010   greedy_match_trace.set_stop_node(this);
4011   greedy_match_trace.set_loop_label(&loop_label);
4012   alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
4013   macro_assembler->Bind(&greedy_match_failed);
4014 
4015   Label second_choice;  // For use in greedy matches.
4016   macro_assembler->Bind(&second_choice);
4017 
4018   Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
4019 
4020   EmitChoices(compiler,
4021               alt_gens,
4022               1,
4023               new_trace,
4024               preload);
4025 
4026   macro_assembler->Bind(greedy_loop_state->label());
4027   // If we have unwound to the bottom then backtrack.
4028   macro_assembler->CheckGreedyLoop(trace->backtrack());
4029   // Otherwise try the second priority at an earlier position.
4030   macro_assembler->AdvanceCurrentPosition(-text_length);
4031   macro_assembler->GoTo(&second_choice);
4032   return new_trace;
4033 }
4034 
EmitOptimizedUnanchoredSearch(RegExpCompiler * compiler,Trace * trace)4035 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
4036                                               Trace* trace) {
4037   int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
4038   if (alternatives_->length() != 2) return eats_at_least;
4039 
4040   GuardedAlternative alt1 = alternatives_->at(1);
4041   if (alt1.guards() != NULL && alt1.guards()->length() != 0) {
4042     return eats_at_least;
4043   }
4044   RegExpNode* eats_anything_node = alt1.node();
4045   if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
4046     return eats_at_least;
4047   }
4048 
4049   // Really we should be creating a new trace when we execute this function,
4050   // but there is no need, because the code it generates cannot backtrack, and
4051   // we always arrive here with a trivial trace (since it's the entry to a
4052   // loop.  That also implies that there are no preloaded characters, which is
4053   // good, because it means we won't be violating any assumptions by
4054   // overwriting those characters with new load instructions.
4055   DCHECK(trace->is_trivial());
4056 
4057   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4058   Isolate* isolate = macro_assembler->isolate();
4059   // At this point we know that we are at a non-greedy loop that will eat
4060   // any character one at a time.  Any non-anchored regexp has such a
4061   // loop prepended to it in order to find where it starts.  We look for
4062   // a pattern of the form ...abc... where we can look 6 characters ahead
4063   // and step forwards 3 if the character is not one of abc.  Abc need
4064   // not be atoms, they can be any reasonably limited character class or
4065   // small alternation.
4066   BoyerMooreLookahead* bm = bm_info(false);
4067   if (bm == NULL) {
4068     eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4069                         EatsAtLeast(kMaxLookaheadForBoyerMoore,
4070                                     kRecursionBudget,
4071                                     false));
4072     if (eats_at_least >= 1) {
4073       bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4074                                            compiler,
4075                                            zone());
4076       GuardedAlternative alt0 = alternatives_->at(0);
4077       alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
4078     }
4079   }
4080   if (bm != NULL) {
4081     bm->EmitSkipInstructions(macro_assembler);
4082   }
4083   return eats_at_least;
4084 }
4085 
4086 
EmitChoices(RegExpCompiler * compiler,AlternativeGenerationList * alt_gens,int first_choice,Trace * trace,PreloadState * preload)4087 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
4088                              AlternativeGenerationList* alt_gens,
4089                              int first_choice,
4090                              Trace* trace,
4091                              PreloadState* preload) {
4092   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4093   SetUpPreLoad(compiler, trace, preload);
4094 
4095   // For now we just call all choices one after the other.  The idea ultimately
4096   // is to use the Dispatch table to try only the relevant ones.
4097   int choice_count = alternatives_->length();
4098 
4099   int new_flush_budget = trace->flush_budget() / choice_count;
4100 
4101   for (int i = first_choice; i < choice_count; i++) {
4102     bool is_last = i == choice_count - 1;
4103     bool fall_through_on_failure = !is_last;
4104     GuardedAlternative alternative = alternatives_->at(i);
4105     AlternativeGeneration* alt_gen = alt_gens->at(i);
4106     alt_gen->quick_check_details.set_characters(preload->preload_characters_);
4107     ZoneList<Guard*>* guards = alternative.guards();
4108     int guard_count = (guards == NULL) ? 0 : guards->length();
4109     Trace new_trace(*trace);
4110     new_trace.set_characters_preloaded(preload->preload_is_current_ ?
4111                                          preload->preload_characters_ :
4112                                          0);
4113     if (preload->preload_has_checked_bounds_) {
4114       new_trace.set_bound_checked_up_to(preload->preload_characters_);
4115     }
4116     new_trace.quick_check_performed()->Clear();
4117     if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4118     if (!is_last) {
4119       new_trace.set_backtrack(&alt_gen->after);
4120     }
4121     alt_gen->expects_preload = preload->preload_is_current_;
4122     bool generate_full_check_inline = false;
4123     if (compiler->optimize() &&
4124         try_to_emit_quick_check_for_alternative(i == 0) &&
4125         alternative.node()->EmitQuickCheck(
4126             compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
4127             &alt_gen->possible_success, &alt_gen->quick_check_details,
4128             fall_through_on_failure)) {
4129       // Quick check was generated for this choice.
4130       preload->preload_is_current_ = true;
4131       preload->preload_has_checked_bounds_ = true;
4132       // If we generated the quick check to fall through on possible success,
4133       // we now need to generate the full check inline.
4134       if (!fall_through_on_failure) {
4135         macro_assembler->Bind(&alt_gen->possible_success);
4136         new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4137         new_trace.set_characters_preloaded(preload->preload_characters_);
4138         new_trace.set_bound_checked_up_to(preload->preload_characters_);
4139         generate_full_check_inline = true;
4140       }
4141     } else if (alt_gen->quick_check_details.cannot_match()) {
4142       if (!fall_through_on_failure) {
4143         macro_assembler->GoTo(trace->backtrack());
4144       }
4145       continue;
4146     } else {
4147       // No quick check was generated.  Put the full code here.
4148       // If this is not the first choice then there could be slow checks from
4149       // previous cases that go here when they fail.  There's no reason to
4150       // insist that they preload characters since the slow check we are about
4151       // to generate probably can't use it.
4152       if (i != first_choice) {
4153         alt_gen->expects_preload = false;
4154         new_trace.InvalidateCurrentCharacter();
4155       }
4156       generate_full_check_inline = true;
4157     }
4158     if (generate_full_check_inline) {
4159       if (new_trace.actions() != NULL) {
4160         new_trace.set_flush_budget(new_flush_budget);
4161       }
4162       for (int j = 0; j < guard_count; j++) {
4163         GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4164       }
4165       alternative.node()->Emit(compiler, &new_trace);
4166       preload->preload_is_current_ = false;
4167     }
4168     macro_assembler->Bind(&alt_gen->after);
4169   }
4170 }
4171 
4172 
EmitOutOfLineContinuation(RegExpCompiler * compiler,Trace * trace,GuardedAlternative alternative,AlternativeGeneration * alt_gen,int preload_characters,bool next_expects_preload)4173 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4174                                            Trace* trace,
4175                                            GuardedAlternative alternative,
4176                                            AlternativeGeneration* alt_gen,
4177                                            int preload_characters,
4178                                            bool next_expects_preload) {
4179   if (!alt_gen->possible_success.is_linked()) return;
4180 
4181   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4182   macro_assembler->Bind(&alt_gen->possible_success);
4183   Trace out_of_line_trace(*trace);
4184   out_of_line_trace.set_characters_preloaded(preload_characters);
4185   out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4186   if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4187   ZoneList<Guard*>* guards = alternative.guards();
4188   int guard_count = (guards == NULL) ? 0 : guards->length();
4189   if (next_expects_preload) {
4190     Label reload_current_char;
4191     out_of_line_trace.set_backtrack(&reload_current_char);
4192     for (int j = 0; j < guard_count; j++) {
4193       GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4194     }
4195     alternative.node()->Emit(compiler, &out_of_line_trace);
4196     macro_assembler->Bind(&reload_current_char);
4197     // Reload the current character, since the next quick check expects that.
4198     // We don't need to check bounds here because we only get into this
4199     // code through a quick check which already did the checked load.
4200     macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
4201                                           NULL,
4202                                           false,
4203                                           preload_characters);
4204     macro_assembler->GoTo(&(alt_gen->after));
4205   } else {
4206     out_of_line_trace.set_backtrack(&(alt_gen->after));
4207     for (int j = 0; j < guard_count; j++) {
4208       GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4209     }
4210     alternative.node()->Emit(compiler, &out_of_line_trace);
4211   }
4212 }
4213 
4214 
Emit(RegExpCompiler * compiler,Trace * trace)4215 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4216   RegExpMacroAssembler* assembler = compiler->macro_assembler();
4217   LimitResult limit_result = LimitVersions(compiler, trace);
4218   if (limit_result == DONE) return;
4219   DCHECK(limit_result == CONTINUE);
4220 
4221   RecursionCheck rc(compiler);
4222 
4223   switch (action_type_) {
4224     case STORE_POSITION: {
4225       Trace::DeferredCapture
4226           new_capture(data_.u_position_register.reg,
4227                       data_.u_position_register.is_capture,
4228                       trace);
4229       Trace new_trace = *trace;
4230       new_trace.add_action(&new_capture);
4231       on_success()->Emit(compiler, &new_trace);
4232       break;
4233     }
4234     case INCREMENT_REGISTER: {
4235       Trace::DeferredIncrementRegister
4236           new_increment(data_.u_increment_register.reg);
4237       Trace new_trace = *trace;
4238       new_trace.add_action(&new_increment);
4239       on_success()->Emit(compiler, &new_trace);
4240       break;
4241     }
4242     case SET_REGISTER: {
4243       Trace::DeferredSetRegister
4244           new_set(data_.u_store_register.reg, data_.u_store_register.value);
4245       Trace new_trace = *trace;
4246       new_trace.add_action(&new_set);
4247       on_success()->Emit(compiler, &new_trace);
4248       break;
4249     }
4250     case CLEAR_CAPTURES: {
4251       Trace::DeferredClearCaptures
4252         new_capture(Interval(data_.u_clear_captures.range_from,
4253                              data_.u_clear_captures.range_to));
4254       Trace new_trace = *trace;
4255       new_trace.add_action(&new_capture);
4256       on_success()->Emit(compiler, &new_trace);
4257       break;
4258     }
4259     case BEGIN_SUBMATCH:
4260       if (!trace->is_trivial()) {
4261         trace->Flush(compiler, this);
4262       } else {
4263         assembler->WriteCurrentPositionToRegister(
4264             data_.u_submatch.current_position_register, 0);
4265         assembler->WriteStackPointerToRegister(
4266             data_.u_submatch.stack_pointer_register);
4267         on_success()->Emit(compiler, trace);
4268       }
4269       break;
4270     case EMPTY_MATCH_CHECK: {
4271       int start_pos_reg = data_.u_empty_match_check.start_register;
4272       int stored_pos = 0;
4273       int rep_reg = data_.u_empty_match_check.repetition_register;
4274       bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4275       bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4276       if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4277         // If we know we haven't advanced and there is no minimum we
4278         // can just backtrack immediately.
4279         assembler->GoTo(trace->backtrack());
4280       } else if (know_dist && stored_pos < trace->cp_offset()) {
4281         // If we know we've advanced we can generate the continuation
4282         // immediately.
4283         on_success()->Emit(compiler, trace);
4284       } else if (!trace->is_trivial()) {
4285         trace->Flush(compiler, this);
4286       } else {
4287         Label skip_empty_check;
4288         // If we have a minimum number of repetitions we check the current
4289         // number first and skip the empty check if it's not enough.
4290         if (has_minimum) {
4291           int limit = data_.u_empty_match_check.repetition_limit;
4292           assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4293         }
4294         // If the match is empty we bail out, otherwise we fall through
4295         // to the on-success continuation.
4296         assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4297                                    trace->backtrack());
4298         assembler->Bind(&skip_empty_check);
4299         on_success()->Emit(compiler, trace);
4300       }
4301       break;
4302     }
4303     case POSITIVE_SUBMATCH_SUCCESS: {
4304       if (!trace->is_trivial()) {
4305         trace->Flush(compiler, this);
4306         return;
4307       }
4308       assembler->ReadCurrentPositionFromRegister(
4309           data_.u_submatch.current_position_register);
4310       assembler->ReadStackPointerFromRegister(
4311           data_.u_submatch.stack_pointer_register);
4312       int clear_register_count = data_.u_submatch.clear_register_count;
4313       if (clear_register_count == 0) {
4314         on_success()->Emit(compiler, trace);
4315         return;
4316       }
4317       int clear_registers_from = data_.u_submatch.clear_register_from;
4318       Label clear_registers_backtrack;
4319       Trace new_trace = *trace;
4320       new_trace.set_backtrack(&clear_registers_backtrack);
4321       on_success()->Emit(compiler, &new_trace);
4322 
4323       assembler->Bind(&clear_registers_backtrack);
4324       int clear_registers_to = clear_registers_from + clear_register_count - 1;
4325       assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4326 
4327       DCHECK(trace->backtrack() == NULL);
4328       assembler->Backtrack();
4329       return;
4330     }
4331     default:
4332       UNREACHABLE();
4333   }
4334 }
4335 
4336 
Emit(RegExpCompiler * compiler,Trace * trace)4337 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4338   RegExpMacroAssembler* assembler = compiler->macro_assembler();
4339   if (!trace->is_trivial()) {
4340     trace->Flush(compiler, this);
4341     return;
4342   }
4343 
4344   LimitResult limit_result = LimitVersions(compiler, trace);
4345   if (limit_result == DONE) return;
4346   DCHECK(limit_result == CONTINUE);
4347 
4348   RecursionCheck rc(compiler);
4349 
4350   DCHECK_EQ(start_reg_ + 1, end_reg_);
4351   if (compiler->ignore_case()) {
4352     assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(),
4353                                                trace->backtrack());
4354   } else {
4355     assembler->CheckNotBackReference(start_reg_, read_backward(),
4356                                      trace->backtrack());
4357   }
4358   // We are going to advance backward, so we may end up at the start.
4359   if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
4360   on_success()->Emit(compiler, trace);
4361 }
4362 
4363 
4364 // -------------------------------------------------------------------
4365 // Dot/dotty output
4366 
4367 
4368 #ifdef DEBUG
4369 
4370 
4371 class DotPrinter: public NodeVisitor {
4372  public:
DotPrinter(std::ostream & os,bool ignore_case)4373   DotPrinter(std::ostream& os, bool ignore_case)  // NOLINT
4374       : os_(os),
4375         ignore_case_(ignore_case) {}
4376   void PrintNode(const char* label, RegExpNode* node);
4377   void Visit(RegExpNode* node);
4378   void PrintAttributes(RegExpNode* from);
4379   void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4380 #define DECLARE_VISIT(Type)                                          \
4381   virtual void Visit##Type(Type##Node* that);
4382 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4383 #undef DECLARE_VISIT
4384  private:
4385   std::ostream& os_;
4386   bool ignore_case_;
4387 };
4388 
4389 
PrintNode(const char * label,RegExpNode * node)4390 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4391   os_ << "digraph G {\n  graph [label=\"";
4392   for (int i = 0; label[i]; i++) {
4393     switch (label[i]) {
4394       case '\\':
4395         os_ << "\\\\";
4396         break;
4397       case '"':
4398         os_ << "\"";
4399         break;
4400       default:
4401         os_ << label[i];
4402         break;
4403     }
4404   }
4405   os_ << "\"];\n";
4406   Visit(node);
4407   os_ << "}" << std::endl;
4408 }
4409 
4410 
Visit(RegExpNode * node)4411 void DotPrinter::Visit(RegExpNode* node) {
4412   if (node->info()->visited) return;
4413   node->info()->visited = true;
4414   node->Accept(this);
4415 }
4416 
4417 
PrintOnFailure(RegExpNode * from,RegExpNode * on_failure)4418 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4419   os_ << "  n" << from << " -> n" << on_failure << " [style=dotted];\n";
4420   Visit(on_failure);
4421 }
4422 
4423 
4424 class TableEntryBodyPrinter {
4425  public:
TableEntryBodyPrinter(std::ostream & os,ChoiceNode * choice)4426   TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice)  // NOLINT
4427       : os_(os),
4428         choice_(choice) {}
Call(uc16 from,DispatchTable::Entry entry)4429   void Call(uc16 from, DispatchTable::Entry entry) {
4430     OutSet* out_set = entry.out_set();
4431     for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4432       if (out_set->Get(i)) {
4433         os_ << "    n" << choice() << ":s" << from << "o" << i << " -> n"
4434             << choice()->alternatives()->at(i).node() << ";\n";
4435       }
4436     }
4437   }
4438  private:
choice()4439   ChoiceNode* choice() { return choice_; }
4440   std::ostream& os_;
4441   ChoiceNode* choice_;
4442 };
4443 
4444 
4445 class TableEntryHeaderPrinter {
4446  public:
TableEntryHeaderPrinter(std::ostream & os)4447   explicit TableEntryHeaderPrinter(std::ostream& os)  // NOLINT
4448       : first_(true),
4449         os_(os) {}
Call(uc16 from,DispatchTable::Entry entry)4450   void Call(uc16 from, DispatchTable::Entry entry) {
4451     if (first_) {
4452       first_ = false;
4453     } else {
4454       os_ << "|";
4455     }
4456     os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
4457     OutSet* out_set = entry.out_set();
4458     int priority = 0;
4459     for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4460       if (out_set->Get(i)) {
4461         if (priority > 0) os_ << "|";
4462         os_ << "<s" << from << "o" << i << "> " << priority;
4463         priority++;
4464       }
4465     }
4466     os_ << "}}";
4467   }
4468 
4469  private:
4470   bool first_;
4471   std::ostream& os_;
4472 };
4473 
4474 
4475 class AttributePrinter {
4476  public:
AttributePrinter(std::ostream & os)4477   explicit AttributePrinter(std::ostream& os)  // NOLINT
4478       : os_(os),
4479         first_(true) {}
PrintSeparator()4480   void PrintSeparator() {
4481     if (first_) {
4482       first_ = false;
4483     } else {
4484       os_ << "|";
4485     }
4486   }
PrintBit(const char * name,bool value)4487   void PrintBit(const char* name, bool value) {
4488     if (!value) return;
4489     PrintSeparator();
4490     os_ << "{" << name << "}";
4491   }
PrintPositive(const char * name,int value)4492   void PrintPositive(const char* name, int value) {
4493     if (value < 0) return;
4494     PrintSeparator();
4495     os_ << "{" << name << "|" << value << "}";
4496   }
4497 
4498  private:
4499   std::ostream& os_;
4500   bool first_;
4501 };
4502 
4503 
PrintAttributes(RegExpNode * that)4504 void DotPrinter::PrintAttributes(RegExpNode* that) {
4505   os_ << "  a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
4506       << "margin=0.1, fontsize=10, label=\"{";
4507   AttributePrinter printer(os_);
4508   NodeInfo* info = that->info();
4509   printer.PrintBit("NI", info->follows_newline_interest);
4510   printer.PrintBit("WI", info->follows_word_interest);
4511   printer.PrintBit("SI", info->follows_start_interest);
4512   Label* label = that->label();
4513   if (label->is_bound())
4514     printer.PrintPositive("@", label->pos());
4515   os_ << "}\"];\n"
4516       << "  a" << that << " -> n" << that
4517       << " [style=dashed, color=grey, arrowhead=none];\n";
4518 }
4519 
4520 
4521 static const bool kPrintDispatchTable = false;
VisitChoice(ChoiceNode * that)4522 void DotPrinter::VisitChoice(ChoiceNode* that) {
4523   if (kPrintDispatchTable) {
4524     os_ << "  n" << that << " [shape=Mrecord, label=\"";
4525     TableEntryHeaderPrinter header_printer(os_);
4526     that->GetTable(ignore_case_)->ForEach(&header_printer);
4527     os_ << "\"]\n";
4528     PrintAttributes(that);
4529     TableEntryBodyPrinter body_printer(os_, that);
4530     that->GetTable(ignore_case_)->ForEach(&body_printer);
4531   } else {
4532     os_ << "  n" << that << " [shape=Mrecord, label=\"?\"];\n";
4533     for (int i = 0; i < that->alternatives()->length(); i++) {
4534       GuardedAlternative alt = that->alternatives()->at(i);
4535       os_ << "  n" << that << " -> n" << alt.node();
4536     }
4537   }
4538   for (int i = 0; i < that->alternatives()->length(); i++) {
4539     GuardedAlternative alt = that->alternatives()->at(i);
4540     alt.node()->Accept(this);
4541   }
4542 }
4543 
4544 
VisitText(TextNode * that)4545 void DotPrinter::VisitText(TextNode* that) {
4546   Zone* zone = that->zone();
4547   os_ << "  n" << that << " [label=\"";
4548   for (int i = 0; i < that->elements()->length(); i++) {
4549     if (i > 0) os_ << " ";
4550     TextElement elm = that->elements()->at(i);
4551     switch (elm.text_type()) {
4552       case TextElement::ATOM: {
4553         Vector<const uc16> data = elm.atom()->data();
4554         for (int i = 0; i < data.length(); i++) {
4555           os_ << static_cast<char>(data[i]);
4556         }
4557         break;
4558       }
4559       case TextElement::CHAR_CLASS: {
4560         RegExpCharacterClass* node = elm.char_class();
4561         os_ << "[";
4562         if (node->is_negated()) os_ << "^";
4563         for (int j = 0; j < node->ranges(zone)->length(); j++) {
4564           CharacterRange range = node->ranges(zone)->at(j);
4565           os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
4566         }
4567         os_ << "]";
4568         break;
4569       }
4570       default:
4571         UNREACHABLE();
4572     }
4573   }
4574   os_ << "\", shape=box, peripheries=2];\n";
4575   PrintAttributes(that);
4576   os_ << "  n" << that << " -> n" << that->on_success() << ";\n";
4577   Visit(that->on_success());
4578 }
4579 
4580 
VisitBackReference(BackReferenceNode * that)4581 void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4582   os_ << "  n" << that << " [label=\"$" << that->start_register() << "..$"
4583       << that->end_register() << "\", shape=doubleoctagon];\n";
4584   PrintAttributes(that);
4585   os_ << "  n" << that << " -> n" << that->on_success() << ";\n";
4586   Visit(that->on_success());
4587 }
4588 
4589 
VisitEnd(EndNode * that)4590 void DotPrinter::VisitEnd(EndNode* that) {
4591   os_ << "  n" << that << " [style=bold, shape=point];\n";
4592   PrintAttributes(that);
4593 }
4594 
4595 
VisitAssertion(AssertionNode * that)4596 void DotPrinter::VisitAssertion(AssertionNode* that) {
4597   os_ << "  n" << that << " [";
4598   switch (that->assertion_type()) {
4599     case AssertionNode::AT_END:
4600       os_ << "label=\"$\", shape=septagon";
4601       break;
4602     case AssertionNode::AT_START:
4603       os_ << "label=\"^\", shape=septagon";
4604       break;
4605     case AssertionNode::AT_BOUNDARY:
4606       os_ << "label=\"\\b\", shape=septagon";
4607       break;
4608     case AssertionNode::AT_NON_BOUNDARY:
4609       os_ << "label=\"\\B\", shape=septagon";
4610       break;
4611     case AssertionNode::AFTER_NEWLINE:
4612       os_ << "label=\"(?<=\\n)\", shape=septagon";
4613       break;
4614   }
4615   os_ << "];\n";
4616   PrintAttributes(that);
4617   RegExpNode* successor = that->on_success();
4618   os_ << "  n" << that << " -> n" << successor << ";\n";
4619   Visit(successor);
4620 }
4621 
4622 
VisitAction(ActionNode * that)4623 void DotPrinter::VisitAction(ActionNode* that) {
4624   os_ << "  n" << that << " [";
4625   switch (that->action_type_) {
4626     case ActionNode::SET_REGISTER:
4627       os_ << "label=\"$" << that->data_.u_store_register.reg
4628           << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
4629       break;
4630     case ActionNode::INCREMENT_REGISTER:
4631       os_ << "label=\"$" << that->data_.u_increment_register.reg
4632           << "++\", shape=octagon";
4633       break;
4634     case ActionNode::STORE_POSITION:
4635       os_ << "label=\"$" << that->data_.u_position_register.reg
4636           << ":=$pos\", shape=octagon";
4637       break;
4638     case ActionNode::BEGIN_SUBMATCH:
4639       os_ << "label=\"$" << that->data_.u_submatch.current_position_register
4640           << ":=$pos,begin\", shape=septagon";
4641       break;
4642     case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4643       os_ << "label=\"escape\", shape=septagon";
4644       break;
4645     case ActionNode::EMPTY_MATCH_CHECK:
4646       os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
4647           << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
4648           << "<" << that->data_.u_empty_match_check.repetition_limit
4649           << "?\", shape=septagon";
4650       break;
4651     case ActionNode::CLEAR_CAPTURES: {
4652       os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
4653           << " to $" << that->data_.u_clear_captures.range_to
4654           << "\", shape=septagon";
4655       break;
4656     }
4657   }
4658   os_ << "];\n";
4659   PrintAttributes(that);
4660   RegExpNode* successor = that->on_success();
4661   os_ << "  n" << that << " -> n" << successor << ";\n";
4662   Visit(successor);
4663 }
4664 
4665 
4666 class DispatchTableDumper {
4667  public:
DispatchTableDumper(std::ostream & os)4668   explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
4669   void Call(uc16 key, DispatchTable::Entry entry);
4670  private:
4671   std::ostream& os_;
4672 };
4673 
4674 
Call(uc16 key,DispatchTable::Entry entry)4675 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4676   os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
4677   OutSet* set = entry.out_set();
4678   bool first = true;
4679   for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4680     if (set->Get(i)) {
4681       if (first) {
4682         first = false;
4683       } else {
4684         os_ << ", ";
4685       }
4686       os_ << i;
4687     }
4688   }
4689   os_ << "}\n";
4690 }
4691 
4692 
Dump()4693 void DispatchTable::Dump() {
4694   OFStream os(stderr);
4695   DispatchTableDumper dumper(os);
4696   tree()->ForEach(&dumper);
4697 }
4698 
4699 
DotPrint(const char * label,RegExpNode * node,bool ignore_case)4700 void RegExpEngine::DotPrint(const char* label,
4701                             RegExpNode* node,
4702                             bool ignore_case) {
4703   OFStream os(stdout);
4704   DotPrinter printer(os, ignore_case);
4705   printer.PrintNode(label, node);
4706 }
4707 
4708 
4709 #endif  // DEBUG
4710 
4711 
4712 // -------------------------------------------------------------------
4713 // Tree to graph conversion
4714 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4715 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4716                                RegExpNode* on_success) {
4717   ZoneList<TextElement>* elms =
4718       new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4719   elms->Add(TextElement::Atom(this), compiler->zone());
4720   return new (compiler->zone())
4721       TextNode(elms, compiler->read_backward(), on_success);
4722 }
4723 
4724 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4725 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4726                                RegExpNode* on_success) {
4727   return new (compiler->zone())
4728       TextNode(elements(), compiler->read_backward(), on_success);
4729 }
4730 
4731 
CompareInverseRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)4732 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4733                                  const int* special_class,
4734                                  int length) {
4735   length--;  // Remove final 0x10000.
4736   DCHECK(special_class[length] == 0x10000);
4737   DCHECK(ranges->length() != 0);
4738   DCHECK(length != 0);
4739   DCHECK(special_class[0] != 0);
4740   if (ranges->length() != (length >> 1) + 1) {
4741     return false;
4742   }
4743   CharacterRange range = ranges->at(0);
4744   if (range.from() != 0) {
4745     return false;
4746   }
4747   for (int i = 0; i < length; i += 2) {
4748     if (special_class[i] != (range.to() + 1)) {
4749       return false;
4750     }
4751     range = ranges->at((i >> 1) + 1);
4752     if (special_class[i+1] != range.from()) {
4753       return false;
4754     }
4755   }
4756   if (range.to() != 0xffff) {
4757     return false;
4758   }
4759   return true;
4760 }
4761 
4762 
CompareRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)4763 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4764                           const int* special_class,
4765                           int length) {
4766   length--;  // Remove final 0x10000.
4767   DCHECK(special_class[length] == 0x10000);
4768   if (ranges->length() * 2 != length) {
4769     return false;
4770   }
4771   for (int i = 0; i < length; i += 2) {
4772     CharacterRange range = ranges->at(i >> 1);
4773     if (range.from() != special_class[i] ||
4774         range.to() != special_class[i + 1] - 1) {
4775       return false;
4776     }
4777   }
4778   return true;
4779 }
4780 
4781 
is_standard(Zone * zone)4782 bool RegExpCharacterClass::is_standard(Zone* zone) {
4783   // TODO(lrn): Remove need for this function, by not throwing away information
4784   // along the way.
4785   if (is_negated_) {
4786     return false;
4787   }
4788   if (set_.is_standard()) {
4789     return true;
4790   }
4791   if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4792     set_.set_standard_set_type('s');
4793     return true;
4794   }
4795   if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4796     set_.set_standard_set_type('S');
4797     return true;
4798   }
4799   if (CompareInverseRanges(set_.ranges(zone),
4800                            kLineTerminatorRanges,
4801                            kLineTerminatorRangeCount)) {
4802     set_.set_standard_set_type('.');
4803     return true;
4804   }
4805   if (CompareRanges(set_.ranges(zone),
4806                     kLineTerminatorRanges,
4807                     kLineTerminatorRangeCount)) {
4808     set_.set_standard_set_type('n');
4809     return true;
4810   }
4811   if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4812     set_.set_standard_set_type('w');
4813     return true;
4814   }
4815   if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4816     set_.set_standard_set_type('W');
4817     return true;
4818   }
4819   return false;
4820 }
4821 
4822 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4823 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
4824                                          RegExpNode* on_success) {
4825   return new (compiler->zone())
4826       TextNode(this, compiler->read_backward(), on_success);
4827 }
4828 
4829 
CompareFirstChar(RegExpTree * const * a,RegExpTree * const * b)4830 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
4831   RegExpAtom* atom1 = (*a)->AsAtom();
4832   RegExpAtom* atom2 = (*b)->AsAtom();
4833   uc16 character1 = atom1->data().at(0);
4834   uc16 character2 = atom2->data().at(0);
4835   if (character1 < character2) return -1;
4836   if (character1 > character2) return 1;
4837   return 0;
4838 }
4839 
4840 
Canonical(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,unibrow::uchar c)4841 static unibrow::uchar Canonical(
4842     unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
4843     unibrow::uchar c) {
4844   unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
4845   int length = canonicalize->get(c, '\0', chars);
4846   DCHECK_LE(length, 1);
4847   unibrow::uchar canonical = c;
4848   if (length == 1) canonical = chars[0];
4849   return canonical;
4850 }
4851 
4852 
CompareFirstCharCaseIndependent(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,RegExpTree * const * a,RegExpTree * const * b)4853 int CompareFirstCharCaseIndependent(
4854     unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
4855     RegExpTree* const* a, RegExpTree* const* b) {
4856   RegExpAtom* atom1 = (*a)->AsAtom();
4857   RegExpAtom* atom2 = (*b)->AsAtom();
4858   unibrow::uchar character1 = atom1->data().at(0);
4859   unibrow::uchar character2 = atom2->data().at(0);
4860   if (character1 == character2) return 0;
4861   if (character1 >= 'a' || character2 >= 'a') {
4862     character1 = Canonical(canonicalize, character1);
4863     character2 = Canonical(canonicalize, character2);
4864   }
4865   return static_cast<int>(character1) - static_cast<int>(character2);
4866 }
4867 
4868 
4869 // We can stable sort runs of atoms, since the order does not matter if they
4870 // start with different characters.
4871 // Returns true if any consecutive atoms were found.
SortConsecutiveAtoms(RegExpCompiler * compiler)4872 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
4873   ZoneList<RegExpTree*>* alternatives = this->alternatives();
4874   int length = alternatives->length();
4875   bool found_consecutive_atoms = false;
4876   for (int i = 0; i < length; i++) {
4877     while (i < length) {
4878       RegExpTree* alternative = alternatives->at(i);
4879       if (alternative->IsAtom()) break;
4880       i++;
4881     }
4882     // i is length or it is the index of an atom.
4883     if (i == length) break;
4884     int first_atom = i;
4885     i++;
4886     while (i < length) {
4887       RegExpTree* alternative = alternatives->at(i);
4888       if (!alternative->IsAtom()) break;
4889       i++;
4890     }
4891     // Sort atoms to get ones with common prefixes together.
4892     // This step is more tricky if we are in a case-independent regexp,
4893     // because it would change /is|I/ to /I|is/, and order matters when
4894     // the regexp parts don't match only disjoint starting points. To fix
4895     // this we have a version of CompareFirstChar that uses case-
4896     // independent character classes for comparison.
4897     DCHECK_LT(first_atom, alternatives->length());
4898     DCHECK_LE(i, alternatives->length());
4899     DCHECK_LE(first_atom, i);
4900     if (compiler->ignore_case()) {
4901       unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
4902           compiler->isolate()->regexp_macro_assembler_canonicalize();
4903       auto compare_closure =
4904           [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
4905             return CompareFirstCharCaseIndependent(canonicalize, a, b);
4906           };
4907       alternatives->StableSort(compare_closure, first_atom, i - first_atom);
4908     } else {
4909       alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
4910     }
4911     if (i - first_atom > 1) found_consecutive_atoms = true;
4912   }
4913   return found_consecutive_atoms;
4914 }
4915 
4916 
4917 // Optimizes ab|ac|az to a(?:b|c|d).
RationalizeConsecutiveAtoms(RegExpCompiler * compiler)4918 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
4919   Zone* zone = compiler->zone();
4920   ZoneList<RegExpTree*>* alternatives = this->alternatives();
4921   int length = alternatives->length();
4922 
4923   int write_posn = 0;
4924   int i = 0;
4925   while (i < length) {
4926     RegExpTree* alternative = alternatives->at(i);
4927     if (!alternative->IsAtom()) {
4928       alternatives->at(write_posn++) = alternatives->at(i);
4929       i++;
4930       continue;
4931     }
4932     RegExpAtom* atom = alternative->AsAtom();
4933     unibrow::uchar common_prefix = atom->data().at(0);
4934     int first_with_prefix = i;
4935     int prefix_length = atom->length();
4936     i++;
4937     while (i < length) {
4938       alternative = alternatives->at(i);
4939       if (!alternative->IsAtom()) break;
4940       atom = alternative->AsAtom();
4941       unibrow::uchar new_prefix = atom->data().at(0);
4942       if (new_prefix != common_prefix) {
4943         if (!compiler->ignore_case()) break;
4944         unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
4945             compiler->isolate()->regexp_macro_assembler_canonicalize();
4946         new_prefix = Canonical(canonicalize, new_prefix);
4947         common_prefix = Canonical(canonicalize, common_prefix);
4948         if (new_prefix != common_prefix) break;
4949       }
4950       prefix_length = Min(prefix_length, atom->length());
4951       i++;
4952     }
4953     if (i > first_with_prefix + 2) {
4954       // Found worthwhile run of alternatives with common prefix of at least one
4955       // character.  The sorting function above did not sort on more than one
4956       // character for reasons of correctness, but there may still be a longer
4957       // common prefix if the terms were similar or presorted in the input.
4958       // Find out how long the common prefix is.
4959       int run_length = i - first_with_prefix;
4960       atom = alternatives->at(first_with_prefix)->AsAtom();
4961       for (int j = 1; j < run_length && prefix_length > 1; j++) {
4962         RegExpAtom* old_atom =
4963             alternatives->at(j + first_with_prefix)->AsAtom();
4964         for (int k = 1; k < prefix_length; k++) {
4965           if (atom->data().at(k) != old_atom->data().at(k)) {
4966             prefix_length = k;
4967             break;
4968           }
4969         }
4970       }
4971       RegExpAtom* prefix =
4972           new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
4973       ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
4974       pair->Add(prefix, zone);
4975       ZoneList<RegExpTree*>* suffixes =
4976           new (zone) ZoneList<RegExpTree*>(run_length, zone);
4977       for (int j = 0; j < run_length; j++) {
4978         RegExpAtom* old_atom =
4979             alternatives->at(j + first_with_prefix)->AsAtom();
4980         int len = old_atom->length();
4981         if (len == prefix_length) {
4982           suffixes->Add(new (zone) RegExpEmpty(), zone);
4983         } else {
4984           RegExpTree* suffix = new (zone) RegExpAtom(
4985               old_atom->data().SubVector(prefix_length, old_atom->length()));
4986           suffixes->Add(suffix, zone);
4987         }
4988       }
4989       pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
4990       alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
4991     } else {
4992       // Just copy any non-worthwhile alternatives.
4993       for (int j = first_with_prefix; j < i; j++) {
4994         alternatives->at(write_posn++) = alternatives->at(j);
4995       }
4996     }
4997   }
4998   alternatives->Rewind(write_posn);  // Trim end of array.
4999 }
5000 
5001 
5002 // Optimizes b|c|z to [bcz].
FixSingleCharacterDisjunctions(RegExpCompiler * compiler)5003 void RegExpDisjunction::FixSingleCharacterDisjunctions(
5004     RegExpCompiler* compiler) {
5005   Zone* zone = compiler->zone();
5006   ZoneList<RegExpTree*>* alternatives = this->alternatives();
5007   int length = alternatives->length();
5008 
5009   int write_posn = 0;
5010   int i = 0;
5011   while (i < length) {
5012     RegExpTree* alternative = alternatives->at(i);
5013     if (!alternative->IsAtom()) {
5014       alternatives->at(write_posn++) = alternatives->at(i);
5015       i++;
5016       continue;
5017     }
5018     RegExpAtom* atom = alternative->AsAtom();
5019     if (atom->length() != 1) {
5020       alternatives->at(write_posn++) = alternatives->at(i);
5021       i++;
5022       continue;
5023     }
5024     int first_in_run = i;
5025     i++;
5026     while (i < length) {
5027       alternative = alternatives->at(i);
5028       if (!alternative->IsAtom()) break;
5029       atom = alternative->AsAtom();
5030       if (atom->length() != 1) break;
5031       i++;
5032     }
5033     if (i > first_in_run + 1) {
5034       // Found non-trivial run of single-character alternatives.
5035       int run_length = i - first_in_run;
5036       ZoneList<CharacterRange>* ranges =
5037           new (zone) ZoneList<CharacterRange>(2, zone);
5038       for (int j = 0; j < run_length; j++) {
5039         RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
5040         DCHECK_EQ(old_atom->length(), 1);
5041         ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
5042       }
5043       alternatives->at(write_posn++) =
5044           new (zone) RegExpCharacterClass(ranges, false);
5045     } else {
5046       // Just copy any trivial alternatives.
5047       for (int j = first_in_run; j < i; j++) {
5048         alternatives->at(write_posn++) = alternatives->at(j);
5049       }
5050     }
5051   }
5052   alternatives->Rewind(write_posn);  // Trim end of array.
5053 }
5054 
5055 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5056 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
5057                                       RegExpNode* on_success) {
5058   ZoneList<RegExpTree*>* alternatives = this->alternatives();
5059 
5060   if (alternatives->length() > 2) {
5061     bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
5062     if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
5063     FixSingleCharacterDisjunctions(compiler);
5064     if (alternatives->length() == 1) {
5065       return alternatives->at(0)->ToNode(compiler, on_success);
5066     }
5067   }
5068 
5069   int length = alternatives->length();
5070 
5071   ChoiceNode* result =
5072       new(compiler->zone()) ChoiceNode(length, compiler->zone());
5073   for (int i = 0; i < length; i++) {
5074     GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
5075                                                                on_success));
5076     result->AddAlternative(alternative);
5077   }
5078   return result;
5079 }
5080 
5081 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5082 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
5083                                      RegExpNode* on_success) {
5084   return ToNode(min(),
5085                 max(),
5086                 is_greedy(),
5087                 body(),
5088                 compiler,
5089                 on_success);
5090 }
5091 
5092 
5093 // Scoped object to keep track of how much we unroll quantifier loops in the
5094 // regexp graph generator.
5095 class RegExpExpansionLimiter {
5096  public:
5097   static const int kMaxExpansionFactor = 6;
RegExpExpansionLimiter(RegExpCompiler * compiler,int factor)5098   RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
5099       : compiler_(compiler),
5100         saved_expansion_factor_(compiler->current_expansion_factor()),
5101         ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
5102     DCHECK(factor > 0);
5103     if (ok_to_expand_) {
5104       if (factor > kMaxExpansionFactor) {
5105         // Avoid integer overflow of the current expansion factor.
5106         ok_to_expand_ = false;
5107         compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
5108       } else {
5109         int new_factor = saved_expansion_factor_ * factor;
5110         ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
5111         compiler->set_current_expansion_factor(new_factor);
5112       }
5113     }
5114   }
5115 
~RegExpExpansionLimiter()5116   ~RegExpExpansionLimiter() {
5117     compiler_->set_current_expansion_factor(saved_expansion_factor_);
5118   }
5119 
ok_to_expand()5120   bool ok_to_expand() { return ok_to_expand_; }
5121 
5122  private:
5123   RegExpCompiler* compiler_;
5124   int saved_expansion_factor_;
5125   bool ok_to_expand_;
5126 
5127   DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
5128 };
5129 
5130 
ToNode(int min,int max,bool is_greedy,RegExpTree * body,RegExpCompiler * compiler,RegExpNode * on_success,bool not_at_start)5131 RegExpNode* RegExpQuantifier::ToNode(int min,
5132                                      int max,
5133                                      bool is_greedy,
5134                                      RegExpTree* body,
5135                                      RegExpCompiler* compiler,
5136                                      RegExpNode* on_success,
5137                                      bool not_at_start) {
5138   // x{f, t} becomes this:
5139   //
5140   //             (r++)<-.
5141   //               |     `
5142   //               |     (x)
5143   //               v     ^
5144   //      (r=0)-->(?)---/ [if r < t]
5145   //               |
5146   //   [if r >= f] \----> ...
5147   //
5148 
5149   // 15.10.2.5 RepeatMatcher algorithm.
5150   // The parser has already eliminated the case where max is 0.  In the case
5151   // where max_match is zero the parser has removed the quantifier if min was
5152   // > 0 and removed the atom if min was 0.  See AddQuantifierToAtom.
5153 
5154   // If we know that we cannot match zero length then things are a little
5155   // simpler since we don't need to make the special zero length match check
5156   // from step 2.1.  If the min and max are small we can unroll a little in
5157   // this case.
5158   static const int kMaxUnrolledMinMatches = 3;  // Unroll (foo)+ and (foo){3,}
5159   static const int kMaxUnrolledMaxMatches = 3;  // Unroll (foo)? and (foo){x,3}
5160   if (max == 0) return on_success;  // This can happen due to recursion.
5161   bool body_can_be_empty = (body->min_match() == 0);
5162   int body_start_reg = RegExpCompiler::kNoRegister;
5163   Interval capture_registers = body->CaptureRegisters();
5164   bool needs_capture_clearing = !capture_registers.is_empty();
5165   Zone* zone = compiler->zone();
5166 
5167   if (body_can_be_empty) {
5168     body_start_reg = compiler->AllocateRegister();
5169   } else if (compiler->optimize() && !needs_capture_clearing) {
5170     // Only unroll if there are no captures and the body can't be
5171     // empty.
5172     {
5173       RegExpExpansionLimiter limiter(
5174           compiler, min + ((max != min) ? 1 : 0));
5175       if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
5176         int new_max = (max == kInfinity) ? max : max - min;
5177         // Recurse once to get the loop or optional matches after the fixed
5178         // ones.
5179         RegExpNode* answer = ToNode(
5180             0, new_max, is_greedy, body, compiler, on_success, true);
5181         // Unroll the forced matches from 0 to min.  This can cause chains of
5182         // TextNodes (which the parser does not generate).  These should be
5183         // combined if it turns out they hinder good code generation.
5184         for (int i = 0; i < min; i++) {
5185           answer = body->ToNode(compiler, answer);
5186         }
5187         return answer;
5188       }
5189     }
5190     if (max <= kMaxUnrolledMaxMatches && min == 0) {
5191       DCHECK(max > 0);  // Due to the 'if' above.
5192       RegExpExpansionLimiter limiter(compiler, max);
5193       if (limiter.ok_to_expand()) {
5194         // Unroll the optional matches up to max.
5195         RegExpNode* answer = on_success;
5196         for (int i = 0; i < max; i++) {
5197           ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
5198           if (is_greedy) {
5199             alternation->AddAlternative(
5200                 GuardedAlternative(body->ToNode(compiler, answer)));
5201             alternation->AddAlternative(GuardedAlternative(on_success));
5202           } else {
5203             alternation->AddAlternative(GuardedAlternative(on_success));
5204             alternation->AddAlternative(
5205                 GuardedAlternative(body->ToNode(compiler, answer)));
5206           }
5207           answer = alternation;
5208           if (not_at_start && !compiler->read_backward()) {
5209             alternation->set_not_at_start();
5210           }
5211         }
5212         return answer;
5213       }
5214     }
5215   }
5216   bool has_min = min > 0;
5217   bool has_max = max < RegExpTree::kInfinity;
5218   bool needs_counter = has_min || has_max;
5219   int reg_ctr = needs_counter
5220       ? compiler->AllocateRegister()
5221       : RegExpCompiler::kNoRegister;
5222   LoopChoiceNode* center = new (zone)
5223       LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
5224   if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
5225   RegExpNode* loop_return = needs_counter
5226       ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
5227       : static_cast<RegExpNode*>(center);
5228   if (body_can_be_empty) {
5229     // If the body can be empty we need to check if it was and then
5230     // backtrack.
5231     loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5232                                               reg_ctr,
5233                                               min,
5234                                               loop_return);
5235   }
5236   RegExpNode* body_node = body->ToNode(compiler, loop_return);
5237   if (body_can_be_empty) {
5238     // If the body can be empty we need to store the start position
5239     // so we can bail out if it was empty.
5240     body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5241   }
5242   if (needs_capture_clearing) {
5243     // Before entering the body of this loop we need to clear captures.
5244     body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5245   }
5246   GuardedAlternative body_alt(body_node);
5247   if (has_max) {
5248     Guard* body_guard =
5249         new(zone) Guard(reg_ctr, Guard::LT, max);
5250     body_alt.AddGuard(body_guard, zone);
5251   }
5252   GuardedAlternative rest_alt(on_success);
5253   if (has_min) {
5254     Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5255     rest_alt.AddGuard(rest_guard, zone);
5256   }
5257   if (is_greedy) {
5258     center->AddLoopAlternative(body_alt);
5259     center->AddContinueAlternative(rest_alt);
5260   } else {
5261     center->AddContinueAlternative(rest_alt);
5262     center->AddLoopAlternative(body_alt);
5263   }
5264   if (needs_counter) {
5265     return ActionNode::SetRegister(reg_ctr, 0, center);
5266   } else {
5267     return center;
5268   }
5269 }
5270 
5271 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5272 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5273                                     RegExpNode* on_success) {
5274   NodeInfo info;
5275   Zone* zone = compiler->zone();
5276 
5277   switch (assertion_type()) {
5278     case START_OF_LINE:
5279       return AssertionNode::AfterNewline(on_success);
5280     case START_OF_INPUT:
5281       return AssertionNode::AtStart(on_success);
5282     case BOUNDARY:
5283       return AssertionNode::AtBoundary(on_success);
5284     case NON_BOUNDARY:
5285       return AssertionNode::AtNonBoundary(on_success);
5286     case END_OF_INPUT:
5287       return AssertionNode::AtEnd(on_success);
5288     case END_OF_LINE: {
5289       // Compile $ in multiline regexps as an alternation with a positive
5290       // lookahead in one side and an end-of-input on the other side.
5291       // We need two registers for the lookahead.
5292       int stack_pointer_register = compiler->AllocateRegister();
5293       int position_register = compiler->AllocateRegister();
5294       // The ChoiceNode to distinguish between a newline and end-of-input.
5295       ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5296       // Create a newline atom.
5297       ZoneList<CharacterRange>* newline_ranges =
5298           new(zone) ZoneList<CharacterRange>(3, zone);
5299       CharacterRange::AddClassEscape('n', newline_ranges, zone);
5300       RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');
5301       TextNode* newline_matcher = new (zone) TextNode(
5302           newline_atom, false, ActionNode::PositiveSubmatchSuccess(
5303                                    stack_pointer_register, position_register,
5304                                    0,   // No captures inside.
5305                                    -1,  // Ignored if no captures.
5306                                    on_success));
5307       // Create an end-of-input matcher.
5308       RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5309           stack_pointer_register,
5310           position_register,
5311           newline_matcher);
5312       // Add the two alternatives to the ChoiceNode.
5313       GuardedAlternative eol_alternative(end_of_line);
5314       result->AddAlternative(eol_alternative);
5315       GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5316       result->AddAlternative(end_alternative);
5317       return result;
5318     }
5319     default:
5320       UNREACHABLE();
5321   }
5322   return on_success;
5323 }
5324 
5325 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5326 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5327                                         RegExpNode* on_success) {
5328   return new (compiler->zone())
5329       BackReferenceNode(RegExpCapture::StartRegister(index()),
5330                         RegExpCapture::EndRegister(index()),
5331                         compiler->read_backward(), on_success);
5332 }
5333 
5334 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5335 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5336                                 RegExpNode* on_success) {
5337   return on_success;
5338 }
5339 
5340 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5341 RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
5342                                      RegExpNode* on_success) {
5343   int stack_pointer_register = compiler->AllocateRegister();
5344   int position_register = compiler->AllocateRegister();
5345 
5346   const int registers_per_capture = 2;
5347   const int register_of_first_capture = 2;
5348   int register_count = capture_count_ * registers_per_capture;
5349   int register_start =
5350     register_of_first_capture + capture_from_ * registers_per_capture;
5351 
5352   RegExpNode* result;
5353   bool was_reading_backward = compiler->read_backward();
5354   compiler->set_read_backward(type() == LOOKBEHIND);
5355   if (is_positive()) {
5356     result = ActionNode::BeginSubmatch(
5357         stack_pointer_register, position_register,
5358         body()->ToNode(compiler,
5359                        ActionNode::PositiveSubmatchSuccess(
5360                            stack_pointer_register, position_register,
5361                            register_count, register_start, on_success)));
5362   } else {
5363     // We use a ChoiceNode for a negative lookahead because it has most of
5364     // the characteristics we need.  It has the body of the lookahead as its
5365     // first alternative and the expression after the lookahead of the second
5366     // alternative.  If the first alternative succeeds then the
5367     // NegativeSubmatchSuccess will unwind the stack including everything the
5368     // choice node set up and backtrack.  If the first alternative fails then
5369     // the second alternative is tried, which is exactly the desired result
5370     // for a negative lookahead.  The NegativeLookaheadChoiceNode is a special
5371     // ChoiceNode that knows to ignore the first exit when calculating quick
5372     // checks.
5373     Zone* zone = compiler->zone();
5374 
5375     GuardedAlternative body_alt(
5376         body()->ToNode(compiler, new (zone) NegativeSubmatchSuccess(
5377                                      stack_pointer_register, position_register,
5378                                      register_count, register_start, zone)));
5379     ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
5380         body_alt, GuardedAlternative(on_success), zone);
5381     result = ActionNode::BeginSubmatch(stack_pointer_register,
5382                                        position_register, choice_node);
5383   }
5384   compiler->set_read_backward(was_reading_backward);
5385   return result;
5386 }
5387 
5388 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5389 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5390                                   RegExpNode* on_success) {
5391   return ToNode(body(), index(), compiler, on_success);
5392 }
5393 
5394 
ToNode(RegExpTree * body,int index,RegExpCompiler * compiler,RegExpNode * on_success)5395 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5396                                   int index,
5397                                   RegExpCompiler* compiler,
5398                                   RegExpNode* on_success) {
5399   DCHECK_NOT_NULL(body);
5400   int start_reg = RegExpCapture::StartRegister(index);
5401   int end_reg = RegExpCapture::EndRegister(index);
5402   if (compiler->read_backward()) std::swap(start_reg, end_reg);
5403   RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5404   RegExpNode* body_node = body->ToNode(compiler, store_end);
5405   return ActionNode::StorePosition(start_reg, true, body_node);
5406 }
5407 
5408 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5409 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5410                                       RegExpNode* on_success) {
5411   ZoneList<RegExpTree*>* children = nodes();
5412   RegExpNode* current = on_success;
5413   if (compiler->read_backward()) {
5414     for (int i = 0; i < children->length(); i++) {
5415       current = children->at(i)->ToNode(compiler, current);
5416     }
5417   } else {
5418     for (int i = children->length() - 1; i >= 0; i--) {
5419       current = children->at(i)->ToNode(compiler, current);
5420     }
5421   }
5422   return current;
5423 }
5424 
5425 
AddClass(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)5426 static void AddClass(const int* elmv,
5427                      int elmc,
5428                      ZoneList<CharacterRange>* ranges,
5429                      Zone* zone) {
5430   elmc--;
5431   DCHECK(elmv[elmc] == 0x10000);
5432   for (int i = 0; i < elmc; i += 2) {
5433     DCHECK(elmv[i] < elmv[i + 1]);
5434     ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1), zone);
5435   }
5436 }
5437 
5438 
AddClassNegated(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)5439 static void AddClassNegated(const int *elmv,
5440                             int elmc,
5441                             ZoneList<CharacterRange>* ranges,
5442                             Zone* zone) {
5443   elmc--;
5444   DCHECK(elmv[elmc] == 0x10000);
5445   DCHECK(elmv[0] != 0x0000);
5446   DCHECK(elmv[elmc-1] != String::kMaxUtf16CodeUnit);
5447   uc16 last = 0x0000;
5448   for (int i = 0; i < elmc; i += 2) {
5449     DCHECK(last <= elmv[i] - 1);
5450     DCHECK(elmv[i] < elmv[i + 1]);
5451     ranges->Add(CharacterRange(last, elmv[i] - 1), zone);
5452     last = elmv[i + 1];
5453   }
5454   ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit), zone);
5455 }
5456 
5457 
AddClassEscape(uc16 type,ZoneList<CharacterRange> * ranges,Zone * zone)5458 void CharacterRange::AddClassEscape(uc16 type,
5459                                     ZoneList<CharacterRange>* ranges,
5460                                     Zone* zone) {
5461   switch (type) {
5462     case 's':
5463       AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5464       break;
5465     case 'S':
5466       AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5467       break;
5468     case 'w':
5469       AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5470       break;
5471     case 'W':
5472       AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5473       break;
5474     case 'd':
5475       AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5476       break;
5477     case 'D':
5478       AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5479       break;
5480     case '.':
5481       AddClassNegated(kLineTerminatorRanges,
5482                       kLineTerminatorRangeCount,
5483                       ranges,
5484                       zone);
5485       break;
5486     // This is not a character range as defined by the spec but a
5487     // convenient shorthand for a character class that matches any
5488     // character.
5489     case '*':
5490       ranges->Add(CharacterRange::Everything(), zone);
5491       break;
5492     // This is the set of characters matched by the $ and ^ symbols
5493     // in multiline mode.
5494     case 'n':
5495       AddClass(kLineTerminatorRanges,
5496                kLineTerminatorRangeCount,
5497                ranges,
5498                zone);
5499       break;
5500     default:
5501       UNREACHABLE();
5502   }
5503 }
5504 
5505 
GetWordBounds()5506 Vector<const int> CharacterRange::GetWordBounds() {
5507   return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5508 }
5509 
5510 
5511 class CharacterRangeSplitter {
5512  public:
CharacterRangeSplitter(ZoneList<CharacterRange> ** included,ZoneList<CharacterRange> ** excluded,Zone * zone)5513   CharacterRangeSplitter(ZoneList<CharacterRange>** included,
5514                          ZoneList<CharacterRange>** excluded,
5515                          Zone* zone)
5516       : included_(included),
5517         excluded_(excluded),
5518         zone_(zone) { }
5519   void Call(uc16 from, DispatchTable::Entry entry);
5520 
5521   static const int kInBase = 0;
5522   static const int kInOverlay = 1;
5523 
5524  private:
5525   ZoneList<CharacterRange>** included_;
5526   ZoneList<CharacterRange>** excluded_;
5527   Zone* zone_;
5528 };
5529 
5530 
Call(uc16 from,DispatchTable::Entry entry)5531 void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
5532   if (!entry.out_set()->Get(kInBase)) return;
5533   ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
5534     ? included_
5535     : excluded_;
5536   if (*target == NULL) *target = new(zone_) ZoneList<CharacterRange>(2, zone_);
5537   (*target)->Add(CharacterRange(entry.from(), entry.to()), zone_);
5538 }
5539 
5540 
Split(ZoneList<CharacterRange> * base,Vector<const int> overlay,ZoneList<CharacterRange> ** included,ZoneList<CharacterRange> ** excluded,Zone * zone)5541 void CharacterRange::Split(ZoneList<CharacterRange>* base,
5542                            Vector<const int> overlay,
5543                            ZoneList<CharacterRange>** included,
5544                            ZoneList<CharacterRange>** excluded,
5545                            Zone* zone) {
5546   DCHECK_NULL(*included);
5547   DCHECK_NULL(*excluded);
5548   DispatchTable table(zone);
5549   for (int i = 0; i < base->length(); i++)
5550     table.AddRange(base->at(i), CharacterRangeSplitter::kInBase, zone);
5551   for (int i = 0; i < overlay.length(); i += 2) {
5552     table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1),
5553                    CharacterRangeSplitter::kInOverlay, zone);
5554   }
5555   CharacterRangeSplitter callback(included, excluded, zone);
5556   table.ForEach(&callback);
5557 }
5558 
5559 
AddCaseEquivalents(Isolate * isolate,Zone * zone,ZoneList<CharacterRange> * ranges,bool is_one_byte)5560 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
5561                                         ZoneList<CharacterRange>* ranges,
5562                                         bool is_one_byte) {
5563   uc16 bottom = from();
5564   uc16 top = to();
5565   if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) {
5566     if (bottom > String::kMaxOneByteCharCode) return;
5567     if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
5568   }
5569   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5570   if (top == bottom) {
5571     // If this is a singleton we just expand the one character.
5572     int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
5573     for (int i = 0; i < length; i++) {
5574       uc32 chr = chars[i];
5575       if (chr != bottom) {
5576         ranges->Add(CharacterRange::Singleton(chars[i]), zone);
5577       }
5578     }
5579   } else {
5580     // If this is a range we expand the characters block by block,
5581     // expanding contiguous subranges (blocks) one at a time.
5582     // The approach is as follows.  For a given start character we
5583     // look up the remainder of the block that contains it (represented
5584     // by the end point), for instance we find 'z' if the character
5585     // is 'c'.  A block is characterized by the property
5586     // that all characters uncanonicalize in the same way, except that
5587     // each entry in the result is incremented by the distance from the first
5588     // element.  So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and
5589     // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
5590     // Once we've found the end point we look up its uncanonicalization
5591     // and produce a range for each element.  For instance for [c-f]
5592     // we look up ['z', 'Z'] and produce [c-f] and [C-F].  We then only
5593     // add a range if it is not already contained in the input, so [c-f]
5594     // will be skipped but [C-F] will be added.  If this range is not
5595     // completely contained in a block we do this for all the blocks
5596     // covered by the range (handling characters that is not in a block
5597     // as a "singleton block").
5598     unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5599     int pos = bottom;
5600     while (pos <= top) {
5601       int length = isolate->jsregexp_canonrange()->get(pos, '\0', range);
5602       uc16 block_end;
5603       if (length == 0) {
5604         block_end = pos;
5605       } else {
5606         DCHECK_EQ(1, length);
5607         block_end = range[0];
5608       }
5609       int end = (block_end > top) ? top : block_end;
5610       length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range);
5611       for (int i = 0; i < length; i++) {
5612         uc32 c = range[i];
5613         uc16 range_from = c - (block_end - pos);
5614         uc16 range_to = c - (block_end - end);
5615         if (!(bottom <= range_from && range_to <= top)) {
5616           ranges->Add(CharacterRange(range_from, range_to), zone);
5617         }
5618       }
5619       pos = end + 1;
5620     }
5621   }
5622 }
5623 
5624 
IsCanonical(ZoneList<CharacterRange> * ranges)5625 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
5626   DCHECK_NOT_NULL(ranges);
5627   int n = ranges->length();
5628   if (n <= 1) return true;
5629   int max = ranges->at(0).to();
5630   for (int i = 1; i < n; i++) {
5631     CharacterRange next_range = ranges->at(i);
5632     if (next_range.from() <= max + 1) return false;
5633     max = next_range.to();
5634   }
5635   return true;
5636 }
5637 
5638 
ranges(Zone * zone)5639 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
5640   if (ranges_ == NULL) {
5641     ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5642     CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
5643   }
5644   return ranges_;
5645 }
5646 
5647 
5648 // Move a number of elements in a zonelist to another position
5649 // in the same list. Handles overlapping source and target areas.
MoveRanges(ZoneList<CharacterRange> * list,int from,int to,int count)5650 static void MoveRanges(ZoneList<CharacterRange>* list,
5651                        int from,
5652                        int to,
5653                        int count) {
5654   // Ranges are potentially overlapping.
5655   if (from < to) {
5656     for (int i = count - 1; i >= 0; i--) {
5657       list->at(to + i) = list->at(from + i);
5658     }
5659   } else {
5660     for (int i = 0; i < count; i++) {
5661       list->at(to + i) = list->at(from + i);
5662     }
5663   }
5664 }
5665 
5666 
InsertRangeInCanonicalList(ZoneList<CharacterRange> * list,int count,CharacterRange insert)5667 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
5668                                       int count,
5669                                       CharacterRange insert) {
5670   // Inserts a range into list[0..count[, which must be sorted
5671   // by from value and non-overlapping and non-adjacent, using at most
5672   // list[0..count] for the result. Returns the number of resulting
5673   // canonicalized ranges. Inserting a range may collapse existing ranges into
5674   // fewer ranges, so the return value can be anything in the range 1..count+1.
5675   uc16 from = insert.from();
5676   uc16 to = insert.to();
5677   int start_pos = 0;
5678   int end_pos = count;
5679   for (int i = count - 1; i >= 0; i--) {
5680     CharacterRange current = list->at(i);
5681     if (current.from() > to + 1) {
5682       end_pos = i;
5683     } else if (current.to() + 1 < from) {
5684       start_pos = i + 1;
5685       break;
5686     }
5687   }
5688 
5689   // Inserted range overlaps, or is adjacent to, ranges at positions
5690   // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
5691   // not affected by the insertion.
5692   // If start_pos == end_pos, the range must be inserted before start_pos.
5693   // if start_pos < end_pos, the entire range from start_pos to end_pos
5694   // must be merged with the insert range.
5695 
5696   if (start_pos == end_pos) {
5697     // Insert between existing ranges at position start_pos.
5698     if (start_pos < count) {
5699       MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
5700     }
5701     list->at(start_pos) = insert;
5702     return count + 1;
5703   }
5704   if (start_pos + 1 == end_pos) {
5705     // Replace single existing range at position start_pos.
5706     CharacterRange to_replace = list->at(start_pos);
5707     int new_from = Min(to_replace.from(), from);
5708     int new_to = Max(to_replace.to(), to);
5709     list->at(start_pos) = CharacterRange(new_from, new_to);
5710     return count;
5711   }
5712   // Replace a number of existing ranges from start_pos to end_pos - 1.
5713   // Move the remaining ranges down.
5714 
5715   int new_from = Min(list->at(start_pos).from(), from);
5716   int new_to = Max(list->at(end_pos - 1).to(), to);
5717   if (end_pos < count) {
5718     MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
5719   }
5720   list->at(start_pos) = CharacterRange(new_from, new_to);
5721   return count - (end_pos - start_pos) + 1;
5722 }
5723 
5724 
Canonicalize()5725 void CharacterSet::Canonicalize() {
5726   // Special/default classes are always considered canonical. The result
5727   // of calling ranges() will be sorted.
5728   if (ranges_ == NULL) return;
5729   CharacterRange::Canonicalize(ranges_);
5730 }
5731 
5732 
Canonicalize(ZoneList<CharacterRange> * character_ranges)5733 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
5734   if (character_ranges->length() <= 1) return;
5735   // Check whether ranges are already canonical (increasing, non-overlapping,
5736   // non-adjacent).
5737   int n = character_ranges->length();
5738   int max = character_ranges->at(0).to();
5739   int i = 1;
5740   while (i < n) {
5741     CharacterRange current = character_ranges->at(i);
5742     if (current.from() <= max + 1) {
5743       break;
5744     }
5745     max = current.to();
5746     i++;
5747   }
5748   // Canonical until the i'th range. If that's all of them, we are done.
5749   if (i == n) return;
5750 
5751   // The ranges at index i and forward are not canonicalized. Make them so by
5752   // doing the equivalent of insertion sort (inserting each into the previous
5753   // list, in order).
5754   // Notice that inserting a range can reduce the number of ranges in the
5755   // result due to combining of adjacent and overlapping ranges.
5756   int read = i;  // Range to insert.
5757   int num_canonical = i;  // Length of canonicalized part of list.
5758   do {
5759     num_canonical = InsertRangeInCanonicalList(character_ranges,
5760                                                num_canonical,
5761                                                character_ranges->at(read));
5762     read++;
5763   } while (read < n);
5764   character_ranges->Rewind(num_canonical);
5765 
5766   DCHECK(CharacterRange::IsCanonical(character_ranges));
5767 }
5768 
5769 
Negate(ZoneList<CharacterRange> * ranges,ZoneList<CharacterRange> * negated_ranges,Zone * zone)5770 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
5771                             ZoneList<CharacterRange>* negated_ranges,
5772                             Zone* zone) {
5773   DCHECK(CharacterRange::IsCanonical(ranges));
5774   DCHECK_EQ(0, negated_ranges->length());
5775   int range_count = ranges->length();
5776   uc16 from = 0;
5777   int i = 0;
5778   if (range_count > 0 && ranges->at(0).from() == 0) {
5779     from = ranges->at(0).to();
5780     i = 1;
5781   }
5782   while (i < range_count) {
5783     CharacterRange range = ranges->at(i);
5784     negated_ranges->Add(CharacterRange(from + 1, range.from() - 1), zone);
5785     from = range.to();
5786     i++;
5787   }
5788   if (from < String::kMaxUtf16CodeUnit) {
5789     negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit),
5790                         zone);
5791   }
5792 }
5793 
5794 
5795 // -------------------------------------------------------------------
5796 // Splay tree
5797 
5798 
Extend(unsigned value,Zone * zone)5799 OutSet* OutSet::Extend(unsigned value, Zone* zone) {
5800   if (Get(value))
5801     return this;
5802   if (successors(zone) != NULL) {
5803     for (int i = 0; i < successors(zone)->length(); i++) {
5804       OutSet* successor = successors(zone)->at(i);
5805       if (successor->Get(value))
5806         return successor;
5807     }
5808   } else {
5809     successors_ = new(zone) ZoneList<OutSet*>(2, zone);
5810   }
5811   OutSet* result = new(zone) OutSet(first_, remaining_);
5812   result->Set(value, zone);
5813   successors(zone)->Add(result, zone);
5814   return result;
5815 }
5816 
5817 
Set(unsigned value,Zone * zone)5818 void OutSet::Set(unsigned value, Zone *zone) {
5819   if (value < kFirstLimit) {
5820     first_ |= (1 << value);
5821   } else {
5822     if (remaining_ == NULL)
5823       remaining_ = new(zone) ZoneList<unsigned>(1, zone);
5824     if (remaining_->is_empty() || !remaining_->Contains(value))
5825       remaining_->Add(value, zone);
5826   }
5827 }
5828 
5829 
Get(unsigned value) const5830 bool OutSet::Get(unsigned value) const {
5831   if (value < kFirstLimit) {
5832     return (first_ & (1 << value)) != 0;
5833   } else if (remaining_ == NULL) {
5834     return false;
5835   } else {
5836     return remaining_->Contains(value);
5837   }
5838 }
5839 
5840 
5841 const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
5842 
5843 
AddRange(CharacterRange full_range,int value,Zone * zone)5844 void DispatchTable::AddRange(CharacterRange full_range, int value,
5845                              Zone* zone) {
5846   CharacterRange current = full_range;
5847   if (tree()->is_empty()) {
5848     // If this is the first range we just insert into the table.
5849     ZoneSplayTree<Config>::Locator loc;
5850     bool inserted = tree()->Insert(current.from(), &loc);
5851     DCHECK(inserted);
5852     USE(inserted);
5853     loc.set_value(Entry(current.from(), current.to(),
5854                         empty()->Extend(value, zone)));
5855     return;
5856   }
5857   // First see if there is a range to the left of this one that
5858   // overlaps.
5859   ZoneSplayTree<Config>::Locator loc;
5860   if (tree()->FindGreatestLessThan(current.from(), &loc)) {
5861     Entry* entry = &loc.value();
5862     // If we've found a range that overlaps with this one, and it
5863     // starts strictly to the left of this one, we have to fix it
5864     // because the following code only handles ranges that start on
5865     // or after the start point of the range we're adding.
5866     if (entry->from() < current.from() && entry->to() >= current.from()) {
5867       // Snap the overlapping range in half around the start point of
5868       // the range we're adding.
5869       CharacterRange left(entry->from(), current.from() - 1);
5870       CharacterRange right(current.from(), entry->to());
5871       // The left part of the overlapping range doesn't overlap.
5872       // Truncate the whole entry to be just the left part.
5873       entry->set_to(left.to());
5874       // The right part is the one that overlaps.  We add this part
5875       // to the map and let the next step deal with merging it with
5876       // the range we're adding.
5877       ZoneSplayTree<Config>::Locator loc;
5878       bool inserted = tree()->Insert(right.from(), &loc);
5879       DCHECK(inserted);
5880       USE(inserted);
5881       loc.set_value(Entry(right.from(),
5882                           right.to(),
5883                           entry->out_set()));
5884     }
5885   }
5886   while (current.is_valid()) {
5887     if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
5888         (loc.value().from() <= current.to()) &&
5889         (loc.value().to() >= current.from())) {
5890       Entry* entry = &loc.value();
5891       // We have overlap.  If there is space between the start point of
5892       // the range we're adding and where the overlapping range starts
5893       // then we have to add a range covering just that space.
5894       if (current.from() < entry->from()) {
5895         ZoneSplayTree<Config>::Locator ins;
5896         bool inserted = tree()->Insert(current.from(), &ins);
5897         DCHECK(inserted);
5898         USE(inserted);
5899         ins.set_value(Entry(current.from(),
5900                             entry->from() - 1,
5901                             empty()->Extend(value, zone)));
5902         current.set_from(entry->from());
5903       }
5904       DCHECK_EQ(current.from(), entry->from());
5905       // If the overlapping range extends beyond the one we want to add
5906       // we have to snap the right part off and add it separately.
5907       if (entry->to() > current.to()) {
5908         ZoneSplayTree<Config>::Locator ins;
5909         bool inserted = tree()->Insert(current.to() + 1, &ins);
5910         DCHECK(inserted);
5911         USE(inserted);
5912         ins.set_value(Entry(current.to() + 1,
5913                             entry->to(),
5914                             entry->out_set()));
5915         entry->set_to(current.to());
5916       }
5917       DCHECK(entry->to() <= current.to());
5918       // The overlapping range is now completely contained by the range
5919       // we're adding so we can just update it and move the start point
5920       // of the range we're adding just past it.
5921       entry->AddValue(value, zone);
5922       // Bail out if the last interval ended at 0xFFFF since otherwise
5923       // adding 1 will wrap around to 0.
5924       if (entry->to() == String::kMaxUtf16CodeUnit)
5925         break;
5926       DCHECK(entry->to() + 1 > current.from());
5927       current.set_from(entry->to() + 1);
5928     } else {
5929       // There is no overlap so we can just add the range
5930       ZoneSplayTree<Config>::Locator ins;
5931       bool inserted = tree()->Insert(current.from(), &ins);
5932       DCHECK(inserted);
5933       USE(inserted);
5934       ins.set_value(Entry(current.from(),
5935                           current.to(),
5936                           empty()->Extend(value, zone)));
5937       break;
5938     }
5939   }
5940 }
5941 
5942 
Get(uc16 value)5943 OutSet* DispatchTable::Get(uc16 value) {
5944   ZoneSplayTree<Config>::Locator loc;
5945   if (!tree()->FindGreatestLessThan(value, &loc))
5946     return empty();
5947   Entry* entry = &loc.value();
5948   if (value <= entry->to())
5949     return entry->out_set();
5950   else
5951     return empty();
5952 }
5953 
5954 
5955 // -------------------------------------------------------------------
5956 // Analysis
5957 
5958 
EnsureAnalyzed(RegExpNode * that)5959 void Analysis::EnsureAnalyzed(RegExpNode* that) {
5960   StackLimitCheck check(isolate());
5961   if (check.HasOverflowed()) {
5962     fail("Stack overflow");
5963     return;
5964   }
5965   if (that->info()->been_analyzed || that->info()->being_analyzed)
5966     return;
5967   that->info()->being_analyzed = true;
5968   that->Accept(this);
5969   that->info()->being_analyzed = false;
5970   that->info()->been_analyzed = true;
5971 }
5972 
5973 
VisitEnd(EndNode * that)5974 void Analysis::VisitEnd(EndNode* that) {
5975   // nothing to do
5976 }
5977 
5978 
CalculateOffsets()5979 void TextNode::CalculateOffsets() {
5980   int element_count = elements()->length();
5981   // Set up the offsets of the elements relative to the start.  This is a fixed
5982   // quantity since a TextNode can only contain fixed-width things.
5983   int cp_offset = 0;
5984   for (int i = 0; i < element_count; i++) {
5985     TextElement& elm = elements()->at(i);
5986     elm.set_cp_offset(cp_offset);
5987     cp_offset += elm.length();
5988   }
5989 }
5990 
5991 
VisitText(TextNode * that)5992 void Analysis::VisitText(TextNode* that) {
5993   if (ignore_case_) {
5994     that->MakeCaseIndependent(isolate(), is_one_byte_);
5995   }
5996   EnsureAnalyzed(that->on_success());
5997   if (!has_failed()) {
5998     that->CalculateOffsets();
5999   }
6000 }
6001 
6002 
VisitAction(ActionNode * that)6003 void Analysis::VisitAction(ActionNode* that) {
6004   RegExpNode* target = that->on_success();
6005   EnsureAnalyzed(target);
6006   if (!has_failed()) {
6007     // If the next node is interested in what it follows then this node
6008     // has to be interested too so it can pass the information on.
6009     that->info()->AddFromFollowing(target->info());
6010   }
6011 }
6012 
6013 
VisitChoice(ChoiceNode * that)6014 void Analysis::VisitChoice(ChoiceNode* that) {
6015   NodeInfo* info = that->info();
6016   for (int i = 0; i < that->alternatives()->length(); i++) {
6017     RegExpNode* node = that->alternatives()->at(i).node();
6018     EnsureAnalyzed(node);
6019     if (has_failed()) return;
6020     // Anything the following nodes need to know has to be known by
6021     // this node also, so it can pass it on.
6022     info->AddFromFollowing(node->info());
6023   }
6024 }
6025 
6026 
VisitLoopChoice(LoopChoiceNode * that)6027 void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
6028   NodeInfo* info = that->info();
6029   for (int i = 0; i < that->alternatives()->length(); i++) {
6030     RegExpNode* node = that->alternatives()->at(i).node();
6031     if (node != that->loop_node()) {
6032       EnsureAnalyzed(node);
6033       if (has_failed()) return;
6034       info->AddFromFollowing(node->info());
6035     }
6036   }
6037   // Check the loop last since it may need the value of this node
6038   // to get a correct result.
6039   EnsureAnalyzed(that->loop_node());
6040   if (!has_failed()) {
6041     info->AddFromFollowing(that->loop_node()->info());
6042   }
6043 }
6044 
6045 
VisitBackReference(BackReferenceNode * that)6046 void Analysis::VisitBackReference(BackReferenceNode* that) {
6047   EnsureAnalyzed(that->on_success());
6048 }
6049 
6050 
VisitAssertion(AssertionNode * that)6051 void Analysis::VisitAssertion(AssertionNode* that) {
6052   EnsureAnalyzed(that->on_success());
6053 }
6054 
6055 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)6056 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6057                                      BoyerMooreLookahead* bm,
6058                                      bool not_at_start) {
6059   // Working out the set of characters that a backreference can match is too
6060   // hard, so we just say that any character can match.
6061   bm->SetRest(offset);
6062   SaveBMInfo(bm, not_at_start, offset);
6063 }
6064 
6065 
6066 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
6067               RegExpMacroAssembler::kTableSize);
6068 
6069 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)6070 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6071                               BoyerMooreLookahead* bm, bool not_at_start) {
6072   ZoneList<GuardedAlternative>* alts = alternatives();
6073   budget = (budget - 1) / alts->length();
6074   for (int i = 0; i < alts->length(); i++) {
6075     GuardedAlternative& alt = alts->at(i);
6076     if (alt.guards() != NULL && alt.guards()->length() != 0) {
6077       bm->SetRest(offset);  // Give up trying to fill in info.
6078       SaveBMInfo(bm, not_at_start, offset);
6079       return;
6080     }
6081     alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
6082   }
6083   SaveBMInfo(bm, not_at_start, offset);
6084 }
6085 
6086 
FillInBMInfo(Isolate * isolate,int initial_offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)6087 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
6088                             BoyerMooreLookahead* bm, bool not_at_start) {
6089   if (initial_offset >= bm->length()) return;
6090   int offset = initial_offset;
6091   int max_char = bm->max_char();
6092   for (int i = 0; i < elements()->length(); i++) {
6093     if (offset >= bm->length()) {
6094       if (initial_offset == 0) set_bm_info(not_at_start, bm);
6095       return;
6096     }
6097     TextElement text = elements()->at(i);
6098     if (text.text_type() == TextElement::ATOM) {
6099       RegExpAtom* atom = text.atom();
6100       for (int j = 0; j < atom->length(); j++, offset++) {
6101         if (offset >= bm->length()) {
6102           if (initial_offset == 0) set_bm_info(not_at_start, bm);
6103           return;
6104         }
6105         uc16 character = atom->data()[j];
6106         if (bm->compiler()->ignore_case()) {
6107           unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
6108           int length = GetCaseIndependentLetters(
6109               isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
6110               chars);
6111           for (int j = 0; j < length; j++) {
6112             bm->Set(offset, chars[j]);
6113           }
6114         } else {
6115           if (character <= max_char) bm->Set(offset, character);
6116         }
6117       }
6118     } else {
6119       DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
6120       RegExpCharacterClass* char_class = text.char_class();
6121       ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
6122       if (char_class->is_negated()) {
6123         bm->SetAll(offset);
6124       } else {
6125         for (int k = 0; k < ranges->length(); k++) {
6126           CharacterRange& range = ranges->at(k);
6127           if (range.from() > max_char) continue;
6128           int to = Min(max_char, static_cast<int>(range.to()));
6129           bm->SetInterval(offset, Interval(range.from(), to));
6130         }
6131       }
6132       offset++;
6133     }
6134   }
6135   if (offset >= bm->length()) {
6136     if (initial_offset == 0) set_bm_info(not_at_start, bm);
6137     return;
6138   }
6139   on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
6140                              true);  // Not at start after a text node.
6141   if (initial_offset == 0) set_bm_info(not_at_start, bm);
6142 }
6143 
6144 
6145 // -------------------------------------------------------------------
6146 // Dispatch table construction
6147 
6148 
VisitEnd(EndNode * that)6149 void DispatchTableConstructor::VisitEnd(EndNode* that) {
6150   AddRange(CharacterRange::Everything());
6151 }
6152 
6153 
BuildTable(ChoiceNode * node)6154 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
6155   node->set_being_calculated(true);
6156   ZoneList<GuardedAlternative>* alternatives = node->alternatives();
6157   for (int i = 0; i < alternatives->length(); i++) {
6158     set_choice_index(i);
6159     alternatives->at(i).node()->Accept(this);
6160   }
6161   node->set_being_calculated(false);
6162 }
6163 
6164 
6165 class AddDispatchRange {
6166  public:
AddDispatchRange(DispatchTableConstructor * constructor)6167   explicit AddDispatchRange(DispatchTableConstructor* constructor)
6168     : constructor_(constructor) { }
6169   void Call(uc32 from, DispatchTable::Entry entry);
6170  private:
6171   DispatchTableConstructor* constructor_;
6172 };
6173 
6174 
Call(uc32 from,DispatchTable::Entry entry)6175 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
6176   CharacterRange range(from, entry.to());
6177   constructor_->AddRange(range);
6178 }
6179 
6180 
VisitChoice(ChoiceNode * node)6181 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
6182   if (node->being_calculated())
6183     return;
6184   DispatchTable* table = node->GetTable(ignore_case_);
6185   AddDispatchRange adder(this);
6186   table->ForEach(&adder);
6187 }
6188 
6189 
VisitBackReference(BackReferenceNode * that)6190 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
6191   // TODO(160): Find the node that we refer back to and propagate its start
6192   // set back to here.  For now we just accept anything.
6193   AddRange(CharacterRange::Everything());
6194 }
6195 
6196 
VisitAssertion(AssertionNode * that)6197 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
6198   RegExpNode* target = that->on_success();
6199   target->Accept(this);
6200 }
6201 
6202 
CompareRangeByFrom(const CharacterRange * a,const CharacterRange * b)6203 static int CompareRangeByFrom(const CharacterRange* a,
6204                               const CharacterRange* b) {
6205   return Compare<uc16>(a->from(), b->from());
6206 }
6207 
6208 
AddInverse(ZoneList<CharacterRange> * ranges)6209 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
6210   ranges->Sort(CompareRangeByFrom);
6211   uc16 last = 0;
6212   for (int i = 0; i < ranges->length(); i++) {
6213     CharacterRange range = ranges->at(i);
6214     if (last < range.from())
6215       AddRange(CharacterRange(last, range.from() - 1));
6216     if (range.to() >= last) {
6217       if (range.to() == String::kMaxUtf16CodeUnit) {
6218         return;
6219       } else {
6220         last = range.to() + 1;
6221       }
6222     }
6223   }
6224   AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit));
6225 }
6226 
6227 
VisitText(TextNode * that)6228 void DispatchTableConstructor::VisitText(TextNode* that) {
6229   TextElement elm = that->elements()->at(0);
6230   switch (elm.text_type()) {
6231     case TextElement::ATOM: {
6232       uc16 c = elm.atom()->data()[0];
6233       AddRange(CharacterRange(c, c));
6234       break;
6235     }
6236     case TextElement::CHAR_CLASS: {
6237       RegExpCharacterClass* tree = elm.char_class();
6238       ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
6239       if (tree->is_negated()) {
6240         AddInverse(ranges);
6241       } else {
6242         for (int i = 0; i < ranges->length(); i++)
6243           AddRange(ranges->at(i));
6244       }
6245       break;
6246     }
6247     default: {
6248       UNIMPLEMENTED();
6249     }
6250   }
6251 }
6252 
6253 
VisitAction(ActionNode * that)6254 void DispatchTableConstructor::VisitAction(ActionNode* that) {
6255   RegExpNode* target = that->on_success();
6256   target->Accept(this);
6257 }
6258 
6259 
Compile(Isolate * isolate,Zone * zone,RegExpCompileData * data,bool ignore_case,bool is_global,bool is_multiline,bool is_sticky,Handle<String> pattern,Handle<String> sample_subject,bool is_one_byte)6260 RegExpEngine::CompilationResult RegExpEngine::Compile(
6261     Isolate* isolate, Zone* zone, RegExpCompileData* data, bool ignore_case,
6262     bool is_global, bool is_multiline, bool is_sticky, Handle<String> pattern,
6263     Handle<String> sample_subject, bool is_one_byte) {
6264   if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6265     return IrregexpRegExpTooBig(isolate);
6266   }
6267   RegExpCompiler compiler(isolate, zone, data->capture_count, ignore_case,
6268                           is_one_byte);
6269 
6270   if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
6271 
6272   // Sample some characters from the middle of the string.
6273   static const int kSampleSize = 128;
6274 
6275   sample_subject = String::Flatten(sample_subject);
6276   int chars_sampled = 0;
6277   int half_way = (sample_subject->length() - kSampleSize) / 2;
6278   for (int i = Max(0, half_way);
6279        i < sample_subject->length() && chars_sampled < kSampleSize;
6280        i++, chars_sampled++) {
6281     compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6282   }
6283 
6284   // Wrap the body of the regexp in capture #0.
6285   RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6286                                                     0,
6287                                                     &compiler,
6288                                                     compiler.accept());
6289   RegExpNode* node = captured_body;
6290   bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6291   bool is_start_anchored = data->tree->IsAnchoredAtStart();
6292   int max_length = data->tree->max_match();
6293   if (!is_start_anchored && !is_sticky) {
6294     // Add a .*? at the beginning, outside the body capture, unless
6295     // this expression is anchored at the beginning or sticky.
6296     RegExpNode* loop_node = RegExpQuantifier::ToNode(
6297         0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'),
6298         &compiler, captured_body, data->contains_anchor);
6299 
6300     if (data->contains_anchor) {
6301       // Unroll loop once, to take care of the case that might start
6302       // at the start of input.
6303       ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6304       first_step_node->AddAlternative(GuardedAlternative(captured_body));
6305       first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
6306           new (zone) RegExpCharacterClass('*'), false, loop_node)));
6307       node = first_step_node;
6308     } else {
6309       node = loop_node;
6310     }
6311   }
6312   if (is_one_byte) {
6313     node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6314     // Do it again to propagate the new nodes to places where they were not
6315     // put because they had not been calculated yet.
6316     if (node != NULL) {
6317       node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6318     }
6319   }
6320 
6321   if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
6322   data->node = node;
6323   Analysis analysis(isolate, ignore_case, is_one_byte);
6324   analysis.EnsureAnalyzed(node);
6325   if (analysis.has_failed()) {
6326     const char* error_message = analysis.error_message();
6327     return CompilationResult(isolate, error_message);
6328   }
6329 
6330   // Create the correct assembler for the architecture.
6331 #ifndef V8_INTERPRETED_REGEXP
6332   // Native regexp implementation.
6333 
6334   NativeRegExpMacroAssembler::Mode mode =
6335       is_one_byte ? NativeRegExpMacroAssembler::LATIN1
6336                   : NativeRegExpMacroAssembler::UC16;
6337 
6338 #if V8_TARGET_ARCH_IA32
6339   RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode,
6340                                            (data->capture_count + 1) * 2);
6341 #elif V8_TARGET_ARCH_X64
6342   RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode,
6343                                           (data->capture_count + 1) * 2);
6344 #elif V8_TARGET_ARCH_ARM
6345   RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode,
6346                                           (data->capture_count + 1) * 2);
6347 #elif V8_TARGET_ARCH_ARM64
6348   RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode,
6349                                             (data->capture_count + 1) * 2);
6350 #elif V8_TARGET_ARCH_PPC
6351   RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode,
6352                                           (data->capture_count + 1) * 2);
6353 #elif V8_TARGET_ARCH_MIPS
6354   RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6355                                            (data->capture_count + 1) * 2);
6356 #elif V8_TARGET_ARCH_MIPS64
6357   RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6358                                            (data->capture_count + 1) * 2);
6359 #elif V8_TARGET_ARCH_X87
6360   RegExpMacroAssemblerX87 macro_assembler(isolate, zone, mode,
6361                                           (data->capture_count + 1) * 2);
6362 #else
6363 #error "Unsupported architecture"
6364 #endif
6365 
6366 #else  // V8_INTERPRETED_REGEXP
6367   // Interpreted regexp implementation.
6368   EmbeddedVector<byte, 1024> codes;
6369   RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone);
6370 #endif  // V8_INTERPRETED_REGEXP
6371 
6372   macro_assembler.set_slow_safe(TooMuchRegExpCode(pattern));
6373 
6374   // Inserted here, instead of in Assembler, because it depends on information
6375   // in the AST that isn't replicated in the Node structure.
6376   static const int kMaxBacksearchLimit = 1024;
6377   if (is_end_anchored &&
6378       !is_start_anchored &&
6379       max_length < kMaxBacksearchLimit) {
6380     macro_assembler.SetCurrentPositionFromEnd(max_length);
6381   }
6382 
6383   if (is_global) {
6384     macro_assembler.set_global_mode(
6385         (data->tree->min_match() > 0)
6386             ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK
6387             : RegExpMacroAssembler::GLOBAL);
6388   }
6389 
6390   return compiler.Assemble(&macro_assembler,
6391                            node,
6392                            data->capture_count,
6393                            pattern);
6394 }
6395 
6396 
TooMuchRegExpCode(Handle<String> pattern)6397 bool RegExpEngine::TooMuchRegExpCode(Handle<String> pattern) {
6398   Heap* heap = pattern->GetHeap();
6399   bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6400   if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&
6401       heap->isolate()->memory_allocator()->SizeExecutable() >
6402           RegExpImpl::kRegExpExecutableMemoryLimit) {
6403     too_much = true;
6404   }
6405   return too_much;
6406 }
6407 
6408 
Lookup(Heap * heap,String * key_string,Object * key_pattern,FixedArray ** last_match_cache,ResultsCacheType type)6409 Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string,
6410                                    Object* key_pattern,
6411                                    FixedArray** last_match_cache,
6412                                    ResultsCacheType type) {
6413   FixedArray* cache;
6414   if (!key_string->IsInternalizedString()) return Smi::FromInt(0);
6415   if (type == STRING_SPLIT_SUBSTRINGS) {
6416     DCHECK(key_pattern->IsString());
6417     if (!key_pattern->IsInternalizedString()) return Smi::FromInt(0);
6418     cache = heap->string_split_cache();
6419   } else {
6420     DCHECK(type == REGEXP_MULTIPLE_INDICES);
6421     DCHECK(key_pattern->IsFixedArray());
6422     cache = heap->regexp_multiple_cache();
6423   }
6424 
6425   uint32_t hash = key_string->Hash();
6426   uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6427                     ~(kArrayEntriesPerCacheEntry - 1));
6428   if (cache->get(index + kStringOffset) != key_string ||
6429       cache->get(index + kPatternOffset) != key_pattern) {
6430     index =
6431         ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6432     if (cache->get(index + kStringOffset) != key_string ||
6433         cache->get(index + kPatternOffset) != key_pattern) {
6434       return Smi::FromInt(0);
6435     }
6436   }
6437 
6438   *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
6439   return cache->get(index + kArrayOffset);
6440 }
6441 
6442 
Enter(Isolate * isolate,Handle<String> key_string,Handle<Object> key_pattern,Handle<FixedArray> value_array,Handle<FixedArray> last_match_cache,ResultsCacheType type)6443 void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
6444                                Handle<Object> key_pattern,
6445                                Handle<FixedArray> value_array,
6446                                Handle<FixedArray> last_match_cache,
6447                                ResultsCacheType type) {
6448   Factory* factory = isolate->factory();
6449   Handle<FixedArray> cache;
6450   if (!key_string->IsInternalizedString()) return;
6451   if (type == STRING_SPLIT_SUBSTRINGS) {
6452     DCHECK(key_pattern->IsString());
6453     if (!key_pattern->IsInternalizedString()) return;
6454     cache = factory->string_split_cache();
6455   } else {
6456     DCHECK(type == REGEXP_MULTIPLE_INDICES);
6457     DCHECK(key_pattern->IsFixedArray());
6458     cache = factory->regexp_multiple_cache();
6459   }
6460 
6461   uint32_t hash = key_string->Hash();
6462   uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6463                     ~(kArrayEntriesPerCacheEntry - 1));
6464   if (cache->get(index + kStringOffset) == Smi::FromInt(0)) {
6465     cache->set(index + kStringOffset, *key_string);
6466     cache->set(index + kPatternOffset, *key_pattern);
6467     cache->set(index + kArrayOffset, *value_array);
6468     cache->set(index + kLastMatchOffset, *last_match_cache);
6469   } else {
6470     uint32_t index2 =
6471         ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6472     if (cache->get(index2 + kStringOffset) == Smi::FromInt(0)) {
6473       cache->set(index2 + kStringOffset, *key_string);
6474       cache->set(index2 + kPatternOffset, *key_pattern);
6475       cache->set(index2 + kArrayOffset, *value_array);
6476       cache->set(index2 + kLastMatchOffset, *last_match_cache);
6477     } else {
6478       cache->set(index2 + kStringOffset, Smi::FromInt(0));
6479       cache->set(index2 + kPatternOffset, Smi::FromInt(0));
6480       cache->set(index2 + kArrayOffset, Smi::FromInt(0));
6481       cache->set(index2 + kLastMatchOffset, Smi::FromInt(0));
6482       cache->set(index + kStringOffset, *key_string);
6483       cache->set(index + kPatternOffset, *key_pattern);
6484       cache->set(index + kArrayOffset, *value_array);
6485       cache->set(index + kLastMatchOffset, *last_match_cache);
6486     }
6487   }
6488   // If the array is a reasonably short list of substrings, convert it into a
6489   // list of internalized strings.
6490   if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
6491     for (int i = 0; i < value_array->length(); i++) {
6492       Handle<String> str(String::cast(value_array->get(i)), isolate);
6493       Handle<String> internalized_str = factory->InternalizeString(str);
6494       value_array->set(i, *internalized_str);
6495     }
6496   }
6497   // Convert backing store to a copy-on-write array.
6498   value_array->set_map_no_write_barrier(*factory->fixed_cow_array_map());
6499 }
6500 
6501 
Clear(FixedArray * cache)6502 void RegExpResultsCache::Clear(FixedArray* cache) {
6503   for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6504     cache->set(i, Smi::FromInt(0));
6505   }
6506 }
6507 
6508 }  // namespace internal
6509 }  // namespace v8
6510