1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/jsregexp.h"
6 
7 #include <memory>
8 #include <vector>
9 
10 #include "src/base/platform/platform.h"
11 #include "src/code-tracer.h"
12 #include "src/compilation-cache.h"
13 #include "src/elements.h"
14 #include "src/execution.h"
15 #include "src/heap/factory.h"
16 #include "src/isolate-inl.h"
17 #include "src/messages.h"
18 #include "src/ostreams.h"
19 #include "src/regexp/interpreter-irregexp.h"
20 #include "src/regexp/jsregexp-inl.h"
21 #include "src/regexp/regexp-macro-assembler-irregexp.h"
22 #include "src/regexp/regexp-macro-assembler-tracer.h"
23 #include "src/regexp/regexp-macro-assembler.h"
24 #include "src/regexp/regexp-parser.h"
25 #include "src/regexp/regexp-stack.h"
26 #include "src/runtime/runtime.h"
27 #include "src/splay-tree-inl.h"
28 #include "src/string-search.h"
29 #include "src/unicode-decoder.h"
30 #include "src/unicode-inl.h"
31 
32 #ifdef V8_INTL_SUPPORT
33 #include "unicode/uniset.h"
34 #include "unicode/utypes.h"
35 #endif  // V8_INTL_SUPPORT
36 
37 #ifndef V8_INTERPRETED_REGEXP
38 #if V8_TARGET_ARCH_IA32
39 #include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
40 #elif V8_TARGET_ARCH_X64
41 #include "src/regexp/x64/regexp-macro-assembler-x64.h"
42 #elif V8_TARGET_ARCH_ARM64
43 #include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
44 #elif V8_TARGET_ARCH_ARM
45 #include "src/regexp/arm/regexp-macro-assembler-arm.h"
46 #elif V8_TARGET_ARCH_PPC
47 #include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
48 #elif V8_TARGET_ARCH_S390
49 #include "src/regexp/s390/regexp-macro-assembler-s390.h"
50 #elif V8_TARGET_ARCH_MIPS
51 #include "src/regexp/mips/regexp-macro-assembler-mips.h"
52 #elif V8_TARGET_ARCH_MIPS64
53 #include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
54 #else
55 #error Unsupported target architecture.
56 #endif
57 #endif
58 
59 
60 namespace v8 {
61 namespace internal {
62 
63 V8_WARN_UNUSED_RESULT
ThrowRegExpException(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,Handle<String> error_text)64 static inline MaybeHandle<Object> ThrowRegExpException(
65     Isolate* isolate, Handle<JSRegExp> re, Handle<String> pattern,
66     Handle<String> error_text) {
67   THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
68                                           pattern, error_text),
69                   Object);
70 }
71 
ThrowRegExpException(Isolate * isolate,Handle<JSRegExp> re,Handle<String> error_text)72 inline void ThrowRegExpException(Isolate* isolate, Handle<JSRegExp> re,
73                                  Handle<String> error_text) {
74   USE(ThrowRegExpException(isolate, re, Handle<String>(re->Pattern(), isolate),
75                            error_text));
76 }
77 
78 
AddRange(ContainedInLattice containment,const int * ranges,int ranges_length,Interval new_range)79 ContainedInLattice AddRange(ContainedInLattice containment,
80                             const int* ranges,
81                             int ranges_length,
82                             Interval new_range) {
83   DCHECK_EQ(1, ranges_length & 1);
84   DCHECK_EQ(String::kMaxCodePoint + 1, ranges[ranges_length - 1]);
85   if (containment == kLatticeUnknown) return containment;
86   bool inside = false;
87   int last = 0;
88   for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
89     // Consider the range from last to ranges[i].
90     // We haven't got to the new range yet.
91     if (ranges[i] <= new_range.from()) continue;
92     // New range is wholly inside last-ranges[i].  Note that new_range.to() is
93     // inclusive, but the values in ranges are not.
94     if (last <= new_range.from() && new_range.to() < ranges[i]) {
95       return Combine(containment, inside ? kLatticeIn : kLatticeOut);
96     }
97     return kLatticeUnknown;
98   }
99   return containment;
100 }
101 
102 // More makes code generation slower, less makes V8 benchmark score lower.
103 const int kMaxLookaheadForBoyerMoore = 8;
104 // In a 3-character pattern you can maximally step forwards 3 characters
105 // at a time, which is not always enough to pay for the extra logic.
106 const int kPatternTooShortForBoyerMoore = 2;
107 
108 // Identifies the sort of regexps where the regexp engine is faster
109 // than the code used for atom matches.
HasFewDifferentCharacters(Handle<String> pattern)110 static bool HasFewDifferentCharacters(Handle<String> pattern) {
111   int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
112   if (length <= kPatternTooShortForBoyerMoore) return false;
113   const int kMod = 128;
114   bool character_found[kMod];
115   int different = 0;
116   memset(&character_found[0], 0, sizeof(character_found));
117   for (int i = 0; i < length; i++) {
118     int ch = (pattern->Get(i) & (kMod - 1));
119     if (!character_found[ch]) {
120       character_found[ch] = true;
121       different++;
122       // We declare a regexp low-alphabet if it has at least 3 times as many
123       // characters as it has different characters.
124       if (different * 3 > length) return false;
125     }
126   }
127   return true;
128 }
129 
130 // Generic RegExp methods. Dispatches to implementation specific methods.
131 
Compile(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags)132 MaybeHandle<Object> RegExpImpl::Compile(Isolate* isolate, Handle<JSRegExp> re,
133                                         Handle<String> pattern,
134                                         JSRegExp::Flags flags) {
135   DCHECK(pattern->IsFlat());
136 
137   Zone zone(isolate->allocator(), ZONE_NAME);
138   CompilationCache* compilation_cache = isolate->compilation_cache();
139   MaybeHandle<FixedArray> maybe_cached =
140       compilation_cache->LookupRegExp(pattern, flags);
141   Handle<FixedArray> cached;
142   if (maybe_cached.ToHandle(&cached)) {
143     re->set_data(*cached);
144     return re;
145   }
146 
147   PostponeInterruptsScope postpone(isolate);
148   RegExpCompileData parse_result;
149   FlatStringReader reader(isolate, pattern);
150   DCHECK(!isolate->has_pending_exception());
151   if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
152                                  &parse_result)) {
153     // Throw an exception if we fail to parse the pattern.
154     return ThrowRegExpException(isolate, re, pattern, parse_result.error);
155   }
156 
157   bool has_been_compiled = false;
158 
159   if (parse_result.simple && !IgnoreCase(flags) && !IsSticky(flags) &&
160       !HasFewDifferentCharacters(pattern)) {
161     // Parse-tree is a single atom that is equal to the pattern.
162     AtomCompile(isolate, re, pattern, flags, pattern);
163     has_been_compiled = true;
164   } else if (parse_result.tree->IsAtom() && !IsSticky(flags) &&
165              parse_result.capture_count == 0) {
166     RegExpAtom* atom = parse_result.tree->AsAtom();
167     Vector<const uc16> atom_pattern = atom->data();
168     Handle<String> atom_string;
169     ASSIGN_RETURN_ON_EXCEPTION(
170         isolate, atom_string,
171         isolate->factory()->NewStringFromTwoByte(atom_pattern), Object);
172     if (!IgnoreCase(atom->flags()) && !HasFewDifferentCharacters(atom_string)) {
173       AtomCompile(isolate, re, pattern, flags, atom_string);
174       has_been_compiled = true;
175     }
176   }
177   if (!has_been_compiled) {
178     IrregexpInitialize(isolate, re, pattern, flags, parse_result.capture_count);
179   }
180   DCHECK(re->data()->IsFixedArray());
181   // Compilation succeeded so the data is set on the regexp
182   // and we can store it in the cache.
183   Handle<FixedArray> data(FixedArray::cast(re->data()), isolate);
184   compilation_cache->PutRegExp(pattern, flags, data);
185 
186   return re;
187 }
188 
Exec(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int index,Handle<RegExpMatchInfo> last_match_info)189 MaybeHandle<Object> RegExpImpl::Exec(Isolate* isolate, Handle<JSRegExp> regexp,
190                                      Handle<String> subject, int index,
191                                      Handle<RegExpMatchInfo> last_match_info) {
192   switch (regexp->TypeTag()) {
193     case JSRegExp::ATOM:
194       return AtomExec(isolate, regexp, subject, index, last_match_info);
195     case JSRegExp::IRREGEXP: {
196       return IrregexpExec(isolate, regexp, subject, index, last_match_info);
197     }
198     default:
199       UNREACHABLE();
200   }
201 }
202 
203 
204 // RegExp Atom implementation: Simple string search using indexOf.
205 
AtomCompile(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,Handle<String> match_pattern)206 void RegExpImpl::AtomCompile(Isolate* isolate, Handle<JSRegExp> re,
207                              Handle<String> pattern, JSRegExp::Flags flags,
208                              Handle<String> match_pattern) {
209   isolate->factory()->SetRegExpAtomData(re, JSRegExp::ATOM, pattern, flags,
210                                         match_pattern);
211 }
212 
SetAtomLastCapture(Isolate * isolate,Handle<RegExpMatchInfo> last_match_info,String * subject,int from,int to)213 static void SetAtomLastCapture(Isolate* isolate,
214                                Handle<RegExpMatchInfo> last_match_info,
215                                String* subject, int from, int to) {
216   SealHandleScope shs(isolate);
217   last_match_info->SetNumberOfCaptureRegisters(2);
218   last_match_info->SetLastSubject(subject);
219   last_match_info->SetLastInput(subject);
220   last_match_info->SetCapture(0, from);
221   last_match_info->SetCapture(1, to);
222 }
223 
AtomExecRaw(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)224 int RegExpImpl::AtomExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
225                             Handle<String> subject, int index, int32_t* output,
226                             int output_size) {
227   DCHECK_LE(0, index);
228   DCHECK_LE(index, subject->length());
229 
230   subject = String::Flatten(isolate, subject);
231   DisallowHeapAllocation no_gc;  // ensure vectors stay valid
232 
233   String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
234   int needle_len = needle->length();
235   DCHECK(needle->IsFlat());
236   DCHECK_LT(0, needle_len);
237 
238   if (index + needle_len > subject->length()) {
239     return RegExpImpl::RE_FAILURE;
240   }
241 
242   for (int i = 0; i < output_size; i += 2) {
243     String::FlatContent needle_content = needle->GetFlatContent();
244     String::FlatContent subject_content = subject->GetFlatContent();
245     DCHECK(needle_content.IsFlat());
246     DCHECK(subject_content.IsFlat());
247     // dispatch on type of strings
248     index =
249         (needle_content.IsOneByte()
250              ? (subject_content.IsOneByte()
251                     ? SearchString(isolate, subject_content.ToOneByteVector(),
252                                    needle_content.ToOneByteVector(), index)
253                     : SearchString(isolate, subject_content.ToUC16Vector(),
254                                    needle_content.ToOneByteVector(), index))
255              : (subject_content.IsOneByte()
256                     ? SearchString(isolate, subject_content.ToOneByteVector(),
257                                    needle_content.ToUC16Vector(), index)
258                     : SearchString(isolate, subject_content.ToUC16Vector(),
259                                    needle_content.ToUC16Vector(), index)));
260     if (index == -1) {
261       return i / 2;  // Return number of matches.
262     } else {
263       output[i] = index;
264       output[i+1] = index + needle_len;
265       index += needle_len;
266     }
267   }
268   return output_size / 2;
269 }
270 
AtomExec(Isolate * isolate,Handle<JSRegExp> re,Handle<String> subject,int index,Handle<RegExpMatchInfo> last_match_info)271 Handle<Object> RegExpImpl::AtomExec(Isolate* isolate, Handle<JSRegExp> re,
272                                     Handle<String> subject, int index,
273                                     Handle<RegExpMatchInfo> last_match_info) {
274   static const int kNumRegisters = 2;
275   STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
276   int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
277 
278   int res =
279       AtomExecRaw(isolate, re, subject, index, output_registers, kNumRegisters);
280 
281   if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
282 
283   DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
284   SealHandleScope shs(isolate);
285   SetAtomLastCapture(isolate, last_match_info, *subject, output_registers[0],
286                      output_registers[1]);
287   return last_match_info;
288 }
289 
290 
291 // Irregexp implementation.
292 
293 // Ensures that the regexp object contains a compiled version of the
294 // source for either one-byte or two-byte subject strings.
295 // If the compiled version doesn't already exist, it is compiled
296 // from the source pattern.
297 // If compilation fails, an exception is thrown and this function
298 // returns false.
EnsureCompiledIrregexp(Isolate * isolate,Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)299 bool RegExpImpl::EnsureCompiledIrregexp(Isolate* isolate, Handle<JSRegExp> re,
300                                         Handle<String> sample_subject,
301                                         bool is_one_byte) {
302   Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
303 #ifdef V8_INTERPRETED_REGEXP
304   if (compiled_code->IsByteArray()) return true;
305 #else  // V8_INTERPRETED_REGEXP (RegExp native code)
306   if (compiled_code->IsCode()) return true;
307 #endif
308   return CompileIrregexp(isolate, re, sample_subject, is_one_byte);
309 }
310 
CompileIrregexp(Isolate * isolate,Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)311 bool RegExpImpl::CompileIrregexp(Isolate* isolate, Handle<JSRegExp> re,
312                                  Handle<String> sample_subject,
313                                  bool is_one_byte) {
314   // Compile the RegExp.
315   Zone zone(isolate->allocator(), ZONE_NAME);
316   PostponeInterruptsScope postpone(isolate);
317 #ifdef DEBUG
318   Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
319   // When arriving here entry can only be a smi representing an uncompiled
320   // regexp.
321   DCHECK(entry->IsSmi());
322   int entry_value = Smi::ToInt(entry);
323   DCHECK_EQ(JSRegExp::kUninitializedValue, entry_value);
324 #endif
325 
326   JSRegExp::Flags flags = re->GetFlags();
327 
328   Handle<String> pattern(re->Pattern(), isolate);
329   pattern = String::Flatten(isolate, pattern);
330   RegExpCompileData compile_data;
331   FlatStringReader reader(isolate, pattern);
332   if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
333                                  &compile_data)) {
334     // Throw an exception if we fail to parse the pattern.
335     // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
336     USE(ThrowRegExpException(isolate, re, pattern, compile_data.error));
337     return false;
338   }
339   RegExpEngine::CompilationResult result =
340       RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
341                             sample_subject, is_one_byte);
342   if (result.error_message != nullptr) {
343     // Unable to compile regexp.
344     if (FLAG_abort_on_stack_or_string_length_overflow &&
345         strncmp(result.error_message, "Stack overflow", 15) == 0) {
346       FATAL("Aborting on stack overflow");
347     }
348     Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
349         CStrVector(result.error_message)).ToHandleChecked();
350     ThrowRegExpException(isolate, re, error_message);
351     return false;
352   }
353 
354   Handle<FixedArray> data =
355       Handle<FixedArray>(FixedArray::cast(re->data()), isolate);
356   data->set(JSRegExp::code_index(is_one_byte), result.code);
357   SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map);
358   int register_max = IrregexpMaxRegisterCount(*data);
359   if (result.num_registers > register_max) {
360     SetIrregexpMaxRegisterCount(*data, result.num_registers);
361   }
362 
363   return true;
364 }
365 
366 
IrregexpMaxRegisterCount(FixedArray * re)367 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
368   return Smi::cast(
369       re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
370 }
371 
372 
SetIrregexpMaxRegisterCount(FixedArray * re,int value)373 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
374   re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
375 }
376 
SetIrregexpCaptureNameMap(FixedArray * re,Handle<FixedArray> value)377 void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray* re,
378                                            Handle<FixedArray> value) {
379   if (value.is_null()) {
380     re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::kZero);
381   } else {
382     re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value);
383   }
384 }
385 
IrregexpNumberOfCaptures(FixedArray * re)386 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
387   return Smi::ToInt(re->get(JSRegExp::kIrregexpCaptureCountIndex));
388 }
389 
390 
IrregexpNumberOfRegisters(FixedArray * re)391 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
392   return Smi::ToInt(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex));
393 }
394 
395 
IrregexpByteCode(FixedArray * re,bool is_one_byte)396 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
397   return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
398 }
399 
400 
IrregexpNativeCode(FixedArray * re,bool is_one_byte)401 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
402   return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
403 }
404 
IrregexpInitialize(Isolate * isolate,Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,int capture_count)405 void RegExpImpl::IrregexpInitialize(Isolate* isolate, Handle<JSRegExp> re,
406                                     Handle<String> pattern,
407                                     JSRegExp::Flags flags, int capture_count) {
408   // Initialize compiled code entries to null.
409   isolate->factory()->SetRegExpIrregexpData(re, JSRegExp::IRREGEXP, pattern,
410                                             flags, capture_count);
411 }
412 
IrregexpPrepare(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject)413 int RegExpImpl::IrregexpPrepare(Isolate* isolate, Handle<JSRegExp> regexp,
414                                 Handle<String> subject) {
415   DCHECK(subject->IsFlat());
416 
417   // Check representation of the underlying storage.
418   bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
419   if (!EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte)) return -1;
420 
421 #ifdef V8_INTERPRETED_REGEXP
422   // Byte-code regexp needs space allocated for all its registers.
423   // The result captures are copied to the start of the registers array
424   // if the match succeeds.  This way those registers are not clobbered
425   // when we set the last match info from last successful match.
426   return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
427          (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
428 #else  // V8_INTERPRETED_REGEXP
429   // Native regexp only needs room to output captures. Registers are handled
430   // internally.
431   return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
432 #endif  // V8_INTERPRETED_REGEXP
433 }
434 
IrregexpExecRaw(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)435 int RegExpImpl::IrregexpExecRaw(Isolate* isolate, Handle<JSRegExp> regexp,
436                                 Handle<String> subject, int index,
437                                 int32_t* output, int output_size) {
438   Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
439 
440   DCHECK_LE(0, index);
441   DCHECK_LE(index, subject->length());
442   DCHECK(subject->IsFlat());
443 
444   bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
445 
446 #ifndef V8_INTERPRETED_REGEXP
447   DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
448   do {
449     EnsureCompiledIrregexp(isolate, regexp, subject, is_one_byte);
450     Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
451     // The stack is used to allocate registers for the compiled regexp code.
452     // This means that in case of failure, the output registers array is left
453     // untouched and contains the capture results from the previous successful
454     // match.  We can use that to set the last match info lazily.
455     NativeRegExpMacroAssembler::Result res =
456         NativeRegExpMacroAssembler::Match(code,
457                                           subject,
458                                           output,
459                                           output_size,
460                                           index,
461                                           isolate);
462     if (res != NativeRegExpMacroAssembler::RETRY) {
463       DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
464              isolate->has_pending_exception());
465       STATIC_ASSERT(
466           static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
467       STATIC_ASSERT(
468           static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
469       STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
470                     == RE_EXCEPTION);
471       return static_cast<IrregexpResult>(res);
472     }
473     // If result is RETRY, the string has changed representation, and we
474     // must restart from scratch.
475     // In this case, it means we must make sure we are prepared to handle
476     // the, potentially, different subject (the string can switch between
477     // being internal and external, and even between being Latin1 and UC16,
478     // but the characters are always the same).
479     IrregexpPrepare(isolate, regexp, subject);
480     is_one_byte = subject->IsOneByteRepresentationUnderneath();
481   } while (true);
482   UNREACHABLE();
483 #else  // V8_INTERPRETED_REGEXP
484 
485   DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
486   // We must have done EnsureCompiledIrregexp, so we can get the number of
487   // registers.
488   int number_of_capture_registers =
489       (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
490   int32_t* raw_output = &output[number_of_capture_registers];
491   // We do not touch the actual capture result registers until we know there
492   // has been a match so that we can use those capture results to set the
493   // last match info.
494   for (int i = number_of_capture_registers - 1; i >= 0; i--) {
495     raw_output[i] = -1;
496   }
497   Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
498                                isolate);
499 
500   IrregexpResult result = IrregexpInterpreter::Match(isolate,
501                                                      byte_codes,
502                                                      subject,
503                                                      raw_output,
504                                                      index);
505   if (result == RE_SUCCESS) {
506     // Copy capture results to the start of the registers array.
507     MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
508   }
509   if (result == RE_EXCEPTION) {
510     DCHECK(!isolate->has_pending_exception());
511     isolate->StackOverflow();
512   }
513   return result;
514 #endif  // V8_INTERPRETED_REGEXP
515 }
516 
IrregexpExec(Isolate * isolate,Handle<JSRegExp> regexp,Handle<String> subject,int previous_index,Handle<RegExpMatchInfo> last_match_info)517 MaybeHandle<Object> RegExpImpl::IrregexpExec(
518     Isolate* isolate, Handle<JSRegExp> regexp, Handle<String> subject,
519     int previous_index, Handle<RegExpMatchInfo> last_match_info) {
520   DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
521 
522   subject = String::Flatten(isolate, subject);
523 
524   // Prepare space for the return values.
525 #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
526   if (FLAG_trace_regexp_bytecodes) {
527     String* pattern = regexp->Pattern();
528     PrintF("\n\nRegexp match:   /%s/\n\n", pattern->ToCString().get());
529     PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
530   }
531 #endif
532   int required_registers =
533       RegExpImpl::IrregexpPrepare(isolate, regexp, subject);
534   if (required_registers < 0) {
535     // Compiling failed with an exception.
536     DCHECK(isolate->has_pending_exception());
537     return MaybeHandle<Object>();
538   }
539 
540   int32_t* output_registers = nullptr;
541   if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
542     output_registers = NewArray<int32_t>(required_registers);
543   }
544   std::unique_ptr<int32_t[]> auto_release(output_registers);
545   if (output_registers == nullptr) {
546     output_registers = isolate->jsregexp_static_offsets_vector();
547   }
548 
549   int res =
550       RegExpImpl::IrregexpExecRaw(isolate, regexp, subject, previous_index,
551                                   output_registers, required_registers);
552   if (res == RE_SUCCESS) {
553     int capture_count =
554         IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
555     return SetLastMatchInfo(isolate, last_match_info, subject, capture_count,
556                             output_registers);
557   }
558   if (res == RE_EXCEPTION) {
559     DCHECK(isolate->has_pending_exception());
560     return MaybeHandle<Object>();
561   }
562   DCHECK(res == RE_FAILURE);
563   return isolate->factory()->null_value();
564 }
565 
SetLastMatchInfo(Isolate * isolate,Handle<RegExpMatchInfo> last_match_info,Handle<String> subject,int capture_count,int32_t * match)566 Handle<RegExpMatchInfo> RegExpImpl::SetLastMatchInfo(
567     Isolate* isolate, Handle<RegExpMatchInfo> last_match_info,
568     Handle<String> subject, int capture_count, int32_t* match) {
569   // This is the only place where match infos can grow. If, after executing the
570   // regexp, RegExpExecStub finds that the match info is too small, it restarts
571   // execution in RegExpImpl::Exec, which finally grows the match info right
572   // here.
573 
574   int capture_register_count = (capture_count + 1) * 2;
575   Handle<RegExpMatchInfo> result = RegExpMatchInfo::ReserveCaptures(
576       isolate, last_match_info, capture_register_count);
577   result->SetNumberOfCaptureRegisters(capture_register_count);
578 
579   if (*result != *last_match_info) {
580     // The match info has been reallocated, update the corresponding reference
581     // on the native context.
582     if (*last_match_info == *isolate->regexp_last_match_info()) {
583       isolate->native_context()->set_regexp_last_match_info(*result);
584     } else if (*last_match_info == *isolate->regexp_internal_match_info()) {
585       isolate->native_context()->set_regexp_internal_match_info(*result);
586     }
587   }
588 
589   DisallowHeapAllocation no_allocation;
590   if (match != nullptr) {
591     for (int i = 0; i < capture_register_count; i += 2) {
592       result->SetCapture(i, match[i]);
593       result->SetCapture(i + 1, match[i + 1]);
594     }
595   }
596   result->SetLastSubject(*subject);
597   result->SetLastInput(*subject);
598   return result;
599 }
600 
GlobalCache(Handle<JSRegExp> regexp,Handle<String> subject,Isolate * isolate)601 RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
602                                      Handle<String> subject, Isolate* isolate)
603     : register_array_(nullptr),
604       register_array_size_(0),
605       regexp_(regexp),
606       subject_(subject),
607       isolate_(isolate) {
608 #ifdef V8_INTERPRETED_REGEXP
609   bool interpreted = true;
610 #else
611   bool interpreted = false;
612 #endif  // V8_INTERPRETED_REGEXP
613 
614   if (regexp_->TypeTag() == JSRegExp::ATOM) {
615     static const int kAtomRegistersPerMatch = 2;
616     registers_per_match_ = kAtomRegistersPerMatch;
617     // There is no distinction between interpreted and native for atom regexps.
618     interpreted = false;
619   } else {
620     registers_per_match_ =
621         RegExpImpl::IrregexpPrepare(isolate_, regexp_, subject_);
622     if (registers_per_match_ < 0) {
623       num_matches_ = -1;  // Signal exception.
624       return;
625     }
626   }
627 
628   DCHECK(IsGlobal(regexp->GetFlags()));
629   if (!interpreted) {
630     register_array_size_ =
631         Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
632     max_matches_ = register_array_size_ / registers_per_match_;
633   } else {
634     // Global loop in interpreted regexp is not implemented.  We choose
635     // the size of the offsets vector so that it can only store one match.
636     register_array_size_ = registers_per_match_;
637     max_matches_ = 1;
638   }
639 
640   if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
641     register_array_ = NewArray<int32_t>(register_array_size_);
642   } else {
643     register_array_ = isolate->jsregexp_static_offsets_vector();
644   }
645 
646   // Set state so that fetching the results the first time triggers a call
647   // to the compiled regexp.
648   current_match_index_ = max_matches_ - 1;
649   num_matches_ = max_matches_;
650   DCHECK_LE(2, registers_per_match_);  // Each match has at least one capture.
651   DCHECK_GE(register_array_size_, registers_per_match_);
652   int32_t* last_match =
653       &register_array_[current_match_index_ * registers_per_match_];
654   last_match[0] = -1;
655   last_match[1] = 0;
656 }
657 
AdvanceZeroLength(int last_index)658 int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
659   if (IsUnicode(regexp_->GetFlags()) && last_index + 1 < subject_->length() &&
660       unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
661       unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
662     // Advance over the surrogate pair.
663     return last_index + 2;
664   }
665   return last_index + 1;
666 }
667 
668 // -------------------------------------------------------------------
669 // Implementation of the Irregexp regular expression engine.
670 //
671 // The Irregexp regular expression engine is intended to be a complete
672 // implementation of ECMAScript regular expressions.  It generates either
673 // bytecodes or native code.
674 
675 //   The Irregexp regexp engine is structured in three steps.
676 //   1) The parser generates an abstract syntax tree.  See ast.cc.
677 //   2) From the AST a node network is created.  The nodes are all
678 //      subclasses of RegExpNode.  The nodes represent states when
679 //      executing a regular expression.  Several optimizations are
680 //      performed on the node network.
681 //   3) From the nodes we generate either byte codes or native code
682 //      that can actually execute the regular expression (perform
683 //      the search).  The code generation step is described in more
684 //      detail below.
685 
686 // Code generation.
687 //
688 //   The nodes are divided into four main categories.
689 //   * Choice nodes
690 //        These represent places where the regular expression can
691 //        match in more than one way.  For example on entry to an
692 //        alternation (foo|bar) or a repetition (*, +, ? or {}).
693 //   * Action nodes
694 //        These represent places where some action should be
695 //        performed.  Examples include recording the current position
696 //        in the input string to a register (in order to implement
697 //        captures) or other actions on register for example in order
698 //        to implement the counters needed for {} repetitions.
699 //   * Matching nodes
700 //        These attempt to match some element part of the input string.
701 //        Examples of elements include character classes, plain strings
702 //        or back references.
703 //   * End nodes
704 //        These are used to implement the actions required on finding
705 //        a successful match or failing to find a match.
706 //
707 //   The code generated (whether as byte codes or native code) maintains
708 //   some state as it runs.  This consists of the following elements:
709 //
710 //   * The capture registers.  Used for string captures.
711 //   * Other registers.  Used for counters etc.
712 //   * The current position.
713 //   * The stack of backtracking information.  Used when a matching node
714 //     fails to find a match and needs to try an alternative.
715 //
716 // Conceptual regular expression execution model:
717 //
718 //   There is a simple conceptual model of regular expression execution
719 //   which will be presented first.  The actual code generated is a more
720 //   efficient simulation of the simple conceptual model:
721 //
722 //   * Choice nodes are implemented as follows:
723 //     For each choice except the last {
724 //       push current position
725 //       push backtrack code location
726 //       <generate code to test for choice>
727 //       backtrack code location:
728 //       pop current position
729 //     }
730 //     <generate code to test for last choice>
731 //
732 //   * Actions nodes are generated as follows
733 //     <push affected registers on backtrack stack>
734 //     <generate code to perform action>
735 //     push backtrack code location
736 //     <generate code to test for following nodes>
737 //     backtrack code location:
738 //     <pop affected registers to restore their state>
739 //     <pop backtrack location from stack and go to it>
740 //
741 //   * Matching nodes are generated as follows:
742 //     if input string matches at current position
743 //       update current position
744 //       <generate code to test for following nodes>
745 //     else
746 //       <pop backtrack location from stack and go to it>
747 //
748 //   Thus it can be seen that the current position is saved and restored
749 //   by the choice nodes, whereas the registers are saved and restored by
750 //   by the action nodes that manipulate them.
751 //
752 //   The other interesting aspect of this model is that nodes are generated
753 //   at the point where they are needed by a recursive call to Emit().  If
754 //   the node has already been code generated then the Emit() call will
755 //   generate a jump to the previously generated code instead.  In order to
756 //   limit recursion it is possible for the Emit() function to put the node
757 //   on a work list for later generation and instead generate a jump.  The
758 //   destination of the jump is resolved later when the code is generated.
759 //
760 // Actual regular expression code generation.
761 //
762 //   Code generation is actually more complicated than the above.  In order
763 //   to improve the efficiency of the generated code some optimizations are
764 //   performed
765 //
766 //   * Choice nodes have 1-character lookahead.
767 //     A choice node looks at the following character and eliminates some of
768 //     the choices immediately based on that character.  This is not yet
769 //     implemented.
770 //   * Simple greedy loops store reduced backtracking information.
771 //     A quantifier like /.*foo/m will greedily match the whole input.  It will
772 //     then need to backtrack to a point where it can match "foo".  The naive
773 //     implementation of this would push each character position onto the
774 //     backtracking stack, then pop them off one by one.  This would use space
775 //     proportional to the length of the input string.  However since the "."
776 //     can only match in one way and always has a constant length (in this case
777 //     of 1) it suffices to store the current position on the top of the stack
778 //     once.  Matching now becomes merely incrementing the current position and
779 //     backtracking becomes decrementing the current position and checking the
780 //     result against the stored current position.  This is faster and saves
781 //     space.
782 //   * The current state is virtualized.
783 //     This is used to defer expensive operations until it is clear that they
784 //     are needed and to generate code for a node more than once, allowing
785 //     specialized an efficient versions of the code to be created. This is
786 //     explained in the section below.
787 //
788 // Execution state virtualization.
789 //
790 //   Instead of emitting code, nodes that manipulate the state can record their
791 //   manipulation in an object called the Trace.  The Trace object can record a
792 //   current position offset, an optional backtrack code location on the top of
793 //   the virtualized backtrack stack and some register changes.  When a node is
794 //   to be emitted it can flush the Trace or update it.  Flushing the Trace
795 //   will emit code to bring the actual state into line with the virtual state.
796 //   Avoiding flushing the state can postpone some work (e.g. updates of capture
797 //   registers).  Postponing work can save time when executing the regular
798 //   expression since it may be found that the work never has to be done as a
799 //   failure to match can occur.  In addition it is much faster to jump to a
800 //   known backtrack code location than it is to pop an unknown backtrack
801 //   location from the stack and jump there.
802 //
803 //   The virtual state found in the Trace affects code generation.  For example
804 //   the virtual state contains the difference between the actual current
805 //   position and the virtual current position, and matching code needs to use
806 //   this offset to attempt a match in the correct location of the input
807 //   string.  Therefore code generated for a non-trivial trace is specialized
808 //   to that trace.  The code generator therefore has the ability to generate
809 //   code for each node several times.  In order to limit the size of the
810 //   generated code there is an arbitrary limit on how many specialized sets of
811 //   code may be generated for a given node.  If the limit is reached, the
812 //   trace is flushed and a generic version of the code for a node is emitted.
813 //   This is subsequently used for that node.  The code emitted for non-generic
814 //   trace is not recorded in the node and so it cannot currently be reused in
815 //   the event that code generation is requested for an identical trace.
816 
817 
AppendToText(RegExpText * text,Zone * zone)818 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
819   UNREACHABLE();
820 }
821 
822 
AppendToText(RegExpText * text,Zone * zone)823 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
824   text->AddElement(TextElement::Atom(this), zone);
825 }
826 
827 
AppendToText(RegExpText * text,Zone * zone)828 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
829   text->AddElement(TextElement::CharClass(this), zone);
830 }
831 
832 
AppendToText(RegExpText * text,Zone * zone)833 void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
834   for (int i = 0; i < elements()->length(); i++)
835     text->AddElement(elements()->at(i), zone);
836 }
837 
838 
Atom(RegExpAtom * atom)839 TextElement TextElement::Atom(RegExpAtom* atom) {
840   return TextElement(ATOM, atom);
841 }
842 
843 
CharClass(RegExpCharacterClass * char_class)844 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
845   return TextElement(CHAR_CLASS, char_class);
846 }
847 
848 
length() const849 int TextElement::length() const {
850   switch (text_type()) {
851     case ATOM:
852       return atom()->length();
853 
854     case CHAR_CLASS:
855       return 1;
856   }
857   UNREACHABLE();
858 }
859 
860 
GetTable(bool ignore_case)861 DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
862   if (table_ == nullptr) {
863     table_ = new(zone()) DispatchTable(zone());
864     DispatchTableConstructor cons(table_, ignore_case, zone());
865     cons.BuildTable(this);
866   }
867   return table_;
868 }
869 
870 
871 class FrequencyCollator {
872  public:
FrequencyCollator()873   FrequencyCollator() : total_samples_(0) {
874     for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
875       frequencies_[i] = CharacterFrequency(i);
876     }
877   }
878 
CountCharacter(int character)879   void CountCharacter(int character) {
880     int index = (character & RegExpMacroAssembler::kTableMask);
881     frequencies_[index].Increment();
882     total_samples_++;
883   }
884 
885   // Does not measure in percent, but rather per-128 (the table size from the
886   // regexp macro assembler).
Frequency(int in_character)887   int Frequency(int in_character) {
888     DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
889     if (total_samples_ < 1) return 1;  // Division by zero.
890     int freq_in_per128 =
891         (frequencies_[in_character].counter() * 128) / total_samples_;
892     return freq_in_per128;
893   }
894 
895  private:
896   class CharacterFrequency {
897    public:
CharacterFrequency()898     CharacterFrequency() : counter_(0), character_(-1) { }
CharacterFrequency(int character)899     explicit CharacterFrequency(int character)
900         : counter_(0), character_(character) { }
901 
Increment()902     void Increment() { counter_++; }
counter()903     int counter() { return counter_; }
character()904     int character() { return character_; }
905 
906    private:
907     int counter_;
908     int character_;
909   };
910 
911 
912  private:
913   CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
914   int total_samples_;
915 };
916 
917 
918 class RegExpCompiler {
919  public:
920   RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
921                  bool is_one_byte);
922 
AllocateRegister()923   int AllocateRegister() {
924     if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
925       reg_exp_too_big_ = true;
926       return next_register_;
927     }
928     return next_register_++;
929   }
930 
931   // Lookarounds to match lone surrogates for unicode character class matches
932   // are never nested. We can therefore reuse registers.
UnicodeLookaroundStackRegister()933   int UnicodeLookaroundStackRegister() {
934     if (unicode_lookaround_stack_register_ == kNoRegister) {
935       unicode_lookaround_stack_register_ = AllocateRegister();
936     }
937     return unicode_lookaround_stack_register_;
938   }
939 
UnicodeLookaroundPositionRegister()940   int UnicodeLookaroundPositionRegister() {
941     if (unicode_lookaround_position_register_ == kNoRegister) {
942       unicode_lookaround_position_register_ = AllocateRegister();
943     }
944     return unicode_lookaround_position_register_;
945   }
946 
947   RegExpEngine::CompilationResult Assemble(Isolate* isolate,
948                                            RegExpMacroAssembler* assembler,
949                                            RegExpNode* start, int capture_count,
950                                            Handle<String> pattern);
951 
AddWork(RegExpNode * node)952   inline void AddWork(RegExpNode* node) {
953     if (!node->on_work_list() && !node->label()->is_bound()) {
954       node->set_on_work_list(true);
955       work_list_->push_back(node);
956     }
957   }
958 
959   static const int kImplementationOffset = 0;
960   static const int kNumberOfRegistersOffset = 0;
961   static const int kCodeOffset = 1;
962 
macro_assembler()963   RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
accept()964   EndNode* accept() { return accept_; }
965 
966   static const int kMaxRecursion = 100;
recursion_depth()967   inline int recursion_depth() { return recursion_depth_; }
IncrementRecursionDepth()968   inline void IncrementRecursionDepth() { recursion_depth_++; }
DecrementRecursionDepth()969   inline void DecrementRecursionDepth() { recursion_depth_--; }
970 
SetRegExpTooBig()971   void SetRegExpTooBig() { reg_exp_too_big_ = true; }
972 
one_byte()973   inline bool one_byte() { return one_byte_; }
optimize()974   inline bool optimize() { return optimize_; }
set_optimize(bool value)975   inline void set_optimize(bool value) { optimize_ = value; }
limiting_recursion()976   inline bool limiting_recursion() { return limiting_recursion_; }
set_limiting_recursion(bool value)977   inline void set_limiting_recursion(bool value) {
978     limiting_recursion_ = value;
979   }
read_backward()980   bool read_backward() { return read_backward_; }
set_read_backward(bool value)981   void set_read_backward(bool value) { read_backward_ = value; }
frequency_collator()982   FrequencyCollator* frequency_collator() { return &frequency_collator_; }
983 
current_expansion_factor()984   int current_expansion_factor() { return current_expansion_factor_; }
set_current_expansion_factor(int value)985   void set_current_expansion_factor(int value) {
986     current_expansion_factor_ = value;
987   }
988 
isolate() const989   Isolate* isolate() const { return isolate_; }
zone() const990   Zone* zone() const { return zone_; }
991 
992   static const int kNoRegister = -1;
993 
994  private:
995   EndNode* accept_;
996   int next_register_;
997   int unicode_lookaround_stack_register_;
998   int unicode_lookaround_position_register_;
999   std::vector<RegExpNode*>* work_list_;
1000   int recursion_depth_;
1001   RegExpMacroAssembler* macro_assembler_;
1002   bool one_byte_;
1003   bool reg_exp_too_big_;
1004   bool limiting_recursion_;
1005   bool optimize_;
1006   bool read_backward_;
1007   int current_expansion_factor_;
1008   FrequencyCollator frequency_collator_;
1009   Isolate* isolate_;
1010   Zone* zone_;
1011 };
1012 
1013 
1014 class RecursionCheck {
1015  public:
RecursionCheck(RegExpCompiler * compiler)1016   explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1017     compiler->IncrementRecursionDepth();
1018   }
~RecursionCheck()1019   ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1020  private:
1021   RegExpCompiler* compiler_;
1022 };
1023 
1024 
IrregexpRegExpTooBig(Isolate * isolate)1025 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1026   return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1027 }
1028 
1029 
1030 // Attempts to compile the regexp using an Irregexp code generator.  Returns
1031 // a fixed array or a null handle depending on whether it succeeded.
RegExpCompiler(Isolate * isolate,Zone * zone,int capture_count,bool one_byte)1032 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
1033                                bool one_byte)
1034     : next_register_(2 * (capture_count + 1)),
1035       unicode_lookaround_stack_register_(kNoRegister),
1036       unicode_lookaround_position_register_(kNoRegister),
1037       work_list_(nullptr),
1038       recursion_depth_(0),
1039       one_byte_(one_byte),
1040       reg_exp_too_big_(false),
1041       limiting_recursion_(false),
1042       optimize_(FLAG_regexp_optimization),
1043       read_backward_(false),
1044       current_expansion_factor_(1),
1045       frequency_collator_(),
1046       isolate_(isolate),
1047       zone_(zone) {
1048   accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1049   DCHECK_GE(RegExpMacroAssembler::kMaxRegister, next_register_ - 1);
1050 }
1051 
Assemble(Isolate * isolate,RegExpMacroAssembler * macro_assembler,RegExpNode * start,int capture_count,Handle<String> pattern)1052 RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1053     Isolate* isolate, RegExpMacroAssembler* macro_assembler, RegExpNode* start,
1054     int capture_count, Handle<String> pattern) {
1055 #ifdef DEBUG
1056   if (FLAG_trace_regexp_assembler)
1057     macro_assembler_ = new RegExpMacroAssemblerTracer(isolate, macro_assembler);
1058   else
1059 #endif
1060     macro_assembler_ = macro_assembler;
1061 
1062   std::vector<RegExpNode*> work_list;
1063   work_list_ = &work_list;
1064   Label fail;
1065   macro_assembler_->PushBacktrack(&fail);
1066   Trace new_trace;
1067   start->Emit(this, &new_trace);
1068   macro_assembler_->Bind(&fail);
1069   macro_assembler_->Fail();
1070   while (!work_list.empty()) {
1071     RegExpNode* node = work_list.back();
1072     work_list.pop_back();
1073     node->set_on_work_list(false);
1074     if (!node->label()->is_bound()) node->Emit(this, &new_trace);
1075   }
1076   if (reg_exp_too_big_) {
1077     macro_assembler_->AbortedCodeGeneration();
1078     return IrregexpRegExpTooBig(isolate_);
1079   }
1080 
1081   Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1082   isolate->IncreaseTotalRegexpCodeGenerated(code->Size());
1083   work_list_ = nullptr;
1084 #if defined(ENABLE_DISASSEMBLER) && !defined(V8_INTERPRETED_REGEXP)
1085   if (FLAG_print_code) {
1086     CodeTracer::Scope trace_scope(isolate->GetCodeTracer());
1087     OFStream os(trace_scope.file());
1088     Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
1089   }
1090 #endif
1091 #ifdef DEBUG
1092   if (FLAG_trace_regexp_assembler) {
1093     delete macro_assembler_;
1094   }
1095 #endif
1096   return RegExpEngine::CompilationResult(*code, next_register_);
1097 }
1098 
1099 
Mentions(int that)1100 bool Trace::DeferredAction::Mentions(int that) {
1101   if (action_type() == ActionNode::CLEAR_CAPTURES) {
1102     Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1103     return range.Contains(that);
1104   } else {
1105     return reg() == that;
1106   }
1107 }
1108 
1109 
mentions_reg(int reg)1110 bool Trace::mentions_reg(int reg) {
1111   for (DeferredAction* action = actions_; action != nullptr;
1112        action = action->next()) {
1113     if (action->Mentions(reg))
1114       return true;
1115   }
1116   return false;
1117 }
1118 
1119 
GetStoredPosition(int reg,int * cp_offset)1120 bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1121   DCHECK_EQ(0, *cp_offset);
1122   for (DeferredAction* action = actions_; action != nullptr;
1123        action = action->next()) {
1124     if (action->Mentions(reg)) {
1125       if (action->action_type() == ActionNode::STORE_POSITION) {
1126         *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1127         return true;
1128       } else {
1129         return false;
1130       }
1131     }
1132   }
1133   return false;
1134 }
1135 
1136 
FindAffectedRegisters(OutSet * affected_registers,Zone * zone)1137 int Trace::FindAffectedRegisters(OutSet* affected_registers,
1138                                  Zone* zone) {
1139   int max_register = RegExpCompiler::kNoRegister;
1140   for (DeferredAction* action = actions_; action != nullptr;
1141        action = action->next()) {
1142     if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1143       Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1144       for (int i = range.from(); i <= range.to(); i++)
1145         affected_registers->Set(i, zone);
1146       if (range.to() > max_register) max_register = range.to();
1147     } else {
1148       affected_registers->Set(action->reg(), zone);
1149       if (action->reg() > max_register) max_register = action->reg();
1150     }
1151   }
1152   return max_register;
1153 }
1154 
1155 
RestoreAffectedRegisters(RegExpMacroAssembler * assembler,int max_register,const OutSet & registers_to_pop,const OutSet & registers_to_clear)1156 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1157                                      int max_register,
1158                                      const OutSet& registers_to_pop,
1159                                      const OutSet& registers_to_clear) {
1160   for (int reg = max_register; reg >= 0; reg--) {
1161     if (registers_to_pop.Get(reg)) {
1162       assembler->PopRegister(reg);
1163     } else if (registers_to_clear.Get(reg)) {
1164       int clear_to = reg;
1165       while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1166         reg--;
1167       }
1168       assembler->ClearRegisters(reg, clear_to);
1169     }
1170   }
1171 }
1172 
1173 
PerformDeferredActions(RegExpMacroAssembler * assembler,int max_register,const OutSet & affected_registers,OutSet * registers_to_pop,OutSet * registers_to_clear,Zone * zone)1174 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1175                                    int max_register,
1176                                    const OutSet& affected_registers,
1177                                    OutSet* registers_to_pop,
1178                                    OutSet* registers_to_clear,
1179                                    Zone* zone) {
1180   // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1181   const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1182 
1183   // Count pushes performed to force a stack limit check occasionally.
1184   int pushes = 0;
1185 
1186   for (int reg = 0; reg <= max_register; reg++) {
1187     if (!affected_registers.Get(reg)) {
1188       continue;
1189     }
1190 
1191     // The chronologically first deferred action in the trace
1192     // is used to infer the action needed to restore a register
1193     // to its previous state (or not, if it's safe to ignore it).
1194     enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1195     DeferredActionUndoType undo_action = IGNORE;
1196 
1197     int value = 0;
1198     bool absolute = false;
1199     bool clear = false;
1200     static const int kNoStore = kMinInt;
1201     int store_position = kNoStore;
1202     // This is a little tricky because we are scanning the actions in reverse
1203     // historical order (newest first).
1204     for (DeferredAction* action = actions_; action != nullptr;
1205          action = action->next()) {
1206       if (action->Mentions(reg)) {
1207         switch (action->action_type()) {
1208           case ActionNode::SET_REGISTER: {
1209             Trace::DeferredSetRegister* psr =
1210                 static_cast<Trace::DeferredSetRegister*>(action);
1211             if (!absolute) {
1212               value += psr->value();
1213               absolute = true;
1214             }
1215             // SET_REGISTER is currently only used for newly introduced loop
1216             // counters. They can have a significant previous value if they
1217             // occur in a loop. TODO(lrn): Propagate this information, so
1218             // we can set undo_action to IGNORE if we know there is no value to
1219             // restore.
1220             undo_action = RESTORE;
1221             DCHECK_EQ(store_position, kNoStore);
1222             DCHECK(!clear);
1223             break;
1224           }
1225           case ActionNode::INCREMENT_REGISTER:
1226             if (!absolute) {
1227               value++;
1228             }
1229             DCHECK_EQ(store_position, kNoStore);
1230             DCHECK(!clear);
1231             undo_action = RESTORE;
1232             break;
1233           case ActionNode::STORE_POSITION: {
1234             Trace::DeferredCapture* pc =
1235                 static_cast<Trace::DeferredCapture*>(action);
1236             if (!clear && store_position == kNoStore) {
1237               store_position = pc->cp_offset();
1238             }
1239 
1240             // For captures we know that stores and clears alternate.
1241             // Other register, are never cleared, and if the occur
1242             // inside a loop, they might be assigned more than once.
1243             if (reg <= 1) {
1244               // Registers zero and one, aka "capture zero", is
1245               // always set correctly if we succeed. There is no
1246               // need to undo a setting on backtrack, because we
1247               // will set it again or fail.
1248               undo_action = IGNORE;
1249             } else {
1250               undo_action = pc->is_capture() ? CLEAR : RESTORE;
1251             }
1252             DCHECK(!absolute);
1253             DCHECK_EQ(value, 0);
1254             break;
1255           }
1256           case ActionNode::CLEAR_CAPTURES: {
1257             // Since we're scanning in reverse order, if we've already
1258             // set the position we have to ignore historically earlier
1259             // clearing operations.
1260             if (store_position == kNoStore) {
1261               clear = true;
1262             }
1263             undo_action = RESTORE;
1264             DCHECK(!absolute);
1265             DCHECK_EQ(value, 0);
1266             break;
1267           }
1268           default:
1269             UNREACHABLE();
1270             break;
1271         }
1272       }
1273     }
1274     // Prepare for the undo-action (e.g., push if it's going to be popped).
1275     if (undo_action == RESTORE) {
1276       pushes++;
1277       RegExpMacroAssembler::StackCheckFlag stack_check =
1278           RegExpMacroAssembler::kNoStackLimitCheck;
1279       if (pushes == push_limit) {
1280         stack_check = RegExpMacroAssembler::kCheckStackLimit;
1281         pushes = 0;
1282       }
1283 
1284       assembler->PushRegister(reg, stack_check);
1285       registers_to_pop->Set(reg, zone);
1286     } else if (undo_action == CLEAR) {
1287       registers_to_clear->Set(reg, zone);
1288     }
1289     // Perform the chronologically last action (or accumulated increment)
1290     // for the register.
1291     if (store_position != kNoStore) {
1292       assembler->WriteCurrentPositionToRegister(reg, store_position);
1293     } else if (clear) {
1294       assembler->ClearRegisters(reg, reg);
1295     } else if (absolute) {
1296       assembler->SetRegister(reg, value);
1297     } else if (value != 0) {
1298       assembler->AdvanceRegister(reg, value);
1299     }
1300   }
1301 }
1302 
1303 
1304 // This is called as we come into a loop choice node and some other tricky
1305 // nodes.  It normalizes the state of the code generator to ensure we can
1306 // generate generic code.
Flush(RegExpCompiler * compiler,RegExpNode * successor)1307 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1308   RegExpMacroAssembler* assembler = compiler->macro_assembler();
1309 
1310   DCHECK(!is_trivial());
1311 
1312   if (actions_ == nullptr && backtrack() == nullptr) {
1313     // Here we just have some deferred cp advances to fix and we are back to
1314     // a normal situation.  We may also have to forget some information gained
1315     // through a quick check that was already performed.
1316     if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1317     // Create a new trivial state and generate the node with that.
1318     Trace new_state;
1319     successor->Emit(compiler, &new_state);
1320     return;
1321   }
1322 
1323   // Generate deferred actions here along with code to undo them again.
1324   OutSet affected_registers;
1325 
1326   if (backtrack() != nullptr) {
1327     // Here we have a concrete backtrack location.  These are set up by choice
1328     // nodes and so they indicate that we have a deferred save of the current
1329     // position which we may need to emit here.
1330     assembler->PushCurrentPosition();
1331   }
1332 
1333   int max_register = FindAffectedRegisters(&affected_registers,
1334                                            compiler->zone());
1335   OutSet registers_to_pop;
1336   OutSet registers_to_clear;
1337   PerformDeferredActions(assembler,
1338                          max_register,
1339                          affected_registers,
1340                          &registers_to_pop,
1341                          &registers_to_clear,
1342                          compiler->zone());
1343   if (cp_offset_ != 0) {
1344     assembler->AdvanceCurrentPosition(cp_offset_);
1345   }
1346 
1347   // Create a new trivial state and generate the node with that.
1348   Label undo;
1349   assembler->PushBacktrack(&undo);
1350   if (successor->KeepRecursing(compiler)) {
1351     Trace new_state;
1352     successor->Emit(compiler, &new_state);
1353   } else {
1354     compiler->AddWork(successor);
1355     assembler->GoTo(successor->label());
1356   }
1357 
1358   // On backtrack we need to restore state.
1359   assembler->Bind(&undo);
1360   RestoreAffectedRegisters(assembler,
1361                            max_register,
1362                            registers_to_pop,
1363                            registers_to_clear);
1364   if (backtrack() == nullptr) {
1365     assembler->Backtrack();
1366   } else {
1367     assembler->PopCurrentPosition();
1368     assembler->GoTo(backtrack());
1369   }
1370 }
1371 
1372 
Emit(RegExpCompiler * compiler,Trace * trace)1373 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1374   RegExpMacroAssembler* assembler = compiler->macro_assembler();
1375 
1376   // Omit flushing the trace. We discard the entire stack frame anyway.
1377 
1378   if (!label()->is_bound()) {
1379     // We are completely independent of the trace, since we ignore it,
1380     // so this code can be used as the generic version.
1381     assembler->Bind(label());
1382   }
1383 
1384   // Throw away everything on the backtrack stack since the start
1385   // of the negative submatch and restore the character position.
1386   assembler->ReadCurrentPositionFromRegister(current_position_register_);
1387   assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1388   if (clear_capture_count_ > 0) {
1389     // Clear any captures that might have been performed during the success
1390     // of the body of the negative look-ahead.
1391     int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1392     assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1393   }
1394   // Now that we have unwound the stack we find at the top of the stack the
1395   // backtrack that the BeginSubmatch node got.
1396   assembler->Backtrack();
1397 }
1398 
1399 
Emit(RegExpCompiler * compiler,Trace * trace)1400 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1401   if (!trace->is_trivial()) {
1402     trace->Flush(compiler, this);
1403     return;
1404   }
1405   RegExpMacroAssembler* assembler = compiler->macro_assembler();
1406   if (!label()->is_bound()) {
1407     assembler->Bind(label());
1408   }
1409   switch (action_) {
1410     case ACCEPT:
1411       assembler->Succeed();
1412       return;
1413     case BACKTRACK:
1414       assembler->GoTo(trace->backtrack());
1415       return;
1416     case NEGATIVE_SUBMATCH_SUCCESS:
1417       // This case is handled in a different virtual method.
1418       UNREACHABLE();
1419   }
1420   UNIMPLEMENTED();
1421 }
1422 
1423 
AddGuard(Guard * guard,Zone * zone)1424 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
1425   if (guards_ == nullptr) guards_ = new (zone) ZoneList<Guard*>(1, zone);
1426   guards_->Add(guard, zone);
1427 }
1428 
1429 
SetRegister(int reg,int val,RegExpNode * on_success)1430 ActionNode* ActionNode::SetRegister(int reg,
1431                                     int val,
1432                                     RegExpNode* on_success) {
1433   ActionNode* result =
1434       new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1435   result->data_.u_store_register.reg = reg;
1436   result->data_.u_store_register.value = val;
1437   return result;
1438 }
1439 
1440 
IncrementRegister(int reg,RegExpNode * on_success)1441 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1442   ActionNode* result =
1443       new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1444   result->data_.u_increment_register.reg = reg;
1445   return result;
1446 }
1447 
1448 
StorePosition(int reg,bool is_capture,RegExpNode * on_success)1449 ActionNode* ActionNode::StorePosition(int reg,
1450                                       bool is_capture,
1451                                       RegExpNode* on_success) {
1452   ActionNode* result =
1453       new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1454   result->data_.u_position_register.reg = reg;
1455   result->data_.u_position_register.is_capture = is_capture;
1456   return result;
1457 }
1458 
1459 
ClearCaptures(Interval range,RegExpNode * on_success)1460 ActionNode* ActionNode::ClearCaptures(Interval range,
1461                                       RegExpNode* on_success) {
1462   ActionNode* result =
1463       new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1464   result->data_.u_clear_captures.range_from = range.from();
1465   result->data_.u_clear_captures.range_to = range.to();
1466   return result;
1467 }
1468 
1469 
BeginSubmatch(int stack_reg,int position_reg,RegExpNode * on_success)1470 ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1471                                       int position_reg,
1472                                       RegExpNode* on_success) {
1473   ActionNode* result =
1474       new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1475   result->data_.u_submatch.stack_pointer_register = stack_reg;
1476   result->data_.u_submatch.current_position_register = position_reg;
1477   return result;
1478 }
1479 
1480 
PositiveSubmatchSuccess(int stack_reg,int position_reg,int clear_register_count,int clear_register_from,RegExpNode * on_success)1481 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1482                                                 int position_reg,
1483                                                 int clear_register_count,
1484                                                 int clear_register_from,
1485                                                 RegExpNode* on_success) {
1486   ActionNode* result =
1487       new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1488   result->data_.u_submatch.stack_pointer_register = stack_reg;
1489   result->data_.u_submatch.current_position_register = position_reg;
1490   result->data_.u_submatch.clear_register_count = clear_register_count;
1491   result->data_.u_submatch.clear_register_from = clear_register_from;
1492   return result;
1493 }
1494 
1495 
EmptyMatchCheck(int start_register,int repetition_register,int repetition_limit,RegExpNode * on_success)1496 ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1497                                         int repetition_register,
1498                                         int repetition_limit,
1499                                         RegExpNode* on_success) {
1500   ActionNode* result =
1501       new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1502   result->data_.u_empty_match_check.start_register = start_register;
1503   result->data_.u_empty_match_check.repetition_register = repetition_register;
1504   result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1505   return result;
1506 }
1507 
1508 
1509 #define DEFINE_ACCEPT(Type)                                          \
1510   void Type##Node::Accept(NodeVisitor* visitor) {                    \
1511     visitor->Visit##Type(this);                                      \
1512   }
FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)1513 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1514 #undef DEFINE_ACCEPT
1515 
1516 
1517 void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1518   visitor->VisitLoopChoice(this);
1519 }
1520 
1521 
1522 // -------------------------------------------------------------------
1523 // Emit code.
1524 
1525 
GenerateGuard(RegExpMacroAssembler * macro_assembler,Guard * guard,Trace * trace)1526 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1527                                Guard* guard,
1528                                Trace* trace) {
1529   switch (guard->op()) {
1530     case Guard::LT:
1531       DCHECK(!trace->mentions_reg(guard->reg()));
1532       macro_assembler->IfRegisterGE(guard->reg(),
1533                                     guard->value(),
1534                                     trace->backtrack());
1535       break;
1536     case Guard::GEQ:
1537       DCHECK(!trace->mentions_reg(guard->reg()));
1538       macro_assembler->IfRegisterLT(guard->reg(),
1539                                     guard->value(),
1540                                     trace->backtrack());
1541       break;
1542   }
1543 }
1544 
1545 
1546 // Returns the number of characters in the equivalence class, omitting those
1547 // that cannot occur in the source string because it is Latin1.
GetCaseIndependentLetters(Isolate * isolate,uc16 character,bool one_byte_subject,unibrow::uchar * letters)1548 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1549                                      bool one_byte_subject,
1550                                      unibrow::uchar* letters) {
1551   int length =
1552       isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1553   // Unibrow returns 0 or 1 for characters where case independence is
1554   // trivial.
1555   if (length == 0) {
1556     letters[0] = character;
1557     length = 1;
1558   }
1559 
1560   if (one_byte_subject) {
1561     int new_length = 0;
1562     for (int i = 0; i < length; i++) {
1563       if (letters[i] <= String::kMaxOneByteCharCode) {
1564         letters[new_length++] = letters[i];
1565       }
1566     }
1567     length = new_length;
1568   }
1569 
1570   return length;
1571 }
1572 
1573 
EmitSimpleCharacter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1574 static inline bool EmitSimpleCharacter(Isolate* isolate,
1575                                        RegExpCompiler* compiler,
1576                                        uc16 c,
1577                                        Label* on_failure,
1578                                        int cp_offset,
1579                                        bool check,
1580                                        bool preloaded) {
1581   RegExpMacroAssembler* assembler = compiler->macro_assembler();
1582   bool bound_checked = false;
1583   if (!preloaded) {
1584     assembler->LoadCurrentCharacter(
1585         cp_offset,
1586         on_failure,
1587         check);
1588     bound_checked = true;
1589   }
1590   assembler->CheckNotCharacter(c, on_failure);
1591   return bound_checked;
1592 }
1593 
1594 
1595 // Only emits non-letters (things that don't have case).  Only used for case
1596 // independent matches.
EmitAtomNonLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1597 static inline bool EmitAtomNonLetter(Isolate* isolate,
1598                                      RegExpCompiler* compiler,
1599                                      uc16 c,
1600                                      Label* on_failure,
1601                                      int cp_offset,
1602                                      bool check,
1603                                      bool preloaded) {
1604   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1605   bool one_byte = compiler->one_byte();
1606   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1607   int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1608   if (length < 1) {
1609     // This can't match.  Must be an one-byte subject and a non-one-byte
1610     // character.  We do not need to do anything since the one-byte pass
1611     // already handled this.
1612     return false;  // Bounds not checked.
1613   }
1614   bool checked = false;
1615   // We handle the length > 1 case in a later pass.
1616   if (length == 1) {
1617     if (one_byte && c > String::kMaxOneByteCharCodeU) {
1618       // Can't match - see above.
1619       return false;  // Bounds not checked.
1620     }
1621     if (!preloaded) {
1622       macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1623       checked = check;
1624     }
1625     macro_assembler->CheckNotCharacter(c, on_failure);
1626   }
1627   return checked;
1628 }
1629 
1630 
ShortCutEmitCharacterPair(RegExpMacroAssembler * macro_assembler,bool one_byte,uc16 c1,uc16 c2,Label * on_failure)1631 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1632                                       bool one_byte, uc16 c1, uc16 c2,
1633                                       Label* on_failure) {
1634   uc16 char_mask;
1635   if (one_byte) {
1636     char_mask = String::kMaxOneByteCharCode;
1637   } else {
1638     char_mask = String::kMaxUtf16CodeUnit;
1639   }
1640   uc16 exor = c1 ^ c2;
1641   // Check whether exor has only one bit set.
1642   if (((exor - 1) & exor) == 0) {
1643     // If c1 and c2 differ only by one bit.
1644     // Ecma262UnCanonicalize always gives the highest number last.
1645     DCHECK(c2 > c1);
1646     uc16 mask = char_mask ^ exor;
1647     macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1648     return true;
1649   }
1650   DCHECK(c2 > c1);
1651   uc16 diff = c2 - c1;
1652   if (((diff - 1) & diff) == 0 && c1 >= diff) {
1653     // If the characters differ by 2^n but don't differ by one bit then
1654     // subtract the difference from the found character, then do the or
1655     // trick.  We avoid the theoretical case where negative numbers are
1656     // involved in order to simplify code generation.
1657     uc16 mask = char_mask ^ diff;
1658     macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1659                                                     diff,
1660                                                     mask,
1661                                                     on_failure);
1662     return true;
1663   }
1664   return false;
1665 }
1666 
1667 
1668 typedef bool EmitCharacterFunction(Isolate* isolate,
1669                                    RegExpCompiler* compiler,
1670                                    uc16 c,
1671                                    Label* on_failure,
1672                                    int cp_offset,
1673                                    bool check,
1674                                    bool preloaded);
1675 
1676 // Only emits letters (things that have case).  Only used for case independent
1677 // matches.
EmitAtomLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1678 static inline bool EmitAtomLetter(Isolate* isolate,
1679                                   RegExpCompiler* compiler,
1680                                   uc16 c,
1681                                   Label* on_failure,
1682                                   int cp_offset,
1683                                   bool check,
1684                                   bool preloaded) {
1685   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1686   bool one_byte = compiler->one_byte();
1687   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1688   int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1689   if (length <= 1) return false;
1690   // We may not need to check against the end of the input string
1691   // if this character lies before a character that matched.
1692   if (!preloaded) {
1693     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1694   }
1695   Label ok;
1696   DCHECK_EQ(4, unibrow::Ecma262UnCanonicalize::kMaxWidth);
1697   switch (length) {
1698     case 2: {
1699       if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1700                                     chars[1], on_failure)) {
1701       } else {
1702         macro_assembler->CheckCharacter(chars[0], &ok);
1703         macro_assembler->CheckNotCharacter(chars[1], on_failure);
1704         macro_assembler->Bind(&ok);
1705       }
1706       break;
1707     }
1708     case 4:
1709       macro_assembler->CheckCharacter(chars[3], &ok);
1710       V8_FALLTHROUGH;
1711     case 3:
1712       macro_assembler->CheckCharacter(chars[0], &ok);
1713       macro_assembler->CheckCharacter(chars[1], &ok);
1714       macro_assembler->CheckNotCharacter(chars[2], on_failure);
1715       macro_assembler->Bind(&ok);
1716       break;
1717     default:
1718       UNREACHABLE();
1719       break;
1720   }
1721   return true;
1722 }
1723 
1724 
EmitBoundaryTest(RegExpMacroAssembler * masm,int border,Label * fall_through,Label * above_or_equal,Label * below)1725 static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1726                              int border,
1727                              Label* fall_through,
1728                              Label* above_or_equal,
1729                              Label* below) {
1730   if (below != fall_through) {
1731     masm->CheckCharacterLT(border, below);
1732     if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1733   } else {
1734     masm->CheckCharacterGT(border - 1, above_or_equal);
1735   }
1736 }
1737 
1738 
EmitDoubleBoundaryTest(RegExpMacroAssembler * masm,int first,int last,Label * fall_through,Label * in_range,Label * out_of_range)1739 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1740                                    int first,
1741                                    int last,
1742                                    Label* fall_through,
1743                                    Label* in_range,
1744                                    Label* out_of_range) {
1745   if (in_range == fall_through) {
1746     if (first == last) {
1747       masm->CheckNotCharacter(first, out_of_range);
1748     } else {
1749       masm->CheckCharacterNotInRange(first, last, out_of_range);
1750     }
1751   } else {
1752     if (first == last) {
1753       masm->CheckCharacter(first, in_range);
1754     } else {
1755       masm->CheckCharacterInRange(first, last, in_range);
1756     }
1757     if (out_of_range != fall_through) masm->GoTo(out_of_range);
1758   }
1759 }
1760 
1761 
1762 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1763 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
EmitUseLookupTable(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,int min_char,Label * fall_through,Label * even_label,Label * odd_label)1764 static void EmitUseLookupTable(
1765     RegExpMacroAssembler* masm,
1766     ZoneList<int>* ranges,
1767     int start_index,
1768     int end_index,
1769     int min_char,
1770     Label* fall_through,
1771     Label* even_label,
1772     Label* odd_label) {
1773   static const int kSize = RegExpMacroAssembler::kTableSize;
1774   static const int kMask = RegExpMacroAssembler::kTableMask;
1775 
1776   int base = (min_char & ~kMask);
1777   USE(base);
1778 
1779   // Assert that everything is on one kTableSize page.
1780   for (int i = start_index; i <= end_index; i++) {
1781     DCHECK_EQ(ranges->at(i) & ~kMask, base);
1782   }
1783   DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1784 
1785   char templ[kSize];
1786   Label* on_bit_set;
1787   Label* on_bit_clear;
1788   int bit;
1789   if (even_label == fall_through) {
1790     on_bit_set = odd_label;
1791     on_bit_clear = even_label;
1792     bit = 1;
1793   } else {
1794     on_bit_set = even_label;
1795     on_bit_clear = odd_label;
1796     bit = 0;
1797   }
1798   for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1799     templ[i] = bit;
1800   }
1801   int j = 0;
1802   bit ^= 1;
1803   for (int i = start_index; i < end_index; i++) {
1804     for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1805       templ[j] = bit;
1806     }
1807     bit ^= 1;
1808   }
1809   for (int i = j; i < kSize; i++) {
1810     templ[i] = bit;
1811   }
1812   Factory* factory = masm->isolate()->factory();
1813   // TODO(erikcorry): Cache these.
1814   Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
1815   for (int i = 0; i < kSize; i++) {
1816     ba->set(i, templ[i]);
1817   }
1818   masm->CheckBitInTable(ba, on_bit_set);
1819   if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1820 }
1821 
1822 
CutOutRange(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,int cut_index,Label * even_label,Label * odd_label)1823 static void CutOutRange(RegExpMacroAssembler* masm,
1824                         ZoneList<int>* ranges,
1825                         int start_index,
1826                         int end_index,
1827                         int cut_index,
1828                         Label* even_label,
1829                         Label* odd_label) {
1830   bool odd = (((cut_index - start_index) & 1) == 1);
1831   Label* in_range_label = odd ? odd_label : even_label;
1832   Label dummy;
1833   EmitDoubleBoundaryTest(masm,
1834                          ranges->at(cut_index),
1835                          ranges->at(cut_index + 1) - 1,
1836                          &dummy,
1837                          in_range_label,
1838                          &dummy);
1839   DCHECK(!dummy.is_linked());
1840   // Cut out the single range by rewriting the array.  This creates a new
1841   // range that is a merger of the two ranges on either side of the one we
1842   // are cutting out.  The oddity of the labels is preserved.
1843   for (int j = cut_index; j > start_index; j--) {
1844     ranges->at(j) = ranges->at(j - 1);
1845   }
1846   for (int j = cut_index + 1; j < end_index; j++) {
1847     ranges->at(j) = ranges->at(j + 1);
1848   }
1849 }
1850 
1851 
1852 // Unicode case.  Split the search space into kSize spaces that are handled
1853 // with recursion.
SplitSearchSpace(ZoneList<int> * ranges,int start_index,int end_index,int * new_start_index,int * new_end_index,int * border)1854 static void SplitSearchSpace(ZoneList<int>* ranges,
1855                              int start_index,
1856                              int end_index,
1857                              int* new_start_index,
1858                              int* new_end_index,
1859                              int* border) {
1860   static const int kSize = RegExpMacroAssembler::kTableSize;
1861   static const int kMask = RegExpMacroAssembler::kTableMask;
1862 
1863   int first = ranges->at(start_index);
1864   int last = ranges->at(end_index) - 1;
1865 
1866   *new_start_index = start_index;
1867   *border = (ranges->at(start_index) & ~kMask) + kSize;
1868   while (*new_start_index < end_index) {
1869     if (ranges->at(*new_start_index) > *border) break;
1870     (*new_start_index)++;
1871   }
1872   // new_start_index is the index of the first edge that is beyond the
1873   // current kSize space.
1874 
1875   // For very large search spaces we do a binary chop search of the non-Latin1
1876   // space instead of just going to the end of the current kSize space.  The
1877   // heuristics are complicated a little by the fact that any 128-character
1878   // encoding space can be quickly tested with a table lookup, so we don't
1879   // wish to do binary chop search at a smaller granularity than that.  A
1880   // 128-character space can take up a lot of space in the ranges array if,
1881   // for example, we only want to match every second character (eg. the lower
1882   // case characters on some Unicode pages).
1883   int binary_chop_index = (end_index + start_index) / 2;
1884   // The first test ensures that we get to the code that handles the Latin1
1885   // range with a single not-taken branch, speeding up this important
1886   // character range (even non-Latin1 charset-based text has spaces and
1887   // punctuation).
1888   if (*border - 1 > String::kMaxOneByteCharCode &&  // Latin1 case.
1889       end_index - start_index > (*new_start_index - start_index) * 2 &&
1890       last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1891       ranges->at(binary_chop_index) >= first + 2 * kSize) {
1892     int scan_forward_for_section_border = binary_chop_index;;
1893     int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1894 
1895     while (scan_forward_for_section_border < end_index) {
1896       if (ranges->at(scan_forward_for_section_border) > new_border) {
1897         *new_start_index = scan_forward_for_section_border;
1898         *border = new_border;
1899         break;
1900       }
1901       scan_forward_for_section_border++;
1902     }
1903   }
1904 
1905   DCHECK(*new_start_index > start_index);
1906   *new_end_index = *new_start_index - 1;
1907   if (ranges->at(*new_end_index) == *border) {
1908     (*new_end_index)--;
1909   }
1910   if (*border >= ranges->at(end_index)) {
1911     *border = ranges->at(end_index);
1912     *new_start_index = end_index;  // Won't be used.
1913     *new_end_index = end_index - 1;
1914   }
1915 }
1916 
1917 // Gets a series of segment boundaries representing a character class.  If the
1918 // character is in the range between an even and an odd boundary (counting from
1919 // start_index) then go to even_label, otherwise go to odd_label.  We already
1920 // know that the character is in the range of min_char to max_char inclusive.
1921 // Either label can be nullptr indicating backtracking.  Either label can also
1922 // be equal to the fall_through label.
GenerateBranches(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,uc32 min_char,uc32 max_char,Label * fall_through,Label * even_label,Label * odd_label)1923 static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
1924                              int start_index, int end_index, uc32 min_char,
1925                              uc32 max_char, Label* fall_through,
1926                              Label* even_label, Label* odd_label) {
1927   DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
1928   DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
1929 
1930   int first = ranges->at(start_index);
1931   int last = ranges->at(end_index) - 1;
1932 
1933   DCHECK_LT(min_char, first);
1934 
1935   // Just need to test if the character is before or on-or-after
1936   // a particular character.
1937   if (start_index == end_index) {
1938     EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1939     return;
1940   }
1941 
1942   // Another almost trivial case:  There is one interval in the middle that is
1943   // different from the end intervals.
1944   if (start_index + 1 == end_index) {
1945     EmitDoubleBoundaryTest(
1946         masm, first, last, fall_through, even_label, odd_label);
1947     return;
1948   }
1949 
1950   // It's not worth using table lookup if there are very few intervals in the
1951   // character class.
1952   if (end_index - start_index <= 6) {
1953     // It is faster to test for individual characters, so we look for those
1954     // first, then try arbitrary ranges in the second round.
1955     static int kNoCutIndex = -1;
1956     int cut = kNoCutIndex;
1957     for (int i = start_index; i < end_index; i++) {
1958       if (ranges->at(i) == ranges->at(i + 1) - 1) {
1959         cut = i;
1960         break;
1961       }
1962     }
1963     if (cut == kNoCutIndex) cut = start_index;
1964     CutOutRange(
1965         masm, ranges, start_index, end_index, cut, even_label, odd_label);
1966     DCHECK_GE(end_index - start_index, 2);
1967     GenerateBranches(masm,
1968                      ranges,
1969                      start_index + 1,
1970                      end_index - 1,
1971                      min_char,
1972                      max_char,
1973                      fall_through,
1974                      even_label,
1975                      odd_label);
1976     return;
1977   }
1978 
1979   // If there are a lot of intervals in the regexp, then we will use tables to
1980   // determine whether the character is inside or outside the character class.
1981   static const int kBits = RegExpMacroAssembler::kTableSizeBits;
1982 
1983   if ((max_char >> kBits) == (min_char >> kBits)) {
1984     EmitUseLookupTable(masm,
1985                        ranges,
1986                        start_index,
1987                        end_index,
1988                        min_char,
1989                        fall_through,
1990                        even_label,
1991                        odd_label);
1992     return;
1993   }
1994 
1995   if ((min_char >> kBits) != (first >> kBits)) {
1996     masm->CheckCharacterLT(first, odd_label);
1997     GenerateBranches(masm,
1998                      ranges,
1999                      start_index + 1,
2000                      end_index,
2001                      first,
2002                      max_char,
2003                      fall_through,
2004                      odd_label,
2005                      even_label);
2006     return;
2007   }
2008 
2009   int new_start_index = 0;
2010   int new_end_index = 0;
2011   int border = 0;
2012 
2013   SplitSearchSpace(ranges,
2014                    start_index,
2015                    end_index,
2016                    &new_start_index,
2017                    &new_end_index,
2018                    &border);
2019 
2020   Label handle_rest;
2021   Label* above = &handle_rest;
2022   if (border == last + 1) {
2023     // We didn't find any section that started after the limit, so everything
2024     // above the border is one of the terminal labels.
2025     above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2026     DCHECK(new_end_index == end_index - 1);
2027   }
2028 
2029   DCHECK_LE(start_index, new_end_index);
2030   DCHECK_LE(new_start_index, end_index);
2031   DCHECK_LT(start_index, new_start_index);
2032   DCHECK_LT(new_end_index, end_index);
2033   DCHECK(new_end_index + 1 == new_start_index ||
2034          (new_end_index + 2 == new_start_index &&
2035           border == ranges->at(new_end_index + 1)));
2036   DCHECK_LT(min_char, border - 1);
2037   DCHECK_LT(border, max_char);
2038   DCHECK_LT(ranges->at(new_end_index), border);
2039   DCHECK(border < ranges->at(new_start_index) ||
2040          (border == ranges->at(new_start_index) &&
2041           new_start_index == end_index &&
2042           new_end_index == end_index - 1 &&
2043           border == last + 1));
2044   DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2045 
2046   masm->CheckCharacterGT(border - 1, above);
2047   Label dummy;
2048   GenerateBranches(masm,
2049                    ranges,
2050                    start_index,
2051                    new_end_index,
2052                    min_char,
2053                    border - 1,
2054                    &dummy,
2055                    even_label,
2056                    odd_label);
2057   if (handle_rest.is_linked()) {
2058     masm->Bind(&handle_rest);
2059     bool flip = (new_start_index & 1) != (start_index & 1);
2060     GenerateBranches(masm,
2061                      ranges,
2062                      new_start_index,
2063                      end_index,
2064                      border,
2065                      max_char,
2066                      &dummy,
2067                      flip ? odd_label : even_label,
2068                      flip ? even_label : odd_label);
2069   }
2070 }
2071 
2072 
EmitCharClass(RegExpMacroAssembler * macro_assembler,RegExpCharacterClass * cc,bool one_byte,Label * on_failure,int cp_offset,bool check_offset,bool preloaded,Zone * zone)2073 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2074                           RegExpCharacterClass* cc, bool one_byte,
2075                           Label* on_failure, int cp_offset, bool check_offset,
2076                           bool preloaded, Zone* zone) {
2077   ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2078   CharacterRange::Canonicalize(ranges);
2079 
2080   int max_char;
2081   if (one_byte) {
2082     max_char = String::kMaxOneByteCharCode;
2083   } else {
2084     max_char = String::kMaxUtf16CodeUnit;
2085   }
2086 
2087   int range_count = ranges->length();
2088 
2089   int last_valid_range = range_count - 1;
2090   while (last_valid_range >= 0) {
2091     CharacterRange& range = ranges->at(last_valid_range);
2092     if (range.from() <= max_char) {
2093       break;
2094     }
2095     last_valid_range--;
2096   }
2097 
2098   if (last_valid_range < 0) {
2099     if (!cc->is_negated()) {
2100       macro_assembler->GoTo(on_failure);
2101     }
2102     if (check_offset) {
2103       macro_assembler->CheckPosition(cp_offset, on_failure);
2104     }
2105     return;
2106   }
2107 
2108   if (last_valid_range == 0 &&
2109       ranges->at(0).IsEverything(max_char)) {
2110     if (cc->is_negated()) {
2111       macro_assembler->GoTo(on_failure);
2112     } else {
2113       // This is a common case hit by non-anchored expressions.
2114       if (check_offset) {
2115         macro_assembler->CheckPosition(cp_offset, on_failure);
2116       }
2117     }
2118     return;
2119   }
2120 
2121   if (!preloaded) {
2122     macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2123   }
2124 
2125   if (cc->is_standard(zone) &&
2126       macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2127                                                   on_failure)) {
2128       return;
2129   }
2130 
2131 
2132   // A new list with ascending entries.  Each entry is a code unit
2133   // where there is a boundary between code units that are part of
2134   // the class and code units that are not.  Normally we insert an
2135   // entry at zero which goes to the failure label, but if there
2136   // was already one there we fall through for success on that entry.
2137   // Subsequent entries have alternating meaning (success/failure).
2138   ZoneList<int>* range_boundaries =
2139       new(zone) ZoneList<int>(last_valid_range, zone);
2140 
2141   bool zeroth_entry_is_failure = !cc->is_negated();
2142 
2143   for (int i = 0; i <= last_valid_range; i++) {
2144     CharacterRange& range = ranges->at(i);
2145     if (range.from() == 0) {
2146       DCHECK_EQ(i, 0);
2147       zeroth_entry_is_failure = !zeroth_entry_is_failure;
2148     } else {
2149       range_boundaries->Add(range.from(), zone);
2150     }
2151     range_boundaries->Add(range.to() + 1, zone);
2152   }
2153   int end_index = range_boundaries->length() - 1;
2154   if (range_boundaries->at(end_index) > max_char) {
2155     end_index--;
2156   }
2157 
2158   Label fall_through;
2159   GenerateBranches(macro_assembler,
2160                    range_boundaries,
2161                    0,  // start_index.
2162                    end_index,
2163                    0,  // min_char.
2164                    max_char,
2165                    &fall_through,
2166                    zeroth_entry_is_failure ? &fall_through : on_failure,
2167                    zeroth_entry_is_failure ? on_failure : &fall_through);
2168   macro_assembler->Bind(&fall_through);
2169 }
2170 
2171 
~RegExpNode()2172 RegExpNode::~RegExpNode() {
2173 }
2174 
2175 
LimitVersions(RegExpCompiler * compiler,Trace * trace)2176 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2177                                                   Trace* trace) {
2178   // If we are generating a greedy loop then don't stop and don't reuse code.
2179   if (trace->stop_node() != nullptr) {
2180     return CONTINUE;
2181   }
2182 
2183   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2184   if (trace->is_trivial()) {
2185     if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
2186       // If a generic version is already scheduled to be generated or we have
2187       // recursed too deeply then just generate a jump to that code.
2188       macro_assembler->GoTo(&label_);
2189       // This will queue it up for generation of a generic version if it hasn't
2190       // already been queued.
2191       compiler->AddWork(this);
2192       return DONE;
2193     }
2194     // Generate generic version of the node and bind the label for later use.
2195     macro_assembler->Bind(&label_);
2196     return CONTINUE;
2197   }
2198 
2199   // We are being asked to make a non-generic version.  Keep track of how many
2200   // non-generic versions we generate so as not to overdo it.
2201   trace_count_++;
2202   if (KeepRecursing(compiler) && compiler->optimize() &&
2203       trace_count_ < kMaxCopiesCodeGenerated) {
2204     return CONTINUE;
2205   }
2206 
2207   // If we get here code has been generated for this node too many times or
2208   // recursion is too deep.  Time to switch to a generic version.  The code for
2209   // generic versions above can handle deep recursion properly.
2210   bool was_limiting = compiler->limiting_recursion();
2211   compiler->set_limiting_recursion(true);
2212   trace->Flush(compiler, this);
2213   compiler->set_limiting_recursion(was_limiting);
2214   return DONE;
2215 }
2216 
2217 
KeepRecursing(RegExpCompiler * compiler)2218 bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
2219   return !compiler->limiting_recursion() &&
2220          compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
2221 }
2222 
2223 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2224 int ActionNode::EatsAtLeast(int still_to_find,
2225                             int budget,
2226                             bool not_at_start) {
2227   if (budget <= 0) return 0;
2228   if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0;  // Rewinds input!
2229   return on_success()->EatsAtLeast(still_to_find,
2230                                    budget - 1,
2231                                    not_at_start);
2232 }
2233 
2234 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2235 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2236                               BoyerMooreLookahead* bm, bool not_at_start) {
2237   if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2238     on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2239   }
2240   SaveBMInfo(bm, not_at_start, offset);
2241 }
2242 
2243 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2244 int AssertionNode::EatsAtLeast(int still_to_find,
2245                                int budget,
2246                                bool not_at_start) {
2247   if (budget <= 0) return 0;
2248   // If we know we are not at the start and we are asked "how many characters
2249   // will you match if you succeed?" then we can answer anything since false
2250   // implies false.  So lets just return the max answer (still_to_find) since
2251   // that won't prevent us from preloading a lot of characters for the other
2252   // branches in the node graph.
2253   if (assertion_type() == AT_START && not_at_start) return still_to_find;
2254   return on_success()->EatsAtLeast(still_to_find,
2255                                    budget - 1,
2256                                    not_at_start);
2257 }
2258 
2259 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2260 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2261                                  BoyerMooreLookahead* bm, bool not_at_start) {
2262   // Match the behaviour of EatsAtLeast on this node.
2263   if (assertion_type() == AT_START && not_at_start) return;
2264   on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2265   SaveBMInfo(bm, not_at_start, offset);
2266 }
2267 
2268 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2269 int BackReferenceNode::EatsAtLeast(int still_to_find,
2270                                    int budget,
2271                                    bool not_at_start) {
2272   if (read_backward()) return 0;
2273   if (budget <= 0) return 0;
2274   return on_success()->EatsAtLeast(still_to_find,
2275                                    budget - 1,
2276                                    not_at_start);
2277 }
2278 
2279 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2280 int TextNode::EatsAtLeast(int still_to_find,
2281                           int budget,
2282                           bool not_at_start) {
2283   if (read_backward()) return 0;
2284   int answer = Length();
2285   if (answer >= still_to_find) return answer;
2286   if (budget <= 0) return answer;
2287   // We are not at start after this node so we set the last argument to 'true'.
2288   return answer + on_success()->EatsAtLeast(still_to_find - answer,
2289                                             budget - 1,
2290                                             true);
2291 }
2292 
2293 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2294 int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
2295                                               bool not_at_start) {
2296   if (budget <= 0) return 0;
2297   // Alternative 0 is the negative lookahead, alternative 1 is what comes
2298   // afterwards.
2299   RegExpNode* node = alternatives_->at(1).node();
2300   return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2301 }
2302 
2303 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)2304 void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
2305     QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
2306     bool not_at_start) {
2307   // Alternative 0 is the negative lookahead, alternative 1 is what comes
2308   // afterwards.
2309   RegExpNode* node = alternatives_->at(1).node();
2310   return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2311 }
2312 
2313 
EatsAtLeastHelper(int still_to_find,int budget,RegExpNode * ignore_this_node,bool not_at_start)2314 int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2315                                   int budget,
2316                                   RegExpNode* ignore_this_node,
2317                                   bool not_at_start) {
2318   if (budget <= 0) return 0;
2319   int min = 100;
2320   int choice_count = alternatives_->length();
2321   budget = (budget - 1) / choice_count;
2322   for (int i = 0; i < choice_count; i++) {
2323     RegExpNode* node = alternatives_->at(i).node();
2324     if (node == ignore_this_node) continue;
2325     int node_eats_at_least =
2326         node->EatsAtLeast(still_to_find, budget, not_at_start);
2327     if (node_eats_at_least < min) min = node_eats_at_least;
2328     if (min == 0) return 0;
2329   }
2330   return min;
2331 }
2332 
2333 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2334 int LoopChoiceNode::EatsAtLeast(int still_to_find,
2335                                 int budget,
2336                                 bool not_at_start) {
2337   return EatsAtLeastHelper(still_to_find,
2338                            budget - 1,
2339                            loop_node_,
2340                            not_at_start);
2341 }
2342 
2343 
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2344 int ChoiceNode::EatsAtLeast(int still_to_find,
2345                             int budget,
2346                             bool not_at_start) {
2347   return EatsAtLeastHelper(still_to_find, budget, nullptr, not_at_start);
2348 }
2349 
2350 
2351 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
SmearBitsRight(uint32_t v)2352 static inline uint32_t SmearBitsRight(uint32_t v) {
2353   v |= v >> 1;
2354   v |= v >> 2;
2355   v |= v >> 4;
2356   v |= v >> 8;
2357   v |= v >> 16;
2358   return v;
2359 }
2360 
2361 
Rationalize(bool asc)2362 bool QuickCheckDetails::Rationalize(bool asc) {
2363   bool found_useful_op = false;
2364   uint32_t char_mask;
2365   if (asc) {
2366     char_mask = String::kMaxOneByteCharCode;
2367   } else {
2368     char_mask = String::kMaxUtf16CodeUnit;
2369   }
2370   mask_ = 0;
2371   value_ = 0;
2372   int char_shift = 0;
2373   for (int i = 0; i < characters_; i++) {
2374     Position* pos = &positions_[i];
2375     if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2376       found_useful_op = true;
2377     }
2378     mask_ |= (pos->mask & char_mask) << char_shift;
2379     value_ |= (pos->value & char_mask) << char_shift;
2380     char_shift += asc ? 8 : 16;
2381   }
2382   return found_useful_op;
2383 }
2384 
2385 
EmitQuickCheck(RegExpCompiler * compiler,Trace * bounds_check_trace,Trace * trace,bool preload_has_checked_bounds,Label * on_possible_success,QuickCheckDetails * details,bool fall_through_on_failure)2386 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
2387                                 Trace* bounds_check_trace,
2388                                 Trace* trace,
2389                                 bool preload_has_checked_bounds,
2390                                 Label* on_possible_success,
2391                                 QuickCheckDetails* details,
2392                                 bool fall_through_on_failure) {
2393   if (details->characters() == 0) return false;
2394   GetQuickCheckDetails(
2395       details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2396   if (details->cannot_match()) return false;
2397   if (!details->Rationalize(compiler->one_byte())) return false;
2398   DCHECK(details->characters() == 1 ||
2399          compiler->macro_assembler()->CanReadUnaligned());
2400   uint32_t mask = details->mask();
2401   uint32_t value = details->value();
2402 
2403   RegExpMacroAssembler* assembler = compiler->macro_assembler();
2404 
2405   if (trace->characters_preloaded() != details->characters()) {
2406     DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
2407     // We are attempting to preload the minimum number of characters
2408     // any choice would eat, so if the bounds check fails, then none of the
2409     // choices can succeed, so we can just immediately backtrack, rather
2410     // than go to the next choice.
2411     assembler->LoadCurrentCharacter(trace->cp_offset(),
2412                                     bounds_check_trace->backtrack(),
2413                                     !preload_has_checked_bounds,
2414                                     details->characters());
2415   }
2416 
2417 
2418   bool need_mask = true;
2419 
2420   if (details->characters() == 1) {
2421     // If number of characters preloaded is 1 then we used a byte or 16 bit
2422     // load so the value is already masked down.
2423     uint32_t char_mask;
2424     if (compiler->one_byte()) {
2425       char_mask = String::kMaxOneByteCharCode;
2426     } else {
2427       char_mask = String::kMaxUtf16CodeUnit;
2428     }
2429     if ((mask & char_mask) == char_mask) need_mask = false;
2430     mask &= char_mask;
2431   } else {
2432     // For 2-character preloads in one-byte mode or 1-character preloads in
2433     // two-byte mode we also use a 16 bit load with zero extend.
2434     static const uint32_t kTwoByteMask = 0xFFFF;
2435     static const uint32_t kFourByteMask = 0xFFFFFFFF;
2436     if (details->characters() == 2 && compiler->one_byte()) {
2437       if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2438     } else if (details->characters() == 1 && !compiler->one_byte()) {
2439       if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2440     } else {
2441       if (mask == kFourByteMask) need_mask = false;
2442     }
2443   }
2444 
2445   if (fall_through_on_failure) {
2446     if (need_mask) {
2447       assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2448     } else {
2449       assembler->CheckCharacter(value, on_possible_success);
2450     }
2451   } else {
2452     if (need_mask) {
2453       assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2454     } else {
2455       assembler->CheckNotCharacter(value, trace->backtrack());
2456     }
2457   }
2458   return true;
2459 }
2460 
2461 
2462 // Here is the meat of GetQuickCheckDetails (see also the comment on the
2463 // super-class in the .h file).
2464 //
2465 // We iterate along the text object, building up for each character a
2466 // mask and value that can be used to test for a quick failure to match.
2467 // The masks and values for the positions will be combined into a single
2468 // machine word for the current character width in order to be used in
2469 // generating a quick check.
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2470 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2471                                     RegExpCompiler* compiler,
2472                                     int characters_filled_in,
2473                                     bool not_at_start) {
2474   // Do not collect any quick check details if the text node reads backward,
2475   // since it reads in the opposite direction than we use for quick checks.
2476   if (read_backward()) return;
2477   Isolate* isolate = compiler->macro_assembler()->isolate();
2478   DCHECK(characters_filled_in < details->characters());
2479   int characters = details->characters();
2480   int char_mask;
2481   if (compiler->one_byte()) {
2482     char_mask = String::kMaxOneByteCharCode;
2483   } else {
2484     char_mask = String::kMaxUtf16CodeUnit;
2485   }
2486   for (int k = 0; k < elements()->length(); k++) {
2487     TextElement elm = elements()->at(k);
2488     if (elm.text_type() == TextElement::ATOM) {
2489       Vector<const uc16> quarks = elm.atom()->data();
2490       for (int i = 0; i < characters && i < quarks.length(); i++) {
2491         QuickCheckDetails::Position* pos =
2492             details->positions(characters_filled_in);
2493         uc16 c = quarks[i];
2494         if (elm.atom()->ignore_case()) {
2495           unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2496           int length = GetCaseIndependentLetters(isolate, c,
2497                                                  compiler->one_byte(), chars);
2498           if (length == 0) {
2499             // This can happen because all case variants are non-Latin1, but we
2500             // know the input is Latin1.
2501             details->set_cannot_match();
2502             pos->determines_perfectly = false;
2503             return;
2504           }
2505           if (length == 1) {
2506             // This letter has no case equivalents, so it's nice and simple
2507             // and the mask-compare will determine definitely whether we have
2508             // a match at this character position.
2509             pos->mask = char_mask;
2510             pos->value = c;
2511             pos->determines_perfectly = true;
2512           } else {
2513             uint32_t common_bits = char_mask;
2514             uint32_t bits = chars[0];
2515             for (int j = 1; j < length; j++) {
2516               uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2517               common_bits ^= differing_bits;
2518               bits &= common_bits;
2519             }
2520             // If length is 2 and common bits has only one zero in it then
2521             // our mask and compare instruction will determine definitely
2522             // whether we have a match at this character position.  Otherwise
2523             // it can only be an approximate check.
2524             uint32_t one_zero = (common_bits | ~char_mask);
2525             if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2526               pos->determines_perfectly = true;
2527             }
2528             pos->mask = common_bits;
2529             pos->value = bits;
2530           }
2531         } else {
2532           // Don't ignore case.  Nice simple case where the mask-compare will
2533           // determine definitely whether we have a match at this character
2534           // position.
2535           if (c > char_mask) {
2536             details->set_cannot_match();
2537             pos->determines_perfectly = false;
2538             return;
2539           }
2540           pos->mask = char_mask;
2541           pos->value = c;
2542           pos->determines_perfectly = true;
2543         }
2544         characters_filled_in++;
2545         DCHECK(characters_filled_in <= details->characters());
2546         if (characters_filled_in == details->characters()) {
2547           return;
2548         }
2549       }
2550     } else {
2551       QuickCheckDetails::Position* pos =
2552           details->positions(characters_filled_in);
2553       RegExpCharacterClass* tree = elm.char_class();
2554       ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2555       DCHECK(!ranges->is_empty());
2556       if (tree->is_negated()) {
2557         // A quick check uses multi-character mask and compare.  There is no
2558         // useful way to incorporate a negative char class into this scheme
2559         // so we just conservatively create a mask and value that will always
2560         // succeed.
2561         pos->mask = 0;
2562         pos->value = 0;
2563       } else {
2564         int first_range = 0;
2565         while (ranges->at(first_range).from() > char_mask) {
2566           first_range++;
2567           if (first_range == ranges->length()) {
2568             details->set_cannot_match();
2569             pos->determines_perfectly = false;
2570             return;
2571           }
2572         }
2573         CharacterRange range = ranges->at(first_range);
2574         uc16 from = range.from();
2575         uc16 to = range.to();
2576         if (to > char_mask) {
2577           to = char_mask;
2578         }
2579         uint32_t differing_bits = (from ^ to);
2580         // A mask and compare is only perfect if the differing bits form a
2581         // number like 00011111 with one single block of trailing 1s.
2582         if ((differing_bits & (differing_bits + 1)) == 0 &&
2583              from + differing_bits == to) {
2584           pos->determines_perfectly = true;
2585         }
2586         uint32_t common_bits = ~SmearBitsRight(differing_bits);
2587         uint32_t bits = (from & common_bits);
2588         for (int i = first_range + 1; i < ranges->length(); i++) {
2589           CharacterRange range = ranges->at(i);
2590           uc16 from = range.from();
2591           uc16 to = range.to();
2592           if (from > char_mask) continue;
2593           if (to > char_mask) to = char_mask;
2594           // Here we are combining more ranges into the mask and compare
2595           // value.  With each new range the mask becomes more sparse and
2596           // so the chances of a false positive rise.  A character class
2597           // with multiple ranges is assumed never to be equivalent to a
2598           // mask and compare operation.
2599           pos->determines_perfectly = false;
2600           uint32_t new_common_bits = (from ^ to);
2601           new_common_bits = ~SmearBitsRight(new_common_bits);
2602           common_bits &= new_common_bits;
2603           bits &= new_common_bits;
2604           uint32_t differing_bits = (from & common_bits) ^ bits;
2605           common_bits ^= differing_bits;
2606           bits &= common_bits;
2607         }
2608         pos->mask = common_bits;
2609         pos->value = bits;
2610       }
2611       characters_filled_in++;
2612       DCHECK(characters_filled_in <= details->characters());
2613       if (characters_filled_in == details->characters()) {
2614         return;
2615       }
2616     }
2617   }
2618   DCHECK(characters_filled_in != details->characters());
2619   if (!details->cannot_match()) {
2620     on_success()-> GetQuickCheckDetails(details,
2621                                         compiler,
2622                                         characters_filled_in,
2623                                         true);
2624   }
2625 }
2626 
2627 
Clear()2628 void QuickCheckDetails::Clear() {
2629   for (int i = 0; i < characters_; i++) {
2630     positions_[i].mask = 0;
2631     positions_[i].value = 0;
2632     positions_[i].determines_perfectly = false;
2633   }
2634   characters_ = 0;
2635 }
2636 
2637 
Advance(int by,bool one_byte)2638 void QuickCheckDetails::Advance(int by, bool one_byte) {
2639   if (by >= characters_ || by < 0) {
2640     DCHECK_IMPLIES(by < 0, characters_ == 0);
2641     Clear();
2642     return;
2643   }
2644   DCHECK_LE(characters_ - by, 4);
2645   DCHECK_LE(characters_, 4);
2646   for (int i = 0; i < characters_ - by; i++) {
2647     positions_[i] = positions_[by + i];
2648   }
2649   for (int i = characters_ - by; i < characters_; i++) {
2650     positions_[i].mask = 0;
2651     positions_[i].value = 0;
2652     positions_[i].determines_perfectly = false;
2653   }
2654   characters_ -= by;
2655   // We could change mask_ and value_ here but we would never advance unless
2656   // they had already been used in a check and they won't be used again because
2657   // it would gain us nothing.  So there's no point.
2658 }
2659 
2660 
Merge(QuickCheckDetails * other,int from_index)2661 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2662   DCHECK(characters_ == other->characters_);
2663   if (other->cannot_match_) {
2664     return;
2665   }
2666   if (cannot_match_) {
2667     *this = *other;
2668     return;
2669   }
2670   for (int i = from_index; i < characters_; i++) {
2671     QuickCheckDetails::Position* pos = positions(i);
2672     QuickCheckDetails::Position* other_pos = other->positions(i);
2673     if (pos->mask != other_pos->mask ||
2674         pos->value != other_pos->value ||
2675         !other_pos->determines_perfectly) {
2676       // Our mask-compare operation will be approximate unless we have the
2677       // exact same operation on both sides of the alternation.
2678       pos->determines_perfectly = false;
2679     }
2680     pos->mask &= other_pos->mask;
2681     pos->value &= pos->mask;
2682     other_pos->value &= pos->mask;
2683     uc16 differing_bits = (pos->value ^ other_pos->value);
2684     pos->mask &= ~differing_bits;
2685     pos->value &= pos->mask;
2686   }
2687 }
2688 
2689 
2690 class VisitMarker {
2691  public:
VisitMarker(NodeInfo * info)2692   explicit VisitMarker(NodeInfo* info) : info_(info) {
2693     DCHECK(!info->visited);
2694     info->visited = true;
2695   }
~VisitMarker()2696   ~VisitMarker() {
2697     info_->visited = false;
2698   }
2699  private:
2700   NodeInfo* info_;
2701 };
2702 
FilterOneByte(int depth)2703 RegExpNode* SeqRegExpNode::FilterOneByte(int depth) {
2704   if (info()->replacement_calculated) return replacement();
2705   if (depth < 0) return this;
2706   DCHECK(!info()->visited);
2707   VisitMarker marker(info());
2708   return FilterSuccessor(depth - 1);
2709 }
2710 
FilterSuccessor(int depth)2711 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth) {
2712   RegExpNode* next = on_success_->FilterOneByte(depth - 1);
2713   if (next == nullptr) return set_replacement(nullptr);
2714   on_success_ = next;
2715   return set_replacement(this);
2716 }
2717 
2718 // We need to check for the following characters: 0x39C 0x3BC 0x178.
RangeContainsLatin1Equivalents(CharacterRange range)2719 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2720   // TODO(dcarney): this could be a lot more efficient.
2721   return range.Contains(0x039C) || range.Contains(0x03BC) ||
2722          range.Contains(0x0178);
2723 }
2724 
2725 
RangesContainLatin1Equivalents(ZoneList<CharacterRange> * ranges)2726 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2727   for (int i = 0; i < ranges->length(); i++) {
2728     // TODO(dcarney): this could be a lot more efficient.
2729     if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2730   }
2731   return false;
2732 }
2733 
FilterOneByte(int depth)2734 RegExpNode* TextNode::FilterOneByte(int depth) {
2735   if (info()->replacement_calculated) return replacement();
2736   if (depth < 0) return this;
2737   DCHECK(!info()->visited);
2738   VisitMarker marker(info());
2739   int element_count = elements()->length();
2740   for (int i = 0; i < element_count; i++) {
2741     TextElement elm = elements()->at(i);
2742     if (elm.text_type() == TextElement::ATOM) {
2743       Vector<const uc16> quarks = elm.atom()->data();
2744       for (int j = 0; j < quarks.length(); j++) {
2745         uint16_t c = quarks[j];
2746         if (elm.atom()->ignore_case()) {
2747           c = unibrow::Latin1::TryConvertToLatin1(c);
2748         }
2749         if (c > unibrow::Latin1::kMaxChar) return set_replacement(nullptr);
2750         // Replace quark in case we converted to Latin-1.
2751         uint16_t* writable_quarks = const_cast<uint16_t*>(quarks.start());
2752         writable_quarks[j] = c;
2753       }
2754     } else {
2755       DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2756       RegExpCharacterClass* cc = elm.char_class();
2757       ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2758       CharacterRange::Canonicalize(ranges);
2759       // Now they are in order so we only need to look at the first.
2760       int range_count = ranges->length();
2761       if (cc->is_negated()) {
2762         if (range_count != 0 &&
2763             ranges->at(0).from() == 0 &&
2764             ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2765           // This will be handled in a later filter.
2766           if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges))
2767             continue;
2768           return set_replacement(nullptr);
2769         }
2770       } else {
2771         if (range_count == 0 ||
2772             ranges->at(0).from() > String::kMaxOneByteCharCode) {
2773           // This will be handled in a later filter.
2774           if (IgnoreCase(cc->flags()) && RangesContainLatin1Equivalents(ranges))
2775             continue;
2776           return set_replacement(nullptr);
2777         }
2778       }
2779     }
2780   }
2781   return FilterSuccessor(depth - 1);
2782 }
2783 
FilterOneByte(int depth)2784 RegExpNode* LoopChoiceNode::FilterOneByte(int depth) {
2785   if (info()->replacement_calculated) return replacement();
2786   if (depth < 0) return this;
2787   if (info()->visited) return this;
2788   {
2789     VisitMarker marker(info());
2790 
2791     RegExpNode* continue_replacement = continue_node_->FilterOneByte(depth - 1);
2792     // If we can't continue after the loop then there is no sense in doing the
2793     // loop.
2794     if (continue_replacement == nullptr) return set_replacement(nullptr);
2795   }
2796 
2797   return ChoiceNode::FilterOneByte(depth - 1);
2798 }
2799 
FilterOneByte(int depth)2800 RegExpNode* ChoiceNode::FilterOneByte(int depth) {
2801   if (info()->replacement_calculated) return replacement();
2802   if (depth < 0) return this;
2803   if (info()->visited) return this;
2804   VisitMarker marker(info());
2805   int choice_count = alternatives_->length();
2806 
2807   for (int i = 0; i < choice_count; i++) {
2808     GuardedAlternative alternative = alternatives_->at(i);
2809     if (alternative.guards() != nullptr &&
2810         alternative.guards()->length() != 0) {
2811       set_replacement(this);
2812       return this;
2813     }
2814   }
2815 
2816   int surviving = 0;
2817   RegExpNode* survivor = nullptr;
2818   for (int i = 0; i < choice_count; i++) {
2819     GuardedAlternative alternative = alternatives_->at(i);
2820     RegExpNode* replacement = alternative.node()->FilterOneByte(depth - 1);
2821     DCHECK(replacement != this);  // No missing EMPTY_MATCH_CHECK.
2822     if (replacement != nullptr) {
2823       alternatives_->at(i).set_node(replacement);
2824       surviving++;
2825       survivor = replacement;
2826     }
2827   }
2828   if (surviving < 2) return set_replacement(survivor);
2829 
2830   set_replacement(this);
2831   if (surviving == choice_count) {
2832     return this;
2833   }
2834   // Only some of the nodes survived the filtering.  We need to rebuild the
2835   // alternatives list.
2836   ZoneList<GuardedAlternative>* new_alternatives =
2837       new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2838   for (int i = 0; i < choice_count; i++) {
2839     RegExpNode* replacement =
2840         alternatives_->at(i).node()->FilterOneByte(depth - 1);
2841     if (replacement != nullptr) {
2842       alternatives_->at(i).set_node(replacement);
2843       new_alternatives->Add(alternatives_->at(i), zone());
2844     }
2845   }
2846   alternatives_ = new_alternatives;
2847   return this;
2848 }
2849 
FilterOneByte(int depth)2850 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth) {
2851   if (info()->replacement_calculated) return replacement();
2852   if (depth < 0) return this;
2853   if (info()->visited) return this;
2854   VisitMarker marker(info());
2855   // Alternative 0 is the negative lookahead, alternative 1 is what comes
2856   // afterwards.
2857   RegExpNode* node = alternatives_->at(1).node();
2858   RegExpNode* replacement = node->FilterOneByte(depth - 1);
2859   if (replacement == nullptr) return set_replacement(nullptr);
2860   alternatives_->at(1).set_node(replacement);
2861 
2862   RegExpNode* neg_node = alternatives_->at(0).node();
2863   RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1);
2864   // If the negative lookahead is always going to fail then
2865   // we don't need to check it.
2866   if (neg_replacement == nullptr) return set_replacement(replacement);
2867   alternatives_->at(0).set_node(neg_replacement);
2868   return set_replacement(this);
2869 }
2870 
2871 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2872 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2873                                           RegExpCompiler* compiler,
2874                                           int characters_filled_in,
2875                                           bool not_at_start) {
2876   if (body_can_be_zero_length_ || info()->visited) return;
2877   VisitMarker marker(info());
2878   return ChoiceNode::GetQuickCheckDetails(details,
2879                                           compiler,
2880                                           characters_filled_in,
2881                                           not_at_start);
2882 }
2883 
2884 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2885 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2886                                   BoyerMooreLookahead* bm, bool not_at_start) {
2887   if (body_can_be_zero_length_ || budget <= 0) {
2888     bm->SetRest(offset);
2889     SaveBMInfo(bm, not_at_start, offset);
2890     return;
2891   }
2892   ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2893   SaveBMInfo(bm, not_at_start, offset);
2894 }
2895 
2896 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2897 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2898                                       RegExpCompiler* compiler,
2899                                       int characters_filled_in,
2900                                       bool not_at_start) {
2901   not_at_start = (not_at_start || not_at_start_);
2902   int choice_count = alternatives_->length();
2903   DCHECK_LT(0, choice_count);
2904   alternatives_->at(0).node()->GetQuickCheckDetails(details,
2905                                                     compiler,
2906                                                     characters_filled_in,
2907                                                     not_at_start);
2908   for (int i = 1; i < choice_count; i++) {
2909     QuickCheckDetails new_details(details->characters());
2910     RegExpNode* node = alternatives_->at(i).node();
2911     node->GetQuickCheckDetails(&new_details, compiler,
2912                                characters_filled_in,
2913                                not_at_start);
2914     // Here we merge the quick match details of the two branches.
2915     details->Merge(&new_details, characters_filled_in);
2916   }
2917 }
2918 
2919 
2920 // Check for [0-9A-Z_a-z].
EmitWordCheck(RegExpMacroAssembler * assembler,Label * word,Label * non_word,bool fall_through_on_word)2921 static void EmitWordCheck(RegExpMacroAssembler* assembler,
2922                           Label* word,
2923                           Label* non_word,
2924                           bool fall_through_on_word) {
2925   if (assembler->CheckSpecialCharacterClass(
2926           fall_through_on_word ? 'w' : 'W',
2927           fall_through_on_word ? non_word : word)) {
2928     // Optimized implementation available.
2929     return;
2930   }
2931   assembler->CheckCharacterGT('z', non_word);
2932   assembler->CheckCharacterLT('0', non_word);
2933   assembler->CheckCharacterGT('a' - 1, word);
2934   assembler->CheckCharacterLT('9' + 1, word);
2935   assembler->CheckCharacterLT('A', non_word);
2936   assembler->CheckCharacterLT('Z' + 1, word);
2937   if (fall_through_on_word) {
2938     assembler->CheckNotCharacter('_', non_word);
2939   } else {
2940     assembler->CheckCharacter('_', word);
2941   }
2942 }
2943 
2944 
2945 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
2946 // that matches newline or the start of input).
EmitHat(RegExpCompiler * compiler,RegExpNode * on_success,Trace * trace)2947 static void EmitHat(RegExpCompiler* compiler,
2948                     RegExpNode* on_success,
2949                     Trace* trace) {
2950   RegExpMacroAssembler* assembler = compiler->macro_assembler();
2951   // We will be loading the previous character into the current character
2952   // register.
2953   Trace new_trace(*trace);
2954   new_trace.InvalidateCurrentCharacter();
2955 
2956   Label ok;
2957   if (new_trace.cp_offset() == 0) {
2958     // The start of input counts as a newline in this context, so skip to
2959     // ok if we are at the start.
2960     assembler->CheckAtStart(&ok);
2961   }
2962   // We already checked that we are not at the start of input so it must be
2963   // OK to load the previous character.
2964   assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
2965                                   new_trace.backtrack(),
2966                                   false);
2967   if (!assembler->CheckSpecialCharacterClass('n',
2968                                              new_trace.backtrack())) {
2969     // Newline means \n, \r, 0x2028 or 0x2029.
2970     if (!compiler->one_byte()) {
2971       assembler->CheckCharacterAfterAnd(0x2028, 0xFFFE, &ok);
2972     }
2973     assembler->CheckCharacter('\n', &ok);
2974     assembler->CheckNotCharacter('\r', new_trace.backtrack());
2975   }
2976   assembler->Bind(&ok);
2977   on_success->Emit(compiler, &new_trace);
2978 }
2979 
2980 
2981 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
EmitBoundaryCheck(RegExpCompiler * compiler,Trace * trace)2982 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
2983   RegExpMacroAssembler* assembler = compiler->macro_assembler();
2984   Isolate* isolate = assembler->isolate();
2985   Trace::TriBool next_is_word_character = Trace::UNKNOWN;
2986   bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
2987   BoyerMooreLookahead* lookahead = bm_info(not_at_start);
2988   if (lookahead == nullptr) {
2989     int eats_at_least =
2990         Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
2991                                                     kRecursionBudget,
2992                                                     not_at_start));
2993     if (eats_at_least >= 1) {
2994       BoyerMooreLookahead* bm =
2995           new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
2996       FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
2997       if (bm->at(0)->is_non_word())
2998         next_is_word_character = Trace::FALSE_VALUE;
2999       if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3000     }
3001   } else {
3002     if (lookahead->at(0)->is_non_word())
3003       next_is_word_character = Trace::FALSE_VALUE;
3004     if (lookahead->at(0)->is_word())
3005       next_is_word_character = Trace::TRUE_VALUE;
3006   }
3007   bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3008   if (next_is_word_character == Trace::UNKNOWN) {
3009     Label before_non_word;
3010     Label before_word;
3011     if (trace->characters_preloaded() != 1) {
3012       assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3013     }
3014     // Fall through on non-word.
3015     EmitWordCheck(assembler, &before_word, &before_non_word, false);
3016     // Next character is not a word character.
3017     assembler->Bind(&before_non_word);
3018     Label ok;
3019     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3020     assembler->GoTo(&ok);
3021 
3022     assembler->Bind(&before_word);
3023     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3024     assembler->Bind(&ok);
3025   } else if (next_is_word_character == Trace::TRUE_VALUE) {
3026     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3027   } else {
3028     DCHECK(next_is_word_character == Trace::FALSE_VALUE);
3029     BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3030   }
3031 }
3032 
3033 
BacktrackIfPrevious(RegExpCompiler * compiler,Trace * trace,AssertionNode::IfPrevious backtrack_if_previous)3034 void AssertionNode::BacktrackIfPrevious(
3035     RegExpCompiler* compiler,
3036     Trace* trace,
3037     AssertionNode::IfPrevious backtrack_if_previous) {
3038   RegExpMacroAssembler* assembler = compiler->macro_assembler();
3039   Trace new_trace(*trace);
3040   new_trace.InvalidateCurrentCharacter();
3041 
3042   Label fall_through, dummy;
3043 
3044   Label* non_word = backtrack_if_previous == kIsNonWord ?
3045                     new_trace.backtrack() :
3046                     &fall_through;
3047   Label* word = backtrack_if_previous == kIsNonWord ?
3048                 &fall_through :
3049                 new_trace.backtrack();
3050 
3051   if (new_trace.cp_offset() == 0) {
3052     // The start of input counts as a non-word character, so the question is
3053     // decided if we are at the start.
3054     assembler->CheckAtStart(non_word);
3055   }
3056   // We already checked that we are not at the start of input so it must be
3057   // OK to load the previous character.
3058   assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3059   EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3060 
3061   assembler->Bind(&fall_through);
3062   on_success()->Emit(compiler, &new_trace);
3063 }
3064 
3065 
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)3066 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3067                                          RegExpCompiler* compiler,
3068                                          int filled_in,
3069                                          bool not_at_start) {
3070   if (assertion_type_ == AT_START && not_at_start) {
3071     details->set_cannot_match();
3072     return;
3073   }
3074   return on_success()->GetQuickCheckDetails(details,
3075                                             compiler,
3076                                             filled_in,
3077                                             not_at_start);
3078 }
3079 
3080 
Emit(RegExpCompiler * compiler,Trace * trace)3081 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3082   RegExpMacroAssembler* assembler = compiler->macro_assembler();
3083   switch (assertion_type_) {
3084     case AT_END: {
3085       Label ok;
3086       assembler->CheckPosition(trace->cp_offset(), &ok);
3087       assembler->GoTo(trace->backtrack());
3088       assembler->Bind(&ok);
3089       break;
3090     }
3091     case AT_START: {
3092       if (trace->at_start() == Trace::FALSE_VALUE) {
3093         assembler->GoTo(trace->backtrack());
3094         return;
3095       }
3096       if (trace->at_start() == Trace::UNKNOWN) {
3097         assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
3098         Trace at_start_trace = *trace;
3099         at_start_trace.set_at_start(Trace::TRUE_VALUE);
3100         on_success()->Emit(compiler, &at_start_trace);
3101         return;
3102       }
3103     }
3104     break;
3105     case AFTER_NEWLINE:
3106       EmitHat(compiler, on_success(), trace);
3107       return;
3108     case AT_BOUNDARY:
3109     case AT_NON_BOUNDARY: {
3110       EmitBoundaryCheck(compiler, trace);
3111       return;
3112     }
3113   }
3114   on_success()->Emit(compiler, trace);
3115 }
3116 
3117 
DeterminedAlready(QuickCheckDetails * quick_check,int offset)3118 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3119   if (quick_check == nullptr) return false;
3120   if (offset >= quick_check->characters()) return false;
3121   return quick_check->positions(offset)->determines_perfectly;
3122 }
3123 
3124 
UpdateBoundsCheck(int index,int * checked_up_to)3125 static void UpdateBoundsCheck(int index, int* checked_up_to) {
3126   if (index > *checked_up_to) {
3127     *checked_up_to = index;
3128   }
3129 }
3130 
3131 
3132 // We call this repeatedly to generate code for each pass over the text node.
3133 // The passes are in increasing order of difficulty because we hope one
3134 // of the first passes will fail in which case we are saved the work of the
3135 // later passes.  for example for the case independent regexp /%[asdfghjkl]a/
3136 // we will check the '%' in the first pass, the case independent 'a' in the
3137 // second pass and the character class in the last pass.
3138 //
3139 // The passes are done from right to left, so for example to test for /bar/
3140 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
3141 // and then a 'b' with offset 0.  This means we can avoid the end-of-input
3142 // bounds check most of the time.  In the example we only need to check for
3143 // end-of-input when loading the putative 'r'.
3144 //
3145 // A slight complication involves the fact that the first character may already
3146 // be fetched into a register by the previous node.  In this case we want to
3147 // do the test for that character first.  We do this in separate passes.  The
3148 // 'preloaded' argument indicates that we are doing such a 'pass'.  If such a
3149 // pass has been performed then subsequent passes will have true in
3150 // first_element_checked to indicate that that character does not need to be
3151 // checked again.
3152 //
3153 // In addition to all this we are passed a Trace, which can
3154 // contain an AlternativeGeneration object.  In this AlternativeGeneration
3155 // object we can see details of any quick check that was already passed in
3156 // order to get to the code we are now generating.  The quick check can involve
3157 // loading characters, which means we do not need to recheck the bounds
3158 // up to the limit the quick check already checked.  In addition the quick
3159 // check can have involved a mask and compare operation which may simplify
3160 // or obviate the need for further checks at some character positions.
TextEmitPass(RegExpCompiler * compiler,TextEmitPassType pass,bool preloaded,Trace * trace,bool first_element_checked,int * checked_up_to)3161 void TextNode::TextEmitPass(RegExpCompiler* compiler,
3162                             TextEmitPassType pass,
3163                             bool preloaded,
3164                             Trace* trace,
3165                             bool first_element_checked,
3166                             int* checked_up_to) {
3167   RegExpMacroAssembler* assembler = compiler->macro_assembler();
3168   Isolate* isolate = assembler->isolate();
3169   bool one_byte = compiler->one_byte();
3170   Label* backtrack = trace->backtrack();
3171   QuickCheckDetails* quick_check = trace->quick_check_performed();
3172   int element_count = elements()->length();
3173   int backward_offset = read_backward() ? -Length() : 0;
3174   for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3175     TextElement elm = elements()->at(i);
3176     int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
3177     if (elm.text_type() == TextElement::ATOM) {
3178       if (SkipPass(pass, elm.atom()->ignore_case())) continue;
3179       Vector<const uc16> quarks = elm.atom()->data();
3180       for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3181         if (first_element_checked && i == 0 && j == 0) continue;
3182         if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3183         EmitCharacterFunction* emit_function = nullptr;
3184         uc16 quark = quarks[j];
3185         if (elm.atom()->ignore_case()) {
3186           // Everywhere else we assume that a non-Latin-1 character cannot match
3187           // a Latin-1 character. Avoid the cases where this is assumption is
3188           // invalid by using the Latin1 equivalent instead.
3189           quark = unibrow::Latin1::TryConvertToLatin1(quark);
3190         }
3191         switch (pass) {
3192           case NON_LATIN1_MATCH:
3193             DCHECK(one_byte);
3194             if (quark > String::kMaxOneByteCharCode) {
3195               assembler->GoTo(backtrack);
3196               return;
3197             }
3198             break;
3199           case NON_LETTER_CHARACTER_MATCH:
3200             emit_function = &EmitAtomNonLetter;
3201             break;
3202           case SIMPLE_CHARACTER_MATCH:
3203             emit_function = &EmitSimpleCharacter;
3204             break;
3205           case CASE_CHARACTER_MATCH:
3206             emit_function = &EmitAtomLetter;
3207             break;
3208           default:
3209             break;
3210         }
3211         if (emit_function != nullptr) {
3212           bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
3213           bool bound_checked =
3214               emit_function(isolate, compiler, quark, backtrack, cp_offset + j,
3215                             bounds_check, preloaded);
3216           if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3217         }
3218       }
3219     } else {
3220       DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
3221       if (pass == CHARACTER_CLASS_MATCH) {
3222         if (first_element_checked && i == 0) continue;
3223         if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3224         RegExpCharacterClass* cc = elm.char_class();
3225         bool bounds_check = *checked_up_to < cp_offset || read_backward();
3226         EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
3227                       bounds_check, preloaded, zone());
3228         UpdateBoundsCheck(cp_offset, checked_up_to);
3229       }
3230     }
3231   }
3232 }
3233 
3234 
Length()3235 int TextNode::Length() {
3236   TextElement elm = elements()->last();
3237   DCHECK_LE(0, elm.cp_offset());
3238   return elm.cp_offset() + elm.length();
3239 }
3240 
SkipPass(TextEmitPassType pass,bool ignore_case)3241 bool TextNode::SkipPass(TextEmitPassType pass, bool ignore_case) {
3242   if (ignore_case) {
3243     return pass == SIMPLE_CHARACTER_MATCH;
3244   } else {
3245     return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3246   }
3247 }
3248 
CreateForCharacterRanges(Zone * zone,ZoneList<CharacterRange> * ranges,bool read_backward,RegExpNode * on_success,JSRegExp::Flags flags)3249 TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
3250                                              ZoneList<CharacterRange>* ranges,
3251                                              bool read_backward,
3252                                              RegExpNode* on_success,
3253                                              JSRegExp::Flags flags) {
3254   DCHECK_NOT_NULL(ranges);
3255   ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
3256   elms->Add(TextElement::CharClass(
3257                 new (zone) RegExpCharacterClass(zone, ranges, flags)),
3258             zone);
3259   return new (zone) TextNode(elms, read_backward, on_success);
3260 }
3261 
CreateForSurrogatePair(Zone * zone,CharacterRange lead,CharacterRange trail,bool read_backward,RegExpNode * on_success,JSRegExp::Flags flags)3262 TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
3263                                            CharacterRange trail,
3264                                            bool read_backward,
3265                                            RegExpNode* on_success,
3266                                            JSRegExp::Flags flags) {
3267   ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
3268   ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
3269   ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
3270   elms->Add(TextElement::CharClass(
3271                 new (zone) RegExpCharacterClass(zone, lead_ranges, flags)),
3272             zone);
3273   elms->Add(TextElement::CharClass(
3274                 new (zone) RegExpCharacterClass(zone, trail_ranges, flags)),
3275             zone);
3276   return new (zone) TextNode(elms, read_backward, on_success);
3277 }
3278 
3279 
3280 // This generates the code to match a text node.  A text node can contain
3281 // straight character sequences (possibly to be matched in a case-independent
3282 // way) and character classes.  For efficiency we do not do this in a single
3283 // pass from left to right.  Instead we pass over the text node several times,
3284 // emitting code for some character positions every time.  See the comment on
3285 // TextEmitPass for details.
Emit(RegExpCompiler * compiler,Trace * trace)3286 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3287   LimitResult limit_result = LimitVersions(compiler, trace);
3288   if (limit_result == DONE) return;
3289   DCHECK(limit_result == CONTINUE);
3290 
3291   if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3292     compiler->SetRegExpTooBig();
3293     return;
3294   }
3295 
3296   if (compiler->one_byte()) {
3297     int dummy = 0;
3298     TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3299   }
3300 
3301   bool first_elt_done = false;
3302   int bound_checked_to = trace->cp_offset() - 1;
3303   bound_checked_to += trace->bound_checked_up_to();
3304 
3305   // If a character is preloaded into the current character register then
3306   // check that now.
3307   if (trace->characters_preloaded() == 1) {
3308     for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3309       TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), true, trace,
3310                    false, &bound_checked_to);
3311     }
3312     first_elt_done = true;
3313   }
3314 
3315   for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3316     TextEmitPass(compiler, static_cast<TextEmitPassType>(pass), false, trace,
3317                  first_elt_done, &bound_checked_to);
3318   }
3319 
3320   Trace successor_trace(*trace);
3321   // If we advance backward, we may end up at the start.
3322   successor_trace.AdvanceCurrentPositionInTrace(
3323       read_backward() ? -Length() : Length(), compiler);
3324   successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
3325                                                : Trace::FALSE_VALUE);
3326   RecursionCheck rc(compiler);
3327   on_success()->Emit(compiler, &successor_trace);
3328 }
3329 
3330 
InvalidateCurrentCharacter()3331 void Trace::InvalidateCurrentCharacter() {
3332   characters_preloaded_ = 0;
3333 }
3334 
3335 
AdvanceCurrentPositionInTrace(int by,RegExpCompiler * compiler)3336 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
3337   // We don't have an instruction for shifting the current character register
3338   // down or for using a shifted value for anything so lets just forget that
3339   // we preloaded any characters into it.
3340   characters_preloaded_ = 0;
3341   // Adjust the offsets of the quick check performed information.  This
3342   // information is used to find out what we already determined about the
3343   // characters by means of mask and compare.
3344   quick_check_performed_.Advance(by, compiler->one_byte());
3345   cp_offset_ += by;
3346   if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3347     compiler->SetRegExpTooBig();
3348     cp_offset_ = 0;
3349   }
3350   bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3351 }
3352 
3353 
MakeCaseIndependent(Isolate * isolate,bool is_one_byte)3354 void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
3355   int element_count = elements()->length();
3356   for (int i = 0; i < element_count; i++) {
3357     TextElement elm = elements()->at(i);
3358     if (elm.text_type() == TextElement::CHAR_CLASS) {
3359       RegExpCharacterClass* cc = elm.char_class();
3360 #ifdef V8_INTL_SUPPORT
3361       bool case_equivalents_already_added =
3362           NeedsUnicodeCaseEquivalents(cc->flags());
3363 #else
3364       bool case_equivalents_already_added = false;
3365 #endif
3366       if (IgnoreCase(cc->flags()) && !case_equivalents_already_added) {
3367         // None of the standard character classes is different in the case
3368         // independent case and it slows us down if we don't know that.
3369         if (cc->is_standard(zone())) continue;
3370         ZoneList<CharacterRange>* ranges = cc->ranges(zone());
3371         CharacterRange::AddCaseEquivalents(isolate, zone(), ranges,
3372                                            is_one_byte);
3373       }
3374     }
3375   }
3376 }
3377 
3378 
GreedyLoopTextLength()3379 int TextNode::GreedyLoopTextLength() { return Length(); }
3380 
3381 
GetSuccessorOfOmnivorousTextNode(RegExpCompiler * compiler)3382 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3383     RegExpCompiler* compiler) {
3384   if (read_backward()) return nullptr;
3385   if (elements()->length() != 1) return nullptr;
3386   TextElement elm = elements()->at(0);
3387   if (elm.text_type() != TextElement::CHAR_CLASS) return nullptr;
3388   RegExpCharacterClass* node = elm.char_class();
3389   ZoneList<CharacterRange>* ranges = node->ranges(zone());
3390   CharacterRange::Canonicalize(ranges);
3391   if (node->is_negated()) {
3392     return ranges->length() == 0 ? on_success() : nullptr;
3393   }
3394   if (ranges->length() != 1) return nullptr;
3395   uint32_t max_char;
3396   if (compiler->one_byte()) {
3397     max_char = String::kMaxOneByteCharCode;
3398   } else {
3399     max_char = String::kMaxUtf16CodeUnit;
3400   }
3401   return ranges->at(0).IsEverything(max_char) ? on_success() : nullptr;
3402 }
3403 
3404 
3405 // Finds the fixed match length of a sequence of nodes that goes from
3406 // this alternative and back to this choice node.  If there are variable
3407 // length nodes or other complications in the way then return a sentinel
3408 // value indicating that a greedy loop cannot be constructed.
GreedyLoopTextLengthForAlternative(GuardedAlternative * alternative)3409 int ChoiceNode::GreedyLoopTextLengthForAlternative(
3410     GuardedAlternative* alternative) {
3411   int length = 0;
3412   RegExpNode* node = alternative->node();
3413   // Later we will generate code for all these text nodes using recursion
3414   // so we have to limit the max number.
3415   int recursion_depth = 0;
3416   while (node != this) {
3417     if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3418       return kNodeIsTooComplexForGreedyLoops;
3419     }
3420     int node_length = node->GreedyLoopTextLength();
3421     if (node_length == kNodeIsTooComplexForGreedyLoops) {
3422       return kNodeIsTooComplexForGreedyLoops;
3423     }
3424     length += node_length;
3425     SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3426     node = seq_node->on_success();
3427   }
3428   return read_backward() ? -length : length;
3429 }
3430 
3431 
AddLoopAlternative(GuardedAlternative alt)3432 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3433   DCHECK_NULL(loop_node_);
3434   AddAlternative(alt);
3435   loop_node_ = alt.node();
3436 }
3437 
3438 
AddContinueAlternative(GuardedAlternative alt)3439 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3440   DCHECK_NULL(continue_node_);
3441   AddAlternative(alt);
3442   continue_node_ = alt.node();
3443 }
3444 
3445 
Emit(RegExpCompiler * compiler,Trace * trace)3446 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3447   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3448   if (trace->stop_node() == this) {
3449     // Back edge of greedy optimized loop node graph.
3450     int text_length =
3451         GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3452     DCHECK_NE(kNodeIsTooComplexForGreedyLoops, text_length);
3453     // Update the counter-based backtracking info on the stack.  This is an
3454     // optimization for greedy loops (see below).
3455     DCHECK(trace->cp_offset() == text_length);
3456     macro_assembler->AdvanceCurrentPosition(text_length);
3457     macro_assembler->GoTo(trace->loop_label());
3458     return;
3459   }
3460   DCHECK_NULL(trace->stop_node());
3461   if (!trace->is_trivial()) {
3462     trace->Flush(compiler, this);
3463     return;
3464   }
3465   ChoiceNode::Emit(compiler, trace);
3466 }
3467 
3468 
CalculatePreloadCharacters(RegExpCompiler * compiler,int eats_at_least)3469 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3470                                            int eats_at_least) {
3471   int preload_characters = Min(4, eats_at_least);
3472   DCHECK_LE(preload_characters, 4);
3473   if (compiler->macro_assembler()->CanReadUnaligned()) {
3474     bool one_byte = compiler->one_byte();
3475     if (one_byte) {
3476       // We can't preload 3 characters because there is no machine instruction
3477       // to do that.  We can't just load 4 because we could be reading
3478       // beyond the end of the string, which could cause a memory fault.
3479       if (preload_characters == 3) preload_characters = 2;
3480     } else {
3481       if (preload_characters > 2) preload_characters = 2;
3482     }
3483   } else {
3484     if (preload_characters > 1) preload_characters = 1;
3485   }
3486   return preload_characters;
3487 }
3488 
3489 
3490 // This class is used when generating the alternatives in a choice node.  It
3491 // records the way the alternative is being code generated.
3492 class AlternativeGeneration: public Malloced {
3493  public:
AlternativeGeneration()3494   AlternativeGeneration()
3495       : possible_success(),
3496         expects_preload(false),
3497         after(),
3498         quick_check_details() { }
3499   Label possible_success;
3500   bool expects_preload;
3501   Label after;
3502   QuickCheckDetails quick_check_details;
3503 };
3504 
3505 
3506 // Creates a list of AlternativeGenerations.  If the list has a reasonable
3507 // size then it is on the stack, otherwise the excess is on the heap.
3508 class AlternativeGenerationList {
3509  public:
AlternativeGenerationList(int count,Zone * zone)3510   AlternativeGenerationList(int count, Zone* zone)
3511       : alt_gens_(count, zone) {
3512     for (int i = 0; i < count && i < kAFew; i++) {
3513       alt_gens_.Add(a_few_alt_gens_ + i, zone);
3514     }
3515     for (int i = kAFew; i < count; i++) {
3516       alt_gens_.Add(new AlternativeGeneration(), zone);
3517     }
3518   }
~AlternativeGenerationList()3519   ~AlternativeGenerationList() {
3520     for (int i = kAFew; i < alt_gens_.length(); i++) {
3521       delete alt_gens_[i];
3522       alt_gens_[i] = nullptr;
3523     }
3524   }
3525 
at(int i)3526   AlternativeGeneration* at(int i) {
3527     return alt_gens_[i];
3528   }
3529 
3530  private:
3531   static const int kAFew = 10;
3532   ZoneList<AlternativeGeneration*> alt_gens_;
3533   AlternativeGeneration a_few_alt_gens_[kAFew];
3534 };
3535 
3536 
3537 static const uc32 kRangeEndMarker = 0x110000;
3538 
3539 // The '2' variant is has inclusive from and exclusive to.
3540 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3541 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
3542 static const int kSpaceRanges[] = {
3543     '\t',   '\r' + 1, ' ',    ' ' + 1, 0x00A0, 0x00A1, 0x1680,
3544     0x1681, 0x2000,   0x200B, 0x2028,  0x202A, 0x202F, 0x2030,
3545     0x205F, 0x2060,   0x3000, 0x3001,  0xFEFF, 0xFF00, kRangeEndMarker};
3546 static const int kSpaceRangeCount = arraysize(kSpaceRanges);
3547 
3548 static const int kWordRanges[] = {
3549     '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
3550 static const int kWordRangeCount = arraysize(kWordRanges);
3551 static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
3552 static const int kDigitRangeCount = arraysize(kDigitRanges);
3553 static const int kSurrogateRanges[] = {
3554     kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
3555 static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
3556 static const int kLineTerminatorRanges[] = {
3557     0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
3558 static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
3559 
Set(int character)3560 void BoyerMoorePositionInfo::Set(int character) {
3561   SetInterval(Interval(character, character));
3562 }
3563 
3564 
SetInterval(const Interval & interval)3565 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3566   s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3567   w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3568   d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3569   surrogate_ =
3570       AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3571   if (interval.to() - interval.from() >= kMapSize - 1) {
3572     if (map_count_ != kMapSize) {
3573       map_count_ = kMapSize;
3574       for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3575     }
3576     return;
3577   }
3578   for (int i = interval.from(); i <= interval.to(); i++) {
3579     int mod_character = (i & kMask);
3580     if (!map_->at(mod_character)) {
3581       map_count_++;
3582       map_->at(mod_character) = true;
3583     }
3584     if (map_count_ == kMapSize) return;
3585   }
3586 }
3587 
3588 
SetAll()3589 void BoyerMoorePositionInfo::SetAll() {
3590   s_ = w_ = d_ = kLatticeUnknown;
3591   if (map_count_ != kMapSize) {
3592     map_count_ = kMapSize;
3593     for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3594   }
3595 }
3596 
3597 
BoyerMooreLookahead(int length,RegExpCompiler * compiler,Zone * zone)3598 BoyerMooreLookahead::BoyerMooreLookahead(
3599     int length, RegExpCompiler* compiler, Zone* zone)
3600     : length_(length),
3601       compiler_(compiler) {
3602   if (compiler->one_byte()) {
3603     max_char_ = String::kMaxOneByteCharCode;
3604   } else {
3605     max_char_ = String::kMaxUtf16CodeUnit;
3606   }
3607   bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3608   for (int i = 0; i < length; i++) {
3609     bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3610   }
3611 }
3612 
3613 
3614 // Find the longest range of lookahead that has the fewest number of different
3615 // characters that can occur at a given position.  Since we are optimizing two
3616 // different parameters at once this is a tradeoff.
FindWorthwhileInterval(int * from,int * to)3617 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3618   int biggest_points = 0;
3619   // If more than 32 characters out of 128 can occur it is unlikely that we can
3620   // be lucky enough to step forwards much of the time.
3621   const int kMaxMax = 32;
3622   for (int max_number_of_chars = 4;
3623        max_number_of_chars < kMaxMax;
3624        max_number_of_chars *= 2) {
3625     biggest_points =
3626         FindBestInterval(max_number_of_chars, biggest_points, from, to);
3627   }
3628   if (biggest_points == 0) return false;
3629   return true;
3630 }
3631 
3632 
3633 // Find the highest-points range between 0 and length_ where the character
3634 // information is not too vague.  'Too vague' means that there are more than
3635 // max_number_of_chars that can occur at this position.  Calculates the number
3636 // of points as the product of width-of-the-range and
3637 // probability-of-finding-one-of-the-characters, where the probability is
3638 // calculated using the frequency distribution of the sample subject string.
FindBestInterval(int max_number_of_chars,int old_biggest_points,int * from,int * to)3639 int BoyerMooreLookahead::FindBestInterval(
3640     int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3641   int biggest_points = old_biggest_points;
3642   static const int kSize = RegExpMacroAssembler::kTableSize;
3643   for (int i = 0; i < length_; ) {
3644     while (i < length_ && Count(i) > max_number_of_chars) i++;
3645     if (i == length_) break;
3646     int remembered_from = i;
3647     bool union_map[kSize];
3648     for (int j = 0; j < kSize; j++) union_map[j] = false;
3649     while (i < length_ && Count(i) <= max_number_of_chars) {
3650       BoyerMoorePositionInfo* map = bitmaps_->at(i);
3651       for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3652       i++;
3653     }
3654     int frequency = 0;
3655     for (int j = 0; j < kSize; j++) {
3656       if (union_map[j]) {
3657         // Add 1 to the frequency to give a small per-character boost for
3658         // the cases where our sampling is not good enough and many
3659         // characters have a frequency of zero.  This means the frequency
3660         // can theoretically be up to 2*kSize though we treat it mostly as
3661         // a fraction of kSize.
3662         frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3663       }
3664     }
3665     // We use the probability of skipping times the distance we are skipping to
3666     // judge the effectiveness of this.  Actually we have a cut-off:  By
3667     // dividing by 2 we switch off the skipping if the probability of skipping
3668     // is less than 50%.  This is because the multibyte mask-and-compare
3669     // skipping in quickcheck is more likely to do well on this case.
3670     bool in_quickcheck_range =
3671         ((i - remembered_from < 4) ||
3672          (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3673     // Called 'probability' but it is only a rough estimate and can actually
3674     // be outside the 0-kSize range.
3675     int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3676     int points = (i - remembered_from) * probability;
3677     if (points > biggest_points) {
3678       *from = remembered_from;
3679       *to = i - 1;
3680       biggest_points = points;
3681     }
3682   }
3683   return biggest_points;
3684 }
3685 
3686 
3687 // Take all the characters that will not prevent a successful match if they
3688 // occur in the subject string in the range between min_lookahead and
3689 // max_lookahead (inclusive) measured from the current position.  If the
3690 // character at max_lookahead offset is not one of these characters, then we
3691 // can safely skip forwards by the number of characters in the range.
GetSkipTable(int min_lookahead,int max_lookahead,Handle<ByteArray> boolean_skip_table)3692 int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3693                                       int max_lookahead,
3694                                       Handle<ByteArray> boolean_skip_table) {
3695   const int kSize = RegExpMacroAssembler::kTableSize;
3696 
3697   const int kSkipArrayEntry = 0;
3698   const int kDontSkipArrayEntry = 1;
3699 
3700   for (int i = 0; i < kSize; i++) {
3701     boolean_skip_table->set(i, kSkipArrayEntry);
3702   }
3703   int skip = max_lookahead + 1 - min_lookahead;
3704 
3705   for (int i = max_lookahead; i >= min_lookahead; i--) {
3706     BoyerMoorePositionInfo* map = bitmaps_->at(i);
3707     for (int j = 0; j < kSize; j++) {
3708       if (map->at(j)) {
3709         boolean_skip_table->set(j, kDontSkipArrayEntry);
3710       }
3711     }
3712   }
3713 
3714   return skip;
3715 }
3716 
3717 
3718 // See comment above on the implementation of GetSkipTable.
EmitSkipInstructions(RegExpMacroAssembler * masm)3719 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3720   const int kSize = RegExpMacroAssembler::kTableSize;
3721 
3722   int min_lookahead = 0;
3723   int max_lookahead = 0;
3724 
3725   if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3726 
3727   bool found_single_character = false;
3728   int single_character = 0;
3729   for (int i = max_lookahead; i >= min_lookahead; i--) {
3730     BoyerMoorePositionInfo* map = bitmaps_->at(i);
3731     if (map->map_count() > 1 ||
3732         (found_single_character && map->map_count() != 0)) {
3733       found_single_character = false;
3734       break;
3735     }
3736     for (int j = 0; j < kSize; j++) {
3737       if (map->at(j)) {
3738         found_single_character = true;
3739         single_character = j;
3740         break;
3741       }
3742     }
3743   }
3744 
3745   int lookahead_width = max_lookahead + 1 - min_lookahead;
3746 
3747   if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3748     // The mask-compare can probably handle this better.
3749     return;
3750   }
3751 
3752   if (found_single_character) {
3753     Label cont, again;
3754     masm->Bind(&again);
3755     masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3756     if (max_char_ > kSize) {
3757       masm->CheckCharacterAfterAnd(single_character,
3758                                    RegExpMacroAssembler::kTableMask,
3759                                    &cont);
3760     } else {
3761       masm->CheckCharacter(single_character, &cont);
3762     }
3763     masm->AdvanceCurrentPosition(lookahead_width);
3764     masm->GoTo(&again);
3765     masm->Bind(&cont);
3766     return;
3767   }
3768 
3769   Factory* factory = masm->isolate()->factory();
3770   Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
3771   int skip_distance = GetSkipTable(
3772       min_lookahead, max_lookahead, boolean_skip_table);
3773   DCHECK_NE(0, skip_distance);
3774 
3775   Label cont, again;
3776   masm->Bind(&again);
3777   masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3778   masm->CheckBitInTable(boolean_skip_table, &cont);
3779   masm->AdvanceCurrentPosition(skip_distance);
3780   masm->GoTo(&again);
3781   masm->Bind(&cont);
3782 }
3783 
3784 
3785 /* Code generation for choice nodes.
3786  *
3787  * We generate quick checks that do a mask and compare to eliminate a
3788  * choice.  If the quick check succeeds then it jumps to the continuation to
3789  * do slow checks and check subsequent nodes.  If it fails (the common case)
3790  * it falls through to the next choice.
3791  *
3792  * Here is the desired flow graph.  Nodes directly below each other imply
3793  * fallthrough.  Alternatives 1 and 2 have quick checks.  Alternative
3794  * 3 doesn't have a quick check so we have to call the slow check.
3795  * Nodes are marked Qn for quick checks and Sn for slow checks.  The entire
3796  * regexp continuation is generated directly after the Sn node, up to the
3797  * next GoTo if we decide to reuse some already generated code.  Some
3798  * nodes expect preload_characters to be preloaded into the current
3799  * character register.  R nodes do this preloading.  Vertices are marked
3800  * F for failures and S for success (possible success in the case of quick
3801  * nodes).  L, V, < and > are used as arrow heads.
3802  *
3803  * ----------> R
3804  *             |
3805  *             V
3806  *            Q1 -----> S1
3807  *             |   S   /
3808  *            F|      /
3809  *             |    F/
3810  *             |    /
3811  *             |   R
3812  *             |  /
3813  *             V L
3814  *            Q2 -----> S2
3815  *             |   S   /
3816  *            F|      /
3817  *             |    F/
3818  *             |    /
3819  *             |   R
3820  *             |  /
3821  *             V L
3822  *            S3
3823  *             |
3824  *            F|
3825  *             |
3826  *             R
3827  *             |
3828  * backtrack   V
3829  * <----------Q4
3830  *   \    F    |
3831  *    \        |S
3832  *     \   F   V
3833  *      \-----S4
3834  *
3835  * For greedy loops we push the current position, then generate the code that
3836  * eats the input specially in EmitGreedyLoop.  The other choice (the
3837  * continuation) is generated by the normal code in EmitChoices, and steps back
3838  * in the input to the starting position when it fails to match.  The loop code
3839  * looks like this (U is the unwind code that steps back in the greedy loop).
3840  *
3841  *              _____
3842  *             /     \
3843  *             V     |
3844  * ----------> S1    |
3845  *            /|     |
3846  *           / |S    |
3847  *         F/  \_____/
3848  *         /
3849  *        |<-----
3850  *        |      \
3851  *        V       |S
3852  *        Q2 ---> U----->backtrack
3853  *        |  F   /
3854  *       S|     /
3855  *        V  F /
3856  *        S2--/
3857  */
3858 
GreedyLoopState(bool not_at_start)3859 GreedyLoopState::GreedyLoopState(bool not_at_start) {
3860   counter_backtrack_trace_.set_backtrack(&label_);
3861   if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3862 }
3863 
3864 
AssertGuardsMentionRegisters(Trace * trace)3865 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3866 #ifdef DEBUG
3867   int choice_count = alternatives_->length();
3868   for (int i = 0; i < choice_count - 1; i++) {
3869     GuardedAlternative alternative = alternatives_->at(i);
3870     ZoneList<Guard*>* guards = alternative.guards();
3871     int guard_count = (guards == nullptr) ? 0 : guards->length();
3872     for (int j = 0; j < guard_count; j++) {
3873       DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3874     }
3875   }
3876 #endif
3877 }
3878 
3879 
SetUpPreLoad(RegExpCompiler * compiler,Trace * current_trace,PreloadState * state)3880 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3881                               Trace* current_trace,
3882                               PreloadState* state) {
3883     if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3884       // Save some time by looking at most one machine word ahead.
3885       state->eats_at_least_ =
3886           EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3887                       current_trace->at_start() == Trace::FALSE_VALUE);
3888     }
3889     state->preload_characters_ =
3890         CalculatePreloadCharacters(compiler, state->eats_at_least_);
3891 
3892     state->preload_is_current_ =
3893         (current_trace->characters_preloaded() == state->preload_characters_);
3894     state->preload_has_checked_bounds_ = state->preload_is_current_;
3895 }
3896 
3897 
Emit(RegExpCompiler * compiler,Trace * trace)3898 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3899   int choice_count = alternatives_->length();
3900 
3901   if (choice_count == 1 && alternatives_->at(0).guards() == nullptr) {
3902     alternatives_->at(0).node()->Emit(compiler, trace);
3903     return;
3904   }
3905 
3906   AssertGuardsMentionRegisters(trace);
3907 
3908   LimitResult limit_result = LimitVersions(compiler, trace);
3909   if (limit_result == DONE) return;
3910   DCHECK(limit_result == CONTINUE);
3911 
3912   // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3913   // other choice nodes we only flush if we are out of code size budget.
3914   if (trace->flush_budget() == 0 && trace->actions() != nullptr) {
3915     trace->Flush(compiler, this);
3916     return;
3917   }
3918 
3919   RecursionCheck rc(compiler);
3920 
3921   PreloadState preload;
3922   preload.init();
3923   GreedyLoopState greedy_loop_state(not_at_start());
3924 
3925   int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3926   AlternativeGenerationList alt_gens(choice_count, zone());
3927 
3928   if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3929     trace = EmitGreedyLoop(compiler,
3930                            trace,
3931                            &alt_gens,
3932                            &preload,
3933                            &greedy_loop_state,
3934                            text_length);
3935   } else {
3936     // TODO(erikcorry): Delete this.  We don't need this label, but it makes us
3937     // match the traces produced pre-cleanup.
3938     Label second_choice;
3939     compiler->macro_assembler()->Bind(&second_choice);
3940 
3941     preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3942 
3943     EmitChoices(compiler,
3944                 &alt_gens,
3945                 0,
3946                 trace,
3947                 &preload);
3948   }
3949 
3950   // At this point we need to generate slow checks for the alternatives where
3951   // the quick check was inlined.  We can recognize these because the associated
3952   // label was bound.
3953   int new_flush_budget = trace->flush_budget() / choice_count;
3954   for (int i = 0; i < choice_count; i++) {
3955     AlternativeGeneration* alt_gen = alt_gens.at(i);
3956     Trace new_trace(*trace);
3957     // If there are actions to be flushed we have to limit how many times
3958     // they are flushed.  Take the budget of the parent trace and distribute
3959     // it fairly amongst the children.
3960     if (new_trace.actions() != nullptr) {
3961       new_trace.set_flush_budget(new_flush_budget);
3962     }
3963     bool next_expects_preload =
3964         i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3965     EmitOutOfLineContinuation(compiler,
3966                               &new_trace,
3967                               alternatives_->at(i),
3968                               alt_gen,
3969                               preload.preload_characters_,
3970                               next_expects_preload);
3971   }
3972 }
3973 
3974 
EmitGreedyLoop(RegExpCompiler * compiler,Trace * trace,AlternativeGenerationList * alt_gens,PreloadState * preload,GreedyLoopState * greedy_loop_state,int text_length)3975 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
3976                                   Trace* trace,
3977                                   AlternativeGenerationList* alt_gens,
3978                                   PreloadState* preload,
3979                                   GreedyLoopState* greedy_loop_state,
3980                                   int text_length) {
3981   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3982   // Here we have special handling for greedy loops containing only text nodes
3983   // and other simple nodes.  These are handled by pushing the current
3984   // position on the stack and then incrementing the current position each
3985   // time around the switch.  On backtrack we decrement the current position
3986   // and check it against the pushed value.  This avoids pushing backtrack
3987   // information for each iteration of the loop, which could take up a lot of
3988   // space.
3989   DCHECK(trace->stop_node() == nullptr);
3990   macro_assembler->PushCurrentPosition();
3991   Label greedy_match_failed;
3992   Trace greedy_match_trace;
3993   if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
3994   greedy_match_trace.set_backtrack(&greedy_match_failed);
3995   Label loop_label;
3996   macro_assembler->Bind(&loop_label);
3997   greedy_match_trace.set_stop_node(this);
3998   greedy_match_trace.set_loop_label(&loop_label);
3999   alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
4000   macro_assembler->Bind(&greedy_match_failed);
4001 
4002   Label second_choice;  // For use in greedy matches.
4003   macro_assembler->Bind(&second_choice);
4004 
4005   Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
4006 
4007   EmitChoices(compiler,
4008               alt_gens,
4009               1,
4010               new_trace,
4011               preload);
4012 
4013   macro_assembler->Bind(greedy_loop_state->label());
4014   // If we have unwound to the bottom then backtrack.
4015   macro_assembler->CheckGreedyLoop(trace->backtrack());
4016   // Otherwise try the second priority at an earlier position.
4017   macro_assembler->AdvanceCurrentPosition(-text_length);
4018   macro_assembler->GoTo(&second_choice);
4019   return new_trace;
4020 }
4021 
EmitOptimizedUnanchoredSearch(RegExpCompiler * compiler,Trace * trace)4022 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
4023                                               Trace* trace) {
4024   int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
4025   if (alternatives_->length() != 2) return eats_at_least;
4026 
4027   GuardedAlternative alt1 = alternatives_->at(1);
4028   if (alt1.guards() != nullptr && alt1.guards()->length() != 0) {
4029     return eats_at_least;
4030   }
4031   RegExpNode* eats_anything_node = alt1.node();
4032   if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
4033     return eats_at_least;
4034   }
4035 
4036   // Really we should be creating a new trace when we execute this function,
4037   // but there is no need, because the code it generates cannot backtrack, and
4038   // we always arrive here with a trivial trace (since it's the entry to a
4039   // loop.  That also implies that there are no preloaded characters, which is
4040   // good, because it means we won't be violating any assumptions by
4041   // overwriting those characters with new load instructions.
4042   DCHECK(trace->is_trivial());
4043 
4044   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4045   Isolate* isolate = macro_assembler->isolate();
4046   // At this point we know that we are at a non-greedy loop that will eat
4047   // any character one at a time.  Any non-anchored regexp has such a
4048   // loop prepended to it in order to find where it starts.  We look for
4049   // a pattern of the form ...abc... where we can look 6 characters ahead
4050   // and step forwards 3 if the character is not one of abc.  Abc need
4051   // not be atoms, they can be any reasonably limited character class or
4052   // small alternation.
4053   BoyerMooreLookahead* bm = bm_info(false);
4054   if (bm == nullptr) {
4055     eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4056                         EatsAtLeast(kMaxLookaheadForBoyerMoore,
4057                                     kRecursionBudget,
4058                                     false));
4059     if (eats_at_least >= 1) {
4060       bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4061                                            compiler,
4062                                            zone());
4063       GuardedAlternative alt0 = alternatives_->at(0);
4064       alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
4065     }
4066   }
4067   if (bm != nullptr) {
4068     bm->EmitSkipInstructions(macro_assembler);
4069   }
4070   return eats_at_least;
4071 }
4072 
4073 
EmitChoices(RegExpCompiler * compiler,AlternativeGenerationList * alt_gens,int first_choice,Trace * trace,PreloadState * preload)4074 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
4075                              AlternativeGenerationList* alt_gens,
4076                              int first_choice,
4077                              Trace* trace,
4078                              PreloadState* preload) {
4079   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4080   SetUpPreLoad(compiler, trace, preload);
4081 
4082   // For now we just call all choices one after the other.  The idea ultimately
4083   // is to use the Dispatch table to try only the relevant ones.
4084   int choice_count = alternatives_->length();
4085 
4086   int new_flush_budget = trace->flush_budget() / choice_count;
4087 
4088   for (int i = first_choice; i < choice_count; i++) {
4089     bool is_last = i == choice_count - 1;
4090     bool fall_through_on_failure = !is_last;
4091     GuardedAlternative alternative = alternatives_->at(i);
4092     AlternativeGeneration* alt_gen = alt_gens->at(i);
4093     alt_gen->quick_check_details.set_characters(preload->preload_characters_);
4094     ZoneList<Guard*>* guards = alternative.guards();
4095     int guard_count = (guards == nullptr) ? 0 : guards->length();
4096     Trace new_trace(*trace);
4097     new_trace.set_characters_preloaded(preload->preload_is_current_ ?
4098                                          preload->preload_characters_ :
4099                                          0);
4100     if (preload->preload_has_checked_bounds_) {
4101       new_trace.set_bound_checked_up_to(preload->preload_characters_);
4102     }
4103     new_trace.quick_check_performed()->Clear();
4104     if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4105     if (!is_last) {
4106       new_trace.set_backtrack(&alt_gen->after);
4107     }
4108     alt_gen->expects_preload = preload->preload_is_current_;
4109     bool generate_full_check_inline = false;
4110     if (compiler->optimize() &&
4111         try_to_emit_quick_check_for_alternative(i == 0) &&
4112         alternative.node()->EmitQuickCheck(
4113             compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
4114             &alt_gen->possible_success, &alt_gen->quick_check_details,
4115             fall_through_on_failure)) {
4116       // Quick check was generated for this choice.
4117       preload->preload_is_current_ = true;
4118       preload->preload_has_checked_bounds_ = true;
4119       // If we generated the quick check to fall through on possible success,
4120       // we now need to generate the full check inline.
4121       if (!fall_through_on_failure) {
4122         macro_assembler->Bind(&alt_gen->possible_success);
4123         new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4124         new_trace.set_characters_preloaded(preload->preload_characters_);
4125         new_trace.set_bound_checked_up_to(preload->preload_characters_);
4126         generate_full_check_inline = true;
4127       }
4128     } else if (alt_gen->quick_check_details.cannot_match()) {
4129       if (!fall_through_on_failure) {
4130         macro_assembler->GoTo(trace->backtrack());
4131       }
4132       continue;
4133     } else {
4134       // No quick check was generated.  Put the full code here.
4135       // If this is not the first choice then there could be slow checks from
4136       // previous cases that go here when they fail.  There's no reason to
4137       // insist that they preload characters since the slow check we are about
4138       // to generate probably can't use it.
4139       if (i != first_choice) {
4140         alt_gen->expects_preload = false;
4141         new_trace.InvalidateCurrentCharacter();
4142       }
4143       generate_full_check_inline = true;
4144     }
4145     if (generate_full_check_inline) {
4146       if (new_trace.actions() != nullptr) {
4147         new_trace.set_flush_budget(new_flush_budget);
4148       }
4149       for (int j = 0; j < guard_count; j++) {
4150         GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4151       }
4152       alternative.node()->Emit(compiler, &new_trace);
4153       preload->preload_is_current_ = false;
4154     }
4155     macro_assembler->Bind(&alt_gen->after);
4156   }
4157 }
4158 
4159 
EmitOutOfLineContinuation(RegExpCompiler * compiler,Trace * trace,GuardedAlternative alternative,AlternativeGeneration * alt_gen,int preload_characters,bool next_expects_preload)4160 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4161                                            Trace* trace,
4162                                            GuardedAlternative alternative,
4163                                            AlternativeGeneration* alt_gen,
4164                                            int preload_characters,
4165                                            bool next_expects_preload) {
4166   if (!alt_gen->possible_success.is_linked()) return;
4167 
4168   RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4169   macro_assembler->Bind(&alt_gen->possible_success);
4170   Trace out_of_line_trace(*trace);
4171   out_of_line_trace.set_characters_preloaded(preload_characters);
4172   out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4173   if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4174   ZoneList<Guard*>* guards = alternative.guards();
4175   int guard_count = (guards == nullptr) ? 0 : guards->length();
4176   if (next_expects_preload) {
4177     Label reload_current_char;
4178     out_of_line_trace.set_backtrack(&reload_current_char);
4179     for (int j = 0; j < guard_count; j++) {
4180       GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4181     }
4182     alternative.node()->Emit(compiler, &out_of_line_trace);
4183     macro_assembler->Bind(&reload_current_char);
4184     // Reload the current character, since the next quick check expects that.
4185     // We don't need to check bounds here because we only get into this
4186     // code through a quick check which already did the checked load.
4187     macro_assembler->LoadCurrentCharacter(trace->cp_offset(), nullptr, false,
4188                                           preload_characters);
4189     macro_assembler->GoTo(&(alt_gen->after));
4190   } else {
4191     out_of_line_trace.set_backtrack(&(alt_gen->after));
4192     for (int j = 0; j < guard_count; j++) {
4193       GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4194     }
4195     alternative.node()->Emit(compiler, &out_of_line_trace);
4196   }
4197 }
4198 
4199 
Emit(RegExpCompiler * compiler,Trace * trace)4200 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4201   RegExpMacroAssembler* assembler = compiler->macro_assembler();
4202   LimitResult limit_result = LimitVersions(compiler, trace);
4203   if (limit_result == DONE) return;
4204   DCHECK(limit_result == CONTINUE);
4205 
4206   RecursionCheck rc(compiler);
4207 
4208   switch (action_type_) {
4209     case STORE_POSITION: {
4210       Trace::DeferredCapture
4211           new_capture(data_.u_position_register.reg,
4212                       data_.u_position_register.is_capture,
4213                       trace);
4214       Trace new_trace = *trace;
4215       new_trace.add_action(&new_capture);
4216       on_success()->Emit(compiler, &new_trace);
4217       break;
4218     }
4219     case INCREMENT_REGISTER: {
4220       Trace::DeferredIncrementRegister
4221           new_increment(data_.u_increment_register.reg);
4222       Trace new_trace = *trace;
4223       new_trace.add_action(&new_increment);
4224       on_success()->Emit(compiler, &new_trace);
4225       break;
4226     }
4227     case SET_REGISTER: {
4228       Trace::DeferredSetRegister
4229           new_set(data_.u_store_register.reg, data_.u_store_register.value);
4230       Trace new_trace = *trace;
4231       new_trace.add_action(&new_set);
4232       on_success()->Emit(compiler, &new_trace);
4233       break;
4234     }
4235     case CLEAR_CAPTURES: {
4236       Trace::DeferredClearCaptures
4237         new_capture(Interval(data_.u_clear_captures.range_from,
4238                              data_.u_clear_captures.range_to));
4239       Trace new_trace = *trace;
4240       new_trace.add_action(&new_capture);
4241       on_success()->Emit(compiler, &new_trace);
4242       break;
4243     }
4244     case BEGIN_SUBMATCH:
4245       if (!trace->is_trivial()) {
4246         trace->Flush(compiler, this);
4247       } else {
4248         assembler->WriteCurrentPositionToRegister(
4249             data_.u_submatch.current_position_register, 0);
4250         assembler->WriteStackPointerToRegister(
4251             data_.u_submatch.stack_pointer_register);
4252         on_success()->Emit(compiler, trace);
4253       }
4254       break;
4255     case EMPTY_MATCH_CHECK: {
4256       int start_pos_reg = data_.u_empty_match_check.start_register;
4257       int stored_pos = 0;
4258       int rep_reg = data_.u_empty_match_check.repetition_register;
4259       bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4260       bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4261       if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4262         // If we know we haven't advanced and there is no minimum we
4263         // can just backtrack immediately.
4264         assembler->GoTo(trace->backtrack());
4265       } else if (know_dist && stored_pos < trace->cp_offset()) {
4266         // If we know we've advanced we can generate the continuation
4267         // immediately.
4268         on_success()->Emit(compiler, trace);
4269       } else if (!trace->is_trivial()) {
4270         trace->Flush(compiler, this);
4271       } else {
4272         Label skip_empty_check;
4273         // If we have a minimum number of repetitions we check the current
4274         // number first and skip the empty check if it's not enough.
4275         if (has_minimum) {
4276           int limit = data_.u_empty_match_check.repetition_limit;
4277           assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4278         }
4279         // If the match is empty we bail out, otherwise we fall through
4280         // to the on-success continuation.
4281         assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4282                                    trace->backtrack());
4283         assembler->Bind(&skip_empty_check);
4284         on_success()->Emit(compiler, trace);
4285       }
4286       break;
4287     }
4288     case POSITIVE_SUBMATCH_SUCCESS: {
4289       if (!trace->is_trivial()) {
4290         trace->Flush(compiler, this);
4291         return;
4292       }
4293       assembler->ReadCurrentPositionFromRegister(
4294           data_.u_submatch.current_position_register);
4295       assembler->ReadStackPointerFromRegister(
4296           data_.u_submatch.stack_pointer_register);
4297       int clear_register_count = data_.u_submatch.clear_register_count;
4298       if (clear_register_count == 0) {
4299         on_success()->Emit(compiler, trace);
4300         return;
4301       }
4302       int clear_registers_from = data_.u_submatch.clear_register_from;
4303       Label clear_registers_backtrack;
4304       Trace new_trace = *trace;
4305       new_trace.set_backtrack(&clear_registers_backtrack);
4306       on_success()->Emit(compiler, &new_trace);
4307 
4308       assembler->Bind(&clear_registers_backtrack);
4309       int clear_registers_to = clear_registers_from + clear_register_count - 1;
4310       assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4311 
4312       DCHECK(trace->backtrack() == nullptr);
4313       assembler->Backtrack();
4314       return;
4315     }
4316     default:
4317       UNREACHABLE();
4318   }
4319 }
4320 
4321 
Emit(RegExpCompiler * compiler,Trace * trace)4322 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4323   RegExpMacroAssembler* assembler = compiler->macro_assembler();
4324   if (!trace->is_trivial()) {
4325     trace->Flush(compiler, this);
4326     return;
4327   }
4328 
4329   LimitResult limit_result = LimitVersions(compiler, trace);
4330   if (limit_result == DONE) return;
4331   DCHECK(limit_result == CONTINUE);
4332 
4333   RecursionCheck rc(compiler);
4334 
4335   DCHECK_EQ(start_reg_ + 1, end_reg_);
4336   if (IgnoreCase(flags_)) {
4337     assembler->CheckNotBackReferenceIgnoreCase(
4338         start_reg_, read_backward(), IsUnicode(flags_), trace->backtrack());
4339   } else {
4340     assembler->CheckNotBackReference(start_reg_, read_backward(),
4341                                      trace->backtrack());
4342   }
4343   // We are going to advance backward, so we may end up at the start.
4344   if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
4345 
4346   // Check that the back reference does not end inside a surrogate pair.
4347   if (IsUnicode(flags_) && !compiler->one_byte()) {
4348     assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
4349   }
4350   on_success()->Emit(compiler, trace);
4351 }
4352 
4353 
4354 // -------------------------------------------------------------------
4355 // Dot/dotty output
4356 
4357 
4358 #ifdef DEBUG
4359 
4360 
4361 class DotPrinter: public NodeVisitor {
4362  public:
DotPrinter(std::ostream & os,bool ignore_case)4363   DotPrinter(std::ostream& os, bool ignore_case)  // NOLINT
4364       : os_(os),
4365         ignore_case_(ignore_case) {}
4366   void PrintNode(const char* label, RegExpNode* node);
4367   void Visit(RegExpNode* node);
4368   void PrintAttributes(RegExpNode* from);
4369   void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4370 #define DECLARE_VISIT(Type)                                          \
4371   virtual void Visit##Type(Type##Node* that);
4372 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4373 #undef DECLARE_VISIT
4374  private:
4375   std::ostream& os_;
4376   bool ignore_case_;
4377 };
4378 
4379 
PrintNode(const char * label,RegExpNode * node)4380 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4381   os_ << "digraph G {\n  graph [label=\"";
4382   for (int i = 0; label[i]; i++) {
4383     switch (label[i]) {
4384       case '\\':
4385         os_ << "\\\\";
4386         break;
4387       case '"':
4388         os_ << "\"";
4389         break;
4390       default:
4391         os_ << label[i];
4392         break;
4393     }
4394   }
4395   os_ << "\"];\n";
4396   Visit(node);
4397   os_ << "}" << std::endl;
4398 }
4399 
4400 
Visit(RegExpNode * node)4401 void DotPrinter::Visit(RegExpNode* node) {
4402   if (node->info()->visited) return;
4403   node->info()->visited = true;
4404   node->Accept(this);
4405 }
4406 
4407 
PrintOnFailure(RegExpNode * from,RegExpNode * on_failure)4408 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4409   os_ << "  n" << from << " -> n" << on_failure << " [style=dotted];\n";
4410   Visit(on_failure);
4411 }
4412 
4413 
4414 class TableEntryBodyPrinter {
4415  public:
TableEntryBodyPrinter(std::ostream & os,ChoiceNode * choice)4416   TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice)  // NOLINT
4417       : os_(os),
4418         choice_(choice) {}
Call(uc16 from,DispatchTable::Entry entry)4419   void Call(uc16 from, DispatchTable::Entry entry) {
4420     OutSet* out_set = entry.out_set();
4421     for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4422       if (out_set->Get(i)) {
4423         os_ << "    n" << choice() << ":s" << from << "o" << i << " -> n"
4424             << choice()->alternatives()->at(i).node() << ";\n";
4425       }
4426     }
4427   }
4428  private:
choice()4429   ChoiceNode* choice() { return choice_; }
4430   std::ostream& os_;
4431   ChoiceNode* choice_;
4432 };
4433 
4434 
4435 class TableEntryHeaderPrinter {
4436  public:
TableEntryHeaderPrinter(std::ostream & os)4437   explicit TableEntryHeaderPrinter(std::ostream& os)  // NOLINT
4438       : first_(true),
4439         os_(os) {}
Call(uc16 from,DispatchTable::Entry entry)4440   void Call(uc16 from, DispatchTable::Entry entry) {
4441     if (first_) {
4442       first_ = false;
4443     } else {
4444       os_ << "|";
4445     }
4446     os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
4447     OutSet* out_set = entry.out_set();
4448     int priority = 0;
4449     for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4450       if (out_set->Get(i)) {
4451         if (priority > 0) os_ << "|";
4452         os_ << "<s" << from << "o" << i << "> " << priority;
4453         priority++;
4454       }
4455     }
4456     os_ << "}}";
4457   }
4458 
4459  private:
4460   bool first_;
4461   std::ostream& os_;
4462 };
4463 
4464 
4465 class AttributePrinter {
4466  public:
AttributePrinter(std::ostream & os)4467   explicit AttributePrinter(std::ostream& os)  // NOLINT
4468       : os_(os),
4469         first_(true) {}
PrintSeparator()4470   void PrintSeparator() {
4471     if (first_) {
4472       first_ = false;
4473     } else {
4474       os_ << "|";
4475     }
4476   }
PrintBit(const char * name,bool value)4477   void PrintBit(const char* name, bool value) {
4478     if (!value) return;
4479     PrintSeparator();
4480     os_ << "{" << name << "}";
4481   }
PrintPositive(const char * name,int value)4482   void PrintPositive(const char* name, int value) {
4483     if (value < 0) return;
4484     PrintSeparator();
4485     os_ << "{" << name << "|" << value << "}";
4486   }
4487 
4488  private:
4489   std::ostream& os_;
4490   bool first_;
4491 };
4492 
4493 
PrintAttributes(RegExpNode * that)4494 void DotPrinter::PrintAttributes(RegExpNode* that) {
4495   os_ << "  a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
4496       << "margin=0.1, fontsize=10, label=\"{";
4497   AttributePrinter printer(os_);
4498   NodeInfo* info = that->info();
4499   printer.PrintBit("NI", info->follows_newline_interest);
4500   printer.PrintBit("WI", info->follows_word_interest);
4501   printer.PrintBit("SI", info->follows_start_interest);
4502   Label* label = that->label();
4503   if (label->is_bound())
4504     printer.PrintPositive("@", label->pos());
4505   os_ << "}\"];\n"
4506       << "  a" << that << " -> n" << that
4507       << " [style=dashed, color=grey, arrowhead=none];\n";
4508 }
4509 
4510 
4511 static const bool kPrintDispatchTable = false;
VisitChoice(ChoiceNode * that)4512 void DotPrinter::VisitChoice(ChoiceNode* that) {
4513   if (kPrintDispatchTable) {
4514     os_ << "  n" << that << " [shape=Mrecord, label=\"";
4515     TableEntryHeaderPrinter header_printer(os_);
4516     that->GetTable(ignore_case_)->ForEach(&header_printer);
4517     os_ << "\"]\n";
4518     PrintAttributes(that);
4519     TableEntryBodyPrinter body_printer(os_, that);
4520     that->GetTable(ignore_case_)->ForEach(&body_printer);
4521   } else {
4522     os_ << "  n" << that << " [shape=Mrecord, label=\"?\"];\n";
4523     for (int i = 0; i < that->alternatives()->length(); i++) {
4524       GuardedAlternative alt = that->alternatives()->at(i);
4525       os_ << "  n" << that << " -> n" << alt.node();
4526     }
4527   }
4528   for (int i = 0; i < that->alternatives()->length(); i++) {
4529     GuardedAlternative alt = that->alternatives()->at(i);
4530     alt.node()->Accept(this);
4531   }
4532 }
4533 
4534 
VisitText(TextNode * that)4535 void DotPrinter::VisitText(TextNode* that) {
4536   Zone* zone = that->zone();
4537   os_ << "  n" << that << " [label=\"";
4538   for (int i = 0; i < that->elements()->length(); i++) {
4539     if (i > 0) os_ << " ";
4540     TextElement elm = that->elements()->at(i);
4541     switch (elm.text_type()) {
4542       case TextElement::ATOM: {
4543         Vector<const uc16> data = elm.atom()->data();
4544         for (int i = 0; i < data.length(); i++) {
4545           os_ << static_cast<char>(data[i]);
4546         }
4547         break;
4548       }
4549       case TextElement::CHAR_CLASS: {
4550         RegExpCharacterClass* node = elm.char_class();
4551         os_ << "[";
4552         if (node->is_negated()) os_ << "^";
4553         for (int j = 0; j < node->ranges(zone)->length(); j++) {
4554           CharacterRange range = node->ranges(zone)->at(j);
4555           os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
4556         }
4557         os_ << "]";
4558         break;
4559       }
4560       default:
4561         UNREACHABLE();
4562     }
4563   }
4564   os_ << "\", shape=box, peripheries=2];\n";
4565   PrintAttributes(that);
4566   os_ << "  n" << that << " -> n" << that->on_success() << ";\n";
4567   Visit(that->on_success());
4568 }
4569 
4570 
VisitBackReference(BackReferenceNode * that)4571 void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4572   os_ << "  n" << that << " [label=\"$" << that->start_register() << "..$"
4573       << that->end_register() << "\", shape=doubleoctagon];\n";
4574   PrintAttributes(that);
4575   os_ << "  n" << that << " -> n" << that->on_success() << ";\n";
4576   Visit(that->on_success());
4577 }
4578 
4579 
VisitEnd(EndNode * that)4580 void DotPrinter::VisitEnd(EndNode* that) {
4581   os_ << "  n" << that << " [style=bold, shape=point];\n";
4582   PrintAttributes(that);
4583 }
4584 
4585 
VisitAssertion(AssertionNode * that)4586 void DotPrinter::VisitAssertion(AssertionNode* that) {
4587   os_ << "  n" << that << " [";
4588   switch (that->assertion_type()) {
4589     case AssertionNode::AT_END:
4590       os_ << "label=\"$\", shape=septagon";
4591       break;
4592     case AssertionNode::AT_START:
4593       os_ << "label=\"^\", shape=septagon";
4594       break;
4595     case AssertionNode::AT_BOUNDARY:
4596       os_ << "label=\"\\b\", shape=septagon";
4597       break;
4598     case AssertionNode::AT_NON_BOUNDARY:
4599       os_ << "label=\"\\B\", shape=septagon";
4600       break;
4601     case AssertionNode::AFTER_NEWLINE:
4602       os_ << "label=\"(?<=\\n)\", shape=septagon";
4603       break;
4604   }
4605   os_ << "];\n";
4606   PrintAttributes(that);
4607   RegExpNode* successor = that->on_success();
4608   os_ << "  n" << that << " -> n" << successor << ";\n";
4609   Visit(successor);
4610 }
4611 
4612 
VisitAction(ActionNode * that)4613 void DotPrinter::VisitAction(ActionNode* that) {
4614   os_ << "  n" << that << " [";
4615   switch (that->action_type_) {
4616     case ActionNode::SET_REGISTER:
4617       os_ << "label=\"$" << that->data_.u_store_register.reg
4618           << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
4619       break;
4620     case ActionNode::INCREMENT_REGISTER:
4621       os_ << "label=\"$" << that->data_.u_increment_register.reg
4622           << "++\", shape=octagon";
4623       break;
4624     case ActionNode::STORE_POSITION:
4625       os_ << "label=\"$" << that->data_.u_position_register.reg
4626           << ":=$pos\", shape=octagon";
4627       break;
4628     case ActionNode::BEGIN_SUBMATCH:
4629       os_ << "label=\"$" << that->data_.u_submatch.current_position_register
4630           << ":=$pos,begin\", shape=septagon";
4631       break;
4632     case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4633       os_ << "label=\"escape\", shape=septagon";
4634       break;
4635     case ActionNode::EMPTY_MATCH_CHECK:
4636       os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
4637           << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
4638           << "<" << that->data_.u_empty_match_check.repetition_limit
4639           << "?\", shape=septagon";
4640       break;
4641     case ActionNode::CLEAR_CAPTURES: {
4642       os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
4643           << " to $" << that->data_.u_clear_captures.range_to
4644           << "\", shape=septagon";
4645       break;
4646     }
4647   }
4648   os_ << "];\n";
4649   PrintAttributes(that);
4650   RegExpNode* successor = that->on_success();
4651   os_ << "  n" << that << " -> n" << successor << ";\n";
4652   Visit(successor);
4653 }
4654 
4655 
4656 class DispatchTableDumper {
4657  public:
DispatchTableDumper(std::ostream & os)4658   explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
4659   void Call(uc16 key, DispatchTable::Entry entry);
4660  private:
4661   std::ostream& os_;
4662 };
4663 
4664 
Call(uc16 key,DispatchTable::Entry entry)4665 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4666   os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
4667   OutSet* set = entry.out_set();
4668   bool first = true;
4669   for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4670     if (set->Get(i)) {
4671       if (first) {
4672         first = false;
4673       } else {
4674         os_ << ", ";
4675       }
4676       os_ << i;
4677     }
4678   }
4679   os_ << "}\n";
4680 }
4681 
4682 
Dump()4683 void DispatchTable::Dump() {
4684   OFStream os(stderr);
4685   DispatchTableDumper dumper(os);
4686   tree()->ForEach(&dumper);
4687 }
4688 
4689 
DotPrint(const char * label,RegExpNode * node,bool ignore_case)4690 void RegExpEngine::DotPrint(const char* label,
4691                             RegExpNode* node,
4692                             bool ignore_case) {
4693   StdoutStream os;
4694   DotPrinter printer(os, ignore_case);
4695   printer.PrintNode(label, node);
4696 }
4697 
4698 
4699 #endif  // DEBUG
4700 
4701 
4702 // -------------------------------------------------------------------
4703 // Tree to graph conversion
4704 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4705 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4706                                RegExpNode* on_success) {
4707   ZoneList<TextElement>* elms =
4708       new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4709   elms->Add(TextElement::Atom(this), compiler->zone());
4710   return new (compiler->zone())
4711       TextNode(elms, compiler->read_backward(), on_success);
4712 }
4713 
4714 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4715 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4716                                RegExpNode* on_success) {
4717   return new (compiler->zone())
4718       TextNode(elements(), compiler->read_backward(), on_success);
4719 }
4720 
4721 
CompareInverseRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)4722 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4723                                  const int* special_class,
4724                                  int length) {
4725   length--;  // Remove final marker.
4726   DCHECK_EQ(kRangeEndMarker, special_class[length]);
4727   DCHECK_NE(0, ranges->length());
4728   DCHECK_NE(0, length);
4729   DCHECK_NE(0, special_class[0]);
4730   if (ranges->length() != (length >> 1) + 1) {
4731     return false;
4732   }
4733   CharacterRange range = ranges->at(0);
4734   if (range.from() != 0) {
4735     return false;
4736   }
4737   for (int i = 0; i < length; i += 2) {
4738     if (special_class[i] != (range.to() + 1)) {
4739       return false;
4740     }
4741     range = ranges->at((i >> 1) + 1);
4742     if (special_class[i+1] != range.from()) {
4743       return false;
4744     }
4745   }
4746   if (range.to() != String::kMaxCodePoint) {
4747     return false;
4748   }
4749   return true;
4750 }
4751 
4752 
CompareRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)4753 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4754                           const int* special_class,
4755                           int length) {
4756   length--;  // Remove final marker.
4757   DCHECK_EQ(kRangeEndMarker, special_class[length]);
4758   if (ranges->length() * 2 != length) {
4759     return false;
4760   }
4761   for (int i = 0; i < length; i += 2) {
4762     CharacterRange range = ranges->at(i >> 1);
4763     if (range.from() != special_class[i] ||
4764         range.to() != special_class[i + 1] - 1) {
4765       return false;
4766     }
4767   }
4768   return true;
4769 }
4770 
4771 
is_standard(Zone * zone)4772 bool RegExpCharacterClass::is_standard(Zone* zone) {
4773   // TODO(lrn): Remove need for this function, by not throwing away information
4774   // along the way.
4775   if (is_negated()) {
4776     return false;
4777   }
4778   if (set_.is_standard()) {
4779     return true;
4780   }
4781   if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4782     set_.set_standard_set_type('s');
4783     return true;
4784   }
4785   if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4786     set_.set_standard_set_type('S');
4787     return true;
4788   }
4789   if (CompareInverseRanges(set_.ranges(zone),
4790                            kLineTerminatorRanges,
4791                            kLineTerminatorRangeCount)) {
4792     set_.set_standard_set_type('.');
4793     return true;
4794   }
4795   if (CompareRanges(set_.ranges(zone),
4796                     kLineTerminatorRanges,
4797                     kLineTerminatorRangeCount)) {
4798     set_.set_standard_set_type('n');
4799     return true;
4800   }
4801   if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4802     set_.set_standard_set_type('w');
4803     return true;
4804   }
4805   if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4806     set_.set_standard_set_type('W');
4807     return true;
4808   }
4809   return false;
4810 }
4811 
4812 
UnicodeRangeSplitter(Zone * zone,ZoneList<CharacterRange> * base)4813 UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
4814                                            ZoneList<CharacterRange>* base)
4815     : zone_(zone),
4816       table_(zone),
4817       bmp_(nullptr),
4818       lead_surrogates_(nullptr),
4819       trail_surrogates_(nullptr),
4820       non_bmp_(nullptr) {
4821   // The unicode range splitter categorizes given character ranges into:
4822   // - Code points from the BMP representable by one code unit.
4823   // - Code points outside the BMP that need to be split into surrogate pairs.
4824   // - Lone lead surrogates.
4825   // - Lone trail surrogates.
4826   // Lone surrogates are valid code points, even though no actual characters.
4827   // They require special matching to make sure we do not split surrogate pairs.
4828   // We use the dispatch table to accomplish this. The base range is split up
4829   // by the table by the overlay ranges, and the Call callback is used to
4830   // filter and collect ranges for each category.
4831   for (int i = 0; i < base->length(); i++) {
4832     table_.AddRange(base->at(i), kBase, zone_);
4833   }
4834   // Add overlay ranges.
4835   table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
4836                   kBmpCodePoints, zone_);
4837   table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
4838                   kLeadSurrogates, zone_);
4839   table_.AddRange(
4840       CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4841       kTrailSurrogates, zone_);
4842   table_.AddRange(
4843       CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
4844       kBmpCodePoints, zone_);
4845   table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
4846                   kNonBmpCodePoints, zone_);
4847   table_.ForEach(this);
4848 }
4849 
4850 
Call(uc32 from,DispatchTable::Entry entry)4851 void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
4852   OutSet* outset = entry.out_set();
4853   if (!outset->Get(kBase)) return;
4854   ZoneList<CharacterRange>** target = nullptr;
4855   if (outset->Get(kBmpCodePoints)) {
4856     target = &bmp_;
4857   } else if (outset->Get(kLeadSurrogates)) {
4858     target = &lead_surrogates_;
4859   } else if (outset->Get(kTrailSurrogates)) {
4860     target = &trail_surrogates_;
4861   } else {
4862     DCHECK(outset->Get(kNonBmpCodePoints));
4863     target = &non_bmp_;
4864   }
4865   if (*target == nullptr)
4866     *target = new (zone_) ZoneList<CharacterRange>(2, zone_);
4867   (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
4868 }
4869 
AddBmpCharacters(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)4870 void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
4871                       RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
4872   ZoneList<CharacterRange>* bmp = splitter->bmp();
4873   if (bmp == nullptr) return;
4874   JSRegExp::Flags default_flags = JSRegExp::Flags();
4875   result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
4876       compiler->zone(), bmp, compiler->read_backward(), on_success,
4877       default_flags)));
4878 }
4879 
AddNonBmpSurrogatePairs(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)4880 void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
4881                              RegExpNode* on_success,
4882                              UnicodeRangeSplitter* splitter) {
4883   ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
4884   if (non_bmp == nullptr) return;
4885   DCHECK(!compiler->one_byte());
4886   Zone* zone = compiler->zone();
4887   JSRegExp::Flags default_flags = JSRegExp::Flags();
4888   CharacterRange::Canonicalize(non_bmp);
4889   for (int i = 0; i < non_bmp->length(); i++) {
4890     // Match surrogate pair.
4891     // E.g. [\u10005-\u11005] becomes
4892     //      \ud800[\udc05-\udfff]|
4893     //      [\ud801-\ud803][\udc00-\udfff]|
4894     //      \ud804[\udc00-\udc05]
4895     uc32 from = non_bmp->at(i).from();
4896     uc32 to = non_bmp->at(i).to();
4897     uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
4898     uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
4899     uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
4900     uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
4901     if (from_l == to_l) {
4902       // The lead surrogate is the same.
4903       result->AddAlternative(
4904           GuardedAlternative(TextNode::CreateForSurrogatePair(
4905               zone, CharacterRange::Singleton(from_l),
4906               CharacterRange::Range(from_t, to_t), compiler->read_backward(),
4907               on_success, default_flags)));
4908     } else {
4909       if (from_t != kTrailSurrogateStart) {
4910         // Add [from_l][from_t-\udfff]
4911         result->AddAlternative(
4912             GuardedAlternative(TextNode::CreateForSurrogatePair(
4913                 zone, CharacterRange::Singleton(from_l),
4914                 CharacterRange::Range(from_t, kTrailSurrogateEnd),
4915                 compiler->read_backward(), on_success, default_flags)));
4916         from_l++;
4917       }
4918       if (to_t != kTrailSurrogateEnd) {
4919         // Add [to_l][\udc00-to_t]
4920         result->AddAlternative(
4921             GuardedAlternative(TextNode::CreateForSurrogatePair(
4922                 zone, CharacterRange::Singleton(to_l),
4923                 CharacterRange::Range(kTrailSurrogateStart, to_t),
4924                 compiler->read_backward(), on_success, default_flags)));
4925         to_l--;
4926       }
4927       if (from_l <= to_l) {
4928         // Add [from_l-to_l][\udc00-\udfff]
4929         result->AddAlternative(
4930             GuardedAlternative(TextNode::CreateForSurrogatePair(
4931                 zone, CharacterRange::Range(from_l, to_l),
4932                 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4933                 compiler->read_backward(), on_success, default_flags)));
4934       }
4935     }
4936   }
4937 }
4938 
NegativeLookaroundAgainstReadDirectionAndMatch(RegExpCompiler * compiler,ZoneList<CharacterRange> * lookbehind,ZoneList<CharacterRange> * match,RegExpNode * on_success,bool read_backward,JSRegExp::Flags flags)4939 RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
4940     RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
4941     ZoneList<CharacterRange>* match, RegExpNode* on_success, bool read_backward,
4942     JSRegExp::Flags flags) {
4943   Zone* zone = compiler->zone();
4944   RegExpNode* match_node = TextNode::CreateForCharacterRanges(
4945       zone, match, read_backward, on_success, flags);
4946   int stack_register = compiler->UnicodeLookaroundStackRegister();
4947   int position_register = compiler->UnicodeLookaroundPositionRegister();
4948   RegExpLookaround::Builder lookaround(false, match_node, stack_register,
4949                                        position_register);
4950   RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
4951       zone, lookbehind, !read_backward, lookaround.on_match_success(), flags);
4952   return lookaround.ForMatch(negative_match);
4953 }
4954 
MatchAndNegativeLookaroundInReadDirection(RegExpCompiler * compiler,ZoneList<CharacterRange> * match,ZoneList<CharacterRange> * lookahead,RegExpNode * on_success,bool read_backward,JSRegExp::Flags flags)4955 RegExpNode* MatchAndNegativeLookaroundInReadDirection(
4956     RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
4957     ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
4958     bool read_backward, JSRegExp::Flags flags) {
4959   Zone* zone = compiler->zone();
4960   int stack_register = compiler->UnicodeLookaroundStackRegister();
4961   int position_register = compiler->UnicodeLookaroundPositionRegister();
4962   RegExpLookaround::Builder lookaround(false, on_success, stack_register,
4963                                        position_register);
4964   RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
4965       zone, lookahead, read_backward, lookaround.on_match_success(), flags);
4966   return TextNode::CreateForCharacterRanges(
4967       zone, match, read_backward, lookaround.ForMatch(negative_match), flags);
4968 }
4969 
AddLoneLeadSurrogates(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)4970 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
4971                            RegExpNode* on_success,
4972                            UnicodeRangeSplitter* splitter) {
4973   JSRegExp::Flags default_flags = JSRegExp::Flags();
4974   ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
4975   if (lead_surrogates == nullptr) return;
4976   Zone* zone = compiler->zone();
4977   // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
4978   ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
4979       zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
4980 
4981   RegExpNode* match;
4982   if (compiler->read_backward()) {
4983     // Reading backward. Assert that reading forward, there is no trail
4984     // surrogate, and then backward match the lead surrogate.
4985     match = NegativeLookaroundAgainstReadDirectionAndMatch(
4986         compiler, trail_surrogates, lead_surrogates, on_success, true,
4987         default_flags);
4988   } else {
4989     // Reading forward. Forward match the lead surrogate and assert that
4990     // no trail surrogate follows.
4991     match = MatchAndNegativeLookaroundInReadDirection(
4992         compiler, lead_surrogates, trail_surrogates, on_success, false,
4993         default_flags);
4994   }
4995   result->AddAlternative(GuardedAlternative(match));
4996 }
4997 
AddLoneTrailSurrogates(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)4998 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
4999                             RegExpNode* on_success,
5000                             UnicodeRangeSplitter* splitter) {
5001   JSRegExp::Flags default_flags = JSRegExp::Flags();
5002   ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
5003   if (trail_surrogates == nullptr) return;
5004   Zone* zone = compiler->zone();
5005   // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
5006   ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
5007       zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
5008 
5009   RegExpNode* match;
5010   if (compiler->read_backward()) {
5011     // Reading backward. Backward match the trail surrogate and assert that no
5012     // lead surrogate precedes it.
5013     match = MatchAndNegativeLookaroundInReadDirection(
5014         compiler, trail_surrogates, lead_surrogates, on_success, true,
5015         default_flags);
5016   } else {
5017     // Reading forward. Assert that reading backward, there is no lead
5018     // surrogate, and then forward match the trail surrogate.
5019     match = NegativeLookaroundAgainstReadDirectionAndMatch(
5020         compiler, lead_surrogates, trail_surrogates, on_success, false,
5021         default_flags);
5022   }
5023   result->AddAlternative(GuardedAlternative(match));
5024 }
5025 
UnanchoredAdvance(RegExpCompiler * compiler,RegExpNode * on_success)5026 RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
5027                               RegExpNode* on_success) {
5028   // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
5029   DCHECK(!compiler->read_backward());
5030   Zone* zone = compiler->zone();
5031   // Advance any character. If the character happens to be a lead surrogate and
5032   // we advanced into the middle of a surrogate pair, it will work out, as
5033   // nothing will match from there. We will have to advance again, consuming
5034   // the associated trail surrogate.
5035   ZoneList<CharacterRange>* range = CharacterRange::List(
5036       zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
5037   JSRegExp::Flags default_flags = JSRegExp::Flags();
5038   return TextNode::CreateForCharacterRanges(zone, range, false, on_success,
5039                                             default_flags);
5040 }
5041 
AddUnicodeCaseEquivalents(ZoneList<CharacterRange> * ranges,Zone * zone)5042 void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
5043 #ifdef V8_INTL_SUPPORT
5044   DCHECK(CharacterRange::IsCanonical(ranges));
5045 
5046   // Micro-optimization to avoid passing large ranges to UnicodeSet::closeOver.
5047   // See also https://crbug.com/v8/6727.
5048   // TODO(jgruber): This only covers the special case of the {0,0x10FFFF} range,
5049   // which we use frequently internally. But large ranges can also easily be
5050   // created by the user. We might want to have a more general caching mechanism
5051   // for such ranges.
5052   if (ranges->length() == 1 && ranges->at(0).IsEverything(kNonBmpEnd)) return;
5053 
5054   // Use ICU to compute the case fold closure over the ranges.
5055   icu::UnicodeSet set;
5056   for (int i = 0; i < ranges->length(); i++) {
5057     set.add(ranges->at(i).from(), ranges->at(i).to());
5058   }
5059   ranges->Clear();
5060   set.closeOver(USET_CASE_INSENSITIVE);
5061   // Full case mapping map single characters to multiple characters.
5062   // Those are represented as strings in the set. Remove them so that
5063   // we end up with only simple and common case mappings.
5064   set.removeAllStrings();
5065   for (int i = 0; i < set.getRangeCount(); i++) {
5066     ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
5067                 zone);
5068   }
5069   // No errors and everything we collected have been ranges.
5070   CharacterRange::Canonicalize(ranges);
5071 #endif  // V8_INTL_SUPPORT
5072 }
5073 
5074 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5075 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
5076                                          RegExpNode* on_success) {
5077   set_.Canonicalize();
5078   Zone* zone = compiler->zone();
5079   ZoneList<CharacterRange>* ranges = this->ranges(zone);
5080   if (NeedsUnicodeCaseEquivalents(flags_)) {
5081     AddUnicodeCaseEquivalents(ranges, zone);
5082   }
5083   if (IsUnicode(flags_) && !compiler->one_byte() &&
5084       !contains_split_surrogate()) {
5085     if (is_negated()) {
5086       ZoneList<CharacterRange>* negated =
5087           new (zone) ZoneList<CharacterRange>(2, zone);
5088       CharacterRange::Negate(ranges, negated, zone);
5089       ranges = negated;
5090     }
5091     if (ranges->length() == 0) {
5092       JSRegExp::Flags default_flags;
5093       RegExpCharacterClass* fail =
5094           new (zone) RegExpCharacterClass(zone, ranges, default_flags);
5095       return new (zone) TextNode(fail, compiler->read_backward(), on_success);
5096     }
5097     if (standard_type() == '*') {
5098       return UnanchoredAdvance(compiler, on_success);
5099     } else {
5100       ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5101       UnicodeRangeSplitter splitter(zone, ranges);
5102       AddBmpCharacters(compiler, result, on_success, &splitter);
5103       AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
5104       AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
5105       AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
5106       return result;
5107     }
5108   } else {
5109     return new (zone) TextNode(this, compiler->read_backward(), on_success);
5110   }
5111 }
5112 
5113 
CompareFirstChar(RegExpTree * const * a,RegExpTree * const * b)5114 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
5115   RegExpAtom* atom1 = (*a)->AsAtom();
5116   RegExpAtom* atom2 = (*b)->AsAtom();
5117   uc16 character1 = atom1->data().at(0);
5118   uc16 character2 = atom2->data().at(0);
5119   if (character1 < character2) return -1;
5120   if (character1 > character2) return 1;
5121   return 0;
5122 }
5123 
5124 
Canonical(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,unibrow::uchar c)5125 static unibrow::uchar Canonical(
5126     unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5127     unibrow::uchar c) {
5128   unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
5129   int length = canonicalize->get(c, '\0', chars);
5130   DCHECK_LE(length, 1);
5131   unibrow::uchar canonical = c;
5132   if (length == 1) canonical = chars[0];
5133   return canonical;
5134 }
5135 
5136 
CompareFirstCharCaseIndependent(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,RegExpTree * const * a,RegExpTree * const * b)5137 int CompareFirstCharCaseIndependent(
5138     unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5139     RegExpTree* const* a, RegExpTree* const* b) {
5140   RegExpAtom* atom1 = (*a)->AsAtom();
5141   RegExpAtom* atom2 = (*b)->AsAtom();
5142   unibrow::uchar character1 = atom1->data().at(0);
5143   unibrow::uchar character2 = atom2->data().at(0);
5144   if (character1 == character2) return 0;
5145   if (character1 >= 'a' || character2 >= 'a') {
5146     character1 = Canonical(canonicalize, character1);
5147     character2 = Canonical(canonicalize, character2);
5148   }
5149   return static_cast<int>(character1) - static_cast<int>(character2);
5150 }
5151 
5152 
5153 // We can stable sort runs of atoms, since the order does not matter if they
5154 // start with different characters.
5155 // Returns true if any consecutive atoms were found.
SortConsecutiveAtoms(RegExpCompiler * compiler)5156 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
5157   ZoneList<RegExpTree*>* alternatives = this->alternatives();
5158   int length = alternatives->length();
5159   bool found_consecutive_atoms = false;
5160   for (int i = 0; i < length; i++) {
5161     while (i < length) {
5162       RegExpTree* alternative = alternatives->at(i);
5163       if (alternative->IsAtom()) break;
5164       i++;
5165     }
5166     // i is length or it is the index of an atom.
5167     if (i == length) break;
5168     int first_atom = i;
5169     JSRegExp::Flags flags = alternatives->at(i)->AsAtom()->flags();
5170     i++;
5171     while (i < length) {
5172       RegExpTree* alternative = alternatives->at(i);
5173       if (!alternative->IsAtom()) break;
5174       if (alternative->AsAtom()->flags() != flags) break;
5175       i++;
5176     }
5177     // Sort atoms to get ones with common prefixes together.
5178     // This step is more tricky if we are in a case-independent regexp,
5179     // because it would change /is|I/ to /I|is/, and order matters when
5180     // the regexp parts don't match only disjoint starting points. To fix
5181     // this we have a version of CompareFirstChar that uses case-
5182     // independent character classes for comparison.
5183     DCHECK_LT(first_atom, alternatives->length());
5184     DCHECK_LE(i, alternatives->length());
5185     DCHECK_LE(first_atom, i);
5186     if (IgnoreCase(flags)) {
5187       unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5188           compiler->isolate()->regexp_macro_assembler_canonicalize();
5189       auto compare_closure =
5190           [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
5191             return CompareFirstCharCaseIndependent(canonicalize, a, b);
5192           };
5193       alternatives->StableSort(compare_closure, first_atom, i - first_atom);
5194     } else {
5195       alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
5196     }
5197     if (i - first_atom > 1) found_consecutive_atoms = true;
5198   }
5199   return found_consecutive_atoms;
5200 }
5201 
5202 
5203 // Optimizes ab|ac|az to a(?:b|c|d).
RationalizeConsecutiveAtoms(RegExpCompiler * compiler)5204 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
5205   Zone* zone = compiler->zone();
5206   ZoneList<RegExpTree*>* alternatives = this->alternatives();
5207   int length = alternatives->length();
5208 
5209   int write_posn = 0;
5210   int i = 0;
5211   while (i < length) {
5212     RegExpTree* alternative = alternatives->at(i);
5213     if (!alternative->IsAtom()) {
5214       alternatives->at(write_posn++) = alternatives->at(i);
5215       i++;
5216       continue;
5217     }
5218     RegExpAtom* const atom = alternative->AsAtom();
5219     JSRegExp::Flags flags = atom->flags();
5220     unibrow::uchar common_prefix = atom->data().at(0);
5221     int first_with_prefix = i;
5222     int prefix_length = atom->length();
5223     i++;
5224     while (i < length) {
5225       alternative = alternatives->at(i);
5226       if (!alternative->IsAtom()) break;
5227       RegExpAtom* const atom = alternative->AsAtom();
5228       if (atom->flags() != flags) break;
5229       unibrow::uchar new_prefix = atom->data().at(0);
5230       if (new_prefix != common_prefix) {
5231         if (!IgnoreCase(flags)) break;
5232         unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5233             compiler->isolate()->regexp_macro_assembler_canonicalize();
5234         new_prefix = Canonical(canonicalize, new_prefix);
5235         common_prefix = Canonical(canonicalize, common_prefix);
5236         if (new_prefix != common_prefix) break;
5237       }
5238       prefix_length = Min(prefix_length, atom->length());
5239       i++;
5240     }
5241     if (i > first_with_prefix + 2) {
5242       // Found worthwhile run of alternatives with common prefix of at least one
5243       // character.  The sorting function above did not sort on more than one
5244       // character for reasons of correctness, but there may still be a longer
5245       // common prefix if the terms were similar or presorted in the input.
5246       // Find out how long the common prefix is.
5247       int run_length = i - first_with_prefix;
5248       RegExpAtom* const atom = alternatives->at(first_with_prefix)->AsAtom();
5249       for (int j = 1; j < run_length && prefix_length > 1; j++) {
5250         RegExpAtom* old_atom =
5251             alternatives->at(j + first_with_prefix)->AsAtom();
5252         for (int k = 1; k < prefix_length; k++) {
5253           if (atom->data().at(k) != old_atom->data().at(k)) {
5254             prefix_length = k;
5255             break;
5256           }
5257         }
5258       }
5259       RegExpAtom* prefix = new (zone)
5260           RegExpAtom(atom->data().SubVector(0, prefix_length), flags);
5261       ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
5262       pair->Add(prefix, zone);
5263       ZoneList<RegExpTree*>* suffixes =
5264           new (zone) ZoneList<RegExpTree*>(run_length, zone);
5265       for (int j = 0; j < run_length; j++) {
5266         RegExpAtom* old_atom =
5267             alternatives->at(j + first_with_prefix)->AsAtom();
5268         int len = old_atom->length();
5269         if (len == prefix_length) {
5270           suffixes->Add(new (zone) RegExpEmpty(), zone);
5271         } else {
5272           RegExpTree* suffix = new (zone) RegExpAtom(
5273               old_atom->data().SubVector(prefix_length, old_atom->length()),
5274               flags);
5275           suffixes->Add(suffix, zone);
5276         }
5277       }
5278       pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
5279       alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
5280     } else {
5281       // Just copy any non-worthwhile alternatives.
5282       for (int j = first_with_prefix; j < i; j++) {
5283         alternatives->at(write_posn++) = alternatives->at(j);
5284       }
5285     }
5286   }
5287   alternatives->Rewind(write_posn);  // Trim end of array.
5288 }
5289 
5290 
5291 // Optimizes b|c|z to [bcz].
FixSingleCharacterDisjunctions(RegExpCompiler * compiler)5292 void RegExpDisjunction::FixSingleCharacterDisjunctions(
5293     RegExpCompiler* compiler) {
5294   Zone* zone = compiler->zone();
5295   ZoneList<RegExpTree*>* alternatives = this->alternatives();
5296   int length = alternatives->length();
5297 
5298   int write_posn = 0;
5299   int i = 0;
5300   while (i < length) {
5301     RegExpTree* alternative = alternatives->at(i);
5302     if (!alternative->IsAtom()) {
5303       alternatives->at(write_posn++) = alternatives->at(i);
5304       i++;
5305       continue;
5306     }
5307     RegExpAtom* const atom = alternative->AsAtom();
5308     if (atom->length() != 1) {
5309       alternatives->at(write_posn++) = alternatives->at(i);
5310       i++;
5311       continue;
5312     }
5313     JSRegExp::Flags flags = atom->flags();
5314     DCHECK_IMPLIES(IsUnicode(flags),
5315                    !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
5316     bool contains_trail_surrogate =
5317         unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
5318     int first_in_run = i;
5319     i++;
5320     // Find a run of single-character atom alternatives that have identical
5321     // flags (case independence and unicode-ness).
5322     while (i < length) {
5323       alternative = alternatives->at(i);
5324       if (!alternative->IsAtom()) break;
5325       RegExpAtom* const atom = alternative->AsAtom();
5326       if (atom->length() != 1) break;
5327       if (atom->flags() != flags) break;
5328       DCHECK_IMPLIES(IsUnicode(flags),
5329                      !unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
5330       contains_trail_surrogate |=
5331           unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
5332       i++;
5333     }
5334     if (i > first_in_run + 1) {
5335       // Found non-trivial run of single-character alternatives.
5336       int run_length = i - first_in_run;
5337       ZoneList<CharacterRange>* ranges =
5338           new (zone) ZoneList<CharacterRange>(2, zone);
5339       for (int j = 0; j < run_length; j++) {
5340         RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
5341         DCHECK_EQ(old_atom->length(), 1);
5342         ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
5343       }
5344       RegExpCharacterClass::CharacterClassFlags character_class_flags;
5345       if (IsUnicode(flags) && contains_trail_surrogate) {
5346         character_class_flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
5347       }
5348       alternatives->at(write_posn++) = new (zone)
5349           RegExpCharacterClass(zone, ranges, flags, character_class_flags);
5350     } else {
5351       // Just copy any trivial alternatives.
5352       for (int j = first_in_run; j < i; j++) {
5353         alternatives->at(write_posn++) = alternatives->at(j);
5354       }
5355     }
5356   }
5357   alternatives->Rewind(write_posn);  // Trim end of array.
5358 }
5359 
5360 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5361 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
5362                                       RegExpNode* on_success) {
5363   ZoneList<RegExpTree*>* alternatives = this->alternatives();
5364 
5365   if (alternatives->length() > 2) {
5366     bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
5367     if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
5368     FixSingleCharacterDisjunctions(compiler);
5369     if (alternatives->length() == 1) {
5370       return alternatives->at(0)->ToNode(compiler, on_success);
5371     }
5372   }
5373 
5374   int length = alternatives->length();
5375 
5376   ChoiceNode* result =
5377       new(compiler->zone()) ChoiceNode(length, compiler->zone());
5378   for (int i = 0; i < length; i++) {
5379     GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
5380                                                                on_success));
5381     result->AddAlternative(alternative);
5382   }
5383   return result;
5384 }
5385 
5386 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5387 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
5388                                      RegExpNode* on_success) {
5389   return ToNode(min(),
5390                 max(),
5391                 is_greedy(),
5392                 body(),
5393                 compiler,
5394                 on_success);
5395 }
5396 
5397 
5398 // Scoped object to keep track of how much we unroll quantifier loops in the
5399 // regexp graph generator.
5400 class RegExpExpansionLimiter {
5401  public:
5402   static const int kMaxExpansionFactor = 6;
RegExpExpansionLimiter(RegExpCompiler * compiler,int factor)5403   RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
5404       : compiler_(compiler),
5405         saved_expansion_factor_(compiler->current_expansion_factor()),
5406         ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
5407     DCHECK_LT(0, factor);
5408     if (ok_to_expand_) {
5409       if (factor > kMaxExpansionFactor) {
5410         // Avoid integer overflow of the current expansion factor.
5411         ok_to_expand_ = false;
5412         compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
5413       } else {
5414         int new_factor = saved_expansion_factor_ * factor;
5415         ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
5416         compiler->set_current_expansion_factor(new_factor);
5417       }
5418     }
5419   }
5420 
~RegExpExpansionLimiter()5421   ~RegExpExpansionLimiter() {
5422     compiler_->set_current_expansion_factor(saved_expansion_factor_);
5423   }
5424 
ok_to_expand()5425   bool ok_to_expand() { return ok_to_expand_; }
5426 
5427  private:
5428   RegExpCompiler* compiler_;
5429   int saved_expansion_factor_;
5430   bool ok_to_expand_;
5431 
5432   DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
5433 };
5434 
5435 
ToNode(int min,int max,bool is_greedy,RegExpTree * body,RegExpCompiler * compiler,RegExpNode * on_success,bool not_at_start)5436 RegExpNode* RegExpQuantifier::ToNode(int min,
5437                                      int max,
5438                                      bool is_greedy,
5439                                      RegExpTree* body,
5440                                      RegExpCompiler* compiler,
5441                                      RegExpNode* on_success,
5442                                      bool not_at_start) {
5443   // x{f, t} becomes this:
5444   //
5445   //             (r++)<-.
5446   //               |     `
5447   //               |     (x)
5448   //               v     ^
5449   //      (r=0)-->(?)---/ [if r < t]
5450   //               |
5451   //   [if r >= f] \----> ...
5452   //
5453 
5454   // 15.10.2.5 RepeatMatcher algorithm.
5455   // The parser has already eliminated the case where max is 0.  In the case
5456   // where max_match is zero the parser has removed the quantifier if min was
5457   // > 0 and removed the atom if min was 0.  See AddQuantifierToAtom.
5458 
5459   // If we know that we cannot match zero length then things are a little
5460   // simpler since we don't need to make the special zero length match check
5461   // from step 2.1.  If the min and max are small we can unroll a little in
5462   // this case.
5463   static const int kMaxUnrolledMinMatches = 3;  // Unroll (foo)+ and (foo){3,}
5464   static const int kMaxUnrolledMaxMatches = 3;  // Unroll (foo)? and (foo){x,3}
5465   if (max == 0) return on_success;  // This can happen due to recursion.
5466   bool body_can_be_empty = (body->min_match() == 0);
5467   int body_start_reg = RegExpCompiler::kNoRegister;
5468   Interval capture_registers = body->CaptureRegisters();
5469   bool needs_capture_clearing = !capture_registers.is_empty();
5470   Zone* zone = compiler->zone();
5471 
5472   if (body_can_be_empty) {
5473     body_start_reg = compiler->AllocateRegister();
5474   } else if (compiler->optimize() && !needs_capture_clearing) {
5475     // Only unroll if there are no captures and the body can't be
5476     // empty.
5477     {
5478       RegExpExpansionLimiter limiter(
5479           compiler, min + ((max != min) ? 1 : 0));
5480       if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
5481         int new_max = (max == kInfinity) ? max : max - min;
5482         // Recurse once to get the loop or optional matches after the fixed
5483         // ones.
5484         RegExpNode* answer = ToNode(
5485             0, new_max, is_greedy, body, compiler, on_success, true);
5486         // Unroll the forced matches from 0 to min.  This can cause chains of
5487         // TextNodes (which the parser does not generate).  These should be
5488         // combined if it turns out they hinder good code generation.
5489         for (int i = 0; i < min; i++) {
5490           answer = body->ToNode(compiler, answer);
5491         }
5492         return answer;
5493       }
5494     }
5495     if (max <= kMaxUnrolledMaxMatches && min == 0) {
5496       DCHECK_LT(0, max);  // Due to the 'if' above.
5497       RegExpExpansionLimiter limiter(compiler, max);
5498       if (limiter.ok_to_expand()) {
5499         // Unroll the optional matches up to max.
5500         RegExpNode* answer = on_success;
5501         for (int i = 0; i < max; i++) {
5502           ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
5503           if (is_greedy) {
5504             alternation->AddAlternative(
5505                 GuardedAlternative(body->ToNode(compiler, answer)));
5506             alternation->AddAlternative(GuardedAlternative(on_success));
5507           } else {
5508             alternation->AddAlternative(GuardedAlternative(on_success));
5509             alternation->AddAlternative(
5510                 GuardedAlternative(body->ToNode(compiler, answer)));
5511           }
5512           answer = alternation;
5513           if (not_at_start && !compiler->read_backward()) {
5514             alternation->set_not_at_start();
5515           }
5516         }
5517         return answer;
5518       }
5519     }
5520   }
5521   bool has_min = min > 0;
5522   bool has_max = max < RegExpTree::kInfinity;
5523   bool needs_counter = has_min || has_max;
5524   int reg_ctr = needs_counter
5525       ? compiler->AllocateRegister()
5526       : RegExpCompiler::kNoRegister;
5527   LoopChoiceNode* center = new (zone)
5528       LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
5529   if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
5530   RegExpNode* loop_return = needs_counter
5531       ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
5532       : static_cast<RegExpNode*>(center);
5533   if (body_can_be_empty) {
5534     // If the body can be empty we need to check if it was and then
5535     // backtrack.
5536     loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5537                                               reg_ctr,
5538                                               min,
5539                                               loop_return);
5540   }
5541   RegExpNode* body_node = body->ToNode(compiler, loop_return);
5542   if (body_can_be_empty) {
5543     // If the body can be empty we need to store the start position
5544     // so we can bail out if it was empty.
5545     body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5546   }
5547   if (needs_capture_clearing) {
5548     // Before entering the body of this loop we need to clear captures.
5549     body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5550   }
5551   GuardedAlternative body_alt(body_node);
5552   if (has_max) {
5553     Guard* body_guard =
5554         new(zone) Guard(reg_ctr, Guard::LT, max);
5555     body_alt.AddGuard(body_guard, zone);
5556   }
5557   GuardedAlternative rest_alt(on_success);
5558   if (has_min) {
5559     Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5560     rest_alt.AddGuard(rest_guard, zone);
5561   }
5562   if (is_greedy) {
5563     center->AddLoopAlternative(body_alt);
5564     center->AddContinueAlternative(rest_alt);
5565   } else {
5566     center->AddContinueAlternative(rest_alt);
5567     center->AddLoopAlternative(body_alt);
5568   }
5569   if (needs_counter) {
5570     return ActionNode::SetRegister(reg_ctr, 0, center);
5571   } else {
5572     return center;
5573   }
5574 }
5575 
5576 namespace {
5577 // Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
5578 //         \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)
BoundaryAssertionAsLookaround(RegExpCompiler * compiler,RegExpNode * on_success,RegExpAssertion::AssertionType type,JSRegExp::Flags flags)5579 RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
5580                                           RegExpNode* on_success,
5581                                           RegExpAssertion::AssertionType type,
5582                                           JSRegExp::Flags flags) {
5583   DCHECK(NeedsUnicodeCaseEquivalents(flags));
5584   Zone* zone = compiler->zone();
5585   ZoneList<CharacterRange>* word_range =
5586       new (zone) ZoneList<CharacterRange>(2, zone);
5587   CharacterRange::AddClassEscape('w', word_range, true, zone);
5588   int stack_register = compiler->UnicodeLookaroundStackRegister();
5589   int position_register = compiler->UnicodeLookaroundPositionRegister();
5590   ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5591   // Add two choices. The (non-)boundary could start with a word or
5592   // a non-word-character.
5593   for (int i = 0; i < 2; i++) {
5594     bool lookbehind_for_word = i == 0;
5595     bool lookahead_for_word =
5596         (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
5597     // Look to the left.
5598     RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
5599                                          stack_register, position_register);
5600     RegExpNode* backward = TextNode::CreateForCharacterRanges(
5601         zone, word_range, true, lookbehind.on_match_success(), flags);
5602     // Look to the right.
5603     RegExpLookaround::Builder lookahead(lookahead_for_word,
5604                                         lookbehind.ForMatch(backward),
5605                                         stack_register, position_register);
5606     RegExpNode* forward = TextNode::CreateForCharacterRanges(
5607         zone, word_range, false, lookahead.on_match_success(), flags);
5608     result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
5609   }
5610   return result;
5611 }
5612 }  // anonymous namespace
5613 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5614 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5615                                     RegExpNode* on_success) {
5616   NodeInfo info;
5617   Zone* zone = compiler->zone();
5618 
5619   switch (assertion_type()) {
5620     case START_OF_LINE:
5621       return AssertionNode::AfterNewline(on_success);
5622     case START_OF_INPUT:
5623       return AssertionNode::AtStart(on_success);
5624     case BOUNDARY:
5625       return NeedsUnicodeCaseEquivalents(flags_)
5626                  ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY,
5627                                                  flags_)
5628                  : AssertionNode::AtBoundary(on_success);
5629     case NON_BOUNDARY:
5630       return NeedsUnicodeCaseEquivalents(flags_)
5631                  ? BoundaryAssertionAsLookaround(compiler, on_success,
5632                                                  NON_BOUNDARY, flags_)
5633                  : AssertionNode::AtNonBoundary(on_success);
5634     case END_OF_INPUT:
5635       return AssertionNode::AtEnd(on_success);
5636     case END_OF_LINE: {
5637       // Compile $ in multiline regexps as an alternation with a positive
5638       // lookahead in one side and an end-of-input on the other side.
5639       // We need two registers for the lookahead.
5640       int stack_pointer_register = compiler->AllocateRegister();
5641       int position_register = compiler->AllocateRegister();
5642       // The ChoiceNode to distinguish between a newline and end-of-input.
5643       ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5644       // Create a newline atom.
5645       ZoneList<CharacterRange>* newline_ranges =
5646           new(zone) ZoneList<CharacterRange>(3, zone);
5647       CharacterRange::AddClassEscape('n', newline_ranges, false, zone);
5648       JSRegExp::Flags default_flags = JSRegExp::Flags();
5649       RegExpCharacterClass* newline_atom =
5650           new (zone) RegExpCharacterClass('n', default_flags);
5651       TextNode* newline_matcher = new (zone) TextNode(
5652           newline_atom, false, ActionNode::PositiveSubmatchSuccess(
5653                                    stack_pointer_register, position_register,
5654                                    0,   // No captures inside.
5655                                    -1,  // Ignored if no captures.
5656                                    on_success));
5657       // Create an end-of-input matcher.
5658       RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5659           stack_pointer_register,
5660           position_register,
5661           newline_matcher);
5662       // Add the two alternatives to the ChoiceNode.
5663       GuardedAlternative eol_alternative(end_of_line);
5664       result->AddAlternative(eol_alternative);
5665       GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5666       result->AddAlternative(end_alternative);
5667       return result;
5668     }
5669     default:
5670       UNREACHABLE();
5671   }
5672   return on_success;
5673 }
5674 
5675 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5676 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5677                                         RegExpNode* on_success) {
5678   return new (compiler->zone())
5679       BackReferenceNode(RegExpCapture::StartRegister(index()),
5680                         RegExpCapture::EndRegister(index()), flags_,
5681                         compiler->read_backward(), on_success);
5682 }
5683 
5684 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5685 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5686                                 RegExpNode* on_success) {
5687   return on_success;
5688 }
5689 
5690 
Builder(bool is_positive,RegExpNode * on_success,int stack_pointer_register,int position_register,int capture_register_count,int capture_register_start)5691 RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
5692                                    int stack_pointer_register,
5693                                    int position_register,
5694                                    int capture_register_count,
5695                                    int capture_register_start)
5696     : is_positive_(is_positive),
5697       on_success_(on_success),
5698       stack_pointer_register_(stack_pointer_register),
5699       position_register_(position_register) {
5700   if (is_positive_) {
5701     on_match_success_ = ActionNode::PositiveSubmatchSuccess(
5702         stack_pointer_register, position_register, capture_register_count,
5703         capture_register_start, on_success_);
5704   } else {
5705     Zone* zone = on_success_->zone();
5706     on_match_success_ = new (zone) NegativeSubmatchSuccess(
5707         stack_pointer_register, position_register, capture_register_count,
5708         capture_register_start, zone);
5709   }
5710 }
5711 
5712 
ForMatch(RegExpNode * match)5713 RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
5714   if (is_positive_) {
5715     return ActionNode::BeginSubmatch(stack_pointer_register_,
5716                                      position_register_, match);
5717   } else {
5718     Zone* zone = on_success_->zone();
5719     // We use a ChoiceNode to represent the negative lookaround. The first
5720     // alternative is the negative match. On success, the end node backtracks.
5721     // On failure, the second alternative is tried and leads to success.
5722     // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
5723     // first exit when calculating quick checks.
5724     ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
5725         GuardedAlternative(match), GuardedAlternative(on_success_), zone);
5726     return ActionNode::BeginSubmatch(stack_pointer_register_,
5727                                      position_register_, choice_node);
5728   }
5729 }
5730 
5731 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5732 RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
5733                                      RegExpNode* on_success) {
5734   int stack_pointer_register = compiler->AllocateRegister();
5735   int position_register = compiler->AllocateRegister();
5736 
5737   const int registers_per_capture = 2;
5738   const int register_of_first_capture = 2;
5739   int register_count = capture_count_ * registers_per_capture;
5740   int register_start =
5741     register_of_first_capture + capture_from_ * registers_per_capture;
5742 
5743   RegExpNode* result;
5744   bool was_reading_backward = compiler->read_backward();
5745   compiler->set_read_backward(type() == LOOKBEHIND);
5746   Builder builder(is_positive(), on_success, stack_pointer_register,
5747                   position_register, register_count, register_start);
5748   RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
5749   result = builder.ForMatch(match);
5750   compiler->set_read_backward(was_reading_backward);
5751   return result;
5752 }
5753 
5754 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5755 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5756                                   RegExpNode* on_success) {
5757   return ToNode(body(), index(), compiler, on_success);
5758 }
5759 
5760 
ToNode(RegExpTree * body,int index,RegExpCompiler * compiler,RegExpNode * on_success)5761 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5762                                   int index,
5763                                   RegExpCompiler* compiler,
5764                                   RegExpNode* on_success) {
5765   DCHECK_NOT_NULL(body);
5766   int start_reg = RegExpCapture::StartRegister(index);
5767   int end_reg = RegExpCapture::EndRegister(index);
5768   if (compiler->read_backward()) std::swap(start_reg, end_reg);
5769   RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5770   RegExpNode* body_node = body->ToNode(compiler, store_end);
5771   return ActionNode::StorePosition(start_reg, true, body_node);
5772 }
5773 
5774 
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5775 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5776                                       RegExpNode* on_success) {
5777   ZoneList<RegExpTree*>* children = nodes();
5778   RegExpNode* current = on_success;
5779   if (compiler->read_backward()) {
5780     for (int i = 0; i < children->length(); i++) {
5781       current = children->at(i)->ToNode(compiler, current);
5782     }
5783   } else {
5784     for (int i = children->length() - 1; i >= 0; i--) {
5785       current = children->at(i)->ToNode(compiler, current);
5786     }
5787   }
5788   return current;
5789 }
5790 
5791 
AddClass(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)5792 static void AddClass(const int* elmv,
5793                      int elmc,
5794                      ZoneList<CharacterRange>* ranges,
5795                      Zone* zone) {
5796   elmc--;
5797   DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
5798   for (int i = 0; i < elmc; i += 2) {
5799     DCHECK(elmv[i] < elmv[i + 1]);
5800     ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
5801   }
5802 }
5803 
5804 
AddClassNegated(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)5805 static void AddClassNegated(const int *elmv,
5806                             int elmc,
5807                             ZoneList<CharacterRange>* ranges,
5808                             Zone* zone) {
5809   elmc--;
5810   DCHECK_EQ(kRangeEndMarker, elmv[elmc]);
5811   DCHECK_NE(0x0000, elmv[0]);
5812   DCHECK_NE(String::kMaxCodePoint, elmv[elmc - 1]);
5813   uc16 last = 0x0000;
5814   for (int i = 0; i < elmc; i += 2) {
5815     DCHECK(last <= elmv[i] - 1);
5816     DCHECK(elmv[i] < elmv[i + 1]);
5817     ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
5818     last = elmv[i + 1];
5819   }
5820   ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
5821 }
5822 
AddClassEscape(char type,ZoneList<CharacterRange> * ranges,bool add_unicode_case_equivalents,Zone * zone)5823 void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
5824                                     bool add_unicode_case_equivalents,
5825                                     Zone* zone) {
5826   if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
5827     // See #sec-runtime-semantics-wordcharacters-abstract-operation
5828     // In case of unicode and ignore_case, we need to create the closure over
5829     // case equivalent characters before negating.
5830     ZoneList<CharacterRange>* new_ranges =
5831         new (zone) ZoneList<CharacterRange>(2, zone);
5832     AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
5833     AddUnicodeCaseEquivalents(new_ranges, zone);
5834     if (type == 'W') {
5835       ZoneList<CharacterRange>* negated =
5836           new (zone) ZoneList<CharacterRange>(2, zone);
5837       CharacterRange::Negate(new_ranges, negated, zone);
5838       new_ranges = negated;
5839     }
5840     ranges->AddAll(*new_ranges, zone);
5841     return;
5842   }
5843   AddClassEscape(type, ranges, zone);
5844 }
5845 
AddClassEscape(char type,ZoneList<CharacterRange> * ranges,Zone * zone)5846 void CharacterRange::AddClassEscape(char type, ZoneList<CharacterRange>* ranges,
5847                                     Zone* zone) {
5848   switch (type) {
5849     case 's':
5850       AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5851       break;
5852     case 'S':
5853       AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5854       break;
5855     case 'w':
5856       AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5857       break;
5858     case 'W':
5859       AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5860       break;
5861     case 'd':
5862       AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5863       break;
5864     case 'D':
5865       AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5866       break;
5867     case '.':
5868       AddClassNegated(kLineTerminatorRanges,
5869                       kLineTerminatorRangeCount,
5870                       ranges,
5871                       zone);
5872       break;
5873     // This is not a character range as defined by the spec but a
5874     // convenient shorthand for a character class that matches any
5875     // character.
5876     case '*':
5877       ranges->Add(CharacterRange::Everything(), zone);
5878       break;
5879     // This is the set of characters matched by the $ and ^ symbols
5880     // in multiline mode.
5881     case 'n':
5882       AddClass(kLineTerminatorRanges,
5883                kLineTerminatorRangeCount,
5884                ranges,
5885                zone);
5886       break;
5887     default:
5888       UNREACHABLE();
5889   }
5890 }
5891 
5892 
GetWordBounds()5893 Vector<const int> CharacterRange::GetWordBounds() {
5894   return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5895 }
5896 
5897 // static
AddCaseEquivalents(Isolate * isolate,Zone * zone,ZoneList<CharacterRange> * ranges,bool is_one_byte)5898 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
5899                                         ZoneList<CharacterRange>* ranges,
5900                                         bool is_one_byte) {
5901   CharacterRange::Canonicalize(ranges);
5902   int range_count = ranges->length();
5903   for (int i = 0; i < range_count; i++) {
5904     CharacterRange range = ranges->at(i);
5905     uc32 bottom = range.from();
5906     if (bottom > String::kMaxUtf16CodeUnit) continue;
5907     uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
5908     // Nothing to be done for surrogates.
5909     if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) continue;
5910     if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
5911       if (bottom > String::kMaxOneByteCharCode) continue;
5912       if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
5913     }
5914     unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5915     if (top == bottom) {
5916       // If this is a singleton we just expand the one character.
5917       int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
5918       for (int i = 0; i < length; i++) {
5919         uc32 chr = chars[i];
5920         if (chr != bottom) {
5921           ranges->Add(CharacterRange::Singleton(chars[i]), zone);
5922         }
5923       }
5924     } else {
5925       // If this is a range we expand the characters block by block, expanding
5926       // contiguous subranges (blocks) one at a time.  The approach is as
5927       // follows.  For a given start character we look up the remainder of the
5928       // block that contains it (represented by the end point), for instance we
5929       // find 'z' if the character is 'c'.  A block is characterized by the
5930       // property that all characters uncanonicalize in the same way, except
5931       // that each entry in the result is incremented by the distance from the
5932       // first element.  So a-z is a block because 'a' uncanonicalizes to ['a',
5933       // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k].  Once
5934       // we've found the end point we look up its uncanonicalization and
5935       // produce a range for each element.  For instance for [c-f] we look up
5936       // ['z', 'Z'] and produce [c-f] and [C-F].  We then only add a range if
5937       // it is not already contained in the input, so [c-f] will be skipped but
5938       // [C-F] will be added.  If this range is not completely contained in a
5939       // block we do this for all the blocks covered by the range (handling
5940       // characters that is not in a block as a "singleton block").
5941       unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5942       int pos = bottom;
5943       while (pos <= top) {
5944         int length =
5945             isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
5946         uc32 block_end;
5947         if (length == 0) {
5948           block_end = pos;
5949         } else {
5950           DCHECK_EQ(1, length);
5951           block_end = equivalents[0];
5952         }
5953         int end = (block_end > top) ? top : block_end;
5954         length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
5955                                                          equivalents);
5956         for (int i = 0; i < length; i++) {
5957           uc32 c = equivalents[i];
5958           uc32 range_from = c - (block_end - pos);
5959           uc32 range_to = c - (block_end - end);
5960           if (!(bottom <= range_from && range_to <= top)) {
5961             ranges->Add(CharacterRange::Range(range_from, range_to), zone);
5962           }
5963         }
5964         pos = end + 1;
5965       }
5966     }
5967   }
5968 }
5969 
5970 
IsCanonical(ZoneList<CharacterRange> * ranges)5971 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
5972   DCHECK_NOT_NULL(ranges);
5973   int n = ranges->length();
5974   if (n <= 1) return true;
5975   int max = ranges->at(0).to();
5976   for (int i = 1; i < n; i++) {
5977     CharacterRange next_range = ranges->at(i);
5978     if (next_range.from() <= max + 1) return false;
5979     max = next_range.to();
5980   }
5981   return true;
5982 }
5983 
5984 
ranges(Zone * zone)5985 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
5986   if (ranges_ == nullptr) {
5987     ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5988     CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);
5989   }
5990   return ranges_;
5991 }
5992 
5993 
5994 // Move a number of elements in a zonelist to another position
5995 // in the same list. Handles overlapping source and target areas.
MoveRanges(ZoneList<CharacterRange> * list,int from,int to,int count)5996 static void MoveRanges(ZoneList<CharacterRange>* list,
5997                        int from,
5998                        int to,
5999                        int count) {
6000   // Ranges are potentially overlapping.
6001   if (from < to) {
6002     for (int i = count - 1; i >= 0; i--) {
6003       list->at(to + i) = list->at(from + i);
6004     }
6005   } else {
6006     for (int i = 0; i < count; i++) {
6007       list->at(to + i) = list->at(from + i);
6008     }
6009   }
6010 }
6011 
6012 
InsertRangeInCanonicalList(ZoneList<CharacterRange> * list,int count,CharacterRange insert)6013 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
6014                                       int count,
6015                                       CharacterRange insert) {
6016   // Inserts a range into list[0..count[, which must be sorted
6017   // by from value and non-overlapping and non-adjacent, using at most
6018   // list[0..count] for the result. Returns the number of resulting
6019   // canonicalized ranges. Inserting a range may collapse existing ranges into
6020   // fewer ranges, so the return value can be anything in the range 1..count+1.
6021   uc32 from = insert.from();
6022   uc32 to = insert.to();
6023   int start_pos = 0;
6024   int end_pos = count;
6025   for (int i = count - 1; i >= 0; i--) {
6026     CharacterRange current = list->at(i);
6027     if (current.from() > to + 1) {
6028       end_pos = i;
6029     } else if (current.to() + 1 < from) {
6030       start_pos = i + 1;
6031       break;
6032     }
6033   }
6034 
6035   // Inserted range overlaps, or is adjacent to, ranges at positions
6036   // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
6037   // not affected by the insertion.
6038   // If start_pos == end_pos, the range must be inserted before start_pos.
6039   // if start_pos < end_pos, the entire range from start_pos to end_pos
6040   // must be merged with the insert range.
6041 
6042   if (start_pos == end_pos) {
6043     // Insert between existing ranges at position start_pos.
6044     if (start_pos < count) {
6045       MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
6046     }
6047     list->at(start_pos) = insert;
6048     return count + 1;
6049   }
6050   if (start_pos + 1 == end_pos) {
6051     // Replace single existing range at position start_pos.
6052     CharacterRange to_replace = list->at(start_pos);
6053     int new_from = Min(to_replace.from(), from);
6054     int new_to = Max(to_replace.to(), to);
6055     list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6056     return count;
6057   }
6058   // Replace a number of existing ranges from start_pos to end_pos - 1.
6059   // Move the remaining ranges down.
6060 
6061   int new_from = Min(list->at(start_pos).from(), from);
6062   int new_to = Max(list->at(end_pos - 1).to(), to);
6063   if (end_pos < count) {
6064     MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
6065   }
6066   list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6067   return count - (end_pos - start_pos) + 1;
6068 }
6069 
6070 
Canonicalize()6071 void CharacterSet::Canonicalize() {
6072   // Special/default classes are always considered canonical. The result
6073   // of calling ranges() will be sorted.
6074   if (ranges_ == nullptr) return;
6075   CharacterRange::Canonicalize(ranges_);
6076 }
6077 
6078 
Canonicalize(ZoneList<CharacterRange> * character_ranges)6079 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
6080   if (character_ranges->length() <= 1) return;
6081   // Check whether ranges are already canonical (increasing, non-overlapping,
6082   // non-adjacent).
6083   int n = character_ranges->length();
6084   int max = character_ranges->at(0).to();
6085   int i = 1;
6086   while (i < n) {
6087     CharacterRange current = character_ranges->at(i);
6088     if (current.from() <= max + 1) {
6089       break;
6090     }
6091     max = current.to();
6092     i++;
6093   }
6094   // Canonical until the i'th range. If that's all of them, we are done.
6095   if (i == n) return;
6096 
6097   // The ranges at index i and forward are not canonicalized. Make them so by
6098   // doing the equivalent of insertion sort (inserting each into the previous
6099   // list, in order).
6100   // Notice that inserting a range can reduce the number of ranges in the
6101   // result due to combining of adjacent and overlapping ranges.
6102   int read = i;  // Range to insert.
6103   int num_canonical = i;  // Length of canonicalized part of list.
6104   do {
6105     num_canonical = InsertRangeInCanonicalList(character_ranges,
6106                                                num_canonical,
6107                                                character_ranges->at(read));
6108     read++;
6109   } while (read < n);
6110   character_ranges->Rewind(num_canonical);
6111 
6112   DCHECK(CharacterRange::IsCanonical(character_ranges));
6113 }
6114 
6115 
Negate(ZoneList<CharacterRange> * ranges,ZoneList<CharacterRange> * negated_ranges,Zone * zone)6116 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
6117                             ZoneList<CharacterRange>* negated_ranges,
6118                             Zone* zone) {
6119   DCHECK(CharacterRange::IsCanonical(ranges));
6120   DCHECK_EQ(0, negated_ranges->length());
6121   int range_count = ranges->length();
6122   uc32 from = 0;
6123   int i = 0;
6124   if (range_count > 0 && ranges->at(0).from() == 0) {
6125     from = ranges->at(0).to() + 1;
6126     i = 1;
6127   }
6128   while (i < range_count) {
6129     CharacterRange range = ranges->at(i);
6130     negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
6131     from = range.to() + 1;
6132     i++;
6133   }
6134   if (from < String::kMaxCodePoint) {
6135     negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
6136                         zone);
6137   }
6138 }
6139 
6140 
6141 // -------------------------------------------------------------------
6142 // Splay tree
6143 
6144 
Extend(unsigned value,Zone * zone)6145 OutSet* OutSet::Extend(unsigned value, Zone* zone) {
6146   if (Get(value))
6147     return this;
6148   if (successors(zone) != nullptr) {
6149     for (int i = 0; i < successors(zone)->length(); i++) {
6150       OutSet* successor = successors(zone)->at(i);
6151       if (successor->Get(value))
6152         return successor;
6153     }
6154   } else {
6155     successors_ = new(zone) ZoneList<OutSet*>(2, zone);
6156   }
6157   OutSet* result = new(zone) OutSet(first_, remaining_);
6158   result->Set(value, zone);
6159   successors(zone)->Add(result, zone);
6160   return result;
6161 }
6162 
6163 
Set(unsigned value,Zone * zone)6164 void OutSet::Set(unsigned value, Zone *zone) {
6165   if (value < kFirstLimit) {
6166     first_ |= (1 << value);
6167   } else {
6168     if (remaining_ == nullptr)
6169       remaining_ = new(zone) ZoneList<unsigned>(1, zone);
6170     if (remaining_->is_empty() || !remaining_->Contains(value))
6171       remaining_->Add(value, zone);
6172   }
6173 }
6174 
6175 
Get(unsigned value) const6176 bool OutSet::Get(unsigned value) const {
6177   if (value < kFirstLimit) {
6178     return (first_ & (1 << value)) != 0;
6179   } else if (remaining_ == nullptr) {
6180     return false;
6181   } else {
6182     return remaining_->Contains(value);
6183   }
6184 }
6185 
6186 
6187 const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
6188 
6189 
AddRange(CharacterRange full_range,int value,Zone * zone)6190 void DispatchTable::AddRange(CharacterRange full_range, int value,
6191                              Zone* zone) {
6192   CharacterRange current = full_range;
6193   if (tree()->is_empty()) {
6194     // If this is the first range we just insert into the table.
6195     ZoneSplayTree<Config>::Locator loc;
6196     bool inserted = tree()->Insert(current.from(), &loc);
6197     DCHECK(inserted);
6198     USE(inserted);
6199     loc.set_value(Entry(current.from(), current.to(),
6200                         empty()->Extend(value, zone)));
6201     return;
6202   }
6203   // First see if there is a range to the left of this one that
6204   // overlaps.
6205   ZoneSplayTree<Config>::Locator loc;
6206   if (tree()->FindGreatestLessThan(current.from(), &loc)) {
6207     Entry* entry = &loc.value();
6208     // If we've found a range that overlaps with this one, and it
6209     // starts strictly to the left of this one, we have to fix it
6210     // because the following code only handles ranges that start on
6211     // or after the start point of the range we're adding.
6212     if (entry->from() < current.from() && entry->to() >= current.from()) {
6213       // Snap the overlapping range in half around the start point of
6214       // the range we're adding.
6215       CharacterRange left =
6216           CharacterRange::Range(entry->from(), current.from() - 1);
6217       CharacterRange right = CharacterRange::Range(current.from(), entry->to());
6218       // The left part of the overlapping range doesn't overlap.
6219       // Truncate the whole entry to be just the left part.
6220       entry->set_to(left.to());
6221       // The right part is the one that overlaps.  We add this part
6222       // to the map and let the next step deal with merging it with
6223       // the range we're adding.
6224       ZoneSplayTree<Config>::Locator loc;
6225       bool inserted = tree()->Insert(right.from(), &loc);
6226       DCHECK(inserted);
6227       USE(inserted);
6228       loc.set_value(Entry(right.from(),
6229                           right.to(),
6230                           entry->out_set()));
6231     }
6232   }
6233   while (current.is_valid()) {
6234     if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
6235         (loc.value().from() <= current.to()) &&
6236         (loc.value().to() >= current.from())) {
6237       Entry* entry = &loc.value();
6238       // We have overlap.  If there is space between the start point of
6239       // the range we're adding and where the overlapping range starts
6240       // then we have to add a range covering just that space.
6241       if (current.from() < entry->from()) {
6242         ZoneSplayTree<Config>::Locator ins;
6243         bool inserted = tree()->Insert(current.from(), &ins);
6244         DCHECK(inserted);
6245         USE(inserted);
6246         ins.set_value(Entry(current.from(),
6247                             entry->from() - 1,
6248                             empty()->Extend(value, zone)));
6249         current.set_from(entry->from());
6250       }
6251       DCHECK_EQ(current.from(), entry->from());
6252       // If the overlapping range extends beyond the one we want to add
6253       // we have to snap the right part off and add it separately.
6254       if (entry->to() > current.to()) {
6255         ZoneSplayTree<Config>::Locator ins;
6256         bool inserted = tree()->Insert(current.to() + 1, &ins);
6257         DCHECK(inserted);
6258         USE(inserted);
6259         ins.set_value(Entry(current.to() + 1,
6260                             entry->to(),
6261                             entry->out_set()));
6262         entry->set_to(current.to());
6263       }
6264       DCHECK(entry->to() <= current.to());
6265       // The overlapping range is now completely contained by the range
6266       // we're adding so we can just update it and move the start point
6267       // of the range we're adding just past it.
6268       entry->AddValue(value, zone);
6269       DCHECK(entry->to() + 1 > current.from());
6270       current.set_from(entry->to() + 1);
6271     } else {
6272       // There is no overlap so we can just add the range
6273       ZoneSplayTree<Config>::Locator ins;
6274       bool inserted = tree()->Insert(current.from(), &ins);
6275       DCHECK(inserted);
6276       USE(inserted);
6277       ins.set_value(Entry(current.from(),
6278                           current.to(),
6279                           empty()->Extend(value, zone)));
6280       break;
6281     }
6282   }
6283 }
6284 
6285 
Get(uc32 value)6286 OutSet* DispatchTable::Get(uc32 value) {
6287   ZoneSplayTree<Config>::Locator loc;
6288   if (!tree()->FindGreatestLessThan(value, &loc))
6289     return empty();
6290   Entry* entry = &loc.value();
6291   if (value <= entry->to())
6292     return entry->out_set();
6293   else
6294     return empty();
6295 }
6296 
6297 
6298 // -------------------------------------------------------------------
6299 // Analysis
6300 
6301 
EnsureAnalyzed(RegExpNode * that)6302 void Analysis::EnsureAnalyzed(RegExpNode* that) {
6303   StackLimitCheck check(isolate());
6304   if (check.HasOverflowed()) {
6305     fail("Stack overflow");
6306     return;
6307   }
6308   if (that->info()->been_analyzed || that->info()->being_analyzed)
6309     return;
6310   that->info()->being_analyzed = true;
6311   that->Accept(this);
6312   that->info()->being_analyzed = false;
6313   that->info()->been_analyzed = true;
6314 }
6315 
6316 
VisitEnd(EndNode * that)6317 void Analysis::VisitEnd(EndNode* that) {
6318   // nothing to do
6319 }
6320 
6321 
CalculateOffsets()6322 void TextNode::CalculateOffsets() {
6323   int element_count = elements()->length();
6324   // Set up the offsets of the elements relative to the start.  This is a fixed
6325   // quantity since a TextNode can only contain fixed-width things.
6326   int cp_offset = 0;
6327   for (int i = 0; i < element_count; i++) {
6328     TextElement& elm = elements()->at(i);
6329     elm.set_cp_offset(cp_offset);
6330     cp_offset += elm.length();
6331   }
6332 }
6333 
6334 
VisitText(TextNode * that)6335 void Analysis::VisitText(TextNode* that) {
6336   that->MakeCaseIndependent(isolate(), is_one_byte_);
6337   EnsureAnalyzed(that->on_success());
6338   if (!has_failed()) {
6339     that->CalculateOffsets();
6340   }
6341 }
6342 
6343 
VisitAction(ActionNode * that)6344 void Analysis::VisitAction(ActionNode* that) {
6345   RegExpNode* target = that->on_success();
6346   EnsureAnalyzed(target);
6347   if (!has_failed()) {
6348     // If the next node is interested in what it follows then this node
6349     // has to be interested too so it can pass the information on.
6350     that->info()->AddFromFollowing(target->info());
6351   }
6352 }
6353 
6354 
VisitChoice(ChoiceNode * that)6355 void Analysis::VisitChoice(ChoiceNode* that) {
6356   NodeInfo* info = that->info();
6357   for (int i = 0; i < that->alternatives()->length(); i++) {
6358     RegExpNode* node = that->alternatives()->at(i).node();
6359     EnsureAnalyzed(node);
6360     if (has_failed()) return;
6361     // Anything the following nodes need to know has to be known by
6362     // this node also, so it can pass it on.
6363     info->AddFromFollowing(node->info());
6364   }
6365 }
6366 
6367 
VisitLoopChoice(LoopChoiceNode * that)6368 void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
6369   NodeInfo* info = that->info();
6370   for (int i = 0; i < that->alternatives()->length(); i++) {
6371     RegExpNode* node = that->alternatives()->at(i).node();
6372     if (node != that->loop_node()) {
6373       EnsureAnalyzed(node);
6374       if (has_failed()) return;
6375       info->AddFromFollowing(node->info());
6376     }
6377   }
6378   // Check the loop last since it may need the value of this node
6379   // to get a correct result.
6380   EnsureAnalyzed(that->loop_node());
6381   if (!has_failed()) {
6382     info->AddFromFollowing(that->loop_node()->info());
6383   }
6384 }
6385 
6386 
VisitBackReference(BackReferenceNode * that)6387 void Analysis::VisitBackReference(BackReferenceNode* that) {
6388   EnsureAnalyzed(that->on_success());
6389 }
6390 
6391 
VisitAssertion(AssertionNode * that)6392 void Analysis::VisitAssertion(AssertionNode* that) {
6393   EnsureAnalyzed(that->on_success());
6394 }
6395 
6396 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)6397 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6398                                      BoyerMooreLookahead* bm,
6399                                      bool not_at_start) {
6400   // Working out the set of characters that a backreference can match is too
6401   // hard, so we just say that any character can match.
6402   bm->SetRest(offset);
6403   SaveBMInfo(bm, not_at_start, offset);
6404 }
6405 
6406 
6407 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
6408               RegExpMacroAssembler::kTableSize);
6409 
6410 
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)6411 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6412                               BoyerMooreLookahead* bm, bool not_at_start) {
6413   ZoneList<GuardedAlternative>* alts = alternatives();
6414   budget = (budget - 1) / alts->length();
6415   for (int i = 0; i < alts->length(); i++) {
6416     GuardedAlternative& alt = alts->at(i);
6417     if (alt.guards() != nullptr && alt.guards()->length() != 0) {
6418       bm->SetRest(offset);  // Give up trying to fill in info.
6419       SaveBMInfo(bm, not_at_start, offset);
6420       return;
6421     }
6422     alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
6423   }
6424   SaveBMInfo(bm, not_at_start, offset);
6425 }
6426 
6427 
FillInBMInfo(Isolate * isolate,int initial_offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)6428 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
6429                             BoyerMooreLookahead* bm, bool not_at_start) {
6430   if (initial_offset >= bm->length()) return;
6431   int offset = initial_offset;
6432   int max_char = bm->max_char();
6433   for (int i = 0; i < elements()->length(); i++) {
6434     if (offset >= bm->length()) {
6435       if (initial_offset == 0) set_bm_info(not_at_start, bm);
6436       return;
6437     }
6438     TextElement text = elements()->at(i);
6439     if (text.text_type() == TextElement::ATOM) {
6440       RegExpAtom* atom = text.atom();
6441       for (int j = 0; j < atom->length(); j++, offset++) {
6442         if (offset >= bm->length()) {
6443           if (initial_offset == 0) set_bm_info(not_at_start, bm);
6444           return;
6445         }
6446         uc16 character = atom->data()[j];
6447         if (IgnoreCase(atom->flags())) {
6448           unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
6449           int length = GetCaseIndependentLetters(
6450               isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
6451               chars);
6452           for (int j = 0; j < length; j++) {
6453             bm->Set(offset, chars[j]);
6454           }
6455         } else {
6456           if (character <= max_char) bm->Set(offset, character);
6457         }
6458       }
6459     } else {
6460       DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
6461       RegExpCharacterClass* char_class = text.char_class();
6462       ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
6463       if (char_class->is_negated()) {
6464         bm->SetAll(offset);
6465       } else {
6466         for (int k = 0; k < ranges->length(); k++) {
6467           CharacterRange& range = ranges->at(k);
6468           if (range.from() > max_char) continue;
6469           int to = Min(max_char, static_cast<int>(range.to()));
6470           bm->SetInterval(offset, Interval(range.from(), to));
6471         }
6472       }
6473       offset++;
6474     }
6475   }
6476   if (offset >= bm->length()) {
6477     if (initial_offset == 0) set_bm_info(not_at_start, bm);
6478     return;
6479   }
6480   on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
6481                              true);  // Not at start after a text node.
6482   if (initial_offset == 0) set_bm_info(not_at_start, bm);
6483 }
6484 
6485 
6486 // -------------------------------------------------------------------
6487 // Dispatch table construction
6488 
6489 
VisitEnd(EndNode * that)6490 void DispatchTableConstructor::VisitEnd(EndNode* that) {
6491   AddRange(CharacterRange::Everything());
6492 }
6493 
6494 
BuildTable(ChoiceNode * node)6495 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
6496   node->set_being_calculated(true);
6497   ZoneList<GuardedAlternative>* alternatives = node->alternatives();
6498   for (int i = 0; i < alternatives->length(); i++) {
6499     set_choice_index(i);
6500     alternatives->at(i).node()->Accept(this);
6501   }
6502   node->set_being_calculated(false);
6503 }
6504 
6505 
6506 class AddDispatchRange {
6507  public:
AddDispatchRange(DispatchTableConstructor * constructor)6508   explicit AddDispatchRange(DispatchTableConstructor* constructor)
6509     : constructor_(constructor) { }
6510   void Call(uc32 from, DispatchTable::Entry entry);
6511  private:
6512   DispatchTableConstructor* constructor_;
6513 };
6514 
6515 
Call(uc32 from,DispatchTable::Entry entry)6516 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
6517   constructor_->AddRange(CharacterRange::Range(from, entry.to()));
6518 }
6519 
6520 
VisitChoice(ChoiceNode * node)6521 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
6522   if (node->being_calculated())
6523     return;
6524   DispatchTable* table = node->GetTable(ignore_case_);
6525   AddDispatchRange adder(this);
6526   table->ForEach(&adder);
6527 }
6528 
6529 
VisitBackReference(BackReferenceNode * that)6530 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
6531   // TODO(160): Find the node that we refer back to and propagate its start
6532   // set back to here.  For now we just accept anything.
6533   AddRange(CharacterRange::Everything());
6534 }
6535 
6536 
VisitAssertion(AssertionNode * that)6537 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
6538   RegExpNode* target = that->on_success();
6539   target->Accept(this);
6540 }
6541 
6542 
CompareRangeByFrom(const CharacterRange * a,const CharacterRange * b)6543 static int CompareRangeByFrom(const CharacterRange* a,
6544                               const CharacterRange* b) {
6545   return Compare<uc16>(a->from(), b->from());
6546 }
6547 
6548 
AddInverse(ZoneList<CharacterRange> * ranges)6549 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
6550   ranges->Sort(CompareRangeByFrom);
6551   uc16 last = 0;
6552   for (int i = 0; i < ranges->length(); i++) {
6553     CharacterRange range = ranges->at(i);
6554     if (last < range.from())
6555       AddRange(CharacterRange::Range(last, range.from() - 1));
6556     if (range.to() >= last) {
6557       if (range.to() == String::kMaxCodePoint) {
6558         return;
6559       } else {
6560         last = range.to() + 1;
6561       }
6562     }
6563   }
6564   AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
6565 }
6566 
6567 
VisitText(TextNode * that)6568 void DispatchTableConstructor::VisitText(TextNode* that) {
6569   TextElement elm = that->elements()->at(0);
6570   switch (elm.text_type()) {
6571     case TextElement::ATOM: {
6572       uc16 c = elm.atom()->data()[0];
6573       AddRange(CharacterRange::Range(c, c));
6574       break;
6575     }
6576     case TextElement::CHAR_CLASS: {
6577       RegExpCharacterClass* tree = elm.char_class();
6578       ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
6579       if (tree->is_negated()) {
6580         AddInverse(ranges);
6581       } else {
6582         for (int i = 0; i < ranges->length(); i++)
6583           AddRange(ranges->at(i));
6584       }
6585       break;
6586     }
6587     default: {
6588       UNIMPLEMENTED();
6589     }
6590   }
6591 }
6592 
6593 
VisitAction(ActionNode * that)6594 void DispatchTableConstructor::VisitAction(ActionNode* that) {
6595   RegExpNode* target = that->on_success();
6596   target->Accept(this);
6597 }
6598 
OptionallyStepBackToLeadSurrogate(RegExpCompiler * compiler,RegExpNode * on_success,JSRegExp::Flags flags)6599 RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
6600                                               RegExpNode* on_success,
6601                                               JSRegExp::Flags flags) {
6602   // If the regexp matching starts within a surrogate pair, step back
6603   // to the lead surrogate and start matching from there.
6604   DCHECK(!compiler->read_backward());
6605   Zone* zone = compiler->zone();
6606   ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
6607       zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
6608   ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
6609       zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
6610 
6611   ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
6612 
6613   int stack_register = compiler->UnicodeLookaroundStackRegister();
6614   int position_register = compiler->UnicodeLookaroundPositionRegister();
6615   RegExpNode* step_back = TextNode::CreateForCharacterRanges(
6616       zone, lead_surrogates, true, on_success, flags);
6617   RegExpLookaround::Builder builder(true, step_back, stack_register,
6618                                     position_register);
6619   RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
6620       zone, trail_surrogates, false, builder.on_match_success(), flags);
6621 
6622   optional_step_back->AddAlternative(
6623       GuardedAlternative(builder.ForMatch(match_trail)));
6624   optional_step_back->AddAlternative(GuardedAlternative(on_success));
6625 
6626   return optional_step_back;
6627 }
6628 
6629 
Compile(Isolate * isolate,Zone * zone,RegExpCompileData * data,JSRegExp::Flags flags,Handle<String> pattern,Handle<String> sample_subject,bool is_one_byte)6630 RegExpEngine::CompilationResult RegExpEngine::Compile(
6631     Isolate* isolate, Zone* zone, RegExpCompileData* data,
6632     JSRegExp::Flags flags, Handle<String> pattern,
6633     Handle<String> sample_subject, bool is_one_byte) {
6634   if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6635     return IrregexpRegExpTooBig(isolate);
6636   }
6637   bool is_sticky = IsSticky(flags);
6638   bool is_global = IsGlobal(flags);
6639   bool is_unicode = IsUnicode(flags);
6640   RegExpCompiler compiler(isolate, zone, data->capture_count, is_one_byte);
6641 
6642   if (compiler.optimize())
6643     compiler.set_optimize(!TooMuchRegExpCode(isolate, pattern));
6644 
6645   // Sample some characters from the middle of the string.
6646   static const int kSampleSize = 128;
6647 
6648   sample_subject = String::Flatten(isolate, sample_subject);
6649   int chars_sampled = 0;
6650   int half_way = (sample_subject->length() - kSampleSize) / 2;
6651   for (int i = Max(0, half_way);
6652        i < sample_subject->length() && chars_sampled < kSampleSize;
6653        i++, chars_sampled++) {
6654     compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6655   }
6656 
6657   // Wrap the body of the regexp in capture #0.
6658   RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6659                                                     0,
6660                                                     &compiler,
6661                                                     compiler.accept());
6662   RegExpNode* node = captured_body;
6663   bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6664   bool is_start_anchored = data->tree->IsAnchoredAtStart();
6665   int max_length = data->tree->max_match();
6666   if (!is_start_anchored && !is_sticky) {
6667     // Add a .*? at the beginning, outside the body capture, unless
6668     // this expression is anchored at the beginning or sticky.
6669     JSRegExp::Flags default_flags = JSRegExp::Flags();
6670     RegExpNode* loop_node = RegExpQuantifier::ToNode(
6671         0, RegExpTree::kInfinity, false,
6672         new (zone) RegExpCharacterClass('*', default_flags), &compiler,
6673         captured_body, data->contains_anchor);
6674 
6675     if (data->contains_anchor) {
6676       // Unroll loop once, to take care of the case that might start
6677       // at the start of input.
6678       ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6679       first_step_node->AddAlternative(GuardedAlternative(captured_body));
6680       first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
6681           new (zone) RegExpCharacterClass('*', default_flags), false,
6682           loop_node)));
6683       node = first_step_node;
6684     } else {
6685       node = loop_node;
6686     }
6687   }
6688   if (is_one_byte) {
6689     node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
6690     // Do it again to propagate the new nodes to places where they were not
6691     // put because they had not been calculated yet.
6692     if (node != nullptr) {
6693       node = node->FilterOneByte(RegExpCompiler::kMaxRecursion);
6694     }
6695   } else if (is_unicode && (is_global || is_sticky)) {
6696     node = OptionallyStepBackToLeadSurrogate(&compiler, node, flags);
6697   }
6698 
6699   if (node == nullptr) node = new (zone) EndNode(EndNode::BACKTRACK, zone);
6700   data->node = node;
6701   Analysis analysis(isolate, is_one_byte);
6702   analysis.EnsureAnalyzed(node);
6703   if (analysis.has_failed()) {
6704     const char* error_message = analysis.error_message();
6705     return CompilationResult(isolate, error_message);
6706   }
6707 
6708   // Create the correct assembler for the architecture.
6709 #ifndef V8_INTERPRETED_REGEXP
6710   // Native regexp implementation.
6711 
6712   NativeRegExpMacroAssembler::Mode mode =
6713       is_one_byte ? NativeRegExpMacroAssembler::LATIN1
6714                   : NativeRegExpMacroAssembler::UC16;
6715 
6716 #if V8_TARGET_ARCH_IA32
6717   RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode,
6718                                            (data->capture_count + 1) * 2);
6719 #elif V8_TARGET_ARCH_X64
6720   RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode,
6721                                           (data->capture_count + 1) * 2);
6722 #elif V8_TARGET_ARCH_ARM
6723   RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode,
6724                                           (data->capture_count + 1) * 2);
6725 #elif V8_TARGET_ARCH_ARM64
6726   RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode,
6727                                             (data->capture_count + 1) * 2);
6728 #elif V8_TARGET_ARCH_S390
6729   RegExpMacroAssemblerS390 macro_assembler(isolate, zone, mode,
6730                                            (data->capture_count + 1) * 2);
6731 #elif V8_TARGET_ARCH_PPC
6732   RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode,
6733                                           (data->capture_count + 1) * 2);
6734 #elif V8_TARGET_ARCH_MIPS
6735   RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6736                                            (data->capture_count + 1) * 2);
6737 #elif V8_TARGET_ARCH_MIPS64
6738   RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6739                                            (data->capture_count + 1) * 2);
6740 #else
6741 #error "Unsupported architecture"
6742 #endif
6743 
6744 #else  // V8_INTERPRETED_REGEXP
6745   // Interpreted regexp implementation.
6746   EmbeddedVector<byte, 1024> codes;
6747   RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone);
6748 #endif  // V8_INTERPRETED_REGEXP
6749 
6750   macro_assembler.set_slow_safe(TooMuchRegExpCode(isolate, pattern));
6751 
6752   // Inserted here, instead of in Assembler, because it depends on information
6753   // in the AST that isn't replicated in the Node structure.
6754   static const int kMaxBacksearchLimit = 1024;
6755   if (is_end_anchored && !is_start_anchored && !is_sticky &&
6756       max_length < kMaxBacksearchLimit) {
6757     macro_assembler.SetCurrentPositionFromEnd(max_length);
6758   }
6759 
6760   if (is_global) {
6761     RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
6762     if (data->tree->min_match() > 0) {
6763       mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
6764     } else if (is_unicode) {
6765       mode = RegExpMacroAssembler::GLOBAL_UNICODE;
6766     }
6767     macro_assembler.set_global_mode(mode);
6768   }
6769 
6770   return compiler.Assemble(isolate, &macro_assembler, node, data->capture_count,
6771                            pattern);
6772 }
6773 
TooMuchRegExpCode(Isolate * isolate,Handle<String> pattern)6774 bool RegExpEngine::TooMuchRegExpCode(Isolate* isolate, Handle<String> pattern) {
6775   Heap* heap = isolate->heap();
6776   bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6777   if (heap->isolate()->total_regexp_code_generated() >
6778           RegExpImpl::kRegExpCompiledLimit &&
6779       heap->CommittedMemoryExecutable() >
6780           RegExpImpl::kRegExpExecutableMemoryLimit) {
6781     too_much = true;
6782   }
6783   return too_much;
6784 }
6785 
6786 
Lookup(Heap * heap,String * key_string,Object * key_pattern,FixedArray ** last_match_cache,ResultsCacheType type)6787 Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string,
6788                                    Object* key_pattern,
6789                                    FixedArray** last_match_cache,
6790                                    ResultsCacheType type) {
6791   FixedArray* cache;
6792   if (!key_string->IsInternalizedString()) return Smi::kZero;
6793   if (type == STRING_SPLIT_SUBSTRINGS) {
6794     DCHECK(key_pattern->IsString());
6795     if (!key_pattern->IsInternalizedString()) return Smi::kZero;
6796     cache = heap->string_split_cache();
6797   } else {
6798     DCHECK(type == REGEXP_MULTIPLE_INDICES);
6799     DCHECK(key_pattern->IsFixedArray());
6800     cache = heap->regexp_multiple_cache();
6801   }
6802 
6803   uint32_t hash = key_string->Hash();
6804   uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6805                     ~(kArrayEntriesPerCacheEntry - 1));
6806   if (cache->get(index + kStringOffset) != key_string ||
6807       cache->get(index + kPatternOffset) != key_pattern) {
6808     index =
6809         ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6810     if (cache->get(index + kStringOffset) != key_string ||
6811         cache->get(index + kPatternOffset) != key_pattern) {
6812       return Smi::kZero;
6813     }
6814   }
6815 
6816   *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
6817   return cache->get(index + kArrayOffset);
6818 }
6819 
6820 
Enter(Isolate * isolate,Handle<String> key_string,Handle<Object> key_pattern,Handle<FixedArray> value_array,Handle<FixedArray> last_match_cache,ResultsCacheType type)6821 void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
6822                                Handle<Object> key_pattern,
6823                                Handle<FixedArray> value_array,
6824                                Handle<FixedArray> last_match_cache,
6825                                ResultsCacheType type) {
6826   Factory* factory = isolate->factory();
6827   Handle<FixedArray> cache;
6828   if (!key_string->IsInternalizedString()) return;
6829   if (type == STRING_SPLIT_SUBSTRINGS) {
6830     DCHECK(key_pattern->IsString());
6831     if (!key_pattern->IsInternalizedString()) return;
6832     cache = factory->string_split_cache();
6833   } else {
6834     DCHECK(type == REGEXP_MULTIPLE_INDICES);
6835     DCHECK(key_pattern->IsFixedArray());
6836     cache = factory->regexp_multiple_cache();
6837   }
6838 
6839   uint32_t hash = key_string->Hash();
6840   uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6841                     ~(kArrayEntriesPerCacheEntry - 1));
6842   if (cache->get(index + kStringOffset) == Smi::kZero) {
6843     cache->set(index + kStringOffset, *key_string);
6844     cache->set(index + kPatternOffset, *key_pattern);
6845     cache->set(index + kArrayOffset, *value_array);
6846     cache->set(index + kLastMatchOffset, *last_match_cache);
6847   } else {
6848     uint32_t index2 =
6849         ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6850     if (cache->get(index2 + kStringOffset) == Smi::kZero) {
6851       cache->set(index2 + kStringOffset, *key_string);
6852       cache->set(index2 + kPatternOffset, *key_pattern);
6853       cache->set(index2 + kArrayOffset, *value_array);
6854       cache->set(index2 + kLastMatchOffset, *last_match_cache);
6855     } else {
6856       cache->set(index2 + kStringOffset, Smi::kZero);
6857       cache->set(index2 + kPatternOffset, Smi::kZero);
6858       cache->set(index2 + kArrayOffset, Smi::kZero);
6859       cache->set(index2 + kLastMatchOffset, Smi::kZero);
6860       cache->set(index + kStringOffset, *key_string);
6861       cache->set(index + kPatternOffset, *key_pattern);
6862       cache->set(index + kArrayOffset, *value_array);
6863       cache->set(index + kLastMatchOffset, *last_match_cache);
6864     }
6865   }
6866   // If the array is a reasonably short list of substrings, convert it into a
6867   // list of internalized strings.
6868   if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
6869     for (int i = 0; i < value_array->length(); i++) {
6870       Handle<String> str(String::cast(value_array->get(i)), isolate);
6871       Handle<String> internalized_str = factory->InternalizeString(str);
6872       value_array->set(i, *internalized_str);
6873     }
6874   }
6875   // Convert backing store to a copy-on-write array.
6876   value_array->set_map_no_write_barrier(
6877       ReadOnlyRoots(isolate).fixed_cow_array_map());
6878 }
6879 
6880 
Clear(FixedArray * cache)6881 void RegExpResultsCache::Clear(FixedArray* cache) {
6882   for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6883     cache->set(i, Smi::kZero);
6884   }
6885 }
6886 
6887 }  // namespace internal
6888 }  // namespace v8
6889