1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/v8.h"
6
7 #include "src/ast.h"
8 #include "src/base/platform/platform.h"
9 #include "src/compilation-cache.h"
10 #include "src/compiler.h"
11 #include "src/execution.h"
12 #include "src/factory.h"
13 #include "src/jsregexp-inl.h"
14 #include "src/jsregexp.h"
15 #include "src/ostreams.h"
16 #include "src/parser.h"
17 #include "src/regexp-macro-assembler.h"
18 #include "src/regexp-macro-assembler-irregexp.h"
19 #include "src/regexp-macro-assembler-tracer.h"
20 #include "src/regexp-stack.h"
21 #include "src/runtime.h"
22 #include "src/string-search.h"
23
24 #ifndef V8_INTERPRETED_REGEXP
25 #if V8_TARGET_ARCH_IA32
26 #include "src/ia32/regexp-macro-assembler-ia32.h" // NOLINT
27 #elif V8_TARGET_ARCH_X64
28 #include "src/x64/regexp-macro-assembler-x64.h" // NOLINT
29 #elif V8_TARGET_ARCH_ARM64
30 #include "src/arm64/regexp-macro-assembler-arm64.h" // NOLINT
31 #elif V8_TARGET_ARCH_ARM
32 #include "src/arm/regexp-macro-assembler-arm.h" // NOLINT
33 #elif V8_TARGET_ARCH_MIPS
34 #include "src/mips/regexp-macro-assembler-mips.h" // NOLINT
35 #elif V8_TARGET_ARCH_MIPS64
36 #include "src/mips64/regexp-macro-assembler-mips64.h" // NOLINT
37 #elif V8_TARGET_ARCH_X87
38 #include "src/x87/regexp-macro-assembler-x87.h" // NOLINT
39 #else
40 #error Unsupported target architecture.
41 #endif
42 #endif
43
44 #include "src/interpreter-irregexp.h"
45
46
47 namespace v8 {
48 namespace internal {
49
CreateRegExpLiteral(Handle<JSFunction> constructor,Handle<String> pattern,Handle<String> flags)50 MaybeHandle<Object> RegExpImpl::CreateRegExpLiteral(
51 Handle<JSFunction> constructor,
52 Handle<String> pattern,
53 Handle<String> flags) {
54 // Call the construct code with 2 arguments.
55 Handle<Object> argv[] = { pattern, flags };
56 return Execution::New(constructor, arraysize(argv), argv);
57 }
58
59
RegExpFlagsFromString(Handle<String> str)60 static JSRegExp::Flags RegExpFlagsFromString(Handle<String> str) {
61 int flags = JSRegExp::NONE;
62 for (int i = 0; i < str->length(); i++) {
63 switch (str->Get(i)) {
64 case 'i':
65 flags |= JSRegExp::IGNORE_CASE;
66 break;
67 case 'g':
68 flags |= JSRegExp::GLOBAL;
69 break;
70 case 'm':
71 flags |= JSRegExp::MULTILINE;
72 break;
73 case 'y':
74 if (FLAG_harmony_regexps) flags |= JSRegExp::STICKY;
75 break;
76 }
77 }
78 return JSRegExp::Flags(flags);
79 }
80
81
82 MUST_USE_RESULT
ThrowRegExpException(Handle<JSRegExp> re,Handle<String> pattern,Handle<String> error_text,const char * message)83 static inline MaybeHandle<Object> ThrowRegExpException(
84 Handle<JSRegExp> re,
85 Handle<String> pattern,
86 Handle<String> error_text,
87 const char* message) {
88 Isolate* isolate = re->GetIsolate();
89 Factory* factory = isolate->factory();
90 Handle<FixedArray> elements = factory->NewFixedArray(2);
91 elements->set(0, *pattern);
92 elements->set(1, *error_text);
93 Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
94 Handle<Object> regexp_err;
95 THROW_NEW_ERROR(isolate, NewSyntaxError(message, array), Object);
96 }
97
98
AddRange(ContainedInLattice containment,const int * ranges,int ranges_length,Interval new_range)99 ContainedInLattice AddRange(ContainedInLattice containment,
100 const int* ranges,
101 int ranges_length,
102 Interval new_range) {
103 DCHECK((ranges_length & 1) == 1);
104 DCHECK(ranges[ranges_length - 1] == String::kMaxUtf16CodeUnit + 1);
105 if (containment == kLatticeUnknown) return containment;
106 bool inside = false;
107 int last = 0;
108 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
109 // Consider the range from last to ranges[i].
110 // We haven't got to the new range yet.
111 if (ranges[i] <= new_range.from()) continue;
112 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
113 // inclusive, but the values in ranges are not.
114 if (last <= new_range.from() && new_range.to() < ranges[i]) {
115 return Combine(containment, inside ? kLatticeIn : kLatticeOut);
116 }
117 return kLatticeUnknown;
118 }
119 return containment;
120 }
121
122
123 // More makes code generation slower, less makes V8 benchmark score lower.
124 const int kMaxLookaheadForBoyerMoore = 8;
125 // In a 3-character pattern you can maximally step forwards 3 characters
126 // at a time, which is not always enough to pay for the extra logic.
127 const int kPatternTooShortForBoyerMoore = 2;
128
129
130 // Identifies the sort of regexps where the regexp engine is faster
131 // than the code used for atom matches.
HasFewDifferentCharacters(Handle<String> pattern)132 static bool HasFewDifferentCharacters(Handle<String> pattern) {
133 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
134 if (length <= kPatternTooShortForBoyerMoore) return false;
135 const int kMod = 128;
136 bool character_found[kMod];
137 int different = 0;
138 memset(&character_found[0], 0, sizeof(character_found));
139 for (int i = 0; i < length; i++) {
140 int ch = (pattern->Get(i) & (kMod - 1));
141 if (!character_found[ch]) {
142 character_found[ch] = true;
143 different++;
144 // We declare a regexp low-alphabet if it has at least 3 times as many
145 // characters as it has different characters.
146 if (different * 3 > length) return false;
147 }
148 }
149 return true;
150 }
151
152
153 // Generic RegExp methods. Dispatches to implementation specific methods.
154
155
Compile(Handle<JSRegExp> re,Handle<String> pattern,Handle<String> flag_str)156 MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
157 Handle<String> pattern,
158 Handle<String> flag_str) {
159 Isolate* isolate = re->GetIsolate();
160 Zone zone(isolate);
161 JSRegExp::Flags flags = RegExpFlagsFromString(flag_str);
162 CompilationCache* compilation_cache = isolate->compilation_cache();
163 MaybeHandle<FixedArray> maybe_cached =
164 compilation_cache->LookupRegExp(pattern, flags);
165 Handle<FixedArray> cached;
166 bool in_cache = maybe_cached.ToHandle(&cached);
167 LOG(isolate, RegExpCompileEvent(re, in_cache));
168
169 Handle<Object> result;
170 if (in_cache) {
171 re->set_data(*cached);
172 return re;
173 }
174 pattern = String::Flatten(pattern);
175 PostponeInterruptsScope postpone(isolate);
176 RegExpCompileData parse_result;
177 FlatStringReader reader(isolate, pattern);
178 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
179 &parse_result, &zone)) {
180 // Throw an exception if we fail to parse the pattern.
181 return ThrowRegExpException(re,
182 pattern,
183 parse_result.error,
184 "malformed_regexp");
185 }
186
187 bool has_been_compiled = false;
188
189 if (parse_result.simple &&
190 !flags.is_ignore_case() &&
191 !flags.is_sticky() &&
192 !HasFewDifferentCharacters(pattern)) {
193 // Parse-tree is a single atom that is equal to the pattern.
194 AtomCompile(re, pattern, flags, pattern);
195 has_been_compiled = true;
196 } else if (parse_result.tree->IsAtom() &&
197 !flags.is_ignore_case() &&
198 !flags.is_sticky() &&
199 parse_result.capture_count == 0) {
200 RegExpAtom* atom = parse_result.tree->AsAtom();
201 Vector<const uc16> atom_pattern = atom->data();
202 Handle<String> atom_string;
203 ASSIGN_RETURN_ON_EXCEPTION(
204 isolate, atom_string,
205 isolate->factory()->NewStringFromTwoByte(atom_pattern),
206 Object);
207 if (!HasFewDifferentCharacters(atom_string)) {
208 AtomCompile(re, pattern, flags, atom_string);
209 has_been_compiled = true;
210 }
211 }
212 if (!has_been_compiled) {
213 IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
214 }
215 DCHECK(re->data()->IsFixedArray());
216 // Compilation succeeded so the data is set on the regexp
217 // and we can store it in the cache.
218 Handle<FixedArray> data(FixedArray::cast(re->data()));
219 compilation_cache->PutRegExp(pattern, flags, data);
220
221 return re;
222 }
223
224
Exec(Handle<JSRegExp> regexp,Handle<String> subject,int index,Handle<JSArray> last_match_info)225 MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
226 Handle<String> subject,
227 int index,
228 Handle<JSArray> last_match_info) {
229 switch (regexp->TypeTag()) {
230 case JSRegExp::ATOM:
231 return AtomExec(regexp, subject, index, last_match_info);
232 case JSRegExp::IRREGEXP: {
233 return IrregexpExec(regexp, subject, index, last_match_info);
234 }
235 default:
236 UNREACHABLE();
237 return MaybeHandle<Object>();
238 }
239 }
240
241
242 // RegExp Atom implementation: Simple string search using indexOf.
243
244
AtomCompile(Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,Handle<String> match_pattern)245 void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
246 Handle<String> pattern,
247 JSRegExp::Flags flags,
248 Handle<String> match_pattern) {
249 re->GetIsolate()->factory()->SetRegExpAtomData(re,
250 JSRegExp::ATOM,
251 pattern,
252 flags,
253 match_pattern);
254 }
255
256
SetAtomLastCapture(FixedArray * array,String * subject,int from,int to)257 static void SetAtomLastCapture(FixedArray* array,
258 String* subject,
259 int from,
260 int to) {
261 SealHandleScope shs(array->GetIsolate());
262 RegExpImpl::SetLastCaptureCount(array, 2);
263 RegExpImpl::SetLastSubject(array, subject);
264 RegExpImpl::SetLastInput(array, subject);
265 RegExpImpl::SetCapture(array, 0, from);
266 RegExpImpl::SetCapture(array, 1, to);
267 }
268
269
AtomExecRaw(Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)270 int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
271 Handle<String> subject,
272 int index,
273 int32_t* output,
274 int output_size) {
275 Isolate* isolate = regexp->GetIsolate();
276
277 DCHECK(0 <= index);
278 DCHECK(index <= subject->length());
279
280 subject = String::Flatten(subject);
281 DisallowHeapAllocation no_gc; // ensure vectors stay valid
282
283 String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
284 int needle_len = needle->length();
285 DCHECK(needle->IsFlat());
286 DCHECK_LT(0, needle_len);
287
288 if (index + needle_len > subject->length()) {
289 return RegExpImpl::RE_FAILURE;
290 }
291
292 for (int i = 0; i < output_size; i += 2) {
293 String::FlatContent needle_content = needle->GetFlatContent();
294 String::FlatContent subject_content = subject->GetFlatContent();
295 DCHECK(needle_content.IsFlat());
296 DCHECK(subject_content.IsFlat());
297 // dispatch on type of strings
298 index =
299 (needle_content.IsOneByte()
300 ? (subject_content.IsOneByte()
301 ? SearchString(isolate, subject_content.ToOneByteVector(),
302 needle_content.ToOneByteVector(), index)
303 : SearchString(isolate, subject_content.ToUC16Vector(),
304 needle_content.ToOneByteVector(), index))
305 : (subject_content.IsOneByte()
306 ? SearchString(isolate, subject_content.ToOneByteVector(),
307 needle_content.ToUC16Vector(), index)
308 : SearchString(isolate, subject_content.ToUC16Vector(),
309 needle_content.ToUC16Vector(), index)));
310 if (index == -1) {
311 return i / 2; // Return number of matches.
312 } else {
313 output[i] = index;
314 output[i+1] = index + needle_len;
315 index += needle_len;
316 }
317 }
318 return output_size / 2;
319 }
320
321
AtomExec(Handle<JSRegExp> re,Handle<String> subject,int index,Handle<JSArray> last_match_info)322 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re,
323 Handle<String> subject,
324 int index,
325 Handle<JSArray> last_match_info) {
326 Isolate* isolate = re->GetIsolate();
327
328 static const int kNumRegisters = 2;
329 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
330 int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
331
332 int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
333
334 if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
335
336 DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
337 SealHandleScope shs(isolate);
338 FixedArray* array = FixedArray::cast(last_match_info->elements());
339 SetAtomLastCapture(array, *subject, output_registers[0], output_registers[1]);
340 return last_match_info;
341 }
342
343
344 // Irregexp implementation.
345
346 // Ensures that the regexp object contains a compiled version of the
347 // source for either one-byte or two-byte subject strings.
348 // If the compiled version doesn't already exist, it is compiled
349 // from the source pattern.
350 // If compilation fails, an exception is thrown and this function
351 // returns false.
EnsureCompiledIrregexp(Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)352 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re,
353 Handle<String> sample_subject,
354 bool is_one_byte) {
355 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
356 #ifdef V8_INTERPRETED_REGEXP
357 if (compiled_code->IsByteArray()) return true;
358 #else // V8_INTERPRETED_REGEXP (RegExp native code)
359 if (compiled_code->IsCode()) return true;
360 #endif
361 // We could potentially have marked this as flushable, but have kept
362 // a saved version if we did not flush it yet.
363 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
364 if (saved_code->IsCode()) {
365 // Reinstate the code in the original place.
366 re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code);
367 DCHECK(compiled_code->IsSmi());
368 return true;
369 }
370 return CompileIrregexp(re, sample_subject, is_one_byte);
371 }
372
373
CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re,Handle<String> error_message,Isolate * isolate)374 static void CreateRegExpErrorObjectAndThrow(Handle<JSRegExp> re,
375 Handle<String> error_message,
376 Isolate* isolate) {
377 Factory* factory = isolate->factory();
378 Handle<FixedArray> elements = factory->NewFixedArray(2);
379 elements->set(0, re->Pattern());
380 elements->set(1, *error_message);
381 Handle<JSArray> array = factory->NewJSArrayWithElements(elements);
382 Handle<Object> error;
383 MaybeHandle<Object> maybe_error =
384 factory->NewSyntaxError("malformed_regexp", array);
385 if (maybe_error.ToHandle(&error)) isolate->Throw(*error);
386 }
387
388
CompileIrregexp(Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)389 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
390 Handle<String> sample_subject,
391 bool is_one_byte) {
392 // Compile the RegExp.
393 Isolate* isolate = re->GetIsolate();
394 Zone zone(isolate);
395 PostponeInterruptsScope postpone(isolate);
396 // If we had a compilation error the last time this is saved at the
397 // saved code index.
398 Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
399 // When arriving here entry can only be a smi, either representing an
400 // uncompiled regexp, a previous compilation error, or code that has
401 // been flushed.
402 DCHECK(entry->IsSmi());
403 int entry_value = Smi::cast(entry)->value();
404 DCHECK(entry_value == JSRegExp::kUninitializedValue ||
405 entry_value == JSRegExp::kCompilationErrorValue ||
406 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
407
408 if (entry_value == JSRegExp::kCompilationErrorValue) {
409 // A previous compilation failed and threw an error which we store in
410 // the saved code index (we store the error message, not the actual
411 // error). Recreate the error object and throw it.
412 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
413 DCHECK(error_string->IsString());
414 Handle<String> error_message(String::cast(error_string));
415 CreateRegExpErrorObjectAndThrow(re, error_message, isolate);
416 return false;
417 }
418
419 JSRegExp::Flags flags = re->GetFlags();
420
421 Handle<String> pattern(re->Pattern());
422 pattern = String::Flatten(pattern);
423 RegExpCompileData compile_data;
424 FlatStringReader reader(isolate, pattern);
425 if (!RegExpParser::ParseRegExp(&reader, flags.is_multiline(),
426 &compile_data,
427 &zone)) {
428 // Throw an exception if we fail to parse the pattern.
429 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
430 USE(ThrowRegExpException(re,
431 pattern,
432 compile_data.error,
433 "malformed_regexp"));
434 return false;
435 }
436 RegExpEngine::CompilationResult result = RegExpEngine::Compile(
437 &compile_data, flags.is_ignore_case(), flags.is_global(),
438 flags.is_multiline(), flags.is_sticky(), pattern, sample_subject,
439 is_one_byte, &zone);
440 if (result.error_message != NULL) {
441 // Unable to compile regexp.
442 Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
443 CStrVector(result.error_message)).ToHandleChecked();
444 CreateRegExpErrorObjectAndThrow(re, error_message, isolate);
445 return false;
446 }
447
448 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
449 data->set(JSRegExp::code_index(is_one_byte), result.code);
450 int register_max = IrregexpMaxRegisterCount(*data);
451 if (result.num_registers > register_max) {
452 SetIrregexpMaxRegisterCount(*data, result.num_registers);
453 }
454
455 return true;
456 }
457
458
IrregexpMaxRegisterCount(FixedArray * re)459 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
460 return Smi::cast(
461 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
462 }
463
464
SetIrregexpMaxRegisterCount(FixedArray * re,int value)465 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
466 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
467 }
468
469
IrregexpNumberOfCaptures(FixedArray * re)470 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
471 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
472 }
473
474
IrregexpNumberOfRegisters(FixedArray * re)475 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
476 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
477 }
478
479
IrregexpByteCode(FixedArray * re,bool is_one_byte)480 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
481 return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
482 }
483
484
IrregexpNativeCode(FixedArray * re,bool is_one_byte)485 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
486 return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
487 }
488
489
IrregexpInitialize(Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,int capture_count)490 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
491 Handle<String> pattern,
492 JSRegExp::Flags flags,
493 int capture_count) {
494 // Initialize compiled code entries to null.
495 re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
496 JSRegExp::IRREGEXP,
497 pattern,
498 flags,
499 capture_count);
500 }
501
502
IrregexpPrepare(Handle<JSRegExp> regexp,Handle<String> subject)503 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
504 Handle<String> subject) {
505 subject = String::Flatten(subject);
506
507 // Check representation of the underlying storage.
508 bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
509 if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1;
510
511 #ifdef V8_INTERPRETED_REGEXP
512 // Byte-code regexp needs space allocated for all its registers.
513 // The result captures are copied to the start of the registers array
514 // if the match succeeds. This way those registers are not clobbered
515 // when we set the last match info from last successful match.
516 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
517 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
518 #else // V8_INTERPRETED_REGEXP
519 // Native regexp only needs room to output captures. Registers are handled
520 // internally.
521 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
522 #endif // V8_INTERPRETED_REGEXP
523 }
524
525
IrregexpExecRaw(Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)526 int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
527 Handle<String> subject,
528 int index,
529 int32_t* output,
530 int output_size) {
531 Isolate* isolate = regexp->GetIsolate();
532
533 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
534
535 DCHECK(index >= 0);
536 DCHECK(index <= subject->length());
537 DCHECK(subject->IsFlat());
538
539 bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
540
541 #ifndef V8_INTERPRETED_REGEXP
542 DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
543 do {
544 EnsureCompiledIrregexp(regexp, subject, is_one_byte);
545 Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
546 // The stack is used to allocate registers for the compiled regexp code.
547 // This means that in case of failure, the output registers array is left
548 // untouched and contains the capture results from the previous successful
549 // match. We can use that to set the last match info lazily.
550 NativeRegExpMacroAssembler::Result res =
551 NativeRegExpMacroAssembler::Match(code,
552 subject,
553 output,
554 output_size,
555 index,
556 isolate);
557 if (res != NativeRegExpMacroAssembler::RETRY) {
558 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
559 isolate->has_pending_exception());
560 STATIC_ASSERT(
561 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
562 STATIC_ASSERT(
563 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
564 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
565 == RE_EXCEPTION);
566 return static_cast<IrregexpResult>(res);
567 }
568 // If result is RETRY, the string has changed representation, and we
569 // must restart from scratch.
570 // In this case, it means we must make sure we are prepared to handle
571 // the, potentially, different subject (the string can switch between
572 // being internal and external, and even between being Latin1 and UC16,
573 // but the characters are always the same).
574 IrregexpPrepare(regexp, subject);
575 is_one_byte = subject->IsOneByteRepresentationUnderneath();
576 } while (true);
577 UNREACHABLE();
578 return RE_EXCEPTION;
579 #else // V8_INTERPRETED_REGEXP
580
581 DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
582 // We must have done EnsureCompiledIrregexp, so we can get the number of
583 // registers.
584 int number_of_capture_registers =
585 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
586 int32_t* raw_output = &output[number_of_capture_registers];
587 // We do not touch the actual capture result registers until we know there
588 // has been a match so that we can use those capture results to set the
589 // last match info.
590 for (int i = number_of_capture_registers - 1; i >= 0; i--) {
591 raw_output[i] = -1;
592 }
593 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
594 isolate);
595
596 IrregexpResult result = IrregexpInterpreter::Match(isolate,
597 byte_codes,
598 subject,
599 raw_output,
600 index);
601 if (result == RE_SUCCESS) {
602 // Copy capture results to the start of the registers array.
603 MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
604 }
605 if (result == RE_EXCEPTION) {
606 DCHECK(!isolate->has_pending_exception());
607 isolate->StackOverflow();
608 }
609 return result;
610 #endif // V8_INTERPRETED_REGEXP
611 }
612
613
IrregexpExec(Handle<JSRegExp> regexp,Handle<String> subject,int previous_index,Handle<JSArray> last_match_info)614 MaybeHandle<Object> RegExpImpl::IrregexpExec(Handle<JSRegExp> regexp,
615 Handle<String> subject,
616 int previous_index,
617 Handle<JSArray> last_match_info) {
618 Isolate* isolate = regexp->GetIsolate();
619 DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
620
621 // Prepare space for the return values.
622 #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
623 if (FLAG_trace_regexp_bytecodes) {
624 String* pattern = regexp->Pattern();
625 PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
626 PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
627 }
628 #endif
629 int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
630 if (required_registers < 0) {
631 // Compiling failed with an exception.
632 DCHECK(isolate->has_pending_exception());
633 return MaybeHandle<Object>();
634 }
635
636 int32_t* output_registers = NULL;
637 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
638 output_registers = NewArray<int32_t>(required_registers);
639 }
640 SmartArrayPointer<int32_t> auto_release(output_registers);
641 if (output_registers == NULL) {
642 output_registers = isolate->jsregexp_static_offsets_vector();
643 }
644
645 int res = RegExpImpl::IrregexpExecRaw(
646 regexp, subject, previous_index, output_registers, required_registers);
647 if (res == RE_SUCCESS) {
648 int capture_count =
649 IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
650 return SetLastMatchInfo(
651 last_match_info, subject, capture_count, output_registers);
652 }
653 if (res == RE_EXCEPTION) {
654 DCHECK(isolate->has_pending_exception());
655 return MaybeHandle<Object>();
656 }
657 DCHECK(res == RE_FAILURE);
658 return isolate->factory()->null_value();
659 }
660
661
SetLastMatchInfo(Handle<JSArray> last_match_info,Handle<String> subject,int capture_count,int32_t * match)662 Handle<JSArray> RegExpImpl::SetLastMatchInfo(Handle<JSArray> last_match_info,
663 Handle<String> subject,
664 int capture_count,
665 int32_t* match) {
666 DCHECK(last_match_info->HasFastObjectElements());
667 int capture_register_count = (capture_count + 1) * 2;
668 JSArray::EnsureSize(last_match_info,
669 capture_register_count + kLastMatchOverhead);
670 DisallowHeapAllocation no_allocation;
671 FixedArray* array = FixedArray::cast(last_match_info->elements());
672 if (match != NULL) {
673 for (int i = 0; i < capture_register_count; i += 2) {
674 SetCapture(array, i, match[i]);
675 SetCapture(array, i + 1, match[i + 1]);
676 }
677 }
678 SetLastCaptureCount(array, capture_register_count);
679 SetLastSubject(array, *subject);
680 SetLastInput(array, *subject);
681 return last_match_info;
682 }
683
684
GlobalCache(Handle<JSRegExp> regexp,Handle<String> subject,bool is_global,Isolate * isolate)685 RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
686 Handle<String> subject,
687 bool is_global,
688 Isolate* isolate)
689 : register_array_(NULL),
690 register_array_size_(0),
691 regexp_(regexp),
692 subject_(subject) {
693 #ifdef V8_INTERPRETED_REGEXP
694 bool interpreted = true;
695 #else
696 bool interpreted = false;
697 #endif // V8_INTERPRETED_REGEXP
698
699 if (regexp_->TypeTag() == JSRegExp::ATOM) {
700 static const int kAtomRegistersPerMatch = 2;
701 registers_per_match_ = kAtomRegistersPerMatch;
702 // There is no distinction between interpreted and native for atom regexps.
703 interpreted = false;
704 } else {
705 registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
706 if (registers_per_match_ < 0) {
707 num_matches_ = -1; // Signal exception.
708 return;
709 }
710 }
711
712 if (is_global && !interpreted) {
713 register_array_size_ =
714 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
715 max_matches_ = register_array_size_ / registers_per_match_;
716 } else {
717 // Global loop in interpreted regexp is not implemented. We choose
718 // the size of the offsets vector so that it can only store one match.
719 register_array_size_ = registers_per_match_;
720 max_matches_ = 1;
721 }
722
723 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
724 register_array_ = NewArray<int32_t>(register_array_size_);
725 } else {
726 register_array_ = isolate->jsregexp_static_offsets_vector();
727 }
728
729 // Set state so that fetching the results the first time triggers a call
730 // to the compiled regexp.
731 current_match_index_ = max_matches_ - 1;
732 num_matches_ = max_matches_;
733 DCHECK(registers_per_match_ >= 2); // Each match has at least one capture.
734 DCHECK_GE(register_array_size_, registers_per_match_);
735 int32_t* last_match =
736 ®ister_array_[current_match_index_ * registers_per_match_];
737 last_match[0] = -1;
738 last_match[1] = 0;
739 }
740
741
742 // -------------------------------------------------------------------
743 // Implementation of the Irregexp regular expression engine.
744 //
745 // The Irregexp regular expression engine is intended to be a complete
746 // implementation of ECMAScript regular expressions. It generates either
747 // bytecodes or native code.
748
749 // The Irregexp regexp engine is structured in three steps.
750 // 1) The parser generates an abstract syntax tree. See ast.cc.
751 // 2) From the AST a node network is created. The nodes are all
752 // subclasses of RegExpNode. The nodes represent states when
753 // executing a regular expression. Several optimizations are
754 // performed on the node network.
755 // 3) From the nodes we generate either byte codes or native code
756 // that can actually execute the regular expression (perform
757 // the search). The code generation step is described in more
758 // detail below.
759
760 // Code generation.
761 //
762 // The nodes are divided into four main categories.
763 // * Choice nodes
764 // These represent places where the regular expression can
765 // match in more than one way. For example on entry to an
766 // alternation (foo|bar) or a repetition (*, +, ? or {}).
767 // * Action nodes
768 // These represent places where some action should be
769 // performed. Examples include recording the current position
770 // in the input string to a register (in order to implement
771 // captures) or other actions on register for example in order
772 // to implement the counters needed for {} repetitions.
773 // * Matching nodes
774 // These attempt to match some element part of the input string.
775 // Examples of elements include character classes, plain strings
776 // or back references.
777 // * End nodes
778 // These are used to implement the actions required on finding
779 // a successful match or failing to find a match.
780 //
781 // The code generated (whether as byte codes or native code) maintains
782 // some state as it runs. This consists of the following elements:
783 //
784 // * The capture registers. Used for string captures.
785 // * Other registers. Used for counters etc.
786 // * The current position.
787 // * The stack of backtracking information. Used when a matching node
788 // fails to find a match and needs to try an alternative.
789 //
790 // Conceptual regular expression execution model:
791 //
792 // There is a simple conceptual model of regular expression execution
793 // which will be presented first. The actual code generated is a more
794 // efficient simulation of the simple conceptual model:
795 //
796 // * Choice nodes are implemented as follows:
797 // For each choice except the last {
798 // push current position
799 // push backtrack code location
800 // <generate code to test for choice>
801 // backtrack code location:
802 // pop current position
803 // }
804 // <generate code to test for last choice>
805 //
806 // * Actions nodes are generated as follows
807 // <push affected registers on backtrack stack>
808 // <generate code to perform action>
809 // push backtrack code location
810 // <generate code to test for following nodes>
811 // backtrack code location:
812 // <pop affected registers to restore their state>
813 // <pop backtrack location from stack and go to it>
814 //
815 // * Matching nodes are generated as follows:
816 // if input string matches at current position
817 // update current position
818 // <generate code to test for following nodes>
819 // else
820 // <pop backtrack location from stack and go to it>
821 //
822 // Thus it can be seen that the current position is saved and restored
823 // by the choice nodes, whereas the registers are saved and restored by
824 // by the action nodes that manipulate them.
825 //
826 // The other interesting aspect of this model is that nodes are generated
827 // at the point where they are needed by a recursive call to Emit(). If
828 // the node has already been code generated then the Emit() call will
829 // generate a jump to the previously generated code instead. In order to
830 // limit recursion it is possible for the Emit() function to put the node
831 // on a work list for later generation and instead generate a jump. The
832 // destination of the jump is resolved later when the code is generated.
833 //
834 // Actual regular expression code generation.
835 //
836 // Code generation is actually more complicated than the above. In order
837 // to improve the efficiency of the generated code some optimizations are
838 // performed
839 //
840 // * Choice nodes have 1-character lookahead.
841 // A choice node looks at the following character and eliminates some of
842 // the choices immediately based on that character. This is not yet
843 // implemented.
844 // * Simple greedy loops store reduced backtracking information.
845 // A quantifier like /.*foo/m will greedily match the whole input. It will
846 // then need to backtrack to a point where it can match "foo". The naive
847 // implementation of this would push each character position onto the
848 // backtracking stack, then pop them off one by one. This would use space
849 // proportional to the length of the input string. However since the "."
850 // can only match in one way and always has a constant length (in this case
851 // of 1) it suffices to store the current position on the top of the stack
852 // once. Matching now becomes merely incrementing the current position and
853 // backtracking becomes decrementing the current position and checking the
854 // result against the stored current position. This is faster and saves
855 // space.
856 // * The current state is virtualized.
857 // This is used to defer expensive operations until it is clear that they
858 // are needed and to generate code for a node more than once, allowing
859 // specialized an efficient versions of the code to be created. This is
860 // explained in the section below.
861 //
862 // Execution state virtualization.
863 //
864 // Instead of emitting code, nodes that manipulate the state can record their
865 // manipulation in an object called the Trace. The Trace object can record a
866 // current position offset, an optional backtrack code location on the top of
867 // the virtualized backtrack stack and some register changes. When a node is
868 // to be emitted it can flush the Trace or update it. Flushing the Trace
869 // will emit code to bring the actual state into line with the virtual state.
870 // Avoiding flushing the state can postpone some work (e.g. updates of capture
871 // registers). Postponing work can save time when executing the regular
872 // expression since it may be found that the work never has to be done as a
873 // failure to match can occur. In addition it is much faster to jump to a
874 // known backtrack code location than it is to pop an unknown backtrack
875 // location from the stack and jump there.
876 //
877 // The virtual state found in the Trace affects code generation. For example
878 // the virtual state contains the difference between the actual current
879 // position and the virtual current position, and matching code needs to use
880 // this offset to attempt a match in the correct location of the input
881 // string. Therefore code generated for a non-trivial trace is specialized
882 // to that trace. The code generator therefore has the ability to generate
883 // code for each node several times. In order to limit the size of the
884 // generated code there is an arbitrary limit on how many specialized sets of
885 // code may be generated for a given node. If the limit is reached, the
886 // trace is flushed and a generic version of the code for a node is emitted.
887 // This is subsequently used for that node. The code emitted for non-generic
888 // trace is not recorded in the node and so it cannot currently be reused in
889 // the event that code generation is requested for an identical trace.
890
891
AppendToText(RegExpText * text,Zone * zone)892 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
893 UNREACHABLE();
894 }
895
896
AppendToText(RegExpText * text,Zone * zone)897 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
898 text->AddElement(TextElement::Atom(this), zone);
899 }
900
901
AppendToText(RegExpText * text,Zone * zone)902 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
903 text->AddElement(TextElement::CharClass(this), zone);
904 }
905
906
AppendToText(RegExpText * text,Zone * zone)907 void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
908 for (int i = 0; i < elements()->length(); i++)
909 text->AddElement(elements()->at(i), zone);
910 }
911
912
Atom(RegExpAtom * atom)913 TextElement TextElement::Atom(RegExpAtom* atom) {
914 return TextElement(ATOM, atom);
915 }
916
917
CharClass(RegExpCharacterClass * char_class)918 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
919 return TextElement(CHAR_CLASS, char_class);
920 }
921
922
length() const923 int TextElement::length() const {
924 switch (text_type()) {
925 case ATOM:
926 return atom()->length();
927
928 case CHAR_CLASS:
929 return 1;
930 }
931 UNREACHABLE();
932 return 0;
933 }
934
935
GetTable(bool ignore_case)936 DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
937 if (table_ == NULL) {
938 table_ = new(zone()) DispatchTable(zone());
939 DispatchTableConstructor cons(table_, ignore_case, zone());
940 cons.BuildTable(this);
941 }
942 return table_;
943 }
944
945
946 class FrequencyCollator {
947 public:
FrequencyCollator()948 FrequencyCollator() : total_samples_(0) {
949 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
950 frequencies_[i] = CharacterFrequency(i);
951 }
952 }
953
CountCharacter(int character)954 void CountCharacter(int character) {
955 int index = (character & RegExpMacroAssembler::kTableMask);
956 frequencies_[index].Increment();
957 total_samples_++;
958 }
959
960 // Does not measure in percent, but rather per-128 (the table size from the
961 // regexp macro assembler).
Frequency(int in_character)962 int Frequency(int in_character) {
963 DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
964 if (total_samples_ < 1) return 1; // Division by zero.
965 int freq_in_per128 =
966 (frequencies_[in_character].counter() * 128) / total_samples_;
967 return freq_in_per128;
968 }
969
970 private:
971 class CharacterFrequency {
972 public:
CharacterFrequency()973 CharacterFrequency() : counter_(0), character_(-1) { }
CharacterFrequency(int character)974 explicit CharacterFrequency(int character)
975 : counter_(0), character_(character) { }
976
Increment()977 void Increment() { counter_++; }
counter()978 int counter() { return counter_; }
character()979 int character() { return character_; }
980
981 private:
982 int counter_;
983 int character_;
984 };
985
986
987 private:
988 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
989 int total_samples_;
990 };
991
992
993 class RegExpCompiler {
994 public:
995 RegExpCompiler(int capture_count, bool ignore_case, bool is_one_byte,
996 Zone* zone);
997
AllocateRegister()998 int AllocateRegister() {
999 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
1000 reg_exp_too_big_ = true;
1001 return next_register_;
1002 }
1003 return next_register_++;
1004 }
1005
1006 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
1007 RegExpNode* start,
1008 int capture_count,
1009 Handle<String> pattern);
1010
AddWork(RegExpNode * node)1011 inline void AddWork(RegExpNode* node) { work_list_->Add(node); }
1012
1013 static const int kImplementationOffset = 0;
1014 static const int kNumberOfRegistersOffset = 0;
1015 static const int kCodeOffset = 1;
1016
macro_assembler()1017 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
accept()1018 EndNode* accept() { return accept_; }
1019
1020 static const int kMaxRecursion = 100;
recursion_depth()1021 inline int recursion_depth() { return recursion_depth_; }
IncrementRecursionDepth()1022 inline void IncrementRecursionDepth() { recursion_depth_++; }
DecrementRecursionDepth()1023 inline void DecrementRecursionDepth() { recursion_depth_--; }
1024
SetRegExpTooBig()1025 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
1026
ignore_case()1027 inline bool ignore_case() { return ignore_case_; }
one_byte()1028 inline bool one_byte() { return one_byte_; }
frequency_collator()1029 FrequencyCollator* frequency_collator() { return &frequency_collator_; }
1030
current_expansion_factor()1031 int current_expansion_factor() { return current_expansion_factor_; }
set_current_expansion_factor(int value)1032 void set_current_expansion_factor(int value) {
1033 current_expansion_factor_ = value;
1034 }
1035
zone() const1036 Zone* zone() const { return zone_; }
1037
1038 static const int kNoRegister = -1;
1039
1040 private:
1041 EndNode* accept_;
1042 int next_register_;
1043 List<RegExpNode*>* work_list_;
1044 int recursion_depth_;
1045 RegExpMacroAssembler* macro_assembler_;
1046 bool ignore_case_;
1047 bool one_byte_;
1048 bool reg_exp_too_big_;
1049 int current_expansion_factor_;
1050 FrequencyCollator frequency_collator_;
1051 Zone* zone_;
1052 };
1053
1054
1055 class RecursionCheck {
1056 public:
RecursionCheck(RegExpCompiler * compiler)1057 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1058 compiler->IncrementRecursionDepth();
1059 }
~RecursionCheck()1060 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1061 private:
1062 RegExpCompiler* compiler_;
1063 };
1064
1065
IrregexpRegExpTooBig(Isolate * isolate)1066 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1067 return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1068 }
1069
1070
1071 // Attempts to compile the regexp using an Irregexp code generator. Returns
1072 // a fixed array or a null handle depending on whether it succeeded.
RegExpCompiler(int capture_count,bool ignore_case,bool one_byte,Zone * zone)1073 RegExpCompiler::RegExpCompiler(int capture_count, bool ignore_case,
1074 bool one_byte, Zone* zone)
1075 : next_register_(2 * (capture_count + 1)),
1076 work_list_(NULL),
1077 recursion_depth_(0),
1078 ignore_case_(ignore_case),
1079 one_byte_(one_byte),
1080 reg_exp_too_big_(false),
1081 current_expansion_factor_(1),
1082 frequency_collator_(),
1083 zone_(zone) {
1084 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1085 DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
1086 }
1087
1088
Assemble(RegExpMacroAssembler * macro_assembler,RegExpNode * start,int capture_count,Handle<String> pattern)1089 RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1090 RegExpMacroAssembler* macro_assembler,
1091 RegExpNode* start,
1092 int capture_count,
1093 Handle<String> pattern) {
1094 Heap* heap = pattern->GetHeap();
1095
1096 bool use_slow_safe_regexp_compiler = false;
1097 if (heap->total_regexp_code_generated() >
1098 RegExpImpl::kRegWxpCompiledLimit &&
1099 heap->isolate()->memory_allocator()->SizeExecutable() >
1100 RegExpImpl::kRegExpExecutableMemoryLimit) {
1101 use_slow_safe_regexp_compiler = true;
1102 }
1103
1104 macro_assembler->set_slow_safe(use_slow_safe_regexp_compiler);
1105
1106 #ifdef DEBUG
1107 if (FLAG_trace_regexp_assembler)
1108 macro_assembler_ = new RegExpMacroAssemblerTracer(macro_assembler);
1109 else
1110 #endif
1111 macro_assembler_ = macro_assembler;
1112
1113 List <RegExpNode*> work_list(0);
1114 work_list_ = &work_list;
1115 Label fail;
1116 macro_assembler_->PushBacktrack(&fail);
1117 Trace new_trace;
1118 start->Emit(this, &new_trace);
1119 macro_assembler_->Bind(&fail);
1120 macro_assembler_->Fail();
1121 while (!work_list.is_empty()) {
1122 work_list.RemoveLast()->Emit(this, &new_trace);
1123 }
1124 if (reg_exp_too_big_) return IrregexpRegExpTooBig(zone_->isolate());
1125
1126 Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1127 heap->IncreaseTotalRegexpCodeGenerated(code->Size());
1128 work_list_ = NULL;
1129 #ifdef DEBUG
1130 if (FLAG_print_code) {
1131 CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer());
1132 OFStream os(trace_scope.file());
1133 Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
1134 }
1135 if (FLAG_trace_regexp_assembler) {
1136 delete macro_assembler_;
1137 }
1138 #endif
1139 return RegExpEngine::CompilationResult(*code, next_register_);
1140 }
1141
1142
Mentions(int that)1143 bool Trace::DeferredAction::Mentions(int that) {
1144 if (action_type() == ActionNode::CLEAR_CAPTURES) {
1145 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1146 return range.Contains(that);
1147 } else {
1148 return reg() == that;
1149 }
1150 }
1151
1152
mentions_reg(int reg)1153 bool Trace::mentions_reg(int reg) {
1154 for (DeferredAction* action = actions_;
1155 action != NULL;
1156 action = action->next()) {
1157 if (action->Mentions(reg))
1158 return true;
1159 }
1160 return false;
1161 }
1162
1163
GetStoredPosition(int reg,int * cp_offset)1164 bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1165 DCHECK_EQ(0, *cp_offset);
1166 for (DeferredAction* action = actions_;
1167 action != NULL;
1168 action = action->next()) {
1169 if (action->Mentions(reg)) {
1170 if (action->action_type() == ActionNode::STORE_POSITION) {
1171 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1172 return true;
1173 } else {
1174 return false;
1175 }
1176 }
1177 }
1178 return false;
1179 }
1180
1181
FindAffectedRegisters(OutSet * affected_registers,Zone * zone)1182 int Trace::FindAffectedRegisters(OutSet* affected_registers,
1183 Zone* zone) {
1184 int max_register = RegExpCompiler::kNoRegister;
1185 for (DeferredAction* action = actions_;
1186 action != NULL;
1187 action = action->next()) {
1188 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1189 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1190 for (int i = range.from(); i <= range.to(); i++)
1191 affected_registers->Set(i, zone);
1192 if (range.to() > max_register) max_register = range.to();
1193 } else {
1194 affected_registers->Set(action->reg(), zone);
1195 if (action->reg() > max_register) max_register = action->reg();
1196 }
1197 }
1198 return max_register;
1199 }
1200
1201
RestoreAffectedRegisters(RegExpMacroAssembler * assembler,int max_register,const OutSet & registers_to_pop,const OutSet & registers_to_clear)1202 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1203 int max_register,
1204 const OutSet& registers_to_pop,
1205 const OutSet& registers_to_clear) {
1206 for (int reg = max_register; reg >= 0; reg--) {
1207 if (registers_to_pop.Get(reg)) {
1208 assembler->PopRegister(reg);
1209 } else if (registers_to_clear.Get(reg)) {
1210 int clear_to = reg;
1211 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1212 reg--;
1213 }
1214 assembler->ClearRegisters(reg, clear_to);
1215 }
1216 }
1217 }
1218
1219
PerformDeferredActions(RegExpMacroAssembler * assembler,int max_register,const OutSet & affected_registers,OutSet * registers_to_pop,OutSet * registers_to_clear,Zone * zone)1220 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1221 int max_register,
1222 const OutSet& affected_registers,
1223 OutSet* registers_to_pop,
1224 OutSet* registers_to_clear,
1225 Zone* zone) {
1226 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1227 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1228
1229 // Count pushes performed to force a stack limit check occasionally.
1230 int pushes = 0;
1231
1232 for (int reg = 0; reg <= max_register; reg++) {
1233 if (!affected_registers.Get(reg)) {
1234 continue;
1235 }
1236
1237 // The chronologically first deferred action in the trace
1238 // is used to infer the action needed to restore a register
1239 // to its previous state (or not, if it's safe to ignore it).
1240 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1241 DeferredActionUndoType undo_action = IGNORE;
1242
1243 int value = 0;
1244 bool absolute = false;
1245 bool clear = false;
1246 int store_position = -1;
1247 // This is a little tricky because we are scanning the actions in reverse
1248 // historical order (newest first).
1249 for (DeferredAction* action = actions_;
1250 action != NULL;
1251 action = action->next()) {
1252 if (action->Mentions(reg)) {
1253 switch (action->action_type()) {
1254 case ActionNode::SET_REGISTER: {
1255 Trace::DeferredSetRegister* psr =
1256 static_cast<Trace::DeferredSetRegister*>(action);
1257 if (!absolute) {
1258 value += psr->value();
1259 absolute = true;
1260 }
1261 // SET_REGISTER is currently only used for newly introduced loop
1262 // counters. They can have a significant previous value if they
1263 // occour in a loop. TODO(lrn): Propagate this information, so
1264 // we can set undo_action to IGNORE if we know there is no value to
1265 // restore.
1266 undo_action = RESTORE;
1267 DCHECK_EQ(store_position, -1);
1268 DCHECK(!clear);
1269 break;
1270 }
1271 case ActionNode::INCREMENT_REGISTER:
1272 if (!absolute) {
1273 value++;
1274 }
1275 DCHECK_EQ(store_position, -1);
1276 DCHECK(!clear);
1277 undo_action = RESTORE;
1278 break;
1279 case ActionNode::STORE_POSITION: {
1280 Trace::DeferredCapture* pc =
1281 static_cast<Trace::DeferredCapture*>(action);
1282 if (!clear && store_position == -1) {
1283 store_position = pc->cp_offset();
1284 }
1285
1286 // For captures we know that stores and clears alternate.
1287 // Other register, are never cleared, and if the occur
1288 // inside a loop, they might be assigned more than once.
1289 if (reg <= 1) {
1290 // Registers zero and one, aka "capture zero", is
1291 // always set correctly if we succeed. There is no
1292 // need to undo a setting on backtrack, because we
1293 // will set it again or fail.
1294 undo_action = IGNORE;
1295 } else {
1296 undo_action = pc->is_capture() ? CLEAR : RESTORE;
1297 }
1298 DCHECK(!absolute);
1299 DCHECK_EQ(value, 0);
1300 break;
1301 }
1302 case ActionNode::CLEAR_CAPTURES: {
1303 // Since we're scanning in reverse order, if we've already
1304 // set the position we have to ignore historically earlier
1305 // clearing operations.
1306 if (store_position == -1) {
1307 clear = true;
1308 }
1309 undo_action = RESTORE;
1310 DCHECK(!absolute);
1311 DCHECK_EQ(value, 0);
1312 break;
1313 }
1314 default:
1315 UNREACHABLE();
1316 break;
1317 }
1318 }
1319 }
1320 // Prepare for the undo-action (e.g., push if it's going to be popped).
1321 if (undo_action == RESTORE) {
1322 pushes++;
1323 RegExpMacroAssembler::StackCheckFlag stack_check =
1324 RegExpMacroAssembler::kNoStackLimitCheck;
1325 if (pushes == push_limit) {
1326 stack_check = RegExpMacroAssembler::kCheckStackLimit;
1327 pushes = 0;
1328 }
1329
1330 assembler->PushRegister(reg, stack_check);
1331 registers_to_pop->Set(reg, zone);
1332 } else if (undo_action == CLEAR) {
1333 registers_to_clear->Set(reg, zone);
1334 }
1335 // Perform the chronologically last action (or accumulated increment)
1336 // for the register.
1337 if (store_position != -1) {
1338 assembler->WriteCurrentPositionToRegister(reg, store_position);
1339 } else if (clear) {
1340 assembler->ClearRegisters(reg, reg);
1341 } else if (absolute) {
1342 assembler->SetRegister(reg, value);
1343 } else if (value != 0) {
1344 assembler->AdvanceRegister(reg, value);
1345 }
1346 }
1347 }
1348
1349
1350 // This is called as we come into a loop choice node and some other tricky
1351 // nodes. It normalizes the state of the code generator to ensure we can
1352 // generate generic code.
Flush(RegExpCompiler * compiler,RegExpNode * successor)1353 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1354 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1355
1356 DCHECK(!is_trivial());
1357
1358 if (actions_ == NULL && backtrack() == NULL) {
1359 // Here we just have some deferred cp advances to fix and we are back to
1360 // a normal situation. We may also have to forget some information gained
1361 // through a quick check that was already performed.
1362 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1363 // Create a new trivial state and generate the node with that.
1364 Trace new_state;
1365 successor->Emit(compiler, &new_state);
1366 return;
1367 }
1368
1369 // Generate deferred actions here along with code to undo them again.
1370 OutSet affected_registers;
1371
1372 if (backtrack() != NULL) {
1373 // Here we have a concrete backtrack location. These are set up by choice
1374 // nodes and so they indicate that we have a deferred save of the current
1375 // position which we may need to emit here.
1376 assembler->PushCurrentPosition();
1377 }
1378
1379 int max_register = FindAffectedRegisters(&affected_registers,
1380 compiler->zone());
1381 OutSet registers_to_pop;
1382 OutSet registers_to_clear;
1383 PerformDeferredActions(assembler,
1384 max_register,
1385 affected_registers,
1386 ®isters_to_pop,
1387 ®isters_to_clear,
1388 compiler->zone());
1389 if (cp_offset_ != 0) {
1390 assembler->AdvanceCurrentPosition(cp_offset_);
1391 }
1392
1393 // Create a new trivial state and generate the node with that.
1394 Label undo;
1395 assembler->PushBacktrack(&undo);
1396 Trace new_state;
1397 successor->Emit(compiler, &new_state);
1398
1399 // On backtrack we need to restore state.
1400 assembler->Bind(&undo);
1401 RestoreAffectedRegisters(assembler,
1402 max_register,
1403 registers_to_pop,
1404 registers_to_clear);
1405 if (backtrack() == NULL) {
1406 assembler->Backtrack();
1407 } else {
1408 assembler->PopCurrentPosition();
1409 assembler->GoTo(backtrack());
1410 }
1411 }
1412
1413
Emit(RegExpCompiler * compiler,Trace * trace)1414 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1415 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1416
1417 // Omit flushing the trace. We discard the entire stack frame anyway.
1418
1419 if (!label()->is_bound()) {
1420 // We are completely independent of the trace, since we ignore it,
1421 // so this code can be used as the generic version.
1422 assembler->Bind(label());
1423 }
1424
1425 // Throw away everything on the backtrack stack since the start
1426 // of the negative submatch and restore the character position.
1427 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1428 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1429 if (clear_capture_count_ > 0) {
1430 // Clear any captures that might have been performed during the success
1431 // of the body of the negative look-ahead.
1432 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1433 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1434 }
1435 // Now that we have unwound the stack we find at the top of the stack the
1436 // backtrack that the BeginSubmatch node got.
1437 assembler->Backtrack();
1438 }
1439
1440
Emit(RegExpCompiler * compiler,Trace * trace)1441 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1442 if (!trace->is_trivial()) {
1443 trace->Flush(compiler, this);
1444 return;
1445 }
1446 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1447 if (!label()->is_bound()) {
1448 assembler->Bind(label());
1449 }
1450 switch (action_) {
1451 case ACCEPT:
1452 assembler->Succeed();
1453 return;
1454 case BACKTRACK:
1455 assembler->GoTo(trace->backtrack());
1456 return;
1457 case NEGATIVE_SUBMATCH_SUCCESS:
1458 // This case is handled in a different virtual method.
1459 UNREACHABLE();
1460 }
1461 UNIMPLEMENTED();
1462 }
1463
1464
AddGuard(Guard * guard,Zone * zone)1465 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
1466 if (guards_ == NULL)
1467 guards_ = new(zone) ZoneList<Guard*>(1, zone);
1468 guards_->Add(guard, zone);
1469 }
1470
1471
SetRegister(int reg,int val,RegExpNode * on_success)1472 ActionNode* ActionNode::SetRegister(int reg,
1473 int val,
1474 RegExpNode* on_success) {
1475 ActionNode* result =
1476 new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1477 result->data_.u_store_register.reg = reg;
1478 result->data_.u_store_register.value = val;
1479 return result;
1480 }
1481
1482
IncrementRegister(int reg,RegExpNode * on_success)1483 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1484 ActionNode* result =
1485 new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1486 result->data_.u_increment_register.reg = reg;
1487 return result;
1488 }
1489
1490
StorePosition(int reg,bool is_capture,RegExpNode * on_success)1491 ActionNode* ActionNode::StorePosition(int reg,
1492 bool is_capture,
1493 RegExpNode* on_success) {
1494 ActionNode* result =
1495 new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1496 result->data_.u_position_register.reg = reg;
1497 result->data_.u_position_register.is_capture = is_capture;
1498 return result;
1499 }
1500
1501
ClearCaptures(Interval range,RegExpNode * on_success)1502 ActionNode* ActionNode::ClearCaptures(Interval range,
1503 RegExpNode* on_success) {
1504 ActionNode* result =
1505 new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1506 result->data_.u_clear_captures.range_from = range.from();
1507 result->data_.u_clear_captures.range_to = range.to();
1508 return result;
1509 }
1510
1511
BeginSubmatch(int stack_reg,int position_reg,RegExpNode * on_success)1512 ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1513 int position_reg,
1514 RegExpNode* on_success) {
1515 ActionNode* result =
1516 new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1517 result->data_.u_submatch.stack_pointer_register = stack_reg;
1518 result->data_.u_submatch.current_position_register = position_reg;
1519 return result;
1520 }
1521
1522
PositiveSubmatchSuccess(int stack_reg,int position_reg,int clear_register_count,int clear_register_from,RegExpNode * on_success)1523 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1524 int position_reg,
1525 int clear_register_count,
1526 int clear_register_from,
1527 RegExpNode* on_success) {
1528 ActionNode* result =
1529 new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1530 result->data_.u_submatch.stack_pointer_register = stack_reg;
1531 result->data_.u_submatch.current_position_register = position_reg;
1532 result->data_.u_submatch.clear_register_count = clear_register_count;
1533 result->data_.u_submatch.clear_register_from = clear_register_from;
1534 return result;
1535 }
1536
1537
EmptyMatchCheck(int start_register,int repetition_register,int repetition_limit,RegExpNode * on_success)1538 ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1539 int repetition_register,
1540 int repetition_limit,
1541 RegExpNode* on_success) {
1542 ActionNode* result =
1543 new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1544 result->data_.u_empty_match_check.start_register = start_register;
1545 result->data_.u_empty_match_check.repetition_register = repetition_register;
1546 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1547 return result;
1548 }
1549
1550
1551 #define DEFINE_ACCEPT(Type) \
1552 void Type##Node::Accept(NodeVisitor* visitor) { \
1553 visitor->Visit##Type(this); \
1554 }
FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)1555 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1556 #undef DEFINE_ACCEPT
1557
1558
1559 void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1560 visitor->VisitLoopChoice(this);
1561 }
1562
1563
1564 // -------------------------------------------------------------------
1565 // Emit code.
1566
1567
GenerateGuard(RegExpMacroAssembler * macro_assembler,Guard * guard,Trace * trace)1568 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1569 Guard* guard,
1570 Trace* trace) {
1571 switch (guard->op()) {
1572 case Guard::LT:
1573 DCHECK(!trace->mentions_reg(guard->reg()));
1574 macro_assembler->IfRegisterGE(guard->reg(),
1575 guard->value(),
1576 trace->backtrack());
1577 break;
1578 case Guard::GEQ:
1579 DCHECK(!trace->mentions_reg(guard->reg()));
1580 macro_assembler->IfRegisterLT(guard->reg(),
1581 guard->value(),
1582 trace->backtrack());
1583 break;
1584 }
1585 }
1586
1587
1588 // Returns the number of characters in the equivalence class, omitting those
1589 // that cannot occur in the source string because it is ASCII.
GetCaseIndependentLetters(Isolate * isolate,uc16 character,bool one_byte_subject,unibrow::uchar * letters)1590 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1591 bool one_byte_subject,
1592 unibrow::uchar* letters) {
1593 int length =
1594 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1595 // Unibrow returns 0 or 1 for characters where case independence is
1596 // trivial.
1597 if (length == 0) {
1598 letters[0] = character;
1599 length = 1;
1600 }
1601 if (!one_byte_subject || character <= String::kMaxOneByteCharCode) {
1602 return length;
1603 }
1604
1605 // The standard requires that non-ASCII characters cannot have ASCII
1606 // character codes in their equivalence class.
1607 // TODO(dcarney): issue 3550 this is not actually true for Latin1 anymore,
1608 // is it? For example, \u00C5 is equivalent to \u212B.
1609 return 0;
1610 }
1611
1612
EmitSimpleCharacter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1613 static inline bool EmitSimpleCharacter(Isolate* isolate,
1614 RegExpCompiler* compiler,
1615 uc16 c,
1616 Label* on_failure,
1617 int cp_offset,
1618 bool check,
1619 bool preloaded) {
1620 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1621 bool bound_checked = false;
1622 if (!preloaded) {
1623 assembler->LoadCurrentCharacter(
1624 cp_offset,
1625 on_failure,
1626 check);
1627 bound_checked = true;
1628 }
1629 assembler->CheckNotCharacter(c, on_failure);
1630 return bound_checked;
1631 }
1632
1633
1634 // Only emits non-letters (things that don't have case). Only used for case
1635 // independent matches.
EmitAtomNonLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1636 static inline bool EmitAtomNonLetter(Isolate* isolate,
1637 RegExpCompiler* compiler,
1638 uc16 c,
1639 Label* on_failure,
1640 int cp_offset,
1641 bool check,
1642 bool preloaded) {
1643 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1644 bool one_byte = compiler->one_byte();
1645 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1646 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1647 if (length < 1) {
1648 // This can't match. Must be an one-byte subject and a non-one-byte
1649 // character. We do not need to do anything since the one-byte pass
1650 // already handled this.
1651 return false; // Bounds not checked.
1652 }
1653 bool checked = false;
1654 // We handle the length > 1 case in a later pass.
1655 if (length == 1) {
1656 if (one_byte && c > String::kMaxOneByteCharCodeU) {
1657 // Can't match - see above.
1658 return false; // Bounds not checked.
1659 }
1660 if (!preloaded) {
1661 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1662 checked = check;
1663 }
1664 macro_assembler->CheckNotCharacter(c, on_failure);
1665 }
1666 return checked;
1667 }
1668
1669
ShortCutEmitCharacterPair(RegExpMacroAssembler * macro_assembler,bool one_byte,uc16 c1,uc16 c2,Label * on_failure)1670 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1671 bool one_byte, uc16 c1, uc16 c2,
1672 Label* on_failure) {
1673 uc16 char_mask;
1674 if (one_byte) {
1675 char_mask = String::kMaxOneByteCharCode;
1676 } else {
1677 char_mask = String::kMaxUtf16CodeUnit;
1678 }
1679 uc16 exor = c1 ^ c2;
1680 // Check whether exor has only one bit set.
1681 if (((exor - 1) & exor) == 0) {
1682 // If c1 and c2 differ only by one bit.
1683 // Ecma262UnCanonicalize always gives the highest number last.
1684 DCHECK(c2 > c1);
1685 uc16 mask = char_mask ^ exor;
1686 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1687 return true;
1688 }
1689 DCHECK(c2 > c1);
1690 uc16 diff = c2 - c1;
1691 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1692 // If the characters differ by 2^n but don't differ by one bit then
1693 // subtract the difference from the found character, then do the or
1694 // trick. We avoid the theoretical case where negative numbers are
1695 // involved in order to simplify code generation.
1696 uc16 mask = char_mask ^ diff;
1697 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1698 diff,
1699 mask,
1700 on_failure);
1701 return true;
1702 }
1703 return false;
1704 }
1705
1706
1707 typedef bool EmitCharacterFunction(Isolate* isolate,
1708 RegExpCompiler* compiler,
1709 uc16 c,
1710 Label* on_failure,
1711 int cp_offset,
1712 bool check,
1713 bool preloaded);
1714
1715 // Only emits letters (things that have case). Only used for case independent
1716 // matches.
EmitAtomLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1717 static inline bool EmitAtomLetter(Isolate* isolate,
1718 RegExpCompiler* compiler,
1719 uc16 c,
1720 Label* on_failure,
1721 int cp_offset,
1722 bool check,
1723 bool preloaded) {
1724 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1725 bool one_byte = compiler->one_byte();
1726 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1727 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1728 if (length <= 1) return false;
1729 // We may not need to check against the end of the input string
1730 // if this character lies before a character that matched.
1731 if (!preloaded) {
1732 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1733 }
1734 Label ok;
1735 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1736 switch (length) {
1737 case 2: {
1738 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1739 chars[1], on_failure)) {
1740 } else {
1741 macro_assembler->CheckCharacter(chars[0], &ok);
1742 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1743 macro_assembler->Bind(&ok);
1744 }
1745 break;
1746 }
1747 case 4:
1748 macro_assembler->CheckCharacter(chars[3], &ok);
1749 // Fall through!
1750 case 3:
1751 macro_assembler->CheckCharacter(chars[0], &ok);
1752 macro_assembler->CheckCharacter(chars[1], &ok);
1753 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1754 macro_assembler->Bind(&ok);
1755 break;
1756 default:
1757 UNREACHABLE();
1758 break;
1759 }
1760 return true;
1761 }
1762
1763
EmitBoundaryTest(RegExpMacroAssembler * masm,int border,Label * fall_through,Label * above_or_equal,Label * below)1764 static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1765 int border,
1766 Label* fall_through,
1767 Label* above_or_equal,
1768 Label* below) {
1769 if (below != fall_through) {
1770 masm->CheckCharacterLT(border, below);
1771 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1772 } else {
1773 masm->CheckCharacterGT(border - 1, above_or_equal);
1774 }
1775 }
1776
1777
EmitDoubleBoundaryTest(RegExpMacroAssembler * masm,int first,int last,Label * fall_through,Label * in_range,Label * out_of_range)1778 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1779 int first,
1780 int last,
1781 Label* fall_through,
1782 Label* in_range,
1783 Label* out_of_range) {
1784 if (in_range == fall_through) {
1785 if (first == last) {
1786 masm->CheckNotCharacter(first, out_of_range);
1787 } else {
1788 masm->CheckCharacterNotInRange(first, last, out_of_range);
1789 }
1790 } else {
1791 if (first == last) {
1792 masm->CheckCharacter(first, in_range);
1793 } else {
1794 masm->CheckCharacterInRange(first, last, in_range);
1795 }
1796 if (out_of_range != fall_through) masm->GoTo(out_of_range);
1797 }
1798 }
1799
1800
1801 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1802 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
EmitUseLookupTable(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,int min_char,Label * fall_through,Label * even_label,Label * odd_label)1803 static void EmitUseLookupTable(
1804 RegExpMacroAssembler* masm,
1805 ZoneList<int>* ranges,
1806 int start_index,
1807 int end_index,
1808 int min_char,
1809 Label* fall_through,
1810 Label* even_label,
1811 Label* odd_label) {
1812 static const int kSize = RegExpMacroAssembler::kTableSize;
1813 static const int kMask = RegExpMacroAssembler::kTableMask;
1814
1815 int base = (min_char & ~kMask);
1816 USE(base);
1817
1818 // Assert that everything is on one kTableSize page.
1819 for (int i = start_index; i <= end_index; i++) {
1820 DCHECK_EQ(ranges->at(i) & ~kMask, base);
1821 }
1822 DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1823
1824 char templ[kSize];
1825 Label* on_bit_set;
1826 Label* on_bit_clear;
1827 int bit;
1828 if (even_label == fall_through) {
1829 on_bit_set = odd_label;
1830 on_bit_clear = even_label;
1831 bit = 1;
1832 } else {
1833 on_bit_set = even_label;
1834 on_bit_clear = odd_label;
1835 bit = 0;
1836 }
1837 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1838 templ[i] = bit;
1839 }
1840 int j = 0;
1841 bit ^= 1;
1842 for (int i = start_index; i < end_index; i++) {
1843 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1844 templ[j] = bit;
1845 }
1846 bit ^= 1;
1847 }
1848 for (int i = j; i < kSize; i++) {
1849 templ[i] = bit;
1850 }
1851 Factory* factory = masm->zone()->isolate()->factory();
1852 // TODO(erikcorry): Cache these.
1853 Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
1854 for (int i = 0; i < kSize; i++) {
1855 ba->set(i, templ[i]);
1856 }
1857 masm->CheckBitInTable(ba, on_bit_set);
1858 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1859 }
1860
1861
CutOutRange(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,int cut_index,Label * even_label,Label * odd_label)1862 static void CutOutRange(RegExpMacroAssembler* masm,
1863 ZoneList<int>* ranges,
1864 int start_index,
1865 int end_index,
1866 int cut_index,
1867 Label* even_label,
1868 Label* odd_label) {
1869 bool odd = (((cut_index - start_index) & 1) == 1);
1870 Label* in_range_label = odd ? odd_label : even_label;
1871 Label dummy;
1872 EmitDoubleBoundaryTest(masm,
1873 ranges->at(cut_index),
1874 ranges->at(cut_index + 1) - 1,
1875 &dummy,
1876 in_range_label,
1877 &dummy);
1878 DCHECK(!dummy.is_linked());
1879 // Cut out the single range by rewriting the array. This creates a new
1880 // range that is a merger of the two ranges on either side of the one we
1881 // are cutting out. The oddity of the labels is preserved.
1882 for (int j = cut_index; j > start_index; j--) {
1883 ranges->at(j) = ranges->at(j - 1);
1884 }
1885 for (int j = cut_index + 1; j < end_index; j++) {
1886 ranges->at(j) = ranges->at(j + 1);
1887 }
1888 }
1889
1890
1891 // Unicode case. Split the search space into kSize spaces that are handled
1892 // with recursion.
SplitSearchSpace(ZoneList<int> * ranges,int start_index,int end_index,int * new_start_index,int * new_end_index,int * border)1893 static void SplitSearchSpace(ZoneList<int>* ranges,
1894 int start_index,
1895 int end_index,
1896 int* new_start_index,
1897 int* new_end_index,
1898 int* border) {
1899 static const int kSize = RegExpMacroAssembler::kTableSize;
1900 static const int kMask = RegExpMacroAssembler::kTableMask;
1901
1902 int first = ranges->at(start_index);
1903 int last = ranges->at(end_index) - 1;
1904
1905 *new_start_index = start_index;
1906 *border = (ranges->at(start_index) & ~kMask) + kSize;
1907 while (*new_start_index < end_index) {
1908 if (ranges->at(*new_start_index) > *border) break;
1909 (*new_start_index)++;
1910 }
1911 // new_start_index is the index of the first edge that is beyond the
1912 // current kSize space.
1913
1914 // For very large search spaces we do a binary chop search of the non-Latin1
1915 // space instead of just going to the end of the current kSize space. The
1916 // heuristics are complicated a little by the fact that any 128-character
1917 // encoding space can be quickly tested with a table lookup, so we don't
1918 // wish to do binary chop search at a smaller granularity than that. A
1919 // 128-character space can take up a lot of space in the ranges array if,
1920 // for example, we only want to match every second character (eg. the lower
1921 // case characters on some Unicode pages).
1922 int binary_chop_index = (end_index + start_index) / 2;
1923 // The first test ensures that we get to the code that handles the Latin1
1924 // range with a single not-taken branch, speeding up this important
1925 // character range (even non-Latin1 charset-based text has spaces and
1926 // punctuation).
1927 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
1928 end_index - start_index > (*new_start_index - start_index) * 2 &&
1929 last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1930 ranges->at(binary_chop_index) >= first + 2 * kSize) {
1931 int scan_forward_for_section_border = binary_chop_index;;
1932 int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1933
1934 while (scan_forward_for_section_border < end_index) {
1935 if (ranges->at(scan_forward_for_section_border) > new_border) {
1936 *new_start_index = scan_forward_for_section_border;
1937 *border = new_border;
1938 break;
1939 }
1940 scan_forward_for_section_border++;
1941 }
1942 }
1943
1944 DCHECK(*new_start_index > start_index);
1945 *new_end_index = *new_start_index - 1;
1946 if (ranges->at(*new_end_index) == *border) {
1947 (*new_end_index)--;
1948 }
1949 if (*border >= ranges->at(end_index)) {
1950 *border = ranges->at(end_index);
1951 *new_start_index = end_index; // Won't be used.
1952 *new_end_index = end_index - 1;
1953 }
1954 }
1955
1956
1957 // Gets a series of segment boundaries representing a character class. If the
1958 // character is in the range between an even and an odd boundary (counting from
1959 // start_index) then go to even_label, otherwise go to odd_label. We already
1960 // know that the character is in the range of min_char to max_char inclusive.
1961 // Either label can be NULL indicating backtracking. Either label can also be
1962 // equal to the fall_through label.
GenerateBranches(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,uc16 min_char,uc16 max_char,Label * fall_through,Label * even_label,Label * odd_label)1963 static void GenerateBranches(RegExpMacroAssembler* masm,
1964 ZoneList<int>* ranges,
1965 int start_index,
1966 int end_index,
1967 uc16 min_char,
1968 uc16 max_char,
1969 Label* fall_through,
1970 Label* even_label,
1971 Label* odd_label) {
1972 int first = ranges->at(start_index);
1973 int last = ranges->at(end_index) - 1;
1974
1975 DCHECK_LT(min_char, first);
1976
1977 // Just need to test if the character is before or on-or-after
1978 // a particular character.
1979 if (start_index == end_index) {
1980 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1981 return;
1982 }
1983
1984 // Another almost trivial case: There is one interval in the middle that is
1985 // different from the end intervals.
1986 if (start_index + 1 == end_index) {
1987 EmitDoubleBoundaryTest(
1988 masm, first, last, fall_through, even_label, odd_label);
1989 return;
1990 }
1991
1992 // It's not worth using table lookup if there are very few intervals in the
1993 // character class.
1994 if (end_index - start_index <= 6) {
1995 // It is faster to test for individual characters, so we look for those
1996 // first, then try arbitrary ranges in the second round.
1997 static int kNoCutIndex = -1;
1998 int cut = kNoCutIndex;
1999 for (int i = start_index; i < end_index; i++) {
2000 if (ranges->at(i) == ranges->at(i + 1) - 1) {
2001 cut = i;
2002 break;
2003 }
2004 }
2005 if (cut == kNoCutIndex) cut = start_index;
2006 CutOutRange(
2007 masm, ranges, start_index, end_index, cut, even_label, odd_label);
2008 DCHECK_GE(end_index - start_index, 2);
2009 GenerateBranches(masm,
2010 ranges,
2011 start_index + 1,
2012 end_index - 1,
2013 min_char,
2014 max_char,
2015 fall_through,
2016 even_label,
2017 odd_label);
2018 return;
2019 }
2020
2021 // If there are a lot of intervals in the regexp, then we will use tables to
2022 // determine whether the character is inside or outside the character class.
2023 static const int kBits = RegExpMacroAssembler::kTableSizeBits;
2024
2025 if ((max_char >> kBits) == (min_char >> kBits)) {
2026 EmitUseLookupTable(masm,
2027 ranges,
2028 start_index,
2029 end_index,
2030 min_char,
2031 fall_through,
2032 even_label,
2033 odd_label);
2034 return;
2035 }
2036
2037 if ((min_char >> kBits) != (first >> kBits)) {
2038 masm->CheckCharacterLT(first, odd_label);
2039 GenerateBranches(masm,
2040 ranges,
2041 start_index + 1,
2042 end_index,
2043 first,
2044 max_char,
2045 fall_through,
2046 odd_label,
2047 even_label);
2048 return;
2049 }
2050
2051 int new_start_index = 0;
2052 int new_end_index = 0;
2053 int border = 0;
2054
2055 SplitSearchSpace(ranges,
2056 start_index,
2057 end_index,
2058 &new_start_index,
2059 &new_end_index,
2060 &border);
2061
2062 Label handle_rest;
2063 Label* above = &handle_rest;
2064 if (border == last + 1) {
2065 // We didn't find any section that started after the limit, so everything
2066 // above the border is one of the terminal labels.
2067 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2068 DCHECK(new_end_index == end_index - 1);
2069 }
2070
2071 DCHECK_LE(start_index, new_end_index);
2072 DCHECK_LE(new_start_index, end_index);
2073 DCHECK_LT(start_index, new_start_index);
2074 DCHECK_LT(new_end_index, end_index);
2075 DCHECK(new_end_index + 1 == new_start_index ||
2076 (new_end_index + 2 == new_start_index &&
2077 border == ranges->at(new_end_index + 1)));
2078 DCHECK_LT(min_char, border - 1);
2079 DCHECK_LT(border, max_char);
2080 DCHECK_LT(ranges->at(new_end_index), border);
2081 DCHECK(border < ranges->at(new_start_index) ||
2082 (border == ranges->at(new_start_index) &&
2083 new_start_index == end_index &&
2084 new_end_index == end_index - 1 &&
2085 border == last + 1));
2086 DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2087
2088 masm->CheckCharacterGT(border - 1, above);
2089 Label dummy;
2090 GenerateBranches(masm,
2091 ranges,
2092 start_index,
2093 new_end_index,
2094 min_char,
2095 border - 1,
2096 &dummy,
2097 even_label,
2098 odd_label);
2099 if (handle_rest.is_linked()) {
2100 masm->Bind(&handle_rest);
2101 bool flip = (new_start_index & 1) != (start_index & 1);
2102 GenerateBranches(masm,
2103 ranges,
2104 new_start_index,
2105 end_index,
2106 border,
2107 max_char,
2108 &dummy,
2109 flip ? odd_label : even_label,
2110 flip ? even_label : odd_label);
2111 }
2112 }
2113
2114
EmitCharClass(RegExpMacroAssembler * macro_assembler,RegExpCharacterClass * cc,bool one_byte,Label * on_failure,int cp_offset,bool check_offset,bool preloaded,Zone * zone)2115 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2116 RegExpCharacterClass* cc, bool one_byte,
2117 Label* on_failure, int cp_offset, bool check_offset,
2118 bool preloaded, Zone* zone) {
2119 ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2120 if (!CharacterRange::IsCanonical(ranges)) {
2121 CharacterRange::Canonicalize(ranges);
2122 }
2123
2124 int max_char;
2125 if (one_byte) {
2126 max_char = String::kMaxOneByteCharCode;
2127 } else {
2128 max_char = String::kMaxUtf16CodeUnit;
2129 }
2130
2131 int range_count = ranges->length();
2132
2133 int last_valid_range = range_count - 1;
2134 while (last_valid_range >= 0) {
2135 CharacterRange& range = ranges->at(last_valid_range);
2136 if (range.from() <= max_char) {
2137 break;
2138 }
2139 last_valid_range--;
2140 }
2141
2142 if (last_valid_range < 0) {
2143 if (!cc->is_negated()) {
2144 macro_assembler->GoTo(on_failure);
2145 }
2146 if (check_offset) {
2147 macro_assembler->CheckPosition(cp_offset, on_failure);
2148 }
2149 return;
2150 }
2151
2152 if (last_valid_range == 0 &&
2153 ranges->at(0).IsEverything(max_char)) {
2154 if (cc->is_negated()) {
2155 macro_assembler->GoTo(on_failure);
2156 } else {
2157 // This is a common case hit by non-anchored expressions.
2158 if (check_offset) {
2159 macro_assembler->CheckPosition(cp_offset, on_failure);
2160 }
2161 }
2162 return;
2163 }
2164 if (last_valid_range == 0 &&
2165 !cc->is_negated() &&
2166 ranges->at(0).IsEverything(max_char)) {
2167 // This is a common case hit by non-anchored expressions.
2168 if (check_offset) {
2169 macro_assembler->CheckPosition(cp_offset, on_failure);
2170 }
2171 return;
2172 }
2173
2174 if (!preloaded) {
2175 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2176 }
2177
2178 if (cc->is_standard(zone) &&
2179 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2180 on_failure)) {
2181 return;
2182 }
2183
2184
2185 // A new list with ascending entries. Each entry is a code unit
2186 // where there is a boundary between code units that are part of
2187 // the class and code units that are not. Normally we insert an
2188 // entry at zero which goes to the failure label, but if there
2189 // was already one there we fall through for success on that entry.
2190 // Subsequent entries have alternating meaning (success/failure).
2191 ZoneList<int>* range_boundaries =
2192 new(zone) ZoneList<int>(last_valid_range, zone);
2193
2194 bool zeroth_entry_is_failure = !cc->is_negated();
2195
2196 for (int i = 0; i <= last_valid_range; i++) {
2197 CharacterRange& range = ranges->at(i);
2198 if (range.from() == 0) {
2199 DCHECK_EQ(i, 0);
2200 zeroth_entry_is_failure = !zeroth_entry_is_failure;
2201 } else {
2202 range_boundaries->Add(range.from(), zone);
2203 }
2204 range_boundaries->Add(range.to() + 1, zone);
2205 }
2206 int end_index = range_boundaries->length() - 1;
2207 if (range_boundaries->at(end_index) > max_char) {
2208 end_index--;
2209 }
2210
2211 Label fall_through;
2212 GenerateBranches(macro_assembler,
2213 range_boundaries,
2214 0, // start_index.
2215 end_index,
2216 0, // min_char.
2217 max_char,
2218 &fall_through,
2219 zeroth_entry_is_failure ? &fall_through : on_failure,
2220 zeroth_entry_is_failure ? on_failure : &fall_through);
2221 macro_assembler->Bind(&fall_through);
2222 }
2223
2224
~RegExpNode()2225 RegExpNode::~RegExpNode() {
2226 }
2227
2228
LimitVersions(RegExpCompiler * compiler,Trace * trace)2229 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2230 Trace* trace) {
2231 // If we are generating a greedy loop then don't stop and don't reuse code.
2232 if (trace->stop_node() != NULL) {
2233 return CONTINUE;
2234 }
2235
2236 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2237 if (trace->is_trivial()) {
2238 if (label_.is_bound()) {
2239 // We are being asked to generate a generic version, but that's already
2240 // been done so just go to it.
2241 macro_assembler->GoTo(&label_);
2242 return DONE;
2243 }
2244 if (compiler->recursion_depth() >= RegExpCompiler::kMaxRecursion) {
2245 // To avoid too deep recursion we push the node to the work queue and just
2246 // generate a goto here.
2247 compiler->AddWork(this);
2248 macro_assembler->GoTo(&label_);
2249 return DONE;
2250 }
2251 // Generate generic version of the node and bind the label for later use.
2252 macro_assembler->Bind(&label_);
2253 return CONTINUE;
2254 }
2255
2256 // We are being asked to make a non-generic version. Keep track of how many
2257 // non-generic versions we generate so as not to overdo it.
2258 trace_count_++;
2259 if (FLAG_regexp_optimization &&
2260 trace_count_ < kMaxCopiesCodeGenerated &&
2261 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion) {
2262 return CONTINUE;
2263 }
2264
2265 // If we get here code has been generated for this node too many times or
2266 // recursion is too deep. Time to switch to a generic version. The code for
2267 // generic versions above can handle deep recursion properly.
2268 trace->Flush(compiler, this);
2269 return DONE;
2270 }
2271
2272
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2273 int ActionNode::EatsAtLeast(int still_to_find,
2274 int budget,
2275 bool not_at_start) {
2276 if (budget <= 0) return 0;
2277 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2278 return on_success()->EatsAtLeast(still_to_find,
2279 budget - 1,
2280 not_at_start);
2281 }
2282
2283
FillInBMInfo(int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2284 void ActionNode::FillInBMInfo(int offset,
2285 int budget,
2286 BoyerMooreLookahead* bm,
2287 bool not_at_start) {
2288 if (action_type_ == BEGIN_SUBMATCH) {
2289 bm->SetRest(offset);
2290 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2291 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
2292 }
2293 SaveBMInfo(bm, not_at_start, offset);
2294 }
2295
2296
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2297 int AssertionNode::EatsAtLeast(int still_to_find,
2298 int budget,
2299 bool not_at_start) {
2300 if (budget <= 0) return 0;
2301 // If we know we are not at the start and we are asked "how many characters
2302 // will you match if you succeed?" then we can answer anything since false
2303 // implies false. So lets just return the max answer (still_to_find) since
2304 // that won't prevent us from preloading a lot of characters for the other
2305 // branches in the node graph.
2306 if (assertion_type() == AT_START && not_at_start) return still_to_find;
2307 return on_success()->EatsAtLeast(still_to_find,
2308 budget - 1,
2309 not_at_start);
2310 }
2311
2312
FillInBMInfo(int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2313 void AssertionNode::FillInBMInfo(int offset,
2314 int budget,
2315 BoyerMooreLookahead* bm,
2316 bool not_at_start) {
2317 // Match the behaviour of EatsAtLeast on this node.
2318 if (assertion_type() == AT_START && not_at_start) return;
2319 on_success()->FillInBMInfo(offset, budget - 1, bm, not_at_start);
2320 SaveBMInfo(bm, not_at_start, offset);
2321 }
2322
2323
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2324 int BackReferenceNode::EatsAtLeast(int still_to_find,
2325 int budget,
2326 bool not_at_start) {
2327 if (budget <= 0) return 0;
2328 return on_success()->EatsAtLeast(still_to_find,
2329 budget - 1,
2330 not_at_start);
2331 }
2332
2333
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2334 int TextNode::EatsAtLeast(int still_to_find,
2335 int budget,
2336 bool not_at_start) {
2337 int answer = Length();
2338 if (answer >= still_to_find) return answer;
2339 if (budget <= 0) return answer;
2340 // We are not at start after this node so we set the last argument to 'true'.
2341 return answer + on_success()->EatsAtLeast(still_to_find - answer,
2342 budget - 1,
2343 true);
2344 }
2345
2346
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2347 int NegativeLookaheadChoiceNode::EatsAtLeast(int still_to_find,
2348 int budget,
2349 bool not_at_start) {
2350 if (budget <= 0) return 0;
2351 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2352 // afterwards.
2353 RegExpNode* node = alternatives_->at(1).node();
2354 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2355 }
2356
2357
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)2358 void NegativeLookaheadChoiceNode::GetQuickCheckDetails(
2359 QuickCheckDetails* details,
2360 RegExpCompiler* compiler,
2361 int filled_in,
2362 bool not_at_start) {
2363 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2364 // afterwards.
2365 RegExpNode* node = alternatives_->at(1).node();
2366 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2367 }
2368
2369
EatsAtLeastHelper(int still_to_find,int budget,RegExpNode * ignore_this_node,bool not_at_start)2370 int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2371 int budget,
2372 RegExpNode* ignore_this_node,
2373 bool not_at_start) {
2374 if (budget <= 0) return 0;
2375 int min = 100;
2376 int choice_count = alternatives_->length();
2377 budget = (budget - 1) / choice_count;
2378 for (int i = 0; i < choice_count; i++) {
2379 RegExpNode* node = alternatives_->at(i).node();
2380 if (node == ignore_this_node) continue;
2381 int node_eats_at_least =
2382 node->EatsAtLeast(still_to_find, budget, not_at_start);
2383 if (node_eats_at_least < min) min = node_eats_at_least;
2384 if (min == 0) return 0;
2385 }
2386 return min;
2387 }
2388
2389
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2390 int LoopChoiceNode::EatsAtLeast(int still_to_find,
2391 int budget,
2392 bool not_at_start) {
2393 return EatsAtLeastHelper(still_to_find,
2394 budget - 1,
2395 loop_node_,
2396 not_at_start);
2397 }
2398
2399
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2400 int ChoiceNode::EatsAtLeast(int still_to_find,
2401 int budget,
2402 bool not_at_start) {
2403 return EatsAtLeastHelper(still_to_find,
2404 budget,
2405 NULL,
2406 not_at_start);
2407 }
2408
2409
2410 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
SmearBitsRight(uint32_t v)2411 static inline uint32_t SmearBitsRight(uint32_t v) {
2412 v |= v >> 1;
2413 v |= v >> 2;
2414 v |= v >> 4;
2415 v |= v >> 8;
2416 v |= v >> 16;
2417 return v;
2418 }
2419
2420
Rationalize(bool asc)2421 bool QuickCheckDetails::Rationalize(bool asc) {
2422 bool found_useful_op = false;
2423 uint32_t char_mask;
2424 if (asc) {
2425 char_mask = String::kMaxOneByteCharCode;
2426 } else {
2427 char_mask = String::kMaxUtf16CodeUnit;
2428 }
2429 mask_ = 0;
2430 value_ = 0;
2431 int char_shift = 0;
2432 for (int i = 0; i < characters_; i++) {
2433 Position* pos = &positions_[i];
2434 if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2435 found_useful_op = true;
2436 }
2437 mask_ |= (pos->mask & char_mask) << char_shift;
2438 value_ |= (pos->value & char_mask) << char_shift;
2439 char_shift += asc ? 8 : 16;
2440 }
2441 return found_useful_op;
2442 }
2443
2444
EmitQuickCheck(RegExpCompiler * compiler,Trace * bounds_check_trace,Trace * trace,bool preload_has_checked_bounds,Label * on_possible_success,QuickCheckDetails * details,bool fall_through_on_failure)2445 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
2446 Trace* bounds_check_trace,
2447 Trace* trace,
2448 bool preload_has_checked_bounds,
2449 Label* on_possible_success,
2450 QuickCheckDetails* details,
2451 bool fall_through_on_failure) {
2452 if (details->characters() == 0) return false;
2453 GetQuickCheckDetails(
2454 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2455 if (details->cannot_match()) return false;
2456 if (!details->Rationalize(compiler->one_byte())) return false;
2457 DCHECK(details->characters() == 1 ||
2458 compiler->macro_assembler()->CanReadUnaligned());
2459 uint32_t mask = details->mask();
2460 uint32_t value = details->value();
2461
2462 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2463
2464 if (trace->characters_preloaded() != details->characters()) {
2465 DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
2466 // We are attempting to preload the minimum number of characters
2467 // any choice would eat, so if the bounds check fails, then none of the
2468 // choices can succeed, so we can just immediately backtrack, rather
2469 // than go to the next choice.
2470 assembler->LoadCurrentCharacter(trace->cp_offset(),
2471 bounds_check_trace->backtrack(),
2472 !preload_has_checked_bounds,
2473 details->characters());
2474 }
2475
2476
2477 bool need_mask = true;
2478
2479 if (details->characters() == 1) {
2480 // If number of characters preloaded is 1 then we used a byte or 16 bit
2481 // load so the value is already masked down.
2482 uint32_t char_mask;
2483 if (compiler->one_byte()) {
2484 char_mask = String::kMaxOneByteCharCode;
2485 } else {
2486 char_mask = String::kMaxUtf16CodeUnit;
2487 }
2488 if ((mask & char_mask) == char_mask) need_mask = false;
2489 mask &= char_mask;
2490 } else {
2491 // For 2-character preloads in one-byte mode or 1-character preloads in
2492 // two-byte mode we also use a 16 bit load with zero extend.
2493 if (details->characters() == 2 && compiler->one_byte()) {
2494 if ((mask & 0xffff) == 0xffff) need_mask = false;
2495 } else if (details->characters() == 1 && !compiler->one_byte()) {
2496 if ((mask & 0xffff) == 0xffff) need_mask = false;
2497 } else {
2498 if (mask == 0xffffffff) need_mask = false;
2499 }
2500 }
2501
2502 if (fall_through_on_failure) {
2503 if (need_mask) {
2504 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2505 } else {
2506 assembler->CheckCharacter(value, on_possible_success);
2507 }
2508 } else {
2509 if (need_mask) {
2510 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2511 } else {
2512 assembler->CheckNotCharacter(value, trace->backtrack());
2513 }
2514 }
2515 return true;
2516 }
2517
2518
2519 // Here is the meat of GetQuickCheckDetails (see also the comment on the
2520 // super-class in the .h file).
2521 //
2522 // We iterate along the text object, building up for each character a
2523 // mask and value that can be used to test for a quick failure to match.
2524 // The masks and values for the positions will be combined into a single
2525 // machine word for the current character width in order to be used in
2526 // generating a quick check.
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2527 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2528 RegExpCompiler* compiler,
2529 int characters_filled_in,
2530 bool not_at_start) {
2531 Isolate* isolate = compiler->macro_assembler()->zone()->isolate();
2532 DCHECK(characters_filled_in < details->characters());
2533 int characters = details->characters();
2534 int char_mask;
2535 if (compiler->one_byte()) {
2536 char_mask = String::kMaxOneByteCharCode;
2537 } else {
2538 char_mask = String::kMaxUtf16CodeUnit;
2539 }
2540 for (int k = 0; k < elms_->length(); k++) {
2541 TextElement elm = elms_->at(k);
2542 if (elm.text_type() == TextElement::ATOM) {
2543 Vector<const uc16> quarks = elm.atom()->data();
2544 for (int i = 0; i < characters && i < quarks.length(); i++) {
2545 QuickCheckDetails::Position* pos =
2546 details->positions(characters_filled_in);
2547 uc16 c = quarks[i];
2548 if (c > char_mask) {
2549 // If we expect a non-Latin1 character from an one-byte string,
2550 // there is no way we can match. Not even case-independent
2551 // matching can turn an Latin1 character into non-Latin1 or
2552 // vice versa.
2553 // TODO(dcarney): issue 3550. Verify that this works as expected.
2554 // For example, \u0178 is uppercase of \u00ff (y-umlaut).
2555 details->set_cannot_match();
2556 pos->determines_perfectly = false;
2557 return;
2558 }
2559 if (compiler->ignore_case()) {
2560 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2561 int length = GetCaseIndependentLetters(isolate, c,
2562 compiler->one_byte(), chars);
2563 DCHECK(length != 0); // Can only happen if c > char_mask (see above).
2564 if (length == 1) {
2565 // This letter has no case equivalents, so it's nice and simple
2566 // and the mask-compare will determine definitely whether we have
2567 // a match at this character position.
2568 pos->mask = char_mask;
2569 pos->value = c;
2570 pos->determines_perfectly = true;
2571 } else {
2572 uint32_t common_bits = char_mask;
2573 uint32_t bits = chars[0];
2574 for (int j = 1; j < length; j++) {
2575 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2576 common_bits ^= differing_bits;
2577 bits &= common_bits;
2578 }
2579 // If length is 2 and common bits has only one zero in it then
2580 // our mask and compare instruction will determine definitely
2581 // whether we have a match at this character position. Otherwise
2582 // it can only be an approximate check.
2583 uint32_t one_zero = (common_bits | ~char_mask);
2584 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2585 pos->determines_perfectly = true;
2586 }
2587 pos->mask = common_bits;
2588 pos->value = bits;
2589 }
2590 } else {
2591 // Don't ignore case. Nice simple case where the mask-compare will
2592 // determine definitely whether we have a match at this character
2593 // position.
2594 pos->mask = char_mask;
2595 pos->value = c;
2596 pos->determines_perfectly = true;
2597 }
2598 characters_filled_in++;
2599 DCHECK(characters_filled_in <= details->characters());
2600 if (characters_filled_in == details->characters()) {
2601 return;
2602 }
2603 }
2604 } else {
2605 QuickCheckDetails::Position* pos =
2606 details->positions(characters_filled_in);
2607 RegExpCharacterClass* tree = elm.char_class();
2608 ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2609 if (tree->is_negated()) {
2610 // A quick check uses multi-character mask and compare. There is no
2611 // useful way to incorporate a negative char class into this scheme
2612 // so we just conservatively create a mask and value that will always
2613 // succeed.
2614 pos->mask = 0;
2615 pos->value = 0;
2616 } else {
2617 int first_range = 0;
2618 while (ranges->at(first_range).from() > char_mask) {
2619 first_range++;
2620 if (first_range == ranges->length()) {
2621 details->set_cannot_match();
2622 pos->determines_perfectly = false;
2623 return;
2624 }
2625 }
2626 CharacterRange range = ranges->at(first_range);
2627 uc16 from = range.from();
2628 uc16 to = range.to();
2629 if (to > char_mask) {
2630 to = char_mask;
2631 }
2632 uint32_t differing_bits = (from ^ to);
2633 // A mask and compare is only perfect if the differing bits form a
2634 // number like 00011111 with one single block of trailing 1s.
2635 if ((differing_bits & (differing_bits + 1)) == 0 &&
2636 from + differing_bits == to) {
2637 pos->determines_perfectly = true;
2638 }
2639 uint32_t common_bits = ~SmearBitsRight(differing_bits);
2640 uint32_t bits = (from & common_bits);
2641 for (int i = first_range + 1; i < ranges->length(); i++) {
2642 CharacterRange range = ranges->at(i);
2643 uc16 from = range.from();
2644 uc16 to = range.to();
2645 if (from > char_mask) continue;
2646 if (to > char_mask) to = char_mask;
2647 // Here we are combining more ranges into the mask and compare
2648 // value. With each new range the mask becomes more sparse and
2649 // so the chances of a false positive rise. A character class
2650 // with multiple ranges is assumed never to be equivalent to a
2651 // mask and compare operation.
2652 pos->determines_perfectly = false;
2653 uint32_t new_common_bits = (from ^ to);
2654 new_common_bits = ~SmearBitsRight(new_common_bits);
2655 common_bits &= new_common_bits;
2656 bits &= new_common_bits;
2657 uint32_t differing_bits = (from & common_bits) ^ bits;
2658 common_bits ^= differing_bits;
2659 bits &= common_bits;
2660 }
2661 pos->mask = common_bits;
2662 pos->value = bits;
2663 }
2664 characters_filled_in++;
2665 DCHECK(characters_filled_in <= details->characters());
2666 if (characters_filled_in == details->characters()) {
2667 return;
2668 }
2669 }
2670 }
2671 DCHECK(characters_filled_in != details->characters());
2672 if (!details->cannot_match()) {
2673 on_success()-> GetQuickCheckDetails(details,
2674 compiler,
2675 characters_filled_in,
2676 true);
2677 }
2678 }
2679
2680
Clear()2681 void QuickCheckDetails::Clear() {
2682 for (int i = 0; i < characters_; i++) {
2683 positions_[i].mask = 0;
2684 positions_[i].value = 0;
2685 positions_[i].determines_perfectly = false;
2686 }
2687 characters_ = 0;
2688 }
2689
2690
Advance(int by,bool one_byte)2691 void QuickCheckDetails::Advance(int by, bool one_byte) {
2692 DCHECK(by >= 0);
2693 if (by >= characters_) {
2694 Clear();
2695 return;
2696 }
2697 for (int i = 0; i < characters_ - by; i++) {
2698 positions_[i] = positions_[by + i];
2699 }
2700 for (int i = characters_ - by; i < characters_; i++) {
2701 positions_[i].mask = 0;
2702 positions_[i].value = 0;
2703 positions_[i].determines_perfectly = false;
2704 }
2705 characters_ -= by;
2706 // We could change mask_ and value_ here but we would never advance unless
2707 // they had already been used in a check and they won't be used again because
2708 // it would gain us nothing. So there's no point.
2709 }
2710
2711
Merge(QuickCheckDetails * other,int from_index)2712 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2713 DCHECK(characters_ == other->characters_);
2714 if (other->cannot_match_) {
2715 return;
2716 }
2717 if (cannot_match_) {
2718 *this = *other;
2719 return;
2720 }
2721 for (int i = from_index; i < characters_; i++) {
2722 QuickCheckDetails::Position* pos = positions(i);
2723 QuickCheckDetails::Position* other_pos = other->positions(i);
2724 if (pos->mask != other_pos->mask ||
2725 pos->value != other_pos->value ||
2726 !other_pos->determines_perfectly) {
2727 // Our mask-compare operation will be approximate unless we have the
2728 // exact same operation on both sides of the alternation.
2729 pos->determines_perfectly = false;
2730 }
2731 pos->mask &= other_pos->mask;
2732 pos->value &= pos->mask;
2733 other_pos->value &= pos->mask;
2734 uc16 differing_bits = (pos->value ^ other_pos->value);
2735 pos->mask &= ~differing_bits;
2736 pos->value &= pos->mask;
2737 }
2738 }
2739
2740
2741 class VisitMarker {
2742 public:
VisitMarker(NodeInfo * info)2743 explicit VisitMarker(NodeInfo* info) : info_(info) {
2744 DCHECK(!info->visited);
2745 info->visited = true;
2746 }
~VisitMarker()2747 ~VisitMarker() {
2748 info_->visited = false;
2749 }
2750 private:
2751 NodeInfo* info_;
2752 };
2753
2754
FilterOneByte(int depth,bool ignore_case)2755 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
2756 if (info()->replacement_calculated) return replacement();
2757 if (depth < 0) return this;
2758 DCHECK(!info()->visited);
2759 VisitMarker marker(info());
2760 return FilterSuccessor(depth - 1, ignore_case);
2761 }
2762
2763
FilterSuccessor(int depth,bool ignore_case)2764 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
2765 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
2766 if (next == NULL) return set_replacement(NULL);
2767 on_success_ = next;
2768 return set_replacement(this);
2769 }
2770
2771
2772 // We need to check for the following characters: 0x39c 0x3bc 0x178.
RangeContainsLatin1Equivalents(CharacterRange range)2773 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2774 // TODO(dcarney): this could be a lot more efficient.
2775 return range.Contains(0x39c) ||
2776 range.Contains(0x3bc) || range.Contains(0x178);
2777 }
2778
2779
RangesContainLatin1Equivalents(ZoneList<CharacterRange> * ranges)2780 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2781 for (int i = 0; i < ranges->length(); i++) {
2782 // TODO(dcarney): this could be a lot more efficient.
2783 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2784 }
2785 return false;
2786 }
2787
2788
FilterOneByte(int depth,bool ignore_case)2789 RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
2790 if (info()->replacement_calculated) return replacement();
2791 if (depth < 0) return this;
2792 DCHECK(!info()->visited);
2793 VisitMarker marker(info());
2794 int element_count = elms_->length();
2795 for (int i = 0; i < element_count; i++) {
2796 TextElement elm = elms_->at(i);
2797 if (elm.text_type() == TextElement::ATOM) {
2798 Vector<const uc16> quarks = elm.atom()->data();
2799 for (int j = 0; j < quarks.length(); j++) {
2800 uint16_t c = quarks[j];
2801 if (c <= String::kMaxOneByteCharCode) continue;
2802 if (!ignore_case) return set_replacement(NULL);
2803 // Here, we need to check for characters whose upper and lower cases
2804 // are outside the Latin-1 range.
2805 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
2806 // Character is outside Latin-1 completely
2807 if (converted == 0) return set_replacement(NULL);
2808 // Convert quark to Latin-1 in place.
2809 uint16_t* copy = const_cast<uint16_t*>(quarks.start());
2810 copy[j] = converted;
2811 }
2812 } else {
2813 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2814 RegExpCharacterClass* cc = elm.char_class();
2815 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2816 if (!CharacterRange::IsCanonical(ranges)) {
2817 CharacterRange::Canonicalize(ranges);
2818 }
2819 // Now they are in order so we only need to look at the first.
2820 int range_count = ranges->length();
2821 if (cc->is_negated()) {
2822 if (range_count != 0 &&
2823 ranges->at(0).from() == 0 &&
2824 ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2825 // This will be handled in a later filter.
2826 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2827 return set_replacement(NULL);
2828 }
2829 } else {
2830 if (range_count == 0 ||
2831 ranges->at(0).from() > String::kMaxOneByteCharCode) {
2832 // This will be handled in a later filter.
2833 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2834 return set_replacement(NULL);
2835 }
2836 }
2837 }
2838 }
2839 return FilterSuccessor(depth - 1, ignore_case);
2840 }
2841
2842
FilterOneByte(int depth,bool ignore_case)2843 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2844 if (info()->replacement_calculated) return replacement();
2845 if (depth < 0) return this;
2846 if (info()->visited) return this;
2847 {
2848 VisitMarker marker(info());
2849
2850 RegExpNode* continue_replacement =
2851 continue_node_->FilterOneByte(depth - 1, ignore_case);
2852 // If we can't continue after the loop then there is no sense in doing the
2853 // loop.
2854 if (continue_replacement == NULL) return set_replacement(NULL);
2855 }
2856
2857 return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
2858 }
2859
2860
FilterOneByte(int depth,bool ignore_case)2861 RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2862 if (info()->replacement_calculated) return replacement();
2863 if (depth < 0) return this;
2864 if (info()->visited) return this;
2865 VisitMarker marker(info());
2866 int choice_count = alternatives_->length();
2867
2868 for (int i = 0; i < choice_count; i++) {
2869 GuardedAlternative alternative = alternatives_->at(i);
2870 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2871 set_replacement(this);
2872 return this;
2873 }
2874 }
2875
2876 int surviving = 0;
2877 RegExpNode* survivor = NULL;
2878 for (int i = 0; i < choice_count; i++) {
2879 GuardedAlternative alternative = alternatives_->at(i);
2880 RegExpNode* replacement =
2881 alternative.node()->FilterOneByte(depth - 1, ignore_case);
2882 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
2883 if (replacement != NULL) {
2884 alternatives_->at(i).set_node(replacement);
2885 surviving++;
2886 survivor = replacement;
2887 }
2888 }
2889 if (surviving < 2) return set_replacement(survivor);
2890
2891 set_replacement(this);
2892 if (surviving == choice_count) {
2893 return this;
2894 }
2895 // Only some of the nodes survived the filtering. We need to rebuild the
2896 // alternatives list.
2897 ZoneList<GuardedAlternative>* new_alternatives =
2898 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2899 for (int i = 0; i < choice_count; i++) {
2900 RegExpNode* replacement =
2901 alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
2902 if (replacement != NULL) {
2903 alternatives_->at(i).set_node(replacement);
2904 new_alternatives->Add(alternatives_->at(i), zone());
2905 }
2906 }
2907 alternatives_ = new_alternatives;
2908 return this;
2909 }
2910
2911
FilterOneByte(int depth,bool ignore_case)2912 RegExpNode* NegativeLookaheadChoiceNode::FilterOneByte(int depth,
2913 bool ignore_case) {
2914 if (info()->replacement_calculated) return replacement();
2915 if (depth < 0) return this;
2916 if (info()->visited) return this;
2917 VisitMarker marker(info());
2918 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2919 // afterwards.
2920 RegExpNode* node = alternatives_->at(1).node();
2921 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
2922 if (replacement == NULL) return set_replacement(NULL);
2923 alternatives_->at(1).set_node(replacement);
2924
2925 RegExpNode* neg_node = alternatives_->at(0).node();
2926 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
2927 // If the negative lookahead is always going to fail then
2928 // we don't need to check it.
2929 if (neg_replacement == NULL) return set_replacement(replacement);
2930 alternatives_->at(0).set_node(neg_replacement);
2931 return set_replacement(this);
2932 }
2933
2934
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2935 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2936 RegExpCompiler* compiler,
2937 int characters_filled_in,
2938 bool not_at_start) {
2939 if (body_can_be_zero_length_ || info()->visited) return;
2940 VisitMarker marker(info());
2941 return ChoiceNode::GetQuickCheckDetails(details,
2942 compiler,
2943 characters_filled_in,
2944 not_at_start);
2945 }
2946
2947
FillInBMInfo(int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2948 void LoopChoiceNode::FillInBMInfo(int offset,
2949 int budget,
2950 BoyerMooreLookahead* bm,
2951 bool not_at_start) {
2952 if (body_can_be_zero_length_ || budget <= 0) {
2953 bm->SetRest(offset);
2954 SaveBMInfo(bm, not_at_start, offset);
2955 return;
2956 }
2957 ChoiceNode::FillInBMInfo(offset, budget - 1, bm, not_at_start);
2958 SaveBMInfo(bm, not_at_start, offset);
2959 }
2960
2961
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2962 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2963 RegExpCompiler* compiler,
2964 int characters_filled_in,
2965 bool not_at_start) {
2966 not_at_start = (not_at_start || not_at_start_);
2967 int choice_count = alternatives_->length();
2968 DCHECK(choice_count > 0);
2969 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2970 compiler,
2971 characters_filled_in,
2972 not_at_start);
2973 for (int i = 1; i < choice_count; i++) {
2974 QuickCheckDetails new_details(details->characters());
2975 RegExpNode* node = alternatives_->at(i).node();
2976 node->GetQuickCheckDetails(&new_details, compiler,
2977 characters_filled_in,
2978 not_at_start);
2979 // Here we merge the quick match details of the two branches.
2980 details->Merge(&new_details, characters_filled_in);
2981 }
2982 }
2983
2984
2985 // Check for [0-9A-Z_a-z].
EmitWordCheck(RegExpMacroAssembler * assembler,Label * word,Label * non_word,bool fall_through_on_word)2986 static void EmitWordCheck(RegExpMacroAssembler* assembler,
2987 Label* word,
2988 Label* non_word,
2989 bool fall_through_on_word) {
2990 if (assembler->CheckSpecialCharacterClass(
2991 fall_through_on_word ? 'w' : 'W',
2992 fall_through_on_word ? non_word : word)) {
2993 // Optimized implementation available.
2994 return;
2995 }
2996 assembler->CheckCharacterGT('z', non_word);
2997 assembler->CheckCharacterLT('0', non_word);
2998 assembler->CheckCharacterGT('a' - 1, word);
2999 assembler->CheckCharacterLT('9' + 1, word);
3000 assembler->CheckCharacterLT('A', non_word);
3001 assembler->CheckCharacterLT('Z' + 1, word);
3002 if (fall_through_on_word) {
3003 assembler->CheckNotCharacter('_', non_word);
3004 } else {
3005 assembler->CheckCharacter('_', word);
3006 }
3007 }
3008
3009
3010 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
3011 // that matches newline or the start of input).
EmitHat(RegExpCompiler * compiler,RegExpNode * on_success,Trace * trace)3012 static void EmitHat(RegExpCompiler* compiler,
3013 RegExpNode* on_success,
3014 Trace* trace) {
3015 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3016 // We will be loading the previous character into the current character
3017 // register.
3018 Trace new_trace(*trace);
3019 new_trace.InvalidateCurrentCharacter();
3020
3021 Label ok;
3022 if (new_trace.cp_offset() == 0) {
3023 // The start of input counts as a newline in this context, so skip to
3024 // ok if we are at the start.
3025 assembler->CheckAtStart(&ok);
3026 }
3027 // We already checked that we are not at the start of input so it must be
3028 // OK to load the previous character.
3029 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
3030 new_trace.backtrack(),
3031 false);
3032 if (!assembler->CheckSpecialCharacterClass('n',
3033 new_trace.backtrack())) {
3034 // Newline means \n, \r, 0x2028 or 0x2029.
3035 if (!compiler->one_byte()) {
3036 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
3037 }
3038 assembler->CheckCharacter('\n', &ok);
3039 assembler->CheckNotCharacter('\r', new_trace.backtrack());
3040 }
3041 assembler->Bind(&ok);
3042 on_success->Emit(compiler, &new_trace);
3043 }
3044
3045
3046 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
EmitBoundaryCheck(RegExpCompiler * compiler,Trace * trace)3047 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
3048 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3049 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3050 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
3051 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3052 if (lookahead == NULL) {
3053 int eats_at_least =
3054 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3055 kRecursionBudget,
3056 not_at_start));
3057 if (eats_at_least >= 1) {
3058 BoyerMooreLookahead* bm =
3059 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
3060 FillInBMInfo(0, kRecursionBudget, bm, not_at_start);
3061 if (bm->at(0)->is_non_word())
3062 next_is_word_character = Trace::FALSE_VALUE;
3063 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3064 }
3065 } else {
3066 if (lookahead->at(0)->is_non_word())
3067 next_is_word_character = Trace::FALSE_VALUE;
3068 if (lookahead->at(0)->is_word())
3069 next_is_word_character = Trace::TRUE_VALUE;
3070 }
3071 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3072 if (next_is_word_character == Trace::UNKNOWN) {
3073 Label before_non_word;
3074 Label before_word;
3075 if (trace->characters_preloaded() != 1) {
3076 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3077 }
3078 // Fall through on non-word.
3079 EmitWordCheck(assembler, &before_word, &before_non_word, false);
3080 // Next character is not a word character.
3081 assembler->Bind(&before_non_word);
3082 Label ok;
3083 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3084 assembler->GoTo(&ok);
3085
3086 assembler->Bind(&before_word);
3087 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3088 assembler->Bind(&ok);
3089 } else if (next_is_word_character == Trace::TRUE_VALUE) {
3090 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3091 } else {
3092 DCHECK(next_is_word_character == Trace::FALSE_VALUE);
3093 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3094 }
3095 }
3096
3097
BacktrackIfPrevious(RegExpCompiler * compiler,Trace * trace,AssertionNode::IfPrevious backtrack_if_previous)3098 void AssertionNode::BacktrackIfPrevious(
3099 RegExpCompiler* compiler,
3100 Trace* trace,
3101 AssertionNode::IfPrevious backtrack_if_previous) {
3102 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3103 Trace new_trace(*trace);
3104 new_trace.InvalidateCurrentCharacter();
3105
3106 Label fall_through, dummy;
3107
3108 Label* non_word = backtrack_if_previous == kIsNonWord ?
3109 new_trace.backtrack() :
3110 &fall_through;
3111 Label* word = backtrack_if_previous == kIsNonWord ?
3112 &fall_through :
3113 new_trace.backtrack();
3114
3115 if (new_trace.cp_offset() == 0) {
3116 // The start of input counts as a non-word character, so the question is
3117 // decided if we are at the start.
3118 assembler->CheckAtStart(non_word);
3119 }
3120 // We already checked that we are not at the start of input so it must be
3121 // OK to load the previous character.
3122 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3123 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3124
3125 assembler->Bind(&fall_through);
3126 on_success()->Emit(compiler, &new_trace);
3127 }
3128
3129
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)3130 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3131 RegExpCompiler* compiler,
3132 int filled_in,
3133 bool not_at_start) {
3134 if (assertion_type_ == AT_START && not_at_start) {
3135 details->set_cannot_match();
3136 return;
3137 }
3138 return on_success()->GetQuickCheckDetails(details,
3139 compiler,
3140 filled_in,
3141 not_at_start);
3142 }
3143
3144
Emit(RegExpCompiler * compiler,Trace * trace)3145 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3146 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3147 switch (assertion_type_) {
3148 case AT_END: {
3149 Label ok;
3150 assembler->CheckPosition(trace->cp_offset(), &ok);
3151 assembler->GoTo(trace->backtrack());
3152 assembler->Bind(&ok);
3153 break;
3154 }
3155 case AT_START: {
3156 if (trace->at_start() == Trace::FALSE_VALUE) {
3157 assembler->GoTo(trace->backtrack());
3158 return;
3159 }
3160 if (trace->at_start() == Trace::UNKNOWN) {
3161 assembler->CheckNotAtStart(trace->backtrack());
3162 Trace at_start_trace = *trace;
3163 at_start_trace.set_at_start(true);
3164 on_success()->Emit(compiler, &at_start_trace);
3165 return;
3166 }
3167 }
3168 break;
3169 case AFTER_NEWLINE:
3170 EmitHat(compiler, on_success(), trace);
3171 return;
3172 case AT_BOUNDARY:
3173 case AT_NON_BOUNDARY: {
3174 EmitBoundaryCheck(compiler, trace);
3175 return;
3176 }
3177 }
3178 on_success()->Emit(compiler, trace);
3179 }
3180
3181
DeterminedAlready(QuickCheckDetails * quick_check,int offset)3182 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3183 if (quick_check == NULL) return false;
3184 if (offset >= quick_check->characters()) return false;
3185 return quick_check->positions(offset)->determines_perfectly;
3186 }
3187
3188
UpdateBoundsCheck(int index,int * checked_up_to)3189 static void UpdateBoundsCheck(int index, int* checked_up_to) {
3190 if (index > *checked_up_to) {
3191 *checked_up_to = index;
3192 }
3193 }
3194
3195
3196 // We call this repeatedly to generate code for each pass over the text node.
3197 // The passes are in increasing order of difficulty because we hope one
3198 // of the first passes will fail in which case we are saved the work of the
3199 // later passes. for example for the case independent regexp /%[asdfghjkl]a/
3200 // we will check the '%' in the first pass, the case independent 'a' in the
3201 // second pass and the character class in the last pass.
3202 //
3203 // The passes are done from right to left, so for example to test for /bar/
3204 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
3205 // and then a 'b' with offset 0. This means we can avoid the end-of-input
3206 // bounds check most of the time. In the example we only need to check for
3207 // end-of-input when loading the putative 'r'.
3208 //
3209 // A slight complication involves the fact that the first character may already
3210 // be fetched into a register by the previous node. In this case we want to
3211 // do the test for that character first. We do this in separate passes. The
3212 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3213 // pass has been performed then subsequent passes will have true in
3214 // first_element_checked to indicate that that character does not need to be
3215 // checked again.
3216 //
3217 // In addition to all this we are passed a Trace, which can
3218 // contain an AlternativeGeneration object. In this AlternativeGeneration
3219 // object we can see details of any quick check that was already passed in
3220 // order to get to the code we are now generating. The quick check can involve
3221 // loading characters, which means we do not need to recheck the bounds
3222 // up to the limit the quick check already checked. In addition the quick
3223 // check can have involved a mask and compare operation which may simplify
3224 // or obviate the need for further checks at some character positions.
TextEmitPass(RegExpCompiler * compiler,TextEmitPassType pass,bool preloaded,Trace * trace,bool first_element_checked,int * checked_up_to)3225 void TextNode::TextEmitPass(RegExpCompiler* compiler,
3226 TextEmitPassType pass,
3227 bool preloaded,
3228 Trace* trace,
3229 bool first_element_checked,
3230 int* checked_up_to) {
3231 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3232 Isolate* isolate = assembler->zone()->isolate();
3233 bool one_byte = compiler->one_byte();
3234 Label* backtrack = trace->backtrack();
3235 QuickCheckDetails* quick_check = trace->quick_check_performed();
3236 int element_count = elms_->length();
3237 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3238 TextElement elm = elms_->at(i);
3239 int cp_offset = trace->cp_offset() + elm.cp_offset();
3240 if (elm.text_type() == TextElement::ATOM) {
3241 Vector<const uc16> quarks = elm.atom()->data();
3242 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3243 if (first_element_checked && i == 0 && j == 0) continue;
3244 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3245 EmitCharacterFunction* emit_function = NULL;
3246 switch (pass) {
3247 case NON_LATIN1_MATCH:
3248 DCHECK(one_byte);
3249 if (quarks[j] > String::kMaxOneByteCharCode) {
3250 assembler->GoTo(backtrack);
3251 return;
3252 }
3253 break;
3254 case NON_LETTER_CHARACTER_MATCH:
3255 emit_function = &EmitAtomNonLetter;
3256 break;
3257 case SIMPLE_CHARACTER_MATCH:
3258 emit_function = &EmitSimpleCharacter;
3259 break;
3260 case CASE_CHARACTER_MATCH:
3261 emit_function = &EmitAtomLetter;
3262 break;
3263 default:
3264 break;
3265 }
3266 if (emit_function != NULL) {
3267 bool bound_checked = emit_function(isolate,
3268 compiler,
3269 quarks[j],
3270 backtrack,
3271 cp_offset + j,
3272 *checked_up_to < cp_offset + j,
3273 preloaded);
3274 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3275 }
3276 }
3277 } else {
3278 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
3279 if (pass == CHARACTER_CLASS_MATCH) {
3280 if (first_element_checked && i == 0) continue;
3281 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3282 RegExpCharacterClass* cc = elm.char_class();
3283 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
3284 *checked_up_to < cp_offset, preloaded, zone());
3285 UpdateBoundsCheck(cp_offset, checked_up_to);
3286 }
3287 }
3288 }
3289 }
3290
3291
Length()3292 int TextNode::Length() {
3293 TextElement elm = elms_->last();
3294 DCHECK(elm.cp_offset() >= 0);
3295 return elm.cp_offset() + elm.length();
3296 }
3297
3298
SkipPass(int int_pass,bool ignore_case)3299 bool TextNode::SkipPass(int int_pass, bool ignore_case) {
3300 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3301 if (ignore_case) {
3302 return pass == SIMPLE_CHARACTER_MATCH;
3303 } else {
3304 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3305 }
3306 }
3307
3308
3309 // This generates the code to match a text node. A text node can contain
3310 // straight character sequences (possibly to be matched in a case-independent
3311 // way) and character classes. For efficiency we do not do this in a single
3312 // pass from left to right. Instead we pass over the text node several times,
3313 // emitting code for some character positions every time. See the comment on
3314 // TextEmitPass for details.
Emit(RegExpCompiler * compiler,Trace * trace)3315 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3316 LimitResult limit_result = LimitVersions(compiler, trace);
3317 if (limit_result == DONE) return;
3318 DCHECK(limit_result == CONTINUE);
3319
3320 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3321 compiler->SetRegExpTooBig();
3322 return;
3323 }
3324
3325 if (compiler->one_byte()) {
3326 int dummy = 0;
3327 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3328 }
3329
3330 bool first_elt_done = false;
3331 int bound_checked_to = trace->cp_offset() - 1;
3332 bound_checked_to += trace->bound_checked_up_to();
3333
3334 // If a character is preloaded into the current character register then
3335 // check that now.
3336 if (trace->characters_preloaded() == 1) {
3337 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3338 if (!SkipPass(pass, compiler->ignore_case())) {
3339 TextEmitPass(compiler,
3340 static_cast<TextEmitPassType>(pass),
3341 true,
3342 trace,
3343 false,
3344 &bound_checked_to);
3345 }
3346 }
3347 first_elt_done = true;
3348 }
3349
3350 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3351 if (!SkipPass(pass, compiler->ignore_case())) {
3352 TextEmitPass(compiler,
3353 static_cast<TextEmitPassType>(pass),
3354 false,
3355 trace,
3356 first_elt_done,
3357 &bound_checked_to);
3358 }
3359 }
3360
3361 Trace successor_trace(*trace);
3362 successor_trace.set_at_start(false);
3363 successor_trace.AdvanceCurrentPositionInTrace(Length(), compiler);
3364 RecursionCheck rc(compiler);
3365 on_success()->Emit(compiler, &successor_trace);
3366 }
3367
3368
InvalidateCurrentCharacter()3369 void Trace::InvalidateCurrentCharacter() {
3370 characters_preloaded_ = 0;
3371 }
3372
3373
AdvanceCurrentPositionInTrace(int by,RegExpCompiler * compiler)3374 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
3375 DCHECK(by > 0);
3376 // We don't have an instruction for shifting the current character register
3377 // down or for using a shifted value for anything so lets just forget that
3378 // we preloaded any characters into it.
3379 characters_preloaded_ = 0;
3380 // Adjust the offsets of the quick check performed information. This
3381 // information is used to find out what we already determined about the
3382 // characters by means of mask and compare.
3383 quick_check_performed_.Advance(by, compiler->one_byte());
3384 cp_offset_ += by;
3385 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3386 compiler->SetRegExpTooBig();
3387 cp_offset_ = 0;
3388 }
3389 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3390 }
3391
3392
MakeCaseIndependent(bool is_one_byte)3393 void TextNode::MakeCaseIndependent(bool is_one_byte) {
3394 int element_count = elms_->length();
3395 for (int i = 0; i < element_count; i++) {
3396 TextElement elm = elms_->at(i);
3397 if (elm.text_type() == TextElement::CHAR_CLASS) {
3398 RegExpCharacterClass* cc = elm.char_class();
3399 // None of the standard character classes is different in the case
3400 // independent case and it slows us down if we don't know that.
3401 if (cc->is_standard(zone())) continue;
3402 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
3403 int range_count = ranges->length();
3404 for (int j = 0; j < range_count; j++) {
3405 ranges->at(j).AddCaseEquivalents(ranges, is_one_byte, zone());
3406 }
3407 }
3408 }
3409 }
3410
3411
GreedyLoopTextLength()3412 int TextNode::GreedyLoopTextLength() {
3413 TextElement elm = elms_->at(elms_->length() - 1);
3414 return elm.cp_offset() + elm.length();
3415 }
3416
3417
GetSuccessorOfOmnivorousTextNode(RegExpCompiler * compiler)3418 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3419 RegExpCompiler* compiler) {
3420 if (elms_->length() != 1) return NULL;
3421 TextElement elm = elms_->at(0);
3422 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
3423 RegExpCharacterClass* node = elm.char_class();
3424 ZoneList<CharacterRange>* ranges = node->ranges(zone());
3425 if (!CharacterRange::IsCanonical(ranges)) {
3426 CharacterRange::Canonicalize(ranges);
3427 }
3428 if (node->is_negated()) {
3429 return ranges->length() == 0 ? on_success() : NULL;
3430 }
3431 if (ranges->length() != 1) return NULL;
3432 uint32_t max_char;
3433 if (compiler->one_byte()) {
3434 max_char = String::kMaxOneByteCharCode;
3435 } else {
3436 max_char = String::kMaxUtf16CodeUnit;
3437 }
3438 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
3439 }
3440
3441
3442 // Finds the fixed match length of a sequence of nodes that goes from
3443 // this alternative and back to this choice node. If there are variable
3444 // length nodes or other complications in the way then return a sentinel
3445 // value indicating that a greedy loop cannot be constructed.
GreedyLoopTextLengthForAlternative(GuardedAlternative * alternative)3446 int ChoiceNode::GreedyLoopTextLengthForAlternative(
3447 GuardedAlternative* alternative) {
3448 int length = 0;
3449 RegExpNode* node = alternative->node();
3450 // Later we will generate code for all these text nodes using recursion
3451 // so we have to limit the max number.
3452 int recursion_depth = 0;
3453 while (node != this) {
3454 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3455 return kNodeIsTooComplexForGreedyLoops;
3456 }
3457 int node_length = node->GreedyLoopTextLength();
3458 if (node_length == kNodeIsTooComplexForGreedyLoops) {
3459 return kNodeIsTooComplexForGreedyLoops;
3460 }
3461 length += node_length;
3462 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3463 node = seq_node->on_success();
3464 }
3465 return length;
3466 }
3467
3468
AddLoopAlternative(GuardedAlternative alt)3469 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3470 DCHECK_EQ(loop_node_, NULL);
3471 AddAlternative(alt);
3472 loop_node_ = alt.node();
3473 }
3474
3475
AddContinueAlternative(GuardedAlternative alt)3476 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3477 DCHECK_EQ(continue_node_, NULL);
3478 AddAlternative(alt);
3479 continue_node_ = alt.node();
3480 }
3481
3482
Emit(RegExpCompiler * compiler,Trace * trace)3483 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3484 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3485 if (trace->stop_node() == this) {
3486 // Back edge of greedy optimized loop node graph.
3487 int text_length =
3488 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3489 DCHECK(text_length != kNodeIsTooComplexForGreedyLoops);
3490 // Update the counter-based backtracking info on the stack. This is an
3491 // optimization for greedy loops (see below).
3492 DCHECK(trace->cp_offset() == text_length);
3493 macro_assembler->AdvanceCurrentPosition(text_length);
3494 macro_assembler->GoTo(trace->loop_label());
3495 return;
3496 }
3497 DCHECK(trace->stop_node() == NULL);
3498 if (!trace->is_trivial()) {
3499 trace->Flush(compiler, this);
3500 return;
3501 }
3502 ChoiceNode::Emit(compiler, trace);
3503 }
3504
3505
CalculatePreloadCharacters(RegExpCompiler * compiler,int eats_at_least)3506 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3507 int eats_at_least) {
3508 int preload_characters = Min(4, eats_at_least);
3509 if (compiler->macro_assembler()->CanReadUnaligned()) {
3510 bool one_byte = compiler->one_byte();
3511 if (one_byte) {
3512 if (preload_characters > 4) preload_characters = 4;
3513 // We can't preload 3 characters because there is no machine instruction
3514 // to do that. We can't just load 4 because we could be reading
3515 // beyond the end of the string, which could cause a memory fault.
3516 if (preload_characters == 3) preload_characters = 2;
3517 } else {
3518 if (preload_characters > 2) preload_characters = 2;
3519 }
3520 } else {
3521 if (preload_characters > 1) preload_characters = 1;
3522 }
3523 return preload_characters;
3524 }
3525
3526
3527 // This class is used when generating the alternatives in a choice node. It
3528 // records the way the alternative is being code generated.
3529 class AlternativeGeneration: public Malloced {
3530 public:
AlternativeGeneration()3531 AlternativeGeneration()
3532 : possible_success(),
3533 expects_preload(false),
3534 after(),
3535 quick_check_details() { }
3536 Label possible_success;
3537 bool expects_preload;
3538 Label after;
3539 QuickCheckDetails quick_check_details;
3540 };
3541
3542
3543 // Creates a list of AlternativeGenerations. If the list has a reasonable
3544 // size then it is on the stack, otherwise the excess is on the heap.
3545 class AlternativeGenerationList {
3546 public:
AlternativeGenerationList(int count,Zone * zone)3547 AlternativeGenerationList(int count, Zone* zone)
3548 : alt_gens_(count, zone) {
3549 for (int i = 0; i < count && i < kAFew; i++) {
3550 alt_gens_.Add(a_few_alt_gens_ + i, zone);
3551 }
3552 for (int i = kAFew; i < count; i++) {
3553 alt_gens_.Add(new AlternativeGeneration(), zone);
3554 }
3555 }
~AlternativeGenerationList()3556 ~AlternativeGenerationList() {
3557 for (int i = kAFew; i < alt_gens_.length(); i++) {
3558 delete alt_gens_[i];
3559 alt_gens_[i] = NULL;
3560 }
3561 }
3562
at(int i)3563 AlternativeGeneration* at(int i) {
3564 return alt_gens_[i];
3565 }
3566
3567 private:
3568 static const int kAFew = 10;
3569 ZoneList<AlternativeGeneration*> alt_gens_;
3570 AlternativeGeneration a_few_alt_gens_[kAFew];
3571 };
3572
3573
3574 // The '2' variant is has inclusive from and exclusive to.
3575 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3576 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
3577 static const int kSpaceRanges[] = { '\t', '\r' + 1, ' ', ' ' + 1,
3578 0x00A0, 0x00A1, 0x1680, 0x1681, 0x180E, 0x180F, 0x2000, 0x200B,
3579 0x2028, 0x202A, 0x202F, 0x2030, 0x205F, 0x2060, 0x3000, 0x3001,
3580 0xFEFF, 0xFF00, 0x10000 };
3581 static const int kSpaceRangeCount = arraysize(kSpaceRanges);
3582
3583 static const int kWordRanges[] = {
3584 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, 0x10000 };
3585 static const int kWordRangeCount = arraysize(kWordRanges);
3586 static const int kDigitRanges[] = { '0', '9' + 1, 0x10000 };
3587 static const int kDigitRangeCount = arraysize(kDigitRanges);
3588 static const int kSurrogateRanges[] = { 0xd800, 0xe000, 0x10000 };
3589 static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
3590 static const int kLineTerminatorRanges[] = { 0x000A, 0x000B, 0x000D, 0x000E,
3591 0x2028, 0x202A, 0x10000 };
3592 static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
3593
3594
Set(int character)3595 void BoyerMoorePositionInfo::Set(int character) {
3596 SetInterval(Interval(character, character));
3597 }
3598
3599
SetInterval(const Interval & interval)3600 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3601 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3602 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3603 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3604 surrogate_ =
3605 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3606 if (interval.to() - interval.from() >= kMapSize - 1) {
3607 if (map_count_ != kMapSize) {
3608 map_count_ = kMapSize;
3609 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3610 }
3611 return;
3612 }
3613 for (int i = interval.from(); i <= interval.to(); i++) {
3614 int mod_character = (i & kMask);
3615 if (!map_->at(mod_character)) {
3616 map_count_++;
3617 map_->at(mod_character) = true;
3618 }
3619 if (map_count_ == kMapSize) return;
3620 }
3621 }
3622
3623
SetAll()3624 void BoyerMoorePositionInfo::SetAll() {
3625 s_ = w_ = d_ = kLatticeUnknown;
3626 if (map_count_ != kMapSize) {
3627 map_count_ = kMapSize;
3628 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3629 }
3630 }
3631
3632
BoyerMooreLookahead(int length,RegExpCompiler * compiler,Zone * zone)3633 BoyerMooreLookahead::BoyerMooreLookahead(
3634 int length, RegExpCompiler* compiler, Zone* zone)
3635 : length_(length),
3636 compiler_(compiler) {
3637 if (compiler->one_byte()) {
3638 max_char_ = String::kMaxOneByteCharCode;
3639 } else {
3640 max_char_ = String::kMaxUtf16CodeUnit;
3641 }
3642 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3643 for (int i = 0; i < length; i++) {
3644 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3645 }
3646 }
3647
3648
3649 // Find the longest range of lookahead that has the fewest number of different
3650 // characters that can occur at a given position. Since we are optimizing two
3651 // different parameters at once this is a tradeoff.
FindWorthwhileInterval(int * from,int * to)3652 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3653 int biggest_points = 0;
3654 // If more than 32 characters out of 128 can occur it is unlikely that we can
3655 // be lucky enough to step forwards much of the time.
3656 const int kMaxMax = 32;
3657 for (int max_number_of_chars = 4;
3658 max_number_of_chars < kMaxMax;
3659 max_number_of_chars *= 2) {
3660 biggest_points =
3661 FindBestInterval(max_number_of_chars, biggest_points, from, to);
3662 }
3663 if (biggest_points == 0) return false;
3664 return true;
3665 }
3666
3667
3668 // Find the highest-points range between 0 and length_ where the character
3669 // information is not too vague. 'Too vague' means that there are more than
3670 // max_number_of_chars that can occur at this position. Calculates the number
3671 // of points as the product of width-of-the-range and
3672 // probability-of-finding-one-of-the-characters, where the probability is
3673 // calculated using the frequency distribution of the sample subject string.
FindBestInterval(int max_number_of_chars,int old_biggest_points,int * from,int * to)3674 int BoyerMooreLookahead::FindBestInterval(
3675 int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3676 int biggest_points = old_biggest_points;
3677 static const int kSize = RegExpMacroAssembler::kTableSize;
3678 for (int i = 0; i < length_; ) {
3679 while (i < length_ && Count(i) > max_number_of_chars) i++;
3680 if (i == length_) break;
3681 int remembered_from = i;
3682 bool union_map[kSize];
3683 for (int j = 0; j < kSize; j++) union_map[j] = false;
3684 while (i < length_ && Count(i) <= max_number_of_chars) {
3685 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3686 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3687 i++;
3688 }
3689 int frequency = 0;
3690 for (int j = 0; j < kSize; j++) {
3691 if (union_map[j]) {
3692 // Add 1 to the frequency to give a small per-character boost for
3693 // the cases where our sampling is not good enough and many
3694 // characters have a frequency of zero. This means the frequency
3695 // can theoretically be up to 2*kSize though we treat it mostly as
3696 // a fraction of kSize.
3697 frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3698 }
3699 }
3700 // We use the probability of skipping times the distance we are skipping to
3701 // judge the effectiveness of this. Actually we have a cut-off: By
3702 // dividing by 2 we switch off the skipping if the probability of skipping
3703 // is less than 50%. This is because the multibyte mask-and-compare
3704 // skipping in quickcheck is more likely to do well on this case.
3705 bool in_quickcheck_range =
3706 ((i - remembered_from < 4) ||
3707 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3708 // Called 'probability' but it is only a rough estimate and can actually
3709 // be outside the 0-kSize range.
3710 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3711 int points = (i - remembered_from) * probability;
3712 if (points > biggest_points) {
3713 *from = remembered_from;
3714 *to = i - 1;
3715 biggest_points = points;
3716 }
3717 }
3718 return biggest_points;
3719 }
3720
3721
3722 // Take all the characters that will not prevent a successful match if they
3723 // occur in the subject string in the range between min_lookahead and
3724 // max_lookahead (inclusive) measured from the current position. If the
3725 // character at max_lookahead offset is not one of these characters, then we
3726 // can safely skip forwards by the number of characters in the range.
GetSkipTable(int min_lookahead,int max_lookahead,Handle<ByteArray> boolean_skip_table)3727 int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3728 int max_lookahead,
3729 Handle<ByteArray> boolean_skip_table) {
3730 const int kSize = RegExpMacroAssembler::kTableSize;
3731
3732 const int kSkipArrayEntry = 0;
3733 const int kDontSkipArrayEntry = 1;
3734
3735 for (int i = 0; i < kSize; i++) {
3736 boolean_skip_table->set(i, kSkipArrayEntry);
3737 }
3738 int skip = max_lookahead + 1 - min_lookahead;
3739
3740 for (int i = max_lookahead; i >= min_lookahead; i--) {
3741 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3742 for (int j = 0; j < kSize; j++) {
3743 if (map->at(j)) {
3744 boolean_skip_table->set(j, kDontSkipArrayEntry);
3745 }
3746 }
3747 }
3748
3749 return skip;
3750 }
3751
3752
3753 // See comment above on the implementation of GetSkipTable.
EmitSkipInstructions(RegExpMacroAssembler * masm)3754 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3755 const int kSize = RegExpMacroAssembler::kTableSize;
3756
3757 int min_lookahead = 0;
3758 int max_lookahead = 0;
3759
3760 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3761
3762 bool found_single_character = false;
3763 int single_character = 0;
3764 for (int i = max_lookahead; i >= min_lookahead; i--) {
3765 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3766 if (map->map_count() > 1 ||
3767 (found_single_character && map->map_count() != 0)) {
3768 found_single_character = false;
3769 break;
3770 }
3771 for (int j = 0; j < kSize; j++) {
3772 if (map->at(j)) {
3773 found_single_character = true;
3774 single_character = j;
3775 break;
3776 }
3777 }
3778 }
3779
3780 int lookahead_width = max_lookahead + 1 - min_lookahead;
3781
3782 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3783 // The mask-compare can probably handle this better.
3784 return;
3785 }
3786
3787 if (found_single_character) {
3788 Label cont, again;
3789 masm->Bind(&again);
3790 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3791 if (max_char_ > kSize) {
3792 masm->CheckCharacterAfterAnd(single_character,
3793 RegExpMacroAssembler::kTableMask,
3794 &cont);
3795 } else {
3796 masm->CheckCharacter(single_character, &cont);
3797 }
3798 masm->AdvanceCurrentPosition(lookahead_width);
3799 masm->GoTo(&again);
3800 masm->Bind(&cont);
3801 return;
3802 }
3803
3804 Factory* factory = masm->zone()->isolate()->factory();
3805 Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
3806 int skip_distance = GetSkipTable(
3807 min_lookahead, max_lookahead, boolean_skip_table);
3808 DCHECK(skip_distance != 0);
3809
3810 Label cont, again;
3811 masm->Bind(&again);
3812 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3813 masm->CheckBitInTable(boolean_skip_table, &cont);
3814 masm->AdvanceCurrentPosition(skip_distance);
3815 masm->GoTo(&again);
3816 masm->Bind(&cont);
3817 }
3818
3819
3820 /* Code generation for choice nodes.
3821 *
3822 * We generate quick checks that do a mask and compare to eliminate a
3823 * choice. If the quick check succeeds then it jumps to the continuation to
3824 * do slow checks and check subsequent nodes. If it fails (the common case)
3825 * it falls through to the next choice.
3826 *
3827 * Here is the desired flow graph. Nodes directly below each other imply
3828 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3829 * 3 doesn't have a quick check so we have to call the slow check.
3830 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3831 * regexp continuation is generated directly after the Sn node, up to the
3832 * next GoTo if we decide to reuse some already generated code. Some
3833 * nodes expect preload_characters to be preloaded into the current
3834 * character register. R nodes do this preloading. Vertices are marked
3835 * F for failures and S for success (possible success in the case of quick
3836 * nodes). L, V, < and > are used as arrow heads.
3837 *
3838 * ----------> R
3839 * |
3840 * V
3841 * Q1 -----> S1
3842 * | S /
3843 * F| /
3844 * | F/
3845 * | /
3846 * | R
3847 * | /
3848 * V L
3849 * Q2 -----> S2
3850 * | S /
3851 * F| /
3852 * | F/
3853 * | /
3854 * | R
3855 * | /
3856 * V L
3857 * S3
3858 * |
3859 * F|
3860 * |
3861 * R
3862 * |
3863 * backtrack V
3864 * <----------Q4
3865 * \ F |
3866 * \ |S
3867 * \ F V
3868 * \-----S4
3869 *
3870 * For greedy loops we push the current position, then generate the code that
3871 * eats the input specially in EmitGreedyLoop. The other choice (the
3872 * continuation) is generated by the normal code in EmitChoices, and steps back
3873 * in the input to the starting position when it fails to match. The loop code
3874 * looks like this (U is the unwind code that steps back in the greedy loop).
3875 *
3876 * _____
3877 * / \
3878 * V |
3879 * ----------> S1 |
3880 * /| |
3881 * / |S |
3882 * F/ \_____/
3883 * /
3884 * |<-----
3885 * | \
3886 * V |S
3887 * Q2 ---> U----->backtrack
3888 * | F /
3889 * S| /
3890 * V F /
3891 * S2--/
3892 */
3893
GreedyLoopState(bool not_at_start)3894 GreedyLoopState::GreedyLoopState(bool not_at_start) {
3895 counter_backtrack_trace_.set_backtrack(&label_);
3896 if (not_at_start) counter_backtrack_trace_.set_at_start(false);
3897 }
3898
3899
AssertGuardsMentionRegisters(Trace * trace)3900 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3901 #ifdef DEBUG
3902 int choice_count = alternatives_->length();
3903 for (int i = 0; i < choice_count - 1; i++) {
3904 GuardedAlternative alternative = alternatives_->at(i);
3905 ZoneList<Guard*>* guards = alternative.guards();
3906 int guard_count = (guards == NULL) ? 0 : guards->length();
3907 for (int j = 0; j < guard_count; j++) {
3908 DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3909 }
3910 }
3911 #endif
3912 }
3913
3914
SetUpPreLoad(RegExpCompiler * compiler,Trace * current_trace,PreloadState * state)3915 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3916 Trace* current_trace,
3917 PreloadState* state) {
3918 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3919 // Save some time by looking at most one machine word ahead.
3920 state->eats_at_least_ =
3921 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3922 current_trace->at_start() == Trace::FALSE_VALUE);
3923 }
3924 state->preload_characters_ =
3925 CalculatePreloadCharacters(compiler, state->eats_at_least_);
3926
3927 state->preload_is_current_ =
3928 (current_trace->characters_preloaded() == state->preload_characters_);
3929 state->preload_has_checked_bounds_ = state->preload_is_current_;
3930 }
3931
3932
Emit(RegExpCompiler * compiler,Trace * trace)3933 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3934 int choice_count = alternatives_->length();
3935
3936 AssertGuardsMentionRegisters(trace);
3937
3938 LimitResult limit_result = LimitVersions(compiler, trace);
3939 if (limit_result == DONE) return;
3940 DCHECK(limit_result == CONTINUE);
3941
3942 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3943 // other choice nodes we only flush if we are out of code size budget.
3944 if (trace->flush_budget() == 0 && trace->actions() != NULL) {
3945 trace->Flush(compiler, this);
3946 return;
3947 }
3948
3949 RecursionCheck rc(compiler);
3950
3951 PreloadState preload;
3952 preload.init();
3953 GreedyLoopState greedy_loop_state(not_at_start());
3954
3955 int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3956 AlternativeGenerationList alt_gens(choice_count, zone());
3957
3958 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
3959 trace = EmitGreedyLoop(compiler,
3960 trace,
3961 &alt_gens,
3962 &preload,
3963 &greedy_loop_state,
3964 text_length);
3965 } else {
3966 // TODO(erikcorry): Delete this. We don't need this label, but it makes us
3967 // match the traces produced pre-cleanup.
3968 Label second_choice;
3969 compiler->macro_assembler()->Bind(&second_choice);
3970
3971 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
3972
3973 EmitChoices(compiler,
3974 &alt_gens,
3975 0,
3976 trace,
3977 &preload);
3978 }
3979
3980 // At this point we need to generate slow checks for the alternatives where
3981 // the quick check was inlined. We can recognize these because the associated
3982 // label was bound.
3983 int new_flush_budget = trace->flush_budget() / choice_count;
3984 for (int i = 0; i < choice_count; i++) {
3985 AlternativeGeneration* alt_gen = alt_gens.at(i);
3986 Trace new_trace(*trace);
3987 // If there are actions to be flushed we have to limit how many times
3988 // they are flushed. Take the budget of the parent trace and distribute
3989 // it fairly amongst the children.
3990 if (new_trace.actions() != NULL) {
3991 new_trace.set_flush_budget(new_flush_budget);
3992 }
3993 bool next_expects_preload =
3994 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
3995 EmitOutOfLineContinuation(compiler,
3996 &new_trace,
3997 alternatives_->at(i),
3998 alt_gen,
3999 preload.preload_characters_,
4000 next_expects_preload);
4001 }
4002 }
4003
4004
EmitGreedyLoop(RegExpCompiler * compiler,Trace * trace,AlternativeGenerationList * alt_gens,PreloadState * preload,GreedyLoopState * greedy_loop_state,int text_length)4005 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
4006 Trace* trace,
4007 AlternativeGenerationList* alt_gens,
4008 PreloadState* preload,
4009 GreedyLoopState* greedy_loop_state,
4010 int text_length) {
4011 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4012 // Here we have special handling for greedy loops containing only text nodes
4013 // and other simple nodes. These are handled by pushing the current
4014 // position on the stack and then incrementing the current position each
4015 // time around the switch. On backtrack we decrement the current position
4016 // and check it against the pushed value. This avoids pushing backtrack
4017 // information for each iteration of the loop, which could take up a lot of
4018 // space.
4019 DCHECK(trace->stop_node() == NULL);
4020 macro_assembler->PushCurrentPosition();
4021 Label greedy_match_failed;
4022 Trace greedy_match_trace;
4023 if (not_at_start()) greedy_match_trace.set_at_start(false);
4024 greedy_match_trace.set_backtrack(&greedy_match_failed);
4025 Label loop_label;
4026 macro_assembler->Bind(&loop_label);
4027 greedy_match_trace.set_stop_node(this);
4028 greedy_match_trace.set_loop_label(&loop_label);
4029 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
4030 macro_assembler->Bind(&greedy_match_failed);
4031
4032 Label second_choice; // For use in greedy matches.
4033 macro_assembler->Bind(&second_choice);
4034
4035 Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
4036
4037 EmitChoices(compiler,
4038 alt_gens,
4039 1,
4040 new_trace,
4041 preload);
4042
4043 macro_assembler->Bind(greedy_loop_state->label());
4044 // If we have unwound to the bottom then backtrack.
4045 macro_assembler->CheckGreedyLoop(trace->backtrack());
4046 // Otherwise try the second priority at an earlier position.
4047 macro_assembler->AdvanceCurrentPosition(-text_length);
4048 macro_assembler->GoTo(&second_choice);
4049 return new_trace;
4050 }
4051
EmitOptimizedUnanchoredSearch(RegExpCompiler * compiler,Trace * trace)4052 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
4053 Trace* trace) {
4054 int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
4055 if (alternatives_->length() != 2) return eats_at_least;
4056
4057 GuardedAlternative alt1 = alternatives_->at(1);
4058 if (alt1.guards() != NULL && alt1.guards()->length() != 0) {
4059 return eats_at_least;
4060 }
4061 RegExpNode* eats_anything_node = alt1.node();
4062 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
4063 return eats_at_least;
4064 }
4065
4066 // Really we should be creating a new trace when we execute this function,
4067 // but there is no need, because the code it generates cannot backtrack, and
4068 // we always arrive here with a trivial trace (since it's the entry to a
4069 // loop. That also implies that there are no preloaded characters, which is
4070 // good, because it means we won't be violating any assumptions by
4071 // overwriting those characters with new load instructions.
4072 DCHECK(trace->is_trivial());
4073
4074 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4075 // At this point we know that we are at a non-greedy loop that will eat
4076 // any character one at a time. Any non-anchored regexp has such a
4077 // loop prepended to it in order to find where it starts. We look for
4078 // a pattern of the form ...abc... where we can look 6 characters ahead
4079 // and step forwards 3 if the character is not one of abc. Abc need
4080 // not be atoms, they can be any reasonably limited character class or
4081 // small alternation.
4082 BoyerMooreLookahead* bm = bm_info(false);
4083 if (bm == NULL) {
4084 eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4085 EatsAtLeast(kMaxLookaheadForBoyerMoore,
4086 kRecursionBudget,
4087 false));
4088 if (eats_at_least >= 1) {
4089 bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4090 compiler,
4091 zone());
4092 GuardedAlternative alt0 = alternatives_->at(0);
4093 alt0.node()->FillInBMInfo(0, kRecursionBudget, bm, false);
4094 }
4095 }
4096 if (bm != NULL) {
4097 bm->EmitSkipInstructions(macro_assembler);
4098 }
4099 return eats_at_least;
4100 }
4101
4102
EmitChoices(RegExpCompiler * compiler,AlternativeGenerationList * alt_gens,int first_choice,Trace * trace,PreloadState * preload)4103 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
4104 AlternativeGenerationList* alt_gens,
4105 int first_choice,
4106 Trace* trace,
4107 PreloadState* preload) {
4108 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4109 SetUpPreLoad(compiler, trace, preload);
4110
4111 // For now we just call all choices one after the other. The idea ultimately
4112 // is to use the Dispatch table to try only the relevant ones.
4113 int choice_count = alternatives_->length();
4114
4115 int new_flush_budget = trace->flush_budget() / choice_count;
4116
4117 for (int i = first_choice; i < choice_count; i++) {
4118 bool is_last = i == choice_count - 1;
4119 bool fall_through_on_failure = !is_last;
4120 GuardedAlternative alternative = alternatives_->at(i);
4121 AlternativeGeneration* alt_gen = alt_gens->at(i);
4122 alt_gen->quick_check_details.set_characters(preload->preload_characters_);
4123 ZoneList<Guard*>* guards = alternative.guards();
4124 int guard_count = (guards == NULL) ? 0 : guards->length();
4125 Trace new_trace(*trace);
4126 new_trace.set_characters_preloaded(preload->preload_is_current_ ?
4127 preload->preload_characters_ :
4128 0);
4129 if (preload->preload_has_checked_bounds_) {
4130 new_trace.set_bound_checked_up_to(preload->preload_characters_);
4131 }
4132 new_trace.quick_check_performed()->Clear();
4133 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4134 if (!is_last) {
4135 new_trace.set_backtrack(&alt_gen->after);
4136 }
4137 alt_gen->expects_preload = preload->preload_is_current_;
4138 bool generate_full_check_inline = false;
4139 if (FLAG_regexp_optimization &&
4140 try_to_emit_quick_check_for_alternative(i == 0) &&
4141 alternative.node()->EmitQuickCheck(compiler,
4142 trace,
4143 &new_trace,
4144 preload->preload_has_checked_bounds_,
4145 &alt_gen->possible_success,
4146 &alt_gen->quick_check_details,
4147 fall_through_on_failure)) {
4148 // Quick check was generated for this choice.
4149 preload->preload_is_current_ = true;
4150 preload->preload_has_checked_bounds_ = true;
4151 // If we generated the quick check to fall through on possible success,
4152 // we now need to generate the full check inline.
4153 if (!fall_through_on_failure) {
4154 macro_assembler->Bind(&alt_gen->possible_success);
4155 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4156 new_trace.set_characters_preloaded(preload->preload_characters_);
4157 new_trace.set_bound_checked_up_to(preload->preload_characters_);
4158 generate_full_check_inline = true;
4159 }
4160 } else if (alt_gen->quick_check_details.cannot_match()) {
4161 if (!fall_through_on_failure) {
4162 macro_assembler->GoTo(trace->backtrack());
4163 }
4164 continue;
4165 } else {
4166 // No quick check was generated. Put the full code here.
4167 // If this is not the first choice then there could be slow checks from
4168 // previous cases that go here when they fail. There's no reason to
4169 // insist that they preload characters since the slow check we are about
4170 // to generate probably can't use it.
4171 if (i != first_choice) {
4172 alt_gen->expects_preload = false;
4173 new_trace.InvalidateCurrentCharacter();
4174 }
4175 generate_full_check_inline = true;
4176 }
4177 if (generate_full_check_inline) {
4178 if (new_trace.actions() != NULL) {
4179 new_trace.set_flush_budget(new_flush_budget);
4180 }
4181 for (int j = 0; j < guard_count; j++) {
4182 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4183 }
4184 alternative.node()->Emit(compiler, &new_trace);
4185 preload->preload_is_current_ = false;
4186 }
4187 macro_assembler->Bind(&alt_gen->after);
4188 }
4189 }
4190
4191
EmitOutOfLineContinuation(RegExpCompiler * compiler,Trace * trace,GuardedAlternative alternative,AlternativeGeneration * alt_gen,int preload_characters,bool next_expects_preload)4192 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4193 Trace* trace,
4194 GuardedAlternative alternative,
4195 AlternativeGeneration* alt_gen,
4196 int preload_characters,
4197 bool next_expects_preload) {
4198 if (!alt_gen->possible_success.is_linked()) return;
4199
4200 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4201 macro_assembler->Bind(&alt_gen->possible_success);
4202 Trace out_of_line_trace(*trace);
4203 out_of_line_trace.set_characters_preloaded(preload_characters);
4204 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4205 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4206 ZoneList<Guard*>* guards = alternative.guards();
4207 int guard_count = (guards == NULL) ? 0 : guards->length();
4208 if (next_expects_preload) {
4209 Label reload_current_char;
4210 out_of_line_trace.set_backtrack(&reload_current_char);
4211 for (int j = 0; j < guard_count; j++) {
4212 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4213 }
4214 alternative.node()->Emit(compiler, &out_of_line_trace);
4215 macro_assembler->Bind(&reload_current_char);
4216 // Reload the current character, since the next quick check expects that.
4217 // We don't need to check bounds here because we only get into this
4218 // code through a quick check which already did the checked load.
4219 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
4220 NULL,
4221 false,
4222 preload_characters);
4223 macro_assembler->GoTo(&(alt_gen->after));
4224 } else {
4225 out_of_line_trace.set_backtrack(&(alt_gen->after));
4226 for (int j = 0; j < guard_count; j++) {
4227 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4228 }
4229 alternative.node()->Emit(compiler, &out_of_line_trace);
4230 }
4231 }
4232
4233
Emit(RegExpCompiler * compiler,Trace * trace)4234 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4235 RegExpMacroAssembler* assembler = compiler->macro_assembler();
4236 LimitResult limit_result = LimitVersions(compiler, trace);
4237 if (limit_result == DONE) return;
4238 DCHECK(limit_result == CONTINUE);
4239
4240 RecursionCheck rc(compiler);
4241
4242 switch (action_type_) {
4243 case STORE_POSITION: {
4244 Trace::DeferredCapture
4245 new_capture(data_.u_position_register.reg,
4246 data_.u_position_register.is_capture,
4247 trace);
4248 Trace new_trace = *trace;
4249 new_trace.add_action(&new_capture);
4250 on_success()->Emit(compiler, &new_trace);
4251 break;
4252 }
4253 case INCREMENT_REGISTER: {
4254 Trace::DeferredIncrementRegister
4255 new_increment(data_.u_increment_register.reg);
4256 Trace new_trace = *trace;
4257 new_trace.add_action(&new_increment);
4258 on_success()->Emit(compiler, &new_trace);
4259 break;
4260 }
4261 case SET_REGISTER: {
4262 Trace::DeferredSetRegister
4263 new_set(data_.u_store_register.reg, data_.u_store_register.value);
4264 Trace new_trace = *trace;
4265 new_trace.add_action(&new_set);
4266 on_success()->Emit(compiler, &new_trace);
4267 break;
4268 }
4269 case CLEAR_CAPTURES: {
4270 Trace::DeferredClearCaptures
4271 new_capture(Interval(data_.u_clear_captures.range_from,
4272 data_.u_clear_captures.range_to));
4273 Trace new_trace = *trace;
4274 new_trace.add_action(&new_capture);
4275 on_success()->Emit(compiler, &new_trace);
4276 break;
4277 }
4278 case BEGIN_SUBMATCH:
4279 if (!trace->is_trivial()) {
4280 trace->Flush(compiler, this);
4281 } else {
4282 assembler->WriteCurrentPositionToRegister(
4283 data_.u_submatch.current_position_register, 0);
4284 assembler->WriteStackPointerToRegister(
4285 data_.u_submatch.stack_pointer_register);
4286 on_success()->Emit(compiler, trace);
4287 }
4288 break;
4289 case EMPTY_MATCH_CHECK: {
4290 int start_pos_reg = data_.u_empty_match_check.start_register;
4291 int stored_pos = 0;
4292 int rep_reg = data_.u_empty_match_check.repetition_register;
4293 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4294 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4295 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4296 // If we know we haven't advanced and there is no minimum we
4297 // can just backtrack immediately.
4298 assembler->GoTo(trace->backtrack());
4299 } else if (know_dist && stored_pos < trace->cp_offset()) {
4300 // If we know we've advanced we can generate the continuation
4301 // immediately.
4302 on_success()->Emit(compiler, trace);
4303 } else if (!trace->is_trivial()) {
4304 trace->Flush(compiler, this);
4305 } else {
4306 Label skip_empty_check;
4307 // If we have a minimum number of repetitions we check the current
4308 // number first and skip the empty check if it's not enough.
4309 if (has_minimum) {
4310 int limit = data_.u_empty_match_check.repetition_limit;
4311 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4312 }
4313 // If the match is empty we bail out, otherwise we fall through
4314 // to the on-success continuation.
4315 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4316 trace->backtrack());
4317 assembler->Bind(&skip_empty_check);
4318 on_success()->Emit(compiler, trace);
4319 }
4320 break;
4321 }
4322 case POSITIVE_SUBMATCH_SUCCESS: {
4323 if (!trace->is_trivial()) {
4324 trace->Flush(compiler, this);
4325 return;
4326 }
4327 assembler->ReadCurrentPositionFromRegister(
4328 data_.u_submatch.current_position_register);
4329 assembler->ReadStackPointerFromRegister(
4330 data_.u_submatch.stack_pointer_register);
4331 int clear_register_count = data_.u_submatch.clear_register_count;
4332 if (clear_register_count == 0) {
4333 on_success()->Emit(compiler, trace);
4334 return;
4335 }
4336 int clear_registers_from = data_.u_submatch.clear_register_from;
4337 Label clear_registers_backtrack;
4338 Trace new_trace = *trace;
4339 new_trace.set_backtrack(&clear_registers_backtrack);
4340 on_success()->Emit(compiler, &new_trace);
4341
4342 assembler->Bind(&clear_registers_backtrack);
4343 int clear_registers_to = clear_registers_from + clear_register_count - 1;
4344 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4345
4346 DCHECK(trace->backtrack() == NULL);
4347 assembler->Backtrack();
4348 return;
4349 }
4350 default:
4351 UNREACHABLE();
4352 }
4353 }
4354
4355
Emit(RegExpCompiler * compiler,Trace * trace)4356 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4357 RegExpMacroAssembler* assembler = compiler->macro_assembler();
4358 if (!trace->is_trivial()) {
4359 trace->Flush(compiler, this);
4360 return;
4361 }
4362
4363 LimitResult limit_result = LimitVersions(compiler, trace);
4364 if (limit_result == DONE) return;
4365 DCHECK(limit_result == CONTINUE);
4366
4367 RecursionCheck rc(compiler);
4368
4369 DCHECK_EQ(start_reg_ + 1, end_reg_);
4370 if (compiler->ignore_case()) {
4371 assembler->CheckNotBackReferenceIgnoreCase(start_reg_,
4372 trace->backtrack());
4373 } else {
4374 assembler->CheckNotBackReference(start_reg_, trace->backtrack());
4375 }
4376 on_success()->Emit(compiler, trace);
4377 }
4378
4379
4380 // -------------------------------------------------------------------
4381 // Dot/dotty output
4382
4383
4384 #ifdef DEBUG
4385
4386
4387 class DotPrinter: public NodeVisitor {
4388 public:
DotPrinter(OStream & os,bool ignore_case)4389 DotPrinter(OStream& os, bool ignore_case) // NOLINT
4390 : os_(os),
4391 ignore_case_(ignore_case) {}
4392 void PrintNode(const char* label, RegExpNode* node);
4393 void Visit(RegExpNode* node);
4394 void PrintAttributes(RegExpNode* from);
4395 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4396 #define DECLARE_VISIT(Type) \
4397 virtual void Visit##Type(Type##Node* that);
4398 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4399 #undef DECLARE_VISIT
4400 private:
4401 OStream& os_;
4402 bool ignore_case_;
4403 };
4404
4405
PrintNode(const char * label,RegExpNode * node)4406 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4407 os_ << "digraph G {\n graph [label=\"";
4408 for (int i = 0; label[i]; i++) {
4409 switch (label[i]) {
4410 case '\\':
4411 os_ << "\\\\";
4412 break;
4413 case '"':
4414 os_ << "\"";
4415 break;
4416 default:
4417 os_ << label[i];
4418 break;
4419 }
4420 }
4421 os_ << "\"];\n";
4422 Visit(node);
4423 os_ << "}" << endl;
4424 }
4425
4426
Visit(RegExpNode * node)4427 void DotPrinter::Visit(RegExpNode* node) {
4428 if (node->info()->visited) return;
4429 node->info()->visited = true;
4430 node->Accept(this);
4431 }
4432
4433
PrintOnFailure(RegExpNode * from,RegExpNode * on_failure)4434 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4435 os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n";
4436 Visit(on_failure);
4437 }
4438
4439
4440 class TableEntryBodyPrinter {
4441 public:
TableEntryBodyPrinter(OStream & os,ChoiceNode * choice)4442 TableEntryBodyPrinter(OStream& os, ChoiceNode* choice) // NOLINT
4443 : os_(os),
4444 choice_(choice) {}
Call(uc16 from,DispatchTable::Entry entry)4445 void Call(uc16 from, DispatchTable::Entry entry) {
4446 OutSet* out_set = entry.out_set();
4447 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4448 if (out_set->Get(i)) {
4449 os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"
4450 << choice()->alternatives()->at(i).node() << ";\n";
4451 }
4452 }
4453 }
4454 private:
choice()4455 ChoiceNode* choice() { return choice_; }
4456 OStream& os_;
4457 ChoiceNode* choice_;
4458 };
4459
4460
4461 class TableEntryHeaderPrinter {
4462 public:
TableEntryHeaderPrinter(OStream & os)4463 explicit TableEntryHeaderPrinter(OStream& os) // NOLINT
4464 : first_(true),
4465 os_(os) {}
Call(uc16 from,DispatchTable::Entry entry)4466 void Call(uc16 from, DispatchTable::Entry entry) {
4467 if (first_) {
4468 first_ = false;
4469 } else {
4470 os_ << "|";
4471 }
4472 os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
4473 OutSet* out_set = entry.out_set();
4474 int priority = 0;
4475 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4476 if (out_set->Get(i)) {
4477 if (priority > 0) os_ << "|";
4478 os_ << "<s" << from << "o" << i << "> " << priority;
4479 priority++;
4480 }
4481 }
4482 os_ << "}}";
4483 }
4484
4485 private:
4486 bool first_;
4487 OStream& os_;
4488 };
4489
4490
4491 class AttributePrinter {
4492 public:
AttributePrinter(OStream & os)4493 explicit AttributePrinter(OStream& os) // NOLINT
4494 : os_(os),
4495 first_(true) {}
PrintSeparator()4496 void PrintSeparator() {
4497 if (first_) {
4498 first_ = false;
4499 } else {
4500 os_ << "|";
4501 }
4502 }
PrintBit(const char * name,bool value)4503 void PrintBit(const char* name, bool value) {
4504 if (!value) return;
4505 PrintSeparator();
4506 os_ << "{" << name << "}";
4507 }
PrintPositive(const char * name,int value)4508 void PrintPositive(const char* name, int value) {
4509 if (value < 0) return;
4510 PrintSeparator();
4511 os_ << "{" << name << "|" << value << "}";
4512 }
4513
4514 private:
4515 OStream& os_;
4516 bool first_;
4517 };
4518
4519
PrintAttributes(RegExpNode * that)4520 void DotPrinter::PrintAttributes(RegExpNode* that) {
4521 os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
4522 << "margin=0.1, fontsize=10, label=\"{";
4523 AttributePrinter printer(os_);
4524 NodeInfo* info = that->info();
4525 printer.PrintBit("NI", info->follows_newline_interest);
4526 printer.PrintBit("WI", info->follows_word_interest);
4527 printer.PrintBit("SI", info->follows_start_interest);
4528 Label* label = that->label();
4529 if (label->is_bound())
4530 printer.PrintPositive("@", label->pos());
4531 os_ << "}\"];\n"
4532 << " a" << that << " -> n" << that
4533 << " [style=dashed, color=grey, arrowhead=none];\n";
4534 }
4535
4536
4537 static const bool kPrintDispatchTable = false;
VisitChoice(ChoiceNode * that)4538 void DotPrinter::VisitChoice(ChoiceNode* that) {
4539 if (kPrintDispatchTable) {
4540 os_ << " n" << that << " [shape=Mrecord, label=\"";
4541 TableEntryHeaderPrinter header_printer(os_);
4542 that->GetTable(ignore_case_)->ForEach(&header_printer);
4543 os_ << "\"]\n";
4544 PrintAttributes(that);
4545 TableEntryBodyPrinter body_printer(os_, that);
4546 that->GetTable(ignore_case_)->ForEach(&body_printer);
4547 } else {
4548 os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
4549 for (int i = 0; i < that->alternatives()->length(); i++) {
4550 GuardedAlternative alt = that->alternatives()->at(i);
4551 os_ << " n" << that << " -> n" << alt.node();
4552 }
4553 }
4554 for (int i = 0; i < that->alternatives()->length(); i++) {
4555 GuardedAlternative alt = that->alternatives()->at(i);
4556 alt.node()->Accept(this);
4557 }
4558 }
4559
4560
VisitText(TextNode * that)4561 void DotPrinter::VisitText(TextNode* that) {
4562 Zone* zone = that->zone();
4563 os_ << " n" << that << " [label=\"";
4564 for (int i = 0; i < that->elements()->length(); i++) {
4565 if (i > 0) os_ << " ";
4566 TextElement elm = that->elements()->at(i);
4567 switch (elm.text_type()) {
4568 case TextElement::ATOM: {
4569 Vector<const uc16> data = elm.atom()->data();
4570 for (int i = 0; i < data.length(); i++) {
4571 os_ << static_cast<char>(data[i]);
4572 }
4573 break;
4574 }
4575 case TextElement::CHAR_CLASS: {
4576 RegExpCharacterClass* node = elm.char_class();
4577 os_ << "[";
4578 if (node->is_negated()) os_ << "^";
4579 for (int j = 0; j < node->ranges(zone)->length(); j++) {
4580 CharacterRange range = node->ranges(zone)->at(j);
4581 os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
4582 }
4583 os_ << "]";
4584 break;
4585 }
4586 default:
4587 UNREACHABLE();
4588 }
4589 }
4590 os_ << "\", shape=box, peripheries=2];\n";
4591 PrintAttributes(that);
4592 os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4593 Visit(that->on_success());
4594 }
4595
4596
VisitBackReference(BackReferenceNode * that)4597 void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4598 os_ << " n" << that << " [label=\"$" << that->start_register() << "..$"
4599 << that->end_register() << "\", shape=doubleoctagon];\n";
4600 PrintAttributes(that);
4601 os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4602 Visit(that->on_success());
4603 }
4604
4605
VisitEnd(EndNode * that)4606 void DotPrinter::VisitEnd(EndNode* that) {
4607 os_ << " n" << that << " [style=bold, shape=point];\n";
4608 PrintAttributes(that);
4609 }
4610
4611
VisitAssertion(AssertionNode * that)4612 void DotPrinter::VisitAssertion(AssertionNode* that) {
4613 os_ << " n" << that << " [";
4614 switch (that->assertion_type()) {
4615 case AssertionNode::AT_END:
4616 os_ << "label=\"$\", shape=septagon";
4617 break;
4618 case AssertionNode::AT_START:
4619 os_ << "label=\"^\", shape=septagon";
4620 break;
4621 case AssertionNode::AT_BOUNDARY:
4622 os_ << "label=\"\\b\", shape=septagon";
4623 break;
4624 case AssertionNode::AT_NON_BOUNDARY:
4625 os_ << "label=\"\\B\", shape=septagon";
4626 break;
4627 case AssertionNode::AFTER_NEWLINE:
4628 os_ << "label=\"(?<=\\n)\", shape=septagon";
4629 break;
4630 }
4631 os_ << "];\n";
4632 PrintAttributes(that);
4633 RegExpNode* successor = that->on_success();
4634 os_ << " n" << that << " -> n" << successor << ";\n";
4635 Visit(successor);
4636 }
4637
4638
VisitAction(ActionNode * that)4639 void DotPrinter::VisitAction(ActionNode* that) {
4640 os_ << " n" << that << " [";
4641 switch (that->action_type_) {
4642 case ActionNode::SET_REGISTER:
4643 os_ << "label=\"$" << that->data_.u_store_register.reg
4644 << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
4645 break;
4646 case ActionNode::INCREMENT_REGISTER:
4647 os_ << "label=\"$" << that->data_.u_increment_register.reg
4648 << "++\", shape=octagon";
4649 break;
4650 case ActionNode::STORE_POSITION:
4651 os_ << "label=\"$" << that->data_.u_position_register.reg
4652 << ":=$pos\", shape=octagon";
4653 break;
4654 case ActionNode::BEGIN_SUBMATCH:
4655 os_ << "label=\"$" << that->data_.u_submatch.current_position_register
4656 << ":=$pos,begin\", shape=septagon";
4657 break;
4658 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4659 os_ << "label=\"escape\", shape=septagon";
4660 break;
4661 case ActionNode::EMPTY_MATCH_CHECK:
4662 os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
4663 << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
4664 << "<" << that->data_.u_empty_match_check.repetition_limit
4665 << "?\", shape=septagon";
4666 break;
4667 case ActionNode::CLEAR_CAPTURES: {
4668 os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
4669 << " to $" << that->data_.u_clear_captures.range_to
4670 << "\", shape=septagon";
4671 break;
4672 }
4673 }
4674 os_ << "];\n";
4675 PrintAttributes(that);
4676 RegExpNode* successor = that->on_success();
4677 os_ << " n" << that << " -> n" << successor << ";\n";
4678 Visit(successor);
4679 }
4680
4681
4682 class DispatchTableDumper {
4683 public:
DispatchTableDumper(OStream & os)4684 explicit DispatchTableDumper(OStream& os) : os_(os) {}
4685 void Call(uc16 key, DispatchTable::Entry entry);
4686 private:
4687 OStream& os_;
4688 };
4689
4690
Call(uc16 key,DispatchTable::Entry entry)4691 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4692 os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
4693 OutSet* set = entry.out_set();
4694 bool first = true;
4695 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4696 if (set->Get(i)) {
4697 if (first) {
4698 first = false;
4699 } else {
4700 os_ << ", ";
4701 }
4702 os_ << i;
4703 }
4704 }
4705 os_ << "}\n";
4706 }
4707
4708
Dump()4709 void DispatchTable::Dump() {
4710 OFStream os(stderr);
4711 DispatchTableDumper dumper(os);
4712 tree()->ForEach(&dumper);
4713 }
4714
4715
DotPrint(const char * label,RegExpNode * node,bool ignore_case)4716 void RegExpEngine::DotPrint(const char* label,
4717 RegExpNode* node,
4718 bool ignore_case) {
4719 OFStream os(stdout);
4720 DotPrinter printer(os, ignore_case);
4721 printer.PrintNode(label, node);
4722 }
4723
4724
4725 #endif // DEBUG
4726
4727
4728 // -------------------------------------------------------------------
4729 // Tree to graph conversion
4730
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4731 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4732 RegExpNode* on_success) {
4733 ZoneList<TextElement>* elms =
4734 new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4735 elms->Add(TextElement::Atom(this), compiler->zone());
4736 return new(compiler->zone()) TextNode(elms, on_success);
4737 }
4738
4739
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4740 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4741 RegExpNode* on_success) {
4742 return new(compiler->zone()) TextNode(elements(), on_success);
4743 }
4744
4745
CompareInverseRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)4746 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4747 const int* special_class,
4748 int length) {
4749 length--; // Remove final 0x10000.
4750 DCHECK(special_class[length] == 0x10000);
4751 DCHECK(ranges->length() != 0);
4752 DCHECK(length != 0);
4753 DCHECK(special_class[0] != 0);
4754 if (ranges->length() != (length >> 1) + 1) {
4755 return false;
4756 }
4757 CharacterRange range = ranges->at(0);
4758 if (range.from() != 0) {
4759 return false;
4760 }
4761 for (int i = 0; i < length; i += 2) {
4762 if (special_class[i] != (range.to() + 1)) {
4763 return false;
4764 }
4765 range = ranges->at((i >> 1) + 1);
4766 if (special_class[i+1] != range.from()) {
4767 return false;
4768 }
4769 }
4770 if (range.to() != 0xffff) {
4771 return false;
4772 }
4773 return true;
4774 }
4775
4776
CompareRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)4777 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4778 const int* special_class,
4779 int length) {
4780 length--; // Remove final 0x10000.
4781 DCHECK(special_class[length] == 0x10000);
4782 if (ranges->length() * 2 != length) {
4783 return false;
4784 }
4785 for (int i = 0; i < length; i += 2) {
4786 CharacterRange range = ranges->at(i >> 1);
4787 if (range.from() != special_class[i] ||
4788 range.to() != special_class[i + 1] - 1) {
4789 return false;
4790 }
4791 }
4792 return true;
4793 }
4794
4795
is_standard(Zone * zone)4796 bool RegExpCharacterClass::is_standard(Zone* zone) {
4797 // TODO(lrn): Remove need for this function, by not throwing away information
4798 // along the way.
4799 if (is_negated_) {
4800 return false;
4801 }
4802 if (set_.is_standard()) {
4803 return true;
4804 }
4805 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4806 set_.set_standard_set_type('s');
4807 return true;
4808 }
4809 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4810 set_.set_standard_set_type('S');
4811 return true;
4812 }
4813 if (CompareInverseRanges(set_.ranges(zone),
4814 kLineTerminatorRanges,
4815 kLineTerminatorRangeCount)) {
4816 set_.set_standard_set_type('.');
4817 return true;
4818 }
4819 if (CompareRanges(set_.ranges(zone),
4820 kLineTerminatorRanges,
4821 kLineTerminatorRangeCount)) {
4822 set_.set_standard_set_type('n');
4823 return true;
4824 }
4825 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4826 set_.set_standard_set_type('w');
4827 return true;
4828 }
4829 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4830 set_.set_standard_set_type('W');
4831 return true;
4832 }
4833 return false;
4834 }
4835
4836
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4837 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
4838 RegExpNode* on_success) {
4839 return new(compiler->zone()) TextNode(this, on_success);
4840 }
4841
4842
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4843 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
4844 RegExpNode* on_success) {
4845 ZoneList<RegExpTree*>* alternatives = this->alternatives();
4846 int length = alternatives->length();
4847 ChoiceNode* result =
4848 new(compiler->zone()) ChoiceNode(length, compiler->zone());
4849 for (int i = 0; i < length; i++) {
4850 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
4851 on_success));
4852 result->AddAlternative(alternative);
4853 }
4854 return result;
4855 }
4856
4857
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4858 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
4859 RegExpNode* on_success) {
4860 return ToNode(min(),
4861 max(),
4862 is_greedy(),
4863 body(),
4864 compiler,
4865 on_success);
4866 }
4867
4868
4869 // Scoped object to keep track of how much we unroll quantifier loops in the
4870 // regexp graph generator.
4871 class RegExpExpansionLimiter {
4872 public:
4873 static const int kMaxExpansionFactor = 6;
RegExpExpansionLimiter(RegExpCompiler * compiler,int factor)4874 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
4875 : compiler_(compiler),
4876 saved_expansion_factor_(compiler->current_expansion_factor()),
4877 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
4878 DCHECK(factor > 0);
4879 if (ok_to_expand_) {
4880 if (factor > kMaxExpansionFactor) {
4881 // Avoid integer overflow of the current expansion factor.
4882 ok_to_expand_ = false;
4883 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
4884 } else {
4885 int new_factor = saved_expansion_factor_ * factor;
4886 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
4887 compiler->set_current_expansion_factor(new_factor);
4888 }
4889 }
4890 }
4891
~RegExpExpansionLimiter()4892 ~RegExpExpansionLimiter() {
4893 compiler_->set_current_expansion_factor(saved_expansion_factor_);
4894 }
4895
ok_to_expand()4896 bool ok_to_expand() { return ok_to_expand_; }
4897
4898 private:
4899 RegExpCompiler* compiler_;
4900 int saved_expansion_factor_;
4901 bool ok_to_expand_;
4902
4903 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
4904 };
4905
4906
ToNode(int min,int max,bool is_greedy,RegExpTree * body,RegExpCompiler * compiler,RegExpNode * on_success,bool not_at_start)4907 RegExpNode* RegExpQuantifier::ToNode(int min,
4908 int max,
4909 bool is_greedy,
4910 RegExpTree* body,
4911 RegExpCompiler* compiler,
4912 RegExpNode* on_success,
4913 bool not_at_start) {
4914 // x{f, t} becomes this:
4915 //
4916 // (r++)<-.
4917 // | `
4918 // | (x)
4919 // v ^
4920 // (r=0)-->(?)---/ [if r < t]
4921 // |
4922 // [if r >= f] \----> ...
4923 //
4924
4925 // 15.10.2.5 RepeatMatcher algorithm.
4926 // The parser has already eliminated the case where max is 0. In the case
4927 // where max_match is zero the parser has removed the quantifier if min was
4928 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
4929
4930 // If we know that we cannot match zero length then things are a little
4931 // simpler since we don't need to make the special zero length match check
4932 // from step 2.1. If the min and max are small we can unroll a little in
4933 // this case.
4934 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
4935 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
4936 if (max == 0) return on_success; // This can happen due to recursion.
4937 bool body_can_be_empty = (body->min_match() == 0);
4938 int body_start_reg = RegExpCompiler::kNoRegister;
4939 Interval capture_registers = body->CaptureRegisters();
4940 bool needs_capture_clearing = !capture_registers.is_empty();
4941 Zone* zone = compiler->zone();
4942
4943 if (body_can_be_empty) {
4944 body_start_reg = compiler->AllocateRegister();
4945 } else if (FLAG_regexp_optimization && !needs_capture_clearing) {
4946 // Only unroll if there are no captures and the body can't be
4947 // empty.
4948 {
4949 RegExpExpansionLimiter limiter(
4950 compiler, min + ((max != min) ? 1 : 0));
4951 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
4952 int new_max = (max == kInfinity) ? max : max - min;
4953 // Recurse once to get the loop or optional matches after the fixed
4954 // ones.
4955 RegExpNode* answer = ToNode(
4956 0, new_max, is_greedy, body, compiler, on_success, true);
4957 // Unroll the forced matches from 0 to min. This can cause chains of
4958 // TextNodes (which the parser does not generate). These should be
4959 // combined if it turns out they hinder good code generation.
4960 for (int i = 0; i < min; i++) {
4961 answer = body->ToNode(compiler, answer);
4962 }
4963 return answer;
4964 }
4965 }
4966 if (max <= kMaxUnrolledMaxMatches && min == 0) {
4967 DCHECK(max > 0); // Due to the 'if' above.
4968 RegExpExpansionLimiter limiter(compiler, max);
4969 if (limiter.ok_to_expand()) {
4970 // Unroll the optional matches up to max.
4971 RegExpNode* answer = on_success;
4972 for (int i = 0; i < max; i++) {
4973 ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
4974 if (is_greedy) {
4975 alternation->AddAlternative(
4976 GuardedAlternative(body->ToNode(compiler, answer)));
4977 alternation->AddAlternative(GuardedAlternative(on_success));
4978 } else {
4979 alternation->AddAlternative(GuardedAlternative(on_success));
4980 alternation->AddAlternative(
4981 GuardedAlternative(body->ToNode(compiler, answer)));
4982 }
4983 answer = alternation;
4984 if (not_at_start) alternation->set_not_at_start();
4985 }
4986 return answer;
4987 }
4988 }
4989 }
4990 bool has_min = min > 0;
4991 bool has_max = max < RegExpTree::kInfinity;
4992 bool needs_counter = has_min || has_max;
4993 int reg_ctr = needs_counter
4994 ? compiler->AllocateRegister()
4995 : RegExpCompiler::kNoRegister;
4996 LoopChoiceNode* center = new(zone) LoopChoiceNode(body->min_match() == 0,
4997 zone);
4998 if (not_at_start) center->set_not_at_start();
4999 RegExpNode* loop_return = needs_counter
5000 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
5001 : static_cast<RegExpNode*>(center);
5002 if (body_can_be_empty) {
5003 // If the body can be empty we need to check if it was and then
5004 // backtrack.
5005 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5006 reg_ctr,
5007 min,
5008 loop_return);
5009 }
5010 RegExpNode* body_node = body->ToNode(compiler, loop_return);
5011 if (body_can_be_empty) {
5012 // If the body can be empty we need to store the start position
5013 // so we can bail out if it was empty.
5014 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5015 }
5016 if (needs_capture_clearing) {
5017 // Before entering the body of this loop we need to clear captures.
5018 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5019 }
5020 GuardedAlternative body_alt(body_node);
5021 if (has_max) {
5022 Guard* body_guard =
5023 new(zone) Guard(reg_ctr, Guard::LT, max);
5024 body_alt.AddGuard(body_guard, zone);
5025 }
5026 GuardedAlternative rest_alt(on_success);
5027 if (has_min) {
5028 Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5029 rest_alt.AddGuard(rest_guard, zone);
5030 }
5031 if (is_greedy) {
5032 center->AddLoopAlternative(body_alt);
5033 center->AddContinueAlternative(rest_alt);
5034 } else {
5035 center->AddContinueAlternative(rest_alt);
5036 center->AddLoopAlternative(body_alt);
5037 }
5038 if (needs_counter) {
5039 return ActionNode::SetRegister(reg_ctr, 0, center);
5040 } else {
5041 return center;
5042 }
5043 }
5044
5045
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5046 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5047 RegExpNode* on_success) {
5048 NodeInfo info;
5049 Zone* zone = compiler->zone();
5050
5051 switch (assertion_type()) {
5052 case START_OF_LINE:
5053 return AssertionNode::AfterNewline(on_success);
5054 case START_OF_INPUT:
5055 return AssertionNode::AtStart(on_success);
5056 case BOUNDARY:
5057 return AssertionNode::AtBoundary(on_success);
5058 case NON_BOUNDARY:
5059 return AssertionNode::AtNonBoundary(on_success);
5060 case END_OF_INPUT:
5061 return AssertionNode::AtEnd(on_success);
5062 case END_OF_LINE: {
5063 // Compile $ in multiline regexps as an alternation with a positive
5064 // lookahead in one side and an end-of-input on the other side.
5065 // We need two registers for the lookahead.
5066 int stack_pointer_register = compiler->AllocateRegister();
5067 int position_register = compiler->AllocateRegister();
5068 // The ChoiceNode to distinguish between a newline and end-of-input.
5069 ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5070 // Create a newline atom.
5071 ZoneList<CharacterRange>* newline_ranges =
5072 new(zone) ZoneList<CharacterRange>(3, zone);
5073 CharacterRange::AddClassEscape('n', newline_ranges, zone);
5074 RegExpCharacterClass* newline_atom = new(zone) RegExpCharacterClass('n');
5075 TextNode* newline_matcher = new(zone) TextNode(
5076 newline_atom,
5077 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
5078 position_register,
5079 0, // No captures inside.
5080 -1, // Ignored if no captures.
5081 on_success));
5082 // Create an end-of-input matcher.
5083 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5084 stack_pointer_register,
5085 position_register,
5086 newline_matcher);
5087 // Add the two alternatives to the ChoiceNode.
5088 GuardedAlternative eol_alternative(end_of_line);
5089 result->AddAlternative(eol_alternative);
5090 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5091 result->AddAlternative(end_alternative);
5092 return result;
5093 }
5094 default:
5095 UNREACHABLE();
5096 }
5097 return on_success;
5098 }
5099
5100
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5101 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5102 RegExpNode* on_success) {
5103 return new(compiler->zone())
5104 BackReferenceNode(RegExpCapture::StartRegister(index()),
5105 RegExpCapture::EndRegister(index()),
5106 on_success);
5107 }
5108
5109
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5110 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5111 RegExpNode* on_success) {
5112 return on_success;
5113 }
5114
5115
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5116 RegExpNode* RegExpLookahead::ToNode(RegExpCompiler* compiler,
5117 RegExpNode* on_success) {
5118 int stack_pointer_register = compiler->AllocateRegister();
5119 int position_register = compiler->AllocateRegister();
5120
5121 const int registers_per_capture = 2;
5122 const int register_of_first_capture = 2;
5123 int register_count = capture_count_ * registers_per_capture;
5124 int register_start =
5125 register_of_first_capture + capture_from_ * registers_per_capture;
5126
5127 RegExpNode* success;
5128 if (is_positive()) {
5129 RegExpNode* node = ActionNode::BeginSubmatch(
5130 stack_pointer_register,
5131 position_register,
5132 body()->ToNode(
5133 compiler,
5134 ActionNode::PositiveSubmatchSuccess(stack_pointer_register,
5135 position_register,
5136 register_count,
5137 register_start,
5138 on_success)));
5139 return node;
5140 } else {
5141 // We use a ChoiceNode for a negative lookahead because it has most of
5142 // the characteristics we need. It has the body of the lookahead as its
5143 // first alternative and the expression after the lookahead of the second
5144 // alternative. If the first alternative succeeds then the
5145 // NegativeSubmatchSuccess will unwind the stack including everything the
5146 // choice node set up and backtrack. If the first alternative fails then
5147 // the second alternative is tried, which is exactly the desired result
5148 // for a negative lookahead. The NegativeLookaheadChoiceNode is a special
5149 // ChoiceNode that knows to ignore the first exit when calculating quick
5150 // checks.
5151 Zone* zone = compiler->zone();
5152
5153 GuardedAlternative body_alt(
5154 body()->ToNode(
5155 compiler,
5156 success = new(zone) NegativeSubmatchSuccess(stack_pointer_register,
5157 position_register,
5158 register_count,
5159 register_start,
5160 zone)));
5161 ChoiceNode* choice_node =
5162 new(zone) NegativeLookaheadChoiceNode(body_alt,
5163 GuardedAlternative(on_success),
5164 zone);
5165 return ActionNode::BeginSubmatch(stack_pointer_register,
5166 position_register,
5167 choice_node);
5168 }
5169 }
5170
5171
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5172 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5173 RegExpNode* on_success) {
5174 return ToNode(body(), index(), compiler, on_success);
5175 }
5176
5177
ToNode(RegExpTree * body,int index,RegExpCompiler * compiler,RegExpNode * on_success)5178 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5179 int index,
5180 RegExpCompiler* compiler,
5181 RegExpNode* on_success) {
5182 int start_reg = RegExpCapture::StartRegister(index);
5183 int end_reg = RegExpCapture::EndRegister(index);
5184 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5185 RegExpNode* body_node = body->ToNode(compiler, store_end);
5186 return ActionNode::StorePosition(start_reg, true, body_node);
5187 }
5188
5189
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5190 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5191 RegExpNode* on_success) {
5192 ZoneList<RegExpTree*>* children = nodes();
5193 RegExpNode* current = on_success;
5194 for (int i = children->length() - 1; i >= 0; i--) {
5195 current = children->at(i)->ToNode(compiler, current);
5196 }
5197 return current;
5198 }
5199
5200
AddClass(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)5201 static void AddClass(const int* elmv,
5202 int elmc,
5203 ZoneList<CharacterRange>* ranges,
5204 Zone* zone) {
5205 elmc--;
5206 DCHECK(elmv[elmc] == 0x10000);
5207 for (int i = 0; i < elmc; i += 2) {
5208 DCHECK(elmv[i] < elmv[i + 1]);
5209 ranges->Add(CharacterRange(elmv[i], elmv[i + 1] - 1), zone);
5210 }
5211 }
5212
5213
AddClassNegated(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)5214 static void AddClassNegated(const int *elmv,
5215 int elmc,
5216 ZoneList<CharacterRange>* ranges,
5217 Zone* zone) {
5218 elmc--;
5219 DCHECK(elmv[elmc] == 0x10000);
5220 DCHECK(elmv[0] != 0x0000);
5221 DCHECK(elmv[elmc-1] != String::kMaxUtf16CodeUnit);
5222 uc16 last = 0x0000;
5223 for (int i = 0; i < elmc; i += 2) {
5224 DCHECK(last <= elmv[i] - 1);
5225 DCHECK(elmv[i] < elmv[i + 1]);
5226 ranges->Add(CharacterRange(last, elmv[i] - 1), zone);
5227 last = elmv[i + 1];
5228 }
5229 ranges->Add(CharacterRange(last, String::kMaxUtf16CodeUnit), zone);
5230 }
5231
5232
AddClassEscape(uc16 type,ZoneList<CharacterRange> * ranges,Zone * zone)5233 void CharacterRange::AddClassEscape(uc16 type,
5234 ZoneList<CharacterRange>* ranges,
5235 Zone* zone) {
5236 switch (type) {
5237 case 's':
5238 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5239 break;
5240 case 'S':
5241 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5242 break;
5243 case 'w':
5244 AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5245 break;
5246 case 'W':
5247 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5248 break;
5249 case 'd':
5250 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5251 break;
5252 case 'D':
5253 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5254 break;
5255 case '.':
5256 AddClassNegated(kLineTerminatorRanges,
5257 kLineTerminatorRangeCount,
5258 ranges,
5259 zone);
5260 break;
5261 // This is not a character range as defined by the spec but a
5262 // convenient shorthand for a character class that matches any
5263 // character.
5264 case '*':
5265 ranges->Add(CharacterRange::Everything(), zone);
5266 break;
5267 // This is the set of characters matched by the $ and ^ symbols
5268 // in multiline mode.
5269 case 'n':
5270 AddClass(kLineTerminatorRanges,
5271 kLineTerminatorRangeCount,
5272 ranges,
5273 zone);
5274 break;
5275 default:
5276 UNREACHABLE();
5277 }
5278 }
5279
5280
GetWordBounds()5281 Vector<const int> CharacterRange::GetWordBounds() {
5282 return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5283 }
5284
5285
5286 class CharacterRangeSplitter {
5287 public:
CharacterRangeSplitter(ZoneList<CharacterRange> ** included,ZoneList<CharacterRange> ** excluded,Zone * zone)5288 CharacterRangeSplitter(ZoneList<CharacterRange>** included,
5289 ZoneList<CharacterRange>** excluded,
5290 Zone* zone)
5291 : included_(included),
5292 excluded_(excluded),
5293 zone_(zone) { }
5294 void Call(uc16 from, DispatchTable::Entry entry);
5295
5296 static const int kInBase = 0;
5297 static const int kInOverlay = 1;
5298
5299 private:
5300 ZoneList<CharacterRange>** included_;
5301 ZoneList<CharacterRange>** excluded_;
5302 Zone* zone_;
5303 };
5304
5305
Call(uc16 from,DispatchTable::Entry entry)5306 void CharacterRangeSplitter::Call(uc16 from, DispatchTable::Entry entry) {
5307 if (!entry.out_set()->Get(kInBase)) return;
5308 ZoneList<CharacterRange>** target = entry.out_set()->Get(kInOverlay)
5309 ? included_
5310 : excluded_;
5311 if (*target == NULL) *target = new(zone_) ZoneList<CharacterRange>(2, zone_);
5312 (*target)->Add(CharacterRange(entry.from(), entry.to()), zone_);
5313 }
5314
5315
Split(ZoneList<CharacterRange> * base,Vector<const int> overlay,ZoneList<CharacterRange> ** included,ZoneList<CharacterRange> ** excluded,Zone * zone)5316 void CharacterRange::Split(ZoneList<CharacterRange>* base,
5317 Vector<const int> overlay,
5318 ZoneList<CharacterRange>** included,
5319 ZoneList<CharacterRange>** excluded,
5320 Zone* zone) {
5321 DCHECK_EQ(NULL, *included);
5322 DCHECK_EQ(NULL, *excluded);
5323 DispatchTable table(zone);
5324 for (int i = 0; i < base->length(); i++)
5325 table.AddRange(base->at(i), CharacterRangeSplitter::kInBase, zone);
5326 for (int i = 0; i < overlay.length(); i += 2) {
5327 table.AddRange(CharacterRange(overlay[i], overlay[i + 1] - 1),
5328 CharacterRangeSplitter::kInOverlay, zone);
5329 }
5330 CharacterRangeSplitter callback(included, excluded, zone);
5331 table.ForEach(&callback);
5332 }
5333
5334
AddCaseEquivalents(ZoneList<CharacterRange> * ranges,bool is_one_byte,Zone * zone)5335 void CharacterRange::AddCaseEquivalents(ZoneList<CharacterRange>* ranges,
5336 bool is_one_byte, Zone* zone) {
5337 Isolate* isolate = zone->isolate();
5338 uc16 bottom = from();
5339 uc16 top = to();
5340 if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) {
5341 if (bottom > String::kMaxOneByteCharCode) return;
5342 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
5343 }
5344 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5345 if (top == bottom) {
5346 // If this is a singleton we just expand the one character.
5347 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
5348 for (int i = 0; i < length; i++) {
5349 uc32 chr = chars[i];
5350 if (chr != bottom) {
5351 ranges->Add(CharacterRange::Singleton(chars[i]), zone);
5352 }
5353 }
5354 } else {
5355 // If this is a range we expand the characters block by block,
5356 // expanding contiguous subranges (blocks) one at a time.
5357 // The approach is as follows. For a given start character we
5358 // look up the remainder of the block that contains it (represented
5359 // by the end point), for instance we find 'z' if the character
5360 // is 'c'. A block is characterized by the property
5361 // that all characters uncanonicalize in the same way, except that
5362 // each entry in the result is incremented by the distance from the first
5363 // element. So a-z is a block because 'a' uncanonicalizes to ['a', 'A'] and
5364 // the k'th letter uncanonicalizes to ['a' + k, 'A' + k].
5365 // Once we've found the end point we look up its uncanonicalization
5366 // and produce a range for each element. For instance for [c-f]
5367 // we look up ['z', 'Z'] and produce [c-f] and [C-F]. We then only
5368 // add a range if it is not already contained in the input, so [c-f]
5369 // will be skipped but [C-F] will be added. If this range is not
5370 // completely contained in a block we do this for all the blocks
5371 // covered by the range (handling characters that is not in a block
5372 // as a "singleton block").
5373 unibrow::uchar range[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5374 int pos = bottom;
5375 while (pos <= top) {
5376 int length = isolate->jsregexp_canonrange()->get(pos, '\0', range);
5377 uc16 block_end;
5378 if (length == 0) {
5379 block_end = pos;
5380 } else {
5381 DCHECK_EQ(1, length);
5382 block_end = range[0];
5383 }
5384 int end = (block_end > top) ? top : block_end;
5385 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0', range);
5386 for (int i = 0; i < length; i++) {
5387 uc32 c = range[i];
5388 uc16 range_from = c - (block_end - pos);
5389 uc16 range_to = c - (block_end - end);
5390 if (!(bottom <= range_from && range_to <= top)) {
5391 ranges->Add(CharacterRange(range_from, range_to), zone);
5392 }
5393 }
5394 pos = end + 1;
5395 }
5396 }
5397 }
5398
5399
IsCanonical(ZoneList<CharacterRange> * ranges)5400 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
5401 DCHECK_NOT_NULL(ranges);
5402 int n = ranges->length();
5403 if (n <= 1) return true;
5404 int max = ranges->at(0).to();
5405 for (int i = 1; i < n; i++) {
5406 CharacterRange next_range = ranges->at(i);
5407 if (next_range.from() <= max + 1) return false;
5408 max = next_range.to();
5409 }
5410 return true;
5411 }
5412
5413
ranges(Zone * zone)5414 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
5415 if (ranges_ == NULL) {
5416 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5417 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
5418 }
5419 return ranges_;
5420 }
5421
5422
5423 // Move a number of elements in a zonelist to another position
5424 // in the same list. Handles overlapping source and target areas.
MoveRanges(ZoneList<CharacterRange> * list,int from,int to,int count)5425 static void MoveRanges(ZoneList<CharacterRange>* list,
5426 int from,
5427 int to,
5428 int count) {
5429 // Ranges are potentially overlapping.
5430 if (from < to) {
5431 for (int i = count - 1; i >= 0; i--) {
5432 list->at(to + i) = list->at(from + i);
5433 }
5434 } else {
5435 for (int i = 0; i < count; i++) {
5436 list->at(to + i) = list->at(from + i);
5437 }
5438 }
5439 }
5440
5441
InsertRangeInCanonicalList(ZoneList<CharacterRange> * list,int count,CharacterRange insert)5442 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
5443 int count,
5444 CharacterRange insert) {
5445 // Inserts a range into list[0..count[, which must be sorted
5446 // by from value and non-overlapping and non-adjacent, using at most
5447 // list[0..count] for the result. Returns the number of resulting
5448 // canonicalized ranges. Inserting a range may collapse existing ranges into
5449 // fewer ranges, so the return value can be anything in the range 1..count+1.
5450 uc16 from = insert.from();
5451 uc16 to = insert.to();
5452 int start_pos = 0;
5453 int end_pos = count;
5454 for (int i = count - 1; i >= 0; i--) {
5455 CharacterRange current = list->at(i);
5456 if (current.from() > to + 1) {
5457 end_pos = i;
5458 } else if (current.to() + 1 < from) {
5459 start_pos = i + 1;
5460 break;
5461 }
5462 }
5463
5464 // Inserted range overlaps, or is adjacent to, ranges at positions
5465 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
5466 // not affected by the insertion.
5467 // If start_pos == end_pos, the range must be inserted before start_pos.
5468 // if start_pos < end_pos, the entire range from start_pos to end_pos
5469 // must be merged with the insert range.
5470
5471 if (start_pos == end_pos) {
5472 // Insert between existing ranges at position start_pos.
5473 if (start_pos < count) {
5474 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
5475 }
5476 list->at(start_pos) = insert;
5477 return count + 1;
5478 }
5479 if (start_pos + 1 == end_pos) {
5480 // Replace single existing range at position start_pos.
5481 CharacterRange to_replace = list->at(start_pos);
5482 int new_from = Min(to_replace.from(), from);
5483 int new_to = Max(to_replace.to(), to);
5484 list->at(start_pos) = CharacterRange(new_from, new_to);
5485 return count;
5486 }
5487 // Replace a number of existing ranges from start_pos to end_pos - 1.
5488 // Move the remaining ranges down.
5489
5490 int new_from = Min(list->at(start_pos).from(), from);
5491 int new_to = Max(list->at(end_pos - 1).to(), to);
5492 if (end_pos < count) {
5493 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
5494 }
5495 list->at(start_pos) = CharacterRange(new_from, new_to);
5496 return count - (end_pos - start_pos) + 1;
5497 }
5498
5499
Canonicalize()5500 void CharacterSet::Canonicalize() {
5501 // Special/default classes are always considered canonical. The result
5502 // of calling ranges() will be sorted.
5503 if (ranges_ == NULL) return;
5504 CharacterRange::Canonicalize(ranges_);
5505 }
5506
5507
Canonicalize(ZoneList<CharacterRange> * character_ranges)5508 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
5509 if (character_ranges->length() <= 1) return;
5510 // Check whether ranges are already canonical (increasing, non-overlapping,
5511 // non-adjacent).
5512 int n = character_ranges->length();
5513 int max = character_ranges->at(0).to();
5514 int i = 1;
5515 while (i < n) {
5516 CharacterRange current = character_ranges->at(i);
5517 if (current.from() <= max + 1) {
5518 break;
5519 }
5520 max = current.to();
5521 i++;
5522 }
5523 // Canonical until the i'th range. If that's all of them, we are done.
5524 if (i == n) return;
5525
5526 // The ranges at index i and forward are not canonicalized. Make them so by
5527 // doing the equivalent of insertion sort (inserting each into the previous
5528 // list, in order).
5529 // Notice that inserting a range can reduce the number of ranges in the
5530 // result due to combining of adjacent and overlapping ranges.
5531 int read = i; // Range to insert.
5532 int num_canonical = i; // Length of canonicalized part of list.
5533 do {
5534 num_canonical = InsertRangeInCanonicalList(character_ranges,
5535 num_canonical,
5536 character_ranges->at(read));
5537 read++;
5538 } while (read < n);
5539 character_ranges->Rewind(num_canonical);
5540
5541 DCHECK(CharacterRange::IsCanonical(character_ranges));
5542 }
5543
5544
Negate(ZoneList<CharacterRange> * ranges,ZoneList<CharacterRange> * negated_ranges,Zone * zone)5545 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
5546 ZoneList<CharacterRange>* negated_ranges,
5547 Zone* zone) {
5548 DCHECK(CharacterRange::IsCanonical(ranges));
5549 DCHECK_EQ(0, negated_ranges->length());
5550 int range_count = ranges->length();
5551 uc16 from = 0;
5552 int i = 0;
5553 if (range_count > 0 && ranges->at(0).from() == 0) {
5554 from = ranges->at(0).to();
5555 i = 1;
5556 }
5557 while (i < range_count) {
5558 CharacterRange range = ranges->at(i);
5559 negated_ranges->Add(CharacterRange(from + 1, range.from() - 1), zone);
5560 from = range.to();
5561 i++;
5562 }
5563 if (from < String::kMaxUtf16CodeUnit) {
5564 negated_ranges->Add(CharacterRange(from + 1, String::kMaxUtf16CodeUnit),
5565 zone);
5566 }
5567 }
5568
5569
5570 // -------------------------------------------------------------------
5571 // Splay tree
5572
5573
Extend(unsigned value,Zone * zone)5574 OutSet* OutSet::Extend(unsigned value, Zone* zone) {
5575 if (Get(value))
5576 return this;
5577 if (successors(zone) != NULL) {
5578 for (int i = 0; i < successors(zone)->length(); i++) {
5579 OutSet* successor = successors(zone)->at(i);
5580 if (successor->Get(value))
5581 return successor;
5582 }
5583 } else {
5584 successors_ = new(zone) ZoneList<OutSet*>(2, zone);
5585 }
5586 OutSet* result = new(zone) OutSet(first_, remaining_);
5587 result->Set(value, zone);
5588 successors(zone)->Add(result, zone);
5589 return result;
5590 }
5591
5592
Set(unsigned value,Zone * zone)5593 void OutSet::Set(unsigned value, Zone *zone) {
5594 if (value < kFirstLimit) {
5595 first_ |= (1 << value);
5596 } else {
5597 if (remaining_ == NULL)
5598 remaining_ = new(zone) ZoneList<unsigned>(1, zone);
5599 if (remaining_->is_empty() || !remaining_->Contains(value))
5600 remaining_->Add(value, zone);
5601 }
5602 }
5603
5604
Get(unsigned value) const5605 bool OutSet::Get(unsigned value) const {
5606 if (value < kFirstLimit) {
5607 return (first_ & (1 << value)) != 0;
5608 } else if (remaining_ == NULL) {
5609 return false;
5610 } else {
5611 return remaining_->Contains(value);
5612 }
5613 }
5614
5615
5616 const uc16 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
5617
5618
AddRange(CharacterRange full_range,int value,Zone * zone)5619 void DispatchTable::AddRange(CharacterRange full_range, int value,
5620 Zone* zone) {
5621 CharacterRange current = full_range;
5622 if (tree()->is_empty()) {
5623 // If this is the first range we just insert into the table.
5624 ZoneSplayTree<Config>::Locator loc;
5625 DCHECK_RESULT(tree()->Insert(current.from(), &loc));
5626 loc.set_value(Entry(current.from(), current.to(),
5627 empty()->Extend(value, zone)));
5628 return;
5629 }
5630 // First see if there is a range to the left of this one that
5631 // overlaps.
5632 ZoneSplayTree<Config>::Locator loc;
5633 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
5634 Entry* entry = &loc.value();
5635 // If we've found a range that overlaps with this one, and it
5636 // starts strictly to the left of this one, we have to fix it
5637 // because the following code only handles ranges that start on
5638 // or after the start point of the range we're adding.
5639 if (entry->from() < current.from() && entry->to() >= current.from()) {
5640 // Snap the overlapping range in half around the start point of
5641 // the range we're adding.
5642 CharacterRange left(entry->from(), current.from() - 1);
5643 CharacterRange right(current.from(), entry->to());
5644 // The left part of the overlapping range doesn't overlap.
5645 // Truncate the whole entry to be just the left part.
5646 entry->set_to(left.to());
5647 // The right part is the one that overlaps. We add this part
5648 // to the map and let the next step deal with merging it with
5649 // the range we're adding.
5650 ZoneSplayTree<Config>::Locator loc;
5651 DCHECK_RESULT(tree()->Insert(right.from(), &loc));
5652 loc.set_value(Entry(right.from(),
5653 right.to(),
5654 entry->out_set()));
5655 }
5656 }
5657 while (current.is_valid()) {
5658 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
5659 (loc.value().from() <= current.to()) &&
5660 (loc.value().to() >= current.from())) {
5661 Entry* entry = &loc.value();
5662 // We have overlap. If there is space between the start point of
5663 // the range we're adding and where the overlapping range starts
5664 // then we have to add a range covering just that space.
5665 if (current.from() < entry->from()) {
5666 ZoneSplayTree<Config>::Locator ins;
5667 DCHECK_RESULT(tree()->Insert(current.from(), &ins));
5668 ins.set_value(Entry(current.from(),
5669 entry->from() - 1,
5670 empty()->Extend(value, zone)));
5671 current.set_from(entry->from());
5672 }
5673 DCHECK_EQ(current.from(), entry->from());
5674 // If the overlapping range extends beyond the one we want to add
5675 // we have to snap the right part off and add it separately.
5676 if (entry->to() > current.to()) {
5677 ZoneSplayTree<Config>::Locator ins;
5678 DCHECK_RESULT(tree()->Insert(current.to() + 1, &ins));
5679 ins.set_value(Entry(current.to() + 1,
5680 entry->to(),
5681 entry->out_set()));
5682 entry->set_to(current.to());
5683 }
5684 DCHECK(entry->to() <= current.to());
5685 // The overlapping range is now completely contained by the range
5686 // we're adding so we can just update it and move the start point
5687 // of the range we're adding just past it.
5688 entry->AddValue(value, zone);
5689 // Bail out if the last interval ended at 0xFFFF since otherwise
5690 // adding 1 will wrap around to 0.
5691 if (entry->to() == String::kMaxUtf16CodeUnit)
5692 break;
5693 DCHECK(entry->to() + 1 > current.from());
5694 current.set_from(entry->to() + 1);
5695 } else {
5696 // There is no overlap so we can just add the range
5697 ZoneSplayTree<Config>::Locator ins;
5698 DCHECK_RESULT(tree()->Insert(current.from(), &ins));
5699 ins.set_value(Entry(current.from(),
5700 current.to(),
5701 empty()->Extend(value, zone)));
5702 break;
5703 }
5704 }
5705 }
5706
5707
Get(uc16 value)5708 OutSet* DispatchTable::Get(uc16 value) {
5709 ZoneSplayTree<Config>::Locator loc;
5710 if (!tree()->FindGreatestLessThan(value, &loc))
5711 return empty();
5712 Entry* entry = &loc.value();
5713 if (value <= entry->to())
5714 return entry->out_set();
5715 else
5716 return empty();
5717 }
5718
5719
5720 // -------------------------------------------------------------------
5721 // Analysis
5722
5723
EnsureAnalyzed(RegExpNode * that)5724 void Analysis::EnsureAnalyzed(RegExpNode* that) {
5725 StackLimitCheck check(that->zone()->isolate());
5726 if (check.HasOverflowed()) {
5727 fail("Stack overflow");
5728 return;
5729 }
5730 if (that->info()->been_analyzed || that->info()->being_analyzed)
5731 return;
5732 that->info()->being_analyzed = true;
5733 that->Accept(this);
5734 that->info()->being_analyzed = false;
5735 that->info()->been_analyzed = true;
5736 }
5737
5738
VisitEnd(EndNode * that)5739 void Analysis::VisitEnd(EndNode* that) {
5740 // nothing to do
5741 }
5742
5743
CalculateOffsets()5744 void TextNode::CalculateOffsets() {
5745 int element_count = elements()->length();
5746 // Set up the offsets of the elements relative to the start. This is a fixed
5747 // quantity since a TextNode can only contain fixed-width things.
5748 int cp_offset = 0;
5749 for (int i = 0; i < element_count; i++) {
5750 TextElement& elm = elements()->at(i);
5751 elm.set_cp_offset(cp_offset);
5752 cp_offset += elm.length();
5753 }
5754 }
5755
5756
VisitText(TextNode * that)5757 void Analysis::VisitText(TextNode* that) {
5758 if (ignore_case_) {
5759 that->MakeCaseIndependent(is_one_byte_);
5760 }
5761 EnsureAnalyzed(that->on_success());
5762 if (!has_failed()) {
5763 that->CalculateOffsets();
5764 }
5765 }
5766
5767
VisitAction(ActionNode * that)5768 void Analysis::VisitAction(ActionNode* that) {
5769 RegExpNode* target = that->on_success();
5770 EnsureAnalyzed(target);
5771 if (!has_failed()) {
5772 // If the next node is interested in what it follows then this node
5773 // has to be interested too so it can pass the information on.
5774 that->info()->AddFromFollowing(target->info());
5775 }
5776 }
5777
5778
VisitChoice(ChoiceNode * that)5779 void Analysis::VisitChoice(ChoiceNode* that) {
5780 NodeInfo* info = that->info();
5781 for (int i = 0; i < that->alternatives()->length(); i++) {
5782 RegExpNode* node = that->alternatives()->at(i).node();
5783 EnsureAnalyzed(node);
5784 if (has_failed()) return;
5785 // Anything the following nodes need to know has to be known by
5786 // this node also, so it can pass it on.
5787 info->AddFromFollowing(node->info());
5788 }
5789 }
5790
5791
VisitLoopChoice(LoopChoiceNode * that)5792 void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
5793 NodeInfo* info = that->info();
5794 for (int i = 0; i < that->alternatives()->length(); i++) {
5795 RegExpNode* node = that->alternatives()->at(i).node();
5796 if (node != that->loop_node()) {
5797 EnsureAnalyzed(node);
5798 if (has_failed()) return;
5799 info->AddFromFollowing(node->info());
5800 }
5801 }
5802 // Check the loop last since it may need the value of this node
5803 // to get a correct result.
5804 EnsureAnalyzed(that->loop_node());
5805 if (!has_failed()) {
5806 info->AddFromFollowing(that->loop_node()->info());
5807 }
5808 }
5809
5810
VisitBackReference(BackReferenceNode * that)5811 void Analysis::VisitBackReference(BackReferenceNode* that) {
5812 EnsureAnalyzed(that->on_success());
5813 }
5814
5815
VisitAssertion(AssertionNode * that)5816 void Analysis::VisitAssertion(AssertionNode* that) {
5817 EnsureAnalyzed(that->on_success());
5818 }
5819
5820
FillInBMInfo(int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)5821 void BackReferenceNode::FillInBMInfo(int offset,
5822 int budget,
5823 BoyerMooreLookahead* bm,
5824 bool not_at_start) {
5825 // Working out the set of characters that a backreference can match is too
5826 // hard, so we just say that any character can match.
5827 bm->SetRest(offset);
5828 SaveBMInfo(bm, not_at_start, offset);
5829 }
5830
5831
5832 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
5833 RegExpMacroAssembler::kTableSize);
5834
5835
FillInBMInfo(int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)5836 void ChoiceNode::FillInBMInfo(int offset,
5837 int budget,
5838 BoyerMooreLookahead* bm,
5839 bool not_at_start) {
5840 ZoneList<GuardedAlternative>* alts = alternatives();
5841 budget = (budget - 1) / alts->length();
5842 for (int i = 0; i < alts->length(); i++) {
5843 GuardedAlternative& alt = alts->at(i);
5844 if (alt.guards() != NULL && alt.guards()->length() != 0) {
5845 bm->SetRest(offset); // Give up trying to fill in info.
5846 SaveBMInfo(bm, not_at_start, offset);
5847 return;
5848 }
5849 alt.node()->FillInBMInfo(offset, budget, bm, not_at_start);
5850 }
5851 SaveBMInfo(bm, not_at_start, offset);
5852 }
5853
5854
FillInBMInfo(int initial_offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)5855 void TextNode::FillInBMInfo(int initial_offset,
5856 int budget,
5857 BoyerMooreLookahead* bm,
5858 bool not_at_start) {
5859 if (initial_offset >= bm->length()) return;
5860 int offset = initial_offset;
5861 int max_char = bm->max_char();
5862 for (int i = 0; i < elements()->length(); i++) {
5863 if (offset >= bm->length()) {
5864 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5865 return;
5866 }
5867 TextElement text = elements()->at(i);
5868 if (text.text_type() == TextElement::ATOM) {
5869 RegExpAtom* atom = text.atom();
5870 for (int j = 0; j < atom->length(); j++, offset++) {
5871 if (offset >= bm->length()) {
5872 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5873 return;
5874 }
5875 uc16 character = atom->data()[j];
5876 if (bm->compiler()->ignore_case()) {
5877 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5878 int length = GetCaseIndependentLetters(
5879 Isolate::Current(),
5880 character,
5881 bm->max_char() == String::kMaxOneByteCharCode,
5882 chars);
5883 for (int j = 0; j < length; j++) {
5884 bm->Set(offset, chars[j]);
5885 }
5886 } else {
5887 if (character <= max_char) bm->Set(offset, character);
5888 }
5889 }
5890 } else {
5891 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
5892 RegExpCharacterClass* char_class = text.char_class();
5893 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
5894 if (char_class->is_negated()) {
5895 bm->SetAll(offset);
5896 } else {
5897 for (int k = 0; k < ranges->length(); k++) {
5898 CharacterRange& range = ranges->at(k);
5899 if (range.from() > max_char) continue;
5900 int to = Min(max_char, static_cast<int>(range.to()));
5901 bm->SetInterval(offset, Interval(range.from(), to));
5902 }
5903 }
5904 offset++;
5905 }
5906 }
5907 if (offset >= bm->length()) {
5908 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5909 return;
5910 }
5911 on_success()->FillInBMInfo(offset,
5912 budget - 1,
5913 bm,
5914 true); // Not at start after a text node.
5915 if (initial_offset == 0) set_bm_info(not_at_start, bm);
5916 }
5917
5918
5919 // -------------------------------------------------------------------
5920 // Dispatch table construction
5921
5922
VisitEnd(EndNode * that)5923 void DispatchTableConstructor::VisitEnd(EndNode* that) {
5924 AddRange(CharacterRange::Everything());
5925 }
5926
5927
BuildTable(ChoiceNode * node)5928 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
5929 node->set_being_calculated(true);
5930 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
5931 for (int i = 0; i < alternatives->length(); i++) {
5932 set_choice_index(i);
5933 alternatives->at(i).node()->Accept(this);
5934 }
5935 node->set_being_calculated(false);
5936 }
5937
5938
5939 class AddDispatchRange {
5940 public:
AddDispatchRange(DispatchTableConstructor * constructor)5941 explicit AddDispatchRange(DispatchTableConstructor* constructor)
5942 : constructor_(constructor) { }
5943 void Call(uc32 from, DispatchTable::Entry entry);
5944 private:
5945 DispatchTableConstructor* constructor_;
5946 };
5947
5948
Call(uc32 from,DispatchTable::Entry entry)5949 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
5950 CharacterRange range(from, entry.to());
5951 constructor_->AddRange(range);
5952 }
5953
5954
VisitChoice(ChoiceNode * node)5955 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
5956 if (node->being_calculated())
5957 return;
5958 DispatchTable* table = node->GetTable(ignore_case_);
5959 AddDispatchRange adder(this);
5960 table->ForEach(&adder);
5961 }
5962
5963
VisitBackReference(BackReferenceNode * that)5964 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
5965 // TODO(160): Find the node that we refer back to and propagate its start
5966 // set back to here. For now we just accept anything.
5967 AddRange(CharacterRange::Everything());
5968 }
5969
5970
VisitAssertion(AssertionNode * that)5971 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
5972 RegExpNode* target = that->on_success();
5973 target->Accept(this);
5974 }
5975
5976
CompareRangeByFrom(const CharacterRange * a,const CharacterRange * b)5977 static int CompareRangeByFrom(const CharacterRange* a,
5978 const CharacterRange* b) {
5979 return Compare<uc16>(a->from(), b->from());
5980 }
5981
5982
AddInverse(ZoneList<CharacterRange> * ranges)5983 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
5984 ranges->Sort(CompareRangeByFrom);
5985 uc16 last = 0;
5986 for (int i = 0; i < ranges->length(); i++) {
5987 CharacterRange range = ranges->at(i);
5988 if (last < range.from())
5989 AddRange(CharacterRange(last, range.from() - 1));
5990 if (range.to() >= last) {
5991 if (range.to() == String::kMaxUtf16CodeUnit) {
5992 return;
5993 } else {
5994 last = range.to() + 1;
5995 }
5996 }
5997 }
5998 AddRange(CharacterRange(last, String::kMaxUtf16CodeUnit));
5999 }
6000
6001
VisitText(TextNode * that)6002 void DispatchTableConstructor::VisitText(TextNode* that) {
6003 TextElement elm = that->elements()->at(0);
6004 switch (elm.text_type()) {
6005 case TextElement::ATOM: {
6006 uc16 c = elm.atom()->data()[0];
6007 AddRange(CharacterRange(c, c));
6008 break;
6009 }
6010 case TextElement::CHAR_CLASS: {
6011 RegExpCharacterClass* tree = elm.char_class();
6012 ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
6013 if (tree->is_negated()) {
6014 AddInverse(ranges);
6015 } else {
6016 for (int i = 0; i < ranges->length(); i++)
6017 AddRange(ranges->at(i));
6018 }
6019 break;
6020 }
6021 default: {
6022 UNIMPLEMENTED();
6023 }
6024 }
6025 }
6026
6027
VisitAction(ActionNode * that)6028 void DispatchTableConstructor::VisitAction(ActionNode* that) {
6029 RegExpNode* target = that->on_success();
6030 target->Accept(this);
6031 }
6032
6033
Compile(RegExpCompileData * data,bool ignore_case,bool is_global,bool is_multiline,bool is_sticky,Handle<String> pattern,Handle<String> sample_subject,bool is_one_byte,Zone * zone)6034 RegExpEngine::CompilationResult RegExpEngine::Compile(
6035 RegExpCompileData* data, bool ignore_case, bool is_global,
6036 bool is_multiline, bool is_sticky, Handle<String> pattern,
6037 Handle<String> sample_subject, bool is_one_byte, Zone* zone) {
6038 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6039 return IrregexpRegExpTooBig(zone->isolate());
6040 }
6041 RegExpCompiler compiler(data->capture_count, ignore_case, is_one_byte, zone);
6042
6043 // Sample some characters from the middle of the string.
6044 static const int kSampleSize = 128;
6045
6046 sample_subject = String::Flatten(sample_subject);
6047 int chars_sampled = 0;
6048 int half_way = (sample_subject->length() - kSampleSize) / 2;
6049 for (int i = Max(0, half_way);
6050 i < sample_subject->length() && chars_sampled < kSampleSize;
6051 i++, chars_sampled++) {
6052 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6053 }
6054
6055 // Wrap the body of the regexp in capture #0.
6056 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6057 0,
6058 &compiler,
6059 compiler.accept());
6060 RegExpNode* node = captured_body;
6061 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6062 bool is_start_anchored = data->tree->IsAnchoredAtStart();
6063 int max_length = data->tree->max_match();
6064 if (!is_start_anchored && !is_sticky) {
6065 // Add a .*? at the beginning, outside the body capture, unless
6066 // this expression is anchored at the beginning or sticky.
6067 RegExpNode* loop_node =
6068 RegExpQuantifier::ToNode(0,
6069 RegExpTree::kInfinity,
6070 false,
6071 new(zone) RegExpCharacterClass('*'),
6072 &compiler,
6073 captured_body,
6074 data->contains_anchor);
6075
6076 if (data->contains_anchor) {
6077 // Unroll loop once, to take care of the case that might start
6078 // at the start of input.
6079 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6080 first_step_node->AddAlternative(GuardedAlternative(captured_body));
6081 first_step_node->AddAlternative(GuardedAlternative(
6082 new(zone) TextNode(new(zone) RegExpCharacterClass('*'), loop_node)));
6083 node = first_step_node;
6084 } else {
6085 node = loop_node;
6086 }
6087 }
6088 if (is_one_byte) {
6089 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6090 // Do it again to propagate the new nodes to places where they were not
6091 // put because they had not been calculated yet.
6092 if (node != NULL) {
6093 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6094 }
6095 }
6096
6097 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
6098 data->node = node;
6099 Analysis analysis(ignore_case, is_one_byte);
6100 analysis.EnsureAnalyzed(node);
6101 if (analysis.has_failed()) {
6102 const char* error_message = analysis.error_message();
6103 return CompilationResult(zone->isolate(), error_message);
6104 }
6105
6106 // Create the correct assembler for the architecture.
6107 #ifndef V8_INTERPRETED_REGEXP
6108 // Native regexp implementation.
6109
6110 NativeRegExpMacroAssembler::Mode mode =
6111 is_one_byte ? NativeRegExpMacroAssembler::LATIN1
6112 : NativeRegExpMacroAssembler::UC16;
6113
6114 #if V8_TARGET_ARCH_IA32
6115 RegExpMacroAssemblerIA32 macro_assembler(mode, (data->capture_count + 1) * 2,
6116 zone);
6117 #elif V8_TARGET_ARCH_X64
6118 RegExpMacroAssemblerX64 macro_assembler(mode, (data->capture_count + 1) * 2,
6119 zone);
6120 #elif V8_TARGET_ARCH_ARM
6121 RegExpMacroAssemblerARM macro_assembler(mode, (data->capture_count + 1) * 2,
6122 zone);
6123 #elif V8_TARGET_ARCH_ARM64
6124 RegExpMacroAssemblerARM64 macro_assembler(mode, (data->capture_count + 1) * 2,
6125 zone);
6126 #elif V8_TARGET_ARCH_MIPS
6127 RegExpMacroAssemblerMIPS macro_assembler(mode, (data->capture_count + 1) * 2,
6128 zone);
6129 #elif V8_TARGET_ARCH_MIPS64
6130 RegExpMacroAssemblerMIPS macro_assembler(mode, (data->capture_count + 1) * 2,
6131 zone);
6132 #elif V8_TARGET_ARCH_X87
6133 RegExpMacroAssemblerX87 macro_assembler(mode, (data->capture_count + 1) * 2,
6134 zone);
6135 #else
6136 #error "Unsupported architecture"
6137 #endif
6138
6139 #else // V8_INTERPRETED_REGEXP
6140 // Interpreted regexp implementation.
6141 EmbeddedVector<byte, 1024> codes;
6142 RegExpMacroAssemblerIrregexp macro_assembler(codes, zone);
6143 #endif // V8_INTERPRETED_REGEXP
6144
6145 // Inserted here, instead of in Assembler, because it depends on information
6146 // in the AST that isn't replicated in the Node structure.
6147 static const int kMaxBacksearchLimit = 1024;
6148 if (is_end_anchored &&
6149 !is_start_anchored &&
6150 max_length < kMaxBacksearchLimit) {
6151 macro_assembler.SetCurrentPositionFromEnd(max_length);
6152 }
6153
6154 if (is_global) {
6155 macro_assembler.set_global_mode(
6156 (data->tree->min_match() > 0)
6157 ? RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK
6158 : RegExpMacroAssembler::GLOBAL);
6159 }
6160
6161 return compiler.Assemble(¯o_assembler,
6162 node,
6163 data->capture_count,
6164 pattern);
6165 }
6166
6167
6168 }} // namespace v8::internal
6169