1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/regexp/jsregexp.h"
6
7 #include <memory>
8
9 #include "src/base/platform/platform.h"
10 #include "src/compilation-cache.h"
11 #include "src/elements.h"
12 #include "src/execution.h"
13 #include "src/factory.h"
14 #include "src/isolate-inl.h"
15 #include "src/messages.h"
16 #include "src/ostreams.h"
17 #include "src/regexp/interpreter-irregexp.h"
18 #include "src/regexp/jsregexp-inl.h"
19 #include "src/regexp/regexp-macro-assembler-irregexp.h"
20 #include "src/regexp/regexp-macro-assembler-tracer.h"
21 #include "src/regexp/regexp-macro-assembler.h"
22 #include "src/regexp/regexp-parser.h"
23 #include "src/regexp/regexp-stack.h"
24 #include "src/runtime/runtime.h"
25 #include "src/splay-tree-inl.h"
26 #include "src/string-search.h"
27 #include "src/unicode-decoder.h"
28
29 #ifdef V8_I18N_SUPPORT
30 #include "unicode/uset.h"
31 #include "unicode/utypes.h"
32 #endif // V8_I18N_SUPPORT
33
34 #ifndef V8_INTERPRETED_REGEXP
35 #if V8_TARGET_ARCH_IA32
36 #include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
37 #elif V8_TARGET_ARCH_X64
38 #include "src/regexp/x64/regexp-macro-assembler-x64.h"
39 #elif V8_TARGET_ARCH_ARM64
40 #include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
41 #elif V8_TARGET_ARCH_ARM
42 #include "src/regexp/arm/regexp-macro-assembler-arm.h"
43 #elif V8_TARGET_ARCH_PPC
44 #include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
45 #elif V8_TARGET_ARCH_S390
46 #include "src/regexp/s390/regexp-macro-assembler-s390.h"
47 #elif V8_TARGET_ARCH_MIPS
48 #include "src/regexp/mips/regexp-macro-assembler-mips.h"
49 #elif V8_TARGET_ARCH_MIPS64
50 #include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
51 #elif V8_TARGET_ARCH_X87
52 #include "src/regexp/x87/regexp-macro-assembler-x87.h"
53 #else
54 #error Unsupported target architecture.
55 #endif
56 #endif
57
58
59 namespace v8 {
60 namespace internal {
61
62 MUST_USE_RESULT
ThrowRegExpException(Handle<JSRegExp> re,Handle<String> pattern,Handle<String> error_text)63 static inline MaybeHandle<Object> ThrowRegExpException(
64 Handle<JSRegExp> re, Handle<String> pattern, Handle<String> error_text) {
65 Isolate* isolate = re->GetIsolate();
66 THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
67 pattern, error_text),
68 Object);
69 }
70
71
ThrowRegExpException(Handle<JSRegExp> re,Handle<String> error_text)72 inline void ThrowRegExpException(Handle<JSRegExp> re,
73 Handle<String> error_text) {
74 USE(ThrowRegExpException(re, Handle<String>(re->Pattern()), error_text));
75 }
76
77
AddRange(ContainedInLattice containment,const int * ranges,int ranges_length,Interval new_range)78 ContainedInLattice AddRange(ContainedInLattice containment,
79 const int* ranges,
80 int ranges_length,
81 Interval new_range) {
82 DCHECK((ranges_length & 1) == 1);
83 DCHECK(ranges[ranges_length - 1] == String::kMaxCodePoint + 1);
84 if (containment == kLatticeUnknown) return containment;
85 bool inside = false;
86 int last = 0;
87 for (int i = 0; i < ranges_length; inside = !inside, last = ranges[i], i++) {
88 // Consider the range from last to ranges[i].
89 // We haven't got to the new range yet.
90 if (ranges[i] <= new_range.from()) continue;
91 // New range is wholly inside last-ranges[i]. Note that new_range.to() is
92 // inclusive, but the values in ranges are not.
93 if (last <= new_range.from() && new_range.to() < ranges[i]) {
94 return Combine(containment, inside ? kLatticeIn : kLatticeOut);
95 }
96 return kLatticeUnknown;
97 }
98 return containment;
99 }
100
101
102 // More makes code generation slower, less makes V8 benchmark score lower.
103 const int kMaxLookaheadForBoyerMoore = 8;
104 // In a 3-character pattern you can maximally step forwards 3 characters
105 // at a time, which is not always enough to pay for the extra logic.
106 const int kPatternTooShortForBoyerMoore = 2;
107
108
109 // Identifies the sort of regexps where the regexp engine is faster
110 // than the code used for atom matches.
HasFewDifferentCharacters(Handle<String> pattern)111 static bool HasFewDifferentCharacters(Handle<String> pattern) {
112 int length = Min(kMaxLookaheadForBoyerMoore, pattern->length());
113 if (length <= kPatternTooShortForBoyerMoore) return false;
114 const int kMod = 128;
115 bool character_found[kMod];
116 int different = 0;
117 memset(&character_found[0], 0, sizeof(character_found));
118 for (int i = 0; i < length; i++) {
119 int ch = (pattern->Get(i) & (kMod - 1));
120 if (!character_found[ch]) {
121 character_found[ch] = true;
122 different++;
123 // We declare a regexp low-alphabet if it has at least 3 times as many
124 // characters as it has different characters.
125 if (different * 3 > length) return false;
126 }
127 }
128 return true;
129 }
130
131
132 // Generic RegExp methods. Dispatches to implementation specific methods.
133
134
Compile(Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags)135 MaybeHandle<Object> RegExpImpl::Compile(Handle<JSRegExp> re,
136 Handle<String> pattern,
137 JSRegExp::Flags flags) {
138 Isolate* isolate = re->GetIsolate();
139 Zone zone(isolate->allocator(), ZONE_NAME);
140 CompilationCache* compilation_cache = isolate->compilation_cache();
141 MaybeHandle<FixedArray> maybe_cached =
142 compilation_cache->LookupRegExp(pattern, flags);
143 Handle<FixedArray> cached;
144 if (maybe_cached.ToHandle(&cached)) {
145 re->set_data(*cached);
146 return re;
147 }
148 pattern = String::Flatten(pattern);
149 PostponeInterruptsScope postpone(isolate);
150 RegExpCompileData parse_result;
151 FlatStringReader reader(isolate, pattern);
152 if (!RegExpParser::ParseRegExp(re->GetIsolate(), &zone, &reader, flags,
153 &parse_result)) {
154 // Throw an exception if we fail to parse the pattern.
155 return ThrowRegExpException(re, pattern, parse_result.error);
156 }
157
158 bool has_been_compiled = false;
159
160 if (parse_result.simple && !(flags & JSRegExp::kIgnoreCase) &&
161 !(flags & JSRegExp::kSticky) && !HasFewDifferentCharacters(pattern)) {
162 // Parse-tree is a single atom that is equal to the pattern.
163 AtomCompile(re, pattern, flags, pattern);
164 has_been_compiled = true;
165 } else if (parse_result.tree->IsAtom() && !(flags & JSRegExp::kIgnoreCase) &&
166 !(flags & JSRegExp::kSticky) && parse_result.capture_count == 0) {
167 RegExpAtom* atom = parse_result.tree->AsAtom();
168 Vector<const uc16> atom_pattern = atom->data();
169 Handle<String> atom_string;
170 ASSIGN_RETURN_ON_EXCEPTION(
171 isolate, atom_string,
172 isolate->factory()->NewStringFromTwoByte(atom_pattern),
173 Object);
174 if (!HasFewDifferentCharacters(atom_string)) {
175 AtomCompile(re, pattern, flags, atom_string);
176 has_been_compiled = true;
177 }
178 }
179 if (!has_been_compiled) {
180 IrregexpInitialize(re, pattern, flags, parse_result.capture_count);
181 }
182 DCHECK(re->data()->IsFixedArray());
183 // Compilation succeeded so the data is set on the regexp
184 // and we can store it in the cache.
185 Handle<FixedArray> data(FixedArray::cast(re->data()));
186 compilation_cache->PutRegExp(pattern, flags, data);
187
188 return re;
189 }
190
Exec(Handle<JSRegExp> regexp,Handle<String> subject,int index,Handle<RegExpMatchInfo> last_match_info)191 MaybeHandle<Object> RegExpImpl::Exec(Handle<JSRegExp> regexp,
192 Handle<String> subject, int index,
193 Handle<RegExpMatchInfo> last_match_info) {
194 switch (regexp->TypeTag()) {
195 case JSRegExp::ATOM:
196 return AtomExec(regexp, subject, index, last_match_info);
197 case JSRegExp::IRREGEXP: {
198 return IrregexpExec(regexp, subject, index, last_match_info);
199 }
200 default:
201 UNREACHABLE();
202 return MaybeHandle<Object>();
203 }
204 }
205
206
207 // RegExp Atom implementation: Simple string search using indexOf.
208
209
AtomCompile(Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,Handle<String> match_pattern)210 void RegExpImpl::AtomCompile(Handle<JSRegExp> re,
211 Handle<String> pattern,
212 JSRegExp::Flags flags,
213 Handle<String> match_pattern) {
214 re->GetIsolate()->factory()->SetRegExpAtomData(re,
215 JSRegExp::ATOM,
216 pattern,
217 flags,
218 match_pattern);
219 }
220
SetAtomLastCapture(Handle<RegExpMatchInfo> last_match_info,String * subject,int from,int to)221 static void SetAtomLastCapture(Handle<RegExpMatchInfo> last_match_info,
222 String* subject, int from, int to) {
223 SealHandleScope shs(last_match_info->GetIsolate());
224 last_match_info->SetNumberOfCaptureRegisters(2);
225 last_match_info->SetLastSubject(subject);
226 last_match_info->SetLastInput(subject);
227 last_match_info->SetCapture(0, from);
228 last_match_info->SetCapture(1, to);
229 }
230
231
AtomExecRaw(Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)232 int RegExpImpl::AtomExecRaw(Handle<JSRegExp> regexp,
233 Handle<String> subject,
234 int index,
235 int32_t* output,
236 int output_size) {
237 Isolate* isolate = regexp->GetIsolate();
238
239 DCHECK(0 <= index);
240 DCHECK(index <= subject->length());
241
242 subject = String::Flatten(subject);
243 DisallowHeapAllocation no_gc; // ensure vectors stay valid
244
245 String* needle = String::cast(regexp->DataAt(JSRegExp::kAtomPatternIndex));
246 int needle_len = needle->length();
247 DCHECK(needle->IsFlat());
248 DCHECK_LT(0, needle_len);
249
250 if (index + needle_len > subject->length()) {
251 return RegExpImpl::RE_FAILURE;
252 }
253
254 for (int i = 0; i < output_size; i += 2) {
255 String::FlatContent needle_content = needle->GetFlatContent();
256 String::FlatContent subject_content = subject->GetFlatContent();
257 DCHECK(needle_content.IsFlat());
258 DCHECK(subject_content.IsFlat());
259 // dispatch on type of strings
260 index =
261 (needle_content.IsOneByte()
262 ? (subject_content.IsOneByte()
263 ? SearchString(isolate, subject_content.ToOneByteVector(),
264 needle_content.ToOneByteVector(), index)
265 : SearchString(isolate, subject_content.ToUC16Vector(),
266 needle_content.ToOneByteVector(), index))
267 : (subject_content.IsOneByte()
268 ? SearchString(isolate, subject_content.ToOneByteVector(),
269 needle_content.ToUC16Vector(), index)
270 : SearchString(isolate, subject_content.ToUC16Vector(),
271 needle_content.ToUC16Vector(), index)));
272 if (index == -1) {
273 return i / 2; // Return number of matches.
274 } else {
275 output[i] = index;
276 output[i+1] = index + needle_len;
277 index += needle_len;
278 }
279 }
280 return output_size / 2;
281 }
282
AtomExec(Handle<JSRegExp> re,Handle<String> subject,int index,Handle<RegExpMatchInfo> last_match_info)283 Handle<Object> RegExpImpl::AtomExec(Handle<JSRegExp> re, Handle<String> subject,
284 int index,
285 Handle<RegExpMatchInfo> last_match_info) {
286 Isolate* isolate = re->GetIsolate();
287
288 static const int kNumRegisters = 2;
289 STATIC_ASSERT(kNumRegisters <= Isolate::kJSRegexpStaticOffsetsVectorSize);
290 int32_t* output_registers = isolate->jsregexp_static_offsets_vector();
291
292 int res = AtomExecRaw(re, subject, index, output_registers, kNumRegisters);
293
294 if (res == RegExpImpl::RE_FAILURE) return isolate->factory()->null_value();
295
296 DCHECK_EQ(res, RegExpImpl::RE_SUCCESS);
297 SealHandleScope shs(isolate);
298 SetAtomLastCapture(last_match_info, *subject, output_registers[0],
299 output_registers[1]);
300 return last_match_info;
301 }
302
303
304 // Irregexp implementation.
305
306 // Ensures that the regexp object contains a compiled version of the
307 // source for either one-byte or two-byte subject strings.
308 // If the compiled version doesn't already exist, it is compiled
309 // from the source pattern.
310 // If compilation fails, an exception is thrown and this function
311 // returns false.
EnsureCompiledIrregexp(Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)312 bool RegExpImpl::EnsureCompiledIrregexp(Handle<JSRegExp> re,
313 Handle<String> sample_subject,
314 bool is_one_byte) {
315 Object* compiled_code = re->DataAt(JSRegExp::code_index(is_one_byte));
316 #ifdef V8_INTERPRETED_REGEXP
317 if (compiled_code->IsByteArray()) return true;
318 #else // V8_INTERPRETED_REGEXP (RegExp native code)
319 if (compiled_code->IsCode()) return true;
320 #endif
321 // We could potentially have marked this as flushable, but have kept
322 // a saved version if we did not flush it yet.
323 Object* saved_code = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
324 if (saved_code->IsCode()) {
325 // Reinstate the code in the original place.
326 re->SetDataAt(JSRegExp::code_index(is_one_byte), saved_code);
327 DCHECK(compiled_code->IsSmi());
328 return true;
329 }
330 return CompileIrregexp(re, sample_subject, is_one_byte);
331 }
332
333
CompileIrregexp(Handle<JSRegExp> re,Handle<String> sample_subject,bool is_one_byte)334 bool RegExpImpl::CompileIrregexp(Handle<JSRegExp> re,
335 Handle<String> sample_subject,
336 bool is_one_byte) {
337 // Compile the RegExp.
338 Isolate* isolate = re->GetIsolate();
339 Zone zone(isolate->allocator(), ZONE_NAME);
340 PostponeInterruptsScope postpone(isolate);
341 // If we had a compilation error the last time this is saved at the
342 // saved code index.
343 Object* entry = re->DataAt(JSRegExp::code_index(is_one_byte));
344 // When arriving here entry can only be a smi, either representing an
345 // uncompiled regexp, a previous compilation error, or code that has
346 // been flushed.
347 DCHECK(entry->IsSmi());
348 int entry_value = Smi::cast(entry)->value();
349 DCHECK(entry_value == JSRegExp::kUninitializedValue ||
350 entry_value == JSRegExp::kCompilationErrorValue ||
351 (entry_value < JSRegExp::kCodeAgeMask && entry_value >= 0));
352
353 if (entry_value == JSRegExp::kCompilationErrorValue) {
354 // A previous compilation failed and threw an error which we store in
355 // the saved code index (we store the error message, not the actual
356 // error). Recreate the error object and throw it.
357 Object* error_string = re->DataAt(JSRegExp::saved_code_index(is_one_byte));
358 DCHECK(error_string->IsString());
359 Handle<String> error_message(String::cast(error_string));
360 ThrowRegExpException(re, error_message);
361 return false;
362 }
363
364 JSRegExp::Flags flags = re->GetFlags();
365
366 Handle<String> pattern(re->Pattern());
367 pattern = String::Flatten(pattern);
368 RegExpCompileData compile_data;
369 FlatStringReader reader(isolate, pattern);
370 if (!RegExpParser::ParseRegExp(isolate, &zone, &reader, flags,
371 &compile_data)) {
372 // Throw an exception if we fail to parse the pattern.
373 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
374 USE(ThrowRegExpException(re, pattern, compile_data.error));
375 return false;
376 }
377 RegExpEngine::CompilationResult result =
378 RegExpEngine::Compile(isolate, &zone, &compile_data, flags, pattern,
379 sample_subject, is_one_byte);
380 if (result.error_message != NULL) {
381 // Unable to compile regexp.
382 Handle<String> error_message = isolate->factory()->NewStringFromUtf8(
383 CStrVector(result.error_message)).ToHandleChecked();
384 ThrowRegExpException(re, error_message);
385 return false;
386 }
387
388 Handle<FixedArray> data = Handle<FixedArray>(FixedArray::cast(re->data()));
389 data->set(JSRegExp::code_index(is_one_byte), result.code);
390 SetIrregexpCaptureNameMap(*data, compile_data.capture_name_map);
391 int register_max = IrregexpMaxRegisterCount(*data);
392 if (result.num_registers > register_max) {
393 SetIrregexpMaxRegisterCount(*data, result.num_registers);
394 }
395
396 return true;
397 }
398
399
IrregexpMaxRegisterCount(FixedArray * re)400 int RegExpImpl::IrregexpMaxRegisterCount(FixedArray* re) {
401 return Smi::cast(
402 re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
403 }
404
405
SetIrregexpMaxRegisterCount(FixedArray * re,int value)406 void RegExpImpl::SetIrregexpMaxRegisterCount(FixedArray* re, int value) {
407 re->set(JSRegExp::kIrregexpMaxRegisterCountIndex, Smi::FromInt(value));
408 }
409
SetIrregexpCaptureNameMap(FixedArray * re,Handle<FixedArray> value)410 void RegExpImpl::SetIrregexpCaptureNameMap(FixedArray* re,
411 Handle<FixedArray> value) {
412 if (value.is_null()) {
413 re->set(JSRegExp::kIrregexpCaptureNameMapIndex, Smi::kZero);
414 } else {
415 re->set(JSRegExp::kIrregexpCaptureNameMapIndex, *value);
416 }
417 }
418
IrregexpNumberOfCaptures(FixedArray * re)419 int RegExpImpl::IrregexpNumberOfCaptures(FixedArray* re) {
420 return Smi::cast(re->get(JSRegExp::kIrregexpCaptureCountIndex))->value();
421 }
422
423
IrregexpNumberOfRegisters(FixedArray * re)424 int RegExpImpl::IrregexpNumberOfRegisters(FixedArray* re) {
425 return Smi::cast(re->get(JSRegExp::kIrregexpMaxRegisterCountIndex))->value();
426 }
427
428
IrregexpByteCode(FixedArray * re,bool is_one_byte)429 ByteArray* RegExpImpl::IrregexpByteCode(FixedArray* re, bool is_one_byte) {
430 return ByteArray::cast(re->get(JSRegExp::code_index(is_one_byte)));
431 }
432
433
IrregexpNativeCode(FixedArray * re,bool is_one_byte)434 Code* RegExpImpl::IrregexpNativeCode(FixedArray* re, bool is_one_byte) {
435 return Code::cast(re->get(JSRegExp::code_index(is_one_byte)));
436 }
437
438
IrregexpInitialize(Handle<JSRegExp> re,Handle<String> pattern,JSRegExp::Flags flags,int capture_count)439 void RegExpImpl::IrregexpInitialize(Handle<JSRegExp> re,
440 Handle<String> pattern,
441 JSRegExp::Flags flags,
442 int capture_count) {
443 // Initialize compiled code entries to null.
444 re->GetIsolate()->factory()->SetRegExpIrregexpData(re,
445 JSRegExp::IRREGEXP,
446 pattern,
447 flags,
448 capture_count);
449 }
450
451
IrregexpPrepare(Handle<JSRegExp> regexp,Handle<String> subject)452 int RegExpImpl::IrregexpPrepare(Handle<JSRegExp> regexp,
453 Handle<String> subject) {
454 subject = String::Flatten(subject);
455
456 // Check representation of the underlying storage.
457 bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
458 if (!EnsureCompiledIrregexp(regexp, subject, is_one_byte)) return -1;
459
460 #ifdef V8_INTERPRETED_REGEXP
461 // Byte-code regexp needs space allocated for all its registers.
462 // The result captures are copied to the start of the registers array
463 // if the match succeeds. This way those registers are not clobbered
464 // when we set the last match info from last successful match.
465 return IrregexpNumberOfRegisters(FixedArray::cast(regexp->data())) +
466 (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
467 #else // V8_INTERPRETED_REGEXP
468 // Native regexp only needs room to output captures. Registers are handled
469 // internally.
470 return (IrregexpNumberOfCaptures(FixedArray::cast(regexp->data())) + 1) * 2;
471 #endif // V8_INTERPRETED_REGEXP
472 }
473
474
IrregexpExecRaw(Handle<JSRegExp> regexp,Handle<String> subject,int index,int32_t * output,int output_size)475 int RegExpImpl::IrregexpExecRaw(Handle<JSRegExp> regexp,
476 Handle<String> subject,
477 int index,
478 int32_t* output,
479 int output_size) {
480 Isolate* isolate = regexp->GetIsolate();
481
482 Handle<FixedArray> irregexp(FixedArray::cast(regexp->data()), isolate);
483
484 DCHECK(index >= 0);
485 DCHECK(index <= subject->length());
486 DCHECK(subject->IsFlat());
487
488 bool is_one_byte = subject->IsOneByteRepresentationUnderneath();
489
490 #ifndef V8_INTERPRETED_REGEXP
491 DCHECK(output_size >= (IrregexpNumberOfCaptures(*irregexp) + 1) * 2);
492 do {
493 EnsureCompiledIrregexp(regexp, subject, is_one_byte);
494 Handle<Code> code(IrregexpNativeCode(*irregexp, is_one_byte), isolate);
495 // The stack is used to allocate registers for the compiled regexp code.
496 // This means that in case of failure, the output registers array is left
497 // untouched and contains the capture results from the previous successful
498 // match. We can use that to set the last match info lazily.
499 NativeRegExpMacroAssembler::Result res =
500 NativeRegExpMacroAssembler::Match(code,
501 subject,
502 output,
503 output_size,
504 index,
505 isolate);
506 if (res != NativeRegExpMacroAssembler::RETRY) {
507 DCHECK(res != NativeRegExpMacroAssembler::EXCEPTION ||
508 isolate->has_pending_exception());
509 STATIC_ASSERT(
510 static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) == RE_SUCCESS);
511 STATIC_ASSERT(
512 static_cast<int>(NativeRegExpMacroAssembler::FAILURE) == RE_FAILURE);
513 STATIC_ASSERT(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION)
514 == RE_EXCEPTION);
515 return static_cast<IrregexpResult>(res);
516 }
517 // If result is RETRY, the string has changed representation, and we
518 // must restart from scratch.
519 // In this case, it means we must make sure we are prepared to handle
520 // the, potentially, different subject (the string can switch between
521 // being internal and external, and even between being Latin1 and UC16,
522 // but the characters are always the same).
523 IrregexpPrepare(regexp, subject);
524 is_one_byte = subject->IsOneByteRepresentationUnderneath();
525 } while (true);
526 UNREACHABLE();
527 return RE_EXCEPTION;
528 #else // V8_INTERPRETED_REGEXP
529
530 DCHECK(output_size >= IrregexpNumberOfRegisters(*irregexp));
531 // We must have done EnsureCompiledIrregexp, so we can get the number of
532 // registers.
533 int number_of_capture_registers =
534 (IrregexpNumberOfCaptures(*irregexp) + 1) * 2;
535 int32_t* raw_output = &output[number_of_capture_registers];
536 // We do not touch the actual capture result registers until we know there
537 // has been a match so that we can use those capture results to set the
538 // last match info.
539 for (int i = number_of_capture_registers - 1; i >= 0; i--) {
540 raw_output[i] = -1;
541 }
542 Handle<ByteArray> byte_codes(IrregexpByteCode(*irregexp, is_one_byte),
543 isolate);
544
545 IrregexpResult result = IrregexpInterpreter::Match(isolate,
546 byte_codes,
547 subject,
548 raw_output,
549 index);
550 if (result == RE_SUCCESS) {
551 // Copy capture results to the start of the registers array.
552 MemCopy(output, raw_output, number_of_capture_registers * sizeof(int32_t));
553 }
554 if (result == RE_EXCEPTION) {
555 DCHECK(!isolate->has_pending_exception());
556 isolate->StackOverflow();
557 }
558 return result;
559 #endif // V8_INTERPRETED_REGEXP
560 }
561
IrregexpExec(Handle<JSRegExp> regexp,Handle<String> subject,int previous_index,Handle<RegExpMatchInfo> last_match_info)562 MaybeHandle<Object> RegExpImpl::IrregexpExec(
563 Handle<JSRegExp> regexp, Handle<String> subject, int previous_index,
564 Handle<RegExpMatchInfo> last_match_info) {
565 Isolate* isolate = regexp->GetIsolate();
566 DCHECK_EQ(regexp->TypeTag(), JSRegExp::IRREGEXP);
567
568 // Prepare space for the return values.
569 #if defined(V8_INTERPRETED_REGEXP) && defined(DEBUG)
570 if (FLAG_trace_regexp_bytecodes) {
571 String* pattern = regexp->Pattern();
572 PrintF("\n\nRegexp match: /%s/\n\n", pattern->ToCString().get());
573 PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
574 }
575 #endif
576 int required_registers = RegExpImpl::IrregexpPrepare(regexp, subject);
577 if (required_registers < 0) {
578 // Compiling failed with an exception.
579 DCHECK(isolate->has_pending_exception());
580 return MaybeHandle<Object>();
581 }
582
583 int32_t* output_registers = NULL;
584 if (required_registers > Isolate::kJSRegexpStaticOffsetsVectorSize) {
585 output_registers = NewArray<int32_t>(required_registers);
586 }
587 std::unique_ptr<int32_t[]> auto_release(output_registers);
588 if (output_registers == NULL) {
589 output_registers = isolate->jsregexp_static_offsets_vector();
590 }
591
592 int res = RegExpImpl::IrregexpExecRaw(
593 regexp, subject, previous_index, output_registers, required_registers);
594 if (res == RE_SUCCESS) {
595 int capture_count =
596 IrregexpNumberOfCaptures(FixedArray::cast(regexp->data()));
597 return SetLastMatchInfo(
598 last_match_info, subject, capture_count, output_registers);
599 }
600 if (res == RE_EXCEPTION) {
601 DCHECK(isolate->has_pending_exception());
602 return MaybeHandle<Object>();
603 }
604 DCHECK(res == RE_FAILURE);
605 return isolate->factory()->null_value();
606 }
607
SetLastMatchInfo(Handle<RegExpMatchInfo> last_match_info,Handle<String> subject,int capture_count,int32_t * match)608 Handle<RegExpMatchInfo> RegExpImpl::SetLastMatchInfo(
609 Handle<RegExpMatchInfo> last_match_info, Handle<String> subject,
610 int capture_count, int32_t* match) {
611 // This is the only place where match infos can grow. If, after executing the
612 // regexp, RegExpExecStub finds that the match info is too small, it restarts
613 // execution in RegExpImpl::Exec, which finally grows the match info right
614 // here.
615
616 int capture_register_count = (capture_count + 1) * 2;
617 Handle<RegExpMatchInfo> result =
618 RegExpMatchInfo::ReserveCaptures(last_match_info, capture_register_count);
619 result->SetNumberOfCaptureRegisters(capture_register_count);
620
621 if (*result != *last_match_info) {
622 // The match info has been reallocated, update the corresponding reference
623 // on the native context.
624 Isolate* isolate = last_match_info->GetIsolate();
625 if (*last_match_info == *isolate->regexp_last_match_info()) {
626 isolate->native_context()->set_regexp_last_match_info(*result);
627 } else if (*last_match_info == *isolate->regexp_internal_match_info()) {
628 isolate->native_context()->set_regexp_internal_match_info(*result);
629 }
630 }
631
632 DisallowHeapAllocation no_allocation;
633 if (match != NULL) {
634 for (int i = 0; i < capture_register_count; i += 2) {
635 result->SetCapture(i, match[i]);
636 result->SetCapture(i + 1, match[i + 1]);
637 }
638 }
639 result->SetLastSubject(*subject);
640 result->SetLastInput(*subject);
641 return result;
642 }
643
644
GlobalCache(Handle<JSRegExp> regexp,Handle<String> subject,Isolate * isolate)645 RegExpImpl::GlobalCache::GlobalCache(Handle<JSRegExp> regexp,
646 Handle<String> subject,
647 Isolate* isolate)
648 : register_array_(NULL),
649 register_array_size_(0),
650 regexp_(regexp),
651 subject_(subject) {
652 #ifdef V8_INTERPRETED_REGEXP
653 bool interpreted = true;
654 #else
655 bool interpreted = false;
656 #endif // V8_INTERPRETED_REGEXP
657
658 if (regexp_->TypeTag() == JSRegExp::ATOM) {
659 static const int kAtomRegistersPerMatch = 2;
660 registers_per_match_ = kAtomRegistersPerMatch;
661 // There is no distinction between interpreted and native for atom regexps.
662 interpreted = false;
663 } else {
664 registers_per_match_ = RegExpImpl::IrregexpPrepare(regexp_, subject_);
665 if (registers_per_match_ < 0) {
666 num_matches_ = -1; // Signal exception.
667 return;
668 }
669 }
670
671 DCHECK_NE(0, regexp->GetFlags() & JSRegExp::kGlobal);
672 if (!interpreted) {
673 register_array_size_ =
674 Max(registers_per_match_, Isolate::kJSRegexpStaticOffsetsVectorSize);
675 max_matches_ = register_array_size_ / registers_per_match_;
676 } else {
677 // Global loop in interpreted regexp is not implemented. We choose
678 // the size of the offsets vector so that it can only store one match.
679 register_array_size_ = registers_per_match_;
680 max_matches_ = 1;
681 }
682
683 if (register_array_size_ > Isolate::kJSRegexpStaticOffsetsVectorSize) {
684 register_array_ = NewArray<int32_t>(register_array_size_);
685 } else {
686 register_array_ = isolate->jsregexp_static_offsets_vector();
687 }
688
689 // Set state so that fetching the results the first time triggers a call
690 // to the compiled regexp.
691 current_match_index_ = max_matches_ - 1;
692 num_matches_ = max_matches_;
693 DCHECK(registers_per_match_ >= 2); // Each match has at least one capture.
694 DCHECK_GE(register_array_size_, registers_per_match_);
695 int32_t* last_match =
696 ®ister_array_[current_match_index_ * registers_per_match_];
697 last_match[0] = -1;
698 last_match[1] = 0;
699 }
700
AdvanceZeroLength(int last_index)701 int RegExpImpl::GlobalCache::AdvanceZeroLength(int last_index) {
702 if ((regexp_->GetFlags() & JSRegExp::kUnicode) != 0 &&
703 last_index + 1 < subject_->length() &&
704 unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
705 unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
706 // Advance over the surrogate pair.
707 return last_index + 2;
708 }
709 return last_index + 1;
710 }
711
712 // -------------------------------------------------------------------
713 // Implementation of the Irregexp regular expression engine.
714 //
715 // The Irregexp regular expression engine is intended to be a complete
716 // implementation of ECMAScript regular expressions. It generates either
717 // bytecodes or native code.
718
719 // The Irregexp regexp engine is structured in three steps.
720 // 1) The parser generates an abstract syntax tree. See ast.cc.
721 // 2) From the AST a node network is created. The nodes are all
722 // subclasses of RegExpNode. The nodes represent states when
723 // executing a regular expression. Several optimizations are
724 // performed on the node network.
725 // 3) From the nodes we generate either byte codes or native code
726 // that can actually execute the regular expression (perform
727 // the search). The code generation step is described in more
728 // detail below.
729
730 // Code generation.
731 //
732 // The nodes are divided into four main categories.
733 // * Choice nodes
734 // These represent places where the regular expression can
735 // match in more than one way. For example on entry to an
736 // alternation (foo|bar) or a repetition (*, +, ? or {}).
737 // * Action nodes
738 // These represent places where some action should be
739 // performed. Examples include recording the current position
740 // in the input string to a register (in order to implement
741 // captures) or other actions on register for example in order
742 // to implement the counters needed for {} repetitions.
743 // * Matching nodes
744 // These attempt to match some element part of the input string.
745 // Examples of elements include character classes, plain strings
746 // or back references.
747 // * End nodes
748 // These are used to implement the actions required on finding
749 // a successful match or failing to find a match.
750 //
751 // The code generated (whether as byte codes or native code) maintains
752 // some state as it runs. This consists of the following elements:
753 //
754 // * The capture registers. Used for string captures.
755 // * Other registers. Used for counters etc.
756 // * The current position.
757 // * The stack of backtracking information. Used when a matching node
758 // fails to find a match and needs to try an alternative.
759 //
760 // Conceptual regular expression execution model:
761 //
762 // There is a simple conceptual model of regular expression execution
763 // which will be presented first. The actual code generated is a more
764 // efficient simulation of the simple conceptual model:
765 //
766 // * Choice nodes are implemented as follows:
767 // For each choice except the last {
768 // push current position
769 // push backtrack code location
770 // <generate code to test for choice>
771 // backtrack code location:
772 // pop current position
773 // }
774 // <generate code to test for last choice>
775 //
776 // * Actions nodes are generated as follows
777 // <push affected registers on backtrack stack>
778 // <generate code to perform action>
779 // push backtrack code location
780 // <generate code to test for following nodes>
781 // backtrack code location:
782 // <pop affected registers to restore their state>
783 // <pop backtrack location from stack and go to it>
784 //
785 // * Matching nodes are generated as follows:
786 // if input string matches at current position
787 // update current position
788 // <generate code to test for following nodes>
789 // else
790 // <pop backtrack location from stack and go to it>
791 //
792 // Thus it can be seen that the current position is saved and restored
793 // by the choice nodes, whereas the registers are saved and restored by
794 // by the action nodes that manipulate them.
795 //
796 // The other interesting aspect of this model is that nodes are generated
797 // at the point where they are needed by a recursive call to Emit(). If
798 // the node has already been code generated then the Emit() call will
799 // generate a jump to the previously generated code instead. In order to
800 // limit recursion it is possible for the Emit() function to put the node
801 // on a work list for later generation and instead generate a jump. The
802 // destination of the jump is resolved later when the code is generated.
803 //
804 // Actual regular expression code generation.
805 //
806 // Code generation is actually more complicated than the above. In order
807 // to improve the efficiency of the generated code some optimizations are
808 // performed
809 //
810 // * Choice nodes have 1-character lookahead.
811 // A choice node looks at the following character and eliminates some of
812 // the choices immediately based on that character. This is not yet
813 // implemented.
814 // * Simple greedy loops store reduced backtracking information.
815 // A quantifier like /.*foo/m will greedily match the whole input. It will
816 // then need to backtrack to a point where it can match "foo". The naive
817 // implementation of this would push each character position onto the
818 // backtracking stack, then pop them off one by one. This would use space
819 // proportional to the length of the input string. However since the "."
820 // can only match in one way and always has a constant length (in this case
821 // of 1) it suffices to store the current position on the top of the stack
822 // once. Matching now becomes merely incrementing the current position and
823 // backtracking becomes decrementing the current position and checking the
824 // result against the stored current position. This is faster and saves
825 // space.
826 // * The current state is virtualized.
827 // This is used to defer expensive operations until it is clear that they
828 // are needed and to generate code for a node more than once, allowing
829 // specialized an efficient versions of the code to be created. This is
830 // explained in the section below.
831 //
832 // Execution state virtualization.
833 //
834 // Instead of emitting code, nodes that manipulate the state can record their
835 // manipulation in an object called the Trace. The Trace object can record a
836 // current position offset, an optional backtrack code location on the top of
837 // the virtualized backtrack stack and some register changes. When a node is
838 // to be emitted it can flush the Trace or update it. Flushing the Trace
839 // will emit code to bring the actual state into line with the virtual state.
840 // Avoiding flushing the state can postpone some work (e.g. updates of capture
841 // registers). Postponing work can save time when executing the regular
842 // expression since it may be found that the work never has to be done as a
843 // failure to match can occur. In addition it is much faster to jump to a
844 // known backtrack code location than it is to pop an unknown backtrack
845 // location from the stack and jump there.
846 //
847 // The virtual state found in the Trace affects code generation. For example
848 // the virtual state contains the difference between the actual current
849 // position and the virtual current position, and matching code needs to use
850 // this offset to attempt a match in the correct location of the input
851 // string. Therefore code generated for a non-trivial trace is specialized
852 // to that trace. The code generator therefore has the ability to generate
853 // code for each node several times. In order to limit the size of the
854 // generated code there is an arbitrary limit on how many specialized sets of
855 // code may be generated for a given node. If the limit is reached, the
856 // trace is flushed and a generic version of the code for a node is emitted.
857 // This is subsequently used for that node. The code emitted for non-generic
858 // trace is not recorded in the node and so it cannot currently be reused in
859 // the event that code generation is requested for an identical trace.
860
861
AppendToText(RegExpText * text,Zone * zone)862 void RegExpTree::AppendToText(RegExpText* text, Zone* zone) {
863 UNREACHABLE();
864 }
865
866
AppendToText(RegExpText * text,Zone * zone)867 void RegExpAtom::AppendToText(RegExpText* text, Zone* zone) {
868 text->AddElement(TextElement::Atom(this), zone);
869 }
870
871
AppendToText(RegExpText * text,Zone * zone)872 void RegExpCharacterClass::AppendToText(RegExpText* text, Zone* zone) {
873 text->AddElement(TextElement::CharClass(this), zone);
874 }
875
876
AppendToText(RegExpText * text,Zone * zone)877 void RegExpText::AppendToText(RegExpText* text, Zone* zone) {
878 for (int i = 0; i < elements()->length(); i++)
879 text->AddElement(elements()->at(i), zone);
880 }
881
882
Atom(RegExpAtom * atom)883 TextElement TextElement::Atom(RegExpAtom* atom) {
884 return TextElement(ATOM, atom);
885 }
886
887
CharClass(RegExpCharacterClass * char_class)888 TextElement TextElement::CharClass(RegExpCharacterClass* char_class) {
889 return TextElement(CHAR_CLASS, char_class);
890 }
891
892
length() const893 int TextElement::length() const {
894 switch (text_type()) {
895 case ATOM:
896 return atom()->length();
897
898 case CHAR_CLASS:
899 return 1;
900 }
901 UNREACHABLE();
902 return 0;
903 }
904
905
GetTable(bool ignore_case)906 DispatchTable* ChoiceNode::GetTable(bool ignore_case) {
907 if (table_ == NULL) {
908 table_ = new(zone()) DispatchTable(zone());
909 DispatchTableConstructor cons(table_, ignore_case, zone());
910 cons.BuildTable(this);
911 }
912 return table_;
913 }
914
915
916 class FrequencyCollator {
917 public:
FrequencyCollator()918 FrequencyCollator() : total_samples_(0) {
919 for (int i = 0; i < RegExpMacroAssembler::kTableSize; i++) {
920 frequencies_[i] = CharacterFrequency(i);
921 }
922 }
923
CountCharacter(int character)924 void CountCharacter(int character) {
925 int index = (character & RegExpMacroAssembler::kTableMask);
926 frequencies_[index].Increment();
927 total_samples_++;
928 }
929
930 // Does not measure in percent, but rather per-128 (the table size from the
931 // regexp macro assembler).
Frequency(int in_character)932 int Frequency(int in_character) {
933 DCHECK((in_character & RegExpMacroAssembler::kTableMask) == in_character);
934 if (total_samples_ < 1) return 1; // Division by zero.
935 int freq_in_per128 =
936 (frequencies_[in_character].counter() * 128) / total_samples_;
937 return freq_in_per128;
938 }
939
940 private:
941 class CharacterFrequency {
942 public:
CharacterFrequency()943 CharacterFrequency() : counter_(0), character_(-1) { }
CharacterFrequency(int character)944 explicit CharacterFrequency(int character)
945 : counter_(0), character_(character) { }
946
Increment()947 void Increment() { counter_++; }
counter()948 int counter() { return counter_; }
character()949 int character() { return character_; }
950
951 private:
952 int counter_;
953 int character_;
954 };
955
956
957 private:
958 CharacterFrequency frequencies_[RegExpMacroAssembler::kTableSize];
959 int total_samples_;
960 };
961
962
963 class RegExpCompiler {
964 public:
965 RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
966 JSRegExp::Flags flags, bool is_one_byte);
967
AllocateRegister()968 int AllocateRegister() {
969 if (next_register_ >= RegExpMacroAssembler::kMaxRegister) {
970 reg_exp_too_big_ = true;
971 return next_register_;
972 }
973 return next_register_++;
974 }
975
976 // Lookarounds to match lone surrogates for unicode character class matches
977 // are never nested. We can therefore reuse registers.
UnicodeLookaroundStackRegister()978 int UnicodeLookaroundStackRegister() {
979 if (unicode_lookaround_stack_register_ == kNoRegister) {
980 unicode_lookaround_stack_register_ = AllocateRegister();
981 }
982 return unicode_lookaround_stack_register_;
983 }
984
UnicodeLookaroundPositionRegister()985 int UnicodeLookaroundPositionRegister() {
986 if (unicode_lookaround_position_register_ == kNoRegister) {
987 unicode_lookaround_position_register_ = AllocateRegister();
988 }
989 return unicode_lookaround_position_register_;
990 }
991
992 RegExpEngine::CompilationResult Assemble(RegExpMacroAssembler* assembler,
993 RegExpNode* start,
994 int capture_count,
995 Handle<String> pattern);
996
AddWork(RegExpNode * node)997 inline void AddWork(RegExpNode* node) {
998 if (!node->on_work_list() && !node->label()->is_bound()) {
999 node->set_on_work_list(true);
1000 work_list_->Add(node);
1001 }
1002 }
1003
1004 static const int kImplementationOffset = 0;
1005 static const int kNumberOfRegistersOffset = 0;
1006 static const int kCodeOffset = 1;
1007
macro_assembler()1008 RegExpMacroAssembler* macro_assembler() { return macro_assembler_; }
accept()1009 EndNode* accept() { return accept_; }
1010
1011 static const int kMaxRecursion = 100;
recursion_depth()1012 inline int recursion_depth() { return recursion_depth_; }
IncrementRecursionDepth()1013 inline void IncrementRecursionDepth() { recursion_depth_++; }
DecrementRecursionDepth()1014 inline void DecrementRecursionDepth() { recursion_depth_--; }
1015
SetRegExpTooBig()1016 void SetRegExpTooBig() { reg_exp_too_big_ = true; }
1017
ignore_case()1018 inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
unicode()1019 inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; }
one_byte()1020 inline bool one_byte() { return one_byte_; }
optimize()1021 inline bool optimize() { return optimize_; }
set_optimize(bool value)1022 inline void set_optimize(bool value) { optimize_ = value; }
limiting_recursion()1023 inline bool limiting_recursion() { return limiting_recursion_; }
set_limiting_recursion(bool value)1024 inline void set_limiting_recursion(bool value) {
1025 limiting_recursion_ = value;
1026 }
read_backward()1027 bool read_backward() { return read_backward_; }
set_read_backward(bool value)1028 void set_read_backward(bool value) { read_backward_ = value; }
frequency_collator()1029 FrequencyCollator* frequency_collator() { return &frequency_collator_; }
1030
current_expansion_factor()1031 int current_expansion_factor() { return current_expansion_factor_; }
set_current_expansion_factor(int value)1032 void set_current_expansion_factor(int value) {
1033 current_expansion_factor_ = value;
1034 }
1035
isolate() const1036 Isolate* isolate() const { return isolate_; }
zone() const1037 Zone* zone() const { return zone_; }
1038
1039 static const int kNoRegister = -1;
1040
1041 private:
1042 EndNode* accept_;
1043 int next_register_;
1044 int unicode_lookaround_stack_register_;
1045 int unicode_lookaround_position_register_;
1046 List<RegExpNode*>* work_list_;
1047 int recursion_depth_;
1048 RegExpMacroAssembler* macro_assembler_;
1049 JSRegExp::Flags flags_;
1050 bool one_byte_;
1051 bool reg_exp_too_big_;
1052 bool limiting_recursion_;
1053 bool optimize_;
1054 bool read_backward_;
1055 int current_expansion_factor_;
1056 FrequencyCollator frequency_collator_;
1057 Isolate* isolate_;
1058 Zone* zone_;
1059 };
1060
1061
1062 class RecursionCheck {
1063 public:
RecursionCheck(RegExpCompiler * compiler)1064 explicit RecursionCheck(RegExpCompiler* compiler) : compiler_(compiler) {
1065 compiler->IncrementRecursionDepth();
1066 }
~RecursionCheck()1067 ~RecursionCheck() { compiler_->DecrementRecursionDepth(); }
1068 private:
1069 RegExpCompiler* compiler_;
1070 };
1071
1072
IrregexpRegExpTooBig(Isolate * isolate)1073 static RegExpEngine::CompilationResult IrregexpRegExpTooBig(Isolate* isolate) {
1074 return RegExpEngine::CompilationResult(isolate, "RegExp too big");
1075 }
1076
1077
1078 // Attempts to compile the regexp using an Irregexp code generator. Returns
1079 // a fixed array or a null handle depending on whether it succeeded.
RegExpCompiler(Isolate * isolate,Zone * zone,int capture_count,JSRegExp::Flags flags,bool one_byte)1080 RegExpCompiler::RegExpCompiler(Isolate* isolate, Zone* zone, int capture_count,
1081 JSRegExp::Flags flags, bool one_byte)
1082 : next_register_(2 * (capture_count + 1)),
1083 unicode_lookaround_stack_register_(kNoRegister),
1084 unicode_lookaround_position_register_(kNoRegister),
1085 work_list_(NULL),
1086 recursion_depth_(0),
1087 flags_(flags),
1088 one_byte_(one_byte),
1089 reg_exp_too_big_(false),
1090 limiting_recursion_(false),
1091 optimize_(FLAG_regexp_optimization),
1092 read_backward_(false),
1093 current_expansion_factor_(1),
1094 frequency_collator_(),
1095 isolate_(isolate),
1096 zone_(zone) {
1097 accept_ = new(zone) EndNode(EndNode::ACCEPT, zone);
1098 DCHECK(next_register_ - 1 <= RegExpMacroAssembler::kMaxRegister);
1099 }
1100
1101
Assemble(RegExpMacroAssembler * macro_assembler,RegExpNode * start,int capture_count,Handle<String> pattern)1102 RegExpEngine::CompilationResult RegExpCompiler::Assemble(
1103 RegExpMacroAssembler* macro_assembler,
1104 RegExpNode* start,
1105 int capture_count,
1106 Handle<String> pattern) {
1107 Heap* heap = pattern->GetHeap();
1108
1109 #ifdef DEBUG
1110 if (FLAG_trace_regexp_assembler)
1111 macro_assembler_ =
1112 new RegExpMacroAssemblerTracer(isolate(), macro_assembler);
1113 else
1114 #endif
1115 macro_assembler_ = macro_assembler;
1116
1117 List <RegExpNode*> work_list(0);
1118 work_list_ = &work_list;
1119 Label fail;
1120 macro_assembler_->PushBacktrack(&fail);
1121 Trace new_trace;
1122 start->Emit(this, &new_trace);
1123 macro_assembler_->Bind(&fail);
1124 macro_assembler_->Fail();
1125 while (!work_list.is_empty()) {
1126 RegExpNode* node = work_list.RemoveLast();
1127 node->set_on_work_list(false);
1128 if (!node->label()->is_bound()) node->Emit(this, &new_trace);
1129 }
1130 if (reg_exp_too_big_) {
1131 macro_assembler_->AbortedCodeGeneration();
1132 return IrregexpRegExpTooBig(isolate_);
1133 }
1134
1135 Handle<HeapObject> code = macro_assembler_->GetCode(pattern);
1136 heap->IncreaseTotalRegexpCodeGenerated(code->Size());
1137 work_list_ = NULL;
1138 #ifdef ENABLE_DISASSEMBLER
1139 if (FLAG_print_code) {
1140 CodeTracer::Scope trace_scope(heap->isolate()->GetCodeTracer());
1141 OFStream os(trace_scope.file());
1142 Handle<Code>::cast(code)->Disassemble(pattern->ToCString().get(), os);
1143 }
1144 #endif
1145 #ifdef DEBUG
1146 if (FLAG_trace_regexp_assembler) {
1147 delete macro_assembler_;
1148 }
1149 #endif
1150 return RegExpEngine::CompilationResult(*code, next_register_);
1151 }
1152
1153
Mentions(int that)1154 bool Trace::DeferredAction::Mentions(int that) {
1155 if (action_type() == ActionNode::CLEAR_CAPTURES) {
1156 Interval range = static_cast<DeferredClearCaptures*>(this)->range();
1157 return range.Contains(that);
1158 } else {
1159 return reg() == that;
1160 }
1161 }
1162
1163
mentions_reg(int reg)1164 bool Trace::mentions_reg(int reg) {
1165 for (DeferredAction* action = actions_;
1166 action != NULL;
1167 action = action->next()) {
1168 if (action->Mentions(reg))
1169 return true;
1170 }
1171 return false;
1172 }
1173
1174
GetStoredPosition(int reg,int * cp_offset)1175 bool Trace::GetStoredPosition(int reg, int* cp_offset) {
1176 DCHECK_EQ(0, *cp_offset);
1177 for (DeferredAction* action = actions_;
1178 action != NULL;
1179 action = action->next()) {
1180 if (action->Mentions(reg)) {
1181 if (action->action_type() == ActionNode::STORE_POSITION) {
1182 *cp_offset = static_cast<DeferredCapture*>(action)->cp_offset();
1183 return true;
1184 } else {
1185 return false;
1186 }
1187 }
1188 }
1189 return false;
1190 }
1191
1192
FindAffectedRegisters(OutSet * affected_registers,Zone * zone)1193 int Trace::FindAffectedRegisters(OutSet* affected_registers,
1194 Zone* zone) {
1195 int max_register = RegExpCompiler::kNoRegister;
1196 for (DeferredAction* action = actions_;
1197 action != NULL;
1198 action = action->next()) {
1199 if (action->action_type() == ActionNode::CLEAR_CAPTURES) {
1200 Interval range = static_cast<DeferredClearCaptures*>(action)->range();
1201 for (int i = range.from(); i <= range.to(); i++)
1202 affected_registers->Set(i, zone);
1203 if (range.to() > max_register) max_register = range.to();
1204 } else {
1205 affected_registers->Set(action->reg(), zone);
1206 if (action->reg() > max_register) max_register = action->reg();
1207 }
1208 }
1209 return max_register;
1210 }
1211
1212
RestoreAffectedRegisters(RegExpMacroAssembler * assembler,int max_register,const OutSet & registers_to_pop,const OutSet & registers_to_clear)1213 void Trace::RestoreAffectedRegisters(RegExpMacroAssembler* assembler,
1214 int max_register,
1215 const OutSet& registers_to_pop,
1216 const OutSet& registers_to_clear) {
1217 for (int reg = max_register; reg >= 0; reg--) {
1218 if (registers_to_pop.Get(reg)) {
1219 assembler->PopRegister(reg);
1220 } else if (registers_to_clear.Get(reg)) {
1221 int clear_to = reg;
1222 while (reg > 0 && registers_to_clear.Get(reg - 1)) {
1223 reg--;
1224 }
1225 assembler->ClearRegisters(reg, clear_to);
1226 }
1227 }
1228 }
1229
1230
PerformDeferredActions(RegExpMacroAssembler * assembler,int max_register,const OutSet & affected_registers,OutSet * registers_to_pop,OutSet * registers_to_clear,Zone * zone)1231 void Trace::PerformDeferredActions(RegExpMacroAssembler* assembler,
1232 int max_register,
1233 const OutSet& affected_registers,
1234 OutSet* registers_to_pop,
1235 OutSet* registers_to_clear,
1236 Zone* zone) {
1237 // The "+1" is to avoid a push_limit of zero if stack_limit_slack() is 1.
1238 const int push_limit = (assembler->stack_limit_slack() + 1) / 2;
1239
1240 // Count pushes performed to force a stack limit check occasionally.
1241 int pushes = 0;
1242
1243 for (int reg = 0; reg <= max_register; reg++) {
1244 if (!affected_registers.Get(reg)) {
1245 continue;
1246 }
1247
1248 // The chronologically first deferred action in the trace
1249 // is used to infer the action needed to restore a register
1250 // to its previous state (or not, if it's safe to ignore it).
1251 enum DeferredActionUndoType { IGNORE, RESTORE, CLEAR };
1252 DeferredActionUndoType undo_action = IGNORE;
1253
1254 int value = 0;
1255 bool absolute = false;
1256 bool clear = false;
1257 static const int kNoStore = kMinInt;
1258 int store_position = kNoStore;
1259 // This is a little tricky because we are scanning the actions in reverse
1260 // historical order (newest first).
1261 for (DeferredAction* action = actions_;
1262 action != NULL;
1263 action = action->next()) {
1264 if (action->Mentions(reg)) {
1265 switch (action->action_type()) {
1266 case ActionNode::SET_REGISTER: {
1267 Trace::DeferredSetRegister* psr =
1268 static_cast<Trace::DeferredSetRegister*>(action);
1269 if (!absolute) {
1270 value += psr->value();
1271 absolute = true;
1272 }
1273 // SET_REGISTER is currently only used for newly introduced loop
1274 // counters. They can have a significant previous value if they
1275 // occour in a loop. TODO(lrn): Propagate this information, so
1276 // we can set undo_action to IGNORE if we know there is no value to
1277 // restore.
1278 undo_action = RESTORE;
1279 DCHECK_EQ(store_position, kNoStore);
1280 DCHECK(!clear);
1281 break;
1282 }
1283 case ActionNode::INCREMENT_REGISTER:
1284 if (!absolute) {
1285 value++;
1286 }
1287 DCHECK_EQ(store_position, kNoStore);
1288 DCHECK(!clear);
1289 undo_action = RESTORE;
1290 break;
1291 case ActionNode::STORE_POSITION: {
1292 Trace::DeferredCapture* pc =
1293 static_cast<Trace::DeferredCapture*>(action);
1294 if (!clear && store_position == kNoStore) {
1295 store_position = pc->cp_offset();
1296 }
1297
1298 // For captures we know that stores and clears alternate.
1299 // Other register, are never cleared, and if the occur
1300 // inside a loop, they might be assigned more than once.
1301 if (reg <= 1) {
1302 // Registers zero and one, aka "capture zero", is
1303 // always set correctly if we succeed. There is no
1304 // need to undo a setting on backtrack, because we
1305 // will set it again or fail.
1306 undo_action = IGNORE;
1307 } else {
1308 undo_action = pc->is_capture() ? CLEAR : RESTORE;
1309 }
1310 DCHECK(!absolute);
1311 DCHECK_EQ(value, 0);
1312 break;
1313 }
1314 case ActionNode::CLEAR_CAPTURES: {
1315 // Since we're scanning in reverse order, if we've already
1316 // set the position we have to ignore historically earlier
1317 // clearing operations.
1318 if (store_position == kNoStore) {
1319 clear = true;
1320 }
1321 undo_action = RESTORE;
1322 DCHECK(!absolute);
1323 DCHECK_EQ(value, 0);
1324 break;
1325 }
1326 default:
1327 UNREACHABLE();
1328 break;
1329 }
1330 }
1331 }
1332 // Prepare for the undo-action (e.g., push if it's going to be popped).
1333 if (undo_action == RESTORE) {
1334 pushes++;
1335 RegExpMacroAssembler::StackCheckFlag stack_check =
1336 RegExpMacroAssembler::kNoStackLimitCheck;
1337 if (pushes == push_limit) {
1338 stack_check = RegExpMacroAssembler::kCheckStackLimit;
1339 pushes = 0;
1340 }
1341
1342 assembler->PushRegister(reg, stack_check);
1343 registers_to_pop->Set(reg, zone);
1344 } else if (undo_action == CLEAR) {
1345 registers_to_clear->Set(reg, zone);
1346 }
1347 // Perform the chronologically last action (or accumulated increment)
1348 // for the register.
1349 if (store_position != kNoStore) {
1350 assembler->WriteCurrentPositionToRegister(reg, store_position);
1351 } else if (clear) {
1352 assembler->ClearRegisters(reg, reg);
1353 } else if (absolute) {
1354 assembler->SetRegister(reg, value);
1355 } else if (value != 0) {
1356 assembler->AdvanceRegister(reg, value);
1357 }
1358 }
1359 }
1360
1361
1362 // This is called as we come into a loop choice node and some other tricky
1363 // nodes. It normalizes the state of the code generator to ensure we can
1364 // generate generic code.
Flush(RegExpCompiler * compiler,RegExpNode * successor)1365 void Trace::Flush(RegExpCompiler* compiler, RegExpNode* successor) {
1366 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1367
1368 DCHECK(!is_trivial());
1369
1370 if (actions_ == NULL && backtrack() == NULL) {
1371 // Here we just have some deferred cp advances to fix and we are back to
1372 // a normal situation. We may also have to forget some information gained
1373 // through a quick check that was already performed.
1374 if (cp_offset_ != 0) assembler->AdvanceCurrentPosition(cp_offset_);
1375 // Create a new trivial state and generate the node with that.
1376 Trace new_state;
1377 successor->Emit(compiler, &new_state);
1378 return;
1379 }
1380
1381 // Generate deferred actions here along with code to undo them again.
1382 OutSet affected_registers;
1383
1384 if (backtrack() != NULL) {
1385 // Here we have a concrete backtrack location. These are set up by choice
1386 // nodes and so they indicate that we have a deferred save of the current
1387 // position which we may need to emit here.
1388 assembler->PushCurrentPosition();
1389 }
1390
1391 int max_register = FindAffectedRegisters(&affected_registers,
1392 compiler->zone());
1393 OutSet registers_to_pop;
1394 OutSet registers_to_clear;
1395 PerformDeferredActions(assembler,
1396 max_register,
1397 affected_registers,
1398 ®isters_to_pop,
1399 ®isters_to_clear,
1400 compiler->zone());
1401 if (cp_offset_ != 0) {
1402 assembler->AdvanceCurrentPosition(cp_offset_);
1403 }
1404
1405 // Create a new trivial state and generate the node with that.
1406 Label undo;
1407 assembler->PushBacktrack(&undo);
1408 if (successor->KeepRecursing(compiler)) {
1409 Trace new_state;
1410 successor->Emit(compiler, &new_state);
1411 } else {
1412 compiler->AddWork(successor);
1413 assembler->GoTo(successor->label());
1414 }
1415
1416 // On backtrack we need to restore state.
1417 assembler->Bind(&undo);
1418 RestoreAffectedRegisters(assembler,
1419 max_register,
1420 registers_to_pop,
1421 registers_to_clear);
1422 if (backtrack() == NULL) {
1423 assembler->Backtrack();
1424 } else {
1425 assembler->PopCurrentPosition();
1426 assembler->GoTo(backtrack());
1427 }
1428 }
1429
1430
Emit(RegExpCompiler * compiler,Trace * trace)1431 void NegativeSubmatchSuccess::Emit(RegExpCompiler* compiler, Trace* trace) {
1432 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1433
1434 // Omit flushing the trace. We discard the entire stack frame anyway.
1435
1436 if (!label()->is_bound()) {
1437 // We are completely independent of the trace, since we ignore it,
1438 // so this code can be used as the generic version.
1439 assembler->Bind(label());
1440 }
1441
1442 // Throw away everything on the backtrack stack since the start
1443 // of the negative submatch and restore the character position.
1444 assembler->ReadCurrentPositionFromRegister(current_position_register_);
1445 assembler->ReadStackPointerFromRegister(stack_pointer_register_);
1446 if (clear_capture_count_ > 0) {
1447 // Clear any captures that might have been performed during the success
1448 // of the body of the negative look-ahead.
1449 int clear_capture_end = clear_capture_start_ + clear_capture_count_ - 1;
1450 assembler->ClearRegisters(clear_capture_start_, clear_capture_end);
1451 }
1452 // Now that we have unwound the stack we find at the top of the stack the
1453 // backtrack that the BeginSubmatch node got.
1454 assembler->Backtrack();
1455 }
1456
1457
Emit(RegExpCompiler * compiler,Trace * trace)1458 void EndNode::Emit(RegExpCompiler* compiler, Trace* trace) {
1459 if (!trace->is_trivial()) {
1460 trace->Flush(compiler, this);
1461 return;
1462 }
1463 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1464 if (!label()->is_bound()) {
1465 assembler->Bind(label());
1466 }
1467 switch (action_) {
1468 case ACCEPT:
1469 assembler->Succeed();
1470 return;
1471 case BACKTRACK:
1472 assembler->GoTo(trace->backtrack());
1473 return;
1474 case NEGATIVE_SUBMATCH_SUCCESS:
1475 // This case is handled in a different virtual method.
1476 UNREACHABLE();
1477 }
1478 UNIMPLEMENTED();
1479 }
1480
1481
AddGuard(Guard * guard,Zone * zone)1482 void GuardedAlternative::AddGuard(Guard* guard, Zone* zone) {
1483 if (guards_ == NULL)
1484 guards_ = new(zone) ZoneList<Guard*>(1, zone);
1485 guards_->Add(guard, zone);
1486 }
1487
1488
SetRegister(int reg,int val,RegExpNode * on_success)1489 ActionNode* ActionNode::SetRegister(int reg,
1490 int val,
1491 RegExpNode* on_success) {
1492 ActionNode* result =
1493 new(on_success->zone()) ActionNode(SET_REGISTER, on_success);
1494 result->data_.u_store_register.reg = reg;
1495 result->data_.u_store_register.value = val;
1496 return result;
1497 }
1498
1499
IncrementRegister(int reg,RegExpNode * on_success)1500 ActionNode* ActionNode::IncrementRegister(int reg, RegExpNode* on_success) {
1501 ActionNode* result =
1502 new(on_success->zone()) ActionNode(INCREMENT_REGISTER, on_success);
1503 result->data_.u_increment_register.reg = reg;
1504 return result;
1505 }
1506
1507
StorePosition(int reg,bool is_capture,RegExpNode * on_success)1508 ActionNode* ActionNode::StorePosition(int reg,
1509 bool is_capture,
1510 RegExpNode* on_success) {
1511 ActionNode* result =
1512 new(on_success->zone()) ActionNode(STORE_POSITION, on_success);
1513 result->data_.u_position_register.reg = reg;
1514 result->data_.u_position_register.is_capture = is_capture;
1515 return result;
1516 }
1517
1518
ClearCaptures(Interval range,RegExpNode * on_success)1519 ActionNode* ActionNode::ClearCaptures(Interval range,
1520 RegExpNode* on_success) {
1521 ActionNode* result =
1522 new(on_success->zone()) ActionNode(CLEAR_CAPTURES, on_success);
1523 result->data_.u_clear_captures.range_from = range.from();
1524 result->data_.u_clear_captures.range_to = range.to();
1525 return result;
1526 }
1527
1528
BeginSubmatch(int stack_reg,int position_reg,RegExpNode * on_success)1529 ActionNode* ActionNode::BeginSubmatch(int stack_reg,
1530 int position_reg,
1531 RegExpNode* on_success) {
1532 ActionNode* result =
1533 new(on_success->zone()) ActionNode(BEGIN_SUBMATCH, on_success);
1534 result->data_.u_submatch.stack_pointer_register = stack_reg;
1535 result->data_.u_submatch.current_position_register = position_reg;
1536 return result;
1537 }
1538
1539
PositiveSubmatchSuccess(int stack_reg,int position_reg,int clear_register_count,int clear_register_from,RegExpNode * on_success)1540 ActionNode* ActionNode::PositiveSubmatchSuccess(int stack_reg,
1541 int position_reg,
1542 int clear_register_count,
1543 int clear_register_from,
1544 RegExpNode* on_success) {
1545 ActionNode* result =
1546 new(on_success->zone()) ActionNode(POSITIVE_SUBMATCH_SUCCESS, on_success);
1547 result->data_.u_submatch.stack_pointer_register = stack_reg;
1548 result->data_.u_submatch.current_position_register = position_reg;
1549 result->data_.u_submatch.clear_register_count = clear_register_count;
1550 result->data_.u_submatch.clear_register_from = clear_register_from;
1551 return result;
1552 }
1553
1554
EmptyMatchCheck(int start_register,int repetition_register,int repetition_limit,RegExpNode * on_success)1555 ActionNode* ActionNode::EmptyMatchCheck(int start_register,
1556 int repetition_register,
1557 int repetition_limit,
1558 RegExpNode* on_success) {
1559 ActionNode* result =
1560 new(on_success->zone()) ActionNode(EMPTY_MATCH_CHECK, on_success);
1561 result->data_.u_empty_match_check.start_register = start_register;
1562 result->data_.u_empty_match_check.repetition_register = repetition_register;
1563 result->data_.u_empty_match_check.repetition_limit = repetition_limit;
1564 return result;
1565 }
1566
1567
1568 #define DEFINE_ACCEPT(Type) \
1569 void Type##Node::Accept(NodeVisitor* visitor) { \
1570 visitor->Visit##Type(this); \
1571 }
FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)1572 FOR_EACH_NODE_TYPE(DEFINE_ACCEPT)
1573 #undef DEFINE_ACCEPT
1574
1575
1576 void LoopChoiceNode::Accept(NodeVisitor* visitor) {
1577 visitor->VisitLoopChoice(this);
1578 }
1579
1580
1581 // -------------------------------------------------------------------
1582 // Emit code.
1583
1584
GenerateGuard(RegExpMacroAssembler * macro_assembler,Guard * guard,Trace * trace)1585 void ChoiceNode::GenerateGuard(RegExpMacroAssembler* macro_assembler,
1586 Guard* guard,
1587 Trace* trace) {
1588 switch (guard->op()) {
1589 case Guard::LT:
1590 DCHECK(!trace->mentions_reg(guard->reg()));
1591 macro_assembler->IfRegisterGE(guard->reg(),
1592 guard->value(),
1593 trace->backtrack());
1594 break;
1595 case Guard::GEQ:
1596 DCHECK(!trace->mentions_reg(guard->reg()));
1597 macro_assembler->IfRegisterLT(guard->reg(),
1598 guard->value(),
1599 trace->backtrack());
1600 break;
1601 }
1602 }
1603
1604
1605 // Returns the number of characters in the equivalence class, omitting those
1606 // that cannot occur in the source string because it is Latin1.
GetCaseIndependentLetters(Isolate * isolate,uc16 character,bool one_byte_subject,unibrow::uchar * letters)1607 static int GetCaseIndependentLetters(Isolate* isolate, uc16 character,
1608 bool one_byte_subject,
1609 unibrow::uchar* letters) {
1610 int length =
1611 isolate->jsregexp_uncanonicalize()->get(character, '\0', letters);
1612 // Unibrow returns 0 or 1 for characters where case independence is
1613 // trivial.
1614 if (length == 0) {
1615 letters[0] = character;
1616 length = 1;
1617 }
1618
1619 if (one_byte_subject) {
1620 int new_length = 0;
1621 for (int i = 0; i < length; i++) {
1622 if (letters[i] <= String::kMaxOneByteCharCode) {
1623 letters[new_length++] = letters[i];
1624 }
1625 }
1626 length = new_length;
1627 }
1628
1629 return length;
1630 }
1631
1632
EmitSimpleCharacter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1633 static inline bool EmitSimpleCharacter(Isolate* isolate,
1634 RegExpCompiler* compiler,
1635 uc16 c,
1636 Label* on_failure,
1637 int cp_offset,
1638 bool check,
1639 bool preloaded) {
1640 RegExpMacroAssembler* assembler = compiler->macro_assembler();
1641 bool bound_checked = false;
1642 if (!preloaded) {
1643 assembler->LoadCurrentCharacter(
1644 cp_offset,
1645 on_failure,
1646 check);
1647 bound_checked = true;
1648 }
1649 assembler->CheckNotCharacter(c, on_failure);
1650 return bound_checked;
1651 }
1652
1653
1654 // Only emits non-letters (things that don't have case). Only used for case
1655 // independent matches.
EmitAtomNonLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1656 static inline bool EmitAtomNonLetter(Isolate* isolate,
1657 RegExpCompiler* compiler,
1658 uc16 c,
1659 Label* on_failure,
1660 int cp_offset,
1661 bool check,
1662 bool preloaded) {
1663 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1664 bool one_byte = compiler->one_byte();
1665 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1666 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1667 if (length < 1) {
1668 // This can't match. Must be an one-byte subject and a non-one-byte
1669 // character. We do not need to do anything since the one-byte pass
1670 // already handled this.
1671 return false; // Bounds not checked.
1672 }
1673 bool checked = false;
1674 // We handle the length > 1 case in a later pass.
1675 if (length == 1) {
1676 if (one_byte && c > String::kMaxOneByteCharCodeU) {
1677 // Can't match - see above.
1678 return false; // Bounds not checked.
1679 }
1680 if (!preloaded) {
1681 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1682 checked = check;
1683 }
1684 macro_assembler->CheckNotCharacter(c, on_failure);
1685 }
1686 return checked;
1687 }
1688
1689
ShortCutEmitCharacterPair(RegExpMacroAssembler * macro_assembler,bool one_byte,uc16 c1,uc16 c2,Label * on_failure)1690 static bool ShortCutEmitCharacterPair(RegExpMacroAssembler* macro_assembler,
1691 bool one_byte, uc16 c1, uc16 c2,
1692 Label* on_failure) {
1693 uc16 char_mask;
1694 if (one_byte) {
1695 char_mask = String::kMaxOneByteCharCode;
1696 } else {
1697 char_mask = String::kMaxUtf16CodeUnit;
1698 }
1699 uc16 exor = c1 ^ c2;
1700 // Check whether exor has only one bit set.
1701 if (((exor - 1) & exor) == 0) {
1702 // If c1 and c2 differ only by one bit.
1703 // Ecma262UnCanonicalize always gives the highest number last.
1704 DCHECK(c2 > c1);
1705 uc16 mask = char_mask ^ exor;
1706 macro_assembler->CheckNotCharacterAfterAnd(c1, mask, on_failure);
1707 return true;
1708 }
1709 DCHECK(c2 > c1);
1710 uc16 diff = c2 - c1;
1711 if (((diff - 1) & diff) == 0 && c1 >= diff) {
1712 // If the characters differ by 2^n but don't differ by one bit then
1713 // subtract the difference from the found character, then do the or
1714 // trick. We avoid the theoretical case where negative numbers are
1715 // involved in order to simplify code generation.
1716 uc16 mask = char_mask ^ diff;
1717 macro_assembler->CheckNotCharacterAfterMinusAnd(c1 - diff,
1718 diff,
1719 mask,
1720 on_failure);
1721 return true;
1722 }
1723 return false;
1724 }
1725
1726
1727 typedef bool EmitCharacterFunction(Isolate* isolate,
1728 RegExpCompiler* compiler,
1729 uc16 c,
1730 Label* on_failure,
1731 int cp_offset,
1732 bool check,
1733 bool preloaded);
1734
1735 // Only emits letters (things that have case). Only used for case independent
1736 // matches.
EmitAtomLetter(Isolate * isolate,RegExpCompiler * compiler,uc16 c,Label * on_failure,int cp_offset,bool check,bool preloaded)1737 static inline bool EmitAtomLetter(Isolate* isolate,
1738 RegExpCompiler* compiler,
1739 uc16 c,
1740 Label* on_failure,
1741 int cp_offset,
1742 bool check,
1743 bool preloaded) {
1744 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
1745 bool one_byte = compiler->one_byte();
1746 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1747 int length = GetCaseIndependentLetters(isolate, c, one_byte, chars);
1748 if (length <= 1) return false;
1749 // We may not need to check against the end of the input string
1750 // if this character lies before a character that matched.
1751 if (!preloaded) {
1752 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check);
1753 }
1754 Label ok;
1755 DCHECK(unibrow::Ecma262UnCanonicalize::kMaxWidth == 4);
1756 switch (length) {
1757 case 2: {
1758 if (ShortCutEmitCharacterPair(macro_assembler, one_byte, chars[0],
1759 chars[1], on_failure)) {
1760 } else {
1761 macro_assembler->CheckCharacter(chars[0], &ok);
1762 macro_assembler->CheckNotCharacter(chars[1], on_failure);
1763 macro_assembler->Bind(&ok);
1764 }
1765 break;
1766 }
1767 case 4:
1768 macro_assembler->CheckCharacter(chars[3], &ok);
1769 // Fall through!
1770 case 3:
1771 macro_assembler->CheckCharacter(chars[0], &ok);
1772 macro_assembler->CheckCharacter(chars[1], &ok);
1773 macro_assembler->CheckNotCharacter(chars[2], on_failure);
1774 macro_assembler->Bind(&ok);
1775 break;
1776 default:
1777 UNREACHABLE();
1778 break;
1779 }
1780 return true;
1781 }
1782
1783
EmitBoundaryTest(RegExpMacroAssembler * masm,int border,Label * fall_through,Label * above_or_equal,Label * below)1784 static void EmitBoundaryTest(RegExpMacroAssembler* masm,
1785 int border,
1786 Label* fall_through,
1787 Label* above_or_equal,
1788 Label* below) {
1789 if (below != fall_through) {
1790 masm->CheckCharacterLT(border, below);
1791 if (above_or_equal != fall_through) masm->GoTo(above_or_equal);
1792 } else {
1793 masm->CheckCharacterGT(border - 1, above_or_equal);
1794 }
1795 }
1796
1797
EmitDoubleBoundaryTest(RegExpMacroAssembler * masm,int first,int last,Label * fall_through,Label * in_range,Label * out_of_range)1798 static void EmitDoubleBoundaryTest(RegExpMacroAssembler* masm,
1799 int first,
1800 int last,
1801 Label* fall_through,
1802 Label* in_range,
1803 Label* out_of_range) {
1804 if (in_range == fall_through) {
1805 if (first == last) {
1806 masm->CheckNotCharacter(first, out_of_range);
1807 } else {
1808 masm->CheckCharacterNotInRange(first, last, out_of_range);
1809 }
1810 } else {
1811 if (first == last) {
1812 masm->CheckCharacter(first, in_range);
1813 } else {
1814 masm->CheckCharacterInRange(first, last, in_range);
1815 }
1816 if (out_of_range != fall_through) masm->GoTo(out_of_range);
1817 }
1818 }
1819
1820
1821 // even_label is for ranges[i] to ranges[i + 1] where i - start_index is even.
1822 // odd_label is for ranges[i] to ranges[i + 1] where i - start_index is odd.
EmitUseLookupTable(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,int min_char,Label * fall_through,Label * even_label,Label * odd_label)1823 static void EmitUseLookupTable(
1824 RegExpMacroAssembler* masm,
1825 ZoneList<int>* ranges,
1826 int start_index,
1827 int end_index,
1828 int min_char,
1829 Label* fall_through,
1830 Label* even_label,
1831 Label* odd_label) {
1832 static const int kSize = RegExpMacroAssembler::kTableSize;
1833 static const int kMask = RegExpMacroAssembler::kTableMask;
1834
1835 int base = (min_char & ~kMask);
1836 USE(base);
1837
1838 // Assert that everything is on one kTableSize page.
1839 for (int i = start_index; i <= end_index; i++) {
1840 DCHECK_EQ(ranges->at(i) & ~kMask, base);
1841 }
1842 DCHECK(start_index == 0 || (ranges->at(start_index - 1) & ~kMask) <= base);
1843
1844 char templ[kSize];
1845 Label* on_bit_set;
1846 Label* on_bit_clear;
1847 int bit;
1848 if (even_label == fall_through) {
1849 on_bit_set = odd_label;
1850 on_bit_clear = even_label;
1851 bit = 1;
1852 } else {
1853 on_bit_set = even_label;
1854 on_bit_clear = odd_label;
1855 bit = 0;
1856 }
1857 for (int i = 0; i < (ranges->at(start_index) & kMask) && i < kSize; i++) {
1858 templ[i] = bit;
1859 }
1860 int j = 0;
1861 bit ^= 1;
1862 for (int i = start_index; i < end_index; i++) {
1863 for (j = (ranges->at(i) & kMask); j < (ranges->at(i + 1) & kMask); j++) {
1864 templ[j] = bit;
1865 }
1866 bit ^= 1;
1867 }
1868 for (int i = j; i < kSize; i++) {
1869 templ[i] = bit;
1870 }
1871 Factory* factory = masm->isolate()->factory();
1872 // TODO(erikcorry): Cache these.
1873 Handle<ByteArray> ba = factory->NewByteArray(kSize, TENURED);
1874 for (int i = 0; i < kSize; i++) {
1875 ba->set(i, templ[i]);
1876 }
1877 masm->CheckBitInTable(ba, on_bit_set);
1878 if (on_bit_clear != fall_through) masm->GoTo(on_bit_clear);
1879 }
1880
1881
CutOutRange(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,int cut_index,Label * even_label,Label * odd_label)1882 static void CutOutRange(RegExpMacroAssembler* masm,
1883 ZoneList<int>* ranges,
1884 int start_index,
1885 int end_index,
1886 int cut_index,
1887 Label* even_label,
1888 Label* odd_label) {
1889 bool odd = (((cut_index - start_index) & 1) == 1);
1890 Label* in_range_label = odd ? odd_label : even_label;
1891 Label dummy;
1892 EmitDoubleBoundaryTest(masm,
1893 ranges->at(cut_index),
1894 ranges->at(cut_index + 1) - 1,
1895 &dummy,
1896 in_range_label,
1897 &dummy);
1898 DCHECK(!dummy.is_linked());
1899 // Cut out the single range by rewriting the array. This creates a new
1900 // range that is a merger of the two ranges on either side of the one we
1901 // are cutting out. The oddity of the labels is preserved.
1902 for (int j = cut_index; j > start_index; j--) {
1903 ranges->at(j) = ranges->at(j - 1);
1904 }
1905 for (int j = cut_index + 1; j < end_index; j++) {
1906 ranges->at(j) = ranges->at(j + 1);
1907 }
1908 }
1909
1910
1911 // Unicode case. Split the search space into kSize spaces that are handled
1912 // with recursion.
SplitSearchSpace(ZoneList<int> * ranges,int start_index,int end_index,int * new_start_index,int * new_end_index,int * border)1913 static void SplitSearchSpace(ZoneList<int>* ranges,
1914 int start_index,
1915 int end_index,
1916 int* new_start_index,
1917 int* new_end_index,
1918 int* border) {
1919 static const int kSize = RegExpMacroAssembler::kTableSize;
1920 static const int kMask = RegExpMacroAssembler::kTableMask;
1921
1922 int first = ranges->at(start_index);
1923 int last = ranges->at(end_index) - 1;
1924
1925 *new_start_index = start_index;
1926 *border = (ranges->at(start_index) & ~kMask) + kSize;
1927 while (*new_start_index < end_index) {
1928 if (ranges->at(*new_start_index) > *border) break;
1929 (*new_start_index)++;
1930 }
1931 // new_start_index is the index of the first edge that is beyond the
1932 // current kSize space.
1933
1934 // For very large search spaces we do a binary chop search of the non-Latin1
1935 // space instead of just going to the end of the current kSize space. The
1936 // heuristics are complicated a little by the fact that any 128-character
1937 // encoding space can be quickly tested with a table lookup, so we don't
1938 // wish to do binary chop search at a smaller granularity than that. A
1939 // 128-character space can take up a lot of space in the ranges array if,
1940 // for example, we only want to match every second character (eg. the lower
1941 // case characters on some Unicode pages).
1942 int binary_chop_index = (end_index + start_index) / 2;
1943 // The first test ensures that we get to the code that handles the Latin1
1944 // range with a single not-taken branch, speeding up this important
1945 // character range (even non-Latin1 charset-based text has spaces and
1946 // punctuation).
1947 if (*border - 1 > String::kMaxOneByteCharCode && // Latin1 case.
1948 end_index - start_index > (*new_start_index - start_index) * 2 &&
1949 last - first > kSize * 2 && binary_chop_index > *new_start_index &&
1950 ranges->at(binary_chop_index) >= first + 2 * kSize) {
1951 int scan_forward_for_section_border = binary_chop_index;;
1952 int new_border = (ranges->at(binary_chop_index) | kMask) + 1;
1953
1954 while (scan_forward_for_section_border < end_index) {
1955 if (ranges->at(scan_forward_for_section_border) > new_border) {
1956 *new_start_index = scan_forward_for_section_border;
1957 *border = new_border;
1958 break;
1959 }
1960 scan_forward_for_section_border++;
1961 }
1962 }
1963
1964 DCHECK(*new_start_index > start_index);
1965 *new_end_index = *new_start_index - 1;
1966 if (ranges->at(*new_end_index) == *border) {
1967 (*new_end_index)--;
1968 }
1969 if (*border >= ranges->at(end_index)) {
1970 *border = ranges->at(end_index);
1971 *new_start_index = end_index; // Won't be used.
1972 *new_end_index = end_index - 1;
1973 }
1974 }
1975
1976
1977 // Gets a series of segment boundaries representing a character class. If the
1978 // character is in the range between an even and an odd boundary (counting from
1979 // start_index) then go to even_label, otherwise go to odd_label. We already
1980 // know that the character is in the range of min_char to max_char inclusive.
1981 // Either label can be NULL indicating backtracking. Either label can also be
1982 // equal to the fall_through label.
GenerateBranches(RegExpMacroAssembler * masm,ZoneList<int> * ranges,int start_index,int end_index,uc32 min_char,uc32 max_char,Label * fall_through,Label * even_label,Label * odd_label)1983 static void GenerateBranches(RegExpMacroAssembler* masm, ZoneList<int>* ranges,
1984 int start_index, int end_index, uc32 min_char,
1985 uc32 max_char, Label* fall_through,
1986 Label* even_label, Label* odd_label) {
1987 DCHECK_LE(min_char, String::kMaxUtf16CodeUnit);
1988 DCHECK_LE(max_char, String::kMaxUtf16CodeUnit);
1989
1990 int first = ranges->at(start_index);
1991 int last = ranges->at(end_index) - 1;
1992
1993 DCHECK_LT(min_char, first);
1994
1995 // Just need to test if the character is before or on-or-after
1996 // a particular character.
1997 if (start_index == end_index) {
1998 EmitBoundaryTest(masm, first, fall_through, even_label, odd_label);
1999 return;
2000 }
2001
2002 // Another almost trivial case: There is one interval in the middle that is
2003 // different from the end intervals.
2004 if (start_index + 1 == end_index) {
2005 EmitDoubleBoundaryTest(
2006 masm, first, last, fall_through, even_label, odd_label);
2007 return;
2008 }
2009
2010 // It's not worth using table lookup if there are very few intervals in the
2011 // character class.
2012 if (end_index - start_index <= 6) {
2013 // It is faster to test for individual characters, so we look for those
2014 // first, then try arbitrary ranges in the second round.
2015 static int kNoCutIndex = -1;
2016 int cut = kNoCutIndex;
2017 for (int i = start_index; i < end_index; i++) {
2018 if (ranges->at(i) == ranges->at(i + 1) - 1) {
2019 cut = i;
2020 break;
2021 }
2022 }
2023 if (cut == kNoCutIndex) cut = start_index;
2024 CutOutRange(
2025 masm, ranges, start_index, end_index, cut, even_label, odd_label);
2026 DCHECK_GE(end_index - start_index, 2);
2027 GenerateBranches(masm,
2028 ranges,
2029 start_index + 1,
2030 end_index - 1,
2031 min_char,
2032 max_char,
2033 fall_through,
2034 even_label,
2035 odd_label);
2036 return;
2037 }
2038
2039 // If there are a lot of intervals in the regexp, then we will use tables to
2040 // determine whether the character is inside or outside the character class.
2041 static const int kBits = RegExpMacroAssembler::kTableSizeBits;
2042
2043 if ((max_char >> kBits) == (min_char >> kBits)) {
2044 EmitUseLookupTable(masm,
2045 ranges,
2046 start_index,
2047 end_index,
2048 min_char,
2049 fall_through,
2050 even_label,
2051 odd_label);
2052 return;
2053 }
2054
2055 if ((min_char >> kBits) != (first >> kBits)) {
2056 masm->CheckCharacterLT(first, odd_label);
2057 GenerateBranches(masm,
2058 ranges,
2059 start_index + 1,
2060 end_index,
2061 first,
2062 max_char,
2063 fall_through,
2064 odd_label,
2065 even_label);
2066 return;
2067 }
2068
2069 int new_start_index = 0;
2070 int new_end_index = 0;
2071 int border = 0;
2072
2073 SplitSearchSpace(ranges,
2074 start_index,
2075 end_index,
2076 &new_start_index,
2077 &new_end_index,
2078 &border);
2079
2080 Label handle_rest;
2081 Label* above = &handle_rest;
2082 if (border == last + 1) {
2083 // We didn't find any section that started after the limit, so everything
2084 // above the border is one of the terminal labels.
2085 above = (end_index & 1) != (start_index & 1) ? odd_label : even_label;
2086 DCHECK(new_end_index == end_index - 1);
2087 }
2088
2089 DCHECK_LE(start_index, new_end_index);
2090 DCHECK_LE(new_start_index, end_index);
2091 DCHECK_LT(start_index, new_start_index);
2092 DCHECK_LT(new_end_index, end_index);
2093 DCHECK(new_end_index + 1 == new_start_index ||
2094 (new_end_index + 2 == new_start_index &&
2095 border == ranges->at(new_end_index + 1)));
2096 DCHECK_LT(min_char, border - 1);
2097 DCHECK_LT(border, max_char);
2098 DCHECK_LT(ranges->at(new_end_index), border);
2099 DCHECK(border < ranges->at(new_start_index) ||
2100 (border == ranges->at(new_start_index) &&
2101 new_start_index == end_index &&
2102 new_end_index == end_index - 1 &&
2103 border == last + 1));
2104 DCHECK(new_start_index == 0 || border >= ranges->at(new_start_index - 1));
2105
2106 masm->CheckCharacterGT(border - 1, above);
2107 Label dummy;
2108 GenerateBranches(masm,
2109 ranges,
2110 start_index,
2111 new_end_index,
2112 min_char,
2113 border - 1,
2114 &dummy,
2115 even_label,
2116 odd_label);
2117 if (handle_rest.is_linked()) {
2118 masm->Bind(&handle_rest);
2119 bool flip = (new_start_index & 1) != (start_index & 1);
2120 GenerateBranches(masm,
2121 ranges,
2122 new_start_index,
2123 end_index,
2124 border,
2125 max_char,
2126 &dummy,
2127 flip ? odd_label : even_label,
2128 flip ? even_label : odd_label);
2129 }
2130 }
2131
2132
EmitCharClass(RegExpMacroAssembler * macro_assembler,RegExpCharacterClass * cc,bool one_byte,Label * on_failure,int cp_offset,bool check_offset,bool preloaded,Zone * zone)2133 static void EmitCharClass(RegExpMacroAssembler* macro_assembler,
2134 RegExpCharacterClass* cc, bool one_byte,
2135 Label* on_failure, int cp_offset, bool check_offset,
2136 bool preloaded, Zone* zone) {
2137 ZoneList<CharacterRange>* ranges = cc->ranges(zone);
2138 CharacterRange::Canonicalize(ranges);
2139
2140 int max_char;
2141 if (one_byte) {
2142 max_char = String::kMaxOneByteCharCode;
2143 } else {
2144 max_char = String::kMaxUtf16CodeUnit;
2145 }
2146
2147 int range_count = ranges->length();
2148
2149 int last_valid_range = range_count - 1;
2150 while (last_valid_range >= 0) {
2151 CharacterRange& range = ranges->at(last_valid_range);
2152 if (range.from() <= max_char) {
2153 break;
2154 }
2155 last_valid_range--;
2156 }
2157
2158 if (last_valid_range < 0) {
2159 if (!cc->is_negated()) {
2160 macro_assembler->GoTo(on_failure);
2161 }
2162 if (check_offset) {
2163 macro_assembler->CheckPosition(cp_offset, on_failure);
2164 }
2165 return;
2166 }
2167
2168 if (last_valid_range == 0 &&
2169 ranges->at(0).IsEverything(max_char)) {
2170 if (cc->is_negated()) {
2171 macro_assembler->GoTo(on_failure);
2172 } else {
2173 // This is a common case hit by non-anchored expressions.
2174 if (check_offset) {
2175 macro_assembler->CheckPosition(cp_offset, on_failure);
2176 }
2177 }
2178 return;
2179 }
2180
2181 if (!preloaded) {
2182 macro_assembler->LoadCurrentCharacter(cp_offset, on_failure, check_offset);
2183 }
2184
2185 if (cc->is_standard(zone) &&
2186 macro_assembler->CheckSpecialCharacterClass(cc->standard_type(),
2187 on_failure)) {
2188 return;
2189 }
2190
2191
2192 // A new list with ascending entries. Each entry is a code unit
2193 // where there is a boundary between code units that are part of
2194 // the class and code units that are not. Normally we insert an
2195 // entry at zero which goes to the failure label, but if there
2196 // was already one there we fall through for success on that entry.
2197 // Subsequent entries have alternating meaning (success/failure).
2198 ZoneList<int>* range_boundaries =
2199 new(zone) ZoneList<int>(last_valid_range, zone);
2200
2201 bool zeroth_entry_is_failure = !cc->is_negated();
2202
2203 for (int i = 0; i <= last_valid_range; i++) {
2204 CharacterRange& range = ranges->at(i);
2205 if (range.from() == 0) {
2206 DCHECK_EQ(i, 0);
2207 zeroth_entry_is_failure = !zeroth_entry_is_failure;
2208 } else {
2209 range_boundaries->Add(range.from(), zone);
2210 }
2211 range_boundaries->Add(range.to() + 1, zone);
2212 }
2213 int end_index = range_boundaries->length() - 1;
2214 if (range_boundaries->at(end_index) > max_char) {
2215 end_index--;
2216 }
2217
2218 Label fall_through;
2219 GenerateBranches(macro_assembler,
2220 range_boundaries,
2221 0, // start_index.
2222 end_index,
2223 0, // min_char.
2224 max_char,
2225 &fall_through,
2226 zeroth_entry_is_failure ? &fall_through : on_failure,
2227 zeroth_entry_is_failure ? on_failure : &fall_through);
2228 macro_assembler->Bind(&fall_through);
2229 }
2230
2231
~RegExpNode()2232 RegExpNode::~RegExpNode() {
2233 }
2234
2235
LimitVersions(RegExpCompiler * compiler,Trace * trace)2236 RegExpNode::LimitResult RegExpNode::LimitVersions(RegExpCompiler* compiler,
2237 Trace* trace) {
2238 // If we are generating a greedy loop then don't stop and don't reuse code.
2239 if (trace->stop_node() != NULL) {
2240 return CONTINUE;
2241 }
2242
2243 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
2244 if (trace->is_trivial()) {
2245 if (label_.is_bound() || on_work_list() || !KeepRecursing(compiler)) {
2246 // If a generic version is already scheduled to be generated or we have
2247 // recursed too deeply then just generate a jump to that code.
2248 macro_assembler->GoTo(&label_);
2249 // This will queue it up for generation of a generic version if it hasn't
2250 // already been queued.
2251 compiler->AddWork(this);
2252 return DONE;
2253 }
2254 // Generate generic version of the node and bind the label for later use.
2255 macro_assembler->Bind(&label_);
2256 return CONTINUE;
2257 }
2258
2259 // We are being asked to make a non-generic version. Keep track of how many
2260 // non-generic versions we generate so as not to overdo it.
2261 trace_count_++;
2262 if (KeepRecursing(compiler) && compiler->optimize() &&
2263 trace_count_ < kMaxCopiesCodeGenerated) {
2264 return CONTINUE;
2265 }
2266
2267 // If we get here code has been generated for this node too many times or
2268 // recursion is too deep. Time to switch to a generic version. The code for
2269 // generic versions above can handle deep recursion properly.
2270 bool was_limiting = compiler->limiting_recursion();
2271 compiler->set_limiting_recursion(true);
2272 trace->Flush(compiler, this);
2273 compiler->set_limiting_recursion(was_limiting);
2274 return DONE;
2275 }
2276
2277
KeepRecursing(RegExpCompiler * compiler)2278 bool RegExpNode::KeepRecursing(RegExpCompiler* compiler) {
2279 return !compiler->limiting_recursion() &&
2280 compiler->recursion_depth() <= RegExpCompiler::kMaxRecursion;
2281 }
2282
2283
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2284 int ActionNode::EatsAtLeast(int still_to_find,
2285 int budget,
2286 bool not_at_start) {
2287 if (budget <= 0) return 0;
2288 if (action_type_ == POSITIVE_SUBMATCH_SUCCESS) return 0; // Rewinds input!
2289 return on_success()->EatsAtLeast(still_to_find,
2290 budget - 1,
2291 not_at_start);
2292 }
2293
2294
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2295 void ActionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2296 BoyerMooreLookahead* bm, bool not_at_start) {
2297 if (action_type_ == BEGIN_SUBMATCH) {
2298 bm->SetRest(offset);
2299 } else if (action_type_ != POSITIVE_SUBMATCH_SUCCESS) {
2300 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2301 }
2302 SaveBMInfo(bm, not_at_start, offset);
2303 }
2304
2305
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2306 int AssertionNode::EatsAtLeast(int still_to_find,
2307 int budget,
2308 bool not_at_start) {
2309 if (budget <= 0) return 0;
2310 // If we know we are not at the start and we are asked "how many characters
2311 // will you match if you succeed?" then we can answer anything since false
2312 // implies false. So lets just return the max answer (still_to_find) since
2313 // that won't prevent us from preloading a lot of characters for the other
2314 // branches in the node graph.
2315 if (assertion_type() == AT_START && not_at_start) return still_to_find;
2316 return on_success()->EatsAtLeast(still_to_find,
2317 budget - 1,
2318 not_at_start);
2319 }
2320
2321
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2322 void AssertionNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2323 BoyerMooreLookahead* bm, bool not_at_start) {
2324 // Match the behaviour of EatsAtLeast on this node.
2325 if (assertion_type() == AT_START && not_at_start) return;
2326 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2327 SaveBMInfo(bm, not_at_start, offset);
2328 }
2329
2330
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2331 int BackReferenceNode::EatsAtLeast(int still_to_find,
2332 int budget,
2333 bool not_at_start) {
2334 if (read_backward()) return 0;
2335 if (budget <= 0) return 0;
2336 return on_success()->EatsAtLeast(still_to_find,
2337 budget - 1,
2338 not_at_start);
2339 }
2340
2341
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2342 int TextNode::EatsAtLeast(int still_to_find,
2343 int budget,
2344 bool not_at_start) {
2345 if (read_backward()) return 0;
2346 int answer = Length();
2347 if (answer >= still_to_find) return answer;
2348 if (budget <= 0) return answer;
2349 // We are not at start after this node so we set the last argument to 'true'.
2350 return answer + on_success()->EatsAtLeast(still_to_find - answer,
2351 budget - 1,
2352 true);
2353 }
2354
2355
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2356 int NegativeLookaroundChoiceNode::EatsAtLeast(int still_to_find, int budget,
2357 bool not_at_start) {
2358 if (budget <= 0) return 0;
2359 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2360 // afterwards.
2361 RegExpNode* node = alternatives_->at(1).node();
2362 return node->EatsAtLeast(still_to_find, budget - 1, not_at_start);
2363 }
2364
2365
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)2366 void NegativeLookaroundChoiceNode::GetQuickCheckDetails(
2367 QuickCheckDetails* details, RegExpCompiler* compiler, int filled_in,
2368 bool not_at_start) {
2369 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2370 // afterwards.
2371 RegExpNode* node = alternatives_->at(1).node();
2372 return node->GetQuickCheckDetails(details, compiler, filled_in, not_at_start);
2373 }
2374
2375
EatsAtLeastHelper(int still_to_find,int budget,RegExpNode * ignore_this_node,bool not_at_start)2376 int ChoiceNode::EatsAtLeastHelper(int still_to_find,
2377 int budget,
2378 RegExpNode* ignore_this_node,
2379 bool not_at_start) {
2380 if (budget <= 0) return 0;
2381 int min = 100;
2382 int choice_count = alternatives_->length();
2383 budget = (budget - 1) / choice_count;
2384 for (int i = 0; i < choice_count; i++) {
2385 RegExpNode* node = alternatives_->at(i).node();
2386 if (node == ignore_this_node) continue;
2387 int node_eats_at_least =
2388 node->EatsAtLeast(still_to_find, budget, not_at_start);
2389 if (node_eats_at_least < min) min = node_eats_at_least;
2390 if (min == 0) return 0;
2391 }
2392 return min;
2393 }
2394
2395
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2396 int LoopChoiceNode::EatsAtLeast(int still_to_find,
2397 int budget,
2398 bool not_at_start) {
2399 return EatsAtLeastHelper(still_to_find,
2400 budget - 1,
2401 loop_node_,
2402 not_at_start);
2403 }
2404
2405
EatsAtLeast(int still_to_find,int budget,bool not_at_start)2406 int ChoiceNode::EatsAtLeast(int still_to_find,
2407 int budget,
2408 bool not_at_start) {
2409 return EatsAtLeastHelper(still_to_find,
2410 budget,
2411 NULL,
2412 not_at_start);
2413 }
2414
2415
2416 // Takes the left-most 1-bit and smears it out, setting all bits to its right.
SmearBitsRight(uint32_t v)2417 static inline uint32_t SmearBitsRight(uint32_t v) {
2418 v |= v >> 1;
2419 v |= v >> 2;
2420 v |= v >> 4;
2421 v |= v >> 8;
2422 v |= v >> 16;
2423 return v;
2424 }
2425
2426
Rationalize(bool asc)2427 bool QuickCheckDetails::Rationalize(bool asc) {
2428 bool found_useful_op = false;
2429 uint32_t char_mask;
2430 if (asc) {
2431 char_mask = String::kMaxOneByteCharCode;
2432 } else {
2433 char_mask = String::kMaxUtf16CodeUnit;
2434 }
2435 mask_ = 0;
2436 value_ = 0;
2437 int char_shift = 0;
2438 for (int i = 0; i < characters_; i++) {
2439 Position* pos = &positions_[i];
2440 if ((pos->mask & String::kMaxOneByteCharCode) != 0) {
2441 found_useful_op = true;
2442 }
2443 mask_ |= (pos->mask & char_mask) << char_shift;
2444 value_ |= (pos->value & char_mask) << char_shift;
2445 char_shift += asc ? 8 : 16;
2446 }
2447 return found_useful_op;
2448 }
2449
2450
EmitQuickCheck(RegExpCompiler * compiler,Trace * bounds_check_trace,Trace * trace,bool preload_has_checked_bounds,Label * on_possible_success,QuickCheckDetails * details,bool fall_through_on_failure)2451 bool RegExpNode::EmitQuickCheck(RegExpCompiler* compiler,
2452 Trace* bounds_check_trace,
2453 Trace* trace,
2454 bool preload_has_checked_bounds,
2455 Label* on_possible_success,
2456 QuickCheckDetails* details,
2457 bool fall_through_on_failure) {
2458 if (details->characters() == 0) return false;
2459 GetQuickCheckDetails(
2460 details, compiler, 0, trace->at_start() == Trace::FALSE_VALUE);
2461 if (details->cannot_match()) return false;
2462 if (!details->Rationalize(compiler->one_byte())) return false;
2463 DCHECK(details->characters() == 1 ||
2464 compiler->macro_assembler()->CanReadUnaligned());
2465 uint32_t mask = details->mask();
2466 uint32_t value = details->value();
2467
2468 RegExpMacroAssembler* assembler = compiler->macro_assembler();
2469
2470 if (trace->characters_preloaded() != details->characters()) {
2471 DCHECK(trace->cp_offset() == bounds_check_trace->cp_offset());
2472 // We are attempting to preload the minimum number of characters
2473 // any choice would eat, so if the bounds check fails, then none of the
2474 // choices can succeed, so we can just immediately backtrack, rather
2475 // than go to the next choice.
2476 assembler->LoadCurrentCharacter(trace->cp_offset(),
2477 bounds_check_trace->backtrack(),
2478 !preload_has_checked_bounds,
2479 details->characters());
2480 }
2481
2482
2483 bool need_mask = true;
2484
2485 if (details->characters() == 1) {
2486 // If number of characters preloaded is 1 then we used a byte or 16 bit
2487 // load so the value is already masked down.
2488 uint32_t char_mask;
2489 if (compiler->one_byte()) {
2490 char_mask = String::kMaxOneByteCharCode;
2491 } else {
2492 char_mask = String::kMaxUtf16CodeUnit;
2493 }
2494 if ((mask & char_mask) == char_mask) need_mask = false;
2495 mask &= char_mask;
2496 } else {
2497 // For 2-character preloads in one-byte mode or 1-character preloads in
2498 // two-byte mode we also use a 16 bit load with zero extend.
2499 static const uint32_t kTwoByteMask = 0xffff;
2500 static const uint32_t kFourByteMask = 0xffffffff;
2501 if (details->characters() == 2 && compiler->one_byte()) {
2502 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2503 } else if (details->characters() == 1 && !compiler->one_byte()) {
2504 if ((mask & kTwoByteMask) == kTwoByteMask) need_mask = false;
2505 } else {
2506 if (mask == kFourByteMask) need_mask = false;
2507 }
2508 }
2509
2510 if (fall_through_on_failure) {
2511 if (need_mask) {
2512 assembler->CheckCharacterAfterAnd(value, mask, on_possible_success);
2513 } else {
2514 assembler->CheckCharacter(value, on_possible_success);
2515 }
2516 } else {
2517 if (need_mask) {
2518 assembler->CheckNotCharacterAfterAnd(value, mask, trace->backtrack());
2519 } else {
2520 assembler->CheckNotCharacter(value, trace->backtrack());
2521 }
2522 }
2523 return true;
2524 }
2525
2526
2527 // Here is the meat of GetQuickCheckDetails (see also the comment on the
2528 // super-class in the .h file).
2529 //
2530 // We iterate along the text object, building up for each character a
2531 // mask and value that can be used to test for a quick failure to match.
2532 // The masks and values for the positions will be combined into a single
2533 // machine word for the current character width in order to be used in
2534 // generating a quick check.
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2535 void TextNode::GetQuickCheckDetails(QuickCheckDetails* details,
2536 RegExpCompiler* compiler,
2537 int characters_filled_in,
2538 bool not_at_start) {
2539 // Do not collect any quick check details if the text node reads backward,
2540 // since it reads in the opposite direction than we use for quick checks.
2541 if (read_backward()) return;
2542 Isolate* isolate = compiler->macro_assembler()->isolate();
2543 DCHECK(characters_filled_in < details->characters());
2544 int characters = details->characters();
2545 int char_mask;
2546 if (compiler->one_byte()) {
2547 char_mask = String::kMaxOneByteCharCode;
2548 } else {
2549 char_mask = String::kMaxUtf16CodeUnit;
2550 }
2551 for (int k = 0; k < elements()->length(); k++) {
2552 TextElement elm = elements()->at(k);
2553 if (elm.text_type() == TextElement::ATOM) {
2554 Vector<const uc16> quarks = elm.atom()->data();
2555 for (int i = 0; i < characters && i < quarks.length(); i++) {
2556 QuickCheckDetails::Position* pos =
2557 details->positions(characters_filled_in);
2558 uc16 c = quarks[i];
2559 if (compiler->ignore_case()) {
2560 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
2561 int length = GetCaseIndependentLetters(isolate, c,
2562 compiler->one_byte(), chars);
2563 if (length == 0) {
2564 // This can happen because all case variants are non-Latin1, but we
2565 // know the input is Latin1.
2566 details->set_cannot_match();
2567 pos->determines_perfectly = false;
2568 return;
2569 }
2570 if (length == 1) {
2571 // This letter has no case equivalents, so it's nice and simple
2572 // and the mask-compare will determine definitely whether we have
2573 // a match at this character position.
2574 pos->mask = char_mask;
2575 pos->value = c;
2576 pos->determines_perfectly = true;
2577 } else {
2578 uint32_t common_bits = char_mask;
2579 uint32_t bits = chars[0];
2580 for (int j = 1; j < length; j++) {
2581 uint32_t differing_bits = ((chars[j] & common_bits) ^ bits);
2582 common_bits ^= differing_bits;
2583 bits &= common_bits;
2584 }
2585 // If length is 2 and common bits has only one zero in it then
2586 // our mask and compare instruction will determine definitely
2587 // whether we have a match at this character position. Otherwise
2588 // it can only be an approximate check.
2589 uint32_t one_zero = (common_bits | ~char_mask);
2590 if (length == 2 && ((~one_zero) & ((~one_zero) - 1)) == 0) {
2591 pos->determines_perfectly = true;
2592 }
2593 pos->mask = common_bits;
2594 pos->value = bits;
2595 }
2596 } else {
2597 // Don't ignore case. Nice simple case where the mask-compare will
2598 // determine definitely whether we have a match at this character
2599 // position.
2600 if (c > char_mask) {
2601 details->set_cannot_match();
2602 pos->determines_perfectly = false;
2603 return;
2604 }
2605 pos->mask = char_mask;
2606 pos->value = c;
2607 pos->determines_perfectly = true;
2608 }
2609 characters_filled_in++;
2610 DCHECK(characters_filled_in <= details->characters());
2611 if (characters_filled_in == details->characters()) {
2612 return;
2613 }
2614 }
2615 } else {
2616 QuickCheckDetails::Position* pos =
2617 details->positions(characters_filled_in);
2618 RegExpCharacterClass* tree = elm.char_class();
2619 ZoneList<CharacterRange>* ranges = tree->ranges(zone());
2620 if (tree->is_negated()) {
2621 // A quick check uses multi-character mask and compare. There is no
2622 // useful way to incorporate a negative char class into this scheme
2623 // so we just conservatively create a mask and value that will always
2624 // succeed.
2625 pos->mask = 0;
2626 pos->value = 0;
2627 } else {
2628 int first_range = 0;
2629 while (ranges->at(first_range).from() > char_mask) {
2630 first_range++;
2631 if (first_range == ranges->length()) {
2632 details->set_cannot_match();
2633 pos->determines_perfectly = false;
2634 return;
2635 }
2636 }
2637 CharacterRange range = ranges->at(first_range);
2638 uc16 from = range.from();
2639 uc16 to = range.to();
2640 if (to > char_mask) {
2641 to = char_mask;
2642 }
2643 uint32_t differing_bits = (from ^ to);
2644 // A mask and compare is only perfect if the differing bits form a
2645 // number like 00011111 with one single block of trailing 1s.
2646 if ((differing_bits & (differing_bits + 1)) == 0 &&
2647 from + differing_bits == to) {
2648 pos->determines_perfectly = true;
2649 }
2650 uint32_t common_bits = ~SmearBitsRight(differing_bits);
2651 uint32_t bits = (from & common_bits);
2652 for (int i = first_range + 1; i < ranges->length(); i++) {
2653 CharacterRange range = ranges->at(i);
2654 uc16 from = range.from();
2655 uc16 to = range.to();
2656 if (from > char_mask) continue;
2657 if (to > char_mask) to = char_mask;
2658 // Here we are combining more ranges into the mask and compare
2659 // value. With each new range the mask becomes more sparse and
2660 // so the chances of a false positive rise. A character class
2661 // with multiple ranges is assumed never to be equivalent to a
2662 // mask and compare operation.
2663 pos->determines_perfectly = false;
2664 uint32_t new_common_bits = (from ^ to);
2665 new_common_bits = ~SmearBitsRight(new_common_bits);
2666 common_bits &= new_common_bits;
2667 bits &= new_common_bits;
2668 uint32_t differing_bits = (from & common_bits) ^ bits;
2669 common_bits ^= differing_bits;
2670 bits &= common_bits;
2671 }
2672 pos->mask = common_bits;
2673 pos->value = bits;
2674 }
2675 characters_filled_in++;
2676 DCHECK(characters_filled_in <= details->characters());
2677 if (characters_filled_in == details->characters()) {
2678 return;
2679 }
2680 }
2681 }
2682 DCHECK(characters_filled_in != details->characters());
2683 if (!details->cannot_match()) {
2684 on_success()-> GetQuickCheckDetails(details,
2685 compiler,
2686 characters_filled_in,
2687 true);
2688 }
2689 }
2690
2691
Clear()2692 void QuickCheckDetails::Clear() {
2693 for (int i = 0; i < characters_; i++) {
2694 positions_[i].mask = 0;
2695 positions_[i].value = 0;
2696 positions_[i].determines_perfectly = false;
2697 }
2698 characters_ = 0;
2699 }
2700
2701
Advance(int by,bool one_byte)2702 void QuickCheckDetails::Advance(int by, bool one_byte) {
2703 if (by >= characters_ || by < 0) {
2704 DCHECK_IMPLIES(by < 0, characters_ == 0);
2705 Clear();
2706 return;
2707 }
2708 DCHECK_LE(characters_ - by, 4);
2709 DCHECK_LE(characters_, 4);
2710 for (int i = 0; i < characters_ - by; i++) {
2711 positions_[i] = positions_[by + i];
2712 }
2713 for (int i = characters_ - by; i < characters_; i++) {
2714 positions_[i].mask = 0;
2715 positions_[i].value = 0;
2716 positions_[i].determines_perfectly = false;
2717 }
2718 characters_ -= by;
2719 // We could change mask_ and value_ here but we would never advance unless
2720 // they had already been used in a check and they won't be used again because
2721 // it would gain us nothing. So there's no point.
2722 }
2723
2724
Merge(QuickCheckDetails * other,int from_index)2725 void QuickCheckDetails::Merge(QuickCheckDetails* other, int from_index) {
2726 DCHECK(characters_ == other->characters_);
2727 if (other->cannot_match_) {
2728 return;
2729 }
2730 if (cannot_match_) {
2731 *this = *other;
2732 return;
2733 }
2734 for (int i = from_index; i < characters_; i++) {
2735 QuickCheckDetails::Position* pos = positions(i);
2736 QuickCheckDetails::Position* other_pos = other->positions(i);
2737 if (pos->mask != other_pos->mask ||
2738 pos->value != other_pos->value ||
2739 !other_pos->determines_perfectly) {
2740 // Our mask-compare operation will be approximate unless we have the
2741 // exact same operation on both sides of the alternation.
2742 pos->determines_perfectly = false;
2743 }
2744 pos->mask &= other_pos->mask;
2745 pos->value &= pos->mask;
2746 other_pos->value &= pos->mask;
2747 uc16 differing_bits = (pos->value ^ other_pos->value);
2748 pos->mask &= ~differing_bits;
2749 pos->value &= pos->mask;
2750 }
2751 }
2752
2753
2754 class VisitMarker {
2755 public:
VisitMarker(NodeInfo * info)2756 explicit VisitMarker(NodeInfo* info) : info_(info) {
2757 DCHECK(!info->visited);
2758 info->visited = true;
2759 }
~VisitMarker()2760 ~VisitMarker() {
2761 info_->visited = false;
2762 }
2763 private:
2764 NodeInfo* info_;
2765 };
2766
2767
FilterOneByte(int depth,bool ignore_case)2768 RegExpNode* SeqRegExpNode::FilterOneByte(int depth, bool ignore_case) {
2769 if (info()->replacement_calculated) return replacement();
2770 if (depth < 0) return this;
2771 DCHECK(!info()->visited);
2772 VisitMarker marker(info());
2773 return FilterSuccessor(depth - 1, ignore_case);
2774 }
2775
2776
FilterSuccessor(int depth,bool ignore_case)2777 RegExpNode* SeqRegExpNode::FilterSuccessor(int depth, bool ignore_case) {
2778 RegExpNode* next = on_success_->FilterOneByte(depth - 1, ignore_case);
2779 if (next == NULL) return set_replacement(NULL);
2780 on_success_ = next;
2781 return set_replacement(this);
2782 }
2783
2784
2785 // We need to check for the following characters: 0x39c 0x3bc 0x178.
RangeContainsLatin1Equivalents(CharacterRange range)2786 static inline bool RangeContainsLatin1Equivalents(CharacterRange range) {
2787 // TODO(dcarney): this could be a lot more efficient.
2788 return range.Contains(0x39c) ||
2789 range.Contains(0x3bc) || range.Contains(0x178);
2790 }
2791
2792
RangesContainLatin1Equivalents(ZoneList<CharacterRange> * ranges)2793 static bool RangesContainLatin1Equivalents(ZoneList<CharacterRange>* ranges) {
2794 for (int i = 0; i < ranges->length(); i++) {
2795 // TODO(dcarney): this could be a lot more efficient.
2796 if (RangeContainsLatin1Equivalents(ranges->at(i))) return true;
2797 }
2798 return false;
2799 }
2800
2801
FilterOneByte(int depth,bool ignore_case)2802 RegExpNode* TextNode::FilterOneByte(int depth, bool ignore_case) {
2803 if (info()->replacement_calculated) return replacement();
2804 if (depth < 0) return this;
2805 DCHECK(!info()->visited);
2806 VisitMarker marker(info());
2807 int element_count = elements()->length();
2808 for (int i = 0; i < element_count; i++) {
2809 TextElement elm = elements()->at(i);
2810 if (elm.text_type() == TextElement::ATOM) {
2811 Vector<const uc16> quarks = elm.atom()->data();
2812 for (int j = 0; j < quarks.length(); j++) {
2813 uint16_t c = quarks[j];
2814 if (c <= String::kMaxOneByteCharCode) continue;
2815 if (!ignore_case) return set_replacement(NULL);
2816 // Here, we need to check for characters whose upper and lower cases
2817 // are outside the Latin-1 range.
2818 uint16_t converted = unibrow::Latin1::ConvertNonLatin1ToLatin1(c);
2819 // Character is outside Latin-1 completely
2820 if (converted == 0) return set_replacement(NULL);
2821 // Convert quark to Latin-1 in place.
2822 uint16_t* copy = const_cast<uint16_t*>(quarks.start());
2823 copy[j] = converted;
2824 }
2825 } else {
2826 DCHECK(elm.text_type() == TextElement::CHAR_CLASS);
2827 RegExpCharacterClass* cc = elm.char_class();
2828 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
2829 CharacterRange::Canonicalize(ranges);
2830 // Now they are in order so we only need to look at the first.
2831 int range_count = ranges->length();
2832 if (cc->is_negated()) {
2833 if (range_count != 0 &&
2834 ranges->at(0).from() == 0 &&
2835 ranges->at(0).to() >= String::kMaxOneByteCharCode) {
2836 // This will be handled in a later filter.
2837 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2838 return set_replacement(NULL);
2839 }
2840 } else {
2841 if (range_count == 0 ||
2842 ranges->at(0).from() > String::kMaxOneByteCharCode) {
2843 // This will be handled in a later filter.
2844 if (ignore_case && RangesContainLatin1Equivalents(ranges)) continue;
2845 return set_replacement(NULL);
2846 }
2847 }
2848 }
2849 }
2850 return FilterSuccessor(depth - 1, ignore_case);
2851 }
2852
2853
FilterOneByte(int depth,bool ignore_case)2854 RegExpNode* LoopChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2855 if (info()->replacement_calculated) return replacement();
2856 if (depth < 0) return this;
2857 if (info()->visited) return this;
2858 {
2859 VisitMarker marker(info());
2860
2861 RegExpNode* continue_replacement =
2862 continue_node_->FilterOneByte(depth - 1, ignore_case);
2863 // If we can't continue after the loop then there is no sense in doing the
2864 // loop.
2865 if (continue_replacement == NULL) return set_replacement(NULL);
2866 }
2867
2868 return ChoiceNode::FilterOneByte(depth - 1, ignore_case);
2869 }
2870
2871
FilterOneByte(int depth,bool ignore_case)2872 RegExpNode* ChoiceNode::FilterOneByte(int depth, bool ignore_case) {
2873 if (info()->replacement_calculated) return replacement();
2874 if (depth < 0) return this;
2875 if (info()->visited) return this;
2876 VisitMarker marker(info());
2877 int choice_count = alternatives_->length();
2878
2879 for (int i = 0; i < choice_count; i++) {
2880 GuardedAlternative alternative = alternatives_->at(i);
2881 if (alternative.guards() != NULL && alternative.guards()->length() != 0) {
2882 set_replacement(this);
2883 return this;
2884 }
2885 }
2886
2887 int surviving = 0;
2888 RegExpNode* survivor = NULL;
2889 for (int i = 0; i < choice_count; i++) {
2890 GuardedAlternative alternative = alternatives_->at(i);
2891 RegExpNode* replacement =
2892 alternative.node()->FilterOneByte(depth - 1, ignore_case);
2893 DCHECK(replacement != this); // No missing EMPTY_MATCH_CHECK.
2894 if (replacement != NULL) {
2895 alternatives_->at(i).set_node(replacement);
2896 surviving++;
2897 survivor = replacement;
2898 }
2899 }
2900 if (surviving < 2) return set_replacement(survivor);
2901
2902 set_replacement(this);
2903 if (surviving == choice_count) {
2904 return this;
2905 }
2906 // Only some of the nodes survived the filtering. We need to rebuild the
2907 // alternatives list.
2908 ZoneList<GuardedAlternative>* new_alternatives =
2909 new(zone()) ZoneList<GuardedAlternative>(surviving, zone());
2910 for (int i = 0; i < choice_count; i++) {
2911 RegExpNode* replacement =
2912 alternatives_->at(i).node()->FilterOneByte(depth - 1, ignore_case);
2913 if (replacement != NULL) {
2914 alternatives_->at(i).set_node(replacement);
2915 new_alternatives->Add(alternatives_->at(i), zone());
2916 }
2917 }
2918 alternatives_ = new_alternatives;
2919 return this;
2920 }
2921
2922
FilterOneByte(int depth,bool ignore_case)2923 RegExpNode* NegativeLookaroundChoiceNode::FilterOneByte(int depth,
2924 bool ignore_case) {
2925 if (info()->replacement_calculated) return replacement();
2926 if (depth < 0) return this;
2927 if (info()->visited) return this;
2928 VisitMarker marker(info());
2929 // Alternative 0 is the negative lookahead, alternative 1 is what comes
2930 // afterwards.
2931 RegExpNode* node = alternatives_->at(1).node();
2932 RegExpNode* replacement = node->FilterOneByte(depth - 1, ignore_case);
2933 if (replacement == NULL) return set_replacement(NULL);
2934 alternatives_->at(1).set_node(replacement);
2935
2936 RegExpNode* neg_node = alternatives_->at(0).node();
2937 RegExpNode* neg_replacement = neg_node->FilterOneByte(depth - 1, ignore_case);
2938 // If the negative lookahead is always going to fail then
2939 // we don't need to check it.
2940 if (neg_replacement == NULL) return set_replacement(replacement);
2941 alternatives_->at(0).set_node(neg_replacement);
2942 return set_replacement(this);
2943 }
2944
2945
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2946 void LoopChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2947 RegExpCompiler* compiler,
2948 int characters_filled_in,
2949 bool not_at_start) {
2950 if (body_can_be_zero_length_ || info()->visited) return;
2951 VisitMarker marker(info());
2952 return ChoiceNode::GetQuickCheckDetails(details,
2953 compiler,
2954 characters_filled_in,
2955 not_at_start);
2956 }
2957
2958
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)2959 void LoopChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
2960 BoyerMooreLookahead* bm, bool not_at_start) {
2961 if (body_can_be_zero_length_ || budget <= 0) {
2962 bm->SetRest(offset);
2963 SaveBMInfo(bm, not_at_start, offset);
2964 return;
2965 }
2966 ChoiceNode::FillInBMInfo(isolate, offset, budget - 1, bm, not_at_start);
2967 SaveBMInfo(bm, not_at_start, offset);
2968 }
2969
2970
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int characters_filled_in,bool not_at_start)2971 void ChoiceNode::GetQuickCheckDetails(QuickCheckDetails* details,
2972 RegExpCompiler* compiler,
2973 int characters_filled_in,
2974 bool not_at_start) {
2975 not_at_start = (not_at_start || not_at_start_);
2976 int choice_count = alternatives_->length();
2977 DCHECK(choice_count > 0);
2978 alternatives_->at(0).node()->GetQuickCheckDetails(details,
2979 compiler,
2980 characters_filled_in,
2981 not_at_start);
2982 for (int i = 1; i < choice_count; i++) {
2983 QuickCheckDetails new_details(details->characters());
2984 RegExpNode* node = alternatives_->at(i).node();
2985 node->GetQuickCheckDetails(&new_details, compiler,
2986 characters_filled_in,
2987 not_at_start);
2988 // Here we merge the quick match details of the two branches.
2989 details->Merge(&new_details, characters_filled_in);
2990 }
2991 }
2992
2993
2994 // Check for [0-9A-Z_a-z].
EmitWordCheck(RegExpMacroAssembler * assembler,Label * word,Label * non_word,bool fall_through_on_word)2995 static void EmitWordCheck(RegExpMacroAssembler* assembler,
2996 Label* word,
2997 Label* non_word,
2998 bool fall_through_on_word) {
2999 if (assembler->CheckSpecialCharacterClass(
3000 fall_through_on_word ? 'w' : 'W',
3001 fall_through_on_word ? non_word : word)) {
3002 // Optimized implementation available.
3003 return;
3004 }
3005 assembler->CheckCharacterGT('z', non_word);
3006 assembler->CheckCharacterLT('0', non_word);
3007 assembler->CheckCharacterGT('a' - 1, word);
3008 assembler->CheckCharacterLT('9' + 1, word);
3009 assembler->CheckCharacterLT('A', non_word);
3010 assembler->CheckCharacterLT('Z' + 1, word);
3011 if (fall_through_on_word) {
3012 assembler->CheckNotCharacter('_', non_word);
3013 } else {
3014 assembler->CheckCharacter('_', word);
3015 }
3016 }
3017
3018
3019 // Emit the code to check for a ^ in multiline mode (1-character lookbehind
3020 // that matches newline or the start of input).
EmitHat(RegExpCompiler * compiler,RegExpNode * on_success,Trace * trace)3021 static void EmitHat(RegExpCompiler* compiler,
3022 RegExpNode* on_success,
3023 Trace* trace) {
3024 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3025 // We will be loading the previous character into the current character
3026 // register.
3027 Trace new_trace(*trace);
3028 new_trace.InvalidateCurrentCharacter();
3029
3030 Label ok;
3031 if (new_trace.cp_offset() == 0) {
3032 // The start of input counts as a newline in this context, so skip to
3033 // ok if we are at the start.
3034 assembler->CheckAtStart(&ok);
3035 }
3036 // We already checked that we are not at the start of input so it must be
3037 // OK to load the previous character.
3038 assembler->LoadCurrentCharacter(new_trace.cp_offset() -1,
3039 new_trace.backtrack(),
3040 false);
3041 if (!assembler->CheckSpecialCharacterClass('n',
3042 new_trace.backtrack())) {
3043 // Newline means \n, \r, 0x2028 or 0x2029.
3044 if (!compiler->one_byte()) {
3045 assembler->CheckCharacterAfterAnd(0x2028, 0xfffe, &ok);
3046 }
3047 assembler->CheckCharacter('\n', &ok);
3048 assembler->CheckNotCharacter('\r', new_trace.backtrack());
3049 }
3050 assembler->Bind(&ok);
3051 on_success->Emit(compiler, &new_trace);
3052 }
3053
3054
3055 // Emit the code to handle \b and \B (word-boundary or non-word-boundary).
EmitBoundaryCheck(RegExpCompiler * compiler,Trace * trace)3056 void AssertionNode::EmitBoundaryCheck(RegExpCompiler* compiler, Trace* trace) {
3057 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3058 Isolate* isolate = assembler->isolate();
3059 Trace::TriBool next_is_word_character = Trace::UNKNOWN;
3060 bool not_at_start = (trace->at_start() == Trace::FALSE_VALUE);
3061 BoyerMooreLookahead* lookahead = bm_info(not_at_start);
3062 if (lookahead == NULL) {
3063 int eats_at_least =
3064 Min(kMaxLookaheadForBoyerMoore, EatsAtLeast(kMaxLookaheadForBoyerMoore,
3065 kRecursionBudget,
3066 not_at_start));
3067 if (eats_at_least >= 1) {
3068 BoyerMooreLookahead* bm =
3069 new(zone()) BoyerMooreLookahead(eats_at_least, compiler, zone());
3070 FillInBMInfo(isolate, 0, kRecursionBudget, bm, not_at_start);
3071 if (bm->at(0)->is_non_word())
3072 next_is_word_character = Trace::FALSE_VALUE;
3073 if (bm->at(0)->is_word()) next_is_word_character = Trace::TRUE_VALUE;
3074 }
3075 } else {
3076 if (lookahead->at(0)->is_non_word())
3077 next_is_word_character = Trace::FALSE_VALUE;
3078 if (lookahead->at(0)->is_word())
3079 next_is_word_character = Trace::TRUE_VALUE;
3080 }
3081 bool at_boundary = (assertion_type_ == AssertionNode::AT_BOUNDARY);
3082 if (next_is_word_character == Trace::UNKNOWN) {
3083 Label before_non_word;
3084 Label before_word;
3085 if (trace->characters_preloaded() != 1) {
3086 assembler->LoadCurrentCharacter(trace->cp_offset(), &before_non_word);
3087 }
3088 // Fall through on non-word.
3089 EmitWordCheck(assembler, &before_word, &before_non_word, false);
3090 // Next character is not a word character.
3091 assembler->Bind(&before_non_word);
3092 Label ok;
3093 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3094 assembler->GoTo(&ok);
3095
3096 assembler->Bind(&before_word);
3097 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3098 assembler->Bind(&ok);
3099 } else if (next_is_word_character == Trace::TRUE_VALUE) {
3100 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsWord : kIsNonWord);
3101 } else {
3102 DCHECK(next_is_word_character == Trace::FALSE_VALUE);
3103 BacktrackIfPrevious(compiler, trace, at_boundary ? kIsNonWord : kIsWord);
3104 }
3105 }
3106
3107
BacktrackIfPrevious(RegExpCompiler * compiler,Trace * trace,AssertionNode::IfPrevious backtrack_if_previous)3108 void AssertionNode::BacktrackIfPrevious(
3109 RegExpCompiler* compiler,
3110 Trace* trace,
3111 AssertionNode::IfPrevious backtrack_if_previous) {
3112 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3113 Trace new_trace(*trace);
3114 new_trace.InvalidateCurrentCharacter();
3115
3116 Label fall_through, dummy;
3117
3118 Label* non_word = backtrack_if_previous == kIsNonWord ?
3119 new_trace.backtrack() :
3120 &fall_through;
3121 Label* word = backtrack_if_previous == kIsNonWord ?
3122 &fall_through :
3123 new_trace.backtrack();
3124
3125 if (new_trace.cp_offset() == 0) {
3126 // The start of input counts as a non-word character, so the question is
3127 // decided if we are at the start.
3128 assembler->CheckAtStart(non_word);
3129 }
3130 // We already checked that we are not at the start of input so it must be
3131 // OK to load the previous character.
3132 assembler->LoadCurrentCharacter(new_trace.cp_offset() - 1, &dummy, false);
3133 EmitWordCheck(assembler, word, non_word, backtrack_if_previous == kIsNonWord);
3134
3135 assembler->Bind(&fall_through);
3136 on_success()->Emit(compiler, &new_trace);
3137 }
3138
3139
GetQuickCheckDetails(QuickCheckDetails * details,RegExpCompiler * compiler,int filled_in,bool not_at_start)3140 void AssertionNode::GetQuickCheckDetails(QuickCheckDetails* details,
3141 RegExpCompiler* compiler,
3142 int filled_in,
3143 bool not_at_start) {
3144 if (assertion_type_ == AT_START && not_at_start) {
3145 details->set_cannot_match();
3146 return;
3147 }
3148 return on_success()->GetQuickCheckDetails(details,
3149 compiler,
3150 filled_in,
3151 not_at_start);
3152 }
3153
3154
Emit(RegExpCompiler * compiler,Trace * trace)3155 void AssertionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3156 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3157 switch (assertion_type_) {
3158 case AT_END: {
3159 Label ok;
3160 assembler->CheckPosition(trace->cp_offset(), &ok);
3161 assembler->GoTo(trace->backtrack());
3162 assembler->Bind(&ok);
3163 break;
3164 }
3165 case AT_START: {
3166 if (trace->at_start() == Trace::FALSE_VALUE) {
3167 assembler->GoTo(trace->backtrack());
3168 return;
3169 }
3170 if (trace->at_start() == Trace::UNKNOWN) {
3171 assembler->CheckNotAtStart(trace->cp_offset(), trace->backtrack());
3172 Trace at_start_trace = *trace;
3173 at_start_trace.set_at_start(Trace::TRUE_VALUE);
3174 on_success()->Emit(compiler, &at_start_trace);
3175 return;
3176 }
3177 }
3178 break;
3179 case AFTER_NEWLINE:
3180 EmitHat(compiler, on_success(), trace);
3181 return;
3182 case AT_BOUNDARY:
3183 case AT_NON_BOUNDARY: {
3184 EmitBoundaryCheck(compiler, trace);
3185 return;
3186 }
3187 }
3188 on_success()->Emit(compiler, trace);
3189 }
3190
3191
DeterminedAlready(QuickCheckDetails * quick_check,int offset)3192 static bool DeterminedAlready(QuickCheckDetails* quick_check, int offset) {
3193 if (quick_check == NULL) return false;
3194 if (offset >= quick_check->characters()) return false;
3195 return quick_check->positions(offset)->determines_perfectly;
3196 }
3197
3198
UpdateBoundsCheck(int index,int * checked_up_to)3199 static void UpdateBoundsCheck(int index, int* checked_up_to) {
3200 if (index > *checked_up_to) {
3201 *checked_up_to = index;
3202 }
3203 }
3204
3205
3206 // We call this repeatedly to generate code for each pass over the text node.
3207 // The passes are in increasing order of difficulty because we hope one
3208 // of the first passes will fail in which case we are saved the work of the
3209 // later passes. for example for the case independent regexp /%[asdfghjkl]a/
3210 // we will check the '%' in the first pass, the case independent 'a' in the
3211 // second pass and the character class in the last pass.
3212 //
3213 // The passes are done from right to left, so for example to test for /bar/
3214 // we will first test for an 'r' with offset 2, then an 'a' with offset 1
3215 // and then a 'b' with offset 0. This means we can avoid the end-of-input
3216 // bounds check most of the time. In the example we only need to check for
3217 // end-of-input when loading the putative 'r'.
3218 //
3219 // A slight complication involves the fact that the first character may already
3220 // be fetched into a register by the previous node. In this case we want to
3221 // do the test for that character first. We do this in separate passes. The
3222 // 'preloaded' argument indicates that we are doing such a 'pass'. If such a
3223 // pass has been performed then subsequent passes will have true in
3224 // first_element_checked to indicate that that character does not need to be
3225 // checked again.
3226 //
3227 // In addition to all this we are passed a Trace, which can
3228 // contain an AlternativeGeneration object. In this AlternativeGeneration
3229 // object we can see details of any quick check that was already passed in
3230 // order to get to the code we are now generating. The quick check can involve
3231 // loading characters, which means we do not need to recheck the bounds
3232 // up to the limit the quick check already checked. In addition the quick
3233 // check can have involved a mask and compare operation which may simplify
3234 // or obviate the need for further checks at some character positions.
TextEmitPass(RegExpCompiler * compiler,TextEmitPassType pass,bool preloaded,Trace * trace,bool first_element_checked,int * checked_up_to)3235 void TextNode::TextEmitPass(RegExpCompiler* compiler,
3236 TextEmitPassType pass,
3237 bool preloaded,
3238 Trace* trace,
3239 bool first_element_checked,
3240 int* checked_up_to) {
3241 RegExpMacroAssembler* assembler = compiler->macro_assembler();
3242 Isolate* isolate = assembler->isolate();
3243 bool one_byte = compiler->one_byte();
3244 Label* backtrack = trace->backtrack();
3245 QuickCheckDetails* quick_check = trace->quick_check_performed();
3246 int element_count = elements()->length();
3247 int backward_offset = read_backward() ? -Length() : 0;
3248 for (int i = preloaded ? 0 : element_count - 1; i >= 0; i--) {
3249 TextElement elm = elements()->at(i);
3250 int cp_offset = trace->cp_offset() + elm.cp_offset() + backward_offset;
3251 if (elm.text_type() == TextElement::ATOM) {
3252 Vector<const uc16> quarks = elm.atom()->data();
3253 for (int j = preloaded ? 0 : quarks.length() - 1; j >= 0; j--) {
3254 if (first_element_checked && i == 0 && j == 0) continue;
3255 if (DeterminedAlready(quick_check, elm.cp_offset() + j)) continue;
3256 EmitCharacterFunction* emit_function = NULL;
3257 switch (pass) {
3258 case NON_LATIN1_MATCH:
3259 DCHECK(one_byte);
3260 if (quarks[j] > String::kMaxOneByteCharCode) {
3261 assembler->GoTo(backtrack);
3262 return;
3263 }
3264 break;
3265 case NON_LETTER_CHARACTER_MATCH:
3266 emit_function = &EmitAtomNonLetter;
3267 break;
3268 case SIMPLE_CHARACTER_MATCH:
3269 emit_function = &EmitSimpleCharacter;
3270 break;
3271 case CASE_CHARACTER_MATCH:
3272 emit_function = &EmitAtomLetter;
3273 break;
3274 default:
3275 break;
3276 }
3277 if (emit_function != NULL) {
3278 bool bounds_check = *checked_up_to < cp_offset + j || read_backward();
3279 bool bound_checked =
3280 emit_function(isolate, compiler, quarks[j], backtrack,
3281 cp_offset + j, bounds_check, preloaded);
3282 if (bound_checked) UpdateBoundsCheck(cp_offset + j, checked_up_to);
3283 }
3284 }
3285 } else {
3286 DCHECK_EQ(TextElement::CHAR_CLASS, elm.text_type());
3287 if (pass == CHARACTER_CLASS_MATCH) {
3288 if (first_element_checked && i == 0) continue;
3289 if (DeterminedAlready(quick_check, elm.cp_offset())) continue;
3290 RegExpCharacterClass* cc = elm.char_class();
3291 bool bounds_check = *checked_up_to < cp_offset || read_backward();
3292 EmitCharClass(assembler, cc, one_byte, backtrack, cp_offset,
3293 bounds_check, preloaded, zone());
3294 UpdateBoundsCheck(cp_offset, checked_up_to);
3295 }
3296 }
3297 }
3298 }
3299
3300
Length()3301 int TextNode::Length() {
3302 TextElement elm = elements()->last();
3303 DCHECK(elm.cp_offset() >= 0);
3304 return elm.cp_offset() + elm.length();
3305 }
3306
3307
SkipPass(int int_pass,bool ignore_case)3308 bool TextNode::SkipPass(int int_pass, bool ignore_case) {
3309 TextEmitPassType pass = static_cast<TextEmitPassType>(int_pass);
3310 if (ignore_case) {
3311 return pass == SIMPLE_CHARACTER_MATCH;
3312 } else {
3313 return pass == NON_LETTER_CHARACTER_MATCH || pass == CASE_CHARACTER_MATCH;
3314 }
3315 }
3316
3317
CreateForCharacterRanges(Zone * zone,ZoneList<CharacterRange> * ranges,bool read_backward,RegExpNode * on_success)3318 TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
3319 ZoneList<CharacterRange>* ranges,
3320 bool read_backward,
3321 RegExpNode* on_success) {
3322 DCHECK_NOT_NULL(ranges);
3323 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
3324 elms->Add(
3325 TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)),
3326 zone);
3327 return new (zone) TextNode(elms, read_backward, on_success);
3328 }
3329
3330
CreateForSurrogatePair(Zone * zone,CharacterRange lead,CharacterRange trail,bool read_backward,RegExpNode * on_success)3331 TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
3332 CharacterRange trail,
3333 bool read_backward,
3334 RegExpNode* on_success) {
3335 ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
3336 ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
3337 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
3338 elms->Add(TextElement::CharClass(
3339 new (zone) RegExpCharacterClass(lead_ranges, false)),
3340 zone);
3341 elms->Add(TextElement::CharClass(
3342 new (zone) RegExpCharacterClass(trail_ranges, false)),
3343 zone);
3344 return new (zone) TextNode(elms, read_backward, on_success);
3345 }
3346
3347
3348 // This generates the code to match a text node. A text node can contain
3349 // straight character sequences (possibly to be matched in a case-independent
3350 // way) and character classes. For efficiency we do not do this in a single
3351 // pass from left to right. Instead we pass over the text node several times,
3352 // emitting code for some character positions every time. See the comment on
3353 // TextEmitPass for details.
Emit(RegExpCompiler * compiler,Trace * trace)3354 void TextNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3355 LimitResult limit_result = LimitVersions(compiler, trace);
3356 if (limit_result == DONE) return;
3357 DCHECK(limit_result == CONTINUE);
3358
3359 if (trace->cp_offset() + Length() > RegExpMacroAssembler::kMaxCPOffset) {
3360 compiler->SetRegExpTooBig();
3361 return;
3362 }
3363
3364 if (compiler->one_byte()) {
3365 int dummy = 0;
3366 TextEmitPass(compiler, NON_LATIN1_MATCH, false, trace, false, &dummy);
3367 }
3368
3369 bool first_elt_done = false;
3370 int bound_checked_to = trace->cp_offset() - 1;
3371 bound_checked_to += trace->bound_checked_up_to();
3372
3373 // If a character is preloaded into the current character register then
3374 // check that now.
3375 if (trace->characters_preloaded() == 1) {
3376 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3377 if (!SkipPass(pass, compiler->ignore_case())) {
3378 TextEmitPass(compiler,
3379 static_cast<TextEmitPassType>(pass),
3380 true,
3381 trace,
3382 false,
3383 &bound_checked_to);
3384 }
3385 }
3386 first_elt_done = true;
3387 }
3388
3389 for (int pass = kFirstRealPass; pass <= kLastPass; pass++) {
3390 if (!SkipPass(pass, compiler->ignore_case())) {
3391 TextEmitPass(compiler,
3392 static_cast<TextEmitPassType>(pass),
3393 false,
3394 trace,
3395 first_elt_done,
3396 &bound_checked_to);
3397 }
3398 }
3399
3400 Trace successor_trace(*trace);
3401 // If we advance backward, we may end up at the start.
3402 successor_trace.AdvanceCurrentPositionInTrace(
3403 read_backward() ? -Length() : Length(), compiler);
3404 successor_trace.set_at_start(read_backward() ? Trace::UNKNOWN
3405 : Trace::FALSE_VALUE);
3406 RecursionCheck rc(compiler);
3407 on_success()->Emit(compiler, &successor_trace);
3408 }
3409
3410
InvalidateCurrentCharacter()3411 void Trace::InvalidateCurrentCharacter() {
3412 characters_preloaded_ = 0;
3413 }
3414
3415
AdvanceCurrentPositionInTrace(int by,RegExpCompiler * compiler)3416 void Trace::AdvanceCurrentPositionInTrace(int by, RegExpCompiler* compiler) {
3417 // We don't have an instruction for shifting the current character register
3418 // down or for using a shifted value for anything so lets just forget that
3419 // we preloaded any characters into it.
3420 characters_preloaded_ = 0;
3421 // Adjust the offsets of the quick check performed information. This
3422 // information is used to find out what we already determined about the
3423 // characters by means of mask and compare.
3424 quick_check_performed_.Advance(by, compiler->one_byte());
3425 cp_offset_ += by;
3426 if (cp_offset_ > RegExpMacroAssembler::kMaxCPOffset) {
3427 compiler->SetRegExpTooBig();
3428 cp_offset_ = 0;
3429 }
3430 bound_checked_up_to_ = Max(0, bound_checked_up_to_ - by);
3431 }
3432
3433
MakeCaseIndependent(Isolate * isolate,bool is_one_byte)3434 void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) {
3435 int element_count = elements()->length();
3436 for (int i = 0; i < element_count; i++) {
3437 TextElement elm = elements()->at(i);
3438 if (elm.text_type() == TextElement::CHAR_CLASS) {
3439 RegExpCharacterClass* cc = elm.char_class();
3440 // None of the standard character classes is different in the case
3441 // independent case and it slows us down if we don't know that.
3442 if (cc->is_standard(zone())) continue;
3443 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
3444 CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte);
3445 }
3446 }
3447 }
3448
3449
GreedyLoopTextLength()3450 int TextNode::GreedyLoopTextLength() { return Length(); }
3451
3452
GetSuccessorOfOmnivorousTextNode(RegExpCompiler * compiler)3453 RegExpNode* TextNode::GetSuccessorOfOmnivorousTextNode(
3454 RegExpCompiler* compiler) {
3455 if (read_backward()) return NULL;
3456 if (elements()->length() != 1) return NULL;
3457 TextElement elm = elements()->at(0);
3458 if (elm.text_type() != TextElement::CHAR_CLASS) return NULL;
3459 RegExpCharacterClass* node = elm.char_class();
3460 ZoneList<CharacterRange>* ranges = node->ranges(zone());
3461 CharacterRange::Canonicalize(ranges);
3462 if (node->is_negated()) {
3463 return ranges->length() == 0 ? on_success() : NULL;
3464 }
3465 if (ranges->length() != 1) return NULL;
3466 uint32_t max_char;
3467 if (compiler->one_byte()) {
3468 max_char = String::kMaxOneByteCharCode;
3469 } else {
3470 max_char = String::kMaxUtf16CodeUnit;
3471 }
3472 return ranges->at(0).IsEverything(max_char) ? on_success() : NULL;
3473 }
3474
3475
3476 // Finds the fixed match length of a sequence of nodes that goes from
3477 // this alternative and back to this choice node. If there are variable
3478 // length nodes or other complications in the way then return a sentinel
3479 // value indicating that a greedy loop cannot be constructed.
GreedyLoopTextLengthForAlternative(GuardedAlternative * alternative)3480 int ChoiceNode::GreedyLoopTextLengthForAlternative(
3481 GuardedAlternative* alternative) {
3482 int length = 0;
3483 RegExpNode* node = alternative->node();
3484 // Later we will generate code for all these text nodes using recursion
3485 // so we have to limit the max number.
3486 int recursion_depth = 0;
3487 while (node != this) {
3488 if (recursion_depth++ > RegExpCompiler::kMaxRecursion) {
3489 return kNodeIsTooComplexForGreedyLoops;
3490 }
3491 int node_length = node->GreedyLoopTextLength();
3492 if (node_length == kNodeIsTooComplexForGreedyLoops) {
3493 return kNodeIsTooComplexForGreedyLoops;
3494 }
3495 length += node_length;
3496 SeqRegExpNode* seq_node = static_cast<SeqRegExpNode*>(node);
3497 node = seq_node->on_success();
3498 }
3499 return read_backward() ? -length : length;
3500 }
3501
3502
AddLoopAlternative(GuardedAlternative alt)3503 void LoopChoiceNode::AddLoopAlternative(GuardedAlternative alt) {
3504 DCHECK_NULL(loop_node_);
3505 AddAlternative(alt);
3506 loop_node_ = alt.node();
3507 }
3508
3509
AddContinueAlternative(GuardedAlternative alt)3510 void LoopChoiceNode::AddContinueAlternative(GuardedAlternative alt) {
3511 DCHECK_NULL(continue_node_);
3512 AddAlternative(alt);
3513 continue_node_ = alt.node();
3514 }
3515
3516
Emit(RegExpCompiler * compiler,Trace * trace)3517 void LoopChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3518 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
3519 if (trace->stop_node() == this) {
3520 // Back edge of greedy optimized loop node graph.
3521 int text_length =
3522 GreedyLoopTextLengthForAlternative(&(alternatives_->at(0)));
3523 DCHECK(text_length != kNodeIsTooComplexForGreedyLoops);
3524 // Update the counter-based backtracking info on the stack. This is an
3525 // optimization for greedy loops (see below).
3526 DCHECK(trace->cp_offset() == text_length);
3527 macro_assembler->AdvanceCurrentPosition(text_length);
3528 macro_assembler->GoTo(trace->loop_label());
3529 return;
3530 }
3531 DCHECK_NULL(trace->stop_node());
3532 if (!trace->is_trivial()) {
3533 trace->Flush(compiler, this);
3534 return;
3535 }
3536 ChoiceNode::Emit(compiler, trace);
3537 }
3538
3539
CalculatePreloadCharacters(RegExpCompiler * compiler,int eats_at_least)3540 int ChoiceNode::CalculatePreloadCharacters(RegExpCompiler* compiler,
3541 int eats_at_least) {
3542 int preload_characters = Min(4, eats_at_least);
3543 if (compiler->macro_assembler()->CanReadUnaligned()) {
3544 bool one_byte = compiler->one_byte();
3545 if (one_byte) {
3546 if (preload_characters > 4) preload_characters = 4;
3547 // We can't preload 3 characters because there is no machine instruction
3548 // to do that. We can't just load 4 because we could be reading
3549 // beyond the end of the string, which could cause a memory fault.
3550 if (preload_characters == 3) preload_characters = 2;
3551 } else {
3552 if (preload_characters > 2) preload_characters = 2;
3553 }
3554 } else {
3555 if (preload_characters > 1) preload_characters = 1;
3556 }
3557 return preload_characters;
3558 }
3559
3560
3561 // This class is used when generating the alternatives in a choice node. It
3562 // records the way the alternative is being code generated.
3563 class AlternativeGeneration: public Malloced {
3564 public:
AlternativeGeneration()3565 AlternativeGeneration()
3566 : possible_success(),
3567 expects_preload(false),
3568 after(),
3569 quick_check_details() { }
3570 Label possible_success;
3571 bool expects_preload;
3572 Label after;
3573 QuickCheckDetails quick_check_details;
3574 };
3575
3576
3577 // Creates a list of AlternativeGenerations. If the list has a reasonable
3578 // size then it is on the stack, otherwise the excess is on the heap.
3579 class AlternativeGenerationList {
3580 public:
AlternativeGenerationList(int count,Zone * zone)3581 AlternativeGenerationList(int count, Zone* zone)
3582 : alt_gens_(count, zone) {
3583 for (int i = 0; i < count && i < kAFew; i++) {
3584 alt_gens_.Add(a_few_alt_gens_ + i, zone);
3585 }
3586 for (int i = kAFew; i < count; i++) {
3587 alt_gens_.Add(new AlternativeGeneration(), zone);
3588 }
3589 }
~AlternativeGenerationList()3590 ~AlternativeGenerationList() {
3591 for (int i = kAFew; i < alt_gens_.length(); i++) {
3592 delete alt_gens_[i];
3593 alt_gens_[i] = NULL;
3594 }
3595 }
3596
at(int i)3597 AlternativeGeneration* at(int i) {
3598 return alt_gens_[i];
3599 }
3600
3601 private:
3602 static const int kAFew = 10;
3603 ZoneList<AlternativeGeneration*> alt_gens_;
3604 AlternativeGeneration a_few_alt_gens_[kAFew];
3605 };
3606
3607
3608 static const uc32 kRangeEndMarker = 0x110000;
3609
3610 // The '2' variant is has inclusive from and exclusive to.
3611 // This covers \s as defined in ECMA-262 5.1, 15.10.2.12,
3612 // which include WhiteSpace (7.2) or LineTerminator (7.3) values.
3613 static const int kSpaceRanges[] = {
3614 '\t', '\r' + 1, ' ', ' ' + 1, 0x00A0, 0x00A1, 0x1680, 0x1681,
3615 0x180E, 0x180F, 0x2000, 0x200B, 0x2028, 0x202A, 0x202F, 0x2030,
3616 0x205F, 0x2060, 0x3000, 0x3001, 0xFEFF, 0xFF00, kRangeEndMarker};
3617 static const int kSpaceRangeCount = arraysize(kSpaceRanges);
3618
3619 static const int kWordRanges[] = {
3620 '0', '9' + 1, 'A', 'Z' + 1, '_', '_' + 1, 'a', 'z' + 1, kRangeEndMarker};
3621 static const int kWordRangeCount = arraysize(kWordRanges);
3622 static const int kDigitRanges[] = {'0', '9' + 1, kRangeEndMarker};
3623 static const int kDigitRangeCount = arraysize(kDigitRanges);
3624 static const int kSurrogateRanges[] = {
3625 kLeadSurrogateStart, kLeadSurrogateStart + 1, kRangeEndMarker};
3626 static const int kSurrogateRangeCount = arraysize(kSurrogateRanges);
3627 static const int kLineTerminatorRanges[] = {
3628 0x000A, 0x000B, 0x000D, 0x000E, 0x2028, 0x202A, kRangeEndMarker};
3629 static const int kLineTerminatorRangeCount = arraysize(kLineTerminatorRanges);
3630
Set(int character)3631 void BoyerMoorePositionInfo::Set(int character) {
3632 SetInterval(Interval(character, character));
3633 }
3634
3635
SetInterval(const Interval & interval)3636 void BoyerMoorePositionInfo::SetInterval(const Interval& interval) {
3637 s_ = AddRange(s_, kSpaceRanges, kSpaceRangeCount, interval);
3638 w_ = AddRange(w_, kWordRanges, kWordRangeCount, interval);
3639 d_ = AddRange(d_, kDigitRanges, kDigitRangeCount, interval);
3640 surrogate_ =
3641 AddRange(surrogate_, kSurrogateRanges, kSurrogateRangeCount, interval);
3642 if (interval.to() - interval.from() >= kMapSize - 1) {
3643 if (map_count_ != kMapSize) {
3644 map_count_ = kMapSize;
3645 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3646 }
3647 return;
3648 }
3649 for (int i = interval.from(); i <= interval.to(); i++) {
3650 int mod_character = (i & kMask);
3651 if (!map_->at(mod_character)) {
3652 map_count_++;
3653 map_->at(mod_character) = true;
3654 }
3655 if (map_count_ == kMapSize) return;
3656 }
3657 }
3658
3659
SetAll()3660 void BoyerMoorePositionInfo::SetAll() {
3661 s_ = w_ = d_ = kLatticeUnknown;
3662 if (map_count_ != kMapSize) {
3663 map_count_ = kMapSize;
3664 for (int i = 0; i < kMapSize; i++) map_->at(i) = true;
3665 }
3666 }
3667
3668
BoyerMooreLookahead(int length,RegExpCompiler * compiler,Zone * zone)3669 BoyerMooreLookahead::BoyerMooreLookahead(
3670 int length, RegExpCompiler* compiler, Zone* zone)
3671 : length_(length),
3672 compiler_(compiler) {
3673 if (compiler->one_byte()) {
3674 max_char_ = String::kMaxOneByteCharCode;
3675 } else {
3676 max_char_ = String::kMaxUtf16CodeUnit;
3677 }
3678 bitmaps_ = new(zone) ZoneList<BoyerMoorePositionInfo*>(length, zone);
3679 for (int i = 0; i < length; i++) {
3680 bitmaps_->Add(new(zone) BoyerMoorePositionInfo(zone), zone);
3681 }
3682 }
3683
3684
3685 // Find the longest range of lookahead that has the fewest number of different
3686 // characters that can occur at a given position. Since we are optimizing two
3687 // different parameters at once this is a tradeoff.
FindWorthwhileInterval(int * from,int * to)3688 bool BoyerMooreLookahead::FindWorthwhileInterval(int* from, int* to) {
3689 int biggest_points = 0;
3690 // If more than 32 characters out of 128 can occur it is unlikely that we can
3691 // be lucky enough to step forwards much of the time.
3692 const int kMaxMax = 32;
3693 for (int max_number_of_chars = 4;
3694 max_number_of_chars < kMaxMax;
3695 max_number_of_chars *= 2) {
3696 biggest_points =
3697 FindBestInterval(max_number_of_chars, biggest_points, from, to);
3698 }
3699 if (biggest_points == 0) return false;
3700 return true;
3701 }
3702
3703
3704 // Find the highest-points range between 0 and length_ where the character
3705 // information is not too vague. 'Too vague' means that there are more than
3706 // max_number_of_chars that can occur at this position. Calculates the number
3707 // of points as the product of width-of-the-range and
3708 // probability-of-finding-one-of-the-characters, where the probability is
3709 // calculated using the frequency distribution of the sample subject string.
FindBestInterval(int max_number_of_chars,int old_biggest_points,int * from,int * to)3710 int BoyerMooreLookahead::FindBestInterval(
3711 int max_number_of_chars, int old_biggest_points, int* from, int* to) {
3712 int biggest_points = old_biggest_points;
3713 static const int kSize = RegExpMacroAssembler::kTableSize;
3714 for (int i = 0; i < length_; ) {
3715 while (i < length_ && Count(i) > max_number_of_chars) i++;
3716 if (i == length_) break;
3717 int remembered_from = i;
3718 bool union_map[kSize];
3719 for (int j = 0; j < kSize; j++) union_map[j] = false;
3720 while (i < length_ && Count(i) <= max_number_of_chars) {
3721 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3722 for (int j = 0; j < kSize; j++) union_map[j] |= map->at(j);
3723 i++;
3724 }
3725 int frequency = 0;
3726 for (int j = 0; j < kSize; j++) {
3727 if (union_map[j]) {
3728 // Add 1 to the frequency to give a small per-character boost for
3729 // the cases where our sampling is not good enough and many
3730 // characters have a frequency of zero. This means the frequency
3731 // can theoretically be up to 2*kSize though we treat it mostly as
3732 // a fraction of kSize.
3733 frequency += compiler_->frequency_collator()->Frequency(j) + 1;
3734 }
3735 }
3736 // We use the probability of skipping times the distance we are skipping to
3737 // judge the effectiveness of this. Actually we have a cut-off: By
3738 // dividing by 2 we switch off the skipping if the probability of skipping
3739 // is less than 50%. This is because the multibyte mask-and-compare
3740 // skipping in quickcheck is more likely to do well on this case.
3741 bool in_quickcheck_range =
3742 ((i - remembered_from < 4) ||
3743 (compiler_->one_byte() ? remembered_from <= 4 : remembered_from <= 2));
3744 // Called 'probability' but it is only a rough estimate and can actually
3745 // be outside the 0-kSize range.
3746 int probability = (in_quickcheck_range ? kSize / 2 : kSize) - frequency;
3747 int points = (i - remembered_from) * probability;
3748 if (points > biggest_points) {
3749 *from = remembered_from;
3750 *to = i - 1;
3751 biggest_points = points;
3752 }
3753 }
3754 return biggest_points;
3755 }
3756
3757
3758 // Take all the characters that will not prevent a successful match if they
3759 // occur in the subject string in the range between min_lookahead and
3760 // max_lookahead (inclusive) measured from the current position. If the
3761 // character at max_lookahead offset is not one of these characters, then we
3762 // can safely skip forwards by the number of characters in the range.
GetSkipTable(int min_lookahead,int max_lookahead,Handle<ByteArray> boolean_skip_table)3763 int BoyerMooreLookahead::GetSkipTable(int min_lookahead,
3764 int max_lookahead,
3765 Handle<ByteArray> boolean_skip_table) {
3766 const int kSize = RegExpMacroAssembler::kTableSize;
3767
3768 const int kSkipArrayEntry = 0;
3769 const int kDontSkipArrayEntry = 1;
3770
3771 for (int i = 0; i < kSize; i++) {
3772 boolean_skip_table->set(i, kSkipArrayEntry);
3773 }
3774 int skip = max_lookahead + 1 - min_lookahead;
3775
3776 for (int i = max_lookahead; i >= min_lookahead; i--) {
3777 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3778 for (int j = 0; j < kSize; j++) {
3779 if (map->at(j)) {
3780 boolean_skip_table->set(j, kDontSkipArrayEntry);
3781 }
3782 }
3783 }
3784
3785 return skip;
3786 }
3787
3788
3789 // See comment above on the implementation of GetSkipTable.
EmitSkipInstructions(RegExpMacroAssembler * masm)3790 void BoyerMooreLookahead::EmitSkipInstructions(RegExpMacroAssembler* masm) {
3791 const int kSize = RegExpMacroAssembler::kTableSize;
3792
3793 int min_lookahead = 0;
3794 int max_lookahead = 0;
3795
3796 if (!FindWorthwhileInterval(&min_lookahead, &max_lookahead)) return;
3797
3798 bool found_single_character = false;
3799 int single_character = 0;
3800 for (int i = max_lookahead; i >= min_lookahead; i--) {
3801 BoyerMoorePositionInfo* map = bitmaps_->at(i);
3802 if (map->map_count() > 1 ||
3803 (found_single_character && map->map_count() != 0)) {
3804 found_single_character = false;
3805 break;
3806 }
3807 for (int j = 0; j < kSize; j++) {
3808 if (map->at(j)) {
3809 found_single_character = true;
3810 single_character = j;
3811 break;
3812 }
3813 }
3814 }
3815
3816 int lookahead_width = max_lookahead + 1 - min_lookahead;
3817
3818 if (found_single_character && lookahead_width == 1 && max_lookahead < 3) {
3819 // The mask-compare can probably handle this better.
3820 return;
3821 }
3822
3823 if (found_single_character) {
3824 Label cont, again;
3825 masm->Bind(&again);
3826 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3827 if (max_char_ > kSize) {
3828 masm->CheckCharacterAfterAnd(single_character,
3829 RegExpMacroAssembler::kTableMask,
3830 &cont);
3831 } else {
3832 masm->CheckCharacter(single_character, &cont);
3833 }
3834 masm->AdvanceCurrentPosition(lookahead_width);
3835 masm->GoTo(&again);
3836 masm->Bind(&cont);
3837 return;
3838 }
3839
3840 Factory* factory = masm->isolate()->factory();
3841 Handle<ByteArray> boolean_skip_table = factory->NewByteArray(kSize, TENURED);
3842 int skip_distance = GetSkipTable(
3843 min_lookahead, max_lookahead, boolean_skip_table);
3844 DCHECK(skip_distance != 0);
3845
3846 Label cont, again;
3847 masm->Bind(&again);
3848 masm->LoadCurrentCharacter(max_lookahead, &cont, true);
3849 masm->CheckBitInTable(boolean_skip_table, &cont);
3850 masm->AdvanceCurrentPosition(skip_distance);
3851 masm->GoTo(&again);
3852 masm->Bind(&cont);
3853 }
3854
3855
3856 /* Code generation for choice nodes.
3857 *
3858 * We generate quick checks that do a mask and compare to eliminate a
3859 * choice. If the quick check succeeds then it jumps to the continuation to
3860 * do slow checks and check subsequent nodes. If it fails (the common case)
3861 * it falls through to the next choice.
3862 *
3863 * Here is the desired flow graph. Nodes directly below each other imply
3864 * fallthrough. Alternatives 1 and 2 have quick checks. Alternative
3865 * 3 doesn't have a quick check so we have to call the slow check.
3866 * Nodes are marked Qn for quick checks and Sn for slow checks. The entire
3867 * regexp continuation is generated directly after the Sn node, up to the
3868 * next GoTo if we decide to reuse some already generated code. Some
3869 * nodes expect preload_characters to be preloaded into the current
3870 * character register. R nodes do this preloading. Vertices are marked
3871 * F for failures and S for success (possible success in the case of quick
3872 * nodes). L, V, < and > are used as arrow heads.
3873 *
3874 * ----------> R
3875 * |
3876 * V
3877 * Q1 -----> S1
3878 * | S /
3879 * F| /
3880 * | F/
3881 * | /
3882 * | R
3883 * | /
3884 * V L
3885 * Q2 -----> S2
3886 * | S /
3887 * F| /
3888 * | F/
3889 * | /
3890 * | R
3891 * | /
3892 * V L
3893 * S3
3894 * |
3895 * F|
3896 * |
3897 * R
3898 * |
3899 * backtrack V
3900 * <----------Q4
3901 * \ F |
3902 * \ |S
3903 * \ F V
3904 * \-----S4
3905 *
3906 * For greedy loops we push the current position, then generate the code that
3907 * eats the input specially in EmitGreedyLoop. The other choice (the
3908 * continuation) is generated by the normal code in EmitChoices, and steps back
3909 * in the input to the starting position when it fails to match. The loop code
3910 * looks like this (U is the unwind code that steps back in the greedy loop).
3911 *
3912 * _____
3913 * / \
3914 * V |
3915 * ----------> S1 |
3916 * /| |
3917 * / |S |
3918 * F/ \_____/
3919 * /
3920 * |<-----
3921 * | \
3922 * V |S
3923 * Q2 ---> U----->backtrack
3924 * | F /
3925 * S| /
3926 * V F /
3927 * S2--/
3928 */
3929
GreedyLoopState(bool not_at_start)3930 GreedyLoopState::GreedyLoopState(bool not_at_start) {
3931 counter_backtrack_trace_.set_backtrack(&label_);
3932 if (not_at_start) counter_backtrack_trace_.set_at_start(Trace::FALSE_VALUE);
3933 }
3934
3935
AssertGuardsMentionRegisters(Trace * trace)3936 void ChoiceNode::AssertGuardsMentionRegisters(Trace* trace) {
3937 #ifdef DEBUG
3938 int choice_count = alternatives_->length();
3939 for (int i = 0; i < choice_count - 1; i++) {
3940 GuardedAlternative alternative = alternatives_->at(i);
3941 ZoneList<Guard*>* guards = alternative.guards();
3942 int guard_count = (guards == NULL) ? 0 : guards->length();
3943 for (int j = 0; j < guard_count; j++) {
3944 DCHECK(!trace->mentions_reg(guards->at(j)->reg()));
3945 }
3946 }
3947 #endif
3948 }
3949
3950
SetUpPreLoad(RegExpCompiler * compiler,Trace * current_trace,PreloadState * state)3951 void ChoiceNode::SetUpPreLoad(RegExpCompiler* compiler,
3952 Trace* current_trace,
3953 PreloadState* state) {
3954 if (state->eats_at_least_ == PreloadState::kEatsAtLeastNotYetInitialized) {
3955 // Save some time by looking at most one machine word ahead.
3956 state->eats_at_least_ =
3957 EatsAtLeast(compiler->one_byte() ? 4 : 2, kRecursionBudget,
3958 current_trace->at_start() == Trace::FALSE_VALUE);
3959 }
3960 state->preload_characters_ =
3961 CalculatePreloadCharacters(compiler, state->eats_at_least_);
3962
3963 state->preload_is_current_ =
3964 (current_trace->characters_preloaded() == state->preload_characters_);
3965 state->preload_has_checked_bounds_ = state->preload_is_current_;
3966 }
3967
3968
Emit(RegExpCompiler * compiler,Trace * trace)3969 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3970 int choice_count = alternatives_->length();
3971
3972 if (choice_count == 1 && alternatives_->at(0).guards() == NULL) {
3973 alternatives_->at(0).node()->Emit(compiler, trace);
3974 return;
3975 }
3976
3977 AssertGuardsMentionRegisters(trace);
3978
3979 LimitResult limit_result = LimitVersions(compiler, trace);
3980 if (limit_result == DONE) return;
3981 DCHECK(limit_result == CONTINUE);
3982
3983 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3984 // other choice nodes we only flush if we are out of code size budget.
3985 if (trace->flush_budget() == 0 && trace->actions() != NULL) {
3986 trace->Flush(compiler, this);
3987 return;
3988 }
3989
3990 RecursionCheck rc(compiler);
3991
3992 PreloadState preload;
3993 preload.init();
3994 GreedyLoopState greedy_loop_state(not_at_start());
3995
3996 int text_length = GreedyLoopTextLengthForAlternative(&alternatives_->at(0));
3997 AlternativeGenerationList alt_gens(choice_count, zone());
3998
3999 if (choice_count > 1 && text_length != kNodeIsTooComplexForGreedyLoops) {
4000 trace = EmitGreedyLoop(compiler,
4001 trace,
4002 &alt_gens,
4003 &preload,
4004 &greedy_loop_state,
4005 text_length);
4006 } else {
4007 // TODO(erikcorry): Delete this. We don't need this label, but it makes us
4008 // match the traces produced pre-cleanup.
4009 Label second_choice;
4010 compiler->macro_assembler()->Bind(&second_choice);
4011
4012 preload.eats_at_least_ = EmitOptimizedUnanchoredSearch(compiler, trace);
4013
4014 EmitChoices(compiler,
4015 &alt_gens,
4016 0,
4017 trace,
4018 &preload);
4019 }
4020
4021 // At this point we need to generate slow checks for the alternatives where
4022 // the quick check was inlined. We can recognize these because the associated
4023 // label was bound.
4024 int new_flush_budget = trace->flush_budget() / choice_count;
4025 for (int i = 0; i < choice_count; i++) {
4026 AlternativeGeneration* alt_gen = alt_gens.at(i);
4027 Trace new_trace(*trace);
4028 // If there are actions to be flushed we have to limit how many times
4029 // they are flushed. Take the budget of the parent trace and distribute
4030 // it fairly amongst the children.
4031 if (new_trace.actions() != NULL) {
4032 new_trace.set_flush_budget(new_flush_budget);
4033 }
4034 bool next_expects_preload =
4035 i == choice_count - 1 ? false : alt_gens.at(i + 1)->expects_preload;
4036 EmitOutOfLineContinuation(compiler,
4037 &new_trace,
4038 alternatives_->at(i),
4039 alt_gen,
4040 preload.preload_characters_,
4041 next_expects_preload);
4042 }
4043 }
4044
4045
EmitGreedyLoop(RegExpCompiler * compiler,Trace * trace,AlternativeGenerationList * alt_gens,PreloadState * preload,GreedyLoopState * greedy_loop_state,int text_length)4046 Trace* ChoiceNode::EmitGreedyLoop(RegExpCompiler* compiler,
4047 Trace* trace,
4048 AlternativeGenerationList* alt_gens,
4049 PreloadState* preload,
4050 GreedyLoopState* greedy_loop_state,
4051 int text_length) {
4052 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4053 // Here we have special handling for greedy loops containing only text nodes
4054 // and other simple nodes. These are handled by pushing the current
4055 // position on the stack and then incrementing the current position each
4056 // time around the switch. On backtrack we decrement the current position
4057 // and check it against the pushed value. This avoids pushing backtrack
4058 // information for each iteration of the loop, which could take up a lot of
4059 // space.
4060 DCHECK(trace->stop_node() == NULL);
4061 macro_assembler->PushCurrentPosition();
4062 Label greedy_match_failed;
4063 Trace greedy_match_trace;
4064 if (not_at_start()) greedy_match_trace.set_at_start(Trace::FALSE_VALUE);
4065 greedy_match_trace.set_backtrack(&greedy_match_failed);
4066 Label loop_label;
4067 macro_assembler->Bind(&loop_label);
4068 greedy_match_trace.set_stop_node(this);
4069 greedy_match_trace.set_loop_label(&loop_label);
4070 alternatives_->at(0).node()->Emit(compiler, &greedy_match_trace);
4071 macro_assembler->Bind(&greedy_match_failed);
4072
4073 Label second_choice; // For use in greedy matches.
4074 macro_assembler->Bind(&second_choice);
4075
4076 Trace* new_trace = greedy_loop_state->counter_backtrack_trace();
4077
4078 EmitChoices(compiler,
4079 alt_gens,
4080 1,
4081 new_trace,
4082 preload);
4083
4084 macro_assembler->Bind(greedy_loop_state->label());
4085 // If we have unwound to the bottom then backtrack.
4086 macro_assembler->CheckGreedyLoop(trace->backtrack());
4087 // Otherwise try the second priority at an earlier position.
4088 macro_assembler->AdvanceCurrentPosition(-text_length);
4089 macro_assembler->GoTo(&second_choice);
4090 return new_trace;
4091 }
4092
EmitOptimizedUnanchoredSearch(RegExpCompiler * compiler,Trace * trace)4093 int ChoiceNode::EmitOptimizedUnanchoredSearch(RegExpCompiler* compiler,
4094 Trace* trace) {
4095 int eats_at_least = PreloadState::kEatsAtLeastNotYetInitialized;
4096 if (alternatives_->length() != 2) return eats_at_least;
4097
4098 GuardedAlternative alt1 = alternatives_->at(1);
4099 if (alt1.guards() != NULL && alt1.guards()->length() != 0) {
4100 return eats_at_least;
4101 }
4102 RegExpNode* eats_anything_node = alt1.node();
4103 if (eats_anything_node->GetSuccessorOfOmnivorousTextNode(compiler) != this) {
4104 return eats_at_least;
4105 }
4106
4107 // Really we should be creating a new trace when we execute this function,
4108 // but there is no need, because the code it generates cannot backtrack, and
4109 // we always arrive here with a trivial trace (since it's the entry to a
4110 // loop. That also implies that there are no preloaded characters, which is
4111 // good, because it means we won't be violating any assumptions by
4112 // overwriting those characters with new load instructions.
4113 DCHECK(trace->is_trivial());
4114
4115 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4116 Isolate* isolate = macro_assembler->isolate();
4117 // At this point we know that we are at a non-greedy loop that will eat
4118 // any character one at a time. Any non-anchored regexp has such a
4119 // loop prepended to it in order to find where it starts. We look for
4120 // a pattern of the form ...abc... where we can look 6 characters ahead
4121 // and step forwards 3 if the character is not one of abc. Abc need
4122 // not be atoms, they can be any reasonably limited character class or
4123 // small alternation.
4124 BoyerMooreLookahead* bm = bm_info(false);
4125 if (bm == NULL) {
4126 eats_at_least = Min(kMaxLookaheadForBoyerMoore,
4127 EatsAtLeast(kMaxLookaheadForBoyerMoore,
4128 kRecursionBudget,
4129 false));
4130 if (eats_at_least >= 1) {
4131 bm = new(zone()) BoyerMooreLookahead(eats_at_least,
4132 compiler,
4133 zone());
4134 GuardedAlternative alt0 = alternatives_->at(0);
4135 alt0.node()->FillInBMInfo(isolate, 0, kRecursionBudget, bm, false);
4136 }
4137 }
4138 if (bm != NULL) {
4139 bm->EmitSkipInstructions(macro_assembler);
4140 }
4141 return eats_at_least;
4142 }
4143
4144
EmitChoices(RegExpCompiler * compiler,AlternativeGenerationList * alt_gens,int first_choice,Trace * trace,PreloadState * preload)4145 void ChoiceNode::EmitChoices(RegExpCompiler* compiler,
4146 AlternativeGenerationList* alt_gens,
4147 int first_choice,
4148 Trace* trace,
4149 PreloadState* preload) {
4150 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4151 SetUpPreLoad(compiler, trace, preload);
4152
4153 // For now we just call all choices one after the other. The idea ultimately
4154 // is to use the Dispatch table to try only the relevant ones.
4155 int choice_count = alternatives_->length();
4156
4157 int new_flush_budget = trace->flush_budget() / choice_count;
4158
4159 for (int i = first_choice; i < choice_count; i++) {
4160 bool is_last = i == choice_count - 1;
4161 bool fall_through_on_failure = !is_last;
4162 GuardedAlternative alternative = alternatives_->at(i);
4163 AlternativeGeneration* alt_gen = alt_gens->at(i);
4164 alt_gen->quick_check_details.set_characters(preload->preload_characters_);
4165 ZoneList<Guard*>* guards = alternative.guards();
4166 int guard_count = (guards == NULL) ? 0 : guards->length();
4167 Trace new_trace(*trace);
4168 new_trace.set_characters_preloaded(preload->preload_is_current_ ?
4169 preload->preload_characters_ :
4170 0);
4171 if (preload->preload_has_checked_bounds_) {
4172 new_trace.set_bound_checked_up_to(preload->preload_characters_);
4173 }
4174 new_trace.quick_check_performed()->Clear();
4175 if (not_at_start_) new_trace.set_at_start(Trace::FALSE_VALUE);
4176 if (!is_last) {
4177 new_trace.set_backtrack(&alt_gen->after);
4178 }
4179 alt_gen->expects_preload = preload->preload_is_current_;
4180 bool generate_full_check_inline = false;
4181 if (compiler->optimize() &&
4182 try_to_emit_quick_check_for_alternative(i == 0) &&
4183 alternative.node()->EmitQuickCheck(
4184 compiler, trace, &new_trace, preload->preload_has_checked_bounds_,
4185 &alt_gen->possible_success, &alt_gen->quick_check_details,
4186 fall_through_on_failure)) {
4187 // Quick check was generated for this choice.
4188 preload->preload_is_current_ = true;
4189 preload->preload_has_checked_bounds_ = true;
4190 // If we generated the quick check to fall through on possible success,
4191 // we now need to generate the full check inline.
4192 if (!fall_through_on_failure) {
4193 macro_assembler->Bind(&alt_gen->possible_success);
4194 new_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4195 new_trace.set_characters_preloaded(preload->preload_characters_);
4196 new_trace.set_bound_checked_up_to(preload->preload_characters_);
4197 generate_full_check_inline = true;
4198 }
4199 } else if (alt_gen->quick_check_details.cannot_match()) {
4200 if (!fall_through_on_failure) {
4201 macro_assembler->GoTo(trace->backtrack());
4202 }
4203 continue;
4204 } else {
4205 // No quick check was generated. Put the full code here.
4206 // If this is not the first choice then there could be slow checks from
4207 // previous cases that go here when they fail. There's no reason to
4208 // insist that they preload characters since the slow check we are about
4209 // to generate probably can't use it.
4210 if (i != first_choice) {
4211 alt_gen->expects_preload = false;
4212 new_trace.InvalidateCurrentCharacter();
4213 }
4214 generate_full_check_inline = true;
4215 }
4216 if (generate_full_check_inline) {
4217 if (new_trace.actions() != NULL) {
4218 new_trace.set_flush_budget(new_flush_budget);
4219 }
4220 for (int j = 0; j < guard_count; j++) {
4221 GenerateGuard(macro_assembler, guards->at(j), &new_trace);
4222 }
4223 alternative.node()->Emit(compiler, &new_trace);
4224 preload->preload_is_current_ = false;
4225 }
4226 macro_assembler->Bind(&alt_gen->after);
4227 }
4228 }
4229
4230
EmitOutOfLineContinuation(RegExpCompiler * compiler,Trace * trace,GuardedAlternative alternative,AlternativeGeneration * alt_gen,int preload_characters,bool next_expects_preload)4231 void ChoiceNode::EmitOutOfLineContinuation(RegExpCompiler* compiler,
4232 Trace* trace,
4233 GuardedAlternative alternative,
4234 AlternativeGeneration* alt_gen,
4235 int preload_characters,
4236 bool next_expects_preload) {
4237 if (!alt_gen->possible_success.is_linked()) return;
4238
4239 RegExpMacroAssembler* macro_assembler = compiler->macro_assembler();
4240 macro_assembler->Bind(&alt_gen->possible_success);
4241 Trace out_of_line_trace(*trace);
4242 out_of_line_trace.set_characters_preloaded(preload_characters);
4243 out_of_line_trace.set_quick_check_performed(&alt_gen->quick_check_details);
4244 if (not_at_start_) out_of_line_trace.set_at_start(Trace::FALSE_VALUE);
4245 ZoneList<Guard*>* guards = alternative.guards();
4246 int guard_count = (guards == NULL) ? 0 : guards->length();
4247 if (next_expects_preload) {
4248 Label reload_current_char;
4249 out_of_line_trace.set_backtrack(&reload_current_char);
4250 for (int j = 0; j < guard_count; j++) {
4251 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4252 }
4253 alternative.node()->Emit(compiler, &out_of_line_trace);
4254 macro_assembler->Bind(&reload_current_char);
4255 // Reload the current character, since the next quick check expects that.
4256 // We don't need to check bounds here because we only get into this
4257 // code through a quick check which already did the checked load.
4258 macro_assembler->LoadCurrentCharacter(trace->cp_offset(),
4259 NULL,
4260 false,
4261 preload_characters);
4262 macro_assembler->GoTo(&(alt_gen->after));
4263 } else {
4264 out_of_line_trace.set_backtrack(&(alt_gen->after));
4265 for (int j = 0; j < guard_count; j++) {
4266 GenerateGuard(macro_assembler, guards->at(j), &out_of_line_trace);
4267 }
4268 alternative.node()->Emit(compiler, &out_of_line_trace);
4269 }
4270 }
4271
4272
Emit(RegExpCompiler * compiler,Trace * trace)4273 void ActionNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4274 RegExpMacroAssembler* assembler = compiler->macro_assembler();
4275 LimitResult limit_result = LimitVersions(compiler, trace);
4276 if (limit_result == DONE) return;
4277 DCHECK(limit_result == CONTINUE);
4278
4279 RecursionCheck rc(compiler);
4280
4281 switch (action_type_) {
4282 case STORE_POSITION: {
4283 Trace::DeferredCapture
4284 new_capture(data_.u_position_register.reg,
4285 data_.u_position_register.is_capture,
4286 trace);
4287 Trace new_trace = *trace;
4288 new_trace.add_action(&new_capture);
4289 on_success()->Emit(compiler, &new_trace);
4290 break;
4291 }
4292 case INCREMENT_REGISTER: {
4293 Trace::DeferredIncrementRegister
4294 new_increment(data_.u_increment_register.reg);
4295 Trace new_trace = *trace;
4296 new_trace.add_action(&new_increment);
4297 on_success()->Emit(compiler, &new_trace);
4298 break;
4299 }
4300 case SET_REGISTER: {
4301 Trace::DeferredSetRegister
4302 new_set(data_.u_store_register.reg, data_.u_store_register.value);
4303 Trace new_trace = *trace;
4304 new_trace.add_action(&new_set);
4305 on_success()->Emit(compiler, &new_trace);
4306 break;
4307 }
4308 case CLEAR_CAPTURES: {
4309 Trace::DeferredClearCaptures
4310 new_capture(Interval(data_.u_clear_captures.range_from,
4311 data_.u_clear_captures.range_to));
4312 Trace new_trace = *trace;
4313 new_trace.add_action(&new_capture);
4314 on_success()->Emit(compiler, &new_trace);
4315 break;
4316 }
4317 case BEGIN_SUBMATCH:
4318 if (!trace->is_trivial()) {
4319 trace->Flush(compiler, this);
4320 } else {
4321 assembler->WriteCurrentPositionToRegister(
4322 data_.u_submatch.current_position_register, 0);
4323 assembler->WriteStackPointerToRegister(
4324 data_.u_submatch.stack_pointer_register);
4325 on_success()->Emit(compiler, trace);
4326 }
4327 break;
4328 case EMPTY_MATCH_CHECK: {
4329 int start_pos_reg = data_.u_empty_match_check.start_register;
4330 int stored_pos = 0;
4331 int rep_reg = data_.u_empty_match_check.repetition_register;
4332 bool has_minimum = (rep_reg != RegExpCompiler::kNoRegister);
4333 bool know_dist = trace->GetStoredPosition(start_pos_reg, &stored_pos);
4334 if (know_dist && !has_minimum && stored_pos == trace->cp_offset()) {
4335 // If we know we haven't advanced and there is no minimum we
4336 // can just backtrack immediately.
4337 assembler->GoTo(trace->backtrack());
4338 } else if (know_dist && stored_pos < trace->cp_offset()) {
4339 // If we know we've advanced we can generate the continuation
4340 // immediately.
4341 on_success()->Emit(compiler, trace);
4342 } else if (!trace->is_trivial()) {
4343 trace->Flush(compiler, this);
4344 } else {
4345 Label skip_empty_check;
4346 // If we have a minimum number of repetitions we check the current
4347 // number first and skip the empty check if it's not enough.
4348 if (has_minimum) {
4349 int limit = data_.u_empty_match_check.repetition_limit;
4350 assembler->IfRegisterLT(rep_reg, limit, &skip_empty_check);
4351 }
4352 // If the match is empty we bail out, otherwise we fall through
4353 // to the on-success continuation.
4354 assembler->IfRegisterEqPos(data_.u_empty_match_check.start_register,
4355 trace->backtrack());
4356 assembler->Bind(&skip_empty_check);
4357 on_success()->Emit(compiler, trace);
4358 }
4359 break;
4360 }
4361 case POSITIVE_SUBMATCH_SUCCESS: {
4362 if (!trace->is_trivial()) {
4363 trace->Flush(compiler, this);
4364 return;
4365 }
4366 assembler->ReadCurrentPositionFromRegister(
4367 data_.u_submatch.current_position_register);
4368 assembler->ReadStackPointerFromRegister(
4369 data_.u_submatch.stack_pointer_register);
4370 int clear_register_count = data_.u_submatch.clear_register_count;
4371 if (clear_register_count == 0) {
4372 on_success()->Emit(compiler, trace);
4373 return;
4374 }
4375 int clear_registers_from = data_.u_submatch.clear_register_from;
4376 Label clear_registers_backtrack;
4377 Trace new_trace = *trace;
4378 new_trace.set_backtrack(&clear_registers_backtrack);
4379 on_success()->Emit(compiler, &new_trace);
4380
4381 assembler->Bind(&clear_registers_backtrack);
4382 int clear_registers_to = clear_registers_from + clear_register_count - 1;
4383 assembler->ClearRegisters(clear_registers_from, clear_registers_to);
4384
4385 DCHECK(trace->backtrack() == NULL);
4386 assembler->Backtrack();
4387 return;
4388 }
4389 default:
4390 UNREACHABLE();
4391 }
4392 }
4393
4394
Emit(RegExpCompiler * compiler,Trace * trace)4395 void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
4396 RegExpMacroAssembler* assembler = compiler->macro_assembler();
4397 if (!trace->is_trivial()) {
4398 trace->Flush(compiler, this);
4399 return;
4400 }
4401
4402 LimitResult limit_result = LimitVersions(compiler, trace);
4403 if (limit_result == DONE) return;
4404 DCHECK(limit_result == CONTINUE);
4405
4406 RecursionCheck rc(compiler);
4407
4408 DCHECK_EQ(start_reg_ + 1, end_reg_);
4409 if (compiler->ignore_case()) {
4410 assembler->CheckNotBackReferenceIgnoreCase(
4411 start_reg_, read_backward(), compiler->unicode(), trace->backtrack());
4412 } else {
4413 assembler->CheckNotBackReference(start_reg_, read_backward(),
4414 trace->backtrack());
4415 }
4416 // We are going to advance backward, so we may end up at the start.
4417 if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
4418
4419 // Check that the back reference does not end inside a surrogate pair.
4420 if (compiler->unicode() && !compiler->one_byte()) {
4421 assembler->CheckNotInSurrogatePair(trace->cp_offset(), trace->backtrack());
4422 }
4423 on_success()->Emit(compiler, trace);
4424 }
4425
4426
4427 // -------------------------------------------------------------------
4428 // Dot/dotty output
4429
4430
4431 #ifdef DEBUG
4432
4433
4434 class DotPrinter: public NodeVisitor {
4435 public:
DotPrinter(std::ostream & os,bool ignore_case)4436 DotPrinter(std::ostream& os, bool ignore_case) // NOLINT
4437 : os_(os),
4438 ignore_case_(ignore_case) {}
4439 void PrintNode(const char* label, RegExpNode* node);
4440 void Visit(RegExpNode* node);
4441 void PrintAttributes(RegExpNode* from);
4442 void PrintOnFailure(RegExpNode* from, RegExpNode* to);
4443 #define DECLARE_VISIT(Type) \
4444 virtual void Visit##Type(Type##Node* that);
4445 FOR_EACH_NODE_TYPE(DECLARE_VISIT)
4446 #undef DECLARE_VISIT
4447 private:
4448 std::ostream& os_;
4449 bool ignore_case_;
4450 };
4451
4452
PrintNode(const char * label,RegExpNode * node)4453 void DotPrinter::PrintNode(const char* label, RegExpNode* node) {
4454 os_ << "digraph G {\n graph [label=\"";
4455 for (int i = 0; label[i]; i++) {
4456 switch (label[i]) {
4457 case '\\':
4458 os_ << "\\\\";
4459 break;
4460 case '"':
4461 os_ << "\"";
4462 break;
4463 default:
4464 os_ << label[i];
4465 break;
4466 }
4467 }
4468 os_ << "\"];\n";
4469 Visit(node);
4470 os_ << "}" << std::endl;
4471 }
4472
4473
Visit(RegExpNode * node)4474 void DotPrinter::Visit(RegExpNode* node) {
4475 if (node->info()->visited) return;
4476 node->info()->visited = true;
4477 node->Accept(this);
4478 }
4479
4480
PrintOnFailure(RegExpNode * from,RegExpNode * on_failure)4481 void DotPrinter::PrintOnFailure(RegExpNode* from, RegExpNode* on_failure) {
4482 os_ << " n" << from << " -> n" << on_failure << " [style=dotted];\n";
4483 Visit(on_failure);
4484 }
4485
4486
4487 class TableEntryBodyPrinter {
4488 public:
TableEntryBodyPrinter(std::ostream & os,ChoiceNode * choice)4489 TableEntryBodyPrinter(std::ostream& os, ChoiceNode* choice) // NOLINT
4490 : os_(os),
4491 choice_(choice) {}
Call(uc16 from,DispatchTable::Entry entry)4492 void Call(uc16 from, DispatchTable::Entry entry) {
4493 OutSet* out_set = entry.out_set();
4494 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4495 if (out_set->Get(i)) {
4496 os_ << " n" << choice() << ":s" << from << "o" << i << " -> n"
4497 << choice()->alternatives()->at(i).node() << ";\n";
4498 }
4499 }
4500 }
4501 private:
choice()4502 ChoiceNode* choice() { return choice_; }
4503 std::ostream& os_;
4504 ChoiceNode* choice_;
4505 };
4506
4507
4508 class TableEntryHeaderPrinter {
4509 public:
TableEntryHeaderPrinter(std::ostream & os)4510 explicit TableEntryHeaderPrinter(std::ostream& os) // NOLINT
4511 : first_(true),
4512 os_(os) {}
Call(uc16 from,DispatchTable::Entry entry)4513 void Call(uc16 from, DispatchTable::Entry entry) {
4514 if (first_) {
4515 first_ = false;
4516 } else {
4517 os_ << "|";
4518 }
4519 os_ << "{\\" << AsUC16(from) << "-\\" << AsUC16(entry.to()) << "|{";
4520 OutSet* out_set = entry.out_set();
4521 int priority = 0;
4522 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4523 if (out_set->Get(i)) {
4524 if (priority > 0) os_ << "|";
4525 os_ << "<s" << from << "o" << i << "> " << priority;
4526 priority++;
4527 }
4528 }
4529 os_ << "}}";
4530 }
4531
4532 private:
4533 bool first_;
4534 std::ostream& os_;
4535 };
4536
4537
4538 class AttributePrinter {
4539 public:
AttributePrinter(std::ostream & os)4540 explicit AttributePrinter(std::ostream& os) // NOLINT
4541 : os_(os),
4542 first_(true) {}
PrintSeparator()4543 void PrintSeparator() {
4544 if (first_) {
4545 first_ = false;
4546 } else {
4547 os_ << "|";
4548 }
4549 }
PrintBit(const char * name,bool value)4550 void PrintBit(const char* name, bool value) {
4551 if (!value) return;
4552 PrintSeparator();
4553 os_ << "{" << name << "}";
4554 }
PrintPositive(const char * name,int value)4555 void PrintPositive(const char* name, int value) {
4556 if (value < 0) return;
4557 PrintSeparator();
4558 os_ << "{" << name << "|" << value << "}";
4559 }
4560
4561 private:
4562 std::ostream& os_;
4563 bool first_;
4564 };
4565
4566
PrintAttributes(RegExpNode * that)4567 void DotPrinter::PrintAttributes(RegExpNode* that) {
4568 os_ << " a" << that << " [shape=Mrecord, color=grey, fontcolor=grey, "
4569 << "margin=0.1, fontsize=10, label=\"{";
4570 AttributePrinter printer(os_);
4571 NodeInfo* info = that->info();
4572 printer.PrintBit("NI", info->follows_newline_interest);
4573 printer.PrintBit("WI", info->follows_word_interest);
4574 printer.PrintBit("SI", info->follows_start_interest);
4575 Label* label = that->label();
4576 if (label->is_bound())
4577 printer.PrintPositive("@", label->pos());
4578 os_ << "}\"];\n"
4579 << " a" << that << " -> n" << that
4580 << " [style=dashed, color=grey, arrowhead=none];\n";
4581 }
4582
4583
4584 static const bool kPrintDispatchTable = false;
VisitChoice(ChoiceNode * that)4585 void DotPrinter::VisitChoice(ChoiceNode* that) {
4586 if (kPrintDispatchTable) {
4587 os_ << " n" << that << " [shape=Mrecord, label=\"";
4588 TableEntryHeaderPrinter header_printer(os_);
4589 that->GetTable(ignore_case_)->ForEach(&header_printer);
4590 os_ << "\"]\n";
4591 PrintAttributes(that);
4592 TableEntryBodyPrinter body_printer(os_, that);
4593 that->GetTable(ignore_case_)->ForEach(&body_printer);
4594 } else {
4595 os_ << " n" << that << " [shape=Mrecord, label=\"?\"];\n";
4596 for (int i = 0; i < that->alternatives()->length(); i++) {
4597 GuardedAlternative alt = that->alternatives()->at(i);
4598 os_ << " n" << that << " -> n" << alt.node();
4599 }
4600 }
4601 for (int i = 0; i < that->alternatives()->length(); i++) {
4602 GuardedAlternative alt = that->alternatives()->at(i);
4603 alt.node()->Accept(this);
4604 }
4605 }
4606
4607
VisitText(TextNode * that)4608 void DotPrinter::VisitText(TextNode* that) {
4609 Zone* zone = that->zone();
4610 os_ << " n" << that << " [label=\"";
4611 for (int i = 0; i < that->elements()->length(); i++) {
4612 if (i > 0) os_ << " ";
4613 TextElement elm = that->elements()->at(i);
4614 switch (elm.text_type()) {
4615 case TextElement::ATOM: {
4616 Vector<const uc16> data = elm.atom()->data();
4617 for (int i = 0; i < data.length(); i++) {
4618 os_ << static_cast<char>(data[i]);
4619 }
4620 break;
4621 }
4622 case TextElement::CHAR_CLASS: {
4623 RegExpCharacterClass* node = elm.char_class();
4624 os_ << "[";
4625 if (node->is_negated()) os_ << "^";
4626 for (int j = 0; j < node->ranges(zone)->length(); j++) {
4627 CharacterRange range = node->ranges(zone)->at(j);
4628 os_ << AsUC16(range.from()) << "-" << AsUC16(range.to());
4629 }
4630 os_ << "]";
4631 break;
4632 }
4633 default:
4634 UNREACHABLE();
4635 }
4636 }
4637 os_ << "\", shape=box, peripheries=2];\n";
4638 PrintAttributes(that);
4639 os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4640 Visit(that->on_success());
4641 }
4642
4643
VisitBackReference(BackReferenceNode * that)4644 void DotPrinter::VisitBackReference(BackReferenceNode* that) {
4645 os_ << " n" << that << " [label=\"$" << that->start_register() << "..$"
4646 << that->end_register() << "\", shape=doubleoctagon];\n";
4647 PrintAttributes(that);
4648 os_ << " n" << that << " -> n" << that->on_success() << ";\n";
4649 Visit(that->on_success());
4650 }
4651
4652
VisitEnd(EndNode * that)4653 void DotPrinter::VisitEnd(EndNode* that) {
4654 os_ << " n" << that << " [style=bold, shape=point];\n";
4655 PrintAttributes(that);
4656 }
4657
4658
VisitAssertion(AssertionNode * that)4659 void DotPrinter::VisitAssertion(AssertionNode* that) {
4660 os_ << " n" << that << " [";
4661 switch (that->assertion_type()) {
4662 case AssertionNode::AT_END:
4663 os_ << "label=\"$\", shape=septagon";
4664 break;
4665 case AssertionNode::AT_START:
4666 os_ << "label=\"^\", shape=septagon";
4667 break;
4668 case AssertionNode::AT_BOUNDARY:
4669 os_ << "label=\"\\b\", shape=septagon";
4670 break;
4671 case AssertionNode::AT_NON_BOUNDARY:
4672 os_ << "label=\"\\B\", shape=septagon";
4673 break;
4674 case AssertionNode::AFTER_NEWLINE:
4675 os_ << "label=\"(?<=\\n)\", shape=septagon";
4676 break;
4677 }
4678 os_ << "];\n";
4679 PrintAttributes(that);
4680 RegExpNode* successor = that->on_success();
4681 os_ << " n" << that << " -> n" << successor << ";\n";
4682 Visit(successor);
4683 }
4684
4685
VisitAction(ActionNode * that)4686 void DotPrinter::VisitAction(ActionNode* that) {
4687 os_ << " n" << that << " [";
4688 switch (that->action_type_) {
4689 case ActionNode::SET_REGISTER:
4690 os_ << "label=\"$" << that->data_.u_store_register.reg
4691 << ":=" << that->data_.u_store_register.value << "\", shape=octagon";
4692 break;
4693 case ActionNode::INCREMENT_REGISTER:
4694 os_ << "label=\"$" << that->data_.u_increment_register.reg
4695 << "++\", shape=octagon";
4696 break;
4697 case ActionNode::STORE_POSITION:
4698 os_ << "label=\"$" << that->data_.u_position_register.reg
4699 << ":=$pos\", shape=octagon";
4700 break;
4701 case ActionNode::BEGIN_SUBMATCH:
4702 os_ << "label=\"$" << that->data_.u_submatch.current_position_register
4703 << ":=$pos,begin\", shape=septagon";
4704 break;
4705 case ActionNode::POSITIVE_SUBMATCH_SUCCESS:
4706 os_ << "label=\"escape\", shape=septagon";
4707 break;
4708 case ActionNode::EMPTY_MATCH_CHECK:
4709 os_ << "label=\"$" << that->data_.u_empty_match_check.start_register
4710 << "=$pos?,$" << that->data_.u_empty_match_check.repetition_register
4711 << "<" << that->data_.u_empty_match_check.repetition_limit
4712 << "?\", shape=septagon";
4713 break;
4714 case ActionNode::CLEAR_CAPTURES: {
4715 os_ << "label=\"clear $" << that->data_.u_clear_captures.range_from
4716 << " to $" << that->data_.u_clear_captures.range_to
4717 << "\", shape=septagon";
4718 break;
4719 }
4720 }
4721 os_ << "];\n";
4722 PrintAttributes(that);
4723 RegExpNode* successor = that->on_success();
4724 os_ << " n" << that << " -> n" << successor << ";\n";
4725 Visit(successor);
4726 }
4727
4728
4729 class DispatchTableDumper {
4730 public:
DispatchTableDumper(std::ostream & os)4731 explicit DispatchTableDumper(std::ostream& os) : os_(os) {}
4732 void Call(uc16 key, DispatchTable::Entry entry);
4733 private:
4734 std::ostream& os_;
4735 };
4736
4737
Call(uc16 key,DispatchTable::Entry entry)4738 void DispatchTableDumper::Call(uc16 key, DispatchTable::Entry entry) {
4739 os_ << "[" << AsUC16(key) << "-" << AsUC16(entry.to()) << "]: {";
4740 OutSet* set = entry.out_set();
4741 bool first = true;
4742 for (unsigned i = 0; i < OutSet::kFirstLimit; i++) {
4743 if (set->Get(i)) {
4744 if (first) {
4745 first = false;
4746 } else {
4747 os_ << ", ";
4748 }
4749 os_ << i;
4750 }
4751 }
4752 os_ << "}\n";
4753 }
4754
4755
Dump()4756 void DispatchTable::Dump() {
4757 OFStream os(stderr);
4758 DispatchTableDumper dumper(os);
4759 tree()->ForEach(&dumper);
4760 }
4761
4762
DotPrint(const char * label,RegExpNode * node,bool ignore_case)4763 void RegExpEngine::DotPrint(const char* label,
4764 RegExpNode* node,
4765 bool ignore_case) {
4766 OFStream os(stdout);
4767 DotPrinter printer(os, ignore_case);
4768 printer.PrintNode(label, node);
4769 }
4770
4771
4772 #endif // DEBUG
4773
4774
4775 // -------------------------------------------------------------------
4776 // Tree to graph conversion
4777
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4778 RegExpNode* RegExpAtom::ToNode(RegExpCompiler* compiler,
4779 RegExpNode* on_success) {
4780 ZoneList<TextElement>* elms =
4781 new(compiler->zone()) ZoneList<TextElement>(1, compiler->zone());
4782 elms->Add(TextElement::Atom(this), compiler->zone());
4783 return new (compiler->zone())
4784 TextNode(elms, compiler->read_backward(), on_success);
4785 }
4786
4787
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)4788 RegExpNode* RegExpText::ToNode(RegExpCompiler* compiler,
4789 RegExpNode* on_success) {
4790 return new (compiler->zone())
4791 TextNode(elements(), compiler->read_backward(), on_success);
4792 }
4793
4794
CompareInverseRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)4795 static bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
4796 const int* special_class,
4797 int length) {
4798 length--; // Remove final marker.
4799 DCHECK(special_class[length] == kRangeEndMarker);
4800 DCHECK(ranges->length() != 0);
4801 DCHECK(length != 0);
4802 DCHECK(special_class[0] != 0);
4803 if (ranges->length() != (length >> 1) + 1) {
4804 return false;
4805 }
4806 CharacterRange range = ranges->at(0);
4807 if (range.from() != 0) {
4808 return false;
4809 }
4810 for (int i = 0; i < length; i += 2) {
4811 if (special_class[i] != (range.to() + 1)) {
4812 return false;
4813 }
4814 range = ranges->at((i >> 1) + 1);
4815 if (special_class[i+1] != range.from()) {
4816 return false;
4817 }
4818 }
4819 if (range.to() != String::kMaxCodePoint) {
4820 return false;
4821 }
4822 return true;
4823 }
4824
4825
CompareRanges(ZoneList<CharacterRange> * ranges,const int * special_class,int length)4826 static bool CompareRanges(ZoneList<CharacterRange>* ranges,
4827 const int* special_class,
4828 int length) {
4829 length--; // Remove final marker.
4830 DCHECK(special_class[length] == kRangeEndMarker);
4831 if (ranges->length() * 2 != length) {
4832 return false;
4833 }
4834 for (int i = 0; i < length; i += 2) {
4835 CharacterRange range = ranges->at(i >> 1);
4836 if (range.from() != special_class[i] ||
4837 range.to() != special_class[i + 1] - 1) {
4838 return false;
4839 }
4840 }
4841 return true;
4842 }
4843
4844
is_standard(Zone * zone)4845 bool RegExpCharacterClass::is_standard(Zone* zone) {
4846 // TODO(lrn): Remove need for this function, by not throwing away information
4847 // along the way.
4848 if (is_negated_) {
4849 return false;
4850 }
4851 if (set_.is_standard()) {
4852 return true;
4853 }
4854 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4855 set_.set_standard_set_type('s');
4856 return true;
4857 }
4858 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4859 set_.set_standard_set_type('S');
4860 return true;
4861 }
4862 if (CompareInverseRanges(set_.ranges(zone),
4863 kLineTerminatorRanges,
4864 kLineTerminatorRangeCount)) {
4865 set_.set_standard_set_type('.');
4866 return true;
4867 }
4868 if (CompareRanges(set_.ranges(zone),
4869 kLineTerminatorRanges,
4870 kLineTerminatorRangeCount)) {
4871 set_.set_standard_set_type('n');
4872 return true;
4873 }
4874 if (CompareRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4875 set_.set_standard_set_type('w');
4876 return true;
4877 }
4878 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4879 set_.set_standard_set_type('W');
4880 return true;
4881 }
4882 return false;
4883 }
4884
4885
UnicodeRangeSplitter(Zone * zone,ZoneList<CharacterRange> * base)4886 UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
4887 ZoneList<CharacterRange>* base)
4888 : zone_(zone),
4889 table_(zone),
4890 bmp_(nullptr),
4891 lead_surrogates_(nullptr),
4892 trail_surrogates_(nullptr),
4893 non_bmp_(nullptr) {
4894 // The unicode range splitter categorizes given character ranges into:
4895 // - Code points from the BMP representable by one code unit.
4896 // - Code points outside the BMP that need to be split into surrogate pairs.
4897 // - Lone lead surrogates.
4898 // - Lone trail surrogates.
4899 // Lone surrogates are valid code points, even though no actual characters.
4900 // They require special matching to make sure we do not split surrogate pairs.
4901 // We use the dispatch table to accomplish this. The base range is split up
4902 // by the table by the overlay ranges, and the Call callback is used to
4903 // filter and collect ranges for each category.
4904 for (int i = 0; i < base->length(); i++) {
4905 table_.AddRange(base->at(i), kBase, zone_);
4906 }
4907 // Add overlay ranges.
4908 table_.AddRange(CharacterRange::Range(0, kLeadSurrogateStart - 1),
4909 kBmpCodePoints, zone_);
4910 table_.AddRange(CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd),
4911 kLeadSurrogates, zone_);
4912 table_.AddRange(
4913 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
4914 kTrailSurrogates, zone_);
4915 table_.AddRange(
4916 CharacterRange::Range(kTrailSurrogateEnd + 1, kNonBmpStart - 1),
4917 kBmpCodePoints, zone_);
4918 table_.AddRange(CharacterRange::Range(kNonBmpStart, kNonBmpEnd),
4919 kNonBmpCodePoints, zone_);
4920 table_.ForEach(this);
4921 }
4922
4923
Call(uc32 from,DispatchTable::Entry entry)4924 void UnicodeRangeSplitter::Call(uc32 from, DispatchTable::Entry entry) {
4925 OutSet* outset = entry.out_set();
4926 if (!outset->Get(kBase)) return;
4927 ZoneList<CharacterRange>** target = NULL;
4928 if (outset->Get(kBmpCodePoints)) {
4929 target = &bmp_;
4930 } else if (outset->Get(kLeadSurrogates)) {
4931 target = &lead_surrogates_;
4932 } else if (outset->Get(kTrailSurrogates)) {
4933 target = &trail_surrogates_;
4934 } else {
4935 DCHECK(outset->Get(kNonBmpCodePoints));
4936 target = &non_bmp_;
4937 }
4938 if (*target == NULL) *target = new (zone_) ZoneList<CharacterRange>(2, zone_);
4939 (*target)->Add(CharacterRange::Range(entry.from(), entry.to()), zone_);
4940 }
4941
4942
AddBmpCharacters(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)4943 void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode* result,
4944 RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
4945 ZoneList<CharacterRange>* bmp = splitter->bmp();
4946 if (bmp == nullptr) return;
4947 result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
4948 compiler->zone(), bmp, compiler->read_backward(), on_success)));
4949 }
4950
4951
AddNonBmpSurrogatePairs(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)4952 void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode* result,
4953 RegExpNode* on_success,
4954 UnicodeRangeSplitter* splitter) {
4955 ZoneList<CharacterRange>* non_bmp = splitter->non_bmp();
4956 if (non_bmp == nullptr) return;
4957 DCHECK(compiler->unicode());
4958 DCHECK(!compiler->one_byte());
4959 Zone* zone = compiler->zone();
4960 CharacterRange::Canonicalize(non_bmp);
4961 for (int i = 0; i < non_bmp->length(); i++) {
4962 // Match surrogate pair.
4963 // E.g. [\u10005-\u11005] becomes
4964 // \ud800[\udc05-\udfff]|
4965 // [\ud801-\ud803][\udc00-\udfff]|
4966 // \ud804[\udc00-\udc05]
4967 uc32 from = non_bmp->at(i).from();
4968 uc32 to = non_bmp->at(i).to();
4969 uc16 from_l = unibrow::Utf16::LeadSurrogate(from);
4970 uc16 from_t = unibrow::Utf16::TrailSurrogate(from);
4971 uc16 to_l = unibrow::Utf16::LeadSurrogate(to);
4972 uc16 to_t = unibrow::Utf16::TrailSurrogate(to);
4973 if (from_l == to_l) {
4974 // The lead surrogate is the same.
4975 result->AddAlternative(
4976 GuardedAlternative(TextNode::CreateForSurrogatePair(
4977 zone, CharacterRange::Singleton(from_l),
4978 CharacterRange::Range(from_t, to_t), compiler->read_backward(),
4979 on_success)));
4980 } else {
4981 if (from_t != kTrailSurrogateStart) {
4982 // Add [from_l][from_t-\udfff]
4983 result->AddAlternative(
4984 GuardedAlternative(TextNode::CreateForSurrogatePair(
4985 zone, CharacterRange::Singleton(from_l),
4986 CharacterRange::Range(from_t, kTrailSurrogateEnd),
4987 compiler->read_backward(), on_success)));
4988 from_l++;
4989 }
4990 if (to_t != kTrailSurrogateEnd) {
4991 // Add [to_l][\udc00-to_t]
4992 result->AddAlternative(
4993 GuardedAlternative(TextNode::CreateForSurrogatePair(
4994 zone, CharacterRange::Singleton(to_l),
4995 CharacterRange::Range(kTrailSurrogateStart, to_t),
4996 compiler->read_backward(), on_success)));
4997 to_l--;
4998 }
4999 if (from_l <= to_l) {
5000 // Add [from_l-to_l][\udc00-\udfff]
5001 result->AddAlternative(
5002 GuardedAlternative(TextNode::CreateForSurrogatePair(
5003 zone, CharacterRange::Range(from_l, to_l),
5004 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd),
5005 compiler->read_backward(), on_success)));
5006 }
5007 }
5008 }
5009 }
5010
5011
NegativeLookaroundAgainstReadDirectionAndMatch(RegExpCompiler * compiler,ZoneList<CharacterRange> * lookbehind,ZoneList<CharacterRange> * match,RegExpNode * on_success,bool read_backward)5012 RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
5013 RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
5014 ZoneList<CharacterRange>* match, RegExpNode* on_success,
5015 bool read_backward) {
5016 Zone* zone = compiler->zone();
5017 RegExpNode* match_node = TextNode::CreateForCharacterRanges(
5018 zone, match, read_backward, on_success);
5019 int stack_register = compiler->UnicodeLookaroundStackRegister();
5020 int position_register = compiler->UnicodeLookaroundPositionRegister();
5021 RegExpLookaround::Builder lookaround(false, match_node, stack_register,
5022 position_register);
5023 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
5024 zone, lookbehind, !read_backward, lookaround.on_match_success());
5025 return lookaround.ForMatch(negative_match);
5026 }
5027
5028
MatchAndNegativeLookaroundInReadDirection(RegExpCompiler * compiler,ZoneList<CharacterRange> * match,ZoneList<CharacterRange> * lookahead,RegExpNode * on_success,bool read_backward)5029 RegExpNode* MatchAndNegativeLookaroundInReadDirection(
5030 RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
5031 ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
5032 bool read_backward) {
5033 Zone* zone = compiler->zone();
5034 int stack_register = compiler->UnicodeLookaroundStackRegister();
5035 int position_register = compiler->UnicodeLookaroundPositionRegister();
5036 RegExpLookaround::Builder lookaround(false, on_success, stack_register,
5037 position_register);
5038 RegExpNode* negative_match = TextNode::CreateForCharacterRanges(
5039 zone, lookahead, read_backward, lookaround.on_match_success());
5040 return TextNode::CreateForCharacterRanges(
5041 zone, match, read_backward, lookaround.ForMatch(negative_match));
5042 }
5043
5044
AddLoneLeadSurrogates(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)5045 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5046 RegExpNode* on_success,
5047 UnicodeRangeSplitter* splitter) {
5048 ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
5049 if (lead_surrogates == nullptr) return;
5050 Zone* zone = compiler->zone();
5051 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
5052 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
5053 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
5054
5055 RegExpNode* match;
5056 if (compiler->read_backward()) {
5057 // Reading backward. Assert that reading forward, there is no trail
5058 // surrogate, and then backward match the lead surrogate.
5059 match = NegativeLookaroundAgainstReadDirectionAndMatch(
5060 compiler, trail_surrogates, lead_surrogates, on_success, true);
5061 } else {
5062 // Reading forward. Forward match the lead surrogate and assert that
5063 // no trail surrogate follows.
5064 match = MatchAndNegativeLookaroundInReadDirection(
5065 compiler, lead_surrogates, trail_surrogates, on_success, false);
5066 }
5067 result->AddAlternative(GuardedAlternative(match));
5068 }
5069
5070
AddLoneTrailSurrogates(RegExpCompiler * compiler,ChoiceNode * result,RegExpNode * on_success,UnicodeRangeSplitter * splitter)5071 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5072 RegExpNode* on_success,
5073 UnicodeRangeSplitter* splitter) {
5074 ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
5075 if (trail_surrogates == nullptr) return;
5076 Zone* zone = compiler->zone();
5077 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
5078 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
5079 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
5080
5081 RegExpNode* match;
5082 if (compiler->read_backward()) {
5083 // Reading backward. Backward match the trail surrogate and assert that no
5084 // lead surrogate precedes it.
5085 match = MatchAndNegativeLookaroundInReadDirection(
5086 compiler, trail_surrogates, lead_surrogates, on_success, true);
5087 } else {
5088 // Reading forward. Assert that reading backward, there is no lead
5089 // surrogate, and then forward match the trail surrogate.
5090 match = NegativeLookaroundAgainstReadDirectionAndMatch(
5091 compiler, lead_surrogates, trail_surrogates, on_success, false);
5092 }
5093 result->AddAlternative(GuardedAlternative(match));
5094 }
5095
UnanchoredAdvance(RegExpCompiler * compiler,RegExpNode * on_success)5096 RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
5097 RegExpNode* on_success) {
5098 // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
5099 DCHECK(!compiler->read_backward());
5100 Zone* zone = compiler->zone();
5101 // Advance any character. If the character happens to be a lead surrogate and
5102 // we advanced into the middle of a surrogate pair, it will work out, as
5103 // nothing will match from there. We will have to advance again, consuming
5104 // the associated trail surrogate.
5105 ZoneList<CharacterRange>* range = CharacterRange::List(
5106 zone, CharacterRange::Range(0, String::kMaxUtf16CodeUnit));
5107 return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
5108 }
5109
5110
AddUnicodeCaseEquivalents(RegExpCompiler * compiler,ZoneList<CharacterRange> * ranges)5111 void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
5112 ZoneList<CharacterRange>* ranges) {
5113 #ifdef V8_I18N_SUPPORT
5114 // Use ICU to compute the case fold closure over the ranges.
5115 DCHECK(compiler->unicode());
5116 DCHECK(compiler->ignore_case());
5117 USet* set = uset_openEmpty();
5118 for (int i = 0; i < ranges->length(); i++) {
5119 uset_addRange(set, ranges->at(i).from(), ranges->at(i).to());
5120 }
5121 ranges->Clear();
5122 uset_closeOver(set, USET_CASE_INSENSITIVE);
5123 // Full case mapping map single characters to multiple characters.
5124 // Those are represented as strings in the set. Remove them so that
5125 // we end up with only simple and common case mappings.
5126 uset_removeAllStrings(set);
5127 int item_count = uset_getItemCount(set);
5128 int item_result = 0;
5129 UErrorCode ec = U_ZERO_ERROR;
5130 Zone* zone = compiler->zone();
5131 for (int i = 0; i < item_count; i++) {
5132 uc32 start = 0;
5133 uc32 end = 0;
5134 item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
5135 ranges->Add(CharacterRange::Range(start, end), zone);
5136 }
5137 // No errors and everything we collected have been ranges.
5138 DCHECK_EQ(U_ZERO_ERROR, ec);
5139 DCHECK_EQ(0, item_result);
5140 uset_close(set);
5141 #else
5142 // Fallback if ICU is not included.
5143 CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(),
5144 ranges, compiler->one_byte());
5145 #endif // V8_I18N_SUPPORT
5146 CharacterRange::Canonicalize(ranges);
5147 }
5148
5149
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5150 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
5151 RegExpNode* on_success) {
5152 set_.Canonicalize();
5153 Zone* zone = compiler->zone();
5154 ZoneList<CharacterRange>* ranges = this->ranges(zone);
5155 if (compiler->unicode() && compiler->ignore_case()) {
5156 AddUnicodeCaseEquivalents(compiler, ranges);
5157 }
5158 if (compiler->unicode() && !compiler->one_byte()) {
5159 if (is_negated()) {
5160 ZoneList<CharacterRange>* negated =
5161 new (zone) ZoneList<CharacterRange>(2, zone);
5162 CharacterRange::Negate(ranges, negated, zone);
5163 ranges = negated;
5164 }
5165 if (ranges->length() == 0) {
5166 ranges->Add(CharacterRange::Everything(), zone);
5167 RegExpCharacterClass* fail =
5168 new (zone) RegExpCharacterClass(ranges, true);
5169 return new (zone) TextNode(fail, compiler->read_backward(), on_success);
5170 }
5171 if (standard_type() == '*') {
5172 return UnanchoredAdvance(compiler, on_success);
5173 } else {
5174 ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5175 UnicodeRangeSplitter splitter(zone, ranges);
5176 AddBmpCharacters(compiler, result, on_success, &splitter);
5177 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
5178 AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
5179 AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
5180 return result;
5181 }
5182 } else {
5183 return new (zone) TextNode(this, compiler->read_backward(), on_success);
5184 }
5185 }
5186
5187
CompareFirstChar(RegExpTree * const * a,RegExpTree * const * b)5188 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
5189 RegExpAtom* atom1 = (*a)->AsAtom();
5190 RegExpAtom* atom2 = (*b)->AsAtom();
5191 uc16 character1 = atom1->data().at(0);
5192 uc16 character2 = atom2->data().at(0);
5193 if (character1 < character2) return -1;
5194 if (character1 > character2) return 1;
5195 return 0;
5196 }
5197
5198
Canonical(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,unibrow::uchar c)5199 static unibrow::uchar Canonical(
5200 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5201 unibrow::uchar c) {
5202 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
5203 int length = canonicalize->get(c, '\0', chars);
5204 DCHECK_LE(length, 1);
5205 unibrow::uchar canonical = c;
5206 if (length == 1) canonical = chars[0];
5207 return canonical;
5208 }
5209
5210
CompareFirstCharCaseIndependent(unibrow::Mapping<unibrow::Ecma262Canonicalize> * canonicalize,RegExpTree * const * a,RegExpTree * const * b)5211 int CompareFirstCharCaseIndependent(
5212 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
5213 RegExpTree* const* a, RegExpTree* const* b) {
5214 RegExpAtom* atom1 = (*a)->AsAtom();
5215 RegExpAtom* atom2 = (*b)->AsAtom();
5216 unibrow::uchar character1 = atom1->data().at(0);
5217 unibrow::uchar character2 = atom2->data().at(0);
5218 if (character1 == character2) return 0;
5219 if (character1 >= 'a' || character2 >= 'a') {
5220 character1 = Canonical(canonicalize, character1);
5221 character2 = Canonical(canonicalize, character2);
5222 }
5223 return static_cast<int>(character1) - static_cast<int>(character2);
5224 }
5225
5226
5227 // We can stable sort runs of atoms, since the order does not matter if they
5228 // start with different characters.
5229 // Returns true if any consecutive atoms were found.
SortConsecutiveAtoms(RegExpCompiler * compiler)5230 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
5231 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5232 int length = alternatives->length();
5233 bool found_consecutive_atoms = false;
5234 for (int i = 0; i < length; i++) {
5235 while (i < length) {
5236 RegExpTree* alternative = alternatives->at(i);
5237 if (alternative->IsAtom()) break;
5238 i++;
5239 }
5240 // i is length or it is the index of an atom.
5241 if (i == length) break;
5242 int first_atom = i;
5243 i++;
5244 while (i < length) {
5245 RegExpTree* alternative = alternatives->at(i);
5246 if (!alternative->IsAtom()) break;
5247 i++;
5248 }
5249 // Sort atoms to get ones with common prefixes together.
5250 // This step is more tricky if we are in a case-independent regexp,
5251 // because it would change /is|I/ to /I|is/, and order matters when
5252 // the regexp parts don't match only disjoint starting points. To fix
5253 // this we have a version of CompareFirstChar that uses case-
5254 // independent character classes for comparison.
5255 DCHECK_LT(first_atom, alternatives->length());
5256 DCHECK_LE(i, alternatives->length());
5257 DCHECK_LE(first_atom, i);
5258 if (compiler->ignore_case()) {
5259 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5260 compiler->isolate()->regexp_macro_assembler_canonicalize();
5261 auto compare_closure =
5262 [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
5263 return CompareFirstCharCaseIndependent(canonicalize, a, b);
5264 };
5265 alternatives->StableSort(compare_closure, first_atom, i - first_atom);
5266 } else {
5267 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
5268 }
5269 if (i - first_atom > 1) found_consecutive_atoms = true;
5270 }
5271 return found_consecutive_atoms;
5272 }
5273
5274
5275 // Optimizes ab|ac|az to a(?:b|c|d).
RationalizeConsecutiveAtoms(RegExpCompiler * compiler)5276 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
5277 Zone* zone = compiler->zone();
5278 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5279 int length = alternatives->length();
5280
5281 int write_posn = 0;
5282 int i = 0;
5283 while (i < length) {
5284 RegExpTree* alternative = alternatives->at(i);
5285 if (!alternative->IsAtom()) {
5286 alternatives->at(write_posn++) = alternatives->at(i);
5287 i++;
5288 continue;
5289 }
5290 RegExpAtom* atom = alternative->AsAtom();
5291 unibrow::uchar common_prefix = atom->data().at(0);
5292 int first_with_prefix = i;
5293 int prefix_length = atom->length();
5294 i++;
5295 while (i < length) {
5296 alternative = alternatives->at(i);
5297 if (!alternative->IsAtom()) break;
5298 atom = alternative->AsAtom();
5299 unibrow::uchar new_prefix = atom->data().at(0);
5300 if (new_prefix != common_prefix) {
5301 if (!compiler->ignore_case()) break;
5302 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
5303 compiler->isolate()->regexp_macro_assembler_canonicalize();
5304 new_prefix = Canonical(canonicalize, new_prefix);
5305 common_prefix = Canonical(canonicalize, common_prefix);
5306 if (new_prefix != common_prefix) break;
5307 }
5308 prefix_length = Min(prefix_length, atom->length());
5309 i++;
5310 }
5311 if (i > first_with_prefix + 2) {
5312 // Found worthwhile run of alternatives with common prefix of at least one
5313 // character. The sorting function above did not sort on more than one
5314 // character for reasons of correctness, but there may still be a longer
5315 // common prefix if the terms were similar or presorted in the input.
5316 // Find out how long the common prefix is.
5317 int run_length = i - first_with_prefix;
5318 atom = alternatives->at(first_with_prefix)->AsAtom();
5319 for (int j = 1; j < run_length && prefix_length > 1; j++) {
5320 RegExpAtom* old_atom =
5321 alternatives->at(j + first_with_prefix)->AsAtom();
5322 for (int k = 1; k < prefix_length; k++) {
5323 if (atom->data().at(k) != old_atom->data().at(k)) {
5324 prefix_length = k;
5325 break;
5326 }
5327 }
5328 }
5329 RegExpAtom* prefix =
5330 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
5331 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
5332 pair->Add(prefix, zone);
5333 ZoneList<RegExpTree*>* suffixes =
5334 new (zone) ZoneList<RegExpTree*>(run_length, zone);
5335 for (int j = 0; j < run_length; j++) {
5336 RegExpAtom* old_atom =
5337 alternatives->at(j + first_with_prefix)->AsAtom();
5338 int len = old_atom->length();
5339 if (len == prefix_length) {
5340 suffixes->Add(new (zone) RegExpEmpty(), zone);
5341 } else {
5342 RegExpTree* suffix = new (zone) RegExpAtom(
5343 old_atom->data().SubVector(prefix_length, old_atom->length()));
5344 suffixes->Add(suffix, zone);
5345 }
5346 }
5347 pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
5348 alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
5349 } else {
5350 // Just copy any non-worthwhile alternatives.
5351 for (int j = first_with_prefix; j < i; j++) {
5352 alternatives->at(write_posn++) = alternatives->at(j);
5353 }
5354 }
5355 }
5356 alternatives->Rewind(write_posn); // Trim end of array.
5357 }
5358
5359
5360 // Optimizes b|c|z to [bcz].
FixSingleCharacterDisjunctions(RegExpCompiler * compiler)5361 void RegExpDisjunction::FixSingleCharacterDisjunctions(
5362 RegExpCompiler* compiler) {
5363 Zone* zone = compiler->zone();
5364 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5365 int length = alternatives->length();
5366
5367 int write_posn = 0;
5368 int i = 0;
5369 while (i < length) {
5370 RegExpTree* alternative = alternatives->at(i);
5371 if (!alternative->IsAtom()) {
5372 alternatives->at(write_posn++) = alternatives->at(i);
5373 i++;
5374 continue;
5375 }
5376 RegExpAtom* atom = alternative->AsAtom();
5377 if (atom->length() != 1) {
5378 alternatives->at(write_posn++) = alternatives->at(i);
5379 i++;
5380 continue;
5381 }
5382 int first_in_run = i;
5383 i++;
5384 while (i < length) {
5385 alternative = alternatives->at(i);
5386 if (!alternative->IsAtom()) break;
5387 atom = alternative->AsAtom();
5388 if (atom->length() != 1) break;
5389 i++;
5390 }
5391 if (i > first_in_run + 1) {
5392 // Found non-trivial run of single-character alternatives.
5393 int run_length = i - first_in_run;
5394 ZoneList<CharacterRange>* ranges =
5395 new (zone) ZoneList<CharacterRange>(2, zone);
5396 for (int j = 0; j < run_length; j++) {
5397 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
5398 DCHECK_EQ(old_atom->length(), 1);
5399 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
5400 }
5401 alternatives->at(write_posn++) =
5402 new (zone) RegExpCharacterClass(ranges, false);
5403 } else {
5404 // Just copy any trivial alternatives.
5405 for (int j = first_in_run; j < i; j++) {
5406 alternatives->at(write_posn++) = alternatives->at(j);
5407 }
5408 }
5409 }
5410 alternatives->Rewind(write_posn); // Trim end of array.
5411 }
5412
5413
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5414 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
5415 RegExpNode* on_success) {
5416 ZoneList<RegExpTree*>* alternatives = this->alternatives();
5417
5418 if (alternatives->length() > 2) {
5419 bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
5420 if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
5421 FixSingleCharacterDisjunctions(compiler);
5422 if (alternatives->length() == 1) {
5423 return alternatives->at(0)->ToNode(compiler, on_success);
5424 }
5425 }
5426
5427 int length = alternatives->length();
5428
5429 ChoiceNode* result =
5430 new(compiler->zone()) ChoiceNode(length, compiler->zone());
5431 for (int i = 0; i < length; i++) {
5432 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
5433 on_success));
5434 result->AddAlternative(alternative);
5435 }
5436 return result;
5437 }
5438
5439
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5440 RegExpNode* RegExpQuantifier::ToNode(RegExpCompiler* compiler,
5441 RegExpNode* on_success) {
5442 return ToNode(min(),
5443 max(),
5444 is_greedy(),
5445 body(),
5446 compiler,
5447 on_success);
5448 }
5449
5450
5451 // Scoped object to keep track of how much we unroll quantifier loops in the
5452 // regexp graph generator.
5453 class RegExpExpansionLimiter {
5454 public:
5455 static const int kMaxExpansionFactor = 6;
RegExpExpansionLimiter(RegExpCompiler * compiler,int factor)5456 RegExpExpansionLimiter(RegExpCompiler* compiler, int factor)
5457 : compiler_(compiler),
5458 saved_expansion_factor_(compiler->current_expansion_factor()),
5459 ok_to_expand_(saved_expansion_factor_ <= kMaxExpansionFactor) {
5460 DCHECK(factor > 0);
5461 if (ok_to_expand_) {
5462 if (factor > kMaxExpansionFactor) {
5463 // Avoid integer overflow of the current expansion factor.
5464 ok_to_expand_ = false;
5465 compiler->set_current_expansion_factor(kMaxExpansionFactor + 1);
5466 } else {
5467 int new_factor = saved_expansion_factor_ * factor;
5468 ok_to_expand_ = (new_factor <= kMaxExpansionFactor);
5469 compiler->set_current_expansion_factor(new_factor);
5470 }
5471 }
5472 }
5473
~RegExpExpansionLimiter()5474 ~RegExpExpansionLimiter() {
5475 compiler_->set_current_expansion_factor(saved_expansion_factor_);
5476 }
5477
ok_to_expand()5478 bool ok_to_expand() { return ok_to_expand_; }
5479
5480 private:
5481 RegExpCompiler* compiler_;
5482 int saved_expansion_factor_;
5483 bool ok_to_expand_;
5484
5485 DISALLOW_IMPLICIT_CONSTRUCTORS(RegExpExpansionLimiter);
5486 };
5487
5488
ToNode(int min,int max,bool is_greedy,RegExpTree * body,RegExpCompiler * compiler,RegExpNode * on_success,bool not_at_start)5489 RegExpNode* RegExpQuantifier::ToNode(int min,
5490 int max,
5491 bool is_greedy,
5492 RegExpTree* body,
5493 RegExpCompiler* compiler,
5494 RegExpNode* on_success,
5495 bool not_at_start) {
5496 // x{f, t} becomes this:
5497 //
5498 // (r++)<-.
5499 // | `
5500 // | (x)
5501 // v ^
5502 // (r=0)-->(?)---/ [if r < t]
5503 // |
5504 // [if r >= f] \----> ...
5505 //
5506
5507 // 15.10.2.5 RepeatMatcher algorithm.
5508 // The parser has already eliminated the case where max is 0. In the case
5509 // where max_match is zero the parser has removed the quantifier if min was
5510 // > 0 and removed the atom if min was 0. See AddQuantifierToAtom.
5511
5512 // If we know that we cannot match zero length then things are a little
5513 // simpler since we don't need to make the special zero length match check
5514 // from step 2.1. If the min and max are small we can unroll a little in
5515 // this case.
5516 static const int kMaxUnrolledMinMatches = 3; // Unroll (foo)+ and (foo){3,}
5517 static const int kMaxUnrolledMaxMatches = 3; // Unroll (foo)? and (foo){x,3}
5518 if (max == 0) return on_success; // This can happen due to recursion.
5519 bool body_can_be_empty = (body->min_match() == 0);
5520 int body_start_reg = RegExpCompiler::kNoRegister;
5521 Interval capture_registers = body->CaptureRegisters();
5522 bool needs_capture_clearing = !capture_registers.is_empty();
5523 Zone* zone = compiler->zone();
5524
5525 if (body_can_be_empty) {
5526 body_start_reg = compiler->AllocateRegister();
5527 } else if (compiler->optimize() && !needs_capture_clearing) {
5528 // Only unroll if there are no captures and the body can't be
5529 // empty.
5530 {
5531 RegExpExpansionLimiter limiter(
5532 compiler, min + ((max != min) ? 1 : 0));
5533 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
5534 int new_max = (max == kInfinity) ? max : max - min;
5535 // Recurse once to get the loop or optional matches after the fixed
5536 // ones.
5537 RegExpNode* answer = ToNode(
5538 0, new_max, is_greedy, body, compiler, on_success, true);
5539 // Unroll the forced matches from 0 to min. This can cause chains of
5540 // TextNodes (which the parser does not generate). These should be
5541 // combined if it turns out they hinder good code generation.
5542 for (int i = 0; i < min; i++) {
5543 answer = body->ToNode(compiler, answer);
5544 }
5545 return answer;
5546 }
5547 }
5548 if (max <= kMaxUnrolledMaxMatches && min == 0) {
5549 DCHECK(max > 0); // Due to the 'if' above.
5550 RegExpExpansionLimiter limiter(compiler, max);
5551 if (limiter.ok_to_expand()) {
5552 // Unroll the optional matches up to max.
5553 RegExpNode* answer = on_success;
5554 for (int i = 0; i < max; i++) {
5555 ChoiceNode* alternation = new(zone) ChoiceNode(2, zone);
5556 if (is_greedy) {
5557 alternation->AddAlternative(
5558 GuardedAlternative(body->ToNode(compiler, answer)));
5559 alternation->AddAlternative(GuardedAlternative(on_success));
5560 } else {
5561 alternation->AddAlternative(GuardedAlternative(on_success));
5562 alternation->AddAlternative(
5563 GuardedAlternative(body->ToNode(compiler, answer)));
5564 }
5565 answer = alternation;
5566 if (not_at_start && !compiler->read_backward()) {
5567 alternation->set_not_at_start();
5568 }
5569 }
5570 return answer;
5571 }
5572 }
5573 }
5574 bool has_min = min > 0;
5575 bool has_max = max < RegExpTree::kInfinity;
5576 bool needs_counter = has_min || has_max;
5577 int reg_ctr = needs_counter
5578 ? compiler->AllocateRegister()
5579 : RegExpCompiler::kNoRegister;
5580 LoopChoiceNode* center = new (zone)
5581 LoopChoiceNode(body->min_match() == 0, compiler->read_backward(), zone);
5582 if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
5583 RegExpNode* loop_return = needs_counter
5584 ? static_cast<RegExpNode*>(ActionNode::IncrementRegister(reg_ctr, center))
5585 : static_cast<RegExpNode*>(center);
5586 if (body_can_be_empty) {
5587 // If the body can be empty we need to check if it was and then
5588 // backtrack.
5589 loop_return = ActionNode::EmptyMatchCheck(body_start_reg,
5590 reg_ctr,
5591 min,
5592 loop_return);
5593 }
5594 RegExpNode* body_node = body->ToNode(compiler, loop_return);
5595 if (body_can_be_empty) {
5596 // If the body can be empty we need to store the start position
5597 // so we can bail out if it was empty.
5598 body_node = ActionNode::StorePosition(body_start_reg, false, body_node);
5599 }
5600 if (needs_capture_clearing) {
5601 // Before entering the body of this loop we need to clear captures.
5602 body_node = ActionNode::ClearCaptures(capture_registers, body_node);
5603 }
5604 GuardedAlternative body_alt(body_node);
5605 if (has_max) {
5606 Guard* body_guard =
5607 new(zone) Guard(reg_ctr, Guard::LT, max);
5608 body_alt.AddGuard(body_guard, zone);
5609 }
5610 GuardedAlternative rest_alt(on_success);
5611 if (has_min) {
5612 Guard* rest_guard = new(compiler->zone()) Guard(reg_ctr, Guard::GEQ, min);
5613 rest_alt.AddGuard(rest_guard, zone);
5614 }
5615 if (is_greedy) {
5616 center->AddLoopAlternative(body_alt);
5617 center->AddContinueAlternative(rest_alt);
5618 } else {
5619 center->AddContinueAlternative(rest_alt);
5620 center->AddLoopAlternative(body_alt);
5621 }
5622 if (needs_counter) {
5623 return ActionNode::SetRegister(reg_ctr, 0, center);
5624 } else {
5625 return center;
5626 }
5627 }
5628
5629
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5630 RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
5631 RegExpNode* on_success) {
5632 NodeInfo info;
5633 Zone* zone = compiler->zone();
5634
5635 switch (assertion_type()) {
5636 case START_OF_LINE:
5637 return AssertionNode::AfterNewline(on_success);
5638 case START_OF_INPUT:
5639 return AssertionNode::AtStart(on_success);
5640 case BOUNDARY:
5641 return AssertionNode::AtBoundary(on_success);
5642 case NON_BOUNDARY:
5643 return AssertionNode::AtNonBoundary(on_success);
5644 case END_OF_INPUT:
5645 return AssertionNode::AtEnd(on_success);
5646 case END_OF_LINE: {
5647 // Compile $ in multiline regexps as an alternation with a positive
5648 // lookahead in one side and an end-of-input on the other side.
5649 // We need two registers for the lookahead.
5650 int stack_pointer_register = compiler->AllocateRegister();
5651 int position_register = compiler->AllocateRegister();
5652 // The ChoiceNode to distinguish between a newline and end-of-input.
5653 ChoiceNode* result = new(zone) ChoiceNode(2, zone);
5654 // Create a newline atom.
5655 ZoneList<CharacterRange>* newline_ranges =
5656 new(zone) ZoneList<CharacterRange>(3, zone);
5657 CharacterRange::AddClassEscape('n', newline_ranges, zone);
5658 RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');
5659 TextNode* newline_matcher = new (zone) TextNode(
5660 newline_atom, false, ActionNode::PositiveSubmatchSuccess(
5661 stack_pointer_register, position_register,
5662 0, // No captures inside.
5663 -1, // Ignored if no captures.
5664 on_success));
5665 // Create an end-of-input matcher.
5666 RegExpNode* end_of_line = ActionNode::BeginSubmatch(
5667 stack_pointer_register,
5668 position_register,
5669 newline_matcher);
5670 // Add the two alternatives to the ChoiceNode.
5671 GuardedAlternative eol_alternative(end_of_line);
5672 result->AddAlternative(eol_alternative);
5673 GuardedAlternative end_alternative(AssertionNode::AtEnd(on_success));
5674 result->AddAlternative(end_alternative);
5675 return result;
5676 }
5677 default:
5678 UNREACHABLE();
5679 }
5680 return on_success;
5681 }
5682
5683
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5684 RegExpNode* RegExpBackReference::ToNode(RegExpCompiler* compiler,
5685 RegExpNode* on_success) {
5686 return new (compiler->zone())
5687 BackReferenceNode(RegExpCapture::StartRegister(index()),
5688 RegExpCapture::EndRegister(index()),
5689 compiler->read_backward(), on_success);
5690 }
5691
5692
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5693 RegExpNode* RegExpEmpty::ToNode(RegExpCompiler* compiler,
5694 RegExpNode* on_success) {
5695 return on_success;
5696 }
5697
5698
Builder(bool is_positive,RegExpNode * on_success,int stack_pointer_register,int position_register,int capture_register_count,int capture_register_start)5699 RegExpLookaround::Builder::Builder(bool is_positive, RegExpNode* on_success,
5700 int stack_pointer_register,
5701 int position_register,
5702 int capture_register_count,
5703 int capture_register_start)
5704 : is_positive_(is_positive),
5705 on_success_(on_success),
5706 stack_pointer_register_(stack_pointer_register),
5707 position_register_(position_register) {
5708 if (is_positive_) {
5709 on_match_success_ = ActionNode::PositiveSubmatchSuccess(
5710 stack_pointer_register, position_register, capture_register_count,
5711 capture_register_start, on_success_);
5712 } else {
5713 Zone* zone = on_success_->zone();
5714 on_match_success_ = new (zone) NegativeSubmatchSuccess(
5715 stack_pointer_register, position_register, capture_register_count,
5716 capture_register_start, zone);
5717 }
5718 }
5719
5720
ForMatch(RegExpNode * match)5721 RegExpNode* RegExpLookaround::Builder::ForMatch(RegExpNode* match) {
5722 if (is_positive_) {
5723 return ActionNode::BeginSubmatch(stack_pointer_register_,
5724 position_register_, match);
5725 } else {
5726 Zone* zone = on_success_->zone();
5727 // We use a ChoiceNode to represent the negative lookaround. The first
5728 // alternative is the negative match. On success, the end node backtracks.
5729 // On failure, the second alternative is tried and leads to success.
5730 // NegativeLookaheadChoiceNode is a special ChoiceNode that ignores the
5731 // first exit when calculating quick checks.
5732 ChoiceNode* choice_node = new (zone) NegativeLookaroundChoiceNode(
5733 GuardedAlternative(match), GuardedAlternative(on_success_), zone);
5734 return ActionNode::BeginSubmatch(stack_pointer_register_,
5735 position_register_, choice_node);
5736 }
5737 }
5738
5739
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5740 RegExpNode* RegExpLookaround::ToNode(RegExpCompiler* compiler,
5741 RegExpNode* on_success) {
5742 int stack_pointer_register = compiler->AllocateRegister();
5743 int position_register = compiler->AllocateRegister();
5744
5745 const int registers_per_capture = 2;
5746 const int register_of_first_capture = 2;
5747 int register_count = capture_count_ * registers_per_capture;
5748 int register_start =
5749 register_of_first_capture + capture_from_ * registers_per_capture;
5750
5751 RegExpNode* result;
5752 bool was_reading_backward = compiler->read_backward();
5753 compiler->set_read_backward(type() == LOOKBEHIND);
5754 Builder builder(is_positive(), on_success, stack_pointer_register,
5755 position_register, register_count, register_start);
5756 RegExpNode* match = body_->ToNode(compiler, builder.on_match_success());
5757 result = builder.ForMatch(match);
5758 compiler->set_read_backward(was_reading_backward);
5759 return result;
5760 }
5761
5762
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5763 RegExpNode* RegExpCapture::ToNode(RegExpCompiler* compiler,
5764 RegExpNode* on_success) {
5765 return ToNode(body(), index(), compiler, on_success);
5766 }
5767
5768
ToNode(RegExpTree * body,int index,RegExpCompiler * compiler,RegExpNode * on_success)5769 RegExpNode* RegExpCapture::ToNode(RegExpTree* body,
5770 int index,
5771 RegExpCompiler* compiler,
5772 RegExpNode* on_success) {
5773 DCHECK_NOT_NULL(body);
5774 int start_reg = RegExpCapture::StartRegister(index);
5775 int end_reg = RegExpCapture::EndRegister(index);
5776 if (compiler->read_backward()) std::swap(start_reg, end_reg);
5777 RegExpNode* store_end = ActionNode::StorePosition(end_reg, true, on_success);
5778 RegExpNode* body_node = body->ToNode(compiler, store_end);
5779 return ActionNode::StorePosition(start_reg, true, body_node);
5780 }
5781
5782
ToNode(RegExpCompiler * compiler,RegExpNode * on_success)5783 RegExpNode* RegExpAlternative::ToNode(RegExpCompiler* compiler,
5784 RegExpNode* on_success) {
5785 ZoneList<RegExpTree*>* children = nodes();
5786 RegExpNode* current = on_success;
5787 if (compiler->read_backward()) {
5788 for (int i = 0; i < children->length(); i++) {
5789 current = children->at(i)->ToNode(compiler, current);
5790 }
5791 } else {
5792 for (int i = children->length() - 1; i >= 0; i--) {
5793 current = children->at(i)->ToNode(compiler, current);
5794 }
5795 }
5796 return current;
5797 }
5798
5799
AddClass(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)5800 static void AddClass(const int* elmv,
5801 int elmc,
5802 ZoneList<CharacterRange>* ranges,
5803 Zone* zone) {
5804 elmc--;
5805 DCHECK(elmv[elmc] == kRangeEndMarker);
5806 for (int i = 0; i < elmc; i += 2) {
5807 DCHECK(elmv[i] < elmv[i + 1]);
5808 ranges->Add(CharacterRange::Range(elmv[i], elmv[i + 1] - 1), zone);
5809 }
5810 }
5811
5812
AddClassNegated(const int * elmv,int elmc,ZoneList<CharacterRange> * ranges,Zone * zone)5813 static void AddClassNegated(const int *elmv,
5814 int elmc,
5815 ZoneList<CharacterRange>* ranges,
5816 Zone* zone) {
5817 elmc--;
5818 DCHECK(elmv[elmc] == kRangeEndMarker);
5819 DCHECK(elmv[0] != 0x0000);
5820 DCHECK(elmv[elmc - 1] != String::kMaxCodePoint);
5821 uc16 last = 0x0000;
5822 for (int i = 0; i < elmc; i += 2) {
5823 DCHECK(last <= elmv[i] - 1);
5824 DCHECK(elmv[i] < elmv[i + 1]);
5825 ranges->Add(CharacterRange::Range(last, elmv[i] - 1), zone);
5826 last = elmv[i + 1];
5827 }
5828 ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
5829 }
5830
5831
AddClassEscape(uc16 type,ZoneList<CharacterRange> * ranges,Zone * zone)5832 void CharacterRange::AddClassEscape(uc16 type,
5833 ZoneList<CharacterRange>* ranges,
5834 Zone* zone) {
5835 switch (type) {
5836 case 's':
5837 AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5838 break;
5839 case 'S':
5840 AddClassNegated(kSpaceRanges, kSpaceRangeCount, ranges, zone);
5841 break;
5842 case 'w':
5843 AddClass(kWordRanges, kWordRangeCount, ranges, zone);
5844 break;
5845 case 'W':
5846 AddClassNegated(kWordRanges, kWordRangeCount, ranges, zone);
5847 break;
5848 case 'd':
5849 AddClass(kDigitRanges, kDigitRangeCount, ranges, zone);
5850 break;
5851 case 'D':
5852 AddClassNegated(kDigitRanges, kDigitRangeCount, ranges, zone);
5853 break;
5854 case '.':
5855 AddClassNegated(kLineTerminatorRanges,
5856 kLineTerminatorRangeCount,
5857 ranges,
5858 zone);
5859 break;
5860 // This is not a character range as defined by the spec but a
5861 // convenient shorthand for a character class that matches any
5862 // character.
5863 case '*':
5864 ranges->Add(CharacterRange::Everything(), zone);
5865 break;
5866 // This is the set of characters matched by the $ and ^ symbols
5867 // in multiline mode.
5868 case 'n':
5869 AddClass(kLineTerminatorRanges,
5870 kLineTerminatorRangeCount,
5871 ranges,
5872 zone);
5873 break;
5874 default:
5875 UNREACHABLE();
5876 }
5877 }
5878
5879
GetWordBounds()5880 Vector<const int> CharacterRange::GetWordBounds() {
5881 return Vector<const int>(kWordRanges, kWordRangeCount - 1);
5882 }
5883
5884
AddCaseEquivalents(Isolate * isolate,Zone * zone,ZoneList<CharacterRange> * ranges,bool is_one_byte)5885 void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone,
5886 ZoneList<CharacterRange>* ranges,
5887 bool is_one_byte) {
5888 CharacterRange::Canonicalize(ranges);
5889 int range_count = ranges->length();
5890 for (int i = 0; i < range_count; i++) {
5891 CharacterRange range = ranges->at(i);
5892 uc32 bottom = range.from();
5893 if (bottom > String::kMaxUtf16CodeUnit) return;
5894 uc32 top = Min(range.to(), String::kMaxUtf16CodeUnit);
5895 // Nothing to be done for surrogates.
5896 if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return;
5897 if (is_one_byte && !RangeContainsLatin1Equivalents(range)) {
5898 if (bottom > String::kMaxOneByteCharCode) return;
5899 if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode;
5900 }
5901 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5902 if (top == bottom) {
5903 // If this is a singleton we just expand the one character.
5904 int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars);
5905 for (int i = 0; i < length; i++) {
5906 uc32 chr = chars[i];
5907 if (chr != bottom) {
5908 ranges->Add(CharacterRange::Singleton(chars[i]), zone);
5909 }
5910 }
5911 } else {
5912 // If this is a range we expand the characters block by block, expanding
5913 // contiguous subranges (blocks) one at a time. The approach is as
5914 // follows. For a given start character we look up the remainder of the
5915 // block that contains it (represented by the end point), for instance we
5916 // find 'z' if the character is 'c'. A block is characterized by the
5917 // property that all characters uncanonicalize in the same way, except
5918 // that each entry in the result is incremented by the distance from the
5919 // first element. So a-z is a block because 'a' uncanonicalizes to ['a',
5920 // 'A'] and the k'th letter uncanonicalizes to ['a' + k, 'A' + k]. Once
5921 // we've found the end point we look up its uncanonicalization and
5922 // produce a range for each element. For instance for [c-f] we look up
5923 // ['z', 'Z'] and produce [c-f] and [C-F]. We then only add a range if
5924 // it is not already contained in the input, so [c-f] will be skipped but
5925 // [C-F] will be added. If this range is not completely contained in a
5926 // block we do this for all the blocks covered by the range (handling
5927 // characters that is not in a block as a "singleton block").
5928 unibrow::uchar equivalents[unibrow::Ecma262UnCanonicalize::kMaxWidth];
5929 int pos = bottom;
5930 while (pos <= top) {
5931 int length =
5932 isolate->jsregexp_canonrange()->get(pos, '\0', equivalents);
5933 uc32 block_end;
5934 if (length == 0) {
5935 block_end = pos;
5936 } else {
5937 DCHECK_EQ(1, length);
5938 block_end = equivalents[0];
5939 }
5940 int end = (block_end > top) ? top : block_end;
5941 length = isolate->jsregexp_uncanonicalize()->get(block_end, '\0',
5942 equivalents);
5943 for (int i = 0; i < length; i++) {
5944 uc32 c = equivalents[i];
5945 uc32 range_from = c - (block_end - pos);
5946 uc32 range_to = c - (block_end - end);
5947 if (!(bottom <= range_from && range_to <= top)) {
5948 ranges->Add(CharacterRange::Range(range_from, range_to), zone);
5949 }
5950 }
5951 pos = end + 1;
5952 }
5953 }
5954 }
5955 }
5956
5957
IsCanonical(ZoneList<CharacterRange> * ranges)5958 bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
5959 DCHECK_NOT_NULL(ranges);
5960 int n = ranges->length();
5961 if (n <= 1) return true;
5962 int max = ranges->at(0).to();
5963 for (int i = 1; i < n; i++) {
5964 CharacterRange next_range = ranges->at(i);
5965 if (next_range.from() <= max + 1) return false;
5966 max = next_range.to();
5967 }
5968 return true;
5969 }
5970
5971
ranges(Zone * zone)5972 ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
5973 if (ranges_ == NULL) {
5974 ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
5975 CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
5976 }
5977 return ranges_;
5978 }
5979
5980
5981 // Move a number of elements in a zonelist to another position
5982 // in the same list. Handles overlapping source and target areas.
MoveRanges(ZoneList<CharacterRange> * list,int from,int to,int count)5983 static void MoveRanges(ZoneList<CharacterRange>* list,
5984 int from,
5985 int to,
5986 int count) {
5987 // Ranges are potentially overlapping.
5988 if (from < to) {
5989 for (int i = count - 1; i >= 0; i--) {
5990 list->at(to + i) = list->at(from + i);
5991 }
5992 } else {
5993 for (int i = 0; i < count; i++) {
5994 list->at(to + i) = list->at(from + i);
5995 }
5996 }
5997 }
5998
5999
InsertRangeInCanonicalList(ZoneList<CharacterRange> * list,int count,CharacterRange insert)6000 static int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
6001 int count,
6002 CharacterRange insert) {
6003 // Inserts a range into list[0..count[, which must be sorted
6004 // by from value and non-overlapping and non-adjacent, using at most
6005 // list[0..count] for the result. Returns the number of resulting
6006 // canonicalized ranges. Inserting a range may collapse existing ranges into
6007 // fewer ranges, so the return value can be anything in the range 1..count+1.
6008 uc32 from = insert.from();
6009 uc32 to = insert.to();
6010 int start_pos = 0;
6011 int end_pos = count;
6012 for (int i = count - 1; i >= 0; i--) {
6013 CharacterRange current = list->at(i);
6014 if (current.from() > to + 1) {
6015 end_pos = i;
6016 } else if (current.to() + 1 < from) {
6017 start_pos = i + 1;
6018 break;
6019 }
6020 }
6021
6022 // Inserted range overlaps, or is adjacent to, ranges at positions
6023 // [start_pos..end_pos[. Ranges before start_pos or at or after end_pos are
6024 // not affected by the insertion.
6025 // If start_pos == end_pos, the range must be inserted before start_pos.
6026 // if start_pos < end_pos, the entire range from start_pos to end_pos
6027 // must be merged with the insert range.
6028
6029 if (start_pos == end_pos) {
6030 // Insert between existing ranges at position start_pos.
6031 if (start_pos < count) {
6032 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
6033 }
6034 list->at(start_pos) = insert;
6035 return count + 1;
6036 }
6037 if (start_pos + 1 == end_pos) {
6038 // Replace single existing range at position start_pos.
6039 CharacterRange to_replace = list->at(start_pos);
6040 int new_from = Min(to_replace.from(), from);
6041 int new_to = Max(to_replace.to(), to);
6042 list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6043 return count;
6044 }
6045 // Replace a number of existing ranges from start_pos to end_pos - 1.
6046 // Move the remaining ranges down.
6047
6048 int new_from = Min(list->at(start_pos).from(), from);
6049 int new_to = Max(list->at(end_pos - 1).to(), to);
6050 if (end_pos < count) {
6051 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
6052 }
6053 list->at(start_pos) = CharacterRange::Range(new_from, new_to);
6054 return count - (end_pos - start_pos) + 1;
6055 }
6056
6057
Canonicalize()6058 void CharacterSet::Canonicalize() {
6059 // Special/default classes are always considered canonical. The result
6060 // of calling ranges() will be sorted.
6061 if (ranges_ == NULL) return;
6062 CharacterRange::Canonicalize(ranges_);
6063 }
6064
6065
Canonicalize(ZoneList<CharacterRange> * character_ranges)6066 void CharacterRange::Canonicalize(ZoneList<CharacterRange>* character_ranges) {
6067 if (character_ranges->length() <= 1) return;
6068 // Check whether ranges are already canonical (increasing, non-overlapping,
6069 // non-adjacent).
6070 int n = character_ranges->length();
6071 int max = character_ranges->at(0).to();
6072 int i = 1;
6073 while (i < n) {
6074 CharacterRange current = character_ranges->at(i);
6075 if (current.from() <= max + 1) {
6076 break;
6077 }
6078 max = current.to();
6079 i++;
6080 }
6081 // Canonical until the i'th range. If that's all of them, we are done.
6082 if (i == n) return;
6083
6084 // The ranges at index i and forward are not canonicalized. Make them so by
6085 // doing the equivalent of insertion sort (inserting each into the previous
6086 // list, in order).
6087 // Notice that inserting a range can reduce the number of ranges in the
6088 // result due to combining of adjacent and overlapping ranges.
6089 int read = i; // Range to insert.
6090 int num_canonical = i; // Length of canonicalized part of list.
6091 do {
6092 num_canonical = InsertRangeInCanonicalList(character_ranges,
6093 num_canonical,
6094 character_ranges->at(read));
6095 read++;
6096 } while (read < n);
6097 character_ranges->Rewind(num_canonical);
6098
6099 DCHECK(CharacterRange::IsCanonical(character_ranges));
6100 }
6101
6102
Negate(ZoneList<CharacterRange> * ranges,ZoneList<CharacterRange> * negated_ranges,Zone * zone)6103 void CharacterRange::Negate(ZoneList<CharacterRange>* ranges,
6104 ZoneList<CharacterRange>* negated_ranges,
6105 Zone* zone) {
6106 DCHECK(CharacterRange::IsCanonical(ranges));
6107 DCHECK_EQ(0, negated_ranges->length());
6108 int range_count = ranges->length();
6109 uc32 from = 0;
6110 int i = 0;
6111 if (range_count > 0 && ranges->at(0).from() == 0) {
6112 from = ranges->at(0).to() + 1;
6113 i = 1;
6114 }
6115 while (i < range_count) {
6116 CharacterRange range = ranges->at(i);
6117 negated_ranges->Add(CharacterRange::Range(from, range.from() - 1), zone);
6118 from = range.to() + 1;
6119 i++;
6120 }
6121 if (from < String::kMaxCodePoint) {
6122 negated_ranges->Add(CharacterRange::Range(from, String::kMaxCodePoint),
6123 zone);
6124 }
6125 }
6126
6127
6128 // -------------------------------------------------------------------
6129 // Splay tree
6130
6131
Extend(unsigned value,Zone * zone)6132 OutSet* OutSet::Extend(unsigned value, Zone* zone) {
6133 if (Get(value))
6134 return this;
6135 if (successors(zone) != NULL) {
6136 for (int i = 0; i < successors(zone)->length(); i++) {
6137 OutSet* successor = successors(zone)->at(i);
6138 if (successor->Get(value))
6139 return successor;
6140 }
6141 } else {
6142 successors_ = new(zone) ZoneList<OutSet*>(2, zone);
6143 }
6144 OutSet* result = new(zone) OutSet(first_, remaining_);
6145 result->Set(value, zone);
6146 successors(zone)->Add(result, zone);
6147 return result;
6148 }
6149
6150
Set(unsigned value,Zone * zone)6151 void OutSet::Set(unsigned value, Zone *zone) {
6152 if (value < kFirstLimit) {
6153 first_ |= (1 << value);
6154 } else {
6155 if (remaining_ == NULL)
6156 remaining_ = new(zone) ZoneList<unsigned>(1, zone);
6157 if (remaining_->is_empty() || !remaining_->Contains(value))
6158 remaining_->Add(value, zone);
6159 }
6160 }
6161
6162
Get(unsigned value) const6163 bool OutSet::Get(unsigned value) const {
6164 if (value < kFirstLimit) {
6165 return (first_ & (1 << value)) != 0;
6166 } else if (remaining_ == NULL) {
6167 return false;
6168 } else {
6169 return remaining_->Contains(value);
6170 }
6171 }
6172
6173
6174 const uc32 DispatchTable::Config::kNoKey = unibrow::Utf8::kBadChar;
6175
6176
AddRange(CharacterRange full_range,int value,Zone * zone)6177 void DispatchTable::AddRange(CharacterRange full_range, int value,
6178 Zone* zone) {
6179 CharacterRange current = full_range;
6180 if (tree()->is_empty()) {
6181 // If this is the first range we just insert into the table.
6182 ZoneSplayTree<Config>::Locator loc;
6183 bool inserted = tree()->Insert(current.from(), &loc);
6184 DCHECK(inserted);
6185 USE(inserted);
6186 loc.set_value(Entry(current.from(), current.to(),
6187 empty()->Extend(value, zone)));
6188 return;
6189 }
6190 // First see if there is a range to the left of this one that
6191 // overlaps.
6192 ZoneSplayTree<Config>::Locator loc;
6193 if (tree()->FindGreatestLessThan(current.from(), &loc)) {
6194 Entry* entry = &loc.value();
6195 // If we've found a range that overlaps with this one, and it
6196 // starts strictly to the left of this one, we have to fix it
6197 // because the following code only handles ranges that start on
6198 // or after the start point of the range we're adding.
6199 if (entry->from() < current.from() && entry->to() >= current.from()) {
6200 // Snap the overlapping range in half around the start point of
6201 // the range we're adding.
6202 CharacterRange left =
6203 CharacterRange::Range(entry->from(), current.from() - 1);
6204 CharacterRange right = CharacterRange::Range(current.from(), entry->to());
6205 // The left part of the overlapping range doesn't overlap.
6206 // Truncate the whole entry to be just the left part.
6207 entry->set_to(left.to());
6208 // The right part is the one that overlaps. We add this part
6209 // to the map and let the next step deal with merging it with
6210 // the range we're adding.
6211 ZoneSplayTree<Config>::Locator loc;
6212 bool inserted = tree()->Insert(right.from(), &loc);
6213 DCHECK(inserted);
6214 USE(inserted);
6215 loc.set_value(Entry(right.from(),
6216 right.to(),
6217 entry->out_set()));
6218 }
6219 }
6220 while (current.is_valid()) {
6221 if (tree()->FindLeastGreaterThan(current.from(), &loc) &&
6222 (loc.value().from() <= current.to()) &&
6223 (loc.value().to() >= current.from())) {
6224 Entry* entry = &loc.value();
6225 // We have overlap. If there is space between the start point of
6226 // the range we're adding and where the overlapping range starts
6227 // then we have to add a range covering just that space.
6228 if (current.from() < entry->from()) {
6229 ZoneSplayTree<Config>::Locator ins;
6230 bool inserted = tree()->Insert(current.from(), &ins);
6231 DCHECK(inserted);
6232 USE(inserted);
6233 ins.set_value(Entry(current.from(),
6234 entry->from() - 1,
6235 empty()->Extend(value, zone)));
6236 current.set_from(entry->from());
6237 }
6238 DCHECK_EQ(current.from(), entry->from());
6239 // If the overlapping range extends beyond the one we want to add
6240 // we have to snap the right part off and add it separately.
6241 if (entry->to() > current.to()) {
6242 ZoneSplayTree<Config>::Locator ins;
6243 bool inserted = tree()->Insert(current.to() + 1, &ins);
6244 DCHECK(inserted);
6245 USE(inserted);
6246 ins.set_value(Entry(current.to() + 1,
6247 entry->to(),
6248 entry->out_set()));
6249 entry->set_to(current.to());
6250 }
6251 DCHECK(entry->to() <= current.to());
6252 // The overlapping range is now completely contained by the range
6253 // we're adding so we can just update it and move the start point
6254 // of the range we're adding just past it.
6255 entry->AddValue(value, zone);
6256 DCHECK(entry->to() + 1 > current.from());
6257 current.set_from(entry->to() + 1);
6258 } else {
6259 // There is no overlap so we can just add the range
6260 ZoneSplayTree<Config>::Locator ins;
6261 bool inserted = tree()->Insert(current.from(), &ins);
6262 DCHECK(inserted);
6263 USE(inserted);
6264 ins.set_value(Entry(current.from(),
6265 current.to(),
6266 empty()->Extend(value, zone)));
6267 break;
6268 }
6269 }
6270 }
6271
6272
Get(uc32 value)6273 OutSet* DispatchTable::Get(uc32 value) {
6274 ZoneSplayTree<Config>::Locator loc;
6275 if (!tree()->FindGreatestLessThan(value, &loc))
6276 return empty();
6277 Entry* entry = &loc.value();
6278 if (value <= entry->to())
6279 return entry->out_set();
6280 else
6281 return empty();
6282 }
6283
6284
6285 // -------------------------------------------------------------------
6286 // Analysis
6287
6288
EnsureAnalyzed(RegExpNode * that)6289 void Analysis::EnsureAnalyzed(RegExpNode* that) {
6290 StackLimitCheck check(isolate());
6291 if (check.HasOverflowed()) {
6292 fail("Stack overflow");
6293 return;
6294 }
6295 if (that->info()->been_analyzed || that->info()->being_analyzed)
6296 return;
6297 that->info()->being_analyzed = true;
6298 that->Accept(this);
6299 that->info()->being_analyzed = false;
6300 that->info()->been_analyzed = true;
6301 }
6302
6303
VisitEnd(EndNode * that)6304 void Analysis::VisitEnd(EndNode* that) {
6305 // nothing to do
6306 }
6307
6308
CalculateOffsets()6309 void TextNode::CalculateOffsets() {
6310 int element_count = elements()->length();
6311 // Set up the offsets of the elements relative to the start. This is a fixed
6312 // quantity since a TextNode can only contain fixed-width things.
6313 int cp_offset = 0;
6314 for (int i = 0; i < element_count; i++) {
6315 TextElement& elm = elements()->at(i);
6316 elm.set_cp_offset(cp_offset);
6317 cp_offset += elm.length();
6318 }
6319 }
6320
6321
VisitText(TextNode * that)6322 void Analysis::VisitText(TextNode* that) {
6323 if (ignore_case()) {
6324 that->MakeCaseIndependent(isolate(), is_one_byte_);
6325 }
6326 EnsureAnalyzed(that->on_success());
6327 if (!has_failed()) {
6328 that->CalculateOffsets();
6329 }
6330 }
6331
6332
VisitAction(ActionNode * that)6333 void Analysis::VisitAction(ActionNode* that) {
6334 RegExpNode* target = that->on_success();
6335 EnsureAnalyzed(target);
6336 if (!has_failed()) {
6337 // If the next node is interested in what it follows then this node
6338 // has to be interested too so it can pass the information on.
6339 that->info()->AddFromFollowing(target->info());
6340 }
6341 }
6342
6343
VisitChoice(ChoiceNode * that)6344 void Analysis::VisitChoice(ChoiceNode* that) {
6345 NodeInfo* info = that->info();
6346 for (int i = 0; i < that->alternatives()->length(); i++) {
6347 RegExpNode* node = that->alternatives()->at(i).node();
6348 EnsureAnalyzed(node);
6349 if (has_failed()) return;
6350 // Anything the following nodes need to know has to be known by
6351 // this node also, so it can pass it on.
6352 info->AddFromFollowing(node->info());
6353 }
6354 }
6355
6356
VisitLoopChoice(LoopChoiceNode * that)6357 void Analysis::VisitLoopChoice(LoopChoiceNode* that) {
6358 NodeInfo* info = that->info();
6359 for (int i = 0; i < that->alternatives()->length(); i++) {
6360 RegExpNode* node = that->alternatives()->at(i).node();
6361 if (node != that->loop_node()) {
6362 EnsureAnalyzed(node);
6363 if (has_failed()) return;
6364 info->AddFromFollowing(node->info());
6365 }
6366 }
6367 // Check the loop last since it may need the value of this node
6368 // to get a correct result.
6369 EnsureAnalyzed(that->loop_node());
6370 if (!has_failed()) {
6371 info->AddFromFollowing(that->loop_node()->info());
6372 }
6373 }
6374
6375
VisitBackReference(BackReferenceNode * that)6376 void Analysis::VisitBackReference(BackReferenceNode* that) {
6377 EnsureAnalyzed(that->on_success());
6378 }
6379
6380
VisitAssertion(AssertionNode * that)6381 void Analysis::VisitAssertion(AssertionNode* that) {
6382 EnsureAnalyzed(that->on_success());
6383 }
6384
6385
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)6386 void BackReferenceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6387 BoyerMooreLookahead* bm,
6388 bool not_at_start) {
6389 // Working out the set of characters that a backreference can match is too
6390 // hard, so we just say that any character can match.
6391 bm->SetRest(offset);
6392 SaveBMInfo(bm, not_at_start, offset);
6393 }
6394
6395
6396 STATIC_ASSERT(BoyerMoorePositionInfo::kMapSize ==
6397 RegExpMacroAssembler::kTableSize);
6398
6399
FillInBMInfo(Isolate * isolate,int offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)6400 void ChoiceNode::FillInBMInfo(Isolate* isolate, int offset, int budget,
6401 BoyerMooreLookahead* bm, bool not_at_start) {
6402 ZoneList<GuardedAlternative>* alts = alternatives();
6403 budget = (budget - 1) / alts->length();
6404 for (int i = 0; i < alts->length(); i++) {
6405 GuardedAlternative& alt = alts->at(i);
6406 if (alt.guards() != NULL && alt.guards()->length() != 0) {
6407 bm->SetRest(offset); // Give up trying to fill in info.
6408 SaveBMInfo(bm, not_at_start, offset);
6409 return;
6410 }
6411 alt.node()->FillInBMInfo(isolate, offset, budget, bm, not_at_start);
6412 }
6413 SaveBMInfo(bm, not_at_start, offset);
6414 }
6415
6416
FillInBMInfo(Isolate * isolate,int initial_offset,int budget,BoyerMooreLookahead * bm,bool not_at_start)6417 void TextNode::FillInBMInfo(Isolate* isolate, int initial_offset, int budget,
6418 BoyerMooreLookahead* bm, bool not_at_start) {
6419 if (initial_offset >= bm->length()) return;
6420 int offset = initial_offset;
6421 int max_char = bm->max_char();
6422 for (int i = 0; i < elements()->length(); i++) {
6423 if (offset >= bm->length()) {
6424 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6425 return;
6426 }
6427 TextElement text = elements()->at(i);
6428 if (text.text_type() == TextElement::ATOM) {
6429 RegExpAtom* atom = text.atom();
6430 for (int j = 0; j < atom->length(); j++, offset++) {
6431 if (offset >= bm->length()) {
6432 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6433 return;
6434 }
6435 uc16 character = atom->data()[j];
6436 if (bm->compiler()->ignore_case()) {
6437 unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
6438 int length = GetCaseIndependentLetters(
6439 isolate, character, bm->max_char() == String::kMaxOneByteCharCode,
6440 chars);
6441 for (int j = 0; j < length; j++) {
6442 bm->Set(offset, chars[j]);
6443 }
6444 } else {
6445 if (character <= max_char) bm->Set(offset, character);
6446 }
6447 }
6448 } else {
6449 DCHECK_EQ(TextElement::CHAR_CLASS, text.text_type());
6450 RegExpCharacterClass* char_class = text.char_class();
6451 ZoneList<CharacterRange>* ranges = char_class->ranges(zone());
6452 if (char_class->is_negated()) {
6453 bm->SetAll(offset);
6454 } else {
6455 for (int k = 0; k < ranges->length(); k++) {
6456 CharacterRange& range = ranges->at(k);
6457 if (range.from() > max_char) continue;
6458 int to = Min(max_char, static_cast<int>(range.to()));
6459 bm->SetInterval(offset, Interval(range.from(), to));
6460 }
6461 }
6462 offset++;
6463 }
6464 }
6465 if (offset >= bm->length()) {
6466 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6467 return;
6468 }
6469 on_success()->FillInBMInfo(isolate, offset, budget - 1, bm,
6470 true); // Not at start after a text node.
6471 if (initial_offset == 0) set_bm_info(not_at_start, bm);
6472 }
6473
6474
6475 // -------------------------------------------------------------------
6476 // Dispatch table construction
6477
6478
VisitEnd(EndNode * that)6479 void DispatchTableConstructor::VisitEnd(EndNode* that) {
6480 AddRange(CharacterRange::Everything());
6481 }
6482
6483
BuildTable(ChoiceNode * node)6484 void DispatchTableConstructor::BuildTable(ChoiceNode* node) {
6485 node->set_being_calculated(true);
6486 ZoneList<GuardedAlternative>* alternatives = node->alternatives();
6487 for (int i = 0; i < alternatives->length(); i++) {
6488 set_choice_index(i);
6489 alternatives->at(i).node()->Accept(this);
6490 }
6491 node->set_being_calculated(false);
6492 }
6493
6494
6495 class AddDispatchRange {
6496 public:
AddDispatchRange(DispatchTableConstructor * constructor)6497 explicit AddDispatchRange(DispatchTableConstructor* constructor)
6498 : constructor_(constructor) { }
6499 void Call(uc32 from, DispatchTable::Entry entry);
6500 private:
6501 DispatchTableConstructor* constructor_;
6502 };
6503
6504
Call(uc32 from,DispatchTable::Entry entry)6505 void AddDispatchRange::Call(uc32 from, DispatchTable::Entry entry) {
6506 constructor_->AddRange(CharacterRange::Range(from, entry.to()));
6507 }
6508
6509
VisitChoice(ChoiceNode * node)6510 void DispatchTableConstructor::VisitChoice(ChoiceNode* node) {
6511 if (node->being_calculated())
6512 return;
6513 DispatchTable* table = node->GetTable(ignore_case_);
6514 AddDispatchRange adder(this);
6515 table->ForEach(&adder);
6516 }
6517
6518
VisitBackReference(BackReferenceNode * that)6519 void DispatchTableConstructor::VisitBackReference(BackReferenceNode* that) {
6520 // TODO(160): Find the node that we refer back to and propagate its start
6521 // set back to here. For now we just accept anything.
6522 AddRange(CharacterRange::Everything());
6523 }
6524
6525
VisitAssertion(AssertionNode * that)6526 void DispatchTableConstructor::VisitAssertion(AssertionNode* that) {
6527 RegExpNode* target = that->on_success();
6528 target->Accept(this);
6529 }
6530
6531
CompareRangeByFrom(const CharacterRange * a,const CharacterRange * b)6532 static int CompareRangeByFrom(const CharacterRange* a,
6533 const CharacterRange* b) {
6534 return Compare<uc16>(a->from(), b->from());
6535 }
6536
6537
AddInverse(ZoneList<CharacterRange> * ranges)6538 void DispatchTableConstructor::AddInverse(ZoneList<CharacterRange>* ranges) {
6539 ranges->Sort(CompareRangeByFrom);
6540 uc16 last = 0;
6541 for (int i = 0; i < ranges->length(); i++) {
6542 CharacterRange range = ranges->at(i);
6543 if (last < range.from())
6544 AddRange(CharacterRange::Range(last, range.from() - 1));
6545 if (range.to() >= last) {
6546 if (range.to() == String::kMaxCodePoint) {
6547 return;
6548 } else {
6549 last = range.to() + 1;
6550 }
6551 }
6552 }
6553 AddRange(CharacterRange::Range(last, String::kMaxCodePoint));
6554 }
6555
6556
VisitText(TextNode * that)6557 void DispatchTableConstructor::VisitText(TextNode* that) {
6558 TextElement elm = that->elements()->at(0);
6559 switch (elm.text_type()) {
6560 case TextElement::ATOM: {
6561 uc16 c = elm.atom()->data()[0];
6562 AddRange(CharacterRange::Range(c, c));
6563 break;
6564 }
6565 case TextElement::CHAR_CLASS: {
6566 RegExpCharacterClass* tree = elm.char_class();
6567 ZoneList<CharacterRange>* ranges = tree->ranges(that->zone());
6568 if (tree->is_negated()) {
6569 AddInverse(ranges);
6570 } else {
6571 for (int i = 0; i < ranges->length(); i++)
6572 AddRange(ranges->at(i));
6573 }
6574 break;
6575 }
6576 default: {
6577 UNIMPLEMENTED();
6578 }
6579 }
6580 }
6581
6582
VisitAction(ActionNode * that)6583 void DispatchTableConstructor::VisitAction(ActionNode* that) {
6584 RegExpNode* target = that->on_success();
6585 target->Accept(this);
6586 }
6587
6588
OptionallyStepBackToLeadSurrogate(RegExpCompiler * compiler,RegExpNode * on_success)6589 RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
6590 RegExpNode* on_success) {
6591 // If the regexp matching starts within a surrogate pair, step back
6592 // to the lead surrogate and start matching from there.
6593 DCHECK(!compiler->read_backward());
6594 Zone* zone = compiler->zone();
6595 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
6596 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
6597 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
6598 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
6599
6600 ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
6601
6602 int stack_register = compiler->UnicodeLookaroundStackRegister();
6603 int position_register = compiler->UnicodeLookaroundPositionRegister();
6604 RegExpNode* step_back = TextNode::CreateForCharacterRanges(
6605 zone, lead_surrogates, true, on_success);
6606 RegExpLookaround::Builder builder(true, step_back, stack_register,
6607 position_register);
6608 RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
6609 zone, trail_surrogates, false, builder.on_match_success());
6610
6611 optional_step_back->AddAlternative(
6612 GuardedAlternative(builder.ForMatch(match_trail)));
6613 optional_step_back->AddAlternative(GuardedAlternative(on_success));
6614
6615 return optional_step_back;
6616 }
6617
6618
Compile(Isolate * isolate,Zone * zone,RegExpCompileData * data,JSRegExp::Flags flags,Handle<String> pattern,Handle<String> sample_subject,bool is_one_byte)6619 RegExpEngine::CompilationResult RegExpEngine::Compile(
6620 Isolate* isolate, Zone* zone, RegExpCompileData* data,
6621 JSRegExp::Flags flags, Handle<String> pattern,
6622 Handle<String> sample_subject, bool is_one_byte) {
6623 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6624 return IrregexpRegExpTooBig(isolate);
6625 }
6626 bool ignore_case = flags & JSRegExp::kIgnoreCase;
6627 bool is_sticky = flags & JSRegExp::kSticky;
6628 bool is_global = flags & JSRegExp::kGlobal;
6629 bool is_unicode = flags & JSRegExp::kUnicode;
6630 RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
6631 is_one_byte);
6632
6633 if (compiler.optimize()) compiler.set_optimize(!TooMuchRegExpCode(pattern));
6634
6635 // Sample some characters from the middle of the string.
6636 static const int kSampleSize = 128;
6637
6638 sample_subject = String::Flatten(sample_subject);
6639 int chars_sampled = 0;
6640 int half_way = (sample_subject->length() - kSampleSize) / 2;
6641 for (int i = Max(0, half_way);
6642 i < sample_subject->length() && chars_sampled < kSampleSize;
6643 i++, chars_sampled++) {
6644 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
6645 }
6646
6647 // Wrap the body of the regexp in capture #0.
6648 RegExpNode* captured_body = RegExpCapture::ToNode(data->tree,
6649 0,
6650 &compiler,
6651 compiler.accept());
6652 RegExpNode* node = captured_body;
6653 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
6654 bool is_start_anchored = data->tree->IsAnchoredAtStart();
6655 int max_length = data->tree->max_match();
6656 if (!is_start_anchored && !is_sticky) {
6657 // Add a .*? at the beginning, outside the body capture, unless
6658 // this expression is anchored at the beginning or sticky.
6659 RegExpNode* loop_node = RegExpQuantifier::ToNode(
6660 0, RegExpTree::kInfinity, false, new (zone) RegExpCharacterClass('*'),
6661 &compiler, captured_body, data->contains_anchor);
6662
6663 if (data->contains_anchor) {
6664 // Unroll loop once, to take care of the case that might start
6665 // at the start of input.
6666 ChoiceNode* first_step_node = new(zone) ChoiceNode(2, zone);
6667 first_step_node->AddAlternative(GuardedAlternative(captured_body));
6668 first_step_node->AddAlternative(GuardedAlternative(new (zone) TextNode(
6669 new (zone) RegExpCharacterClass('*'), false, loop_node)));
6670 node = first_step_node;
6671 } else {
6672 node = loop_node;
6673 }
6674 }
6675 if (is_one_byte) {
6676 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6677 // Do it again to propagate the new nodes to places where they were not
6678 // put because they had not been calculated yet.
6679 if (node != NULL) {
6680 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6681 }
6682 } else if (compiler.unicode() && (is_global || is_sticky)) {
6683 node = OptionallyStepBackToLeadSurrogate(&compiler, node);
6684 }
6685
6686 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
6687 data->node = node;
6688 Analysis analysis(isolate, flags, is_one_byte);
6689 analysis.EnsureAnalyzed(node);
6690 if (analysis.has_failed()) {
6691 const char* error_message = analysis.error_message();
6692 return CompilationResult(isolate, error_message);
6693 }
6694
6695 // Create the correct assembler for the architecture.
6696 #ifndef V8_INTERPRETED_REGEXP
6697 // Native regexp implementation.
6698
6699 NativeRegExpMacroAssembler::Mode mode =
6700 is_one_byte ? NativeRegExpMacroAssembler::LATIN1
6701 : NativeRegExpMacroAssembler::UC16;
6702
6703 #if V8_TARGET_ARCH_IA32
6704 RegExpMacroAssemblerIA32 macro_assembler(isolate, zone, mode,
6705 (data->capture_count + 1) * 2);
6706 #elif V8_TARGET_ARCH_X64
6707 RegExpMacroAssemblerX64 macro_assembler(isolate, zone, mode,
6708 (data->capture_count + 1) * 2);
6709 #elif V8_TARGET_ARCH_ARM
6710 RegExpMacroAssemblerARM macro_assembler(isolate, zone, mode,
6711 (data->capture_count + 1) * 2);
6712 #elif V8_TARGET_ARCH_ARM64
6713 RegExpMacroAssemblerARM64 macro_assembler(isolate, zone, mode,
6714 (data->capture_count + 1) * 2);
6715 #elif V8_TARGET_ARCH_S390
6716 RegExpMacroAssemblerS390 macro_assembler(isolate, zone, mode,
6717 (data->capture_count + 1) * 2);
6718 #elif V8_TARGET_ARCH_PPC
6719 RegExpMacroAssemblerPPC macro_assembler(isolate, zone, mode,
6720 (data->capture_count + 1) * 2);
6721 #elif V8_TARGET_ARCH_MIPS
6722 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6723 (data->capture_count + 1) * 2);
6724 #elif V8_TARGET_ARCH_MIPS64
6725 RegExpMacroAssemblerMIPS macro_assembler(isolate, zone, mode,
6726 (data->capture_count + 1) * 2);
6727 #elif V8_TARGET_ARCH_X87
6728 RegExpMacroAssemblerX87 macro_assembler(isolate, zone, mode,
6729 (data->capture_count + 1) * 2);
6730 #else
6731 #error "Unsupported architecture"
6732 #endif
6733
6734 #else // V8_INTERPRETED_REGEXP
6735 // Interpreted regexp implementation.
6736 EmbeddedVector<byte, 1024> codes;
6737 RegExpMacroAssemblerIrregexp macro_assembler(isolate, codes, zone);
6738 #endif // V8_INTERPRETED_REGEXP
6739
6740 macro_assembler.set_slow_safe(TooMuchRegExpCode(pattern));
6741
6742 // Inserted here, instead of in Assembler, because it depends on information
6743 // in the AST that isn't replicated in the Node structure.
6744 static const int kMaxBacksearchLimit = 1024;
6745 if (is_end_anchored &&
6746 !is_start_anchored &&
6747 max_length < kMaxBacksearchLimit) {
6748 macro_assembler.SetCurrentPositionFromEnd(max_length);
6749 }
6750
6751 if (is_global) {
6752 RegExpMacroAssembler::GlobalMode mode = RegExpMacroAssembler::GLOBAL;
6753 if (data->tree->min_match() > 0) {
6754 mode = RegExpMacroAssembler::GLOBAL_NO_ZERO_LENGTH_CHECK;
6755 } else if (is_unicode) {
6756 mode = RegExpMacroAssembler::GLOBAL_UNICODE;
6757 }
6758 macro_assembler.set_global_mode(mode);
6759 }
6760
6761 return compiler.Assemble(¯o_assembler,
6762 node,
6763 data->capture_count,
6764 pattern);
6765 }
6766
6767
TooMuchRegExpCode(Handle<String> pattern)6768 bool RegExpEngine::TooMuchRegExpCode(Handle<String> pattern) {
6769 Heap* heap = pattern->GetHeap();
6770 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6771 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&
6772 heap->memory_allocator()->SizeExecutable() >
6773 RegExpImpl::kRegExpExecutableMemoryLimit) {
6774 too_much = true;
6775 }
6776 return too_much;
6777 }
6778
6779
Lookup(Heap * heap,String * key_string,Object * key_pattern,FixedArray ** last_match_cache,ResultsCacheType type)6780 Object* RegExpResultsCache::Lookup(Heap* heap, String* key_string,
6781 Object* key_pattern,
6782 FixedArray** last_match_cache,
6783 ResultsCacheType type) {
6784 FixedArray* cache;
6785 if (!key_string->IsInternalizedString()) return Smi::kZero;
6786 if (type == STRING_SPLIT_SUBSTRINGS) {
6787 DCHECK(key_pattern->IsString());
6788 if (!key_pattern->IsInternalizedString()) return Smi::kZero;
6789 cache = heap->string_split_cache();
6790 } else {
6791 DCHECK(type == REGEXP_MULTIPLE_INDICES);
6792 DCHECK(key_pattern->IsFixedArray());
6793 cache = heap->regexp_multiple_cache();
6794 }
6795
6796 uint32_t hash = key_string->Hash();
6797 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6798 ~(kArrayEntriesPerCacheEntry - 1));
6799 if (cache->get(index + kStringOffset) != key_string ||
6800 cache->get(index + kPatternOffset) != key_pattern) {
6801 index =
6802 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6803 if (cache->get(index + kStringOffset) != key_string ||
6804 cache->get(index + kPatternOffset) != key_pattern) {
6805 return Smi::kZero;
6806 }
6807 }
6808
6809 *last_match_cache = FixedArray::cast(cache->get(index + kLastMatchOffset));
6810 return cache->get(index + kArrayOffset);
6811 }
6812
6813
Enter(Isolate * isolate,Handle<String> key_string,Handle<Object> key_pattern,Handle<FixedArray> value_array,Handle<FixedArray> last_match_cache,ResultsCacheType type)6814 void RegExpResultsCache::Enter(Isolate* isolate, Handle<String> key_string,
6815 Handle<Object> key_pattern,
6816 Handle<FixedArray> value_array,
6817 Handle<FixedArray> last_match_cache,
6818 ResultsCacheType type) {
6819 Factory* factory = isolate->factory();
6820 Handle<FixedArray> cache;
6821 if (!key_string->IsInternalizedString()) return;
6822 if (type == STRING_SPLIT_SUBSTRINGS) {
6823 DCHECK(key_pattern->IsString());
6824 if (!key_pattern->IsInternalizedString()) return;
6825 cache = factory->string_split_cache();
6826 } else {
6827 DCHECK(type == REGEXP_MULTIPLE_INDICES);
6828 DCHECK(key_pattern->IsFixedArray());
6829 cache = factory->regexp_multiple_cache();
6830 }
6831
6832 uint32_t hash = key_string->Hash();
6833 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
6834 ~(kArrayEntriesPerCacheEntry - 1));
6835 if (cache->get(index + kStringOffset) == Smi::kZero) {
6836 cache->set(index + kStringOffset, *key_string);
6837 cache->set(index + kPatternOffset, *key_pattern);
6838 cache->set(index + kArrayOffset, *value_array);
6839 cache->set(index + kLastMatchOffset, *last_match_cache);
6840 } else {
6841 uint32_t index2 =
6842 ((index + kArrayEntriesPerCacheEntry) & (kRegExpResultsCacheSize - 1));
6843 if (cache->get(index2 + kStringOffset) == Smi::kZero) {
6844 cache->set(index2 + kStringOffset, *key_string);
6845 cache->set(index2 + kPatternOffset, *key_pattern);
6846 cache->set(index2 + kArrayOffset, *value_array);
6847 cache->set(index2 + kLastMatchOffset, *last_match_cache);
6848 } else {
6849 cache->set(index2 + kStringOffset, Smi::kZero);
6850 cache->set(index2 + kPatternOffset, Smi::kZero);
6851 cache->set(index2 + kArrayOffset, Smi::kZero);
6852 cache->set(index2 + kLastMatchOffset, Smi::kZero);
6853 cache->set(index + kStringOffset, *key_string);
6854 cache->set(index + kPatternOffset, *key_pattern);
6855 cache->set(index + kArrayOffset, *value_array);
6856 cache->set(index + kLastMatchOffset, *last_match_cache);
6857 }
6858 }
6859 // If the array is a reasonably short list of substrings, convert it into a
6860 // list of internalized strings.
6861 if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
6862 for (int i = 0; i < value_array->length(); i++) {
6863 Handle<String> str(String::cast(value_array->get(i)), isolate);
6864 Handle<String> internalized_str = factory->InternalizeString(str);
6865 value_array->set(i, *internalized_str);
6866 }
6867 }
6868 // Convert backing store to a copy-on-write array.
6869 value_array->set_map_no_write_barrier(isolate->heap()->fixed_cow_array_map());
6870 }
6871
6872
Clear(FixedArray * cache)6873 void RegExpResultsCache::Clear(FixedArray* cache) {
6874 for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6875 cache->set(i, Smi::kZero);
6876 }
6877 }
6878
6879 } // namespace internal
6880 } // namespace v8
6881