1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // A simple interpreter for the Irregexp byte code.
6 
7 
8 #include "src/v8.h"
9 
10 #include "src/ast.h"
11 #include "src/bytecodes-irregexp.h"
12 #include "src/interpreter-irregexp.h"
13 #include "src/jsregexp.h"
14 #include "src/regexp-macro-assembler.h"
15 #include "src/unicode.h"
16 #include "src/utils.h"
17 
18 namespace v8 {
19 namespace internal {
20 
21 
22 typedef unibrow::Mapping<unibrow::Ecma262Canonicalize> Canonicalize;
23 
BackRefMatchesNoCase(Canonicalize * interp_canonicalize,int from,int current,int len,Vector<const uc16> subject)24 static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize,
25                                  int from,
26                                  int current,
27                                  int len,
28                                  Vector<const uc16> subject) {
29   for (int i = 0; i < len; i++) {
30     unibrow::uchar old_char = subject[from++];
31     unibrow::uchar new_char = subject[current++];
32     if (old_char == new_char) continue;
33     unibrow::uchar old_string[1] = { old_char };
34     unibrow::uchar new_string[1] = { new_char };
35     interp_canonicalize->get(old_char, '\0', old_string);
36     interp_canonicalize->get(new_char, '\0', new_string);
37     if (old_string[0] != new_string[0]) {
38       return false;
39     }
40   }
41   return true;
42 }
43 
44 
BackRefMatchesNoCase(Canonicalize * interp_canonicalize,int from,int current,int len,Vector<const uint8_t> subject)45 static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize,
46                                  int from,
47                                  int current,
48                                  int len,
49                                  Vector<const uint8_t> subject) {
50   for (int i = 0; i < len; i++) {
51     unsigned int old_char = subject[from++];
52     unsigned int new_char = subject[current++];
53     if (old_char == new_char) continue;
54     // Convert both characters to lower case.
55     old_char |= 0x20;
56     new_char |= 0x20;
57     if (old_char != new_char) return false;
58     // Not letters in the ASCII range and Latin-1 range.
59     if (!(old_char - 'a' <= 'z' - 'a') &&
60         !(old_char - 224 <= 254 - 224 && old_char != 247)) {
61       return false;
62     }
63   }
64   return true;
65 }
66 
67 
68 #ifdef DEBUG
TraceInterpreter(const byte * code_base,const byte * pc,int stack_depth,int current_position,uint32_t current_char,int bytecode_length,const char * bytecode_name)69 static void TraceInterpreter(const byte* code_base,
70                              const byte* pc,
71                              int stack_depth,
72                              int current_position,
73                              uint32_t current_char,
74                              int bytecode_length,
75                              const char* bytecode_name) {
76   if (FLAG_trace_regexp_bytecodes) {
77     bool printable = (current_char < 127 && current_char >= 32);
78     const char* format =
79         printable ?
80         "pc = %02x, sp = %d, curpos = %d, curchar = %08x (%c), bc = %s" :
81         "pc = %02x, sp = %d, curpos = %d, curchar = %08x .%c., bc = %s";
82     PrintF(format,
83            pc - code_base,
84            stack_depth,
85            current_position,
86            current_char,
87            printable ? current_char : '.',
88            bytecode_name);
89     for (int i = 0; i < bytecode_length; i++) {
90       printf(", %02x", pc[i]);
91     }
92     printf(" ");
93     for (int i = 1; i < bytecode_length; i++) {
94       unsigned char b = pc[i];
95       if (b < 127 && b >= 32) {
96         printf("%c", b);
97       } else {
98         printf(".");
99       }
100     }
101     printf("\n");
102   }
103 }
104 
105 
106 #define BYTECODE(name)                                                      \
107   case BC_##name:                                                           \
108     TraceInterpreter(code_base,                                             \
109                      pc,                                                    \
110                      static_cast<int>(backtrack_sp - backtrack_stack_base), \
111                      current,                                               \
112                      current_char,                                          \
113                      BC_##name##_LENGTH,                                    \
114                      #name);
115 #else
116 #define BYTECODE(name)                                                      \
117   case BC_##name:
118 #endif
119 
120 
Load32Aligned(const byte * pc)121 static int32_t Load32Aligned(const byte* pc) {
122   DCHECK((reinterpret_cast<intptr_t>(pc) & 3) == 0);
123   return *reinterpret_cast<const int32_t *>(pc);
124 }
125 
126 
Load16Aligned(const byte * pc)127 static int32_t Load16Aligned(const byte* pc) {
128   DCHECK((reinterpret_cast<intptr_t>(pc) & 1) == 0);
129   return *reinterpret_cast<const uint16_t *>(pc);
130 }
131 
132 
133 // A simple abstraction over the backtracking stack used by the interpreter.
134 // This backtracking stack does not grow automatically, but it ensures that the
135 // the memory held by the stack is released or remembered in a cache if the
136 // matching terminates.
137 class BacktrackStack {
138  public:
BacktrackStack()139   BacktrackStack() { data_ = NewArray<int>(kBacktrackStackSize); }
140 
~BacktrackStack()141   ~BacktrackStack() {
142     DeleteArray(data_);
143   }
144 
data() const145   int* data() const { return data_; }
146 
max_size() const147   int max_size() const { return kBacktrackStackSize; }
148 
149  private:
150   static const int kBacktrackStackSize = 10000;
151 
152   int* data_;
153 
154   DISALLOW_COPY_AND_ASSIGN(BacktrackStack);
155 };
156 
157 
158 template <typename Char>
RawMatch(Isolate * isolate,const byte * code_base,Vector<const Char> subject,int * registers,int current,uint32_t current_char)159 static RegExpImpl::IrregexpResult RawMatch(Isolate* isolate,
160                                            const byte* code_base,
161                                            Vector<const Char> subject,
162                                            int* registers,
163                                            int current,
164                                            uint32_t current_char) {
165   const byte* pc = code_base;
166   // BacktrackStack ensures that the memory allocated for the backtracking stack
167   // is returned to the system or cached if there is no stack being cached at
168   // the moment.
169   BacktrackStack backtrack_stack;
170   int* backtrack_stack_base = backtrack_stack.data();
171   int* backtrack_sp = backtrack_stack_base;
172   int backtrack_stack_space = backtrack_stack.max_size();
173 #ifdef DEBUG
174   if (FLAG_trace_regexp_bytecodes) {
175     PrintF("\n\nStart bytecode interpreter\n\n");
176   }
177 #endif
178   while (true) {
179     int32_t insn = Load32Aligned(pc);
180     switch (insn & BYTECODE_MASK) {
181       BYTECODE(BREAK)
182         UNREACHABLE();
183         return RegExpImpl::RE_FAILURE;
184       BYTECODE(PUSH_CP)
185         if (--backtrack_stack_space < 0) {
186           return RegExpImpl::RE_EXCEPTION;
187         }
188         *backtrack_sp++ = current;
189         pc += BC_PUSH_CP_LENGTH;
190         break;
191       BYTECODE(PUSH_BT)
192         if (--backtrack_stack_space < 0) {
193           return RegExpImpl::RE_EXCEPTION;
194         }
195         *backtrack_sp++ = Load32Aligned(pc + 4);
196         pc += BC_PUSH_BT_LENGTH;
197         break;
198       BYTECODE(PUSH_REGISTER)
199         if (--backtrack_stack_space < 0) {
200           return RegExpImpl::RE_EXCEPTION;
201         }
202         *backtrack_sp++ = registers[insn >> BYTECODE_SHIFT];
203         pc += BC_PUSH_REGISTER_LENGTH;
204         break;
205       BYTECODE(SET_REGISTER)
206         registers[insn >> BYTECODE_SHIFT] = Load32Aligned(pc + 4);
207         pc += BC_SET_REGISTER_LENGTH;
208         break;
209       BYTECODE(ADVANCE_REGISTER)
210         registers[insn >> BYTECODE_SHIFT] += Load32Aligned(pc + 4);
211         pc += BC_ADVANCE_REGISTER_LENGTH;
212         break;
213       BYTECODE(SET_REGISTER_TO_CP)
214         registers[insn >> BYTECODE_SHIFT] = current + Load32Aligned(pc + 4);
215         pc += BC_SET_REGISTER_TO_CP_LENGTH;
216         break;
217       BYTECODE(SET_CP_TO_REGISTER)
218         current = registers[insn >> BYTECODE_SHIFT];
219         pc += BC_SET_CP_TO_REGISTER_LENGTH;
220         break;
221       BYTECODE(SET_REGISTER_TO_SP)
222         registers[insn >> BYTECODE_SHIFT] =
223             static_cast<int>(backtrack_sp - backtrack_stack_base);
224         pc += BC_SET_REGISTER_TO_SP_LENGTH;
225         break;
226       BYTECODE(SET_SP_TO_REGISTER)
227         backtrack_sp = backtrack_stack_base + registers[insn >> BYTECODE_SHIFT];
228         backtrack_stack_space = backtrack_stack.max_size() -
229             static_cast<int>(backtrack_sp - backtrack_stack_base);
230         pc += BC_SET_SP_TO_REGISTER_LENGTH;
231         break;
232       BYTECODE(POP_CP)
233         backtrack_stack_space++;
234         --backtrack_sp;
235         current = *backtrack_sp;
236         pc += BC_POP_CP_LENGTH;
237         break;
238       BYTECODE(POP_BT)
239         backtrack_stack_space++;
240         --backtrack_sp;
241         pc = code_base + *backtrack_sp;
242         break;
243       BYTECODE(POP_REGISTER)
244         backtrack_stack_space++;
245         --backtrack_sp;
246         registers[insn >> BYTECODE_SHIFT] = *backtrack_sp;
247         pc += BC_POP_REGISTER_LENGTH;
248         break;
249       BYTECODE(FAIL)
250         return RegExpImpl::RE_FAILURE;
251       BYTECODE(SUCCEED)
252         return RegExpImpl::RE_SUCCESS;
253       BYTECODE(ADVANCE_CP)
254         current += insn >> BYTECODE_SHIFT;
255         pc += BC_ADVANCE_CP_LENGTH;
256         break;
257       BYTECODE(GOTO)
258         pc = code_base + Load32Aligned(pc + 4);
259         break;
260       BYTECODE(ADVANCE_CP_AND_GOTO)
261         current += insn >> BYTECODE_SHIFT;
262         pc = code_base + Load32Aligned(pc + 4);
263         break;
264       BYTECODE(CHECK_GREEDY)
265         if (current == backtrack_sp[-1]) {
266           backtrack_sp--;
267           backtrack_stack_space++;
268           pc = code_base + Load32Aligned(pc + 4);
269         } else {
270           pc += BC_CHECK_GREEDY_LENGTH;
271         }
272         break;
273       BYTECODE(LOAD_CURRENT_CHAR) {
274         int pos = current + (insn >> BYTECODE_SHIFT);
275         if (pos >= subject.length()) {
276           pc = code_base + Load32Aligned(pc + 4);
277         } else {
278           current_char = subject[pos];
279           pc += BC_LOAD_CURRENT_CHAR_LENGTH;
280         }
281         break;
282       }
283       BYTECODE(LOAD_CURRENT_CHAR_UNCHECKED) {
284         int pos = current + (insn >> BYTECODE_SHIFT);
285         current_char = subject[pos];
286         pc += BC_LOAD_CURRENT_CHAR_UNCHECKED_LENGTH;
287         break;
288       }
289       BYTECODE(LOAD_2_CURRENT_CHARS) {
290         int pos = current + (insn >> BYTECODE_SHIFT);
291         if (pos + 2 > subject.length()) {
292           pc = code_base + Load32Aligned(pc + 4);
293         } else {
294           Char next = subject[pos + 1];
295           current_char =
296               (subject[pos] | (next << (kBitsPerByte * sizeof(Char))));
297           pc += BC_LOAD_2_CURRENT_CHARS_LENGTH;
298         }
299         break;
300       }
301       BYTECODE(LOAD_2_CURRENT_CHARS_UNCHECKED) {
302         int pos = current + (insn >> BYTECODE_SHIFT);
303         Char next = subject[pos + 1];
304         current_char = (subject[pos] | (next << (kBitsPerByte * sizeof(Char))));
305         pc += BC_LOAD_2_CURRENT_CHARS_UNCHECKED_LENGTH;
306         break;
307       }
308       BYTECODE(LOAD_4_CURRENT_CHARS) {
309         DCHECK(sizeof(Char) == 1);
310         int pos = current + (insn >> BYTECODE_SHIFT);
311         if (pos + 4 > subject.length()) {
312           pc = code_base + Load32Aligned(pc + 4);
313         } else {
314           Char next1 = subject[pos + 1];
315           Char next2 = subject[pos + 2];
316           Char next3 = subject[pos + 3];
317           current_char = (subject[pos] |
318                           (next1 << 8) |
319                           (next2 << 16) |
320                           (next3 << 24));
321           pc += BC_LOAD_4_CURRENT_CHARS_LENGTH;
322         }
323         break;
324       }
325       BYTECODE(LOAD_4_CURRENT_CHARS_UNCHECKED) {
326         DCHECK(sizeof(Char) == 1);
327         int pos = current + (insn >> BYTECODE_SHIFT);
328         Char next1 = subject[pos + 1];
329         Char next2 = subject[pos + 2];
330         Char next3 = subject[pos + 3];
331         current_char = (subject[pos] |
332                         (next1 << 8) |
333                         (next2 << 16) |
334                         (next3 << 24));
335         pc += BC_LOAD_4_CURRENT_CHARS_UNCHECKED_LENGTH;
336         break;
337       }
338       BYTECODE(CHECK_4_CHARS) {
339         uint32_t c = Load32Aligned(pc + 4);
340         if (c == current_char) {
341           pc = code_base + Load32Aligned(pc + 8);
342         } else {
343           pc += BC_CHECK_4_CHARS_LENGTH;
344         }
345         break;
346       }
347       BYTECODE(CHECK_CHAR) {
348         uint32_t c = (insn >> BYTECODE_SHIFT);
349         if (c == current_char) {
350           pc = code_base + Load32Aligned(pc + 4);
351         } else {
352           pc += BC_CHECK_CHAR_LENGTH;
353         }
354         break;
355       }
356       BYTECODE(CHECK_NOT_4_CHARS) {
357         uint32_t c = Load32Aligned(pc + 4);
358         if (c != current_char) {
359           pc = code_base + Load32Aligned(pc + 8);
360         } else {
361           pc += BC_CHECK_NOT_4_CHARS_LENGTH;
362         }
363         break;
364       }
365       BYTECODE(CHECK_NOT_CHAR) {
366         uint32_t c = (insn >> BYTECODE_SHIFT);
367         if (c != current_char) {
368           pc = code_base + Load32Aligned(pc + 4);
369         } else {
370           pc += BC_CHECK_NOT_CHAR_LENGTH;
371         }
372         break;
373       }
374       BYTECODE(AND_CHECK_4_CHARS) {
375         uint32_t c = Load32Aligned(pc + 4);
376         if (c == (current_char & Load32Aligned(pc + 8))) {
377           pc = code_base + Load32Aligned(pc + 12);
378         } else {
379           pc += BC_AND_CHECK_4_CHARS_LENGTH;
380         }
381         break;
382       }
383       BYTECODE(AND_CHECK_CHAR) {
384         uint32_t c = (insn >> BYTECODE_SHIFT);
385         if (c == (current_char & Load32Aligned(pc + 4))) {
386           pc = code_base + Load32Aligned(pc + 8);
387         } else {
388           pc += BC_AND_CHECK_CHAR_LENGTH;
389         }
390         break;
391       }
392       BYTECODE(AND_CHECK_NOT_4_CHARS) {
393         uint32_t c = Load32Aligned(pc + 4);
394         if (c != (current_char & Load32Aligned(pc + 8))) {
395           pc = code_base + Load32Aligned(pc + 12);
396         } else {
397           pc += BC_AND_CHECK_NOT_4_CHARS_LENGTH;
398         }
399         break;
400       }
401       BYTECODE(AND_CHECK_NOT_CHAR) {
402         uint32_t c = (insn >> BYTECODE_SHIFT);
403         if (c != (current_char & Load32Aligned(pc + 4))) {
404           pc = code_base + Load32Aligned(pc + 8);
405         } else {
406           pc += BC_AND_CHECK_NOT_CHAR_LENGTH;
407         }
408         break;
409       }
410       BYTECODE(MINUS_AND_CHECK_NOT_CHAR) {
411         uint32_t c = (insn >> BYTECODE_SHIFT);
412         uint32_t minus = Load16Aligned(pc + 4);
413         uint32_t mask = Load16Aligned(pc + 6);
414         if (c != ((current_char - minus) & mask)) {
415           pc = code_base + Load32Aligned(pc + 8);
416         } else {
417           pc += BC_MINUS_AND_CHECK_NOT_CHAR_LENGTH;
418         }
419         break;
420       }
421       BYTECODE(CHECK_CHAR_IN_RANGE) {
422         uint32_t from = Load16Aligned(pc + 4);
423         uint32_t to = Load16Aligned(pc + 6);
424         if (from <= current_char && current_char <= to) {
425           pc = code_base + Load32Aligned(pc + 8);
426         } else {
427           pc += BC_CHECK_CHAR_IN_RANGE_LENGTH;
428         }
429         break;
430       }
431       BYTECODE(CHECK_CHAR_NOT_IN_RANGE) {
432         uint32_t from = Load16Aligned(pc + 4);
433         uint32_t to = Load16Aligned(pc + 6);
434         if (from > current_char || current_char > to) {
435           pc = code_base + Load32Aligned(pc + 8);
436         } else {
437           pc += BC_CHECK_CHAR_NOT_IN_RANGE_LENGTH;
438         }
439         break;
440       }
441       BYTECODE(CHECK_BIT_IN_TABLE) {
442         int mask = RegExpMacroAssembler::kTableMask;
443         byte b = pc[8 + ((current_char & mask) >> kBitsPerByteLog2)];
444         int bit = (current_char & (kBitsPerByte - 1));
445         if ((b & (1 << bit)) != 0) {
446           pc = code_base + Load32Aligned(pc + 4);
447         } else {
448           pc += BC_CHECK_BIT_IN_TABLE_LENGTH;
449         }
450         break;
451       }
452       BYTECODE(CHECK_LT) {
453         uint32_t limit = (insn >> BYTECODE_SHIFT);
454         if (current_char < limit) {
455           pc = code_base + Load32Aligned(pc + 4);
456         } else {
457           pc += BC_CHECK_LT_LENGTH;
458         }
459         break;
460       }
461       BYTECODE(CHECK_GT) {
462         uint32_t limit = (insn >> BYTECODE_SHIFT);
463         if (current_char > limit) {
464           pc = code_base + Load32Aligned(pc + 4);
465         } else {
466           pc += BC_CHECK_GT_LENGTH;
467         }
468         break;
469       }
470       BYTECODE(CHECK_REGISTER_LT)
471         if (registers[insn >> BYTECODE_SHIFT] < Load32Aligned(pc + 4)) {
472           pc = code_base + Load32Aligned(pc + 8);
473         } else {
474           pc += BC_CHECK_REGISTER_LT_LENGTH;
475         }
476         break;
477       BYTECODE(CHECK_REGISTER_GE)
478         if (registers[insn >> BYTECODE_SHIFT] >= Load32Aligned(pc + 4)) {
479           pc = code_base + Load32Aligned(pc + 8);
480         } else {
481           pc += BC_CHECK_REGISTER_GE_LENGTH;
482         }
483         break;
484       BYTECODE(CHECK_REGISTER_EQ_POS)
485         if (registers[insn >> BYTECODE_SHIFT] == current) {
486           pc = code_base + Load32Aligned(pc + 4);
487         } else {
488           pc += BC_CHECK_REGISTER_EQ_POS_LENGTH;
489         }
490         break;
491       BYTECODE(CHECK_NOT_REGS_EQUAL)
492         if (registers[insn >> BYTECODE_SHIFT] ==
493             registers[Load32Aligned(pc + 4)]) {
494           pc += BC_CHECK_NOT_REGS_EQUAL_LENGTH;
495         } else {
496           pc = code_base + Load32Aligned(pc + 8);
497         }
498         break;
499       BYTECODE(CHECK_NOT_BACK_REF) {
500         int from = registers[insn >> BYTECODE_SHIFT];
501         int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
502         if (from < 0 || len <= 0) {
503           pc += BC_CHECK_NOT_BACK_REF_LENGTH;
504           break;
505         }
506         if (current + len > subject.length()) {
507           pc = code_base + Load32Aligned(pc + 4);
508           break;
509         } else {
510           int i;
511           for (i = 0; i < len; i++) {
512             if (subject[from + i] != subject[current + i]) {
513               pc = code_base + Load32Aligned(pc + 4);
514               break;
515             }
516           }
517           if (i < len) break;
518           current += len;
519         }
520         pc += BC_CHECK_NOT_BACK_REF_LENGTH;
521         break;
522       }
523       BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) {
524         int from = registers[insn >> BYTECODE_SHIFT];
525         int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
526         if (from < 0 || len <= 0) {
527           pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH;
528           break;
529         }
530         if (current + len > subject.length()) {
531           pc = code_base + Load32Aligned(pc + 4);
532           break;
533         } else {
534           if (BackRefMatchesNoCase(isolate->interp_canonicalize_mapping(),
535                                    from, current, len, subject)) {
536             current += len;
537             pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH;
538           } else {
539             pc = code_base + Load32Aligned(pc + 4);
540           }
541         }
542         break;
543       }
544       BYTECODE(CHECK_AT_START)
545         if (current == 0) {
546           pc = code_base + Load32Aligned(pc + 4);
547         } else {
548           pc += BC_CHECK_AT_START_LENGTH;
549         }
550         break;
551       BYTECODE(CHECK_NOT_AT_START)
552         if (current == 0) {
553           pc += BC_CHECK_NOT_AT_START_LENGTH;
554         } else {
555           pc = code_base + Load32Aligned(pc + 4);
556         }
557         break;
558       BYTECODE(SET_CURRENT_POSITION_FROM_END) {
559         int by = static_cast<uint32_t>(insn) >> BYTECODE_SHIFT;
560         if (subject.length() - current > by) {
561           current = subject.length() - by;
562           current_char = subject[current - 1];
563         }
564         pc += BC_SET_CURRENT_POSITION_FROM_END_LENGTH;
565         break;
566       }
567       default:
568         UNREACHABLE();
569         break;
570     }
571   }
572 }
573 
574 
Match(Isolate * isolate,Handle<ByteArray> code_array,Handle<String> subject,int * registers,int start_position)575 RegExpImpl::IrregexpResult IrregexpInterpreter::Match(
576     Isolate* isolate,
577     Handle<ByteArray> code_array,
578     Handle<String> subject,
579     int* registers,
580     int start_position) {
581   DCHECK(subject->IsFlat());
582 
583   DisallowHeapAllocation no_gc;
584   const byte* code_base = code_array->GetDataStartAddress();
585   uc16 previous_char = '\n';
586   String::FlatContent subject_content = subject->GetFlatContent();
587   if (subject_content.IsOneByte()) {
588     Vector<const uint8_t> subject_vector = subject_content.ToOneByteVector();
589     if (start_position != 0) previous_char = subject_vector[start_position - 1];
590     return RawMatch(isolate,
591                     code_base,
592                     subject_vector,
593                     registers,
594                     start_position,
595                     previous_char);
596   } else {
597     DCHECK(subject_content.IsTwoByte());
598     Vector<const uc16> subject_vector = subject_content.ToUC16Vector();
599     if (start_position != 0) previous_char = subject_vector[start_position - 1];
600     return RawMatch(isolate,
601                     code_base,
602                     subject_vector,
603                     registers,
604                     start_position,
605                     previous_char);
606   }
607 }
608 
609 } }  // namespace v8::internal
610