1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/regexp-macro-assembler.h"
6 
7 #include "src/assembler.h"
8 #include "src/isolate-inl.h"
9 #include "src/regexp/regexp-stack.h"
10 #include "src/simulator.h"
11 
12 #ifdef V8_I18N_SUPPORT
13 #include "unicode/uchar.h"
14 #endif  // V8_I18N_SUPPORT
15 
16 namespace v8 {
17 namespace internal {
18 
RegExpMacroAssembler(Isolate * isolate,Zone * zone)19 RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
20     : slow_safe_compiler_(false),
21       global_mode_(NOT_GLOBAL),
22       isolate_(isolate),
23       zone_(zone) {}
24 
25 
~RegExpMacroAssembler()26 RegExpMacroAssembler::~RegExpMacroAssembler() {
27 }
28 
29 
CaseInsensitiveCompareUC16(Address byte_offset1,Address byte_offset2,size_t byte_length,Isolate * isolate)30 int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
31                                                      Address byte_offset2,
32                                                      size_t byte_length,
33                                                      Isolate* isolate) {
34   unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
35       isolate->regexp_macro_assembler_canonicalize();
36   // This function is not allowed to cause a garbage collection.
37   // A GC might move the calling generated code and invalidate the
38   // return address on the stack.
39   DCHECK(byte_length % 2 == 0);
40   uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
41   uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
42   size_t length = byte_length >> 1;
43 
44 #ifdef V8_I18N_SUPPORT
45   if (isolate == nullptr) {
46     for (size_t i = 0; i < length; i++) {
47       uc32 c1 = substring1[i];
48       uc32 c2 = substring2[i];
49       if (unibrow::Utf16::IsLeadSurrogate(c1)) {
50         // Non-BMP characters do not have case-equivalents in the BMP.
51         // Both have to be non-BMP for them to be able to match.
52         if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0;
53         if (i + 1 < length) {
54           uc16 c1t = substring1[i + 1];
55           uc16 c2t = substring2[i + 1];
56           if (unibrow::Utf16::IsTrailSurrogate(c1t) &&
57               unibrow::Utf16::IsTrailSurrogate(c2t)) {
58             c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t);
59             c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t);
60             i++;
61           }
62         }
63       }
64       c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT);
65       c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT);
66       if (c1 != c2) return 0;
67     }
68     return 1;
69   }
70 #endif  // V8_I18N_SUPPORT
71   DCHECK_NOT_NULL(isolate);
72   for (size_t i = 0; i < length; i++) {
73     unibrow::uchar c1 = substring1[i];
74     unibrow::uchar c2 = substring2[i];
75     if (c1 != c2) {
76       unibrow::uchar s1[1] = {c1};
77       canonicalize->get(c1, '\0', s1);
78       if (s1[0] != c2) {
79         unibrow::uchar s2[1] = {c2};
80         canonicalize->get(c2, '\0', s2);
81         if (s1[0] != s2[0]) {
82           return 0;
83         }
84       }
85     }
86   }
87   return 1;
88 }
89 
90 
CheckNotInSurrogatePair(int cp_offset,Label * on_failure)91 void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
92                                                    Label* on_failure) {
93   Label ok;
94   // Check that current character is not a trail surrogate.
95   LoadCurrentCharacter(cp_offset, &ok);
96   CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
97   // Check that previous character is not a lead surrogate.
98   LoadCurrentCharacter(cp_offset - 1, &ok);
99   CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
100   Bind(&ok);
101 }
102 
CheckPosition(int cp_offset,Label * on_outside_input)103 void RegExpMacroAssembler::CheckPosition(int cp_offset,
104                                          Label* on_outside_input) {
105   LoadCurrentCharacter(cp_offset, on_outside_input, true);
106 }
107 
CheckSpecialCharacterClass(uc16 type,Label * on_no_match)108 bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
109                                                       Label* on_no_match) {
110   return false;
111 }
112 
113 #ifndef V8_INTERPRETED_REGEXP  // Avoid unused code, e.g., on ARM.
114 
NativeRegExpMacroAssembler(Isolate * isolate,Zone * zone)115 NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
116                                                        Zone* zone)
117     : RegExpMacroAssembler(isolate, zone) {}
118 
119 
~NativeRegExpMacroAssembler()120 NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() {
121 }
122 
123 
CanReadUnaligned()124 bool NativeRegExpMacroAssembler::CanReadUnaligned() {
125   return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
126 }
127 
StringCharacterPosition(String * subject,int start_index)128 const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
129     String* subject,
130     int start_index) {
131   if (subject->IsConsString()) {
132     subject = ConsString::cast(subject)->first();
133   } else if (subject->IsSlicedString()) {
134     start_index += SlicedString::cast(subject)->offset();
135     subject = SlicedString::cast(subject)->parent();
136   }
137   DCHECK(start_index >= 0);
138   DCHECK(start_index <= subject->length());
139   if (subject->IsSeqOneByteString()) {
140     return reinterpret_cast<const byte*>(
141         SeqOneByteString::cast(subject)->GetChars() + start_index);
142   } else if (subject->IsSeqTwoByteString()) {
143     return reinterpret_cast<const byte*>(
144         SeqTwoByteString::cast(subject)->GetChars() + start_index);
145   } else if (subject->IsExternalOneByteString()) {
146     return reinterpret_cast<const byte*>(
147         ExternalOneByteString::cast(subject)->GetChars() + start_index);
148   } else {
149     return reinterpret_cast<const byte*>(
150         ExternalTwoByteString::cast(subject)->GetChars() + start_index);
151   }
152 }
153 
154 
CheckStackGuardState(Isolate * isolate,int start_index,bool is_direct_call,Address * return_address,Code * re_code,String ** subject,const byte ** input_start,const byte ** input_end)155 int NativeRegExpMacroAssembler::CheckStackGuardState(
156     Isolate* isolate, int start_index, bool is_direct_call,
157     Address* return_address, Code* re_code, String** subject,
158     const byte** input_start, const byte** input_end) {
159   DCHECK(re_code->instruction_start() <= *return_address);
160   DCHECK(*return_address <= re_code->instruction_end());
161   int return_value = 0;
162   // Prepare for possible GC.
163   HandleScope handles(isolate);
164   Handle<Code> code_handle(re_code);
165   Handle<String> subject_handle(*subject);
166   bool is_one_byte = subject_handle->IsOneByteRepresentationUnderneath();
167 
168   StackLimitCheck check(isolate);
169   if (check.JsHasOverflowed()) {
170     isolate->StackOverflow();
171     return_value = EXCEPTION;
172   } else if (is_direct_call) {
173     // If not real stack overflow the stack guard was used to interrupt
174     // execution for another purpose.  If this is a direct call from JavaScript
175     // retry the RegExp forcing the call through the runtime system.
176     // Currently the direct call cannot handle a GC.
177     return_value = RETRY;
178   } else {
179     Object* result = isolate->stack_guard()->HandleInterrupts();
180     if (result->IsException(isolate)) return_value = EXCEPTION;
181   }
182 
183   DisallowHeapAllocation no_gc;
184 
185   if (*code_handle != re_code) {  // Return address no longer valid
186     intptr_t delta = code_handle->address() - re_code->address();
187     // Overwrite the return address on the stack.
188     *return_address += delta;
189   }
190 
191   // If we continue, we need to update the subject string addresses.
192   if (return_value == 0) {
193     // String encoding might have changed.
194     if (subject_handle->IsOneByteRepresentationUnderneath() != is_one_byte) {
195       // If we changed between an LATIN1 and an UC16 string, the specialized
196       // code cannot be used, and we need to restart regexp matching from
197       // scratch (including, potentially, compiling a new version of the code).
198       return_value = RETRY;
199     } else {
200       *subject = *subject_handle;
201       intptr_t byte_length = *input_end - *input_start;
202       *input_start = StringCharacterPosition(*subject, start_index);
203       *input_end = *input_start + byte_length;
204     }
205   }
206   return return_value;
207 }
208 
209 
Match(Handle<Code> regexp_code,Handle<String> subject,int * offsets_vector,int offsets_vector_length,int previous_index,Isolate * isolate)210 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Match(
211     Handle<Code> regexp_code,
212     Handle<String> subject,
213     int* offsets_vector,
214     int offsets_vector_length,
215     int previous_index,
216     Isolate* isolate) {
217 
218   DCHECK(subject->IsFlat());
219   DCHECK(previous_index >= 0);
220   DCHECK(previous_index <= subject->length());
221 
222   // No allocations before calling the regexp, but we can't use
223   // DisallowHeapAllocation, since regexps might be preempted, and another
224   // thread might do allocation anyway.
225 
226   String* subject_ptr = *subject;
227   // Character offsets into string.
228   int start_offset = previous_index;
229   int char_length = subject_ptr->length() - start_offset;
230   int slice_offset = 0;
231 
232   // The string has been flattened, so if it is a cons string it contains the
233   // full string in the first part.
234   if (StringShape(subject_ptr).IsCons()) {
235     DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length());
236     subject_ptr = ConsString::cast(subject_ptr)->first();
237   } else if (StringShape(subject_ptr).IsSliced()) {
238     SlicedString* slice = SlicedString::cast(subject_ptr);
239     subject_ptr = slice->parent();
240     slice_offset = slice->offset();
241   }
242   // Ensure that an underlying string has the same representation.
243   bool is_one_byte = subject_ptr->IsOneByteRepresentation();
244   DCHECK(subject_ptr->IsExternalString() || subject_ptr->IsSeqString());
245   // String is now either Sequential or External
246   int char_size_shift = is_one_byte ? 0 : 1;
247 
248   const byte* input_start =
249       StringCharacterPosition(subject_ptr, start_offset + slice_offset);
250   int byte_length = char_length << char_size_shift;
251   const byte* input_end = input_start + byte_length;
252   Result res = Execute(*regexp_code,
253                        *subject,
254                        start_offset,
255                        input_start,
256                        input_end,
257                        offsets_vector,
258                        offsets_vector_length,
259                        isolate);
260   return res;
261 }
262 
263 
Execute(Code * code,String * input,int start_offset,const byte * input_start,const byte * input_end,int * output,int output_size,Isolate * isolate)264 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Execute(
265     Code* code,
266     String* input,  // This needs to be the unpacked (sliced, cons) string.
267     int start_offset,
268     const byte* input_start,
269     const byte* input_end,
270     int* output,
271     int output_size,
272     Isolate* isolate) {
273   // Ensure that the minimum stack has been allocated.
274   RegExpStackScope stack_scope(isolate);
275   Address stack_base = stack_scope.stack()->stack_base();
276 
277   int direct_call = 0;
278   int result = CALL_GENERATED_REGEXP_CODE(
279       isolate, code->entry(), input, start_offset, input_start, input_end,
280       output, output_size, stack_base, direct_call, isolate);
281   DCHECK(result >= RETRY);
282 
283   if (result == EXCEPTION && !isolate->has_pending_exception()) {
284     // We detected a stack overflow (on the backtrack stack) in RegExp code,
285     // but haven't created the exception yet.
286     isolate->StackOverflow();
287   }
288   return static_cast<Result>(result);
289 }
290 
291 
292 const byte NativeRegExpMacroAssembler::word_character_map[] = {
293     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
294     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
295     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
296     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
297 
298     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
299     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
300     0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // '0' - '7'
301     0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // '8' - '9'
302 
303     0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'A' - 'G'
304     0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'H' - 'O'
305     0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'P' - 'W'
306     0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0xffu,  // 'X' - 'Z', '_'
307 
308     0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'a' - 'g'
309     0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'h' - 'o'
310     0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'p' - 'w'
311     0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // 'x' - 'z'
312     // Latin-1 range
313     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
314     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
315     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
316     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
317 
318     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
319     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
320     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
321     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
322 
323     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
324     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
325     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
326     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
327 
328     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
329     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
330     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
331     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
332 };
333 
334 
GrowStack(Address stack_pointer,Address * stack_base,Isolate * isolate)335 Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
336                                               Address* stack_base,
337                                               Isolate* isolate) {
338   RegExpStack* regexp_stack = isolate->regexp_stack();
339   size_t size = regexp_stack->stack_capacity();
340   Address old_stack_base = regexp_stack->stack_base();
341   DCHECK(old_stack_base == *stack_base);
342   DCHECK(stack_pointer <= old_stack_base);
343   DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
344   Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
345   if (new_stack_base == NULL) {
346     return NULL;
347   }
348   *stack_base = new_stack_base;
349   intptr_t stack_content_size = old_stack_base - stack_pointer;
350   return new_stack_base - stack_content_size;
351 }
352 
353 #endif  // V8_INTERPRETED_REGEXP
354 
355 }  // namespace internal
356 }  // namespace v8
357