1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/regexp-macro-assembler.h"
6 
7 #include "src/assembler.h"
8 #include "src/isolate-inl.h"
9 #include "src/regexp/regexp-stack.h"
10 #include "src/simulator.h"
11 #include "src/unicode-inl.h"
12 
13 #ifdef V8_INTL_SUPPORT
14 #include "unicode/uchar.h"
15 #endif  // V8_INTL_SUPPORT
16 
17 namespace v8 {
18 namespace internal {
19 
// Constructs the base macro assembler. Stores |isolate| and |zone| for later
// use and starts with the slow-safe compiler flag cleared and the global mode
// set to NOT_GLOBAL.
RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
    : slow_safe_compiler_(false),
      global_mode_(NOT_GLOBAL),
      isolate_(isolate),
      zone_(zone) {}
25 
26 
~RegExpMacroAssembler()27 RegExpMacroAssembler::~RegExpMacroAssembler() {
28 }
29 
30 
CaseInsensitiveCompareUC16(Address byte_offset1,Address byte_offset2,size_t byte_length,Isolate * isolate)31 int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
32                                                      Address byte_offset2,
33                                                      size_t byte_length,
34                                                      Isolate* isolate) {
35   unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
36       isolate->regexp_macro_assembler_canonicalize();
37   // This function is not allowed to cause a garbage collection.
38   // A GC might move the calling generated code and invalidate the
39   // return address on the stack.
40   DCHECK_EQ(0, byte_length % 2);
41   uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
42   uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
43   size_t length = byte_length >> 1;
44 
45 #ifdef V8_INTL_SUPPORT
46   if (isolate == nullptr) {
47     for (size_t i = 0; i < length; i++) {
48       uc32 c1 = substring1[i];
49       uc32 c2 = substring2[i];
50       if (unibrow::Utf16::IsLeadSurrogate(c1)) {
51         // Non-BMP characters do not have case-equivalents in the BMP.
52         // Both have to be non-BMP for them to be able to match.
53         if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0;
54         if (i + 1 < length) {
55           uc16 c1t = substring1[i + 1];
56           uc16 c2t = substring2[i + 1];
57           if (unibrow::Utf16::IsTrailSurrogate(c1t) &&
58               unibrow::Utf16::IsTrailSurrogate(c2t)) {
59             c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t);
60             c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t);
61             i++;
62           }
63         }
64       }
65       c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT);
66       c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT);
67       if (c1 != c2) return 0;
68     }
69     return 1;
70   }
71 #endif  // V8_INTL_SUPPORT
72   DCHECK_NOT_NULL(isolate);
73   for (size_t i = 0; i < length; i++) {
74     unibrow::uchar c1 = substring1[i];
75     unibrow::uchar c2 = substring2[i];
76     if (c1 != c2) {
77       unibrow::uchar s1[1] = {c1};
78       canonicalize->get(c1, '\0', s1);
79       if (s1[0] != c2) {
80         unibrow::uchar s2[1] = {c2};
81         canonicalize->get(c2, '\0', s2);
82         if (s1[0] != s2[0]) {
83           return 0;
84         }
85       }
86     }
87   }
88   return 1;
89 }
90 
91 
CheckNotInSurrogatePair(int cp_offset,Label * on_failure)92 void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
93                                                    Label* on_failure) {
94   Label ok;
95   // Check that current character is not a trail surrogate.
96   LoadCurrentCharacter(cp_offset, &ok);
97   CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
98   // Check that previous character is not a lead surrogate.
99   LoadCurrentCharacter(cp_offset - 1, &ok);
100   CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
101   Bind(&ok);
102 }
103 
// Checks the input position at |cp_offset| by delegating to
// LoadCurrentCharacter() with |on_outside_input| as its label and `true` as
// its third argument.
// NOTE(review): the third argument presumably enables bounds checking, so
// that |on_outside_input| is taken when the position is outside the input --
// confirm against the LoadCurrentCharacter() declaration.
void RegExpMacroAssembler::CheckPosition(int cp_offset,
                                         Label* on_outside_input) {
  LoadCurrentCharacter(cp_offset, on_outside_input, true);
}
108 
// Base implementation: ignores both |type| and |on_no_match| and always
// returns false.
// NOTE(review): false presumably tells the caller that no specialized check
// was emitted for this character class -- confirm against the header.
bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
                                                      Label* on_no_match) {
  return false;
}
113 
114 #ifndef V8_INTERPRETED_REGEXP  // Avoid unused code, e.g., on ARM.
115 
// Constructs the native macro assembler; |isolate| and |zone| are simply
// forwarded to the RegExpMacroAssembler base class.
NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
                                                       Zone* zone)
    : RegExpMacroAssembler(isolate, zone) {}
119 
120 
~NativeRegExpMacroAssembler()121 NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() {
122 }
123 
124 
CanReadUnaligned()125 bool NativeRegExpMacroAssembler::CanReadUnaligned() {
126   return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
127 }
128 
StringCharacterPosition(String * subject,int start_index)129 const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
130     String* subject,
131     int start_index) {
132   if (subject->IsConsString()) {
133     subject = ConsString::cast(subject)->first();
134   } else if (subject->IsSlicedString()) {
135     start_index += SlicedString::cast(subject)->offset();
136     subject = SlicedString::cast(subject)->parent();
137   }
138   if (subject->IsThinString()) {
139     subject = ThinString::cast(subject)->actual();
140   }
141   DCHECK_LE(0, start_index);
142   DCHECK_LE(start_index, subject->length());
143   if (subject->IsSeqOneByteString()) {
144     return reinterpret_cast<const byte*>(
145         SeqOneByteString::cast(subject)->GetChars() + start_index);
146   } else if (subject->IsSeqTwoByteString()) {
147     return reinterpret_cast<const byte*>(
148         SeqTwoByteString::cast(subject)->GetChars() + start_index);
149   } else if (subject->IsExternalOneByteString()) {
150     return reinterpret_cast<const byte*>(
151         ExternalOneByteString::cast(subject)->GetChars() + start_index);
152   } else {
153     DCHECK(subject->IsExternalTwoByteString());
154     return reinterpret_cast<const byte*>(
155         ExternalTwoByteString::cast(subject)->GetChars() + start_index);
156   }
157 }
158 
159 
// Handles a stack-guard check on behalf of regexp code |re_code|.
//
// Determines whether the JS stack has overflowed or whether interrupts need
// to be handled, and afterwards repairs any raw pointers that may have become
// stale:
//   - |*return_address| is adjusted if the code object has moved.
//   - |*subject|, |*input_start| and |*input_end| are refreshed if matching
//     is to continue.
//
// Returns 0 if matching can continue, RETRY if matching has to be restarted,
// or EXCEPTION if an exception is (or is to be) thrown.
int NativeRegExpMacroAssembler::CheckStackGuardState(
    Isolate* isolate, int start_index, bool is_direct_call,
    Address* return_address, Code* re_code, String** subject,
    const byte** input_start, const byte** input_end) {
  // The return address must point into the instructions of |re_code|.
  DCHECK(re_code->raw_instruction_start() <= *return_address);
  DCHECK(*return_address <= re_code->raw_instruction_end());
  int return_value = 0;
  // Prepare for possible GC.
  HandleScope handles(isolate);
  Handle<Code> code_handle(re_code, isolate);
  Handle<String> subject_handle(*subject, isolate);
  // Remember the representation so that a change can be detected below.
  bool is_one_byte = subject_handle->IsOneByteRepresentationUnderneath();

  StackLimitCheck check(isolate);
  bool js_has_overflowed = check.JsHasOverflowed();

  if (is_direct_call) {
    // Direct calls from JavaScript can be interrupted in two ways:
    // 1. A real stack overflow, in which case we let the caller throw the
    //    exception.
    // 2. The stack guard was used to interrupt execution for another purpose,
    //    forcing the call through the runtime system.
    return_value = js_has_overflowed ? EXCEPTION : RETRY;
  } else if (js_has_overflowed) {
    isolate->StackOverflow();
    return_value = EXCEPTION;
  } else {
    Object* result = isolate->stack_guard()->HandleInterrupts();
    if (result->IsException(isolate)) return_value = EXCEPTION;
  }

  // From here on only raw pointers are patched up; no allocation may happen.
  DisallowHeapAllocation no_gc;

  if (*code_handle != re_code) {  // Return address no longer valid
    // The code object was moved; shift the return address by the same amount.
    intptr_t delta = code_handle->address() - re_code->address();
    // Overwrite the return address on the stack.
    *return_address += delta;
  }

  // If we continue, we need to update the subject string addresses.
  if (return_value == 0) {
    // String encoding might have changed.
    if (subject_handle->IsOneByteRepresentationUnderneath() != is_one_byte) {
      // If we changed between an LATIN1 and an UC16 string, the specialized
      // code cannot be used, and we need to restart regexp matching from
      // scratch (including, potentially, compiling a new version of the code).
      return_value = RETRY;
    } else {
      *subject = *subject_handle;
      // Keep the byte length of the input and rebase it on the (possibly
      // moved) character data.
      intptr_t byte_length = *input_end - *input_start;
      *input_start = StringCharacterPosition(*subject, start_index);
      *input_end = *input_start + byte_length;
    }
  }
  return return_value;
}
216 
217 
Match(Handle<Code> regexp_code,Handle<String> subject,int * offsets_vector,int offsets_vector_length,int previous_index,Isolate * isolate)218 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Match(
219     Handle<Code> regexp_code,
220     Handle<String> subject,
221     int* offsets_vector,
222     int offsets_vector_length,
223     int previous_index,
224     Isolate* isolate) {
225 
226   DCHECK(subject->IsFlat());
227   DCHECK_LE(0, previous_index);
228   DCHECK_LE(previous_index, subject->length());
229 
230   // No allocations before calling the regexp, but we can't use
231   // DisallowHeapAllocation, since regexps might be preempted, and another
232   // thread might do allocation anyway.
233 
234   String* subject_ptr = *subject;
235   // Character offsets into string.
236   int start_offset = previous_index;
237   int char_length = subject_ptr->length() - start_offset;
238   int slice_offset = 0;
239 
240   // The string has been flattened, so if it is a cons string it contains the
241   // full string in the first part.
242   if (StringShape(subject_ptr).IsCons()) {
243     DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length());
244     subject_ptr = ConsString::cast(subject_ptr)->first();
245   } else if (StringShape(subject_ptr).IsSliced()) {
246     SlicedString* slice = SlicedString::cast(subject_ptr);
247     subject_ptr = slice->parent();
248     slice_offset = slice->offset();
249   }
250   if (StringShape(subject_ptr).IsThin()) {
251     subject_ptr = ThinString::cast(subject_ptr)->actual();
252   }
253   // Ensure that an underlying string has the same representation.
254   bool is_one_byte = subject_ptr->IsOneByteRepresentation();
255   DCHECK(subject_ptr->IsExternalString() || subject_ptr->IsSeqString());
256   // String is now either Sequential or External
257   int char_size_shift = is_one_byte ? 0 : 1;
258 
259   const byte* input_start =
260       StringCharacterPosition(subject_ptr, start_offset + slice_offset);
261   int byte_length = char_length << char_size_shift;
262   const byte* input_end = input_start + byte_length;
263   Result res = Execute(*regexp_code,
264                        *subject,
265                        start_offset,
266                        input_start,
267                        input_end,
268                        offsets_vector,
269                        offsets_vector_length,
270                        isolate);
271   return res;
272 }
273 
274 
Execute(Code * code,String * input,int start_offset,const byte * input_start,const byte * input_end,int * output,int output_size,Isolate * isolate)275 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Execute(
276     Code* code,
277     String* input,  // This needs to be the unpacked (sliced, cons) string.
278     int start_offset,
279     const byte* input_start,
280     const byte* input_end,
281     int* output,
282     int output_size,
283     Isolate* isolate) {
284   // Ensure that the minimum stack has been allocated.
285   RegExpStackScope stack_scope(isolate);
286   Address stack_base = stack_scope.stack()->stack_base();
287 
288   int direct_call = 0;
289 
290   using RegexpMatcherSig = int(
291       String * input, int start_offset,  // NOLINT(readability/casting)
292       const byte* input_start, const byte* input_end, int* output,
293       int output_size, Address stack_base, int direct_call, Isolate* isolate);
294 
295   auto fn = GeneratedCode<RegexpMatcherSig>::FromCode(code);
296   int result = fn.Call(input, start_offset, input_start, input_end, output,
297                        output_size, stack_base, direct_call, isolate);
298   DCHECK(result >= RETRY);
299 
300   if (result == EXCEPTION && !isolate->has_pending_exception()) {
301     // We detected a stack overflow (on the backtrack stack) in RegExp code,
302     // but haven't created the exception yet.
303     isolate->StackOverflow();
304   }
305   return static_cast<Result>(result);
306 }
307 
308 // clang-format off
// Lookup table with one entry per character code in the range 0x00 - 0xFF.
// An entry is 0xFF for the characters '0'-'9', 'A'-'Z', '_' and 'a'-'z', and
// 0x00 for every other code, including the whole upper (Latin-1) half.
// Each row below covers eight consecutive character codes.
const byte NativeRegExpMacroAssembler::word_character_map[] = {
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // '0' - '7'
    0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // '8' - '9'

    0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'A' - 'G'
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'H' - 'O'
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'P' - 'W'
    0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0xFFu,  // 'X' - 'Z', '_'

    0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'a' - 'g'
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'h' - 'o'
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'p' - 'w'
    0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // 'x' - 'z'
    // Latin-1 range
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
};
350 // clang-format on
351 
GrowStack(Address stack_pointer,Address * stack_base,Isolate * isolate)352 Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
353                                               Address* stack_base,
354                                               Isolate* isolate) {
355   RegExpStack* regexp_stack = isolate->regexp_stack();
356   size_t size = regexp_stack->stack_capacity();
357   Address old_stack_base = regexp_stack->stack_base();
358   DCHECK(old_stack_base == *stack_base);
359   DCHECK(stack_pointer <= old_stack_base);
360   DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
361   Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
362   if (new_stack_base == kNullAddress) {
363     return kNullAddress;
364   }
365   *stack_base = new_stack_base;
366   intptr_t stack_content_size = old_stack_base - stack_pointer;
367   return new_stack_base - stack_content_size;
368 }
369 
370 #endif  // V8_INTERPRETED_REGEXP
371 
372 }  // namespace internal
373 }  // namespace v8
374