1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/regexp-macro-assembler.h"
6 
7 #include "src/assembler.h"
8 #include "src/isolate-inl.h"
9 #include "src/regexp/regexp-stack.h"
10 #include "src/simulator.h"
11 
12 namespace v8 {
13 namespace internal {
14 
RegExpMacroAssembler(Isolate * isolate,Zone * zone)15 RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
16     : slow_safe_compiler_(false),
17       global_mode_(NOT_GLOBAL),
18       isolate_(isolate),
19       zone_(zone) {}
20 
21 
~RegExpMacroAssembler()22 RegExpMacroAssembler::~RegExpMacroAssembler() {
23 }
24 
25 
26 #ifndef V8_INTERPRETED_REGEXP  // Avoid unused code, e.g., on ARM.
27 
NativeRegExpMacroAssembler(Isolate * isolate,Zone * zone)28 NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
29                                                        Zone* zone)
30     : RegExpMacroAssembler(isolate, zone) {}
31 
32 
~NativeRegExpMacroAssembler()33 NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() {
34 }
35 
36 
CanReadUnaligned()37 bool NativeRegExpMacroAssembler::CanReadUnaligned() {
38   return FLAG_enable_unaligned_accesses && !slow_safe();
39 }
40 
StringCharacterPosition(String * subject,int start_index)41 const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
42     String* subject,
43     int start_index) {
44   if (subject->IsConsString()) {
45     subject = ConsString::cast(subject)->first();
46   } else if (subject->IsSlicedString()) {
47     start_index += SlicedString::cast(subject)->offset();
48     subject = SlicedString::cast(subject)->parent();
49   }
50   DCHECK(start_index >= 0);
51   DCHECK(start_index <= subject->length());
52   if (subject->IsSeqOneByteString()) {
53     return reinterpret_cast<const byte*>(
54         SeqOneByteString::cast(subject)->GetChars() + start_index);
55   } else if (subject->IsSeqTwoByteString()) {
56     return reinterpret_cast<const byte*>(
57         SeqTwoByteString::cast(subject)->GetChars() + start_index);
58   } else if (subject->IsExternalOneByteString()) {
59     return reinterpret_cast<const byte*>(
60         ExternalOneByteString::cast(subject)->GetChars() + start_index);
61   } else {
62     return reinterpret_cast<const byte*>(
63         ExternalTwoByteString::cast(subject)->GetChars() + start_index);
64   }
65 }
66 
67 
CheckStackGuardState(Isolate * isolate,int start_index,bool is_direct_call,Address * return_address,Code * re_code,String ** subject,const byte ** input_start,const byte ** input_end)68 int NativeRegExpMacroAssembler::CheckStackGuardState(
69     Isolate* isolate, int start_index, bool is_direct_call,
70     Address* return_address, Code* re_code, String** subject,
71     const byte** input_start, const byte** input_end) {
72   DCHECK(re_code->instruction_start() <= *return_address);
73   DCHECK(*return_address <= re_code->instruction_end());
74   int return_value = 0;
75   // Prepare for possible GC.
76   HandleScope handles(isolate);
77   Handle<Code> code_handle(re_code);
78   Handle<String> subject_handle(*subject);
79   bool is_one_byte = subject_handle->IsOneByteRepresentationUnderneath();
80 
81   StackLimitCheck check(isolate);
82   if (check.JsHasOverflowed()) {
83     isolate->StackOverflow();
84     return_value = EXCEPTION;
85   } else if (is_direct_call) {
86     // If not real stack overflow the stack guard was used to interrupt
87     // execution for another purpose.  If this is a direct call from JavaScript
88     // retry the RegExp forcing the call through the runtime system.
89     // Currently the direct call cannot handle a GC.
90     return_value = RETRY;
91   } else {
92     Object* result = isolate->stack_guard()->HandleInterrupts();
93     if (result->IsException()) return_value = EXCEPTION;
94   }
95 
96   DisallowHeapAllocation no_gc;
97 
98   if (*code_handle != re_code) {  // Return address no longer valid
99     intptr_t delta = code_handle->address() - re_code->address();
100     // Overwrite the return address on the stack.
101     *return_address += delta;
102   }
103 
104   // If we continue, we need to update the subject string addresses.
105   if (return_value == 0) {
106     // String encoding might have changed.
107     if (subject_handle->IsOneByteRepresentationUnderneath() != is_one_byte) {
108       // If we changed between an LATIN1 and an UC16 string, the specialized
109       // code cannot be used, and we need to restart regexp matching from
110       // scratch (including, potentially, compiling a new version of the code).
111       return_value = RETRY;
112     } else {
113       *subject = *subject_handle;
114       intptr_t byte_length = *input_end - *input_start;
115       *input_start = StringCharacterPosition(*subject, start_index);
116       *input_end = *input_start + byte_length;
117     }
118   }
119   return return_value;
120 }
121 
122 
Match(Handle<Code> regexp_code,Handle<String> subject,int * offsets_vector,int offsets_vector_length,int previous_index,Isolate * isolate)123 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Match(
124     Handle<Code> regexp_code,
125     Handle<String> subject,
126     int* offsets_vector,
127     int offsets_vector_length,
128     int previous_index,
129     Isolate* isolate) {
130 
131   DCHECK(subject->IsFlat());
132   DCHECK(previous_index >= 0);
133   DCHECK(previous_index <= subject->length());
134 
135   // No allocations before calling the regexp, but we can't use
136   // DisallowHeapAllocation, since regexps might be preempted, and another
137   // thread might do allocation anyway.
138 
139   String* subject_ptr = *subject;
140   // Character offsets into string.
141   int start_offset = previous_index;
142   int char_length = subject_ptr->length() - start_offset;
143   int slice_offset = 0;
144 
145   // The string has been flattened, so if it is a cons string it contains the
146   // full string in the first part.
147   if (StringShape(subject_ptr).IsCons()) {
148     DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length());
149     subject_ptr = ConsString::cast(subject_ptr)->first();
150   } else if (StringShape(subject_ptr).IsSliced()) {
151     SlicedString* slice = SlicedString::cast(subject_ptr);
152     subject_ptr = slice->parent();
153     slice_offset = slice->offset();
154   }
155   // Ensure that an underlying string has the same representation.
156   bool is_one_byte = subject_ptr->IsOneByteRepresentation();
157   DCHECK(subject_ptr->IsExternalString() || subject_ptr->IsSeqString());
158   // String is now either Sequential or External
159   int char_size_shift = is_one_byte ? 0 : 1;
160 
161   const byte* input_start =
162       StringCharacterPosition(subject_ptr, start_offset + slice_offset);
163   int byte_length = char_length << char_size_shift;
164   const byte* input_end = input_start + byte_length;
165   Result res = Execute(*regexp_code,
166                        *subject,
167                        start_offset,
168                        input_start,
169                        input_end,
170                        offsets_vector,
171                        offsets_vector_length,
172                        isolate);
173   return res;
174 }
175 
176 
Execute(Code * code,String * input,int start_offset,const byte * input_start,const byte * input_end,int * output,int output_size,Isolate * isolate)177 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Execute(
178     Code* code,
179     String* input,  // This needs to be the unpacked (sliced, cons) string.
180     int start_offset,
181     const byte* input_start,
182     const byte* input_end,
183     int* output,
184     int output_size,
185     Isolate* isolate) {
186   // Ensure that the minimum stack has been allocated.
187   RegExpStackScope stack_scope(isolate);
188   Address stack_base = stack_scope.stack()->stack_base();
189 
190   int direct_call = 0;
191   int result = CALL_GENERATED_REGEXP_CODE(
192       isolate, code->entry(), input, start_offset, input_start, input_end,
193       output, output_size, stack_base, direct_call, isolate);
194   DCHECK(result >= RETRY);
195 
196   if (result == EXCEPTION && !isolate->has_pending_exception()) {
197     // We detected a stack overflow (on the backtrack stack) in RegExp code,
198     // but haven't created the exception yet.
199     isolate->StackOverflow();
200   }
201   return static_cast<Result>(result);
202 }
203 
204 
205 const byte NativeRegExpMacroAssembler::word_character_map[] = {
206     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
207     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
208     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
209     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
210 
211     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
212     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
213     0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // '0' - '7'
214     0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // '8' - '9'
215 
216     0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'A' - 'G'
217     0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'H' - 'O'
218     0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'P' - 'W'
219     0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0xffu,  // 'X' - 'Z', '_'
220 
221     0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'a' - 'g'
222     0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'h' - 'o'
223     0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'p' - 'w'
224     0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // 'x' - 'z'
225     // Latin-1 range
226     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
227     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
228     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
229     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
230 
231     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
232     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
233     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
234     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
235 
236     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
237     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
238     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
239     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
240 
241     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
242     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
243     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
244     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
245 };
246 
247 
CaseInsensitiveCompareUC16(Address byte_offset1,Address byte_offset2,size_t byte_length,Isolate * isolate)248 int NativeRegExpMacroAssembler::CaseInsensitiveCompareUC16(
249     Address byte_offset1,
250     Address byte_offset2,
251     size_t byte_length,
252     Isolate* isolate) {
253   unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
254       isolate->regexp_macro_assembler_canonicalize();
255   // This function is not allowed to cause a garbage collection.
256   // A GC might move the calling generated code and invalidate the
257   // return address on the stack.
258   DCHECK(byte_length % 2 == 0);
259   uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
260   uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
261   size_t length = byte_length >> 1;
262 
263   for (size_t i = 0; i < length; i++) {
264     unibrow::uchar c1 = substring1[i];
265     unibrow::uchar c2 = substring2[i];
266     if (c1 != c2) {
267       unibrow::uchar s1[1] = { c1 };
268       canonicalize->get(c1, '\0', s1);
269       if (s1[0] != c2) {
270         unibrow::uchar s2[1] = { c2 };
271         canonicalize->get(c2, '\0', s2);
272         if (s1[0] != s2[0]) {
273           return 0;
274         }
275       }
276     }
277   }
278   return 1;
279 }
280 
281 
GrowStack(Address stack_pointer,Address * stack_base,Isolate * isolate)282 Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
283                                               Address* stack_base,
284                                               Isolate* isolate) {
285   RegExpStack* regexp_stack = isolate->regexp_stack();
286   size_t size = regexp_stack->stack_capacity();
287   Address old_stack_base = regexp_stack->stack_base();
288   DCHECK(old_stack_base == *stack_base);
289   DCHECK(stack_pointer <= old_stack_base);
290   DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
291   Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
292   if (new_stack_base == NULL) {
293     return NULL;
294   }
295   *stack_base = new_stack_base;
296   intptr_t stack_content_size = old_stack_base - stack_pointer;
297   return new_stack_base - stack_content_size;
298 }
299 
300 #endif  // V8_INTERPRETED_REGEXP
301 
302 }  // namespace internal
303 }  // namespace v8
304