1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28
29 #include <stdlib.h>
30
31 #include "src/v8.h"
32
33 #include "src/ast.h"
34 #include "src/char-predicates-inl.h"
35 #include "src/jsregexp.h"
36 #include "src/ostreams.h"
37 #include "src/parser.h"
38 #include "src/regexp-macro-assembler.h"
39 #include "src/regexp-macro-assembler-irregexp.h"
40 #include "src/string-stream.h"
41 #include "src/zone-inl.h"
42 #ifdef V8_INTERPRETED_REGEXP
43 #include "src/interpreter-irregexp.h"
44 #else // V8_INTERPRETED_REGEXP
45 #include "src/macro-assembler.h"
46 #if V8_TARGET_ARCH_ARM
47 #include "src/arm/assembler-arm.h" // NOLINT
48 #include "src/arm/macro-assembler-arm.h"
49 #include "src/arm/regexp-macro-assembler-arm.h"
50 #endif
51 #if V8_TARGET_ARCH_ARM64
52 #include "src/arm64/assembler-arm64.h"
53 #include "src/arm64/macro-assembler-arm64.h"
54 #include "src/arm64/regexp-macro-assembler-arm64.h"
55 #endif
56 #if V8_TARGET_ARCH_MIPS
57 #include "src/mips/assembler-mips.h"
58 #include "src/mips/macro-assembler-mips.h"
59 #include "src/mips/regexp-macro-assembler-mips.h"
60 #endif
61 #if V8_TARGET_ARCH_MIPS64
62 #include "src/mips64/assembler-mips64.h"
63 #include "src/mips64/macro-assembler-mips64.h"
64 #include "src/mips64/regexp-macro-assembler-mips64.h"
65 #endif
66 #if V8_TARGET_ARCH_X64
67 #include "src/x64/assembler-x64.h"
68 #include "src/x64/macro-assembler-x64.h"
69 #include "src/x64/regexp-macro-assembler-x64.h"
70 #endif
71 #if V8_TARGET_ARCH_IA32
72 #include "src/ia32/assembler-ia32.h"
73 #include "src/ia32/macro-assembler-ia32.h"
74 #include "src/ia32/regexp-macro-assembler-ia32.h"
75 #endif
76 #if V8_TARGET_ARCH_X87
77 #include "src/x87/assembler-x87.h"
78 #include "src/x87/macro-assembler-x87.h"
79 #include "src/x87/regexp-macro-assembler-x87.h"
80 #endif
81 #endif // V8_INTERPRETED_REGEXP
82 #include "test/cctest/cctest.h"
83
84 using namespace v8::internal;
85
86
CheckParse(const char * input)87 static bool CheckParse(const char* input) {
88 v8::HandleScope scope(CcTest::isolate());
89 Zone zone(CcTest::i_isolate());
90 FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
91 RegExpCompileData result;
92 return v8::internal::RegExpParser::ParseRegExp(
93 &reader, false, &result, &zone);
94 }
95
96
CheckParseEq(const char * input,const char * expected)97 static void CheckParseEq(const char* input, const char* expected) {
98 v8::HandleScope scope(CcTest::isolate());
99 Zone zone(CcTest::i_isolate());
100 FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
101 RegExpCompileData result;
102 CHECK(v8::internal::RegExpParser::ParseRegExp(
103 &reader, false, &result, &zone));
104 CHECK(result.tree != NULL);
105 CHECK(result.error.is_null());
106 OStringStream os;
107 result.tree->Print(os, &zone);
108 CHECK_EQ(expected, os.c_str());
109 }
110
111
CheckSimple(const char * input)112 static bool CheckSimple(const char* input) {
113 v8::HandleScope scope(CcTest::isolate());
114 Zone zone(CcTest::i_isolate());
115 FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
116 RegExpCompileData result;
117 CHECK(v8::internal::RegExpParser::ParseRegExp(
118 &reader, false, &result, &zone));
119 CHECK(result.tree != NULL);
120 CHECK(result.error.is_null());
121 return result.simple;
122 }
123
// Plain aggregate carrying the min_match()/max_match() values of a parsed
// regexp tree. Field order matters: it is brace-initialized positionally.
struct MinMaxPair {
  int min_match;  // Value reported by RegExpTree::min_match().
  int max_match;  // Value reported by RegExpTree::max_match().
};
128
129
CheckMinMaxMatch(const char * input)130 static MinMaxPair CheckMinMaxMatch(const char* input) {
131 v8::HandleScope scope(CcTest::isolate());
132 Zone zone(CcTest::i_isolate());
133 FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
134 RegExpCompileData result;
135 CHECK(v8::internal::RegExpParser::ParseRegExp(
136 &reader, false, &result, &zone));
137 CHECK(result.tree != NULL);
138 CHECK(result.error.is_null());
139 int min_match = result.tree->min_match();
140 int max_match = result.tree->max_match();
141 MinMaxPair pair = { min_match, max_match };
142 return pair;
143 }
144
145
// Asserts that the regexp parser rejects |input|.
#define CHECK_PARSE_ERROR(input) CHECK(!CheckParse(input))

// Asserts that parsing |input| yields the given |simple| flag. The macro
// deliberately carries no trailing semicolon so that it behaves like a single
// statement (e.g. inside an unbraced if/else).
#define CHECK_SIMPLE(input, simple) CHECK_EQ(simple, CheckSimple(input))

// Asserts that the tree parsed from |input| reports |min| and |max| as its
// minimum and maximum match lengths. Wrapped in do/while(false) so that the
// expansion is exactly one statement and requires the caller's semicolon.
#define CHECK_MIN_MAX(input, min, max)                                 \
  do {                                                                 \
    MinMaxPair min_max = CheckMinMaxMatch(input);                      \
    CHECK_EQ(min, min_max.min_match);                                  \
    CHECK_EQ(max, min_max.max_match);                                  \
  } while (false)
153
// Exercises the regexp parser on a broad set of patterns. The checks run in
// four groups, in this order:
//   1. patterns whose printed parse tree must equal an expected string,
//   2. patterns whose |simple| flag must have a given value,
//   3. malformed '{...}' quantifiers, which must parse as literal text,
//   4. patterns whose minimum/maximum match lengths must have given values.
TEST(Parser) {
  CHECK_PARSE_ERROR("?");

  // {pattern, expected printed tree}.
  static const char* const kTreeCases[][2] = {
    {"abc", "'abc'"},
    {"", "%"},
    {"abc|def", "(| 'abc' 'def')"},
    {"abc|def|ghi", "(| 'abc' 'def' 'ghi')"},
    {"^xxx$", "(: @^i 'xxx' @$i)"},
    {"ab\\b\\d\\bcd", "(: 'ab' @b [0-9] @b 'cd')"},
    {"\\w|\\d", "(| [0-9 A-Z _ a-z] [0-9])"},
    {"a*", "(# 0 - g 'a')"},
    {"a*?", "(# 0 - n 'a')"},
    {"abc+", "(: 'ab' (# 1 - g 'c'))"},
    {"abc+?", "(: 'ab' (# 1 - n 'c'))"},
    {"xyz?", "(: 'xy' (# 0 1 g 'z'))"},
    {"xyz??", "(: 'xy' (# 0 1 n 'z'))"},
    {"xyz{0,1}", "(: 'xy' (# 0 1 g 'z'))"},
    {"xyz{0,1}?", "(: 'xy' (# 0 1 n 'z'))"},
    {"xyz{93}", "(: 'xy' (# 93 93 g 'z'))"},
    {"xyz{93}?", "(: 'xy' (# 93 93 n 'z'))"},
    {"xyz{1,32}", "(: 'xy' (# 1 32 g 'z'))"},
    {"xyz{1,32}?", "(: 'xy' (# 1 32 n 'z'))"},
    {"xyz{1,}", "(: 'xy' (# 1 - g 'z'))"},
    {"xyz{1,}?", "(: 'xy' (# 1 - n 'z'))"},
    {"a\\fb\\nc\\rd\\te\\vf", "'a\\x0cb\\x0ac\\x0dd\\x09e\\x0bf'"},
    {"a\\nb\\bc", "(: 'a\\x0ab' @b 'c')"},
    {"(?:foo)", "'foo'"},
    {"(?: foo )", "' foo '"},
    {"(foo|bar|baz)", "(^ (| 'foo' 'bar' 'baz'))"},
    {"foo|(bar|baz)|quux", "(| 'foo' (^ (| 'bar' 'baz')) 'quux')"},
    {"foo(?=bar)baz", "(: 'foo' (-> + 'bar') 'baz')"},
    {"foo(?!bar)baz", "(: 'foo' (-> - 'bar') 'baz')"},
    {"()", "(^ %)"},
    {"(?=)", "(-> + %)"},
    {"[]", "^[\\x00-\\uffff]"},  // Doesn't compile on windows
    {"[^]", "[\\x00-\\uffff]"},  // \uffff isn't in codepage 1252
    {"[x]", "[x]"},
    {"[xyz]", "[x y z]"},
    {"[a-zA-Z0-9]", "[a-z A-Z 0-9]"},
    {"[-123]", "[- 1 2 3]"},
    {"[^123]", "^[1 2 3]"},
    {"]", "']'"},
    {"}", "'}'"},
    {"[a-b-c]", "[a-b - c]"},
    {"[\\d]", "[0-9]"},
    {"[x\\dz]", "[x 0-9 z]"},
    {"[\\d-z]", "[0-9 - z]"},
    {"[\\d-\\d]", "[0-9 - 0-9]"},
    {"[z-\\d]", "[z - 0-9]"},
    // Control character outside character class.
    {"\\cj\\cJ\\ci\\cI\\ck\\cK", "'\\x0a\\x0a\\x09\\x09\\x0b\\x0b'"},
    {"\\c!", "'\\c!'"},
    {"\\c_", "'\\c_'"},
    {"\\c~", "'\\c~'"},
    {"\\c1", "'\\c1'"},
    // Control character inside character class.
    {"[\\c!]", "[\\ c !]"},
    {"[\\c_]", "[\\x1f]"},
    {"[\\c~]", "[\\ c ~]"},
    {"[\\ca]", "[\\x01]"},
    {"[\\cz]", "[\\x1a]"},
    {"[\\cA]", "[\\x01]"},
    {"[\\cZ]", "[\\x1a]"},
    {"[\\c1]", "[\\x11]"},

    {"[a\\]c]", "[a ] c]"},
    {"\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ", "'[]{}()%^# '"},
    {"[\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ]", "[[ ] { } ( ) % ^ # ]"},
    {"\\0", "'\\x00'"},
    {"\\8", "'8'"},
    {"\\9", "'9'"},
    {"\\11", "'\\x09'"},
    {"\\11a", "'\\x09a'"},
    {"\\011", "'\\x09'"},
    {"\\00011", "'\\x0011'"},
    {"\\118", "'\\x098'"},
    {"\\111", "'I'"},
    {"\\1111", "'I1'"},
    {"(x)(x)(x)\\1", "(: (^ 'x') (^ 'x') (^ 'x') (<- 1))"},
    {"(x)(x)(x)\\2", "(: (^ 'x') (^ 'x') (^ 'x') (<- 2))"},
    {"(x)(x)(x)\\3", "(: (^ 'x') (^ 'x') (^ 'x') (<- 3))"},
    {"(x)(x)(x)\\4", "(: (^ 'x') (^ 'x') (^ 'x') '\\x04')"},
    {"(x)(x)(x)\\1*",
     "(: (^ 'x') (^ 'x') (^ 'x')"
     " (# 0 - g (<- 1)))"},
    {"(x)(x)(x)\\2*",
     "(: (^ 'x') (^ 'x') (^ 'x')"
     " (# 0 - g (<- 2)))"},
    {"(x)(x)(x)\\3*",
     "(: (^ 'x') (^ 'x') (^ 'x')"
     " (# 0 - g (<- 3)))"},
    {"(x)(x)(x)\\4*",
     "(: (^ 'x') (^ 'x') (^ 'x')"
     " (# 0 - g '\\x04'))"},
    {"(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\10",
     "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')"
     " (^ 'x') (^ 'x') (^ 'x') (^ 'x') (<- 10))"},
    {"(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\11",
     "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')"
     " (^ 'x') (^ 'x') (^ 'x') (^ 'x') '\\x09')"},
    {"(a)\\1", "(: (^ 'a') (<- 1))"},
    {"(a\\1)", "(^ 'a')"},
    {"(\\1a)", "(^ 'a')"},
    {"(?=a)?a", "'a'"},
    {"(?=a){0,10}a", "'a'"},
    {"(?=a){1,10}a", "(: (-> + 'a') 'a')"},
    {"(?=a){9,10}a", "(: (-> + 'a') 'a')"},
    {"(?!a)?a", "'a'"},
    {"\\1(a)", "(^ 'a')"},
    {"(?!(a))\\1", "(: (-> - (^ 'a')) (<- 1))"},
    {"(?!\\1(a\\1)\\1)\\1", "(: (-> - (: (^ 'a') (<- 1))) (<- 1))"},
    {"[\\0]", "[\\x00]"},
    {"[\\11]", "[\\x09]"},
    {"[\\11a]", "[\\x09 a]"},
    {"[\\011]", "[\\x09]"},
    {"[\\00011]", "[\\x00 1 1]"},
    {"[\\118]", "[\\x09 8]"},
    {"[\\111]", "[I]"},
    {"[\\1111]", "[I 1]"},
    {"\\x34", "'\x34'"},
    {"\\x60", "'\x60'"},
    {"\\x3z", "'x3z'"},
    {"\\c", "'\\c'"},
    {"\\u0034", "'\x34'"},
    {"\\u003z", "'u003z'"},
    {"foo[z]*", "(: 'foo' (# 0 - g [z]))"},
  };
  for (size_t i = 0; i < sizeof(kTreeCases) / sizeof(kTreeCases[0]); i++) {
    CheckParseEq(kTreeCases[i][0], kTreeCases[i][1]);
  }

  // {pattern, expected value of the parser's |simple| flag}.
  struct SimpleCase {
    const char* pattern;
    bool is_simple;
  };
  static const SimpleCase kSimpleCases[] = {
    {"", false},
    {"a", true},
    {"a|b", false},
    {"a\\n", false},
    {"^a", false},
    {"a$", false},
    {"a\\b!", false},
    {"a\\Bb", false},
    {"a*", false},
    {"a*?", false},
    {"a?", false},
    {"a??", false},
    {"a{0,1}?", false},
    {"a{1,1}?", false},
    {"a{1,2}?", false},
    {"a+?", false},
    {"(a)", false},
    {"(a)\\1", false},
    {"(\\1a)", false},
    {"\\1(a)", false},
    {"a\\s", false},
    {"a\\S", false},
    {"a\\d", false},
    {"a\\D", false},
    {"a\\w", false},
    {"a\\W", false},
    {"a.", false},
    {"a\\q", false},
    {"a[a]", false},
    {"a[^a]", false},
    {"a[a-z]", false},
    {"a[\\q]", false},
    {"a(?:b)", false},
    {"a(?=b)", false},
    {"a(?!b)", false},
    {"\\x60", false},
    {"\\u0060", false},
    {"\\cA", false},
    {"\\q", false},
    {"\\1112", false},
    {"\\0", false},
    {"(a)\\1", false},
    {"(?=a)?a", false},
    {"(?!a)?a\\1", false},
    {"(?:(?=a))a\\1", false},
  };
  for (size_t i = 0; i < sizeof(kSimpleCases) / sizeof(kSimpleCases[0]); i++) {
    CHECK_SIMPLE(kSimpleCases[i].pattern, kSimpleCases[i].is_simple);
  }

  // Incomplete or malformed '{...}' quantifiers: the expected trees show them
  // being kept as literal characters.
  static const char* const kBraceLiteralCases[][2] = {
    {"a{}", "'a{}'"},
    {"a{,}", "'a{,}'"},
    {"a{", "'a{'"},
    {"a{z}", "'a{z}'"},
    {"a{1z}", "'a{1z}'"},
    {"a{12z}", "'a{12z}'"},
    {"a{12,", "'a{12,'"},
    {"a{12,3b", "'a{12,3b'"},
    {"{}", "'{}'"},
    {"{,}", "'{,}'"},
    {"{", "'{'"},
    {"{z}", "'{z}'"},
    {"{1z}", "'{1z}'"},
    {"{12z}", "'{12z}'"},
    {"{12,", "'{12,'"},
    {"{12,3b", "'{12,3b'"},
  };
  for (size_t i = 0;
       i < sizeof(kBraceLiteralCases) / sizeof(kBraceLiteralCases[0]);
       i++) {
    CheckParseEq(kBraceLiteralCases[i][0], kBraceLiteralCases[i][1]);
  }

  // {pattern, expected min_match(), expected max_match()}.
  struct LengthCase {
    const char* pattern;
    int shortest;
    int longest;
  };
  const int kInf = RegExpTree::kInfinity;
  const LengthCase kLengthCases[] = {
    {"a", 1, 1},
    {"abc", 3, 3},
    {"a[bc]d", 3, 3},
    {"a|bc", 1, 2},
    {"ab|c", 1, 2},
    {"a||bc", 0, 2},
    {"|", 0, 0},
    {"(?:ab)", 2, 2},
    {"(?:ab|cde)", 2, 3},
    {"(?:ab)|cde", 2, 3},
    {"(ab)", 2, 2},
    {"(ab|cde)", 2, 3},
    {"(ab)\\1", 2, 4},
    {"(ab|cde)\\1", 2, 6},
    {"(?:ab)?", 0, 2},
    {"(?:ab)*", 0, kInf},
    {"(?:ab)+", 2, kInf},
    {"a?", 0, 1},
    {"a*", 0, kInf},
    {"a+", 1, kInf},
    {"a??", 0, 1},
    {"a*?", 0, kInf},
    {"a+?", 1, kInf},
    {"(?:a?)?", 0, 1},
    {"(?:a*)?", 0, kInf},
    {"(?:a+)?", 0, kInf},
    {"(?:a?)+", 0, kInf},
    {"(?:a*)+", 0, kInf},
    {"(?:a+)+", 1, kInf},
    {"(?:a?)*", 0, kInf},
    {"(?:a*)*", 0, kInf},
    {"(?:a+)*", 0, kInf},
    {"a{0}", 0, 0},
    {"(?:a+){0}", 0, 0},
    {"(?:a+){0,0}", 0, 0},
    {"a*b", 1, kInf},
    {"a+b", 2, kInf},
    {"a*b|c", 1, kInf},
    {"a+b|c", 1, kInf},
    {"(?:a{5,1000000}){3,1000000}", 15, kInf},
    {"(?:ab){4,7}", 8, 14},
    {"a\\bc", 2, 2},
    {"a\\Bc", 2, 2},
    {"a\\sc", 3, 3},
    {"a\\Sc", 3, 3},
    {"a(?=b)c", 2, 2},
    {"a(?=bbb|bb)c", 2, 2},
    {"a(?!bbb|bb)c", 2, 2},
  };
  for (size_t i = 0; i < sizeof(kLengthCases) / sizeof(kLengthCases[0]); i++) {
    CHECK_MIN_MAX(kLengthCases[i].pattern,
                  kLengthCases[i].shortest,
                  kLengthCases[i].longest);
  }
}
393
394
// Regression patterns for the parser; each entry pairs a pattern with the
// printed tree it must produce.
TEST(ParserRegression) {
  static const char* const kCases[][2] = {
    {"[A-Z$-][x]", "(! [A-Z $ -] [x])"},
    {"a{3,4*}", "(: 'a{3,' (# 0 - g '4') '}')"},
    {"{", "'{'"},
    {"a|", "(| 'a' %)"},
  };
  for (size_t i = 0; i < sizeof(kCases) / sizeof(kCases[0]); i++) {
    CheckParseEq(kCases[i][0], kCases[i][1]);
  }
}
401
ExpectError(const char * input,const char * expected)402 static void ExpectError(const char* input,
403 const char* expected) {
404 v8::HandleScope scope(CcTest::isolate());
405 Zone zone(CcTest::i_isolate());
406 FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
407 RegExpCompileData result;
408 CHECK(!v8::internal::RegExpParser::ParseRegExp(
409 &reader, false, &result, &zone));
410 CHECK(result.tree == NULL);
411 CHECK(!result.error.is_null());
412 SmartArrayPointer<char> str = result.error->ToCString(ALLOW_NULLS);
413 CHECK_EQ(expected, str.get());
414 }
415
416
// Checks the error message reported for a set of malformed patterns, and
// that exceeding the capture limit is reported as an error.
TEST(Errors) {
  // {malformed pattern, expected error message}.
  static const char* const kCases[][2] = {
    {"\\", "\\ at end of pattern"},
    {"(foo", "Unterminated group"},
    {"(?", "Invalid group"},
    {"[", "Unterminated character class"},
    {"[a-", "Unterminated character class"},
    {"*", "Nothing to repeat"},
    {"?", "Nothing to repeat"},
    {"+", "Nothing to repeat"},
    {"{1}", "Nothing to repeat"},
    {"{1,2}", "Nothing to repeat"},
    {"{1,}", "Nothing to repeat"},
  };
  for (size_t i = 0; i < sizeof(kCases) / sizeof(kCases[0]); i++) {
    ExpectError(kCases[i][0], kCases[i][1]);
  }

  // Check that we don't allow more than kMaxCaptures captures: build a
  // pattern with kMaxCaptures + 1 empty groups and expect it to be rejected.
  const int kMaxCaptures = 1 << 16;  // Must match RegExpParser::kMaxCaptures.
  OStringStream pattern;
  for (int remaining = kMaxCaptures + 1; remaining > 0; remaining--) {
    pattern << "()";
  }
  ExpectError(pattern.c_str(), "Too many captures");
}
444
445
IsDigit(uc16 c)446 static bool IsDigit(uc16 c) {
447 return ('0' <= c && c <= '9');
448 }
449
450
NotDigit(uc16 c)451 static bool NotDigit(uc16 c) {
452 return !IsDigit(c);
453 }
454
455
IsWhiteSpaceOrLineTerminator(uc16 c)456 static bool IsWhiteSpaceOrLineTerminator(uc16 c) {
457 // According to ECMA 5.1, 15.10.2.12 the CharacterClassEscape \s includes
458 // WhiteSpace (7.2) and LineTerminator (7.3) values.
459 return v8::internal::WhiteSpaceOrLineTerminator::Is(c);
460 }
461
462
NotWhiteSpaceNorLineTermiantor(uc16 c)463 static bool NotWhiteSpaceNorLineTermiantor(uc16 c) {
464 return !IsWhiteSpaceOrLineTerminator(c);
465 }
466
467
NotWord(uc16 c)468 static bool NotWord(uc16 c) {
469 return !IsRegExpWord(c);
470 }
471
472
TestCharacterClassEscapes(uc16 c,bool (pred)(uc16 c))473 static void TestCharacterClassEscapes(uc16 c, bool (pred)(uc16 c)) {
474 Zone zone(CcTest::i_isolate());
475 ZoneList<CharacterRange>* ranges =
476 new(&zone) ZoneList<CharacterRange>(2, &zone);
477 CharacterRange::AddClassEscape(c, ranges, &zone);
478 for (unsigned i = 0; i < (1 << 16); i++) {
479 bool in_class = false;
480 for (int j = 0; !in_class && j < ranges->length(); j++) {
481 CharacterRange& range = ranges->at(j);
482 in_class = (range.from() <= i && i <= range.to());
483 }
484 CHECK_EQ(pred(i), in_class);
485 }
486 }
487
488
// Verifies each supported class escape against its reference predicate.
TEST(CharacterClassEscapes) {
  struct EscapeCase {
    uc16 escape;
    bool (*predicate)(uc16 c);
  };
  static const EscapeCase kCases[] = {
    {'.', IsRegExpNewline},
    {'d', IsDigit},
    {'D', NotDigit},
    {'s', IsWhiteSpaceOrLineTerminator},
    {'S', NotWhiteSpaceNorLineTermiantor},
    {'w', IsRegExpWord},
    {'W', NotWord},
  };
  for (size_t i = 0; i < sizeof(kCases) / sizeof(kCases[0]); i++) {
    TestCharacterClassEscapes(kCases[i].escape, kCases[i].predicate);
  }
}
498
499
Compile(const char * input,bool multiline,bool is_one_byte,Zone * zone)500 static RegExpNode* Compile(const char* input, bool multiline, bool is_one_byte,
501 Zone* zone) {
502 Isolate* isolate = CcTest::i_isolate();
503 FlatStringReader reader(isolate, CStrVector(input));
504 RegExpCompileData compile_data;
505 if (!v8::internal::RegExpParser::ParseRegExp(&reader, multiline,
506 &compile_data, zone))
507 return NULL;
508 Handle<String> pattern = isolate->factory()->
509 NewStringFromUtf8(CStrVector(input)).ToHandleChecked();
510 Handle<String> sample_subject =
511 isolate->factory()->NewStringFromUtf8(CStrVector("")).ToHandleChecked();
512 RegExpEngine::Compile(&compile_data, false, false, multiline, false, pattern,
513 sample_subject, is_one_byte, zone);
514 return compile_data.node;
515 }
516
517
Execute(const char * input,bool multiline,bool is_one_byte,bool dot_output=false)518 static void Execute(const char* input, bool multiline, bool is_one_byte,
519 bool dot_output = false) {
520 v8::HandleScope scope(CcTest::isolate());
521 Zone zone(CcTest::i_isolate());
522 RegExpNode* node = Compile(input, multiline, is_one_byte, &zone);
523 USE(node);
524 #ifdef DEBUG
525 if (dot_output) {
526 RegExpEngine::DotPrint(input, node, false);
527 }
528 #endif // DEBUG
529 }
530
531
// Configuration type for the splay tree under test: int keys and values,
// with 0 as both the "no key" and "no value" marker.
class TestConfig {
 public:
  typedef int Key;
  typedef int Value;
  static const int kNoKey;
  static int NoValue() { return 0; }
  // Three-way comparison: -1 if a < b, 1 if a > b, 0 if equal.
  static inline int Compare(int a, int b) {
    if (a == b) return 0;
    return (a < b) ? -1 : 1;
  }
};


const int TestConfig::kNoKey = 0;
550
551
// Deterministic mixing function used to generate test data: multiplies |i|
// by 781 and |j| by 329 and XORs the products.
//
// The arithmetic is done on unsigned values so that large inputs wrap modulo
// 2^N instead of triggering signed-overflow undefined behavior; for inputs
// whose products fit in an int the result is identical to computing in int
// and converting to unsigned.
static unsigned PseudoRandom(int i, int j) {
  const unsigned scaled_i = static_cast<unsigned>(i) * 781u;
  const unsigned scaled_j = static_cast<unsigned>(j) * 329u;
  return scaled_i ^ scaled_j;
}
555
556
// Drives a ZoneSplayTree<TestConfig> with a pseudo-random sequence of
// insertions and removals while mirroring the expected key set in |seen|.
// After every mutation the whole key range [0, kLimit) is compared against
// the mirror. Each present key k stores the value 3 * k.
TEST(SplayTreeSimple) {
  static const unsigned kLimit = 1000;
  Zone zone(CcTest::i_isolate());
  ZoneSplayTree<TestConfig> tree(&zone);
  bool seen[kLimit];
  for (unsigned i = 0; i < kLimit; i++) seen[i] = false;
  // Checks that tree membership agrees with |seen| for every key. The macro
  // uses its own locator so it does not depend on a variable of the caller,
  // and it is undefined again at the end of this test.
#define CHECK_MAPS_EQUAL() do {                                      \
    ZoneSplayTree<TestConfig>::Locator probe;                        \
    for (unsigned k = 0; k < kLimit; k++)                            \
      CHECK_EQ(seen[k], tree.Find(k, &probe));                       \
  } while (false)
  for (int i = 0; i < 50; i++) {
    for (int j = 0; j < 50; j++) {
      unsigned next = PseudoRandom(i, j) % kLimit;
      if (seen[next]) {
        // We've already seen this one.  Check the value and remove
        // it.
        ZoneSplayTree<TestConfig>::Locator loc;
        CHECK(tree.Find(next, &loc));
        CHECK_EQ(next, loc.key());
        CHECK_EQ(3 * next, loc.value());
        tree.Remove(next);
        seen[next] = false;
        CHECK_MAPS_EQUAL();
      } else {
        // Check that it wasn't there already and then add it.
        ZoneSplayTree<TestConfig>::Locator loc;
        CHECK(!tree.Find(next, &loc));
        CHECK(tree.Insert(next, &loc));
        CHECK_EQ(next, loc.key());
        loc.set_value(3 * next);
        seen[next] = true;
        CHECK_MAPS_EQUAL();
      }
      // For a key known to be present, both bounded lookups are expected to
      // land on that key itself. A hit ends the inner loop early.
      int val = PseudoRandom(j, i) % kLimit;
      if (seen[val]) {
        ZoneSplayTree<TestConfig>::Locator loc;
        CHECK(tree.FindGreatestLessThan(val, &loc));
        CHECK_EQ(loc.key(), val);
        break;
      }
      val = PseudoRandom(i + j, i - j) % kLimit;
      if (seen[val]) {
        ZoneSplayTree<TestConfig>::Locator loc;
        CHECK(tree.FindLeastGreaterThan(val, &loc));
        CHECK_EQ(loc.key(), val);
        break;
      }
    }
  }
#undef CHECK_MAPS_EQUAL
}
607
608
// Builds kRangeCount sets of kRangeSize character ranges from sorted
// pseudo-random endpoints, enters them into a DispatchTable (set index as
// the out-set value), and verifies for every point below kLimit that the
// table reports exactly the sets whose ranges contain that point.
TEST(DispatchTableConstruction) {
  // Initialize test data.
  static const int kLimit = 1000;
  static const int kRangeCount = 8;
  static const int kRangeSize = 16;
  static const int kEndpointCount = 2 * kRangeSize;
  uc16 endpoints[kRangeCount][kEndpointCount];
  for (int set = 0; set < kRangeCount; set++) {
    Vector<uc16> row(endpoints[set], kEndpointCount);
    for (int slot = 0; slot < kEndpointCount; slot++) {
      row[slot] = PseudoRandom(set + 25, slot + 87) % kLimit;
    }
    // Sorting makes every consecutive (even, odd) pair a valid from <= to
    // range.
    row.Sort();
    for (int slot = 1; slot < kEndpointCount; slot++) {
      CHECK(row[slot - 1] <= row[slot]);
    }
  }
  // Enter test data into dispatch table.
  Zone zone(CcTest::i_isolate());
  DispatchTable table(&zone);
  for (int set = 0; set < kRangeCount; set++) {
    const uc16* row = endpoints[set];
    for (int slot = 0; slot < kEndpointCount; slot += 2) {
      table.AddRange(CharacterRange(row[slot], row[slot + 1]), set, &zone);
    }
  }
  // Check that the table looks as we would expect
  for (int point = 0; point < kLimit; point++) {
    OutSet* outs = table.Get(point);
    for (int set = 0; set < kRangeCount; set++) {
      const uc16* row = endpoints[set];
      bool expected_on = false;
      for (int slot = 0; slot < kEndpointCount; slot += 2) {
        if (row[slot] <= point && point <= row[slot + 1]) {
          expected_on = true;
          break;
        }
      }
      CHECK_EQ(expected_on, outs->Get(set));
    }
  }
}
645
646
647 // Test of debug-only syntax.
648 #ifdef DEBUG
649
// Checks that possessive quantifier syntax (a quantifier followed by '+') is
// accepted only while FLAG_regexp_possessive_quantifier is set. The flag's
// previous value is restored at the end.
TEST(ParsePossessiveRepetition) {
  const bool saved_flag = FLAG_regexp_possessive_quantifier;

  // Enable possessive quantifier syntax.
  FLAG_regexp_possessive_quantifier = true;

  static const char* const kAccepted[][2] = {
    {"a*+", "(# 0 - p 'a')"},
    {"a++", "(# 1 - p 'a')"},
    {"a?+", "(# 0 1 p 'a')"},
    {"a{10,20}+", "(# 10 20 p 'a')"},
    {"za{10,20}+b", "(: 'z' (# 10 20 p 'a') 'b')"},
  };
  for (size_t i = 0; i < sizeof(kAccepted) / sizeof(kAccepted[0]); i++) {
    CheckParseEq(kAccepted[i][0], kAccepted[i][1]);
  }

  // Disable possessive quantifier syntax.
  FLAG_regexp_possessive_quantifier = false;

  static const char* const kRejected[] = {
    "a*+",
    "a++",
    "a?+",
    "a{10,20}+",
    "a{10,20}+b",
  };
  for (size_t i = 0; i < sizeof(kRejected) / sizeof(kRejected[0]); i++) {
    CHECK_PARSE_ERROR(kRejected[i]);
  }

  FLAG_regexp_possessive_quantifier = saved_flag;
}
673
674 #endif
675
676 // Tests of interpreter.
677
678
679 #ifndef V8_INTERPRETED_REGEXP
680
681 #if V8_TARGET_ARCH_IA32
682 typedef RegExpMacroAssemblerIA32 ArchRegExpMacroAssembler;
683 #elif V8_TARGET_ARCH_X64
684 typedef RegExpMacroAssemblerX64 ArchRegExpMacroAssembler;
685 #elif V8_TARGET_ARCH_ARM
686 typedef RegExpMacroAssemblerARM ArchRegExpMacroAssembler;
687 #elif V8_TARGET_ARCH_ARM64
688 typedef RegExpMacroAssemblerARM64 ArchRegExpMacroAssembler;
689 #elif V8_TARGET_ARCH_MIPS
690 typedef RegExpMacroAssemblerMIPS ArchRegExpMacroAssembler;
691 #elif V8_TARGET_ARCH_MIPS64
692 typedef RegExpMacroAssemblerMIPS ArchRegExpMacroAssembler;
693 #elif V8_TARGET_ARCH_X87
694 typedef RegExpMacroAssemblerX87 ArchRegExpMacroAssembler;
695 #endif
696
697 class ContextInitializer {
698 public:
ContextInitializer()699 ContextInitializer()
700 : scope_(CcTest::isolate()),
701 env_(v8::Context::New(CcTest::isolate())) {
702 env_->Enter();
703 }
~ContextInitializer()704 ~ContextInitializer() {
705 env_->Exit();
706 }
707 private:
708 v8::HandleScope scope_;
709 v8::Handle<v8::Context> env_;
710 };
711
712
Execute(Code * code,String * input,int start_offset,const byte * input_start,const byte * input_end,int * captures)713 static ArchRegExpMacroAssembler::Result Execute(Code* code,
714 String* input,
715 int start_offset,
716 const byte* input_start,
717 const byte* input_end,
718 int* captures) {
719 return NativeRegExpMacroAssembler::Execute(
720 code,
721 input,
722 start_offset,
723 input_start,
724 input_end,
725 captures,
726 0,
727 CcTest::i_isolate());
728 }
729
730
// Generates native code that succeeds immediately and runs it on "foofoo".
// The run must report SUCCESS and leave all four capture slots, which start
// out holding arbitrary values, set to -1.
TEST(MacroAssemblerNativeSuccess) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone(isolate);

  ArchRegExpMacroAssembler masm(NativeRegExpMacroAssembler::LATIN1, 4, &zone);
  masm.Succeed();

  Handle<String> source = factory->NewStringFromStaticChars("");
  Handle<Object> code_object = masm.GetCode(source);
  Handle<Code> code = Handle<Code>::cast(code_object);

  const int kCaptureCount = 4;
  int captures[kCaptureCount] = {42, 37, 87, 117};
  Handle<String> subject = factory->NewStringFromStaticChars("foofoo");
  Handle<SeqOneByteString> seq_subject =
      Handle<SeqOneByteString>::cast(subject);
  const byte* chars_begin =
      reinterpret_cast<const byte*>(seq_subject->GetCharsAddress());
  const byte* chars_end = chars_begin + seq_subject->length();

  NativeRegExpMacroAssembler::Result result =
      Execute(*code, *subject, 0, chars_begin, chars_end, captures);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
  for (int i = 0; i < kCaptureCount; i++) {
    CHECK_EQ(-1, captures[i]);
  }
}
766
767
// Hand-assembles LATIN1 code for the pattern "^foo" and runs it on two
// subjects: "foofoo" must succeed with captures {0, 3, -1, -1}, and
// "barbarbar" must fail.
TEST(MacroAssemblerNativeSimple) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone(isolate);

  ArchRegExpMacroAssembler masm(NativeRegExpMacroAssembler::LATIN1, 4, &zone);

  Label on_fail;
  Label on_backtrack;
  masm.PushBacktrack(&on_fail);
  masm.CheckNotAtStart(NULL);
  // Compare the three characters at offsets 2, 1, 0 against 'o', 'o', 'f'.
  masm.LoadCurrentCharacter(2, NULL);
  masm.CheckNotCharacter('o', NULL);
  masm.LoadCurrentCharacter(1, NULL, false);
  masm.CheckNotCharacter('o', NULL);
  masm.LoadCurrentCharacter(0, NULL, false);
  masm.CheckNotCharacter('f', NULL);
  // Record the match bounds in registers 0 and 1.
  masm.WriteCurrentPositionToRegister(0, 0);
  masm.WriteCurrentPositionToRegister(1, 3);
  masm.AdvanceCurrentPosition(3);
  masm.PushBacktrack(&on_backtrack);
  masm.Succeed();
  masm.Bind(&on_backtrack);
  masm.Backtrack();
  masm.Bind(&on_fail);
  masm.Fail();

  Handle<String> source = factory->NewStringFromStaticChars("^foo");
  Handle<Object> code_object = masm.GetCode(source);
  Handle<Code> code = Handle<Code>::cast(code_object);

  int captures[4] = {42, 37, 87, 117};

  // Matching subject.
  Handle<String> subject = factory->NewStringFromStaticChars("foofoo");
  Handle<SeqOneByteString> seq_subject =
      Handle<SeqOneByteString>::cast(subject);
  Address chars_begin = seq_subject->GetCharsAddress();

  NativeRegExpMacroAssembler::Result result = Execute(
      *code, *subject, 0, chars_begin, chars_begin + subject->length(),
      captures);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
  CHECK_EQ(0, captures[0]);
  CHECK_EQ(3, captures[1]);
  CHECK_EQ(-1, captures[2]);
  CHECK_EQ(-1, captures[3]);

  // Non-matching subject.
  subject = factory->NewStringFromStaticChars("barbarbar");
  seq_subject = Handle<SeqOneByteString>::cast(subject);
  chars_begin = seq_subject->GetCharsAddress();

  result = Execute(
      *code, *subject, 0, chars_begin, chars_begin + subject->length(),
      captures);

  CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
}
832
833
// Hand-assembles UC16 code for the pattern "^foo" and runs it on two
// two-byte subjects: "foofo\u2603" must succeed with captures
// {0, 3, -1, -1}, and "barbarba\u2603" must fail.
//
// The input end pointer handed to Execute is a byte address, so for these
// two-byte strings it is start + length() * 2 in both runs.
TEST(MacroAssemblerNativeSimpleUC16) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone(isolate);

  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::UC16, 4, &zone);

  Label fail, backtrack;
  m.PushBacktrack(&fail);
  m.CheckNotAtStart(NULL);
  // Compare the three characters at offsets 2, 1, 0 against 'o', 'o', 'f'.
  m.LoadCurrentCharacter(2, NULL);
  m.CheckNotCharacter('o', NULL);
  m.LoadCurrentCharacter(1, NULL, false);
  m.CheckNotCharacter('o', NULL);
  m.LoadCurrentCharacter(0, NULL, false);
  m.CheckNotCharacter('f', NULL);
  // Record the match bounds in registers 0 and 1.
  m.WriteCurrentPositionToRegister(0, 0);
  m.WriteCurrentPositionToRegister(1, 3);
  m.AdvanceCurrentPosition(3);
  m.PushBacktrack(&backtrack);
  m.Succeed();
  m.Bind(&backtrack);
  m.Backtrack();
  m.Bind(&fail);
  m.Fail();

  Handle<String> source = factory->NewStringFromStaticChars("^foo");
  Handle<Object> code_object = m.GetCode(source);
  Handle<Code> code = Handle<Code>::cast(code_object);

  int captures[4] = {42, 37, 87, 117};
  // The trailing U+2603 forces a genuine two-byte string.
  const uc16 input_data[6] = {'f', 'o', 'o', 'f', 'o',
                              static_cast<uc16>(0x2603)};
  Handle<String> input = factory->NewStringFromTwoByte(
      Vector<const uc16>(input_data, 6)).ToHandleChecked();
  Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input);
  Address start_adr = seq_input->GetCharsAddress();

  NativeRegExpMacroAssembler::Result result =
      Execute(*code,
              *input,
              0,
              start_adr,
              start_adr + input->length() * 2,  // Two bytes per character.
              captures);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
  CHECK_EQ(0, captures[0]);
  CHECK_EQ(3, captures[1]);
  CHECK_EQ(-1, captures[2]);
  CHECK_EQ(-1, captures[3]);

  const uc16 input_data2[9] = {'b', 'a', 'r', 'b', 'a', 'r', 'b', 'a',
                               static_cast<uc16>(0x2603)};
  input = factory->NewStringFromTwoByte(
      Vector<const uc16>(input_data2, 9)).ToHandleChecked();
  seq_input = Handle<SeqTwoByteString>::cast(input);
  start_adr = seq_input->GetCharsAddress();

  result = Execute(*code,
                   *input,
                   0,
                   start_adr,
                   start_adr + input->length() * 2,  // Two bytes per character.
                   captures);

  CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
}
904
905
// Runs a small native program over the six-character subject "foofoo" and
// expects it to finish with FAILURE.  Both loads address offset 10, which lies
// beyond the subject; the second one is issued after |backtrack| has been
// pushed on the backtrack stack.
// NOTE(review): the exact out-of-range behaviour of LoadCurrentCharacter (and
// the meaning of a NULL label) is defined by the macro assembler -- confirm.
TEST(MacroAssemblerNativeBacktrack) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone(isolate);

  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::LATIN1, 0, &zone);

  Label fail, backtrack;
  m.LoadCurrentCharacter(10, &fail);
  m.Succeed();
  m.Bind(&fail);
  m.PushBacktrack(&backtrack);
  m.LoadCurrentCharacter(10, NULL);
  m.Succeed();
  m.Bind(&backtrack);
  m.Fail();

  // The pattern text is only handed to GetCode; the program above is what
  // actually runs.
  Handle<String> pattern = factory->NewStringFromStaticChars("..........");
  Handle<Code> code = Handle<Code>::cast(m.GetCode(pattern));

  Handle<String> subject = factory->NewStringFromStaticChars("foofoo");
  Address subject_start =
      Handle<SeqOneByteString>::cast(subject)->GetCharsAddress();
  Address subject_end = subject_start + subject->length();

  NativeRegExpMacroAssembler::Result outcome =
      Execute(*code, *subject, 0, subject_start, subject_end, NULL);

  CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, outcome);
}
944
945
// Exercises CheckNotBackReference on a one-byte (LATIN1) subject.
//
// Registers 0 and 1 bracket the first two characters of "fooofo" ("fo").  The
// back reference is expected NOT to match at position 2 ("oo") and to match at
// position 4 ("fo"), after which the current position (6) is stored in
// register 2.  Register 3 is never written and is expected to read back as -1.
TEST(MacroAssemblerNativeBackReferenceLATIN1) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone(isolate);

  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::LATIN1, 4, &zone);

  m.WriteCurrentPositionToRegister(0, 0);
  m.AdvanceCurrentPosition(2);
  m.WriteCurrentPositionToRegister(1, 0);
  Label nomatch;
  m.CheckNotBackReference(0, &nomatch);
  m.Fail();  // A match at position 2 would be wrong.
  m.Bind(&nomatch);
  m.AdvanceCurrentPosition(2);
  Label missing_match;
  m.CheckNotBackReference(0, &missing_match);
  m.WriteCurrentPositionToRegister(2, 0);
  m.Succeed();
  m.Bind(&missing_match);
  m.Fail();

  // The backslash must be escaped: "\1" in a C++ literal is the octal escape
  // for byte 0x01, not the regexp back reference text.
  Handle<String> source = factory->NewStringFromStaticChars("^(..)..\\1");
  Handle<Object> code_object = m.GetCode(source);
  Handle<Code> code = Handle<Code>::cast(code_object);

  Handle<String> input = factory->NewStringFromStaticChars("fooofo");
  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
  Address start_adr = seq_input->GetCharsAddress();

  int output[4];
  NativeRegExpMacroAssembler::Result result =
      Execute(*code,
              *input,
              0,
              start_adr,
              start_adr + input->length(),
              output);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
  CHECK_EQ(0, output[0]);
  CHECK_EQ(2, output[1]);
  CHECK_EQ(6, output[2]);
  CHECK_EQ(-1, output[3]);
}
993
994
// Two-byte (UC16) counterpart of the LATIN1 back reference test.
//
// Registers 0 and 1 bracket the first two code units of the subject
// ('f', 0x2028).  The back reference is expected NOT to match at position 2
// ('o', 'o') and to match at position 4 ('f', 0x2028), after which the current
// position (6) is stored in register 2.  Register 3 is never written and is
// expected to read back as -1.
TEST(MacroAssemblerNativeBackReferenceUC16) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone(isolate);

  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::UC16, 4, &zone);

  m.WriteCurrentPositionToRegister(0, 0);
  m.AdvanceCurrentPosition(2);
  m.WriteCurrentPositionToRegister(1, 0);
  Label nomatch;
  m.CheckNotBackReference(0, &nomatch);
  m.Fail();  // A match at position 2 would be wrong.
  m.Bind(&nomatch);
  m.AdvanceCurrentPosition(2);
  Label missing_match;
  m.CheckNotBackReference(0, &missing_match);
  m.WriteCurrentPositionToRegister(2, 0);
  m.Succeed();
  m.Bind(&missing_match);
  m.Fail();

  // The backslash must be escaped: "\1" in a C++ literal is the octal escape
  // for byte 0x01, not the regexp back reference text.
  Handle<String> source = factory->NewStringFromStaticChars("^(..)..\\1");
  Handle<Object> code_object = m.GetCode(source);
  Handle<Code> code = Handle<Code>::cast(code_object);

  const uc16 input_data[6] = {'f', 0x2028, 'o', 'o', 'f', 0x2028};
  Handle<String> input = factory->NewStringFromTwoByte(
      Vector<const uc16>(input_data, 6)).ToHandleChecked();
  Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input);
  Address start_adr = seq_input->GetCharsAddress();

  int output[4];
  // The end address is a byte address, hence two bytes per code unit.
  NativeRegExpMacroAssembler::Result result =
      Execute(*code,
              *input,
              0,
              start_adr,
              start_adr + input->length() * 2,
              output);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
  CHECK_EQ(0, output[0]);
  CHECK_EQ(2, output[1]);
  CHECK_EQ(6, output[2]);
  CHECK_EQ(-1, output[3]);
}
1044
1045
1046
// Exercises CheckNotAtStart on the one-byte subject "foobar".
//
// The same code is executed twice: once from index 0 (start of input) and once
// from index 3.  Each path inspects the character that is current on entry and
// then the character at offset 0, as described by the inline comments; both
// runs are expected to succeed.
TEST(MacroAssemblernativeAtStart) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone(isolate);

  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::LATIN1, 0, &zone);

  Label not_at_start, newline, fail;
  m.CheckNotAtStart(&not_at_start);
  // Check that prevchar = '\n' and current = 'f'.
  m.CheckCharacter('\n', &newline);
  m.Bind(&fail);
  m.Fail();
  m.Bind(&newline);
  m.LoadCurrentCharacter(0, &fail);
  m.CheckNotCharacter('f', &fail);
  m.Succeed();

  m.Bind(&not_at_start);
  // Check that prevchar = 'o' and current = 'b'.
  Label prevo;
  m.CheckCharacter('o', &prevo);
  m.Fail();
  m.Bind(&prevo);
  m.LoadCurrentCharacter(0, &fail);
  m.CheckNotCharacter('b', &fail);
  m.Succeed();

  Handle<String> source = factory->NewStringFromStaticChars("(^f|ob)");
  Handle<Object> code_object = m.GetCode(source);
  Handle<Code> code = Handle<Code>::cast(code_object);

  Handle<String> input = factory->NewStringFromStaticChars("foobar");
  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
  Address start_adr = seq_input->GetCharsAddress();

  // First run: begin at the very start of the subject.
  NativeRegExpMacroAssembler::Result result =
      Execute(*code,
              *input,
              0,
              start_adr,
              start_adr + input->length(),
              NULL);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);

  // Second run: begin at index 3, i.e. between "foo" and "bar".
  result = Execute(*code,
                   *input,
                   3,
                   start_adr + 3,
                   start_adr + input->length(),
                   NULL);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
}
1104
1105
// Exercises CheckNotBackReferenceIgnoreCase on the one-byte subject
// "aBcAbCABCxYzab".
//
// Registers 2 and 3 bracket the first three characters ("aBc").  The back
// reference must match the next two triples regardless of case, must not
// match "xYz", and must not match the two-character tail "ab".  On success
// register 1 holds position 12, giving the expected output [0, 12, 0, 3].
TEST(MacroAssemblerNativeBackRefNoCase) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone(isolate);

  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::LATIN1, 4, &zone);

  Label fail, succ;

  m.WriteCurrentPositionToRegister(0, 0);
  m.WriteCurrentPositionToRegister(2, 0);
  m.AdvanceCurrentPosition(3);
  m.WriteCurrentPositionToRegister(3, 0);
  m.CheckNotBackReferenceIgnoreCase(2, &fail);  // Match "AbC".
  m.CheckNotBackReferenceIgnoreCase(2, &fail);  // Match "ABC".
  Label expected_fail;
  m.CheckNotBackReferenceIgnoreCase(2, &expected_fail);
  m.Bind(&fail);
  m.Fail();

  m.Bind(&expected_fail);
  m.AdvanceCurrentPosition(3);  // Skip "xYz"
  m.CheckNotBackReferenceIgnoreCase(2, &succ);
  m.Fail();

  m.Bind(&succ);
  m.WriteCurrentPositionToRegister(1, 0);
  m.Succeed();

  // The backslashes must be escaped: "\1" in a C++ literal is the octal
  // escape for byte 0x01, not the regexp back reference text.
  Handle<String> source =
      factory->NewStringFromStaticChars("^(abc)\\1\\1(?!\\1)...(?!\\1)");
  Handle<Object> code_object = m.GetCode(source);
  Handle<Code> code = Handle<Code>::cast(code_object);

  Handle<String> input = factory->NewStringFromStaticChars("aBcAbCABCxYzab");
  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
  Address start_adr = seq_input->GetCharsAddress();

  int output[4];
  NativeRegExpMacroAssembler::Result result =
      Execute(*code,
              *input,
              0,
              start_adr,
              start_adr + input->length(),
              output);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
  CHECK_EQ(0, output[0]);
  CHECK_EQ(12, output[1]);
  CHECK_EQ(0, output[2]);
  CHECK_EQ(3, output[3]);
}
1161
1162
1163
// Exercises register, backtrack-stack and loop primitives of the native
// assembler.  The program records the current position in the output
// registers at several points; the trailing "[...]" comments show the output
// array expected so far.  The final expectation is [0, 3, 6, 9, 9, -1].
TEST(MacroAssemblerNativeRegisters) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone(isolate);

  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::LATIN1, 6, &zone);

  // out1..out6 are the six output registers handed back to the caller; sp and
  // loop_cnt are scratch registers numbered directly after them.
  enum registers { out1, out2, out3, out4, out5, out6, sp, loop_cnt };
  Label fail;
  Label backtrack;
  m.WriteCurrentPositionToRegister(out1, 0);  // Output: [0]
  m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck);
  m.PushBacktrack(&backtrack);
  m.WriteStackPointerToRegister(sp);
  // Fill stack and registers
  m.AdvanceCurrentPosition(2);
  m.WriteCurrentPositionToRegister(out1, 0);
  m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck);
  m.PushBacktrack(&fail);
  // Drop backtrack stack frames.
  m.ReadStackPointerFromRegister(sp);
  // And take the first backtrack (to &backtrack)
  m.Backtrack();

  // NOTE(review): no label is bound between the Backtrack() above and the
  // Bind() below, so these three instructions look unreachable at run time --
  // presumably they only exercise code generation; confirm.
  m.PushCurrentPosition();
  m.AdvanceCurrentPosition(2);
  m.PopCurrentPosition();

  m.Bind(&backtrack);
  m.PopRegister(out1);
  m.ReadCurrentPositionFromRegister(out1);
  m.AdvanceCurrentPosition(3);
  m.WriteCurrentPositionToRegister(out2, 0);  // [0,3]

  Label loop;
  m.SetRegister(loop_cnt, 0);  // loop counter
  m.Bind(&loop);
  m.AdvanceRegister(loop_cnt, 1);
  m.AdvanceCurrentPosition(1);
  m.IfRegisterLT(loop_cnt, 3, &loop);
  m.WriteCurrentPositionToRegister(out3, 0);  // [0,3,6]

  Label loop2;
  m.SetRegister(loop_cnt, 2);  // loop counter
  m.Bind(&loop2);
  m.AdvanceRegister(loop_cnt, -1);
  m.AdvanceCurrentPosition(1);
  m.IfRegisterGE(loop_cnt, 0, &loop2);
  m.WriteCurrentPositionToRegister(out4, 0);  // [0,3,6,9]

  Label loop3;
  Label exit_loop3;
  m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck);
  m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck);
  m.ReadCurrentPositionFromRegister(out3);
  m.Bind(&loop3);
  m.AdvanceCurrentPosition(1);
  m.CheckGreedyLoop(&exit_loop3);
  m.GoTo(&loop3);
  m.Bind(&exit_loop3);
  m.PopCurrentPosition();
  m.WriteCurrentPositionToRegister(out5, 0);  // [0,3,6,9,9,-1]

  m.Succeed();

  m.Bind(&fail);
  m.Fail();

  Handle<String> source = factory->NewStringFromStaticChars("<loop test>");
  Handle<Object> code_object = m.GetCode(source);
  Handle<Code> code = Handle<Code>::cast(code_object);

  // String long enough for test (content doesn't matter).
  Handle<String> input = factory->NewStringFromStaticChars("foofoofoofoofoo");
  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
  Address start_adr = seq_input->GetCharsAddress();

  int output[6];
  NativeRegExpMacroAssembler::Result result =
      Execute(*code,
              *input,
              0,
              start_adr,
              start_adr + input->length(),
              output);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
  CHECK_EQ(0, output[0]);
  CHECK_EQ(3, output[1]);
  CHECK_EQ(6, output[2]);
  CHECK_EQ(9, output[3]);
  CHECK_EQ(9, output[4]);
  CHECK_EQ(-1, output[5]);
}
1263
1264
// Generates a program that pushes a backtrack entry in an endless loop and
// verifies that executing it reports EXCEPTION and leaves a pending exception
// on the isolate, which is cleared again before the test returns.
TEST(MacroAssemblerStackOverflow) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone(isolate);

  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::LATIN1, 0, &zone);

  // Never-terminating loop: every iteration pushes one more backtrack entry.
  Label push_forever;
  m.Bind(&push_forever);
  m.PushBacktrack(&push_forever);
  m.GoTo(&push_forever);

  Handle<String> pattern =
      factory->NewStringFromStaticChars("<stack overflow test>");
  Handle<Code> code = Handle<Code>::cast(m.GetCode(pattern));

  // The program never looks at the subject, so any string will do.
  Handle<String> subject = factory->NewStringFromStaticChars("dummy");
  Address subject_start =
      Handle<SeqOneByteString>::cast(subject)->GetCharsAddress();
  Address subject_end = subject_start + subject->length();

  NativeRegExpMacroAssembler::Result outcome =
      Execute(*code, *subject, 0, subject_start, subject_end, NULL);

  CHECK_EQ(NativeRegExpMacroAssembler::EXCEPTION, outcome);
  CHECK(isolate->has_pending_exception());
  isolate->clear_pending_exception();
}
1301
1302
// Uses a register with a very large index next to the two capture registers
// and checks that a value written to it (current position + 42) survives a
// push/pop round trip into capture register 1.
TEST(MacroAssemblerNativeLotsOfRegisters) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone(isolate);

  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::LATIN1, 2, &zone);

  // At least 2048, to ensure the space allocated for registers spans one full
  // page.
  const int far_register = 8000;
  m.WriteCurrentPositionToRegister(far_register, 42);
  m.WriteCurrentPositionToRegister(0, 0);
  m.WriteCurrentPositionToRegister(1, 1);
  Label after_backref;
  // Performs a system-stack push.
  m.CheckNotBackReference(0, &after_backref);
  m.Bind(&after_backref);
  m.PushRegister(far_register, RegExpMacroAssembler::kNoStackLimitCheck);
  m.PopRegister(1);
  m.Succeed();

  Handle<String> pattern =
      factory->NewStringFromStaticChars("<huge register space test>");
  Handle<Code> code = Handle<Code>::cast(m.GetCode(pattern));

  // String long enough for test (content doesn't matter).
  Handle<String> subject = factory->NewStringFromStaticChars("sample text");
  Address subject_start =
      Handle<SeqOneByteString>::cast(subject)->GetCharsAddress();
  Address subject_end = subject_start + subject->length();

  int captures[2];
  NativeRegExpMacroAssembler::Result outcome =
      Execute(*code, *subject, 0, subject_start, subject_end, captures);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, outcome);
  CHECK_EQ(0, captures[0]);
  CHECK_EQ(42, captures[1]);

  isolate->clear_pending_exception();
}
1350
1351 #else // V8_INTERPRETED_REGEXP
1352
// Interpreter-only variant: assembles byte code for ^f(o)o by hand and runs
// it through IrregexpInterpreter::Match against a matching and a non-matching
// two-byte subject, checking the capture registers after each run.
TEST(MacroAssembler) {
  static const int kCodeBufferSize = 1024;
  byte code_buffer[kCodeBufferSize];
  Zone zone(CcTest::i_isolate());
  RegExpMacroAssemblerIrregexp m(Vector<byte>(code_buffer, kCodeBufferSize),
                                 &zone);
  // ^f(o)o.
  Label start, fail, backtrack;

  // Register 4 is set to 42, pushed, and then advanced by another 42.
  m.SetRegister(4, 42);
  m.PushRegister(4, RegExpMacroAssembler::kNoStackLimitCheck);
  m.AdvanceRegister(4, 42);
  m.GoTo(&start);
  m.Fail();
  m.Bind(&start);
  m.PushBacktrack(&fail);
  m.CheckNotAtStart(NULL);
  m.LoadCurrentCharacter(0, NULL);
  m.CheckNotCharacter('f', NULL);
  m.LoadCurrentCharacter(1, NULL);
  m.CheckNotCharacter('o', NULL);
  m.LoadCurrentCharacter(2, NULL);
  m.CheckNotCharacter('o', NULL);
  m.WriteCurrentPositionToRegister(0, 0);
  m.WriteCurrentPositionToRegister(1, 3);
  m.WriteCurrentPositionToRegister(2, 1);
  m.WriteCurrentPositionToRegister(3, 2);
  m.AdvanceCurrentPosition(3);
  m.PushBacktrack(&backtrack);
  m.Succeed();
  m.Bind(&backtrack);
  m.ClearRegisters(2, 3);
  m.Backtrack();
  m.Bind(&fail);
  // Pops the value pushed at the top of the program into register 0.
  m.PopRegister(0);
  m.Fail();

  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  HandleScope scope(isolate);

  Handle<String> pattern = factory->NewStringFromStaticChars("^f(o)o");
  Handle<ByteArray> bytecode = Handle<ByteArray>::cast(m.GetCode(pattern));
  int captures[5];

  // Subject that starts with "foo": the match must succeed.
  const uc16 matching_chars[] = {'f', 'o', 'o', 'b', 'a', 'r'};
  Handle<String> matching = factory->NewStringFromTwoByte(
      Vector<const uc16>(matching_chars, 6)).ToHandleChecked();

  CHECK(IrregexpInterpreter::Match(isolate, bytecode, matching, captures, 0));
  CHECK_EQ(0, captures[0]);
  CHECK_EQ(3, captures[1]);
  CHECK_EQ(1, captures[2]);
  CHECK_EQ(2, captures[3]);
  CHECK_EQ(84, captures[4]);

  // Subject that does not start with "foo": the match must fail and register
  // 0 must hold the popped 42.
  const uc16 failing_chars[] = {'b', 'a', 'r', 'f', 'o', 'o'};
  Handle<String> failing = factory->NewStringFromTwoByte(
      Vector<const uc16>(failing_chars, 6)).ToHandleChecked();

  CHECK(!IrregexpInterpreter::Match(isolate, bytecode, failing, captures, 0));
  CHECK_EQ(42, captures[0]);
}
1414
1415 #endif // V8_INTERPRETED_REGEXP
1416
1417
// Checks DispatchTableConstructor::AddInverse: for ten pseudo-random range
// sets, choice 0 must be set in the table exactly for the characters below
// kLimit that none of the ranges contains.  A final case checks the boundary
// behaviour for the range [0xFFF0, 0xFFFE].
TEST(AddInverseToTable) {
  static const int kLimit = 1000;
  static const int kRangeCount = 16;
  for (int round = 0; round < 10; round++) {
    Zone zone(CcTest::i_isolate());
    ZoneList<CharacterRange>* ranges =
        new(&zone) ZoneList<CharacterRange>(kRangeCount, &zone);
    for (int i = 0; i < kRangeCount; i++) {
      int from = PseudoRandom(round + 87, i + 25) % kLimit;
      int to = from + (PseudoRandom(i + 87, round + 25) % (kLimit / 20));
      // Clamp the upper bound to the tested domain.
      to = (to > kLimit) ? kLimit : to;
      ranges->Add(CharacterRange(from, to), &zone);
    }
    DispatchTable table(&zone);
    DispatchTableConstructor cons(&table, false, &zone);
    cons.set_choice_index(0);
    cons.AddInverse(ranges);
    for (int c = 0; c < kLimit; c++) {
      // Is |c| covered by any of the generated ranges?
      bool covered = false;
      for (int j = 0; j < kRangeCount; j++) {
        if (ranges->at(j).Contains(c)) {
          covered = true;
          break;
        }
      }
      CHECK_EQ(covered, !table.Get(c)->Get(0));
    }
  }

  // Boundary case: 0xFFFE is the last character inside the range, 0xFFFF the
  // first one outside of it.
  Zone zone(CcTest::i_isolate());
  ZoneList<CharacterRange>* ranges =
      new(&zone) ZoneList<CharacterRange>(1, &zone);
  ranges->Add(CharacterRange(0xFFF0, 0xFFFE), &zone);
  DispatchTable table(&zone);
  DispatchTableConstructor cons(&table, false, &zone);
  cons.set_choice_index(0);
  cons.AddInverse(ranges);
  CHECK(!table.Get(0xFFFE)->Get(0));
  CHECK(table.Get(0xFFFF)->Get(0));
}
1454
1455
canonicalize(uc32 c)1456 static uc32 canonicalize(uc32 c) {
1457 unibrow::uchar canon[unibrow::Ecma262Canonicalize::kMaxWidth];
1458 int count = unibrow::Ecma262Canonicalize::Convert(c, '\0', canon, NULL);
1459 if (count == 0) {
1460 return c;
1461 } else {
1462 CHECK_EQ(1, count);
1463 return canon[0];
1464 }
1465 }
1466
1467
// Checks canonicalize() in three ways:
//  - 'a'..'z' and 'A'..'Z' canonicalize pairwise to the same value, and
//    un-canonicalizing a lower-case letter yields {upper, lower};
//  - no character >= 128 canonicalizes to a value below 128;
//  - within the first 2^16 characters the result agrees with the upper-case
//    mapping, except where that mapping is multi-character or would turn a
//    character >= 128 into one below 128.
TEST(LatinCanonicalize) {
  unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
  for (int offset = 0; offset < 26; offset++) {
    char lower = 'a' + offset;
    char upper = 'A' + offset;
    CHECK_EQ(canonicalize(lower), canonicalize(upper));
    unibrow::uchar uncanon[unibrow::Ecma262UnCanonicalize::kMaxWidth];
    int uncanon_count = un_canonicalize.get(lower, '\0', uncanon);
    CHECK_EQ(2, uncanon_count);
    CHECK_EQ(upper, uncanon[0]);
    CHECK_EQ(lower, uncanon[1]);
  }

  for (uc32 c = 128; c < (1 << 21); c++) {
    CHECK_GE(canonicalize(c), 128);
  }

  unibrow::Mapping<unibrow::ToUppercase> to_upper;
  // Canonicalization is only defined for the Basic Multilingual Plane.
  for (uc32 c = 0; c < (1 << 16); c++) {
    unibrow::uchar upper[unibrow::ToUppercase::kMaxWidth];
    const int width = to_upper.get(c, '\0', upper);
    // No mapping at all means the character maps to itself.
    uc32 expected = (width == 0) ? c : upper[0];
    if (width > 1 || (c >= 128 && expected < 128)) {
      expected = c;
    }
    CHECK_EQ(expected, canonicalize(c));
  }
}
1496
1497
CanonRangeEnd(uc32 c)1498 static uc32 CanonRangeEnd(uc32 c) {
1499 unibrow::uchar canon[unibrow::CanonicalizationRange::kMaxWidth];
1500 int count = unibrow::CanonicalizationRange::Convert(c, '\0', canon, NULL);
1501 if (count == 0) {
1502 return c;
1503 } else {
1504 CHECK_EQ(1, count);
1505 return canon[0];
1506 }
1507 }
1508
1509
// Walks the first 2^16 characters block by block (block ends come from
// CanonRangeEnd) and checks that, inside a block, the un-canonicalization of
// the i-th character equals the un-canonicalization of the block's first
// character shifted by i.
TEST(RangeCanonicalization) {
  // Check that we arrive at the same result when using the basic
  // range canonicalization primitives as when using immediate
  // canonicalization.
  unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
  for (int start = 0; start <= 0xFFFF;) {
    uc32 end = CanonRangeEnd(start);
    unsigned span = end - start + 1;
    if (span > 1) {
      unibrow::uchar head[unibrow::Ecma262UnCanonicalize::kMaxWidth];
      int head_count = un_canonicalize.get(start, '\0', head);
      for (unsigned i = 1; i < span; i++) {
        unibrow::uchar other[unibrow::Ecma262UnCanonicalize::kMaxWidth];
        int other_count = un_canonicalize.get(start + i, '\0', other);
        CHECK_EQ(head_count, other_count);
        for (int j = 0; j < other_count; j++) {
          CHECK_EQ(static_cast<int>(head[j] + i), static_cast<int>(other[j]));
        }
      }
    }
    // Continue with the first character after this block.
    start = start + span;
  }
}
1536
1537
// Checks that un-canonicalization is consistent within a result set: every
// character returned for |c| must itself un-canonicalize to exactly the same
// list of characters.
TEST(UncanonicalizeEquivalence) {
  unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
  unibrow::uchar members[unibrow::Ecma262UnCanonicalize::kMaxWidth];
  for (int c = 0; c < (1 << 16); c++) {
    const int member_count = un_canonicalize.get(c, '\0', members);
    for (int m = 0; m < member_count; m++) {
      unibrow::uchar again[unibrow::Ecma262UnCanonicalize::kMaxWidth];
      const int again_count = un_canonicalize.get(members[m], '\0', again);
      CHECK_EQ(member_count, again_count);
      for (int k = 0; k < member_count; k++) {
        CHECK_EQ(static_cast<int>(members[k]), static_cast<int>(again[k]));
      }
    }
  }
}
1552
1553
TestRangeCaseIndependence(CharacterRange input,Vector<CharacterRange> expected)1554 static void TestRangeCaseIndependence(CharacterRange input,
1555 Vector<CharacterRange> expected) {
1556 Zone zone(CcTest::i_isolate());
1557 int count = expected.length();
1558 ZoneList<CharacterRange>* list =
1559 new(&zone) ZoneList<CharacterRange>(count, &zone);
1560 input.AddCaseEquivalents(list, false, &zone);
1561 CHECK_EQ(count, list->length());
1562 for (int i = 0; i < list->length(); i++) {
1563 CHECK_EQ(expected[i].from(), list->at(i).from());
1564 CHECK_EQ(expected[i].to(), list->at(i).to());
1565 }
1566 }
1567
1568
TestSimpleRangeCaseIndependence(CharacterRange input,CharacterRange expected)1569 static void TestSimpleRangeCaseIndependence(CharacterRange input,
1570 CharacterRange expected) {
1571 EmbeddedVector<CharacterRange, 1> vector;
1572 vector[0] = expected;
1573 TestRangeCaseIndependence(input, vector);
1574 }
1575
1576
// Table-driven check of the case equivalents added for ASCII letters and
// letter ranges.
TEST(CharacterRangeCaseIndependence) {
  TestSimpleRangeCaseIndependence(CharacterRange::Singleton('a'),
                                  CharacterRange::Singleton('A'));
  TestSimpleRangeCaseIndependence(CharacterRange::Singleton('z'),
                                  CharacterRange::Singleton('Z'));

  // Each row: input range [in_from, in_to] and the single range
  // [out_from, out_to] that is expected to be added for it.
  static const struct {
    int in_from;
    int in_to;
    int out_from;
    int out_to;
  } kRangeCases[] = {
    {'a', 'z', 'A', 'Z'},
    {'c', 'f', 'C', 'F'},
    {'a', 'b', 'A', 'B'},
    {'y', 'z', 'Y', 'Z'},
    {'a' - 1, 'z' + 1, 'A', 'Z'},
    {'A', 'Z', 'a', 'z'},
    {'C', 'F', 'c', 'f'},
    {'A' - 1, 'Z' + 1, 'a', 'z'},
    // Here we need to add [l-z] to complete the case independence of
    // [A-Za-z] but we expect [a-z] to be added since we always add a
    // whole block at a time.
    {'A', 'k', 'a', 'z'},
  };
  const int case_count =
      static_cast<int>(sizeof(kRangeCases) / sizeof(kRangeCases[0]));
  for (int i = 0; i < case_count; i++) {
    TestSimpleRangeCaseIndependence(
        CharacterRange(kRangeCases[i].in_from, kRangeCases[i].in_to),
        CharacterRange(kRangeCases[i].out_from, kRangeCases[i].out_to));
  }
}
1604
1605
InClass(uc16 c,ZoneList<CharacterRange> * ranges)1606 static bool InClass(uc16 c, ZoneList<CharacterRange>* ranges) {
1607 if (ranges == NULL)
1608 return false;
1609 for (int i = 0; i < ranges->length(); i++) {
1610 CharacterRange range = ranges->at(i);
1611 if (range.from() <= c && c <= range.to())
1612 return true;
1613 }
1614 return false;
1615 }
1616
1617
// Splits the "everything" class by the word-character bounds and checks, for
// each of the first 2^16 characters, that it ends up in |included| exactly
// when it lies inside one of the overlay intervals and in |excluded|
// otherwise.
TEST(CharClassDifference) {
  Zone zone(CcTest::i_isolate());
  ZoneList<CharacterRange>* base =
      new(&zone) ZoneList<CharacterRange>(1, &zone);
  base->Add(CharacterRange::Everything(), &zone);
  Vector<const int> overlay = CharacterRange::GetWordBounds();
  ZoneList<CharacterRange>* included = NULL;
  ZoneList<CharacterRange>* excluded = NULL;
  CharacterRange::Split(base, overlay, &included, &excluded, &zone);
  for (int c = 0; c < (1 << 16); c++) {
    if (!InClass(c, base)) {
      // Characters outside the base class may appear in neither result.
      CHECK(!InClass(c, included));
      CHECK(!InClass(c, excluded));
      continue;
    }
    // The overlay is read as consecutive [start, end) pairs.
    bool in_overlay = false;
    for (int j = 0; j < overlay.length(); j += 2) {
      if (overlay[j] <= c && c < overlay[j + 1]) {
        in_overlay = true;
        break;
      }
    }
    CHECK_EQ(in_overlay, InClass(c, included));
    CHECK_EQ(!in_overlay, InClass(c, excluded));
  }
}
1643
1644
// Checks CharacterSet::Canonicalize on four inputs: already sorted ranges,
// unsorted ranges, a mix of unsorted ranges and singletons, and adjacent
// ranges that have to be merged into one.
//
// CHECK_EQ is used rather than DCHECK_EQ so that the expectations are also
// verified in release builds.
TEST(CanonicalizeCharacterSets) {
  Zone zone(CcTest::i_isolate());
  ZoneList<CharacterRange>* list =
      new(&zone) ZoneList<CharacterRange>(4, &zone);
  CharacterSet set(list);

  // Already sorted and disjoint: nothing changes.
  list->Add(CharacterRange(10, 20), &zone);
  list->Add(CharacterRange(30, 40), &zone);
  list->Add(CharacterRange(50, 60), &zone);
  set.Canonicalize();
  CHECK_EQ(3, list->length());
  CHECK_EQ(10, list->at(0).from());
  CHECK_EQ(20, list->at(0).to());
  CHECK_EQ(30, list->at(1).from());
  CHECK_EQ(40, list->at(1).to());
  CHECK_EQ(50, list->at(2).from());
  CHECK_EQ(60, list->at(2).to());

  // Disjoint but out of order: the ranges get sorted.
  list->Rewind(0);
  list->Add(CharacterRange(10, 20), &zone);
  list->Add(CharacterRange(50, 60), &zone);
  list->Add(CharacterRange(30, 40), &zone);
  set.Canonicalize();
  CHECK_EQ(3, list->length());
  CHECK_EQ(10, list->at(0).from());
  CHECK_EQ(20, list->at(0).to());
  CHECK_EQ(30, list->at(1).from());
  CHECK_EQ(40, list->at(1).to());
  CHECK_EQ(50, list->at(2).from());
  CHECK_EQ(60, list->at(2).to());

  // Unsorted mix of ranges and single characters, none of them touching.
  list->Rewind(0);
  list->Add(CharacterRange(30, 40), &zone);
  list->Add(CharacterRange(10, 20), &zone);
  list->Add(CharacterRange(25, 25), &zone);
  list->Add(CharacterRange(100, 100), &zone);
  list->Add(CharacterRange(1, 1), &zone);
  set.Canonicalize();
  CHECK_EQ(5, list->length());
  CHECK_EQ(1, list->at(0).from());
  CHECK_EQ(1, list->at(0).to());
  CHECK_EQ(10, list->at(1).from());
  CHECK_EQ(20, list->at(1).to());
  CHECK_EQ(25, list->at(2).from());
  CHECK_EQ(25, list->at(2).to());
  CHECK_EQ(30, list->at(3).from());
  CHECK_EQ(40, list->at(3).to());
  CHECK_EQ(100, list->at(4).from());
  CHECK_EQ(100, list->at(4).to());

  // [10,19], [20,20] and [21,30] are adjacent and collapse into [10,30].
  list->Rewind(0);
  list->Add(CharacterRange(10, 19), &zone);
  list->Add(CharacterRange(21, 30), &zone);
  list->Add(CharacterRange(20, 20), &zone);
  set.Canonicalize();
  CHECK_EQ(1, list->length());
  CHECK_EQ(10, list->at(0).from());
  CHECK_EQ(30, list->at(0).to());
}
1704
1705
// Builds two range lists, l1 (the "X" ranges) and l2 (the "Y" ranges), that
// together cover many relative placements of a range against another one, and
// checks that both lists are canonical.
//
// CHECK is used rather than DCHECK so that the expectations are also verified
// in release builds.
TEST(CharacterRangeMerge) {
  Zone zone(CcTest::i_isolate());
  ZoneList<CharacterRange> l1(4, &zone);
  ZoneList<CharacterRange> l2(4, &zone);
  // Create all combinations of intersections of ranges, both singletons and
  // longer.

  // Every group below is placed at its own, strictly increasing offset so
  // that the groups never touch each other.
  int offset = 0;

  // The five kinds of singleton intersections:
  //     X
  //   Y      - outside before
  //    Y     - outside touching start
  //     Y    - overlap
  //      Y   - outside touching end
  //       Y  - outside after

  for (int i = 0; i < 5; i++) {
    l1.Add(CharacterRange::Singleton(offset + 2), &zone);
    l2.Add(CharacterRange::Singleton(offset + i), &zone);
    offset += 6;
  }

  // The seven kinds of singleton/non-singleton intersections:
  //    XXX
  //  Y        - outside before
  //   Y       - outside touching start
  //    Y      - inside touching start
  //     Y     - entirely inside
  //      Y    - inside touching end
  //       Y   - outside touching end
  //        Y  - disjoint after

  for (int i = 0; i < 7; i++) {
    l1.Add(CharacterRange::Range(offset + 2, offset + 4), &zone);
    l2.Add(CharacterRange::Singleton(offset + i), &zone);
    offset += 8;
  }

  // The eleven kinds of non-singleton intersections:
  //
  //       XXXXXXXX
  // YYYY                  - outside before.
  //   YYYY                - outside touching start.
  //     YYYY              - overlapping start
  //       YYYY            - inside touching start
  //         YYYY          - entirely inside
  //           YYYY        - inside touching end
  //             YYYY      - overlapping end
  //               YYYY    - outside touching end
  //                 YYYY  - outside after
  //       YYYYYYYY        - identical
  //     YYYYYYYYYYYY      - containing entirely.

  for (int i = 0; i < 9; i++) {
    // X spans offset + 6 .. offset + 15, i.e. 10 characters (inclusive).
    l1.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);
    l2.Add(CharacterRange::Range(offset + 2 * i, offset + 2 * i + 3), &zone);
    offset += 22;
  }
  // Identical ranges.
  l1.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);
  l2.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);
  offset += 22;
  // Y contains X entirely.
  l1.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);
  l2.Add(CharacterRange::Range(offset + 4, offset + 17), &zone);
  offset += 22;

  // Different kinds of multi-range overlap:
  // XXXXXXXXXXXXXXXXXXXXXX         XXXXXXXXXXXXXXXXXXXXXX
  //   YYYY  Y  YYYY  Y  YYYY  Y  YYYY  Y  YYYY  Y  YYYY  Y

  l1.Add(CharacterRange::Range(offset, offset + 21), &zone);
  l1.Add(CharacterRange::Range(offset + 31, offset + 52), &zone);
  for (int i = 0; i < 6; i++) {
    l2.Add(CharacterRange::Range(offset + 2, offset + 5), &zone);
    l2.Add(CharacterRange::Singleton(offset + 8), &zone);
    offset += 9;
  }

  CHECK(CharacterRange::IsCanonical(&l1));
  CHECK(CharacterRange::IsCanonical(&l2));
}
1791
1792
// Smoke test: runs the pattern \b\w+\b through the file's Execute() helper;
// the test passes as long as that call completes.
TEST(Graph) {
  const char* pattern = "\\b\\w+\\b";
  Execute(pattern, false, true, true);
}
1796