1 // -*- coding: utf-8 -*-
2 // Copyright 2002-2009 The RE2 Authors. All Rights Reserved.
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
5
6 // TODO: Test extractions for PartialMatch/Consume
7
8 #include <sys/types.h>
9 #include <sys/mman.h>
10 #include <sys/stat.h>
11 #include <errno.h>
12 #include <vector>
13 #include "util/test.h"
14 #include "re2/re2.h"
15 #include "re2/regexp.h"
16
17 DECLARE_bool(logtostderr);
18
19 namespace re2 {
20
// Hexadecimal parsing into every built-in integer width, once through
// RE2::Hex (bare digits) and once through RE2::CRadix ("0x" prefix).
TEST(RE2, HexTests) {
  VLOG(1) << "hex tests";

// Matches the spelling of `value` (digits plus optional integer suffix) and
// verifies the captured digits parse to the literal 0x<value> as a `type`.
#define CHECK_HEX(type, value) \
  do { \
    type parsed; \
    CHECK(RE2::FullMatch(#value, "([0-9a-fA-F]+)[uUlL]*", \
                         RE2::Hex(&parsed))); \
    CHECK_EQ(parsed, 0x ## value); \
    CHECK(RE2::FullMatch("0x" #value, "([0-9a-fA-FxX]+)[uUlL]*", \
                         RE2::CRadix(&parsed))); \
    CHECK_EQ(parsed, 0x ## value); \
  } while (0)

  CHECK_HEX(short,              2bad);
  CHECK_HEX(unsigned short,     2badU);
  CHECK_HEX(int,                dead);
  CHECK_HEX(unsigned int,       deadU);
  CHECK_HEX(long,               7eadbeefL);
  CHECK_HEX(unsigned long,      deadbeefUL);
  CHECK_HEX(long long,          12345678deadbeefLL);
  CHECK_HEX(unsigned long long, cafebabedeadbeefULL);

#undef CHECK_HEX
}
45
// Octal parsing into every built-in integer width, once through RE2::Octal
// (bare digits) and once through RE2::CRadix (leading "0").
TEST(RE2, OctalTests) {
  VLOG(1) << "octal tests";

// Matches the spelling of `value` and verifies the captured digits parse to
// the octal literal 0<value> as a `type`.
#define CHECK_OCTAL(type, value) \
  do { \
    type parsed; \
    CHECK(RE2::FullMatch(#value, "([0-7]+)[uUlL]*", \
                         RE2::Octal(&parsed))); \
    CHECK_EQ(parsed, 0 ## value); \
    CHECK(RE2::FullMatch("0" #value, "([0-9a-fA-FxX]+)[uUlL]*", \
                         RE2::CRadix(&parsed))); \
    CHECK_EQ(parsed, 0 ## value); \
  } while (0)

  CHECK_OCTAL(short,              77777);
  CHECK_OCTAL(unsigned short,     177777U);
  CHECK_OCTAL(int,                17777777777);
  CHECK_OCTAL(unsigned int,       37777777777U);
  CHECK_OCTAL(long,               17777777777L);
  CHECK_OCTAL(unsigned long,      37777777777UL);
  CHECK_OCTAL(long long,          777777777777777777777LL);
  CHECK_OCTAL(unsigned long long, 1777777777777777777777ULL);

#undef CHECK_OCTAL
}
69
// Decimal parsing into every built-in integer width, once through the plain
// pointer argument and once through RE2::CRadix.
TEST(RE2, DecimalTests) {
  VLOG(1) << "decimal tests";

// Matches the spelling of `value` and verifies the captured digits parse to
// `value` itself when stored into a `type`.
#define CHECK_DECIMAL(type, value) \
  do { \
    type parsed; \
    CHECK(RE2::FullMatch(#value, "(-?[0-9]+)[uUlL]*", &parsed)); \
    CHECK_EQ(parsed, value); \
    CHECK(RE2::FullMatch(#value, "(-?[0-9a-fA-FxX]+)[uUlL]*", \
                         RE2::CRadix(&parsed))); \
    CHECK_EQ(parsed, value); \
  } while (0)

  CHECK_DECIMAL(short,              -1);
  CHECK_DECIMAL(unsigned short,     9999);
  CHECK_DECIMAL(int,                -1000);
  CHECK_DECIMAL(unsigned int,       12345U);
  CHECK_DECIMAL(long,               -10000000L);
  CHECK_DECIMAL(unsigned long,      3083324652U);
  CHECK_DECIMAL(long long,          -100000000000000LL);
  CHECK_DECIMAL(unsigned long long, 1234567890987654321ULL);

#undef CHECK_DECIMAL
}
93
// Table-driven check of RE2::Replace (first match only) and
// RE2::GlobalReplace (every match, plus the reported replacement count).
TEST(RE2, Replace) {
  VLOG(1) << "TestReplace";

  struct ReplaceCase {
    const char* pattern;     // regexp to search for
    const char* rewrite;     // rewrite template
    const char* input;       // text before replacement (NULL ends the table)
    const char* first_only;  // expected text after Replace
    const char* everywhere;  // expected text after GlobalReplace
    int everywhere_count;    // expected return value of GlobalReplace
  };
  static const ReplaceCase kCases[] = {
    { "(qu|[b-df-hj-np-tv-z]*)([a-z]+)",
      "\\2\\1ay",
      "the quick brown fox jumps over the lazy dogs.",
      "ethay quick brown fox jumps over the lazy dogs.",
      "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.",
      9 },
    { "\\w+",
      "\\0-NOSPAM",
      "abcd.efghi@google.com",
      "abcd-NOSPAM.efghi@google.com",
      "abcd-NOSPAM.efghi-NOSPAM@google-NOSPAM.com-NOSPAM",
      4 },
    { "^",
      "(START)",
      "foo",
      "(START)foo",
      "(START)foo",
      1 },
    { "^",
      "(START)",
      "",
      "(START)",
      "(START)",
      1 },
    { "$",
      "(END)",
      "",
      "(END)",
      "(END)",
      1 },
    { "b",
      "bb",
      "ababababab",
      "abbabababab",
      "abbabbabbabbabb",
      5 },
    { "b",
      "bb",
      "bbbbbb",
      "bbbbbbb",
      "bbbbbbbbbbbb",
      6 },
    { "b+",
      "bb",
      "bbbbbb",
      "bb",
      "bb",
      1 },
    { "b*",
      "bb",
      "bbbbbb",
      "bb",
      "bb",
      1 },
    { "b*",
      "bb",
      "aaaaa",
      "bbaaaaa",
      "bbabbabbabbabbabb",
      6 },
    // Check newline handling
    { "a.*a",
      "(\\0)",
      "aba\naba",
      "(aba)\naba",
      "(aba)\n(aba)",
      2 },
    // Sentinel: a NULL input terminates the loop below.
    { "", NULL, NULL, NULL, NULL, 0 }
  };

  for (int i = 0; kCases[i].input != NULL; ++i) {
    const ReplaceCase& c = kCases[i];
    VLOG(1) << StringPrintf("\"%s\" =~ s/%s/%s/g",
                            c.input, c.pattern, c.rewrite);

    // Single replacement.
    string replaced_once(c.input);
    CHECK(RE2::Replace(&replaced_once, c.pattern, c.rewrite));
    CHECK_EQ(replaced_once, c.first_only);

    // Global replacement.
    string replaced_all(c.input);
    CHECK_EQ(RE2::GlobalReplace(&replaced_all, c.pattern, c.rewrite),
             c.everywhere_count)
        << "Got: " << replaced_all;
    CHECK_EQ(replaced_all, c.everywhere);
  }
}
187
TestCheckRewriteString(const char * regexp,const char * rewrite,bool expect_ok)188 static void TestCheckRewriteString(const char* regexp, const char* rewrite,
189 bool expect_ok) {
190 string error;
191 RE2 exp(regexp);
192 bool actual_ok = exp.CheckRewriteString(rewrite, &error);
193 EXPECT_EQ(expect_ok, actual_ok) << " for " << rewrite << " error: " << error;
194 }
195
// Runs TestCheckRewriteString over a table of (regexp, rewrite, verdict).
TEST(CheckRewriteString, all) {
  struct RewriteCase {
    const char* regexp;
    const char* rewrite;
    bool valid;
  };
  static const RewriteCase kCases[] = {
    // No capturing groups.
    { "abc", "foo", true },
    { "abc", "foo\\", false },
    { "abc", "foo\\0bar", true },

    // One capturing group.
    { "a(b)c", "foo", true },
    { "a(b)c", "foo\\0bar", true },
    { "a(b)c", "foo\\1bar", true },
    { "a(b)c", "foo\\2bar", false },
    { "a(b)c", "f\\\\2o\\1o", true },

    // Two capturing groups.
    { "a(b)(c)", "foo\\12", true },
    { "a(b)(c)", "f\\2o\\1o", true },
    { "a(b)(c)", "f\\oo\\1", false },
  };

  const size_t count = sizeof(kCases) / sizeof(kCases[0]);
  for (size_t i = 0; i < count; ++i) {
    TestCheckRewriteString(kCases[i].regexp, kCases[i].rewrite,
                           kCases[i].valid);
  }
}
211
// RE2::Extract rewrites the first match into the output string and leaves
// the output alone when nothing matches.
TEST(RE2, Extract) {
  VLOG(1) << "TestExtract";

  string out;

  CHECK(RE2::Extract("boris@kremvax.ru", "(.*)@([^.]*)", "\\2!\\1", &out));
  CHECK_EQ(out, "kremvax!boris");

  CHECK(RE2::Extract("foo", ".*", "'\\0'", &out));
  CHECK_EQ(out, "'foo'");

  // A failed match must not clobber the previous result.
  CHECK(!RE2::Extract("baz", "bar", "'\\0'", &out));
  CHECK_EQ(out, "'foo'");
}
226
// RE2::Consume matches at the front of the input and advances past the match.
TEST(RE2, Consume) {
  VLOG(1) << "TestConsume";

  // Matches a word, possibly preceded by whitespace.
  RE2 word_re("\\s*(\\w+)");
  string word;

  string text(" aaa b!@#$@#$cccc");
  StringPiece remaining(text);

  CHECK(RE2::Consume(&remaining, word_re, &word));
  CHECK_EQ(word, "aaa") << " input: " << remaining;

  CHECK(RE2::Consume(&remaining, word_re, &word));
  CHECK_EQ(word, "b") << " input: " << remaining;

  CHECK(! RE2::Consume(&remaining, word_re, &word)) << " input: " << remaining;
}
242
// RE2::ConsumeN with zero, one and two output arguments.
TEST(RE2, ConsumeN) {
  const string text(" one two three 4");
  StringPiece remaining(text);

  RE2::Arg slots[2];
  const RE2::Arg* const slot_ptrs[2] = { &slots[0], &slots[1] };

  // No outputs requested: the match is consumed but nothing is stored.
  EXPECT_TRUE(RE2::ConsumeN(&remaining, "\\s*(\\w+)", slot_ptrs, 0));  // Skips "one".

  // One output.
  string word;
  slots[0] = &word;
  EXPECT_TRUE(RE2::ConsumeN(&remaining, "\\s*(\\w+)", slot_ptrs, 1));
  EXPECT_EQ("two", word);

  // Two outputs of different types.
  int number;
  slots[1] = &number;
  EXPECT_TRUE(RE2::ConsumeN(&remaining, "\\s*(\\w+)\\s*(\\d+)", slot_ptrs, 2));
  EXPECT_EQ("three", word);
  EXPECT_EQ(4, number);
}
266
// RE2::FindAndConsume searches anywhere in the input and advances past the
// match it finds.
TEST(RE2, FindAndConsume) {
  VLOG(1) << "TestFindAndConsume";

  RE2 word_re("(\\w+)");  // matches a word
  string word;

  string text(" aaa b!@#$@#$cccc");
  StringPiece remaining(text);

  CHECK(RE2::FindAndConsume(&remaining, word_re, &word));
  CHECK_EQ(word, "aaa");

  CHECK(RE2::FindAndConsume(&remaining, word_re, &word));
  CHECK_EQ(word, "b");

  CHECK(RE2::FindAndConsume(&remaining, word_re, &word));
  CHECK_EQ(word, "cccc");

  CHECK(! RE2::FindAndConsume(&remaining, word_re, &word));

  // FindAndConsume must also work without any submatches; an earlier
  // version used uninitialized data for the length to consume.
  remaining = "aaa";
  CHECK(RE2::FindAndConsume(&remaining, "aaa"));
  CHECK_EQ(remaining, "");
}
291
// RE2::FindAndConsumeN with zero, one and two output arguments.
TEST(RE2, FindAndConsumeN) {
  const string text(" one two three 4");
  StringPiece remaining(text);

  RE2::Arg slots[2];
  const RE2::Arg* const slot_ptrs[2] = { &slots[0], &slots[1] };

  // No outputs requested: the match is consumed but nothing is stored.
  EXPECT_TRUE(RE2::FindAndConsumeN(&remaining, "(\\w+)", slot_ptrs, 0));  // Skips "one".

  // One output.
  string word;
  slots[0] = &word;
  EXPECT_TRUE(RE2::FindAndConsumeN(&remaining, "(\\w+)", slot_ptrs, 1));
  EXPECT_EQ("two", word);

  // Two outputs of different types.
  int number;
  slots[1] = &number;
  EXPECT_TRUE(RE2::FindAndConsumeN(&remaining, "(\\w+)\\s*(\\d+)", slot_ptrs, 2));
  EXPECT_EQ("three", word);
  EXPECT_EQ(4, number);
}
315
// Groups that do not participate in a match must come back as empty strings,
// whichever alternative actually matched.
TEST(RE2, MatchNumberPeculiarity) {
  VLOG(1) << "TestMatchNumberPeculiarity";

  RE2 alternatives("(foo)|(bar)|(baz)");
  string first;
  string second;
  string third;

  CHECK(RE2::PartialMatch("foo", alternatives, &first, &second, &third));
  CHECK_EQ(first, "foo");
  CHECK_EQ(second, "");
  CHECK_EQ(third, "");

  CHECK(RE2::PartialMatch("bar", alternatives, &first, &second, &third));
  CHECK_EQ(first, "");
  CHECK_EQ(second, "bar");
  CHECK_EQ(third, "");

  CHECK(RE2::PartialMatch("baz", alternatives, &first, &second, &third));
  CHECK_EQ(first, "");
  CHECK_EQ(second, "");
  CHECK_EQ(third, "baz");

  CHECK(!RE2::PartialMatch("f", alternatives, &first, &second, &third));

  // The non-capturing alternative matches, so the group stays empty.
  string unused_group;
  CHECK(RE2::FullMatch("hello", "(foo)|hello", &unused_group));
  CHECK_EQ(unused_group, "");
}
342
// Exercises the low-level RE2::Match entry point and compares it with
// PartialMatch on the same host:port pattern.
TEST(RE2, Match) {
  RE2 host_port("((\\w+):([0-9]+))");  // extracts host and port
  StringPiece captures[4];

  // Text without any host:port pair must not match.
  StringPiece text = "zyzzyva";
  CHECK(!host_port.Match(text, 0, text.size(), RE2::UNANCHORED,
                         captures, arraysize(captures)));

  // Text containing a pair matches; captures[0] is the whole match.
  text = "a chrisr:9000 here";
  CHECK(host_port.Match(text, 0, text.size(), RE2::UNANCHORED,
                        captures, arraysize(captures)));
  CHECK_EQ(captures[0], "chrisr:9000");
  CHECK_EQ(captures[1], "chrisr:9000");
  CHECK_EQ(captures[2], "chrisr");
  CHECK_EQ(captures[3], "9000");

  // Same extraction through PartialMatch, with typed outputs.
  string whole, host;
  int port;
  CHECK(RE2::PartialMatch("a chrisr:9000 here", host_port,
                          &whole, &host, &port));
  CHECK_EQ(whole, "chrisr:9000");
  CHECK_EQ(host, "chrisr");
  CHECK_EQ(port, 9000);
}
368
TestRecursion(int size,const char * pattern)369 static void TestRecursion(int size, const char *pattern) {
370 // Fill up a string repeating the pattern given
371 string domain;
372 domain.resize(size);
373 int patlen = strlen(pattern);
374 for (int i = 0; i < size; ++i) {
375 domain[i] = pattern[i % patlen];
376 }
377 // Just make sure it doesn't crash due to too much recursion.
378 RE2 re("([a-zA-Z0-9]|-)+(\\.([a-zA-Z0-9]|-)+)*(\\.)?", RE2::Quiet);
379 RE2::FullMatch(domain, re);
380 }
381
382 // A meta-quoted string, interpreted as a pattern, should always match
383 // the original unquoted string.
TestQuoteMeta(string unquoted,const RE2::Options & options=RE2::DefaultOptions)384 static void TestQuoteMeta(string unquoted,
385 const RE2::Options& options = RE2::DefaultOptions) {
386 string quoted = RE2::QuoteMeta(unquoted);
387 RE2 re(quoted, options);
388 EXPECT_TRUE_M(RE2::FullMatch(unquoted, re),
389 "Unquoted='" + unquoted + "', quoted='" + quoted + "'.");
390 }
391
392 // A meta-quoted string, interpreted as a pattern, should always match
393 // the original unquoted string.
NegativeTestQuoteMeta(string unquoted,string should_not_match,const RE2::Options & options=RE2::DefaultOptions)394 static void NegativeTestQuoteMeta(string unquoted, string should_not_match,
395 const RE2::Options& options = RE2::DefaultOptions) {
396 string quoted = RE2::QuoteMeta(unquoted);
397 RE2 re(quoted, options);
398 EXPECT_FALSE_M(RE2::FullMatch(should_not_match, re),
399 "Unquoted='" + unquoted + "', quoted='" + quoted + "'.");
400 }
401
402 // Tests that quoted meta characters match their original strings,
403 // and that a few things that shouldn't match indeed do not.
TEST(QuoteMeta,Simple)404 TEST(QuoteMeta, Simple) {
405 TestQuoteMeta("foo");
406 TestQuoteMeta("foo.bar");
407 TestQuoteMeta("foo\\.bar");
408 TestQuoteMeta("[1-9]");
409 TestQuoteMeta("1.5-2.0?");
410 TestQuoteMeta("\\d");
411 TestQuoteMeta("Who doesn't like ice cream?");
412 TestQuoteMeta("((a|b)c?d*e+[f-h]i)");
413 TestQuoteMeta("((?!)xxx).*yyy");
414 TestQuoteMeta("([");
415 }
// A meta-quoted string must not match texts that only the unquoted regexp
// interpretation would have accepted.
TEST(QuoteMeta, SimpleNegative) {
  struct NegativeCase {
    const char* unquoted;
    const char* should_not_match;
  };
  static const NegativeCase kCases[] = {
    { "foo", "bar" },
    { "...", "bar" },
    { "\\.", "." },
    { "\\.", ".." },
    { "(a)", "a" },
    { "(a|b)", "a" },
    { "(a|b)", "(a)" },
    { "(a|b)", "a|b" },
    { "[0-9]", "0" },
    { "[0-9]", "0-9" },
    { "[0-9]", "[9]" },
    { "((?!)xxx)", "xxx" },
  };

  const size_t count = sizeof(kCases) / sizeof(kCases[0]);
  for (size_t i = 0; i < count; ++i)
    NegativeTestQuoteMeta(kCases[i].unquoted, kCases[i].should_not_match);
}
430
// A byte above 0x7f must survive quoting when the pattern is Latin-1.
TEST(QuoteMeta, Latin1) {
  const string with_high_byte("3\xb2 = 9");
  TestQuoteMeta(with_high_byte, RE2::Latin1);
}
434
// Multi-byte UTF-8 sequences must survive quoting intact.
TEST(QuoteMeta, UTF8) {
  static const char* const kInputs[] = {
    "Plácido Domingo",
    "xyz",                 // No fancy utf8.
    "\xc2\xb0",            // 2-byte utf8 -- a degree symbol.
    "27\xc2\xb0 degrees",  // As a middle character.
    "\xe2\x80\xb3",        // 3-byte utf8 -- a double prime.
    "\xf0\x9d\x85\x9f",    // 4-byte utf8 -- a music note.
    "27\xc2\xb0",          // Interpreted as Latin-1, this should
                           // still work.
  };

  const size_t count = sizeof(kInputs) / sizeof(kInputs[0]);
  for (size_t i = 0; i < count; ++i)
    TestQuoteMeta(kInputs[i]);

  // 2-byte utf8 -- a degree symbol.
  NegativeTestQuoteMeta("27\xc2\xb0",
                        "27\\\xc2\\\xb0");
}
447
// Embedded NUL bytes must be quoted so that they match only themselves.
TEST(QuoteMeta, HasNull) {
  // A string holding exactly one NUL character.
  string with_nul(1, '\0');
  TestQuoteMeta(with_nul);
  NegativeTestQuoteMeta(with_nul, "");

  // NUL followed by '1' must not be read back as the escape '\01'.
  with_nul.push_back('1');
  TestQuoteMeta(with_nul);
  NegativeTestQuoteMeta(with_nul, "\1");
}
461
// ProgramSize() must be positive and must grow with regexp complexity.
TEST(ProgramSize, BigProgram) {
  RE2 simple("simple regexp");
  RE2 medium("medium.*regexp");
  RE2 complex("hard.{1,128}regexp");

  CHECK_GT(simple.ProgramSize(), 0);
  CHECK_GT(medium.ProgramSize(), simple.ProgramSize());
  CHECK_GT(complex.ProgramSize(), medium.ProgramSize());
}
471
472 // Issue 956519: handling empty character sets was
473 // causing NULL dereference. This tests a few empty character sets.
474 // (The way to get an empty character set is to negate a full one.)
TEST(EmptyCharset,Fuzz)475 TEST(EmptyCharset, Fuzz) {
476 static const char *empties[] = {
477 "[^\\S\\s]",
478 "[^\\S[:space:]]",
479 "[^\\D\\d]",
480 "[^\\D[:digit:]]"
481 };
482 for (int i = 0; i < arraysize(empties); i++)
483 CHECK(!RE2(empties[i]).Match("abc", 0, 3, RE2::UNANCHORED, NULL, 0));
484 }
485
486 // Test that named groups work correctly.
TEST(Capture,NamedGroups)487 TEST(Capture, NamedGroups) {
488 {
489 RE2 re("(hello world)");
490 CHECK_EQ(re.NumberOfCapturingGroups(), 1);
491 const map<string, int>& m = re.NamedCapturingGroups();
492 CHECK_EQ(m.size(), 0);
493 }
494
495 {
496 RE2 re("(?P<A>expr(?P<B>expr)(?P<C>expr))((expr)(?P<D>expr))");
497 CHECK_EQ(re.NumberOfCapturingGroups(), 6);
498 const map<string, int>& m = re.NamedCapturingGroups();
499 CHECK_EQ(m.size(), 4);
500 CHECK_EQ(m.find("A")->second, 1);
501 CHECK_EQ(m.find("B")->second, 2);
502 CHECK_EQ(m.find("C")->second, 3);
503 CHECK_EQ(m.find("D")->second, 6); // $4 and $5 are anonymous
504 }
505 }
506
// FullMatch without output arguments: the pattern must cover the whole text.
TEST(RE2, FullMatchWithNoArgs) {
  // Texts the pattern covers completely.
  CHECK(RE2::FullMatch("h", "h"));
  CHECK(RE2::FullMatch("hello", "hello"));

  const char* const kHToO = "h.*o";
  CHECK(RE2::FullMatch("hello", kHToO));
  CHECK(!RE2::FullMatch("othello", kHToO));  // Must be anchored at front
  CHECK(!RE2::FullMatch("hello!", kHToO));   // Must be anchored at end
}
514
// PartialMatch succeeds when the pattern matches anywhere inside the text.
TEST(RE2, PartialMatch) {
  CHECK(RE2::PartialMatch("x", "x"));

  const char* const kHToO = "h.*o";
  CHECK(RE2::PartialMatch("hello", kHToO));
  CHECK(RE2::PartialMatch("othello", kHToO));
  CHECK(RE2::PartialMatch("hello!", kHToO));

  // Twenty nested capturing groups and no output arguments.
  CHECK(RE2::PartialMatch("x", "((((((((((((((((((((x))))))))))))))))))))"));
}
522
// RE2::PartialMatchN with zero, one and two output arguments.
TEST(RE2, PartialMatchN) {
  RE2::Arg slots[2];
  const RE2::Arg* const slot_ptrs[2] = { &slots[0], &slots[1] };

  // No outputs.
  EXPECT_TRUE(RE2::PartialMatchN("hello", "e.*o", slot_ptrs, 0));
  EXPECT_FALSE(RE2::PartialMatchN("othello", "a.*o", slot_ptrs, 0));

  // One integer output.
  int number;
  slots[0] = &number;
  EXPECT_TRUE(RE2::PartialMatchN("1001 nights", "(\\d+)", slot_ptrs, 1));
  EXPECT_EQ(1001, number);
  EXPECT_FALSE(RE2::PartialMatchN("three", "(\\d+)", slot_ptrs, 1));

  // Integer plus string output.
  string word;
  slots[1] = &word;
  EXPECT_TRUE(RE2::PartialMatchN("answer: 42:life", "(\\d+):(\\w+)", slot_ptrs, 2));
  EXPECT_EQ(42, number);
  EXPECT_EQ("life", word);
  EXPECT_FALSE(RE2::PartialMatchN("hi1", "(\\w+)(1)", slot_ptrs, 2));
}
546
// FullMatch with no output arguments at all.
TEST(RE2, FullMatchZeroArg) {
  const bool all_digits = RE2::FullMatch("1001", "\\d+");
  CHECK(all_digits);
}
551
// FullMatch with a single integer output argument.
TEST(RE2, FullMatchOneArg) {
  int number;

  CHECK(RE2::FullMatch("1001", "(\\d+)", &number));
  CHECK_EQ(number, 1001);

  CHECK(RE2::FullMatch("-123", "(-?\\d+)", &number));
  CHECK_EQ(number, -123);

  // An empty capture cannot be parsed as an integer.
  CHECK(!RE2::FullMatch("10", "()\\d+", &number));

  // Far too many digits to fit in an int.
  CHECK(!RE2::FullMatch("1234567890123456789012345678901234567890",
                        "(\\d+)", &number));
}
564
// Only the captured digits are parsed, even when other digits surround the
// group in the text.
TEST(RE2, FullMatchIntegerArg) {
  int number;

  CHECK(RE2::FullMatch("1234", "1(\\d*)4", &number));
  CHECK_EQ(number, 23);

  CHECK(RE2::FullMatch("1234", "(\\d)\\d+", &number));
  CHECK_EQ(number, 1);

  CHECK(RE2::FullMatch("-1234", "(-\\d)\\d+", &number));
  CHECK_EQ(number, -1);

  CHECK(RE2::PartialMatch("1234", "(\\d)", &number));
  CHECK_EQ(number, 1);

  CHECK(RE2::PartialMatch("-1234", "(-\\d)", &number));
  CHECK_EQ(number, -1);
}
580
// FullMatch storing a capture into a string.
TEST(RE2, FullMatchStringArg) {
  string middle;
  CHECK(RE2::FullMatch("hello", "h(.*)o", &middle));
  CHECK_EQ(middle, string("ell"));
}
587
// FullMatch storing one capture into a StringPiece and one into an int.
TEST(RE2, FullMatchStringPieceArg) {
  StringPiece name;
  int number;
  CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &name, &number));
  CHECK_EQ(name.size(), 4);
  CHECK(memcmp(name.data(), "ruby", 4) == 0);
  CHECK_EQ(number, 1234);
}
597
// FullMatch with two output arguments of different types.
TEST(RE2, FullMatchMultiArg) {
  string name;
  int number;
  CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &name, &number));
  CHECK_EQ(name, string("ruby"));
  CHECK_EQ(number, 1234);
}
606
// RE2::FullMatchN with zero, one and two output arguments.
TEST(RE2, FullMatchN) {
  RE2::Arg slots[2];
  const RE2::Arg* const slot_ptrs[2] = { &slots[0], &slots[1] };

  // No outputs.
  EXPECT_TRUE(RE2::FullMatchN("hello", "h.*o", slot_ptrs, 0));
  EXPECT_FALSE(RE2::FullMatchN("othello", "h.*o", slot_ptrs, 0));

  // One integer output.
  int number;
  slots[0] = &number;
  EXPECT_TRUE(RE2::FullMatchN("1001", "(\\d+)", slot_ptrs, 1));
  EXPECT_EQ(1001, number);
  EXPECT_FALSE(RE2::FullMatchN("three", "(\\d+)", slot_ptrs, 1));

  // Integer plus string output.
  string word;
  slots[1] = &word;
  EXPECT_TRUE(RE2::FullMatchN("42:life", "(\\d+):(\\w+)", slot_ptrs, 2));
  EXPECT_EQ(42, number);
  EXPECT_EQ("life", word);
  EXPECT_FALSE(RE2::FullMatchN("hi1", "(\\w+)(1)", slot_ptrs, 2));
}
630
// A void* NULL argument skips the corresponding capture.
TEST(RE2, FullMatchIgnoredArg) {
  string name;
  int number;
  CHECK(RE2::FullMatch("ruby:1234", "(\\w+)(:)(\\d+)",
                       &name, static_cast<void*>(NULL), &number));
  CHECK_EQ(name, string("ruby"));
  CHECK_EQ(number, 1234);
}
639
// A typed NULL argument stores nothing, but the capture must still be
// parseable as that type for the match to succeed.
TEST(RE2, FullMatchTypedNullArg) {
  string s;

  // Captures that parse as the requested type: the match succeeds.
  CHECK(RE2::FullMatch("hello", "he(.*)lo", static_cast<char*>(NULL)));
  CHECK(RE2::FullMatch("hello", "h(.*)o", static_cast<string*>(NULL)));
  CHECK(RE2::FullMatch("hello", "h(.*)o", static_cast<StringPiece*>(NULL)));
  CHECK(RE2::FullMatch("1234", "(.*)", static_cast<int*>(NULL)));
  CHECK(RE2::FullMatch("1234567890123456", "(.*)",
                       static_cast<long long*>(NULL)));
  CHECK(RE2::FullMatch("123.4567890123456", "(.*)",
                       static_cast<double*>(NULL)));
  CHECK(RE2::FullMatch("123.4567890123456", "(.*)",
                       static_cast<float*>(NULL)));

  // Captures that do not parse as the requested type: the match fails.
  CHECK(!RE2::FullMatch("hello", "h(.*)lo", &s, static_cast<char*>(NULL)));
  CHECK(!RE2::FullMatch("hello", "(.*)", static_cast<int*>(NULL)));
  CHECK(!RE2::FullMatch("1234567890123456", "(.*)", static_cast<int*>(NULL)));
  CHECK(!RE2::FullMatch("hello", "(.*)", static_cast<double*>(NULL)));
  CHECK(!RE2::FullMatch("hello", "(.*)", static_cast<float*>(NULL)));
}
659
660 // Check that numeric parsing code does not read past the end of
661 // the number being parsed.
TEST(RE2,NULTerminated)662 TEST(RE2, NULTerminated) {
663 char *v;
664 int x;
665 long pagesize = sysconf(_SC_PAGE_SIZE);
666
667 #ifndef MAP_ANONYMOUS
668 #define MAP_ANONYMOUS MAP_ANON
669 #endif
670 v = static_cast<char*>(mmap(NULL, 2*pagesize, PROT_READ|PROT_WRITE,
671 MAP_ANONYMOUS|MAP_PRIVATE, -1, 0));
672 CHECK(v != reinterpret_cast<char*>(-1));
673 LOG(INFO) << "Memory at " << (void*)v;
674 CHECK_EQ(munmap(v + pagesize, pagesize), 0) << " error " << errno;
675 v[pagesize - 1] = '1';
676
677 x = 0;
678 CHECK(RE2::FullMatch(StringPiece(v + pagesize - 1, 1), "(.*)", &x));
679 CHECK_EQ(x, 1);
680 }
681
// Parses captures into each supported character and integer type and checks
// both the values at the limits of the type and the rejection of values just
// outside them.
TEST(RE2, FullMatchTypeTests) {
  // Long run of leading zeros, used to check that padding is accepted.
  const string zeros(100, '0');

  // char
  {
    char c;
    CHECK(RE2::FullMatch("Hello", "(H)ello", &c));
    CHECK_EQ(c, 'H');
  }

  // unsigned char
  {
    unsigned char c;
    CHECK(RE2::FullMatch("Hello", "(H)ello", &c));
    CHECK_EQ(c, static_cast<unsigned char>('H'));
  }

  // int16
  {
    int16 v;
    CHECK(RE2::FullMatch("100", "(-?\\d+)", &v));
    CHECK_EQ(v, 100);
    CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v));
    CHECK_EQ(v, -100);
    CHECK(RE2::FullMatch("32767", "(-?\\d+)", &v));
    CHECK_EQ(v, 32767);
    CHECK(RE2::FullMatch("-32768", "(-?\\d+)", &v));
    CHECK_EQ(v, -32768);
    CHECK(!RE2::FullMatch("-32769", "(-?\\d+)", &v));
    CHECK(!RE2::FullMatch("32768", "(-?\\d+)", &v));
  }

  // uint16
  {
    uint16 v;
    CHECK(RE2::FullMatch("100", "(\\d+)", &v));
    CHECK_EQ(v, 100);
    CHECK(RE2::FullMatch("32767", "(\\d+)", &v));
    CHECK_EQ(v, 32767);
    CHECK(RE2::FullMatch("65535", "(\\d+)", &v));
    CHECK_EQ(v, 65535);
    CHECK(!RE2::FullMatch("65536", "(\\d+)", &v));
  }

  // int32
  {
    int32 v;
    static const int32 max = 0x7fffffff;
    static const int32 min = -max - 1;
    CHECK(RE2::FullMatch("100", "(-?\\d+)", &v));
    CHECK_EQ(v, 100);
    CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v));
    CHECK_EQ(v, -100);
    CHECK(RE2::FullMatch("2147483647", "(-?\\d+)", &v));
    CHECK_EQ(v, max);
    CHECK(RE2::FullMatch("-2147483648", "(-?\\d+)", &v));
    CHECK_EQ(v, min);
    CHECK(!RE2::FullMatch("-2147483649", "(-?\\d+)", &v));
    CHECK(!RE2::FullMatch("2147483648", "(-?\\d+)", &v));

    // Leading zeros do not change the value or the range check.
    CHECK(RE2::FullMatch(zeros + "2147483647", "(-?\\d+)", &v));
    CHECK_EQ(v, max);
    CHECK(RE2::FullMatch("-" + zeros + "2147483648", "(-?\\d+)", &v));
    CHECK_EQ(v, min);

    CHECK(!RE2::FullMatch("-" + zeros + "2147483649", "(-?\\d+)", &v));
    CHECK(RE2::FullMatch("0x7fffffff", "(.*)", RE2::CRadix(&v)));
    CHECK_EQ(v, max);
    CHECK(!RE2::FullMatch("000x7fffffff", "(.*)", RE2::CRadix(&v)));
  }

  // uint32
  {
    uint32 v;
    static const uint32 max = 0xfffffffful;
    CHECK(RE2::FullMatch("100", "(\\d+)", &v));
    CHECK_EQ(v, 100);
    CHECK(RE2::FullMatch("4294967295", "(\\d+)", &v));
    CHECK_EQ(v, max);
    CHECK(!RE2::FullMatch("4294967296", "(\\d+)", &v));
    CHECK(!RE2::FullMatch("-1", "(\\d+)", &v));

    CHECK(RE2::FullMatch(zeros + "4294967295", "(\\d+)", &v));
    CHECK_EQ(v, max);
  }

  // int64
  {
    int64 v;
    static const int64 max = 0x7fffffffffffffffull;
    static const int64 min = -max - 1;
    char buf[32];

    CHECK(RE2::FullMatch("100", "(-?\\d+)", &v));
    CHECK_EQ(v, 100);
    CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v));
    CHECK_EQ(v, -100);

    snprintf(buf, sizeof(buf), "%lld", (long long int)max);
    CHECK(RE2::FullMatch(buf, "(-?\\d+)", &v));
    CHECK_EQ(v, max);

    snprintf(buf, sizeof(buf), "%lld", (long long int)min);
    CHECK(RE2::FullMatch(buf, "(-?\\d+)", &v));
    CHECK_EQ(v, min);

    // Bump the last digit to step just past the limit.  The bump is only
    // meaningful if that digit is not already '9'; verify this with CHECK
    // rather than assert so the guard is not compiled out under NDEBUG.
    snprintf(buf, sizeof(buf), "%lld", (long long int)max);
    CHECK(buf[strlen(buf)-1] != '9');
    buf[strlen(buf)-1]++;
    CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v));

    snprintf(buf, sizeof(buf), "%lld", (long long int)min);
    CHECK(buf[strlen(buf)-1] != '9');
    buf[strlen(buf)-1]++;
    CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v));
  }

  // uint64
  {
    uint64 v;
    int64 v2;
    static const uint64 max = 0xffffffffffffffffull;
    char buf[32];

    CHECK(RE2::FullMatch("100", "(-?\\d+)", &v));
    CHECK_EQ(v, 100);
    CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v2));
    CHECK_EQ(v2, -100);

    snprintf(buf, sizeof(buf), "%llu", (long long unsigned)max);
    CHECK(RE2::FullMatch(buf, "(-?\\d+)", &v));
    CHECK_EQ(v, max);

    // One past the maximum; same last-digit guard as for int64.
    CHECK(buf[strlen(buf)-1] != '9');
    buf[strlen(buf)-1]++;
    CHECK(!RE2::FullMatch(buf, "(-?\\d+)", &v));
  }
}
784
// Parses captures into float and double, including inputs chosen to expose
// incorrect rounding.
TEST(RE2, FloatingPointFullMatchTypes) {
  const string zeros(100, '0');

  // float
  {
    float f;
    CHECK(RE2::FullMatch("100", "(.*)", &f));
    CHECK_EQ(f, 100);
    CHECK(RE2::FullMatch("-100.", "(.*)", &f));
    CHECK_EQ(f, -100);
    CHECK(RE2::FullMatch("1e23", "(.*)", &f));
    CHECK_EQ(f, float(1e23));

    CHECK(RE2::FullMatch(zeros + "1e23", "(.*)", &f));
    CHECK_EQ(f, float(1e23));

    CHECK(RE2::FullMatch("0.1", "(.*)", &f));
    CHECK_EQ(f, 0.1f) << StringPrintf("%.8g != %.8g", f, 0.1f);

    // 6700000000081920.1 is an edge case.
    // 6700000000081920 is exactly halfway between
    // two float32s, so the .1 should make it round up.
    // However, the .1 is outside the precision possible with
    // a float64: the nearest float64 is 6700000000081920.
    // So if the code uses strtod and then converts to float32,
    // round-to-even will make it round down instead of up.
    // To pass the test, the parser must call strtof directly.
    // This test case is carefully chosen to use only a 17-digit
    // number, since C does not guarantee to get the correctly
    // rounded answer for strtod and strtof unless the input is
    // short.
    CHECK(RE2::FullMatch("6700000000081920.1", "(.*)", &f));
    CHECK_EQ(f, 6700000000081920.1f)
        << StringPrintf("%.8g != %.8g", f, 6700000000081920.1f);
  }

  // double
  {
    double d;
    CHECK(RE2::FullMatch("100", "(.*)", &d));
    CHECK_EQ(d, 100);
    CHECK(RE2::FullMatch("-100.", "(.*)", &d));
    CHECK_EQ(d, -100);
    CHECK(RE2::FullMatch("1e23", "(.*)", &d));
    CHECK_EQ(d, 1e23);
    CHECK(RE2::FullMatch(zeros + "1e23", "(.*)", &d));
    CHECK_EQ(d, double(1e23));

    CHECK(RE2::FullMatch("0.1", "(.*)", &d));
    CHECK_EQ(d, 0.1) << StringPrintf("%.17g != %.17g", d, 0.1);
    CHECK(RE2::FullMatch("1.00000005960464485", "(.*)", &d));
    CHECK_EQ(d, 1.0000000596046448)
        << StringPrintf("%.17g != %.17g", d, 1.0000000596046448);
  }
}
829
// FullMatch is anchored at both ends of the text.
TEST(RE2, FullMatchAnchored) {
  int number;

  // Stray characters before or after the digits defeat the match...
  CHECK(!RE2::FullMatch("x1001", "(\\d+)", &number));
  CHECK(!RE2::FullMatch("1001x", "(\\d+)", &number));

  // ...unless the pattern accounts for them.
  CHECK(RE2::FullMatch("x1001", "x(\\d+)", &number));
  CHECK_EQ(number, 1001);
  CHECK(RE2::FullMatch("1001x", "(\\d+)x", &number));
  CHECK_EQ(number, 1001);
}
838
// Open-ended brace repetition: at least five characters from the class.
TEST(RE2, FullMatchBraces) {
  const char* const kFiveOrMore = "[0-9a-f+.-]{5,}";
  CHECK(RE2::FullMatch("0abcd", kFiveOrMore));
  CHECK(RE2::FullMatch("0abcde", kFiveOrMore));
  CHECK(!RE2::FullMatch("0abc", kFiveOrMore));
}
845
// Alternation mixing literals with a character class.
TEST(RE2, Complicated) {
  const char* const kAlternatives = "foo|bar|[A-Z]";
  CHECK(RE2::FullMatch("foo", kAlternatives));
  CHECK(RE2::FullMatch("bar", kAlternatives));
  CHECK(RE2::FullMatch("X", kAlternatives));
  CHECK(!RE2::FullMatch("XY", kAlternatives));
}
853
// Full-match handling at the end of the text (a '$' has to be tacked on
// internally).
TEST(RE2, FullMatchEnd) {
  // Both alternatives must be able to reach the end of the text.
  CHECK(RE2::FullMatch("fo", "fo|foo"));
  CHECK(RE2::FullMatch("foo", "fo|foo"));
  CHECK(RE2::FullMatch("fo", "fo|foo$"));
  CHECK(RE2::FullMatch("foo", "fo|foo$"));
  CHECK(RE2::FullMatch("foo", "foo$"));

  CHECK(!RE2::FullMatch("foo$bar", "foo\\$"));
  CHECK(!RE2::FullMatch("fox", "fo|bar"));

  // Uncomment the following if we change the handling of '$' to
  // prevent it from matching a trailing newline
  if (false) {
    // Check that we don't get bitten by pcre's special handling of a
    // '\n' at the end of the string matching '$'
    CHECK(!RE2::PartialMatch("foo\n", "foo$"));
  }
}
872
TEST(RE2,FullMatchArgCount)873 TEST(RE2, FullMatchArgCount) {
874 // Number of args
875 int a[16];
876 CHECK(RE2::FullMatch("", ""));
877
878 memset(a, 0, sizeof(0));
879 CHECK(RE2::FullMatch("1",
880 "(\\d){1}",
881 &a[0]));
882 CHECK_EQ(a[0], 1);
883
884 memset(a, 0, sizeof(0));
885 CHECK(RE2::FullMatch("12",
886 "(\\d)(\\d)",
887 &a[0], &a[1]));
888 CHECK_EQ(a[0], 1);
889 CHECK_EQ(a[1], 2);
890
891 memset(a, 0, sizeof(0));
892 CHECK(RE2::FullMatch("123",
893 "(\\d)(\\d)(\\d)",
894 &a[0], &a[1], &a[2]));
895 CHECK_EQ(a[0], 1);
896 CHECK_EQ(a[1], 2);
897 CHECK_EQ(a[2], 3);
898
899 memset(a, 0, sizeof(0));
900 CHECK(RE2::FullMatch("1234",
901 "(\\d)(\\d)(\\d)(\\d)",
902 &a[0], &a[1], &a[2], &a[3]));
903 CHECK_EQ(a[0], 1);
904 CHECK_EQ(a[1], 2);
905 CHECK_EQ(a[2], 3);
906 CHECK_EQ(a[3], 4);
907
908 memset(a, 0, sizeof(0));
909 CHECK(RE2::FullMatch("12345",
910 "(\\d)(\\d)(\\d)(\\d)(\\d)",
911 &a[0], &a[1], &a[2], &a[3],
912 &a[4]));
913 CHECK_EQ(a[0], 1);
914 CHECK_EQ(a[1], 2);
915 CHECK_EQ(a[2], 3);
916 CHECK_EQ(a[3], 4);
917 CHECK_EQ(a[4], 5);
918
919 memset(a, 0, sizeof(0));
920 CHECK(RE2::FullMatch("123456",
921 "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
922 &a[0], &a[1], &a[2], &a[3],
923 &a[4], &a[5]));
924 CHECK_EQ(a[0], 1);
925 CHECK_EQ(a[1], 2);
926 CHECK_EQ(a[2], 3);
927 CHECK_EQ(a[3], 4);
928 CHECK_EQ(a[4], 5);
929 CHECK_EQ(a[5], 6);
930
931 memset(a, 0, sizeof(0));
932 CHECK(RE2::FullMatch("1234567",
933 "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
934 &a[0], &a[1], &a[2], &a[3],
935 &a[4], &a[5], &a[6]));
936 CHECK_EQ(a[0], 1);
937 CHECK_EQ(a[1], 2);
938 CHECK_EQ(a[2], 3);
939 CHECK_EQ(a[3], 4);
940 CHECK_EQ(a[4], 5);
941 CHECK_EQ(a[5], 6);
942 CHECK_EQ(a[6], 7);
943
944 memset(a, 0, sizeof(0));
945 CHECK(RE2::FullMatch("1234567890123456",
946 "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)"
947 "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
948 &a[0], &a[1], &a[2], &a[3],
949 &a[4], &a[5], &a[6], &a[7],
950 &a[8], &a[9], &a[10], &a[11],
951 &a[12], &a[13], &a[14], &a[15]));
952 CHECK_EQ(a[0], 1);
953 CHECK_EQ(a[1], 2);
954 CHECK_EQ(a[2], 3);
955 CHECK_EQ(a[3], 4);
956 CHECK_EQ(a[4], 5);
957 CHECK_EQ(a[5], 6);
958 CHECK_EQ(a[6], 7);
959 CHECK_EQ(a[7], 8);
960 CHECK_EQ(a[8], 9);
961 CHECK_EQ(a[9], 0);
962 CHECK_EQ(a[10], 1);
963 CHECK_EQ(a[11], 2);
964 CHECK_EQ(a[12], 3);
965 CHECK_EQ(a[13], 4);
966 CHECK_EQ(a[14], 5);
967 CHECK_EQ(a[15], 6);
968 }
969
// Simple accessors on a compiled RE2 object.
TEST(RE2, Accessors) {
  // pattern() returns the text the object was constructed from.
  {
    const string source = "http://([^/]+)/.*";
    const RE2 compiled(source);
    CHECK_EQ(source, compiled.pattern());
  }

  // A valid pattern reports no error through any of the error accessors.
  {
    RE2 valid("foo");
    CHECK(valid.error().empty());  // Must have no error
    CHECK(valid.ok());
    CHECK(valid.error_code() == RE2::NoError);
  }
}
986
// Checks UTF-8 handling against Latin-1 (byte) handling on a string of
// three Japanese characters (nihongo).
//
// The byte sequences are written as string literals.  Brace-initialising a
// char array from values such as 0xe6 is a narrowing conversion, which
// C++11 and later reject on platforms where char is signed.
TEST(RE2, UTF8) {
  const char utf8_string[] =
      "\xe6\x97\xa5"   // U+65E5
      "\xe6\x9c\xac"   // U+672C
      "\xe8\xaa\x9e";  // U+8A9E
  const char utf8_pattern[] =
      "."
      "\xe6\x9c\xac"   // U+672C
      ".";

  // Both should match in either mode, bytes or UTF-8
  RE2 re_test1(".........", RE2::Latin1);
  CHECK(RE2::FullMatch(utf8_string, re_test1));
  RE2 re_test2("...");
  CHECK(RE2::FullMatch(utf8_string, re_test2));

  // Check that '.' matches one byte or UTF-8 character
  // according to the mode.
  string s;
  RE2 re_test3("(.)", RE2::Latin1);
  CHECK(RE2::PartialMatch(utf8_string, re_test3, &s));
  CHECK_EQ(s, string("\xe6"));
  RE2 re_test4("(.)");
  CHECK(RE2::PartialMatch(utf8_string, re_test4, &s));
  CHECK_EQ(s, string("\xe6\x97\xa5"));

  // Check that string matches itself in either mode
  RE2 re_test5(utf8_string, RE2::Latin1);
  CHECK(RE2::FullMatch(utf8_string, re_test5));
  RE2 re_test6(utf8_string);
  CHECK(RE2::FullMatch(utf8_string, re_test6));

  // Check that pattern matches string only in UTF8 mode
  RE2 re_test7(utf8_pattern, RE2::Latin1);
  CHECK(!RE2::FullMatch(utf8_string, re_test7));
  RE2 re_test8(utf8_pattern);
  CHECK(RE2::FullMatch(utf8_string, re_test8));
}
1031
// Ungreedy, UTF-8 regular expressions must not match when they
// oughtn't -- see bug 82246.
TEST(RE2, UngreedyUTF8) {
  // Shared input: neither pattern below may full-match it.
  const string subject = "a aX";

  // Greedy form.  This code always worked.
  {
    const char* greedy = "\\w+X";
    RE2 latin1_re(greedy, RE2::Latin1);
    RE2 default_re(greedy);

    CHECK(!RE2::FullMatch(subject, latin1_re));
    CHECK(!RE2::FullMatch(subject, default_re));
  }

  // Ungreedy form, selected with the (?U) flag.
  {
    const char* ungreedy = "(?U)\\w+X";
    RE2 latin1_re(ungreedy, RE2::Latin1);
    CHECK_EQ(latin1_re.error(), "");  // the flag itself must be accepted
    RE2 default_re(ungreedy);

    CHECK(!RE2::FullMatch(subject, latin1_re));
    CHECK(!RE2::FullMatch(subject, default_re));
  }
}
1056
// Each pattern in the table must fail to compile: ok() has to be false.
// RE2::Quiet is passed to every constructor.
TEST(RE2, Rejects) {
  static const char* const kBadPatterns[] = {
    "a\\1",           // backslash followed by a digit
    "a[x",            // character class never closed
    "a[z-a]",         // range with endpoints in descending order
    "a[[:foobar:]]",  // [:foobar:] class name
    "a(b",            // parenthesis never closed
    "a\\",            // trailing backslash
  };
  const size_t num_patterns = sizeof(kBadPatterns) / sizeof(kBadPatterns[0]);

  for (size_t idx = 0; idx < num_patterns; ++idx) {
    RE2 re(kBadPatterns[idx], RE2::Quiet);
    CHECK(!re.ok());
  }
}
1080
// Using a regexp that failed to compile must fail cleanly rather than crash,
// and a large-but-legal one must still work.
TEST(RE2, NoCrash) {
  // A bad regexp (trailing backslash): not ok(), and matching returns false.
  {
    RE2 broken("a\\", RE2::Quiet);
    CHECK(!broken.ok());
    CHECK(!RE2::PartialMatch("a\\b", broken));
  }

  // An enormous regexp (nested {100} repetitions): same expectations.
  {
    RE2 huge("(((.{100}){100}){100}){100}", RE2::Quiet);
    CHECK(!huge.ok());
    CHECK(!RE2::PartialMatch("aaa", huge));
  }

  // A crazy regexp that should still compile and run.
  {
    RE2 repeated(".{512}x", RE2::Quiet);
    CHECK(repeated.ok());
    // 515 'c' characters followed by a single 'x'.
    string haystack(515, 'c');
    haystack += "x";
    CHECK(RE2::PartialMatch(haystack, repeated));
  }
}
1106
// Test that recursion is stopped.
// This test is PCRE-legacy -- there's no recursion in RE2.
TEST(RE2, Recursion) {
  const int bytes = 15 * 1024;  // enough to crash PCRE
  static const char* const kPatterns[] = { ".", "a", "a.", "ab.", "abc." };
  const size_t num_patterns = sizeof(kPatterns) / sizeof(kPatterns[0]);

  // Run the helper once per pattern, in the listed order.
  for (size_t idx = 0; idx < num_patterns; ++idx)
    TestRecursion(bytes, kPatterns[idx]);
}
1117
// Test that counted repetition works, given tons of memory.
TEST(RE2, BigCountedRepetition) {
  // Raise the max_mem option to 256<<20 before compiling.
  RE2::Options roomy;
  roomy.set_max_mem(256<<20);

  RE2 repeated(".{512}x", roomy);
  CHECK(repeated.ok());

  // 515 'c' characters followed by a single 'x'.
  string haystack(515, 'c');
  haystack += "x";
  CHECK(RE2::PartialMatch(haystack, repeated));
}
1130
// Test for deep stack recursion.  This would fail with a segmentation
// violation due to stack overflow before pcre was patched.
// Again, a PCRE legacy test.  RE2 doesn't recurse.
TEST(RE2, DeepRecursion) {
  // Subject text: "x*", then 131072 'a' characters, then "*x".
  string subject = "x*";
  subject.append(131072, 'a');
  subject.append("*x");

  RE2 comment_re("((?:\\s|xx.*\n|x[*](?:\n|.)*?[*]x)*)");
  CHECK(RE2::FullMatch(subject, comment_re));
}
1143
1144 // Suggested by Josh Hyman. Failed when SearchOnePass was
1145 // not implementing case-folding.
TEST(CaseInsensitive,MatchAndConsume)1146 TEST(CaseInsensitive, MatchAndConsume) {
1147 string result;
1148 string text = "A fish named *Wanda*";
1149 StringPiece sp(text);
1150
1151 EXPECT_TRUE(RE2::PartialMatch(sp, "(?i)([wand]{5})", &result));
1152 EXPECT_TRUE(RE2::FindAndConsume(&sp, "(?i)([wand]{5})", &result));
1153 }
1154
1155 // RE2 should permit implicit conversions from string, StringPiece, const char*,
1156 // and C string literals.
TEST(RE2,ImplicitConversions)1157 TEST(RE2, ImplicitConversions) {
1158 string re_string(".");
1159 StringPiece re_stringpiece(".");
1160 const char* re_cstring = ".";
1161 EXPECT_TRUE(RE2::PartialMatch("e", re_string));
1162 EXPECT_TRUE(RE2::PartialMatch("e", re_stringpiece));
1163 EXPECT_TRUE(RE2::PartialMatch("e", re_cstring));
1164 EXPECT_TRUE(RE2::PartialMatch("e", "."));
1165 }
1166
1167 // Bugs introduced by 8622304
TEST(RE2,CL8622304)1168 TEST(RE2, CL8622304) {
1169 // reported by ingow
1170 string dir;
1171 EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])")); // ok
1172 EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])", &dir)); // fails
1173
1174 // reported by jacobsa
1175 string key, val;
1176 EXPECT_TRUE(RE2::PartialMatch("bar:1,0x2F,030,4,5;baz:true;fooby:false,true",
1177 "(\\w+)(?::((?:[^;\\\\]|\\\\.)*))?;?",
1178 &key,
1179 &val));
1180 EXPECT_EQ(key, "bar");
1181 EXPECT_EQ(val, "1,0x2F,030,4,5");
1182 }
1183
1184
// Check that RE2 returns correct regexp pieces on error.
// In particular, make sure it returns whole runes
// and that it always reports invalid UTF-8.
// Also check that Perl error flag piece is big enough.
struct ErrorTest {
  const char *regexp;  // pattern expected to fail compilation
  const char *error;   // expected value of error_arg() for that pattern
};

static ErrorTest error_tests[] = {
  // Valid UTF-8 input: the offending piece is reported.
  { "ab\\αcd", "\\α" },
  { "ef\\x☺01", "\\x☺0" },
  { "gh\\x1☺01", "\\x1☺" },
  { "ij\\x1", "\\x1" },
  { "kl\\x", "\\x" },
  { "uv\\x{0000☺}", "\\x{0000☺" },
  { "wx\\p{ABC", "\\p{ABC" },
  { "yz(?smiUX:abc)", "(?smiUX" },  // used to return (?s but the error is X
  { "aa(?sm☺i", "(?sm☺" },
  { "bb[abc", "[abc" },

  // Invalid UTF-8 input (byte \377): no argument string is returned.
  { "mn\\x1\377", "" },
  { "op\377qr", "" },
  { "st\\x{00000\377", "" },
  { "zz\\p{\377}", "" },
  { "zz\\x{00\377}", "" },
  { "zz(?P<name\377>abc)", "" },
};
// Compiles every pattern in error_tests with RE2::Quiet and checks that
// compilation fails and that error_arg() equals the expected piece.
// The full error() text is attached to the failure message for diagnosis.
TEST(RE2, ErrorArgs) {
  // Unsigned counter: an int here would be compared against the element
  // count from arraysize(), mixing signed and unsigned operands.
  // NOTE(review): arraysize() is defined outside this chunk -- presumably a
  // sizeof-based (size_t) count; confirm.
  for (size_t i = 0; i < arraysize(error_tests); i++) {
    const ErrorTest& t = error_tests[i];
    RE2 re(t.regexp, RE2::Quiet);
    EXPECT_FALSE(re.ok());
    EXPECT_EQ(re.error_arg(), t.error) << re.error();
  }
}
1218
1219 // Check that "never match \n" mode never matches \n.
1220 static struct NeverTest {
1221 const char* regexp;
1222 const char* text;
1223 const char* match;
1224 } never_tests[] = {
1225 { "(.*)", "abc\ndef\nghi\n", "abc" },
1226 { "(?s)(abc.*def)", "abc\ndef\n", NULL },
1227 { "(abc(.|\n)*def)", "abc\ndef\n", NULL },
1228 { "(abc[^x]*def)", "abc\ndef\n", NULL },
1229 { "(abc[^x]*def)", "abczzzdef\ndef\n", "abczzzdef" },
1230 };
// Runs every never_tests entry with the never_nl option enabled.  Entries
// whose expected match is NULL must not match at all; the others must match
// and capture exactly the expected text.
TEST(RE2, NeverNewline) {
  RE2::Options opt;
  opt.set_never_nl(true);
  // Unsigned counter: an int here would be compared against the element
  // count from arraysize(), mixing signed and unsigned operands.
  // NOTE(review): arraysize() is defined outside this chunk -- presumably a
  // sizeof-based (size_t) count; confirm.
  for (size_t i = 0; i < arraysize(never_tests); i++) {
    const NeverTest& t = never_tests[i];
    RE2 re(t.regexp, opt);
    // PartialMatch is invoked through the class instead of through the `re`
    // instance; the instance is still what is passed as the pattern.
    if (t.match == NULL) {
      EXPECT_FALSE(RE2::PartialMatch(t.text, re));
    } else {
      StringPiece m;
      EXPECT_TRUE(RE2::PartialMatch(t.text, re, &m));
      EXPECT_EQ(m, t.match);
    }
  }
}
1246
1247 // Check that there are no capturing groups in "never capture" mode.
TEST(RE2,NeverCapture)1248 TEST(RE2, NeverCapture) {
1249 RE2::Options opt;
1250 opt.set_never_capture(true);
1251 RE2 re("(r)(e)", opt);
1252 EXPECT_EQ(0, re.NumberOfCapturingGroups());
1253 }
1254
1255 // Bitstate bug was looking at submatch[0] even if nsubmatch == 0.
1256 // Triggered by a failed DFA search falling back to Bitstate when
1257 // using Match with a NULL submatch set. Bitstate tried to read
1258 // the submatch[0] entry even if nsubmatch was 0.
TEST(RE2,BitstateCaptureBug)1259 TEST(RE2, BitstateCaptureBug) {
1260 RE2::Options opt;
1261 opt.set_max_mem(20000);
1262 RE2 re("(_________$)", opt);
1263 StringPiece s = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x";
1264 EXPECT_FALSE(re.Match(s, 0, s.size(), RE2::UNANCHORED, NULL, 0));
1265 }
1266
1267 // C++ version of bug 609710.
TEST(RE2,UnicodeClasses)1268 TEST(RE2, UnicodeClasses) {
1269 const string str = "ABCDEFGHI譚永鋒";
1270 string a, b, c;
1271
1272 EXPECT_TRUE(RE2::FullMatch("A", "\\p{L}"));
1273 EXPECT_TRUE(RE2::FullMatch("A", "\\p{Lu}"));
1274 EXPECT_FALSE(RE2::FullMatch("A", "\\p{Ll}"));
1275 EXPECT_FALSE(RE2::FullMatch("A", "\\P{L}"));
1276 EXPECT_FALSE(RE2::FullMatch("A", "\\P{Lu}"));
1277 EXPECT_TRUE(RE2::FullMatch("A", "\\P{Ll}"));
1278
1279 EXPECT_TRUE(RE2::FullMatch("譚", "\\p{L}"));
1280 EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Lu}"));
1281 EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Ll}"));
1282 EXPECT_FALSE(RE2::FullMatch("譚", "\\P{L}"));
1283 EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Lu}"));
1284 EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Ll}"));
1285
1286 EXPECT_TRUE(RE2::FullMatch("永", "\\p{L}"));
1287 EXPECT_FALSE(RE2::FullMatch("永", "\\p{Lu}"));
1288 EXPECT_FALSE(RE2::FullMatch("永", "\\p{Ll}"));
1289 EXPECT_FALSE(RE2::FullMatch("永", "\\P{L}"));
1290 EXPECT_TRUE(RE2::FullMatch("永", "\\P{Lu}"));
1291 EXPECT_TRUE(RE2::FullMatch("永", "\\P{Ll}"));
1292
1293 EXPECT_TRUE(RE2::FullMatch("鋒", "\\p{L}"));
1294 EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Lu}"));
1295 EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Ll}"));
1296 EXPECT_FALSE(RE2::FullMatch("鋒", "\\P{L}"));
1297 EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Lu}"));
1298 EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Ll}"));
1299
1300 EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?(.).*?(.)", &a, &b, &c));
1301 EXPECT_EQ("A", a);
1302 EXPECT_EQ("B", b);
1303 EXPECT_EQ("C", c);
1304
1305 EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{L}]).*?(.)", &a, &b, &c));
1306 EXPECT_EQ("A", a);
1307 EXPECT_EQ("B", b);
1308 EXPECT_EQ("C", c);
1309
1310 EXPECT_FALSE(RE2::PartialMatch(str, "\\P{L}"));
1311
1312 EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{Lu}]).*?(.)", &a, &b, &c));
1313 EXPECT_EQ("A", a);
1314 EXPECT_EQ("B", b);
1315 EXPECT_EQ("C", c);
1316
1317 EXPECT_FALSE(RE2::PartialMatch(str, "[^\\p{Lu}\\p{Lo}]"));
1318
1319 EXPECT_TRUE(RE2::PartialMatch(str, ".*(.).*?([\\p{Lu}\\p{Lo}]).*?(.)", &a, &b, &c));
1320 EXPECT_EQ("譚", a);
1321 EXPECT_EQ("永", b);
1322 EXPECT_EQ("鋒", c);
1323 }
1324
1325 // Bug reported by saito. 2009/02/17
TEST(RE2,NullVsEmptyString)1326 TEST(RE2, NullVsEmptyString) {
1327 RE2 re2(".*");
1328 StringPiece v1("");
1329 EXPECT_TRUE(RE2::FullMatch(v1, re2));
1330
1331 StringPiece v2;
1332 EXPECT_TRUE(RE2::FullMatch(v2, re2));
1333 }
1334
1335 // Issue 1816809
TEST(RE2,Bug1816809)1336 TEST(RE2, Bug1816809) {
1337 RE2 re("(((((llx((-3)|(4)))(;(llx((-3)|(4))))*))))");
1338 StringPiece piece("llx-3;llx4");
1339 string x;
1340 EXPECT_TRUE(RE2::Consume(&piece, re, &x));
1341 }
1342
1343 // Issue 3061120
TEST(RE2,Bug3061120)1344 TEST(RE2, Bug3061120) {
1345 RE2 re("(?i)\\W");
1346 EXPECT_FALSE(RE2::PartialMatch("x", re)); // always worked
1347 EXPECT_FALSE(RE2::PartialMatch("k", re)); // broke because of kelvin
1348 EXPECT_FALSE(RE2::PartialMatch("s", re)); // broke because of latin long s
1349 }
1350
// CapturingGroupNames() must map each named group's index to its name and
// contain no entry for unnamed groups.
TEST(RE2, CapturingGroupNames) {
  // Opening parentheses annotated with group IDs:
  //      12    3        45   6         7
  RE2 re("((abc)(?P<G2>)|((e+)(?P<G2>.*)(?P<G1>u+)))");
  EXPECT_TRUE(re.ok());

  // Only groups 3, 6 and 7 carry a (?P<name>...) label.
  map<int, string> expected;
  expected[3] = "G2";
  expected[6] = "G2";
  expected[7] = "G1";

  const map<int, string>& actual = re.CapturingGroupNames();
  EXPECT_EQ(expected, actual);
}
1363
// Converting a parsed pattern back to text must keep its ^ / $ anchor:
// unchanged with RE2::POSIX, rendered as (?-m:^) / (?-m:$) by default.
TEST(RE2, RegexpToStringLossOfAnchor) {
  // Leading anchor.
  RE2 posix_head("^[a-c]at", RE2::POSIX);
  EXPECT_EQ(posix_head.Regexp()->ToString(), "^[a-c]at");
  RE2 default_head("^[a-c]at");
  EXPECT_EQ(default_head.Regexp()->ToString(), "(?-m:^)[a-c]at");

  // Trailing anchor.
  RE2 posix_tail("ca[t-z]$", RE2::POSIX);
  EXPECT_EQ(posix_tail.Regexp()->ToString(), "ca[t-z]$");
  RE2 default_tail("ca[t-z]$");
  EXPECT_EQ(default_tail.Regexp()->ToString(), "ca[t-z](?-m:$)");
}
1370
1371 } // namespace re2
1372