1 // -*- coding: utf-8 -*-
2 // Copyright 2002-2009 The RE2 Authors.  All Rights Reserved.
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
5 
6 // TODO: Test extractions for PartialMatch/Consume
7 
8 #include <sys/types.h>
9 #include <sys/mman.h>
10 #include <sys/stat.h>
11 #include <errno.h>
12 #include <vector>
13 #include "util/test.h"
14 #include "re2/re2.h"
15 #include "re2/regexp.h"
16 
17 DECLARE_bool(logtostderr);
18 
19 namespace re2 {
20 
// Parses hexadecimal text into each integer type exercised below, first as
// bare digits through RE2::Hex and then with a "0x" prefix through
// RE2::CRadix.
TEST(RE2, HexTests) {
  VLOG(1) << "hex tests";

// `digits` is used two ways: stringified it is the text handed to the
// matcher, and pasted after 0x it is the literal the parsed value must
// equal.  Any integer suffix (U, L, ...) therefore appears in both, which is
// why the patterns tolerate a trailing [uUlL]*.
#define CHECK_HEX(T, digits)                                        \
  do {                                                              \
    T parsed;                                                       \
    CHECK(RE2::FullMatch(#digits, "([0-9a-fA-F]+)[uUlL]*",          \
                         RE2::Hex(&parsed)));                       \
    CHECK_EQ(parsed, 0x##digits);                                   \
    CHECK(RE2::FullMatch("0x" #digits, "([0-9a-fA-FxX]+)[uUlL]*",   \
                         RE2::CRadix(&parsed)));                    \
    CHECK_EQ(parsed, 0x##digits);                                   \
  } while (0)

  CHECK_HEX(short, 2bad);
  CHECK_HEX(unsigned short, 2badU);
  CHECK_HEX(int, dead);
  CHECK_HEX(unsigned int, deadU);
  CHECK_HEX(long, 7eadbeefL);
  CHECK_HEX(unsigned long, deadbeefUL);
  CHECK_HEX(long long, 12345678deadbeefLL);
  CHECK_HEX(unsigned long long, cafebabedeadbeefULL);

#undef CHECK_HEX
}
45 
// Parses octal text into each integer type exercised below, first as bare
// digits through RE2::Octal and then with a leading "0" through RE2::CRadix.
TEST(RE2, OctalTests) {
  VLOG(1) << "octal tests";

// `digits` is stringified to build the input text and pasted after a leading
// 0 to build the octal literal the parsed value is compared against.
#define CHECK_OCTAL(T, digits)                                     \
  do {                                                             \
    T parsed;                                                      \
    CHECK(RE2::FullMatch(#digits, "([0-7]+)[uUlL]*",               \
                         RE2::Octal(&parsed)));                    \
    CHECK_EQ(parsed, 0##digits);                                   \
    CHECK(RE2::FullMatch("0" #digits, "([0-9a-fA-FxX]+)[uUlL]*",   \
                         RE2::CRadix(&parsed)));                   \
    CHECK_EQ(parsed, 0##digits);                                   \
  } while (0)

  CHECK_OCTAL(short, 77777);
  CHECK_OCTAL(unsigned short, 177777U);
  CHECK_OCTAL(int, 17777777777);
  CHECK_OCTAL(unsigned int, 37777777777U);
  CHECK_OCTAL(long, 17777777777L);
  CHECK_OCTAL(unsigned long, 37777777777UL);
  CHECK_OCTAL(long long, 777777777777777777777LL);
  CHECK_OCTAL(unsigned long long, 1777777777777777777777ULL);

#undef CHECK_OCTAL
}
69 
// Parses decimal text into each integer type exercised below, once through
// the plain pointer argument and once through RE2::CRadix.
TEST(RE2, DecimalTests) {
  VLOG(1) << "decimal tests";

// `literal` is stringified to build the input text and used unchanged as the
// value the parsed result is compared against.
#define CHECK_DECIMAL(T, literal)                                    \
  do {                                                               \
    T parsed;                                                        \
    CHECK(RE2::FullMatch(#literal, "(-?[0-9]+)[uUlL]*", &parsed));   \
    CHECK_EQ(parsed, literal);                                       \
    CHECK(RE2::FullMatch(#literal, "(-?[0-9a-fA-FxX]+)[uUlL]*",      \
                         RE2::CRadix(&parsed)));                     \
    CHECK_EQ(parsed, literal);                                       \
  } while (0)

  CHECK_DECIMAL(short, -1);
  CHECK_DECIMAL(unsigned short, 9999);
  CHECK_DECIMAL(int, -1000);
  CHECK_DECIMAL(unsigned int, 12345U);
  CHECK_DECIMAL(long, -10000000L);
  CHECK_DECIMAL(unsigned long, 3083324652U);
  CHECK_DECIMAL(long long, -100000000000000LL);
  CHECK_DECIMAL(unsigned long long, 1234567890987654321ULL);

#undef CHECK_DECIMAL
}
93 
TEST(RE2,Replace)94 TEST(RE2, Replace) {
95   VLOG(1) << "TestReplace";
96 
97   struct ReplaceTest {
98     const char *regexp;
99     const char *rewrite;
100     const char *original;
101     const char *single;
102     const char *global;
103     int        greplace_count;
104   };
105   static const ReplaceTest tests[] = {
106     { "(qu|[b-df-hj-np-tv-z]*)([a-z]+)",
107       "\\2\\1ay",
108       "the quick brown fox jumps over the lazy dogs.",
109       "ethay quick brown fox jumps over the lazy dogs.",
110       "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.",
111       9 },
112     { "\\w+",
113       "\\0-NOSPAM",
114       "abcd.efghi@google.com",
115       "abcd-NOSPAM.efghi@google.com",
116       "abcd-NOSPAM.efghi-NOSPAM@google-NOSPAM.com-NOSPAM",
117       4 },
118     { "^",
119       "(START)",
120       "foo",
121       "(START)foo",
122       "(START)foo",
123       1 },
124     { "^",
125       "(START)",
126       "",
127       "(START)",
128       "(START)",
129       1 },
130     { "$",
131       "(END)",
132       "",
133       "(END)",
134       "(END)",
135       1 },
136     { "b",
137       "bb",
138       "ababababab",
139       "abbabababab",
140       "abbabbabbabbabb",
141       5 },
142     { "b",
143       "bb",
144       "bbbbbb",
145       "bbbbbbb",
146       "bbbbbbbbbbbb",
147       6 },
148     { "b+",
149       "bb",
150       "bbbbbb",
151       "bb",
152       "bb",
153       1 },
154     { "b*",
155       "bb",
156       "bbbbbb",
157       "bb",
158       "bb",
159       1 },
160     { "b*",
161       "bb",
162       "aaaaa",
163       "bbaaaaa",
164       "bbabbabbabbabbabb",
165       6 },
166     // Check newline handling
167     { "a.*a",
168       "(\\0)",
169       "aba\naba",
170       "(aba)\naba",
171       "(aba)\n(aba)",
172       2 },
173     { "", NULL, NULL, NULL, NULL, 0 }
174   };
175 
176   for (const ReplaceTest *t = tests; t->original != NULL; ++t) {
177     VLOG(1) << StringPrintf("\"%s\" =~ s/%s/%s/g", t->original, t->regexp, t->rewrite);
178     string one(t->original);
179     CHECK(RE2::Replace(&one, t->regexp, t->rewrite));
180     CHECK_EQ(one, t->single);
181     string all(t->original);
182     CHECK_EQ(RE2::GlobalReplace(&all, t->regexp, t->rewrite), t->greplace_count)
183       << "Got: " << all;
184     CHECK_EQ(all, t->global);
185   }
186 }
187 
TestCheckRewriteString(const char * regexp,const char * rewrite,bool expect_ok)188 static void TestCheckRewriteString(const char* regexp, const char* rewrite,
189                               bool expect_ok) {
190   string error;
191   RE2 exp(regexp);
192   bool actual_ok = exp.CheckRewriteString(rewrite, &error);
193   EXPECT_EQ(expect_ok, actual_ok) << " for " << rewrite << " error: " << error;
194 }
195 
// Runs TestCheckRewriteString over a table of (regexp, rewrite, expected
// verdict) rows, in the order listed.
TEST(CheckRewriteString, all) {
  struct RewriteCase {
    const char* regexp;
    const char* rewrite;
    bool expect_ok;
  };
  static const RewriteCase kCases[] = {
    // Pattern without capture groups.
    { "abc", "foo", true },
    { "abc", "foo\\", false },
    { "abc", "foo\\0bar", true },

    // Pattern with one capture group.
    { "a(b)c", "foo", true },
    { "a(b)c", "foo\\0bar", true },
    { "a(b)c", "foo\\1bar", true },
    { "a(b)c", "foo\\2bar", false },
    { "a(b)c", "f\\\\2o\\1o", true },

    // Pattern with two capture groups.
    { "a(b)(c)", "foo\\12", true },
    { "a(b)(c)", "f\\2o\\1o", true },
    { "a(b)(c)", "f\\oo\\1", false }
  };

  const size_t count = sizeof(kCases) / sizeof(kCases[0]);
  for (size_t i = 0; i < count; ++i) {
    const RewriteCase& c = kCases[i];
    TestCheckRewriteString(c.regexp, c.rewrite, c.expect_ok);
  }
}
211 
// Exercises RE2::Extract: successful extraction with group references, and
// that a failed match leaves the output string as it was.
TEST(RE2, Extract) {
  VLOG(1) << "TestExtract";

  string extracted;

  // Swap the two captured groups around a '!'.
  CHECK(RE2::Extract("boris@kremvax.ru", "(.*)@([^.]*)", "\\2!\\1",
                     &extracted));
  CHECK_EQ(extracted, "kremvax!boris");

  // \0 refers to the whole match.
  CHECK(RE2::Extract("foo", ".*", "'\\0'", &extracted));
  CHECK_EQ(extracted, "'foo'");

  // A non-matching pattern must not overwrite the previous result.
  CHECK(!RE2::Extract("baz", "bar", "'\\0'", &extracted));
  CHECK_EQ(extracted, "'foo'");
}
226 
// Exercises RE2::Consume: two successive words are consumed from the front
// of the input, after which the pattern no longer matches at the front.
TEST(RE2, Consume) {
  VLOG(1) << "TestConsume";

  // Matches a word, possibly preceded by whitespace.
  RE2 word_re("\\s*(\\w+)");

  string text("   aaa b!@#$@#$cccc");
  StringPiece remaining(text);
  string captured;

  CHECK(RE2::Consume(&remaining, word_re, &captured));
  CHECK_EQ(captured, "aaa") << " input: " << remaining;

  CHECK(RE2::Consume(&remaining, word_re, &captured));
  CHECK_EQ(captured, "b") << " input: " << remaining;

  CHECK(! RE2::Consume(&remaining, word_re, &captured))
      << " input: " << remaining;
}
242 
// Exercises RE2::ConsumeN with zero, one and two output arguments passed as
// an array of RE2::Arg pointers.
TEST(RE2, ConsumeN) {
  const string text(" one two three 4");
  StringPiece remaining(text);

  RE2::Arg slots[2];
  const RE2::Arg* const slot_ptrs[2] = { &slots[0], &slots[1] };

  // No outputs: the match is consumed but nothing is stored.
  EXPECT_TRUE(RE2::ConsumeN(&remaining, "\\s*(\\w+)", slot_ptrs, 0));  // Skips "one".

  // One output.
  string captured;
  slots[0] = &captured;
  EXPECT_TRUE(RE2::ConsumeN(&remaining, "\\s*(\\w+)", slot_ptrs, 1));
  EXPECT_EQ("two", captured);

  // Two outputs of different types.
  int number;
  slots[1] = &number;
  EXPECT_TRUE(RE2::ConsumeN(&remaining, "\\s*(\\w+)\\s*(\\d+)", slot_ptrs, 2));
  EXPECT_EQ("three", captured);
  EXPECT_EQ(4, number);
}
266 
// Exercises RE2::FindAndConsume: each call finds the next word anywhere in
// the remaining input, until no word is left.
TEST(RE2, FindAndConsume) {
  VLOG(1) << "TestFindAndConsume";

  // Matches a word.
  RE2 word_re("(\\w+)");

  string text("   aaa b!@#$@#$cccc");
  StringPiece remaining(text);
  string captured;

  CHECK(RE2::FindAndConsume(&remaining, word_re, &captured));
  CHECK_EQ(captured, "aaa");

  CHECK(RE2::FindAndConsume(&remaining, word_re, &captured));
  CHECK_EQ(captured, "b");

  CHECK(RE2::FindAndConsume(&remaining, word_re, &captured));
  CHECK_EQ(captured, "cccc");

  CHECK(! RE2::FindAndConsume(&remaining, word_re, &captured));

  // FindAndConsume must also work with no submatches at all.  An earlier
  // version used uninitialized data for the length to consume.
  remaining = "aaa";
  CHECK(RE2::FindAndConsume(&remaining, "aaa"));
  CHECK_EQ(remaining, "");
}
291 
// Exercises RE2::FindAndConsumeN with zero, one and two output arguments
// passed as an array of RE2::Arg pointers.
TEST(RE2, FindAndConsumeN) {
  const string text(" one two three 4");
  StringPiece remaining(text);

  RE2::Arg slots[2];
  const RE2::Arg* const slot_ptrs[2] = { &slots[0], &slots[1] };

  // No outputs: the match is consumed but nothing is stored.
  EXPECT_TRUE(RE2::FindAndConsumeN(&remaining, "(\\w+)", slot_ptrs, 0));  // Skips "one".

  // One output.
  string captured;
  slots[0] = &captured;
  EXPECT_TRUE(RE2::FindAndConsumeN(&remaining, "(\\w+)", slot_ptrs, 1));
  EXPECT_EQ("two", captured);

  // Two outputs of different types.
  int number;
  slots[1] = &number;
  EXPECT_TRUE(RE2::FindAndConsumeN(&remaining, "(\\w+)\\s*(\\d+)", slot_ptrs, 2));
  EXPECT_EQ("three", captured);
  EXPECT_EQ(4, number);
}
315 
// With an alternation of capture groups, only the group on the branch that
// matched is filled in; the outputs for the other groups come back empty.
TEST(RE2, MatchNumberPeculiarity) {
  VLOG(1) << "TestMatchNumberPeculiarity";

  RE2 alternatives("(foo)|(bar)|(baz)");
  string first;
  string second;
  string third;

  // First branch.
  CHECK(RE2::PartialMatch("foo", alternatives, &first, &second, &third));
  CHECK_EQ(first, "foo");
  CHECK_EQ(second, "");
  CHECK_EQ(third, "");

  // Second branch.
  CHECK(RE2::PartialMatch("bar", alternatives, &first, &second, &third));
  CHECK_EQ(first, "");
  CHECK_EQ(second, "bar");
  CHECK_EQ(third, "");

  // Third branch.
  CHECK(RE2::PartialMatch("baz", alternatives, &first, &second, &third));
  CHECK_EQ(first, "");
  CHECK_EQ(second, "");
  CHECK_EQ(third, "baz");

  // No branch.
  CHECK(!RE2::PartialMatch("f", alternatives, &first, &second, &third));

  // The group sits on the branch that did not match, so it stays empty.
  string unmatched_group;
  CHECK(RE2::FullMatch("hello", "(foo)|hello", &unmatched_group));
  CHECK_EQ(unmatched_group, "");
}
342 
// Exercises the low-level RE2::Match entry point with an explicit submatch
// array, then repeats the extraction through RE2::PartialMatch.
TEST(RE2, Match) {
  // Extracts host and port; group 1 wraps the whole "host:port" pair.
  RE2 host_port("((\\w+):([0-9]+))");
  StringPiece group[4];

  // No match.
  StringPiece text("zyzzyva");
  CHECK(!host_port.Match(text, 0, text.size(), RE2::UNANCHORED,
                         group, arraysize(group)));

  // Matches and extracts.
  text = "a chrisr:9000 here";
  CHECK(host_port.Match(text, 0, text.size(), RE2::UNANCHORED,
                        group, arraysize(group)));
  CHECK_EQ(group[0], "chrisr:9000");
  CHECK_EQ(group[1], "chrisr:9000");
  CHECK_EQ(group[2], "chrisr");
  CHECK_EQ(group[3], "9000");

  // Same extraction through the typed-argument interface.
  string whole;
  string host;
  int port;
  CHECK(RE2::PartialMatch("a chrisr:9000 here", host_port,
                          &whole, &host, &port));
  CHECK_EQ(whole, "chrisr:9000");
  CHECK_EQ(host, "chrisr");
  CHECK_EQ(port, 9000);
}
368 
TestRecursion(int size,const char * pattern)369 static void TestRecursion(int size, const char *pattern) {
370   // Fill up a string repeating the pattern given
371   string domain;
372   domain.resize(size);
373   int patlen = strlen(pattern);
374   for (int i = 0; i < size; ++i) {
375     domain[i] = pattern[i % patlen];
376   }
377   // Just make sure it doesn't crash due to too much recursion.
378   RE2 re("([a-zA-Z0-9]|-)+(\\.([a-zA-Z0-9]|-)+)*(\\.)?", RE2::Quiet);
379   RE2::FullMatch(domain, re);
380 }
381 
382 // A meta-quoted string, interpreted as a pattern, should always match
383 // the original unquoted string.
TestQuoteMeta(string unquoted,const RE2::Options & options=RE2::DefaultOptions)384 static void TestQuoteMeta(string unquoted,
385                           const RE2::Options& options = RE2::DefaultOptions) {
386   string quoted = RE2::QuoteMeta(unquoted);
387   RE2 re(quoted, options);
388   EXPECT_TRUE_M(RE2::FullMatch(unquoted, re),
389                 "Unquoted='" + unquoted + "', quoted='" + quoted + "'.");
390 }
391 
392 // A meta-quoted string, interpreted as a pattern, should always match
393 // the original unquoted string.
NegativeTestQuoteMeta(string unquoted,string should_not_match,const RE2::Options & options=RE2::DefaultOptions)394 static void NegativeTestQuoteMeta(string unquoted, string should_not_match,
395                                   const RE2::Options& options = RE2::DefaultOptions) {
396   string quoted = RE2::QuoteMeta(unquoted);
397   RE2 re(quoted, options);
398   EXPECT_FALSE_M(RE2::FullMatch(should_not_match, re),
399                  "Unquoted='" + unquoted + "', quoted='" + quoted + "'.");
400 }
401 
402 // Tests that quoted meta characters match their original strings,
403 // and that a few things that shouldn't match indeed do not.
TEST(QuoteMeta,Simple)404 TEST(QuoteMeta, Simple) {
405   TestQuoteMeta("foo");
406   TestQuoteMeta("foo.bar");
407   TestQuoteMeta("foo\\.bar");
408   TestQuoteMeta("[1-9]");
409   TestQuoteMeta("1.5-2.0?");
410   TestQuoteMeta("\\d");
411   TestQuoteMeta("Who doesn't like ice cream?");
412   TestQuoteMeta("((a|b)c?d*e+[f-h]i)");
413   TestQuoteMeta("((?!)xxx).*yyy");
414   TestQuoteMeta("([");
415 }
// Once meta-quoted, a string must no longer match the things its
// metacharacters would have matched had they been left unquoted.
TEST(QuoteMeta, SimpleNegative) {
  struct NegativeCase {
    const char* unquoted;
    const char* should_not_match;
  };
  static const NegativeCase kCases[] = {
    { "foo", "bar" },
    { "...", "bar" },
    { "\\.", "." },
    { "\\.", ".." },
    { "(a)", "a" },
    { "(a|b)", "a" },
    { "(a|b)", "(a)" },
    { "(a|b)", "a|b" },
    { "[0-9]", "0" },
    { "[0-9]", "0-9" },
    { "[0-9]", "[9]" },
    { "((?!)xxx)", "xxx" }
  };
  const size_t count = sizeof(kCases) / sizeof(kCases[0]);
  for (size_t i = 0; i < count; ++i) {
    NegativeTestQuoteMeta(kCases[i].unquoted, kCases[i].should_not_match);
  }
}
430 
// A string containing a single high-bit byte (0xb2) must survive quoting
// when the pattern is compiled with the Latin-1 options.
TEST(QuoteMeta, Latin1) {
  const string high_bit_text("3\xb2 = 9");
  TestQuoteMeta(high_bit_text, RE2::Latin1);
}
434 
// Multi-byte UTF-8 sequences must pass through QuoteMeta unescaped so that
// the quoted pattern still matches the original text.
TEST(QuoteMeta, UTF8) {
  TestQuoteMeta("Plácido Domingo");
  TestQuoteMeta("xyz");  // No fancy utf8.
  TestQuoteMeta("\xc2\xb0");  // 2-byte utf8 -- a degree symbol.
  TestQuoteMeta("27\xc2\xb0 degrees");  // As a middle character.
  TestQuoteMeta("\xe2\x80\xb3");  // 3-byte utf8 -- a double prime.
  TestQuoteMeta("\xf0\x9d\x85\x9f");  // 4-byte utf8 -- a music note.
  TestQuoteMeta("27\xc2\xb0");  // As a trailing character, default options.

  // The same bytes interpreted as Latin-1 should still work.  The option
  // has to be passed explicitly: without it the helper compiles the pattern
  // with RE2::DefaultOptions and the Latin-1 case is never exercised.
  TestQuoteMeta("27\xc2\xb0", RE2::Latin1);

  // Backslash-escaping the individual bytes of the 2-byte degree symbol
  // must not match the quoted form.
  NegativeTestQuoteMeta("27\xc2\xb0",
                        "27\\\xc2\\\xb0");
}
447 
// Strings containing an embedded NUL byte must quote correctly.
TEST(QuoteMeta, HasNull) {
  // A string holding exactly one NUL character.
  const string nul_only(1, '\0');
  TestQuoteMeta(nul_only);
  NegativeTestQuoteMeta(nul_only, "");

  // Don't want null-followed-by-'1' to be interpreted as '\01'.
  const string nul_then_one = nul_only + '1';
  TestQuoteMeta(nul_then_one);
  NegativeTestQuoteMeta(nul_then_one, "\1");
}
461 
// ProgramSize() must be positive and must grow as the regexp gets more
// complex: literal < wildcard < bounded repetition.
TEST(ProgramSize, BigProgram) {
  RE2 literal_re("simple regexp");
  RE2 wildcard_re("medium.*regexp");
  RE2 repeat_re("hard.{1,128}regexp");

  CHECK_GT(literal_re.ProgramSize(), 0);
  CHECK_GT(wildcard_re.ProgramSize(), literal_re.ProgramSize());
  CHECK_GT(repeat_re.ProgramSize(), wildcard_re.ProgramSize());
}
471 
472 // Issue 956519: handling empty character sets was
473 // causing NULL dereference.  This tests a few empty character sets.
474 // (The way to get an empty character set is to negate a full one.)
TEST(EmptyCharset,Fuzz)475 TEST(EmptyCharset, Fuzz) {
476   static const char *empties[] = {
477     "[^\\S\\s]",
478     "[^\\S[:space:]]",
479     "[^\\D\\d]",
480     "[^\\D[:digit:]]"
481   };
482   for (int i = 0; i < arraysize(empties); i++)
483     CHECK(!RE2(empties[i]).Match("abc", 0, 3, RE2::UNANCHORED, NULL, 0));
484 }
485 
486 // Test that named groups work correctly.
TEST(Capture,NamedGroups)487 TEST(Capture, NamedGroups) {
488   {
489     RE2 re("(hello world)");
490     CHECK_EQ(re.NumberOfCapturingGroups(), 1);
491     const map<string, int>& m = re.NamedCapturingGroups();
492     CHECK_EQ(m.size(), 0);
493   }
494 
495   {
496     RE2 re("(?P<A>expr(?P<B>expr)(?P<C>expr))((expr)(?P<D>expr))");
497     CHECK_EQ(re.NumberOfCapturingGroups(), 6);
498     const map<string, int>& m = re.NamedCapturingGroups();
499     CHECK_EQ(m.size(), 4);
500     CHECK_EQ(m.find("A")->second, 1);
501     CHECK_EQ(m.find("B")->second, 2);
502     CHECK_EQ(m.find("C")->second, 3);
503     CHECK_EQ(m.find("D")->second, 6);  // $4 and $5 are anonymous
504   }
505 }
506 
// FullMatch with no output arguments: the pattern must cover the whole text.
TEST(RE2, FullMatchWithNoArgs) {
  const char* const kHToO = "h.*o";

  CHECK(RE2::FullMatch("h", "h"));
  CHECK(RE2::FullMatch("hello", "hello"));
  CHECK(RE2::FullMatch("hello", kHToO));

  // Must be anchored at front
  CHECK(!RE2::FullMatch("othello", kHToO));
  // Must be anchored at end
  CHECK(!RE2::FullMatch("hello!", kHToO));
}
514 
// PartialMatch with no output arguments: the pattern may match anywhere in
// the text, including when it is wrapped in many nested groups.
TEST(RE2, PartialMatch) {
  const char* const kHToO = "h.*o";

  CHECK(RE2::PartialMatch("x", "x"));
  CHECK(RE2::PartialMatch("hello", kHToO));
  CHECK(RE2::PartialMatch("othello", kHToO));
  CHECK(RE2::PartialMatch("hello!", kHToO));

  // Deeply nested capture groups around a single character.
  CHECK(RE2::PartialMatch("x", "((((((((((((((((((((x))))))))))))))))))))"));
}
522 
// Exercises RE2::PartialMatchN with zero, one and two output arguments
// passed as an array of RE2::Arg pointers, including non-matching inputs.
TEST(RE2, PartialMatchN) {
  RE2::Arg slots[2];
  const RE2::Arg* const slot_ptrs[2] = { &slots[0], &slots[1] };

  // No outputs.
  EXPECT_TRUE(RE2::PartialMatchN("hello", "e.*o", slot_ptrs, 0));
  EXPECT_FALSE(RE2::PartialMatchN("othello", "a.*o", slot_ptrs, 0));

  // One integer output.
  int number;
  slots[0] = &number;
  EXPECT_TRUE(RE2::PartialMatchN("1001 nights", "(\\d+)", slot_ptrs, 1));
  EXPECT_EQ(1001, number);
  EXPECT_FALSE(RE2::PartialMatchN("three", "(\\d+)", slot_ptrs, 1));

  // Integer plus string output.
  string word;
  slots[1] = &word;
  EXPECT_TRUE(RE2::PartialMatchN("answer: 42:life", "(\\d+):(\\w+)", slot_ptrs, 2));
  EXPECT_EQ(42, number);
  EXPECT_EQ("life", word);
  EXPECT_FALSE(RE2::PartialMatchN("hi1", "(\\w+)(1)", slot_ptrs, 2));
}
546 
// FullMatch called with a text and a pattern only (zero output arguments).
TEST(RE2, FullMatchZeroArg) {
  const char* const kDigits = "\\d+";
  CHECK(RE2::FullMatch("1001", kDigits));
}
551 
// FullMatch with a single int output: successful parses, plus two inputs
// for which the call must report failure.
TEST(RE2, FullMatchOneArg) {
  int number;

  CHECK(RE2::FullMatch("1001", "(\\d+)", &number));
  CHECK_EQ(number, 1001);

  CHECK(RE2::FullMatch("-123", "(-?\\d+)", &number));
  CHECK_EQ(number, -123);

  // The capture group is empty here.
  CHECK(!RE2::FullMatch("10", "()\\d+", &number));

  // A 40-digit number.
  CHECK(!RE2::FullMatch("1234567890123456789012345678901234567890",
                        "(\\d+)", &number));
}
564 
// An int output receives only the digits inside its capture group, even
// when that group is surrounded by other digits in the text.
TEST(RE2, FullMatchIntegerArg) {
  int number;

  // Group in the middle of the digits.
  CHECK(RE2::FullMatch("1234", "1(\\d*)4", &number));
  CHECK_EQ(number, 23);

  // Group holding just the leading digit.
  CHECK(RE2::FullMatch("1234", "(\\d)\\d+", &number));
  CHECK_EQ(number, 1);

  // Group holding the sign and the leading digit.
  CHECK(RE2::FullMatch("-1234", "(-\\d)\\d+", &number));
  CHECK_EQ(number, -1);

  // The same two groups through PartialMatch.
  CHECK(RE2::PartialMatch("1234", "(\\d)", &number));
  CHECK_EQ(number, 1);

  CHECK(RE2::PartialMatch("-1234", "(-\\d)", &number));
  CHECK_EQ(number, -1);
}
580 
// FullMatch with a string output receives the text of the capture group.
TEST(RE2, FullMatchStringArg) {
  string inner;
  CHECK(RE2::FullMatch("hello", "h(.*)o", &inner));
  CHECK_EQ(inner, string("ell"));
}
587 
// FullMatch with a StringPiece output followed by an int output.
TEST(RE2, FullMatchStringPieceArg) {
  StringPiece name;
  int number;

  CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &name, &number));

  // Compare by size and bytes rather than through a string conversion.
  CHECK_EQ(name.size(), 4);
  CHECK(memcmp(name.data(), "ruby", 4) == 0);
  CHECK_EQ(number, 1234);
}
597 
// FullMatch with a string output followed by an int output.
TEST(RE2, FullMatchMultiArg) {
  string name;
  int number;

  CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &name, &number));
  CHECK_EQ(name, string("ruby"));
  CHECK_EQ(number, 1234);
}
606 
// Exercises RE2::FullMatchN with zero, one and two output arguments passed
// as an array of RE2::Arg pointers, including non-matching inputs.
TEST(RE2, FullMatchN) {
  RE2::Arg slots[2];
  const RE2::Arg* const slot_ptrs[2] = { &slots[0], &slots[1] };

  // No outputs.
  EXPECT_TRUE(RE2::FullMatchN("hello", "h.*o", slot_ptrs, 0));
  EXPECT_FALSE(RE2::FullMatchN("othello", "h.*o", slot_ptrs, 0));

  // One integer output.
  int number;
  slots[0] = &number;
  EXPECT_TRUE(RE2::FullMatchN("1001", "(\\d+)", slot_ptrs, 1));
  EXPECT_EQ(1001, number);
  EXPECT_FALSE(RE2::FullMatchN("three", "(\\d+)", slot_ptrs, 1));

  // Integer plus string output.
  string word;
  slots[1] = &word;
  EXPECT_TRUE(RE2::FullMatchN("42:life", "(\\d+):(\\w+)", slot_ptrs, 2));
  EXPECT_EQ(42, number);
  EXPECT_EQ("life", word);
  EXPECT_FALSE(RE2::FullMatchN("hi1", "(\\w+)(1)", slot_ptrs, 2));
}
630 
// A NULL void* in an argument position skips that capture group while the
// groups on either side are still stored.
TEST(RE2, FullMatchIgnoredArg) {
  string name;
  int number;

  CHECK(RE2::FullMatch("ruby:1234", "(\\w+)(:)(\\d+)",
                       &name, static_cast<void*>(NULL), &number));
  CHECK_EQ(name, string("ruby"));
  CHECK_EQ(number, 1234);
}
639 
// A NULL pointer of a concrete type stores nothing, but the captured text
// must still be parseable as that type for the match to succeed.
TEST(RE2, FullMatchTypedNullArg) {
  string captured;

  // Ignore non-void* NULL arg
  CHECK(RE2::FullMatch("hello", "he(.*)lo", static_cast<char*>(NULL)));
  CHECK(RE2::FullMatch("hello", "h(.*)o", static_cast<string*>(NULL)));
  CHECK(RE2::FullMatch("hello", "h(.*)o", static_cast<StringPiece*>(NULL)));
  CHECK(RE2::FullMatch("1234", "(.*)", static_cast<int*>(NULL)));
  CHECK(RE2::FullMatch("1234567890123456", "(.*)",
                       static_cast<long long*>(NULL)));
  CHECK(RE2::FullMatch("123.4567890123456", "(.*)",
                       static_cast<double*>(NULL)));
  CHECK(RE2::FullMatch("123.4567890123456", "(.*)",
                       static_cast<float*>(NULL)));

  // Fail on non-void* NULL arg if the match doesn't parse for the given type.
  CHECK(!RE2::FullMatch("hello", "h(.*)lo", &captured,
                        static_cast<char*>(NULL)));
  CHECK(!RE2::FullMatch("hello", "(.*)", static_cast<int*>(NULL)));
  CHECK(!RE2::FullMatch("1234567890123456", "(.*)", static_cast<int*>(NULL)));
  CHECK(!RE2::FullMatch("hello", "(.*)", static_cast<double*>(NULL)));
  CHECK(!RE2::FullMatch("hello", "(.*)", static_cast<float*>(NULL)));
}
659 
660 // Check that numeric parsing code does not read past the end of
661 // the number being parsed.
TEST(RE2,NULTerminated)662 TEST(RE2, NULTerminated) {
663   char *v;
664   int x;
665   long pagesize = sysconf(_SC_PAGE_SIZE);
666 
667 #ifndef MAP_ANONYMOUS
668 #define MAP_ANONYMOUS MAP_ANON
669 #endif
670   v = static_cast<char*>(mmap(NULL, 2*pagesize, PROT_READ|PROT_WRITE,
671                               MAP_ANONYMOUS|MAP_PRIVATE, -1, 0));
672   CHECK(v != reinterpret_cast<char*>(-1));
673   LOG(INFO) << "Memory at " << (void*)v;
674   CHECK_EQ(munmap(v + pagesize, pagesize), 0) << " error " << errno;
675   v[pagesize - 1] = '1';
676 
677   x = 0;
678   CHECK(RE2::FullMatch(StringPiece(v + pagesize - 1, 1), "(.*)", &x));
679   CHECK_EQ(x, 1);
680 }
681 
TEST(RE2,FullMatchTypeTests)682 TEST(RE2, FullMatchTypeTests) {
683   // Type tests
684   string zeros(100, '0');
685   {
686     char c;
687     CHECK(RE2::FullMatch("Hello", "(H)ello", &c));
688     CHECK_EQ(c, 'H');
689   }
690   {
691     unsigned char c;
692     CHECK(RE2::FullMatch("Hello", "(H)ello", &c));
693     CHECK_EQ(c, static_cast<unsigned char>('H'));
694   }
695   {
696     int16 v;
697     CHECK(RE2::FullMatch("100",     "(-?\\d+)", &v));    CHECK_EQ(v, 100);
698     CHECK(RE2::FullMatch("-100",    "(-?\\d+)", &v));    CHECK_EQ(v, -100);
699     CHECK(RE2::FullMatch("32767",   "(-?\\d+)", &v));    CHECK_EQ(v, 32767);
700     CHECK(RE2::FullMatch("-32768",  "(-?\\d+)", &v));    CHECK_EQ(v, -32768);
701     CHECK(!RE2::FullMatch("-32769", "(-?\\d+)", &v));
702     CHECK(!RE2::FullMatch("32768",  "(-?\\d+)", &v));
703   }
704   {
705     uint16 v;
706     CHECK(RE2::FullMatch("100",     "(\\d+)", &v));    CHECK_EQ(v, 100);
707     CHECK(RE2::FullMatch("32767",   "(\\d+)", &v));    CHECK_EQ(v, 32767);
708     CHECK(RE2::FullMatch("65535",   "(\\d+)", &v));    CHECK_EQ(v, 65535);
709     CHECK(!RE2::FullMatch("65536",  "(\\d+)", &v));
710   }
711   {
712     int32 v;
713     static const int32 max = 0x7fffffff;
714     static const int32 min = -max - 1;
715     CHECK(RE2::FullMatch("100",          "(-?\\d+)", &v)); CHECK_EQ(v, 100);
716     CHECK(RE2::FullMatch("-100",         "(-?\\d+)", &v)); CHECK_EQ(v, -100);
717     CHECK(RE2::FullMatch("2147483647",   "(-?\\d+)", &v)); CHECK_EQ(v, max);
718     CHECK(RE2::FullMatch("-2147483648",  "(-?\\d+)", &v)); CHECK_EQ(v, min);
719     CHECK(!RE2::FullMatch("-2147483649", "(-?\\d+)", &v));
720     CHECK(!RE2::FullMatch("2147483648",  "(-?\\d+)", &v));
721 
722     CHECK(RE2::FullMatch(zeros + "2147483647", "(-?\\d+)", &v));
723     CHECK_EQ(v, max);
724     CHECK(RE2::FullMatch("-" + zeros + "2147483648", "(-?\\d+)", &v));
725     CHECK_EQ(v, min);
726 
727     CHECK(!RE2::FullMatch("-" + zeros + "2147483649", "(-?\\d+)", &v));
728     CHECK(RE2::FullMatch("0x7fffffff", "(.*)", RE2::CRadix(&v)));
729     CHECK_EQ(v, max);
730     CHECK(!RE2::FullMatch("000x7fffffff", "(.*)", RE2::CRadix(&v)));
731   }
732   {
733     uint32 v;
734     static const uint32 max = 0xfffffffful;
735     CHECK(RE2::FullMatch("100",         "(\\d+)", &v)); CHECK_EQ(v, 100);
736     CHECK(RE2::FullMatch("4294967295",  "(\\d+)", &v)); CHECK_EQ(v, max);
737     CHECK(!RE2::FullMatch("4294967296", "(\\d+)", &v));
738     CHECK(!RE2::FullMatch("-1",         "(\\d+)", &v));
739 
740     CHECK(RE2::FullMatch(zeros + "4294967295", "(\\d+)", &v)); CHECK_EQ(v, max);
741   }
742   {
743     int64 v;
744     static const int64 max = 0x7fffffffffffffffull;
745     static const int64 min = -max - 1;
746     char buf[32];
747 
748     CHECK(RE2::FullMatch("100",  "(-?\\d+)", &v)); CHECK_EQ(v, 100);
749     CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v)); CHECK_EQ(v, -100);
750 
751     snprintf(buf, sizeof(buf), "%lld", (long long int)max);
752     CHECK(RE2::FullMatch(buf,    "(-?\\d+)", &v)); CHECK_EQ(v, max);
753 
754     snprintf(buf, sizeof(buf), "%lld", (long long int)min);
755     CHECK(RE2::FullMatch(buf,    "(-?\\d+)", &v)); CHECK_EQ(v, min);
756 
757     snprintf(buf, sizeof(buf), "%lld", (long long int)max);
758     assert(buf[strlen(buf)-1] != '9');
759     buf[strlen(buf)-1]++;
760     CHECK(!RE2::FullMatch(buf,   "(-?\\d+)", &v));
761 
762     snprintf(buf, sizeof(buf), "%lld", (long long int)min);
763     assert(buf[strlen(buf)-1] != '9');
764     buf[strlen(buf)-1]++;
765     CHECK(!RE2::FullMatch(buf,   "(-?\\d+)", &v));
766   }
767   {
768     uint64 v;
769     int64 v2;
770     static const uint64 max = 0xffffffffffffffffull;
771     char buf[32];
772 
773     CHECK(RE2::FullMatch("100",  "(-?\\d+)", &v));  CHECK_EQ(v, 100);
774     CHECK(RE2::FullMatch("-100", "(-?\\d+)", &v2)); CHECK_EQ(v2, -100);
775 
776     snprintf(buf, sizeof(buf), "%llu", (long long unsigned)max);
777     CHECK(RE2::FullMatch(buf,    "(-?\\d+)", &v)); CHECK_EQ(v, max);
778 
779     assert(buf[strlen(buf)-1] != '9');
780     buf[strlen(buf)-1]++;
781     CHECK(!RE2::FullMatch(buf,   "(-?\\d+)", &v));
782   }
783 }
784 
// Parses into float and double outputs and compares against the values the
// compiler produces for the same literals.
TEST(RE2, FloatingPointFullMatchTypes) {
  const char* const kWhole = "(.*)";

  // A long run of leading zeros, prepended to some inputs below.
  string zeros(100, '0');

  {
    float f;
    CHECK(RE2::FullMatch("100", kWhole, &f));
    CHECK_EQ(f, 100);
    CHECK(RE2::FullMatch("-100.", kWhole, &f));
    CHECK_EQ(f, -100);
    CHECK(RE2::FullMatch("1e23", kWhole, &f));
    CHECK_EQ(f, float(1e23));

    CHECK(RE2::FullMatch(zeros + "1e23", kWhole, &f));
    CHECK_EQ(f, float(1e23));

    // 6700000000081920.1 is an edge case.
    // 6700000000081920 lies exactly halfway between two float32 values, so
    // the trailing .1 should make it round up.  That .1 is beyond float64
    // precision, though: the nearest float64 is 6700000000081920 itself.
    // A parser that calls strtod and then narrows to float32 would
    // therefore round-to-even downwards instead of up.  To pass, the parser
    // must call strtof directly.
    // The number is deliberately only 17 digits long, since C does not
    // guarantee correctly rounded results from strtod and strtof unless the
    // input is short.
    CHECK(RE2::FullMatch("0.1", kWhole, &f));
    CHECK_EQ(f, 0.1f) << StringPrintf("%.8g != %.8g", f, 0.1f);
    CHECK(RE2::FullMatch("6700000000081920.1", kWhole, &f));
    CHECK_EQ(f, 6700000000081920.1f)
      << StringPrintf("%.8g != %.8g", f, 6700000000081920.1f);
  }
  {
    double d;
    CHECK(RE2::FullMatch("100", kWhole, &d));
    CHECK_EQ(d, 100);
    CHECK(RE2::FullMatch("-100.", kWhole, &d));
    CHECK_EQ(d, -100);
    CHECK(RE2::FullMatch("1e23", kWhole, &d));
    CHECK_EQ(d, 1e23);

    CHECK(RE2::FullMatch(zeros + "1e23", kWhole, &d));
    CHECK_EQ(d, double(1e23));

    CHECK(RE2::FullMatch("0.1", kWhole, &d));
    CHECK_EQ(d, 0.1) << StringPrintf("%.17g != %.17g", d, 0.1);
    CHECK(RE2::FullMatch("1.00000005960464485", kWhole, &d));
    CHECK_EQ(d, 1.0000000596046448)
      << StringPrintf("%.17g != %.17g", d, 1.0000000596046448);
  }
}
829 
// FullMatch is anchored at both ends: stray characters before or after the
// digits defeat the match unless the pattern accounts for them.
TEST(RE2, FullMatchAnchored) {
  int number;

  // Unaccounted-for 'x' at either end.
  CHECK(!RE2::FullMatch("x1001", "(\\d+)", &number));
  CHECK(!RE2::FullMatch("1001x", "(\\d+)", &number));

  // The pattern now covers the 'x'.
  CHECK(RE2::FullMatch("x1001", "x(\\d+)", &number));
  CHECK_EQ(number, 1001);
  CHECK(RE2::FullMatch("1001x", "(\\d+)x", &number));
  CHECK_EQ(number, 1001);
}
838 
// Brace repetition with a lower bound only: {5,} needs at least five
// characters from the class.
TEST(RE2, FullMatchBraces) {
  const char* const kFiveOrMore = "[0-9a-f+.-]{5,}";

  CHECK(RE2::FullMatch("0abcd", kFiveOrMore));   // exactly five
  CHECK(RE2::FullMatch("0abcde", kFiveOrMore));  // six
  CHECK(!RE2::FullMatch("0abc", kFiveOrMore));   // only four
}
845 
// A three-way alternation: each branch matches its own text, and text that
// is longer than any single branch is rejected.
TEST(RE2, Complicated) {
  const char* const kAlternation = "foo|bar|[A-Z]";

  CHECK(RE2::FullMatch("foo", kAlternation));
  CHECK(RE2::FullMatch("bar", kAlternation));
  CHECK(RE2::FullMatch("X", kAlternation));
  CHECK(!RE2::FullMatch("XY", kAlternation));
}
853 
// Check full-match handling (needs '$' tacked on internally): the end
// anchor has to apply to every alternative, not only the last one.
TEST(RE2, FullMatchEnd) {
  // Alternation without an explicit '$'.
  CHECK(RE2::FullMatch("fo", "fo|foo"));
  CHECK(RE2::FullMatch("foo", "fo|foo"));

  // Alternation whose last branch carries an explicit '$'.
  CHECK(RE2::FullMatch("fo", "fo|foo$"));
  CHECK(RE2::FullMatch("foo", "fo|foo$"));

  CHECK(RE2::FullMatch("foo", "foo$"));
  CHECK(!RE2::FullMatch("foo$bar", "foo\\$"));
  CHECK(!RE2::FullMatch("fox", "fo|bar"));

  // Disabled on purpose.  Enable the following if we change the handling of
  // '$' to prevent it from matching a trailing newline.
  if (false) {
    // Check that we don't get bitten by pcre's special handling of a
    // '\n' at the end of the string matching '$'
    CHECK(!RE2::PartialMatch("foo\n", "foo$"));
  }
}
872 
TEST(RE2,FullMatchArgCount)873 TEST(RE2, FullMatchArgCount) {
874   // Number of args
875   int a[16];
876   CHECK(RE2::FullMatch("", ""));
877 
878   memset(a, 0, sizeof(0));
879   CHECK(RE2::FullMatch("1",
880                       "(\\d){1}",
881                       &a[0]));
882   CHECK_EQ(a[0], 1);
883 
884   memset(a, 0, sizeof(0));
885   CHECK(RE2::FullMatch("12",
886                       "(\\d)(\\d)",
887                       &a[0],  &a[1]));
888   CHECK_EQ(a[0], 1);
889   CHECK_EQ(a[1], 2);
890 
891   memset(a, 0, sizeof(0));
892   CHECK(RE2::FullMatch("123",
893                       "(\\d)(\\d)(\\d)",
894                       &a[0],  &a[1],  &a[2]));
895   CHECK_EQ(a[0], 1);
896   CHECK_EQ(a[1], 2);
897   CHECK_EQ(a[2], 3);
898 
899   memset(a, 0, sizeof(0));
900   CHECK(RE2::FullMatch("1234",
901                       "(\\d)(\\d)(\\d)(\\d)",
902                       &a[0],  &a[1],  &a[2],  &a[3]));
903   CHECK_EQ(a[0], 1);
904   CHECK_EQ(a[1], 2);
905   CHECK_EQ(a[2], 3);
906   CHECK_EQ(a[3], 4);
907 
908   memset(a, 0, sizeof(0));
909   CHECK(RE2::FullMatch("12345",
910                       "(\\d)(\\d)(\\d)(\\d)(\\d)",
911                       &a[0],  &a[1],  &a[2],  &a[3],
912                       &a[4]));
913   CHECK_EQ(a[0], 1);
914   CHECK_EQ(a[1], 2);
915   CHECK_EQ(a[2], 3);
916   CHECK_EQ(a[3], 4);
917   CHECK_EQ(a[4], 5);
918 
919   memset(a, 0, sizeof(0));
920   CHECK(RE2::FullMatch("123456",
921                       "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
922                       &a[0],  &a[1],  &a[2],  &a[3],
923                       &a[4],  &a[5]));
924   CHECK_EQ(a[0], 1);
925   CHECK_EQ(a[1], 2);
926   CHECK_EQ(a[2], 3);
927   CHECK_EQ(a[3], 4);
928   CHECK_EQ(a[4], 5);
929   CHECK_EQ(a[5], 6);
930 
931   memset(a, 0, sizeof(0));
932   CHECK(RE2::FullMatch("1234567",
933                       "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
934                       &a[0],  &a[1],  &a[2],  &a[3],
935                       &a[4],  &a[5],  &a[6]));
936   CHECK_EQ(a[0], 1);
937   CHECK_EQ(a[1], 2);
938   CHECK_EQ(a[2], 3);
939   CHECK_EQ(a[3], 4);
940   CHECK_EQ(a[4], 5);
941   CHECK_EQ(a[5], 6);
942   CHECK_EQ(a[6], 7);
943 
944   memset(a, 0, sizeof(0));
945   CHECK(RE2::FullMatch("1234567890123456",
946                       "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)"
947                       "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
948                       &a[0],  &a[1],  &a[2],  &a[3],
949                       &a[4],  &a[5],  &a[6],  &a[7],
950                       &a[8],  &a[9],  &a[10], &a[11],
951                       &a[12], &a[13], &a[14], &a[15]));
952   CHECK_EQ(a[0], 1);
953   CHECK_EQ(a[1], 2);
954   CHECK_EQ(a[2], 3);
955   CHECK_EQ(a[3], 4);
956   CHECK_EQ(a[4], 5);
957   CHECK_EQ(a[5], 6);
958   CHECK_EQ(a[6], 7);
959   CHECK_EQ(a[7], 8);
960   CHECK_EQ(a[8], 9);
961   CHECK_EQ(a[9], 0);
962   CHECK_EQ(a[10], 1);
963   CHECK_EQ(a[11], 2);
964   CHECK_EQ(a[12], 3);
965   CHECK_EQ(a[13], 4);
966   CHECK_EQ(a[14], 5);
967   CHECK_EQ(a[15], 6);
968 }
969 
TEST(RE2, Accessors) {
  // pattern() must hand back exactly the string the RE2 was built from.
  {
    const string kSource = "http://([^/]+)/.*";
    const RE2 compiled(kSource);
    CHECK_EQ(kSource, compiled.pattern());
  }

  // A pattern that compiles cleanly must report success through every
  // error-related accessor.
  {
    RE2 compiled("foo");
    CHECK(compiled.error().empty());  // no error text
    CHECK(compiled.ok());
    CHECK(compiled.error_code() == RE2::NoError);
  }
}
986 
TEST(RE2, UTF8) {
  // Check UTF-8 handling.
  //
  // The test data is written as string literals with hex escapes rather
  // than as brace-enclosed lists of integer constants: values such as 0xe6
  // do not fit in a signed char, so listing them inside braces is a
  // narrowing conversion that C++11 and later reject.  The resulting
  // arrays hold the same bytes, including the terminating NUL.

  // Three Japanese characters (nihongo).
  const char utf8_string[] =
      "\xe6\x97\xa5"   // U+65E5
      "\xe6\x9c\xac"   // U+672C
      "\xe8\xaa\x9e";  // U+8A9E
  // Any character, then the middle character above, then any character.
  const char utf8_pattern[] =
      "."
      "\xe6\x9c\xac"   // U+672C
      ".";

  // Both should match in either mode, bytes or UTF-8:
  // nine bytes in Latin-1 mode, three characters in UTF-8 mode.
  RE2 re_test1(".........", RE2::Latin1);
  CHECK(RE2::FullMatch(utf8_string, re_test1));
  RE2 re_test2("...");
  CHECK(RE2::FullMatch(utf8_string, re_test2));

  // Check that '.' matches one byte or UTF-8 character
  // according to the mode.
  string s;
  RE2 re_test3("(.)", RE2::Latin1);
  CHECK(RE2::PartialMatch(utf8_string, re_test3, &s));
  CHECK_EQ(s, string("\xe6"));
  RE2 re_test4("(.)");
  CHECK(RE2::PartialMatch(utf8_string, re_test4, &s));
  CHECK_EQ(s, string("\xe6\x97\xa5"));

  // Check that string matches itself in either mode
  RE2 re_test5(utf8_string, RE2::Latin1);
  CHECK(RE2::FullMatch(utf8_string, re_test5));
  RE2 re_test6(utf8_string);
  CHECK(RE2::FullMatch(utf8_string, re_test6));

  // Check that pattern matches string only in UTF8 mode: in Latin-1 mode
  // each '.' consumes a single byte, which leaves the pattern too short.
  RE2 re_test7(utf8_pattern, RE2::Latin1);
  CHECK(!RE2::FullMatch(utf8_string, re_test7));
  RE2 re_test8(utf8_pattern);
  CHECK(RE2::FullMatch(utf8_string, re_test8));
}
1031 
TEST(RE2, UngreedyUTF8) {
  // Regression test for bug 82246: ungreedy regular expressions in UTF-8
  // mode must not match text they should reject.  The text contains a
  // space, which \w cannot cross, so no full match is possible.
  const string kText = "a aX";

  // Greedy baseline; this case always worked.
  {
    const char* const kGreedy = "\\w+X";
    RE2 latin1_re(kGreedy, RE2::Latin1);
    RE2 utf8_re(kGreedy);

    CHECK(!RE2::FullMatch(kText, latin1_re));
    CHECK(!RE2::FullMatch(kText, utf8_re));
  }

  // Same pattern with the ungreedy flag switched on.
  {
    const char* const kUngreedy = "(?U)\\w+X";
    RE2 latin1_re(kUngreedy, RE2::Latin1);
    CHECK_EQ(latin1_re.error(), "");
    RE2 utf8_re(kUngreedy);

    CHECK(!RE2::FullMatch(kText, latin1_re));
    CHECK(!RE2::FullMatch(kText, utf8_re));
  }
}
1056 
TEST(RE2, Rejects) {
  // Each pattern below is malformed and must fail to compile.  RE2::Quiet
  // is passed so that the expected failures are not logged.

  // Backreference.
  { RE2 bad("a\\1", RE2::Quiet); CHECK(!bad.ok()); }
  // Character class that is never closed.
  { RE2 bad("a[x", RE2::Quiet); CHECK(!bad.ok()); }
  // Character range written high-to-low.
  { RE2 bad("a[z-a]", RE2::Quiet); CHECK(!bad.ok()); }
  // Unknown named character class.
  { RE2 bad("a[[:foobar:]]", RE2::Quiet); CHECK(!bad.ok()); }
  // Group that is never closed.
  { RE2 bad("a(b", RE2::Quiet); CHECK(!bad.ok()); }
  // Backslash with nothing after it.
  { RE2 bad("a\\", RE2::Quiet); CHECK(!bad.ok()); }
}
1080 
TEST(RE2, NoCrash) {
  // Matching against a regexp that failed to compile must simply report
  // "no match" instead of crashing.
  {
    RE2 broken("a\\", RE2::Quiet);
    CHECK(!broken.ok());
    CHECK(!RE2::PartialMatch("a\\b", broken));
  }

  // The same holds for a regexp that is rejected for being enormous.
  {
    RE2 enormous("(((.{100}){100}){100}){100}", RE2::Quiet);
    CHECK(!enormous.ok());
    CHECK(!RE2::PartialMatch("aaa", enormous));
  }

  // A large counted repetition must still compile and run.
  {
    RE2 counted(".{512}x", RE2::Quiet);
    CHECK(counted.ok());
    // 515 filler characters followed by the 'x' the pattern ends with.
    string text(515, 'c');
    text += "x";
    CHECK(RE2::PartialMatch(text, counted));
  }
}
1106 
TEST(RE2, Recursion) {
  // Inherited from the PCRE test suite, where it checked that recursion is
  // cut off; RE2 itself does not recurse.
  const int kBytes = 15 * 1024;  // enough to crash PCRE
  static const char* const kPatterns[] = {
    ".",
    "a",
    "a.",
    "ab.",
    "abc.",
  };
  for (size_t i = 0; i < arraysize(kPatterns); ++i)
    TestRecursion(kBytes, kPatterns[i]);
}
1117 
TEST(RE2, BigCountedRepetition) {
  // A counted repetition must work when the memory budget is generous.
  RE2::Options roomy;
  roomy.set_max_mem(256<<20);

  RE2 counted(".{512}x", roomy);
  CHECK(counted.ok());

  // 515 filler characters followed by the 'x' the pattern ends with.
  string text(515, 'c');
  text += "x";
  CHECK(RE2::PartialMatch(text, counted));
}
1130 
TEST(RE2, DeepRecursion) {
  // Another PCRE legacy test (RE2 does not recurse): before pcre was
  // patched, this input overflowed the stack and ended in a segmentation
  // violation.
  //
  // The text is "x*", then 131072 copies of 'a', then "*x".
  string text = "x*";
  text.append(131072, 'a');
  text.append("*x");

  RE2 comment_re("((?:\\s|xx.*\n|x[*](?:\n|.)*?[*]x)*)");
  CHECK(RE2::FullMatch(text, comment_re));
}
1143 
1144 // Suggested by Josh Hyman.  Failed when SearchOnePass was
1145 // not implementing case-folding.
TEST(CaseInsensitive, MatchAndConsume) {
  // A case-insensitive class repeated five times has to find "Wanda" both
  // through PartialMatch and through FindAndConsume on the same input.
  const char* const kFiveLetters = "(?i)([wand]{5})";
  string sentence = "A fish named *Wanda*";
  StringPiece remaining(sentence);
  string captured;

  EXPECT_TRUE(RE2::PartialMatch(remaining, kFiveLetters, &captured));
  EXPECT_TRUE(RE2::FindAndConsume(&remaining, kFiveLetters, &captured));
}
1154 
1155 // RE2 should permit implicit conversions from string, StringPiece, const char*,
1156 // and C string literals.
TEST(RE2, ImplicitConversions) {
  // The same one-character pattern, held in each type that must convert
  // to RE2 implicitly at the call site.
  const char* as_cstring = ".";
  StringPiece as_stringpiece(".");
  string as_string(".");

  EXPECT_TRUE(RE2::PartialMatch("e", as_string));
  EXPECT_TRUE(RE2::PartialMatch("e", as_stringpiece));
  EXPECT_TRUE(RE2::PartialMatch("e", as_cstring));
  // A string literal passed directly.
  EXPECT_TRUE(RE2::PartialMatch("e", "."));
}
1166 
1167 // Bugs introduced by 8622304
TEST(RE2, CL8622304) {
  // Case reported by ingow: a negated class containing only a backslash.
  // The original report marked the call without an output argument as
  // working and the call with one as failing.
  {
    const char* const kNotBackslash = "([^\\\\])";
    string dir;
    EXPECT_TRUE(RE2::FullMatch("D", kNotBackslash));
    EXPECT_TRUE(RE2::FullMatch("D", kNotBackslash, &dir));
  }

  // Case reported by jacobsa: a key, then an optional ':'-introduced value
  // that runs up to the next ';'.
  {
    string key;
    string val;
    EXPECT_TRUE(RE2::PartialMatch(
        "bar:1,0x2F,030,4,5;baz:true;fooby:false,true",
        "(\\w+)(?::((?:[^;\\\\]|\\\\.)*))?;?",
        &key, &val));
    EXPECT_EQ(key, "bar");
    EXPECT_EQ(val, "1,0x2F,030,4,5");
  }
}
1183 
1184 
1185 // Check that RE2 returns correct regexp pieces on error.
1186 // In particular, make sure it returns whole runes
1187 // and that it always reports invalid UTF-8.
1188 // Also check that Perl error flag piece is big enough.
// One malformed pattern together with the piece of it that RE2 is
// expected to report as the offending argument.
struct ErrorTest {
  const char* regexp;  // pattern that must fail to compile
  const char* error;   // expected error argument; "" when none is reported
};

static ErrorTest error_tests[] = {
  // Well-formed UTF-8: the reported piece must consist of whole runes.
  { "ab\\αcd",         "\\α" },
  { "ef\\x☺01",        "\\x☺0" },
  { "gh\\x1☺01",       "\\x1☺" },
  { "ij\\x1",          "\\x1" },
  { "kl\\x",           "\\x" },
  { "uv\\x{0000☺}",    "\\x{0000☺" },
  { "wx\\p{ABC",       "\\p{ABC" },
  // Used to return (?s but the error is X.
  { "yz(?smiUX:abc)",  "(?smiUX" },
  { "aa(?sm☺i",        "(?sm☺" },
  { "bb[abc",          "[abc" },

  // Invalid UTF-8 (byte \377): no argument string is returned.
  { "mn\\x1\377",          "" },
  { "op\377qr",            "" },
  { "st\\x{00000\377",     "" },
  { "zz\\p{\377}",         "" },
  { "zz\\x{00\377}",       "" },
  { "zz(?P<name\377>abc)", "" },
};
TEST(RE2, ErrorArgs) {
  // Every entry must fail to compile and report the expected piece of the
  // pattern as its error argument.  The full error text is appended to the
  // failure message to make a mismatch easier to diagnose.
  for (size_t i = 0; i < arraysize(error_tests); ++i) {
    const ErrorTest& tc = error_tests[i];
    RE2 bad(tc.regexp, RE2::Quiet);
    EXPECT_FALSE(bad.ok());
    EXPECT_EQ(bad.error_arg(), tc.error) << bad.error();
  }
}
1218 
1219 // Check that "never match \n" mode never matches \n.
// One "never match \n" case: a pattern, the text it is searched in, and
// the expected first capture (NULL when no match is expected).
struct NeverTest {
  const char* regexp;
  const char* text;
  const char* match;
};

static NeverTest never_tests[] = {
  { "(.*)",            "abc\ndef\nghi\n",    "abc" },
  { "(?s)(abc.*def)",  "abc\ndef\n",         NULL },
  { "(abc(.|\n)*def)", "abc\ndef\n",         NULL },
  { "(abc[^x]*def)",   "abc\ndef\n",         NULL },
  { "(abc[^x]*def)",   "abczzzdef\ndef\n",   "abczzzdef" },
};
TEST(RE2, NeverNewline) {
  // Compile every table entry with never_nl switched on, then check that
  // it either fails to match or captures exactly the expected text.
  RE2::Options no_newline;
  no_newline.set_never_nl(true);

  for (size_t i = 0; i < arraysize(never_tests); ++i) {
    const NeverTest& tc = never_tests[i];
    RE2 re(tc.regexp, no_newline);
    if (tc.match != NULL) {
      StringPiece captured;
      EXPECT_TRUE(RE2::PartialMatch(tc.text, re, &captured));
      EXPECT_EQ(captured, tc.match);
    } else {
      // A NULL expectation means the pattern must not match at all.
      EXPECT_FALSE(RE2::PartialMatch(tc.text, re));
    }
  }
}
1246 
1247 // Check that there are no capturing groups in "never capture" mode.
TEST(RE2, NeverCapture) {
  // With never_capture set, the two parenthesized groups in the pattern
  // must not be counted as capturing groups.
  RE2::Options no_capture;
  no_capture.set_never_capture(true);

  RE2 grouped("(r)(e)", no_capture);
  EXPECT_EQ(0, grouped.NumberOfCapturingGroups());
}
1254 
1255 // Bitstate bug was looking at submatch[0] even if nsubmatch == 0.
1256 // Triggered by a failed DFA search falling back to Bitstate when
1257 // using Match with a NULL submatch set.  Bitstate tried to read
1258 // the submatch[0] entry even if nsubmatch was 0.
TEST(RE2, BitstateCaptureBug) {
  // Small memory budget, then a Match call that passes a NULL submatch
  // array with a count of zero.  The '$' cannot match because an 'x'
  // follows the underscores, so the search must report failure.
  RE2::Options tight;
  tight.set_max_mem(20000);
  RE2 anchored("(_________$)", tight);

  StringPiece text = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x";
  EXPECT_FALSE(
      anchored.Match(text, 0, text.size(), RE2::UNANCHORED, NULL, 0));
}
1266 
1267 // C++ version of bug 609710.
TEST(RE2, UnicodeClasses) {
  // Part 1: single characters against \p{..} and the negated \P{..}.

  // 'A' is a letter and an uppercase letter, but not a lowercase one.
  EXPECT_TRUE(RE2::FullMatch("A", "\\p{L}"));
  EXPECT_TRUE(RE2::FullMatch("A", "\\p{Lu}"));
  EXPECT_FALSE(RE2::FullMatch("A", "\\p{Ll}"));
  EXPECT_FALSE(RE2::FullMatch("A", "\\P{L}"));
  EXPECT_FALSE(RE2::FullMatch("A", "\\P{Lu}"));
  EXPECT_TRUE(RE2::FullMatch("A", "\\P{Ll}"));

  // Each of the three characters below is a letter that is neither
  // uppercase nor lowercase.
  EXPECT_TRUE(RE2::FullMatch("譚", "\\p{L}"));
  EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Lu}"));
  EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Ll}"));
  EXPECT_FALSE(RE2::FullMatch("譚", "\\P{L}"));
  EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Lu}"));
  EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Ll}"));

  EXPECT_TRUE(RE2::FullMatch("永", "\\p{L}"));
  EXPECT_FALSE(RE2::FullMatch("永", "\\p{Lu}"));
  EXPECT_FALSE(RE2::FullMatch("永", "\\p{Ll}"));
  EXPECT_FALSE(RE2::FullMatch("永", "\\P{L}"));
  EXPECT_TRUE(RE2::FullMatch("永", "\\P{Lu}"));
  EXPECT_TRUE(RE2::FullMatch("永", "\\P{Ll}"));

  EXPECT_TRUE(RE2::FullMatch("鋒", "\\p{L}"));
  EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Lu}"));
  EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Ll}"));
  EXPECT_FALSE(RE2::FullMatch("鋒", "\\P{L}"));
  EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Lu}"));
  EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Ll}"));

  // Part 2: captures taken from a string that mixes ASCII capitals with
  // the three characters checked above.
  const string mixed = "ABCDEFGHI譚永鋒";
  string first, second, third;

  // Non-greedy gaps: the three captures are the first three characters.
  EXPECT_TRUE(RE2::PartialMatch(mixed, "(.).*?(.).*?(.)",
                                &first, &second, &third));
  EXPECT_EQ("A", first);
  EXPECT_EQ("B", second);
  EXPECT_EQ("C", third);

  EXPECT_TRUE(RE2::PartialMatch(mixed, "(.).*?([\\p{L}]).*?(.)",
                                &first, &second, &third));
  EXPECT_EQ("A", first);
  EXPECT_EQ("B", second);
  EXPECT_EQ("C", third);

  // Every character of the string is a letter.
  EXPECT_FALSE(RE2::PartialMatch(mixed, "\\P{L}"));

  EXPECT_TRUE(RE2::PartialMatch(mixed, "(.).*?([\\p{Lu}]).*?(.)",
                                &first, &second, &third));
  EXPECT_EQ("A", first);
  EXPECT_EQ("B", second);
  EXPECT_EQ("C", third);

  // Every character of the string is in Lu or Lo.
  EXPECT_FALSE(RE2::PartialMatch(mixed, "[^\\p{Lu}\\p{Lo}]"));

  // A greedy leading ".*" pushes the captures to the last three characters.
  EXPECT_TRUE(RE2::PartialMatch(mixed, ".*(.).*?([\\p{Lu}\\p{Lo}]).*?(.)",
                                &first, &second, &third));
  EXPECT_EQ("譚", first);
  EXPECT_EQ("永", second);
  EXPECT_EQ("鋒", third);
}
1324 
1325 // Bug reported by saito. 2009/02/17
TEST(RE2, NullVsEmptyString) {
  // ".*" must fully match both a StringPiece built from "" and a
  // default-constructed StringPiece.
  RE2 anything(".*");

  StringPiece from_empty_literal("");
  EXPECT_TRUE(RE2::FullMatch(from_empty_literal, anything));

  StringPiece default_constructed;
  EXPECT_TRUE(RE2::FullMatch(default_constructed, anything));
}
1334 
1335 // Issue 1816809
TEST(RE2, Bug1816809) {
  // A deeply nested group structure around a ';'-separated list;
  // Consume must succeed on input made of two list items.
  RE2 nested("(((((llx((-3)|(4)))(;(llx((-3)|(4))))*))))");
  StringPiece input("llx-3;llx4");
  string first_group;
  EXPECT_TRUE(RE2::Consume(&input, nested, &first_group));
}
1342 
1343 // Issue 3061120
TEST(RE2, Bug3061120) {
  // Case-insensitive \W must not match ordinary ASCII letters.
  RE2 non_word("(?i)\\W");

  // Always worked.
  EXPECT_FALSE(RE2::PartialMatch("x", non_word));
  // Broke because of kelvin.
  EXPECT_FALSE(RE2::PartialMatch("k", non_word));
  // Broke because of latin long s.
  EXPECT_FALSE(RE2::PartialMatch("s", non_word));
}
1350 
TEST(RE2, CapturingGroupNames) {
  // Opening parentheses annotated with group IDs:
  //            12    3        45   6         7
  RE2 named("((abc)(?P<G2>)|((e+)(?P<G2>.*)(?P<G1>u+)))");
  EXPECT_TRUE(named.ok());

  // Only the named groups are expected in the map, keyed by group ID.
  map<int, string> expected;
  expected[3] = "G2";
  expected[6] = "G2";
  expected[7] = "G1";

  const map<int, string>& actual = named.CapturingGroupNames();
  EXPECT_EQ(expected, actual);
}
1363 
TEST(RE2, RegexpToStringLossOfAnchor) {
  // Converting a parsed regexp back to a string must keep its anchors.
  // With POSIX options the anchor is printed as written; with default
  // options it is printed inside a (?-m:...) group.

  // Leading '^'.
  RE2 posix_begin("^[a-c]at", RE2::POSIX);
  EXPECT_EQ(posix_begin.Regexp()->ToString(), "^[a-c]at");
  RE2 default_begin("^[a-c]at");
  EXPECT_EQ(default_begin.Regexp()->ToString(), "(?-m:^)[a-c]at");

  // Trailing '$'.
  RE2 posix_end("ca[t-z]$", RE2::POSIX);
  EXPECT_EQ(posix_end.Regexp()->ToString(), "ca[t-z]$");
  RE2 default_end("ca[t-z]$");
  EXPECT_EQ(default_end.Regexp()->ToString(), "ca[t-z](?-m:$)");
}
1370 
1371 }  // namespace re2
1372