1 // Copyright 2006-2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include <vector>
6 #include "util/test.h"
7 #include "re2/prog.h"
8 #include "re2/re2.h"
9 #include "re2/regexp.h"
10 #include "re2/testing/regexp_generator.h"
11 #include "re2/testing/string_generator.h"
12 
13 namespace re2 {
14 
15 // Test that C++ strings are compared as uint8s, not int8s.
16 // PossibleMatchRange doesn't depend on this, but callers probably will.
TEST(CplusplusStrings,EightBit)17 TEST(CplusplusStrings, EightBit) {
18   string s = "\x70";
19   string t = "\xA0";
20   EXPECT_LT(s, t);
21 }
22 
23 struct PrefixTest {
24   const char* regexp;
25   int maxlen;
26   const char* min;
27   const char* max;
28 };
29 
30 static PrefixTest tests[] = {
31   { "",                  10,  "",           "",        },
32   { "Abcdef",            10,  "Abcdef",     "Abcdef"   },
33   { "abc(def|ghi)",      10,  "abcdef",     "abcghi"   },
34   { "a+hello",           10,  "aa",         "ahello"   },
35   { "a*hello",           10,  "a",          "hello"    },
36   { "def|abc",           10,  "abc",        "def"      },
37   { "a(b)(c)[d]",        10,  "abcd",       "abcd"     },
38   { "ab(cab|cat)",       10,  "abcab",      "abcat"    },
39   { "ab(cab|ca)x",       10,  "abcabx",     "abcax"    },
40   { "(ab|x)(c|de)",      10,  "abc",        "xde"      },
41   { "(ab|x)?(c|z)?",     10,  "",           "z"        },
42   { "[^\\s\\S]",         10,  "",           ""         },
43   { "(abc)+",             5,  "abc",        "abcac"    },
44   { "(abc)+",             2,  "ab",         "ac"       },
45   { "(abc)+",             1,  "a",          "b"        },
46   { "[a\xC3\xA1]",        4,  "a",          "\xC3\xA1" },
47   { "a*",                10,  "",           "ab"       },
48 
49   { "(?i)Abcdef",        10,  "ABCDEF",     "abcdef"   },
50   { "(?i)abc(def|ghi)",  10,  "ABCDEF",     "abcghi"   },
51   { "(?i)a+hello",       10,  "AA",         "ahello"   },
52   { "(?i)a*hello",       10,  "A",          "hello"    },
53   { "(?i)def|abc",       10,  "ABC",        "def"      },
54   { "(?i)a(b)(c)[d]",    10,  "ABCD",       "abcd"     },
55   { "(?i)ab(cab|cat)",   10,  "ABCAB",      "abcat"    },
56   { "(?i)ab(cab|ca)x",   10,  "ABCABX",     "abcax"    },
57   { "(?i)(ab|x)(c|de)",  10,  "ABC",        "xde"      },
58   { "(?i)(ab|x)?(c|z)?", 10,  "",           "z"        },
59   { "(?i)[^\\s\\S]",     10,  "",           ""         },
60   { "(?i)(abc)+",         5,  "ABC",        "abcac"    },
61   { "(?i)(abc)+",         2,  "AB",         "ac"       },
62   { "(?i)(abc)+",         1,  "A",          "b"        },
63   { "(?i)[a\xC3\xA1]",    4,  "A",          "\xC3\xA1" },
64   { "(?i)a*",            10,  "",           "ab"       },
65   { "(?i)A*",            10,  "",           "ab"       },
66 
67   { "\\AAbcdef",         10,  "Abcdef",     "Abcdef"   },
68   { "\\Aabc(def|ghi)",   10,  "abcdef",     "abcghi"   },
69   { "\\Aa+hello",        10,  "aa",         "ahello"   },
70   { "\\Aa*hello",        10,  "a",          "hello"    },
71   { "\\Adef|abc",        10,  "abc",        "def"      },
72   { "\\Aa(b)(c)[d]",     10,  "abcd",       "abcd"     },
73   { "\\Aab(cab|cat)",    10,  "abcab",      "abcat"    },
74   { "\\Aab(cab|ca)x",    10,  "abcabx",     "abcax"    },
75   { "\\A(ab|x)(c|de)",   10,  "abc",        "xde"      },
76   { "\\A(ab|x)?(c|z)?",  10,  "",           "z"        },
77   { "\\A[^\\s\\S]",      10,  "",           ""         },
78   { "\\A(abc)+",          5,  "abc",        "abcac"    },
79   { "\\A(abc)+",          2,  "ab",         "ac"       },
80   { "\\A(abc)+",          1,  "a",          "b"        },
81   { "\\A[a\xC3\xA1]",     4,  "a",          "\xC3\xA1" },
82   { "\\Aa*",             10,  "",           "ab"       },
83 
84   { "(?i)\\AAbcdef",         10,  "ABCDEF",     "abcdef"   },
85   { "(?i)\\Aabc(def|ghi)",   10,  "ABCDEF",     "abcghi"   },
86   { "(?i)\\Aa+hello",        10,  "AA",         "ahello"   },
87   { "(?i)\\Aa*hello",        10,  "A",          "hello"    },
88   { "(?i)\\Adef|abc",        10,  "ABC",        "def"      },
89   { "(?i)\\Aa(b)(c)[d]",     10,  "ABCD",       "abcd"     },
90   { "(?i)\\Aab(cab|cat)",    10,  "ABCAB",      "abcat"    },
91   { "(?i)\\Aab(cab|ca)x",    10,  "ABCABX",     "abcax"    },
92   { "(?i)\\A(ab|x)(c|de)",   10,  "ABC",        "xde"      },
93   { "(?i)\\A(ab|x)?(c|z)?",  10,  "",           "z"        },
94   { "(?i)\\A[^\\s\\S]",      10,  "",           ""         },
95   { "(?i)\\A(abc)+",          5,  "ABC",        "abcac"    },
96   { "(?i)\\A(abc)+",          2,  "AB",         "ac"       },
97   { "(?i)\\A(abc)+",          1,  "A",          "b"        },
98   { "(?i)\\A[a\xC3\xA1]",     4,  "A",          "\xC3\xA1" },
99   { "(?i)\\Aa*",             10,  "",           "ab"       },
100   { "(?i)\\AA*",             10,  "",           "ab"       },
101 };
102 
TEST(PossibleMatchRange,HandWritten)103 TEST(PossibleMatchRange, HandWritten) {
104   for (int i = 0; i < arraysize(tests); i++) {
105     for (int j = 0; j < 2; j++) {
106       const PrefixTest& t = tests[i];
107       string min, max;
108       if (j == 0) {
109         LOG(INFO) << "Checking regexp=" << CEscape(t.regexp);
110         Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
111         CHECK(re);
112         Prog* prog = re->CompileToProg(0);
113         CHECK(prog);
114         CHECK(prog->PossibleMatchRange(&min, &max, t.maxlen))
115           << " " << t.regexp;
116         delete prog;
117         re->Decref();
118       } else {
119         CHECK(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen));
120       }
121       EXPECT_EQ(t.min, min) << t.regexp;
122       EXPECT_EQ(t.max, max) << t.regexp;
123     }
124   }
125 }
126 
127 // Test cases where PossibleMatchRange should return false.
TEST(PossibleMatchRange,Failures)128 TEST(PossibleMatchRange, Failures) {
129   string min, max;
130 
131   // Fails because no room to write max.
132   EXPECT_FALSE(RE2("abc").PossibleMatchRange(&min, &max, 0));
133 
134   // Fails because there is no max -- any non-empty string matches
135   // or begins a match.  Have to use Latin-1 input, because there
136   // are no valid UTF-8 strings beginning with byte 0xFF.
137   EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1).
138                PossibleMatchRange(&min, &max, 10))
139     << "min=" << CEscape(min) << ", max=" << CEscape(max);
140   EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1).
141                PossibleMatchRange(&min, &max, 10))
142     << "min=" << CEscape(min) << ", max=" << CEscape(max);
143   EXPECT_FALSE(RE2(".+hello", RE2::Latin1).
144                PossibleMatchRange(&min, &max, 10))
145     << "min=" << CEscape(min) << ", max=" << CEscape(max);
146   EXPECT_FALSE(RE2(".*hello", RE2::Latin1).
147                PossibleMatchRange(&min, &max, 10))
148     << "min=" << CEscape(min) << ", max=" << CEscape(max);
149   EXPECT_FALSE(RE2(".*", RE2::Latin1).
150                PossibleMatchRange(&min, &max, 10))
151     << "min=" << CEscape(min) << ", max=" << CEscape(max);
152   EXPECT_FALSE(RE2("\\C*").
153                PossibleMatchRange(&min, &max, 10))
154     << "min=" << CEscape(min) << ", max=" << CEscape(max);
155 
156   // Fails because it's a malformed regexp.
157   EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10))
158     << "min=" << CEscape(min) << ", max=" << CEscape(max);
159 }
160 
161 // Exhaustive test: generate all regexps within parameters,
162 // then generate all strings of a given length over a given alphabet,
163 // then check that the prefix information agrees with whether
164 // the regexp matches each of the strings.
165 class PossibleMatchTester : public RegexpGenerator {
166  public:
PossibleMatchTester(int maxatoms,int maxops,const vector<string> & alphabet,const vector<string> & ops,int maxstrlen,const vector<string> & stralphabet)167   PossibleMatchTester(int maxatoms,
168                       int maxops,
169                       const vector<string>& alphabet,
170                       const vector<string>& ops,
171                       int maxstrlen,
172                       const vector<string>& stralphabet)
173     : RegexpGenerator(maxatoms, maxops, alphabet, ops),
174       strgen_(maxstrlen, stralphabet),
175       regexps_(0), tests_(0) { }
176 
regexps()177   int regexps()  { return regexps_; }
tests()178   int tests()    { return tests_; }
179 
180   // Needed for RegexpGenerator interface.
181   void HandleRegexp(const string& regexp);
182 
183  private:
184   StringGenerator strgen_;
185 
186   int regexps_;   // Number of HandleRegexp calls
187   int tests_;     // Number of regexp tests.
188 
189   DISALLOW_EVIL_CONSTRUCTORS(PossibleMatchTester);
190 };
191 
192 // Processes a single generated regexp.
193 // Checks that all accepted strings agree with the prefix range.
HandleRegexp(const string & regexp)194 void PossibleMatchTester::HandleRegexp(const string& regexp) {
195   regexps_++;
196 
197   VLOG(3) << CEscape(regexp);
198 
199   RE2 re(regexp, RE2::Latin1);
200   CHECK_EQ(re.error(), "");
201 
202   string min, max;
203   if(!re.PossibleMatchRange(&min, &max, 10)) {
204     // There's no good max for "\\C*".  Can't use strcmp
205     // because sometimes it gets embedded in more
206     // complicated expressions.
207     if(strstr(regexp.c_str(), "\\C*"))
208       return;
209     LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp);
210   }
211 
212   strgen_.Reset();
213   while (strgen_.HasNext()) {
214     const StringPiece& s = strgen_.Next();
215     tests_++;
216     if (!RE2::FullMatch(s, re))
217       continue;
218     CHECK_GE(s, min) << " regexp: " << regexp << " max: " << max;
219     CHECK_LE(s, max) << " regexp: " << regexp << " min: " << min;
220   }
221 }
222 
TEST(PossibleMatchRange,Exhaustive)223 TEST(PossibleMatchRange, Exhaustive) {
224   int natom = 3;
225   int noperator = 3;
226   int stringlen = 5;
227   if (DEBUG_MODE) {
228     natom = 2;
229     noperator = 3;
230     stringlen = 3;
231   }
232   PossibleMatchTester t(natom, noperator, Split(" ", "a b [0-9]"),
233                  RegexpGenerator::EgrepOps(),
234                  stringlen, Explode("ab4"));
235   t.Generate();
236   LOG(INFO) << t.regexps() << " regexps, "
237             << t.tests() << " tests";
238 }
239 
240 }  // namespace re2
241