1 // Copyright 2006-2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include <string.h>
6 #include <string>
7 #include <vector>
8 
9 #include "util/test.h"
10 #include "util/logging.h"
11 #include "util/strutil.h"
12 #include "re2/prog.h"
13 #include "re2/re2.h"
14 #include "re2/regexp.h"
15 #include "re2/testing/exhaustive_tester.h"
16 #include "re2/testing/regexp_generator.h"
17 #include "re2/testing/string_generator.h"
18 
19 namespace re2 {
20 
21 // Test that C++ strings are compared as uint8s, not int8s.
22 // PossibleMatchRange doesn't depend on this, but callers probably will.
TEST(CplusplusStrings,EightBit)23 TEST(CplusplusStrings, EightBit) {
24   std::string s = "\x70";
25   std::string t = "\xA0";
26   EXPECT_LT(s, t);
27 }
28 
29 struct PrefixTest {
30   const char* regexp;
31   int maxlen;
32   const char* min;
33   const char* max;
34 };
35 
36 static PrefixTest tests[] = {
37   { "",                  10,  "",           "",        },
38   { "Abcdef",            10,  "Abcdef",     "Abcdef"   },
39   { "abc(def|ghi)",      10,  "abcdef",     "abcghi"   },
40   { "a+hello",           10,  "aa",         "ahello"   },
41   { "a*hello",           10,  "a",          "hello"    },
42   { "def|abc",           10,  "abc",        "def"      },
43   { "a(b)(c)[d]",        10,  "abcd",       "abcd"     },
44   { "ab(cab|cat)",       10,  "abcab",      "abcat"    },
45   { "ab(cab|ca)x",       10,  "abcabx",     "abcax"    },
46   { "(ab|x)(c|de)",      10,  "abc",        "xde"      },
47   { "(ab|x)?(c|z)?",     10,  "",           "z"        },
48   { "[^\\s\\S]",         10,  "",           ""         },
49   { "(abc)+",             5,  "abc",        "abcac"    },
50   { "(abc)+",             2,  "ab",         "ac"       },
51   { "(abc)+",             1,  "a",          "b"        },
52   { "[a\xC3\xA1]",        4,  "a",          "\xC3\xA1" },
53   { "a*",                10,  "",           "ab"       },
54 
55   { "(?i)Abcdef",        10,  "ABCDEF",     "abcdef"   },
56   { "(?i)abc(def|ghi)",  10,  "ABCDEF",     "abcghi"   },
57   { "(?i)a+hello",       10,  "AA",         "ahello"   },
58   { "(?i)a*hello",       10,  "A",          "hello"    },
59   { "(?i)def|abc",       10,  "ABC",        "def"      },
60   { "(?i)a(b)(c)[d]",    10,  "ABCD",       "abcd"     },
61   { "(?i)ab(cab|cat)",   10,  "ABCAB",      "abcat"    },
62   { "(?i)ab(cab|ca)x",   10,  "ABCABX",     "abcax"    },
63   { "(?i)(ab|x)(c|de)",  10,  "ABC",        "xde"      },
64   { "(?i)(ab|x)?(c|z)?", 10,  "",           "z"        },
65   { "(?i)[^\\s\\S]",     10,  "",           ""         },
66   { "(?i)(abc)+",         5,  "ABC",        "abcac"    },
67   { "(?i)(abc)+",         2,  "AB",         "ac"       },
68   { "(?i)(abc)+",         1,  "A",          "b"        },
69   { "(?i)[a\xC3\xA1]",    4,  "A",          "\xC3\xA1" },
70   { "(?i)a*",            10,  "",           "ab"       },
71   { "(?i)A*",            10,  "",           "ab"       },
72 
73   { "\\AAbcdef",         10,  "Abcdef",     "Abcdef"   },
74   { "\\Aabc(def|ghi)",   10,  "abcdef",     "abcghi"   },
75   { "\\Aa+hello",        10,  "aa",         "ahello"   },
76   { "\\Aa*hello",        10,  "a",          "hello"    },
77   { "\\Adef|abc",        10,  "abc",        "def"      },
78   { "\\Aa(b)(c)[d]",     10,  "abcd",       "abcd"     },
79   { "\\Aab(cab|cat)",    10,  "abcab",      "abcat"    },
80   { "\\Aab(cab|ca)x",    10,  "abcabx",     "abcax"    },
81   { "\\A(ab|x)(c|de)",   10,  "abc",        "xde"      },
82   { "\\A(ab|x)?(c|z)?",  10,  "",           "z"        },
83   { "\\A[^\\s\\S]",      10,  "",           ""         },
84   { "\\A(abc)+",          5,  "abc",        "abcac"    },
85   { "\\A(abc)+",          2,  "ab",         "ac"       },
86   { "\\A(abc)+",          1,  "a",          "b"        },
87   { "\\A[a\xC3\xA1]",     4,  "a",          "\xC3\xA1" },
88   { "\\Aa*",             10,  "",           "ab"       },
89 
90   { "(?i)\\AAbcdef",         10,  "ABCDEF",     "abcdef"   },
91   { "(?i)\\Aabc(def|ghi)",   10,  "ABCDEF",     "abcghi"   },
92   { "(?i)\\Aa+hello",        10,  "AA",         "ahello"   },
93   { "(?i)\\Aa*hello",        10,  "A",          "hello"    },
94   { "(?i)\\Adef|abc",        10,  "ABC",        "def"      },
95   { "(?i)\\Aa(b)(c)[d]",     10,  "ABCD",       "abcd"     },
96   { "(?i)\\Aab(cab|cat)",    10,  "ABCAB",      "abcat"    },
97   { "(?i)\\Aab(cab|ca)x",    10,  "ABCABX",     "abcax"    },
98   { "(?i)\\A(ab|x)(c|de)",   10,  "ABC",        "xde"      },
99   { "(?i)\\A(ab|x)?(c|z)?",  10,  "",           "z"        },
100   { "(?i)\\A[^\\s\\S]",      10,  "",           ""         },
101   { "(?i)\\A(abc)+",          5,  "ABC",        "abcac"    },
102   { "(?i)\\A(abc)+",          2,  "AB",         "ac"       },
103   { "(?i)\\A(abc)+",          1,  "A",          "b"        },
104   { "(?i)\\A[a\xC3\xA1]",     4,  "A",          "\xC3\xA1" },
105   { "(?i)\\Aa*",             10,  "",           "ab"       },
106   { "(?i)\\AA*",             10,  "",           "ab"       },
107 };
108 
TEST(PossibleMatchRange,HandWritten)109 TEST(PossibleMatchRange, HandWritten) {
110   for (size_t i = 0; i < arraysize(tests); i++) {
111     for (size_t j = 0; j < 2; j++) {
112       const PrefixTest& t = tests[i];
113       std::string min, max;
114       if (j == 0) {
115         LOG(INFO) << "Checking regexp=" << CEscape(t.regexp);
116         Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
117         ASSERT_TRUE(re != NULL);
118         Prog* prog = re->CompileToProg(0);
119         ASSERT_TRUE(prog != NULL);
120         ASSERT_TRUE(prog->PossibleMatchRange(&min, &max, t.maxlen))
121           << " " << t.regexp;
122         delete prog;
123         re->Decref();
124       } else {
125         ASSERT_TRUE(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen));
126       }
127       EXPECT_EQ(t.min, min) << t.regexp;
128       EXPECT_EQ(t.max, max) << t.regexp;
129     }
130   }
131 }
132 
133 // Test cases where PossibleMatchRange should return false.
TEST(PossibleMatchRange,Failures)134 TEST(PossibleMatchRange, Failures) {
135   std::string min, max;
136 
137   // Fails because no room to write max.
138   EXPECT_FALSE(RE2("abc").PossibleMatchRange(&min, &max, 0));
139 
140   // Fails because there is no max -- any non-empty string matches
141   // or begins a match.  Have to use Latin-1 input, because there
142   // are no valid UTF-8 strings beginning with byte 0xFF.
143   EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1).
144                PossibleMatchRange(&min, &max, 10))
145       << "min=" << CEscape(min) << ", max=" << CEscape(max);
146   EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1).
147                PossibleMatchRange(&min, &max, 10))
148       << "min=" << CEscape(min) << ", max=" << CEscape(max);
149   EXPECT_FALSE(RE2(".+hello", RE2::Latin1).
150                PossibleMatchRange(&min, &max, 10))
151       << "min=" << CEscape(min) << ", max=" << CEscape(max);
152   EXPECT_FALSE(RE2(".*hello", RE2::Latin1).
153                PossibleMatchRange(&min, &max, 10))
154       << "min=" << CEscape(min) << ", max=" << CEscape(max);
155   EXPECT_FALSE(RE2(".*", RE2::Latin1).
156                PossibleMatchRange(&min, &max, 10))
157       << "min=" << CEscape(min) << ", max=" << CEscape(max);
158   EXPECT_FALSE(RE2("\\C*").
159                PossibleMatchRange(&min, &max, 10))
160       << "min=" << CEscape(min) << ", max=" << CEscape(max);
161 
162   // Fails because it's a malformed regexp.
163   EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10))
164       << "min=" << CEscape(min) << ", max=" << CEscape(max);
165 }
166 
167 // Exhaustive test: generate all regexps within parameters,
168 // then generate all strings of a given length over a given alphabet,
169 // then check that the prefix information agrees with whether
170 // the regexp matches each of the strings.
171 class PossibleMatchTester : public RegexpGenerator {
172  public:
PossibleMatchTester(int maxatoms,int maxops,const std::vector<std::string> & alphabet,const std::vector<std::string> & ops,int maxstrlen,const std::vector<std::string> & stralphabet)173   PossibleMatchTester(int maxatoms,
174                       int maxops,
175                       const std::vector<std::string>& alphabet,
176                       const std::vector<std::string>& ops,
177                       int maxstrlen,
178                       const std::vector<std::string>& stralphabet)
179     : RegexpGenerator(maxatoms, maxops, alphabet, ops),
180       strgen_(maxstrlen, stralphabet),
181       regexps_(0), tests_(0) { }
182 
regexps()183   int regexps()  { return regexps_; }
tests()184   int tests()    { return tests_; }
185 
186   // Needed for RegexpGenerator interface.
187   void HandleRegexp(const std::string& regexp);
188 
189  private:
190   StringGenerator strgen_;
191 
192   int regexps_;   // Number of HandleRegexp calls
193   int tests_;     // Number of regexp tests.
194 
195   PossibleMatchTester(const PossibleMatchTester&) = delete;
196   PossibleMatchTester& operator=(const PossibleMatchTester&) = delete;
197 };
198 
199 // Processes a single generated regexp.
200 // Checks that all accepted strings agree with the prefix range.
HandleRegexp(const std::string & regexp)201 void PossibleMatchTester::HandleRegexp(const std::string& regexp) {
202   regexps_++;
203 
204   VLOG(3) << CEscape(regexp);
205 
206   RE2 re(regexp, RE2::Latin1);
207   ASSERT_EQ(re.error(), "");
208 
209   std::string min, max;
210   if(!re.PossibleMatchRange(&min, &max, 10)) {
211     // There's no good max for "\\C*".  Can't use strcmp
212     // because sometimes it gets embedded in more
213     // complicated expressions.
214     if(strstr(regexp.c_str(), "\\C*"))
215       return;
216     LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp);
217   }
218 
219   strgen_.Reset();
220   while (strgen_.HasNext()) {
221     const StringPiece& s = strgen_.Next();
222     tests_++;
223     if (!RE2::FullMatch(s, re))
224       continue;
225     ASSERT_GE(s, min) << " regexp: " << regexp << " max: " << max;
226     ASSERT_LE(s, max) << " regexp: " << regexp << " min: " << min;
227   }
228 }
229 
TEST(PossibleMatchRange,Exhaustive)230 TEST(PossibleMatchRange, Exhaustive) {
231   int natom = 3;
232   int noperator = 3;
233   int stringlen = 5;
234   if (RE2_DEBUG_MODE) {
235     natom = 2;
236     noperator = 3;
237     stringlen = 3;
238   }
239   PossibleMatchTester t(natom, noperator, Split(" ", "a b [0-9]"),
240                  RegexpGenerator::EgrepOps(),
241                  stringlen, Explode("ab4"));
242   t.Generate();
243   LOG(INFO) << t.regexps() << " regexps, "
244             << t.tests() << " tests";
245 }
246 
247 }  // namespace re2
248