1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Regular expression generator: generates all possible
6 // regular expressions within parameters (see regexp_generator.h for details).
7 
8 // The regexp generator first generates a sequence of commands in a simple
9 // postfix language.  Each command in the language is a string,
10 // like "a" or "%s*" or "%s|%s".
11 //
12 // To evaluate a command, enough arguments are popped from the value stack to
13 // plug into the %s slots.  Then the result is pushed onto the stack.
14 // For example, the command sequence
15 //      a b %s%s c
16 // results in the stack
17 //      ab c
18 //
19 // GeneratePostfix generates all possible command sequences.
20 // Then RunPostfix turns each sequence into a regular expression
21 // and passes the regexp to HandleRegexp.
22 
23 #include <string.h>
24 #include <string>
25 #include <stack>
26 #include <vector>
27 #include "util/test.h"
28 #include "re2/testing/regexp_generator.h"
29 
30 namespace re2 {
31 
32 // Returns a vector of the egrep regexp operators.
EgrepOps()33 const vector<string>& RegexpGenerator::EgrepOps() {
34   static const char *ops[] = {
35     "%s%s",
36     "%s|%s",
37     "%s*",
38     "%s+",
39     "%s?",
40     "%s\\C*",
41   };
42   static vector<string> v(ops, ops + arraysize(ops));
43   return v;
44 }
45 
RegexpGenerator(int maxatoms,int maxops,const vector<string> & atoms,const vector<string> & ops)46 RegexpGenerator::RegexpGenerator(int maxatoms, int maxops,
47                                  const vector<string>& atoms,
48                                  const vector<string>& ops)
49     : maxatoms_(maxatoms), maxops_(maxops), atoms_(atoms), ops_(ops) {
50   // Degenerate case.
51   if (atoms_.size() == 0)
52     maxatoms_ = 0;
53   if (ops_.size() == 0)
54     maxops_ = 0;
55 }
56 
57 // Generates all possible regular expressions (within the parameters),
58 // calling HandleRegexp for each one.
Generate()59 void RegexpGenerator::Generate() {
60   vector<string> postfix;
61   GeneratePostfix(&postfix, 0, 0, 0);
62 }
63 
64 // Generates random regular expressions, calling HandleRegexp for each one.
GenerateRandom(int32 seed,int n)65 void RegexpGenerator::GenerateRandom(int32 seed, int n) {
66   ACMRandom acm(seed);
67   acm_ = &acm;
68 
69   for (int i = 0; i < n; i++) {
70     vector<string> postfix;
71     GenerateRandomPostfix(&postfix, 0, 0, 0);
72   }
73 
74   acm_ = NULL;
75 }
76 
77 // Counts and returns the number of occurrences of "%s" in s.
CountArgs(const string & s)78 static int CountArgs(const string& s) {
79   const char *p = s.c_str();
80   int n = 0;
81   while ((p = strstr(p, "%s")) != NULL) {
82     p += 2;
83     n++;
84   }
85   return n;
86 }
87 
88 // Generates all possible postfix command sequences.
89 // Each sequence is handed off to RunPostfix to generate a regular expression.
90 // The arguments are:
91 //   post:  the current postfix sequence
92 //   nstk:  the number of elements that would be on the stack after executing
93 //          the sequence
94 //   ops:   the number of operators used in the sequence
95 //   atoms: the number of atoms used in the sequence
96 // For example, if post were ["a", "b", "%s%s", "c"],
97 // then nstk = 2, ops = 1, atoms = 3.
98 //
99 // The initial call should be GeneratePostfix([empty vector], 0, 0, 0).
100 //
GeneratePostfix(vector<string> * post,int nstk,int ops,int atoms)101 void RegexpGenerator::GeneratePostfix(vector<string>* post, int nstk,
102                                       int ops, int atoms) {
103   if (nstk == 1)
104     RunPostfix(*post);
105 
106   // Early out: if used too many operators or can't
107   // get back down to a single expression on the stack
108   // using binary operators, give up.
109   if (ops + nstk - 1 > maxops_)
110     return;
111 
112   // Add atoms if there is room.
113   if (atoms < maxatoms_) {
114     for (int i = 0; i < atoms_.size(); i++) {
115       post->push_back(atoms_[i]);
116       GeneratePostfix(post, nstk + 1, ops, atoms + 1);
117       post->pop_back();
118     }
119   }
120 
121   // Add operators if there are enough arguments.
122   if (ops < maxops_) {
123     for (int i = 0; i < ops_.size(); i++) {
124       const string& fmt = ops_[i];
125       int nargs = CountArgs(fmt);
126       if (nargs <= nstk) {
127         post->push_back(fmt);
128         GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms);
129         post->pop_back();
130       }
131     }
132   }
133 }
134 
135 // Generates a random postfix command sequence.
136 // Stops and returns true once a single sequence has been generated.
GenerateRandomPostfix(vector<string> * post,int nstk,int ops,int atoms)137 bool RegexpGenerator::GenerateRandomPostfix(vector<string> *post, int nstk,
138                                             int ops, int atoms) {
139   for (;;) {
140     // Stop if we get to a single element, but only sometimes.
141     if (nstk == 1 && acm_->Uniform(maxatoms_ + 1 - atoms) == 0) {
142       RunPostfix(*post);
143       return true;
144     }
145 
146     // Early out: if used too many operators or can't
147     // get back down to a single expression on the stack
148     // using binary operators, give up.
149     if (ops + nstk - 1 > maxops_)
150       return false;
151 
152     // Add operators if there are enough arguments.
153     if (ops < maxops_ && acm_->Uniform(2) == 0) {
154       const string& fmt = ops_[acm_->Uniform(ops_.size())];
155       int nargs = CountArgs(fmt);
156       if (nargs <= nstk) {
157         post->push_back(fmt);
158         bool ret = GenerateRandomPostfix(post, nstk - nargs + 1,
159                                          ops + 1, atoms);
160         post->pop_back();
161         if (ret)
162           return true;
163       }
164     }
165 
166     // Add atoms if there is room.
167     if (atoms < maxatoms_ && acm_->Uniform(2) == 0) {
168       post->push_back(atoms_[acm_->Uniform(atoms_.size())]);
169       bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1);
170       post->pop_back();
171       if (ret)
172         return true;
173     }
174   }
175 }
176 
177 // Interprets the postfix command sequence to create a regular expression
178 // passed to HandleRegexp.  The results of operators like %s|%s are wrapped
179 // in (?: ) to avoid needing to maintain a precedence table.
RunPostfix(const vector<string> & post)180 void RegexpGenerator::RunPostfix(const vector<string>& post) {
181   stack<string> regexps;
182   for (int i = 0; i < post.size(); i++) {
183     switch (CountArgs(post[i])) {
184       default:
185         LOG(FATAL) << "Bad operator: " << post[i];
186       case 0:
187         regexps.push(post[i]);
188         break;
189       case 1: {
190         string a = regexps.top();
191         regexps.pop();
192         regexps.push("(?:" + StringPrintf(post[i].c_str(), a.c_str()) + ")");
193         break;
194       }
195       case 2: {
196         string b = regexps.top();
197         regexps.pop();
198         string a = regexps.top();
199         regexps.pop();
200         regexps.push("(?:" +
201                      StringPrintf(post[i].c_str(), a.c_str(), b.c_str()) +
202                      ")");
203         break;
204       }
205     }
206   }
207 
208   if (regexps.size() != 1) {
209     // Internal error - should never happen.
210     printf("Bad regexp program:\n");
211     for (int i = 0; i < post.size(); i++) {
212       printf("  %s\n", CEscape(post[i]).c_str());
213     }
214     printf("Stack after running program:\n");
215     while (!regexps.empty()) {
216       printf("  %s\n", CEscape(regexps.top()).c_str());
217       regexps.pop();
218     }
219     LOG(FATAL) << "Bad regexp program.";
220   }
221 
222   HandleRegexp(regexps.top());
223   HandleRegexp("^(?:" + regexps.top() + ")$");
224   HandleRegexp("^(?:" + regexps.top() + ")");
225   HandleRegexp("(?:" + regexps.top() + ")$");
226 }
227 
228 // Split s into an vector of strings, one for each UTF-8 character.
Explode(const StringPiece & s)229 vector<string> Explode(const StringPiece& s) {
230   vector<string> v;
231 
232   for (const char *q = s.begin(); q < s.end(); ) {
233     const char* p = q;
234     Rune r;
235     q += chartorune(&r, q);
236     v.push_back(string(p, q - p));
237   }
238 
239   return v;
240 }
241 
242 // Split string everywhere a substring is found, returning
243 // vector of pieces.
Split(const StringPiece & sep,const StringPiece & s)244 vector<string> Split(const StringPiece& sep, const StringPiece& s) {
245   vector<string> v;
246 
247   if (sep.size() == 0)
248     return Explode(s);
249 
250   const char *p = s.begin();
251   for (const char *q = s.begin(); q + sep.size() <= s.end(); q++) {
252     if (StringPiece(q, sep.size()) == sep) {
253       v.push_back(string(p, q - p));
254       p = q + sep.size();
255       q = p - 1;  // -1 for ++ in loop
256       continue;
257     }
258   }
259   if (p < s.end())
260     v.push_back(string(p, s.end() - p));
261   return v;
262 }
263 
264 }  // namespace re2
265