1 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Rewrite POSIX and other features in re
6 // to use simple extended regular expression features.
7 // Also sort and simplify character classes.
8 
9 #include "util/util.h"
10 #include "re2/regexp.h"
11 #include "re2/walker-inl.h"
12 
13 namespace re2 {
14 
15 // Parses the regexp src and then simplifies it and sets *dst to the
16 // string representation of the simplified form.  Returns true on success.
17 // Returns false and sets *error (if error != NULL) on error.
SimplifyRegexp(const StringPiece & src,ParseFlags flags,string * dst,RegexpStatus * status)18 bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
19                             string* dst,
20                             RegexpStatus* status) {
21   Regexp* re = Parse(src, flags, status);
22   if (re == NULL)
23     return false;
24   Regexp* sre = re->Simplify();
25   re->Decref();
26   if (sre == NULL) {
27     // Should not happen, since Simplify never fails.
28     LOG(ERROR) << "Simplify failed on " << src;
29     if (status) {
30       status->set_code(kRegexpInternalError);
31       status->set_error_arg(src);
32     }
33     return false;
34   }
35   *dst = sre->ToString();
36   sre->Decref();
37   return true;
38 }
39 
40 // Assuming the simple_ flags on the children are accurate,
41 // is this Regexp* simple?
ComputeSimple()42 bool Regexp::ComputeSimple() {
43   Regexp** subs;
44   switch (op_) {
45     case kRegexpNoMatch:
46     case kRegexpEmptyMatch:
47     case kRegexpLiteral:
48     case kRegexpLiteralString:
49     case kRegexpBeginLine:
50     case kRegexpEndLine:
51     case kRegexpBeginText:
52     case kRegexpWordBoundary:
53     case kRegexpNoWordBoundary:
54     case kRegexpEndText:
55     case kRegexpAnyChar:
56     case kRegexpAnyByte:
57     case kRegexpHaveMatch:
58       return true;
59     case kRegexpConcat:
60     case kRegexpAlternate:
61       // These are simple as long as the subpieces are simple.
62       subs = sub();
63       for (int i = 0; i < nsub_; i++)
64         if (!subs[i]->simple_)
65           return false;
66       return true;
67     case kRegexpCharClass:
68       // Simple as long as the char class is not empty, not full.
69       if (ccb_ != NULL)
70         return !ccb_->empty() && !ccb_->full();
71       return !cc_->empty() && !cc_->full();
72     case kRegexpCapture:
73       subs = sub();
74       return subs[0]->simple_;
75     case kRegexpStar:
76     case kRegexpPlus:
77     case kRegexpQuest:
78       subs = sub();
79       if (!subs[0]->simple_)
80         return false;
81       switch (subs[0]->op_) {
82         case kRegexpStar:
83         case kRegexpPlus:
84         case kRegexpQuest:
85         case kRegexpEmptyMatch:
86         case kRegexpNoMatch:
87           return false;
88         default:
89           break;
90       }
91       return true;
92     case kRegexpRepeat:
93       return false;
94   }
95   LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
96   return false;
97 }
98 
99 // Walker subclass used by Simplify.
100 // The simplify walk is purely post-recursive: given the simplified children,
101 // PostVisit creates the simplified result.
102 // The child_args are simplified Regexp*s.
103 class SimplifyWalker : public Regexp::Walker<Regexp*> {
104  public:
SimplifyWalker()105   SimplifyWalker() {}
106   virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
107   virtual Regexp* PostVisit(Regexp* re,
108                             Regexp* parent_arg,
109                             Regexp* pre_arg,
110                             Regexp** child_args, int nchild_args);
111   virtual Regexp* Copy(Regexp* re);
112   virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
113 
114  private:
115   // These functions are declared inside SimplifyWalker so that
116   // they can edit the private fields of the Regexps they construct.
117 
118   // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
119   // Caller must Decref return value when done with it.
120   static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
121 
122   // Simplifies the expression re{min,max} in terms of *, +, and ?.
123   // Returns a new regexp.  Does not edit re.  Does not consume reference to re.
124   // Caller must Decref return value when done with it.
125   static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
126                                 Regexp::ParseFlags parse_flags);
127 
128   // Simplifies a character class by expanding any named classes
129   // into rune ranges.  Does not edit re.  Does not consume ref to re.
130   // Caller must Decref return value when done with it.
131   static Regexp* SimplifyCharClass(Regexp* re);
132 
133   DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker);
134 };
135 
136 // Simplifies a regular expression, returning a new regexp.
137 // The new regexp uses traditional Unix egrep features only,
138 // plus the Perl (?:) non-capturing parentheses.
139 // Otherwise, no POSIX or Perl additions.  The new regexp
140 // captures exactly the same subexpressions (with the same indices)
141 // as the original.
142 // Does not edit current object.
143 // Caller must Decref() return value when done with it.
144 
Simplify()145 Regexp* Regexp::Simplify() {
146   if (simple_)
147     return Incref();
148   SimplifyWalker w;
149   return w.Walk(this, NULL);
150 }
151 
152 #define Simplify DontCallSimplify  // Avoid accidental recursion
153 
Copy(Regexp * re)154 Regexp* SimplifyWalker::Copy(Regexp* re) {
155   return re->Incref();
156 }
157 
ShortVisit(Regexp * re,Regexp * parent_arg)158 Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
159   // This should never be called, since we use Walk and not
160   // WalkExponential.
161   LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
162   return re->Incref();
163 }
164 
PreVisit(Regexp * re,Regexp * parent_arg,bool * stop)165 Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
166   if (re->simple_) {
167     *stop = true;
168     return re->Incref();
169   }
170   return NULL;
171 }
172 
PostVisit(Regexp * re,Regexp * parent_arg,Regexp * pre_arg,Regexp ** child_args,int nchild_args)173 Regexp* SimplifyWalker::PostVisit(Regexp* re,
174                                   Regexp* parent_arg,
175                                   Regexp* pre_arg,
176                                   Regexp** child_args,
177                                   int nchild_args) {
178   switch (re->op()) {
179     case kRegexpNoMatch:
180     case kRegexpEmptyMatch:
181     case kRegexpLiteral:
182     case kRegexpLiteralString:
183     case kRegexpBeginLine:
184     case kRegexpEndLine:
185     case kRegexpBeginText:
186     case kRegexpWordBoundary:
187     case kRegexpNoWordBoundary:
188     case kRegexpEndText:
189     case kRegexpAnyChar:
190     case kRegexpAnyByte:
191     case kRegexpHaveMatch:
192       // All these are always simple.
193       re->simple_ = true;
194       return re->Incref();
195 
196     case kRegexpConcat:
197     case kRegexpAlternate: {
198       // These are simple as long as the subpieces are simple.
199       // Two passes to avoid allocation in the common case.
200       bool changed = false;
201       Regexp** subs = re->sub();
202       for (int i = 0; i < re->nsub_; i++) {
203         Regexp* sub = subs[i];
204         Regexp* newsub = child_args[i];
205         if (newsub != sub) {
206           changed = true;
207           break;
208         }
209       }
210       if (!changed) {
211         for (int i = 0; i < re->nsub_; i++) {
212           Regexp* newsub = child_args[i];
213           newsub->Decref();
214         }
215         re->simple_ = true;
216         return re->Incref();
217       }
218       Regexp* nre = new Regexp(re->op(), re->parse_flags());
219       nre->AllocSub(re->nsub_);
220       Regexp** nre_subs = nre->sub();
221       for (int i = 0; i <re->nsub_; i++)
222         nre_subs[i] = child_args[i];
223       nre->simple_ = true;
224       return nre;
225     }
226 
227     case kRegexpCapture: {
228       Regexp* newsub = child_args[0];
229       if (newsub == re->sub()[0]) {
230         newsub->Decref();
231         re->simple_ = true;
232         return re->Incref();
233       }
234       Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
235       nre->AllocSub(1);
236       nre->sub()[0] = newsub;
237       nre->cap_ = re->cap_;
238       nre->simple_ = true;
239       return nre;
240     }
241 
242     case kRegexpStar:
243     case kRegexpPlus:
244     case kRegexpQuest: {
245       Regexp* newsub = child_args[0];
246       // Special case: repeat the empty string as much as
247       // you want, but it's still the empty string.
248       if (newsub->op() == kRegexpEmptyMatch)
249         return newsub;
250 
251       // These are simple as long as the subpiece is simple.
252       if (newsub == re->sub()[0]) {
253         newsub->Decref();
254         re->simple_ = true;
255         return re->Incref();
256       }
257 
258       // These are also idempotent if flags are constant.
259       if (re->op() == newsub->op() &&
260           re->parse_flags() == newsub->parse_flags())
261         return newsub;
262 
263       Regexp* nre = new Regexp(re->op(), re->parse_flags());
264       nre->AllocSub(1);
265       nre->sub()[0] = newsub;
266       nre->simple_ = true;
267       return nre;
268     }
269 
270     case kRegexpRepeat: {
271       Regexp* newsub = child_args[0];
272       // Special case: repeat the empty string as much as
273       // you want, but it's still the empty string.
274       if (newsub->op() == kRegexpEmptyMatch)
275         return newsub;
276 
277       Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
278                                    re->parse_flags());
279       newsub->Decref();
280       nre->simple_ = true;
281       return nre;
282     }
283 
284     case kRegexpCharClass: {
285       Regexp* nre = SimplifyCharClass(re);
286       nre->simple_ = true;
287       return nre;
288     }
289   }
290 
291   LOG(ERROR) << "Simplify case not handled: " << re->op();
292   return re->Incref();
293 }
294 
295 // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
296 // Returns a new Regexp, handing the ref to the caller.
Concat2(Regexp * re1,Regexp * re2,Regexp::ParseFlags parse_flags)297 Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
298                                 Regexp::ParseFlags parse_flags) {
299   Regexp* re = new Regexp(kRegexpConcat, parse_flags);
300   re->AllocSub(2);
301   Regexp** subs = re->sub();
302   subs[0] = re1;
303   subs[1] = re2;
304   return re;
305 }
306 
307 // Simplifies the expression re{min,max} in terms of *, +, and ?.
308 // Returns a new regexp.  Does not edit re.  Does not consume reference to re.
309 // Caller must Decref return value when done with it.
310 // The result will *not* necessarily have the right capturing parens
311 // if you call ToString() and re-parse it: (x){2} becomes (x)(x),
312 // but in the Regexp* representation, both (x) are marked as $1.
SimplifyRepeat(Regexp * re,int min,int max,Regexp::ParseFlags f)313 Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
314                                        Regexp::ParseFlags f) {
315   // x{n,} means at least n matches of x.
316   if (max == -1) {
317     // Special case: x{0,} is x*
318     if (min == 0)
319       return Regexp::Star(re->Incref(), f);
320 
321     // Special case: x{1,} is x+
322     if (min == 1)
323       return Regexp::Plus(re->Incref(), f);
324 
325     // General case: x{4,} is xxxx+
326     Regexp* nre = new Regexp(kRegexpConcat, f);
327     nre->AllocSub(min);
328     VLOG(1) << "Simplify " << min;
329     Regexp** nre_subs = nre->sub();
330     for (int i = 0; i < min-1; i++)
331       nre_subs[i] = re->Incref();
332     nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
333     return nre;
334   }
335 
336   // Special case: (x){0} matches only empty string.
337   if (min == 0 && max == 0)
338     return new Regexp(kRegexpEmptyMatch, f);
339 
340   // Special case: x{1} is just x.
341   if (min == 1 && max == 1)
342     return re->Incref();
343 
344   // General case: x{n,m} means n copies of x and m copies of x?.
345   // The machine will do less work if we nest the final m copies,
346   // so that x{2,5} = xx(x(x(x)?)?)?
347 
348   // Build leading prefix: xx.  Capturing only on the last one.
349   Regexp* nre = NULL;
350   if (min > 0) {
351     nre = new Regexp(kRegexpConcat, f);
352     nre->AllocSub(min);
353     Regexp** nre_subs = nre->sub();
354     for (int i = 0; i < min; i++)
355       nre_subs[i] = re->Incref();
356   }
357 
358   // Build and attach suffix: (x(x(x)?)?)?
359   if (max > min) {
360     Regexp* suf = Regexp::Quest(re->Incref(), f);
361     for (int i = min+1; i < max; i++)
362       suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
363     if (nre == NULL)
364       nre = suf;
365     else
366       nre = Concat2(nre, suf, f);
367   }
368 
369   if (nre == NULL) {
370     // Some degenerate case, like min > max, or min < max < 0.
371     // This shouldn't happen, because the parser rejects such regexps.
372     LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
373     return new Regexp(kRegexpNoMatch, f);
374   }
375 
376   return nre;
377 }
378 
379 // Simplifies a character class.
380 // Caller must Decref return value when done with it.
SimplifyCharClass(Regexp * re)381 Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
382   CharClass* cc = re->cc();
383 
384   // Special cases
385   if (cc->empty())
386     return new Regexp(kRegexpNoMatch, re->parse_flags());
387   if (cc->full())
388     return new Regexp(kRegexpAnyChar, re->parse_flags());
389 
390   return re->Incref();
391 }
392 
393 }  // namespace re2
394