1 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Helper class for traversing Regexps without recursion.
6 // Clients should declare their own subclasses that override
7 // the PreVisit and PostVisit methods, which are called before
8 // and after visiting the subexpressions.
9 
10 // Not quite the Visitor pattern, because (among other things)
11 // the Visitor pattern is recursive.
12 
13 #ifndef RE2_WALKER_INL_H__
14 #define RE2_WALKER_INL_H__
15 
16 #include "re2/regexp.h"
17 
18 namespace re2 {
19 
20 template<typename T> struct WalkState;
21 
22 template<typename T> class Regexp::Walker {
23  public:
24   Walker();
25   virtual ~Walker();
26 
27   // Virtual method called before visiting re's children.
28   // PreVisit passes ownership of its return value to its caller.
29   // The Arg* that PreVisit returns will be passed to PostVisit as pre_arg
30   // and passed to the child PreVisits and PostVisits as parent_arg.
31   // At the top-most Regexp, parent_arg is arg passed to walk.
32   // If PreVisit sets *stop to true, the walk does not recurse
33   // into the children.  Instead it behaves as though the return
34   // value from PreVisit is the return value from PostVisit.
35   // The default PreVisit returns parent_arg.
36   virtual T PreVisit(Regexp* re, T parent_arg, bool* stop);
37 
38   // Virtual method called after visiting re's children.
39   // The pre_arg is the T that PreVisit returned.
40   // The child_args is a vector of the T that the child PostVisits returned.
41   // PostVisit takes ownership of pre_arg.
42   // PostVisit takes ownership of the Ts
43   // in *child_args, but not the vector itself.
44   // PostVisit passes ownership of its return value
45   // to its caller.
46   // The default PostVisit simply returns pre_arg.
47   virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg,
48                       T* child_args, int nchild_args);
49 
50   // Virtual method called to copy a T,
51   // when Walk notices that more than one child is the same re.
52   virtual T Copy(T arg);
53 
54   // Virtual method called to do a "quick visit" of the re,
55   // but not its children.  Only called once the visit budget
56   // has been used up and we're trying to abort the walk
57   // as quickly as possible.  Should return a value that
58   // makes sense for the parent PostVisits still to be run.
59   // This function is (hopefully) only called by
60   // WalkExponential, but must be implemented by all clients,
61   // just in case.
62   virtual T ShortVisit(Regexp* re, T parent_arg) = 0;
63 
64   // Walks over a regular expression.
65   // Top_arg is passed as parent_arg to PreVisit and PostVisit of re.
66   // Returns the T returned by PostVisit on re.
67   T Walk(Regexp* re, T top_arg);
68 
69   // Like Walk, but doesn't use Copy.  This can lead to
70   // exponential runtimes on cross-linked Regexps like the
71   // ones generated by Simplify.  To help limit this,
72   // at most max_visits nodes will be visited and then
73   // the walk will be cut off early.
74   // If the walk *is* cut off early, ShortVisit(re)
75   // will be called on regexps that cannot be fully
76   // visited rather than calling PreVisit/PostVisit.
77   T WalkExponential(Regexp* re, T top_arg, int max_visits);
78 
79   // Clears the stack.  Should never be necessary, since
80   // Walk always enters and exits with an empty stack.
81   // Logs DFATAL if stack is not already clear.
82   void Reset();
83 
84   // Returns whether walk was cut off.
stopped_early()85   bool stopped_early() { return stopped_early_; }
86 
87  private:
88   // Walk state for the entire traversal.
89   stack<WalkState<T> >* stack_;
90   bool stopped_early_;
91   int max_visits_;
92 
93   T WalkInternal(Regexp* re, T top_arg, bool use_copy);
94 
95   DISALLOW_EVIL_CONSTRUCTORS(Walker);
96 };
97 
PreVisit(Regexp * re,T parent_arg,bool * stop)98 template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re,
99                                                    T parent_arg,
100                                                    bool* stop) {
101   return parent_arg;
102 }
103 
PostVisit(Regexp * re,T parent_arg,T pre_arg,T * child_args,int nchild_args)104 template<typename T> T Regexp::Walker<T>::PostVisit(Regexp* re,
105                                                     T parent_arg,
106                                                     T pre_arg,
107                                                     T* child_args,
108                                                     int nchild_args) {
109   return pre_arg;
110 }
111 
Copy(T arg)112 template<typename T> T Regexp::Walker<T>::Copy(T arg) {
113   return arg;
114 }
115 
116 // State about a single level in the traversal.
117 template<typename T> struct WalkState {
WalkStateWalkState118   WalkState<T>(Regexp* re, T parent)
119     : re(re),
120       n(-1),
121       parent_arg(parent),
122       child_args(NULL) { }
123 
124   Regexp* re;  // The regexp
125   int n;  // The index of the next child to process; -1 means need to PreVisit
126   T parent_arg;  // Accumulated arguments.
127   T pre_arg;
128   T child_arg;  // One-element buffer for child_args.
129   T* child_args;
130 };
131 
Walker()132 template<typename T> Regexp::Walker<T>::Walker() {
133   stack_ = new stack<WalkState<T> >;
134   stopped_early_ = false;
135 }
136 
~Walker()137 template<typename T> Regexp::Walker<T>::~Walker() {
138   Reset();
139   delete stack_;
140 }
141 
142 // Clears the stack.  Should never be necessary, since
143 // Walk always enters and exits with an empty stack.
144 // Logs DFATAL if stack is not already clear.
Reset()145 template<typename T> void Regexp::Walker<T>::Reset() {
146   if (stack_ && stack_->size() > 0) {
147     LOG(DFATAL) << "Stack not empty.";
148     while (stack_->size() > 0) {
149       delete stack_->top().child_args;
150       stack_->pop();
151     }
152   }
153 }
154 
WalkInternal(Regexp * re,T top_arg,bool use_copy)155 template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
156                                                        bool use_copy) {
157   Reset();
158 
159   if (re == NULL) {
160     LOG(DFATAL) << "Walk NULL";
161     return top_arg;
162   }
163 
164   stack_->push(WalkState<T>(re, top_arg));
165 
166   WalkState<T>* s;
167   for (;;) {
168     T t;
169     s = &stack_->top();
170     Regexp* re = s->re;
171     switch (s->n) {
172       case -1: {
173         if (--max_visits_ < 0) {
174           stopped_early_ = true;
175           t = ShortVisit(re, s->parent_arg);
176           break;
177         }
178         bool stop = false;
179         s->pre_arg = PreVisit(re, s->parent_arg, &stop);
180         if (stop) {
181           t = s->pre_arg;
182           break;
183         }
184         s->n = 0;
185         s->child_args = NULL;
186         if (re->nsub_ == 1)
187           s->child_args = &s->child_arg;
188         else if (re->nsub_ > 1)
189           s->child_args = new T[re->nsub_];
190         // Fall through.
191       }
192       default: {
193         if (re->nsub_ > 0) {
194           Regexp** sub = re->sub();
195           if (s->n < re->nsub_) {
196             if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) {
197               s->child_args[s->n] = Copy(s->child_args[s->n - 1]);
198               s->n++;
199             } else {
200               stack_->push(WalkState<T>(sub[s->n], s->pre_arg));
201             }
202             continue;
203           }
204         }
205 
206         t = PostVisit(re, s->parent_arg, s->pre_arg, s->child_args, s->n);
207         if (re->nsub_ > 1)
208           delete[] s->child_args;
209         break;
210       }
211     }
212 
213     // We've finished stack_->top().
214     // Update next guy down.
215     stack_->pop();
216     if (stack_->size() == 0)
217       return t;
218     s = &stack_->top();
219     if (s->child_args != NULL)
220       s->child_args[s->n] = t;
221     else
222       s->child_arg = t;
223     s->n++;
224   }
225 }
226 
Walk(Regexp * re,T top_arg)227 template<typename T> T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) {
228   // Without the exponential walking behavior,
229   // this budget should be more than enough for any
230   // regexp, and yet not enough to get us in trouble
231   // as far as CPU time.
232   max_visits_ = 1000000;
233   return WalkInternal(re, top_arg, true);
234 }
235 
WalkExponential(Regexp * re,T top_arg,int max_visits)236 template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg,
237                                                           int max_visits) {
238   max_visits_ = max_visits;
239   return WalkInternal(re, top_arg, false);
240 }
241 
242 }  // namespace re2
243 
244 #endif  // RE2_WALKER_INL_H__
245