1 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #ifndef RE2_WALKER_INL_H_
6 #define RE2_WALKER_INL_H_
7 
8 // Helper class for traversing Regexps without recursion.
9 // Clients should declare their own subclasses that override
10 // the PreVisit and PostVisit methods, which are called before
11 // and after visiting the subexpressions.
12 
13 // Not quite the Visitor pattern, because (among other things)
14 // the Visitor pattern is recursive.
15 
16 #include <stack>
17 
18 #include "util/logging.h"
19 #include "re2/regexp.h"
20 
21 namespace re2 {
22 
23 template<typename T> struct WalkState;
24 
25 template<typename T> class Regexp::Walker {
26  public:
27   Walker();
28   virtual ~Walker();
29 
30   // Virtual method called before visiting re's children.
31   // PreVisit passes ownership of its return value to its caller.
32   // The Arg* that PreVisit returns will be passed to PostVisit as pre_arg
33   // and passed to the child PreVisits and PostVisits as parent_arg.
34   // At the top-most Regexp, parent_arg is arg passed to walk.
35   // If PreVisit sets *stop to true, the walk does not recurse
36   // into the children.  Instead it behaves as though the return
37   // value from PreVisit is the return value from PostVisit.
38   // The default PreVisit returns parent_arg.
39   virtual T PreVisit(Regexp* re, T parent_arg, bool* stop);
40 
41   // Virtual method called after visiting re's children.
42   // The pre_arg is the T that PreVisit returned.
43   // The child_args is a vector of the T that the child PostVisits returned.
44   // PostVisit takes ownership of pre_arg.
45   // PostVisit takes ownership of the Ts
46   // in *child_args, but not the vector itself.
47   // PostVisit passes ownership of its return value
48   // to its caller.
49   // The default PostVisit simply returns pre_arg.
50   virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg,
51                       T* child_args, int nchild_args);
52 
53   // Virtual method called to copy a T,
54   // when Walk notices that more than one child is the same re.
55   virtual T Copy(T arg);
56 
57   // Virtual method called to do a "quick visit" of the re,
58   // but not its children.  Only called once the visit budget
59   // has been used up and we're trying to abort the walk
60   // as quickly as possible.  Should return a value that
61   // makes sense for the parent PostVisits still to be run.
62   // This function is (hopefully) only called by
63   // WalkExponential, but must be implemented by all clients,
64   // just in case.
65   virtual T ShortVisit(Regexp* re, T parent_arg) = 0;
66 
67   // Walks over a regular expression.
68   // Top_arg is passed as parent_arg to PreVisit and PostVisit of re.
69   // Returns the T returned by PostVisit on re.
70   T Walk(Regexp* re, T top_arg);
71 
72   // Like Walk, but doesn't use Copy.  This can lead to
73   // exponential runtimes on cross-linked Regexps like the
74   // ones generated by Simplify.  To help limit this,
75   // at most max_visits nodes will be visited and then
76   // the walk will be cut off early.
77   // If the walk *is* cut off early, ShortVisit(re)
78   // will be called on regexps that cannot be fully
79   // visited rather than calling PreVisit/PostVisit.
80   T WalkExponential(Regexp* re, T top_arg, int max_visits);
81 
82   // Clears the stack.  Should never be necessary, since
83   // Walk always enters and exits with an empty stack.
84   // Logs DFATAL if stack is not already clear.
85   void Reset();
86 
87   // Returns whether walk was cut off.
stopped_early()88   bool stopped_early() { return stopped_early_; }
89 
90  private:
91   // Walk state for the entire traversal.
92   std::stack<WalkState<T>> stack_;
93   bool stopped_early_;
94   int max_visits_;
95 
96   T WalkInternal(Regexp* re, T top_arg, bool use_copy);
97 
98   Walker(const Walker&) = delete;
99   Walker& operator=(const Walker&) = delete;
100 };
101 
PreVisit(Regexp * re,T parent_arg,bool * stop)102 template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re,
103                                                    T parent_arg,
104                                                    bool* stop) {
105   return parent_arg;
106 }
107 
PostVisit(Regexp * re,T parent_arg,T pre_arg,T * child_args,int nchild_args)108 template<typename T> T Regexp::Walker<T>::PostVisit(Regexp* re,
109                                                     T parent_arg,
110                                                     T pre_arg,
111                                                     T* child_args,
112                                                     int nchild_args) {
113   return pre_arg;
114 }
115 
Copy(T arg)116 template<typename T> T Regexp::Walker<T>::Copy(T arg) {
117   return arg;
118 }
119 
120 // State about a single level in the traversal.
121 template<typename T> struct WalkState {
WalkStateWalkState122   WalkState<T>(Regexp* re, T parent)
123     : re(re),
124       n(-1),
125       parent_arg(parent),
126       child_args(NULL) { }
127 
128   Regexp* re;  // The regexp
129   int n;  // The index of the next child to process; -1 means need to PreVisit
130   T parent_arg;  // Accumulated arguments.
131   T pre_arg;
132   T child_arg;  // One-element buffer for child_args.
133   T* child_args;
134 };
135 
Walker()136 template<typename T> Regexp::Walker<T>::Walker() {
137   stopped_early_ = false;
138 }
139 
~Walker()140 template<typename T> Regexp::Walker<T>::~Walker() {
141   Reset();
142 }
143 
144 // Clears the stack.  Should never be necessary, since
145 // Walk always enters and exits with an empty stack.
146 // Logs DFATAL if stack is not already clear.
Reset()147 template<typename T> void Regexp::Walker<T>::Reset() {
148   if (!stack_.empty()) {
149     LOG(DFATAL) << "Stack not empty.";
150     while (!stack_.empty()) {
151       delete[] stack_.top().child_args;
152       stack_.pop();
153     }
154   }
155 }
156 
WalkInternal(Regexp * re,T top_arg,bool use_copy)157 template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg,
158                                                        bool use_copy) {
159   Reset();
160 
161   if (re == NULL) {
162     LOG(DFATAL) << "Walk NULL";
163     return top_arg;
164   }
165 
166   stack_.push(WalkState<T>(re, top_arg));
167 
168   WalkState<T>* s;
169   for (;;) {
170     T t;
171     s = &stack_.top();
172     Regexp* re = s->re;
173     switch (s->n) {
174       case -1: {
175         if (--max_visits_ < 0) {
176           stopped_early_ = true;
177           t = ShortVisit(re, s->parent_arg);
178           break;
179         }
180         bool stop = false;
181         s->pre_arg = PreVisit(re, s->parent_arg, &stop);
182         if (stop) {
183           t = s->pre_arg;
184           break;
185         }
186         s->n = 0;
187         s->child_args = NULL;
188         if (re->nsub_ == 1)
189           s->child_args = &s->child_arg;
190         else if (re->nsub_ > 1)
191           s->child_args = new T[re->nsub_];
192         FALLTHROUGH_INTENDED;
193       }
194       default: {
195         if (re->nsub_ > 0) {
196           Regexp** sub = re->sub();
197           if (s->n < re->nsub_) {
198             if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) {
199               s->child_args[s->n] = Copy(s->child_args[s->n - 1]);
200               s->n++;
201             } else {
202               stack_.push(WalkState<T>(sub[s->n], s->pre_arg));
203             }
204             continue;
205           }
206         }
207 
208         t = PostVisit(re, s->parent_arg, s->pre_arg, s->child_args, s->n);
209         if (re->nsub_ > 1)
210           delete[] s->child_args;
211         break;
212       }
213     }
214 
215     // We've finished stack_.top().
216     // Update next guy down.
217     stack_.pop();
218     if (stack_.empty())
219       return t;
220     s = &stack_.top();
221     if (s->child_args != NULL)
222       s->child_args[s->n] = t;
223     else
224       s->child_arg = t;
225     s->n++;
226   }
227 }
228 
Walk(Regexp * re,T top_arg)229 template<typename T> T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) {
230   // Without the exponential walking behavior,
231   // this budget should be more than enough for any
232   // regexp, and yet not enough to get us in trouble
233   // as far as CPU time.
234   max_visits_ = 1000000;
235   return WalkInternal(re, top_arg, true);
236 }
237 
WalkExponential(Regexp * re,T top_arg,int max_visits)238 template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg,
239                                                           int max_visits) {
240   max_visits_ = max_visits;
241   return WalkInternal(re, top_arg, false);
242 }
243 
244 }  // namespace re2
245 
246 #endif  // RE2_WALKER_INL_H_
247