1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Tested by search_test.cc, exhaustive_test.cc, tester.cc
6 //
7 // Prog::UnsafeSearchBacktrack is a backtracking regular expression search,
8 // except that it remembers where it has been, trading a lot of
9 // memory for a lot of time. It exists only for testing purposes.
10 //
11 // Let me repeat that.
12 //
13 // THIS CODE SHOULD NEVER BE USED IN PRODUCTION:
14 //   - It uses a ton of memory.
15 //   - It uses a ton of stack.
16 //   - It uses CHECK and LOG(FATAL).
17 //   - It implements unanchored search by repeated anchored search.
18 //
19 // On the other hand, it is very simple and a good reference
20 // implementation for the more complicated regexp packages.
21 //
22 // In BUILD, this file is linked into the ":testing" library,
23 // not the main library, in order to make it harder to pick up
24 // accidentally.
25 
26 #include <stddef.h>
27 #include <stdint.h>
28 #include <string.h>
29 
30 #include "util/util.h"
31 #include "util/logging.h"
32 #include "re2/pod_array.h"
33 #include "re2/prog.h"
34 #include "re2/regexp.h"
35 
36 namespace re2 {
37 
38 // Backtracker holds the state for a backtracking search.
39 //
40 // Excluding the search parameters, the main search state
41 // is just the "capture registers", which record, for the
42 // current execution, the string position at which each
43 // parenthesis was passed.  cap_[0] and cap_[1] are the
44 // left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc.
45 //
46 // To avoid infinite loops during backtracking on expressions
47 // like (a*)*, the visited_[] bitmap marks the (state, string-position)
48 // pairs that have already been explored and are thus not worth
49 // re-exploring if we get there via another path.  Modern backtracking
50 // libraries engineer their program representation differently, to make
51 // such infinite loops possible to avoid without keeping a giant visited_
52 // bitmap, but visited_ works fine for a reference implementation
53 // and it has the nice benefit of making the search run in linear time.
54 class Backtracker {
55  public:
56   explicit Backtracker(Prog* prog);
57 
58   bool Search(const StringPiece& text, const StringPiece& context,
59               bool anchored, bool longest,
60               StringPiece* submatch, int nsubmatch);
61 
62  private:
63   // Explores from instruction id at string position p looking for a match.
64   // Returns true if found (so that caller can stop trying other possibilities).
65   bool Visit(int id, const char* p);
66 
67   // Tries instruction id at string position p.
68   // Returns true if a match is found.
69   bool Try(int id, const char* p);
70 
71   // Search parameters
72   Prog* prog_;              // program being run
73   StringPiece text_;        // text being searched
74   StringPiece context_;     // greater context of text being searched
75   bool anchored_;           // whether search is anchored at text.begin()
76   bool longest_;            // whether search wants leftmost-longest match
77   bool endmatch_;           // whether search must end at text.end()
78   StringPiece *submatch_;   // submatches to fill in
79   int nsubmatch_;           //   # of submatches to fill in
80 
81   // Search state
82   const char* cap_[64];         // capture registers
83   PODArray<uint32_t> visited_;  // bitmap: (Inst*, char*) pairs visited
84 
85   Backtracker(const Backtracker&) = delete;
86   Backtracker& operator=(const Backtracker&) = delete;
87 };
88 
Backtracker(Prog * prog)89 Backtracker::Backtracker(Prog* prog)
90   : prog_(prog),
91     anchored_(false),
92     longest_(false),
93     endmatch_(false),
94     submatch_(NULL),
95     nsubmatch_(0) {
96 }
97 
98 // Runs a backtracking search.
Search(const StringPiece & text,const StringPiece & context,bool anchored,bool longest,StringPiece * submatch,int nsubmatch)99 bool Backtracker::Search(const StringPiece& text, const StringPiece& context,
100                          bool anchored, bool longest,
101                          StringPiece* submatch, int nsubmatch) {
102   text_ = text;
103   context_ = context;
104   if (context_.data() == NULL)
105     context_ = text;
106   if (prog_->anchor_start() && text.begin() > context_.begin())
107     return false;
108   if (prog_->anchor_end() && text.end() < context_.end())
109     return false;
110   anchored_ = anchored | prog_->anchor_start();
111   longest_ = longest | prog_->anchor_end();
112   endmatch_ = prog_->anchor_end();
113   submatch_ = submatch;
114   nsubmatch_ = nsubmatch;
115   CHECK_LT(2*nsubmatch_, static_cast<int>(arraysize(cap_)));
116   memset(cap_, 0, sizeof cap_);
117 
118   // We use submatch_[0] for our own bookkeeping,
119   // so it had better exist.
120   StringPiece sp0;
121   if (nsubmatch < 1) {
122     submatch_ = &sp0;
123     nsubmatch_ = 1;
124   }
125   submatch_[0] = StringPiece();
126 
127   // Allocate new visited_ bitmap -- size is proportional
128   // to text, so have to reallocate on each call to Search.
129   int nvisited = prog_->size() * static_cast<int>(text.size()+1);
130   nvisited = (nvisited + 31) / 32;
131   visited_ = PODArray<uint32_t>(nvisited);
132   memset(visited_.data(), 0, nvisited*sizeof visited_[0]);
133 
134   // Anchored search must start at text.begin().
135   if (anchored_) {
136     cap_[0] = text.data();
137     return Visit(prog_->start(), text.data());
138   }
139 
140   // Unanchored search, starting from each possible text position.
141   // Notice that we have to try the empty string at the end of
142   // the text, so the loop condition is p <= text.end(), not p < text.end().
143   for (const char* p = text.data(); p <= text.data() + text.size(); p++) {
144     cap_[0] = p;
145     if (Visit(prog_->start(), p))  // Match must be leftmost; done.
146       return true;
147     // Avoid invoking undefined behavior (arithmetic on a null pointer)
148     // by simply not continuing the loop.
149     if (p == NULL)
150       break;
151   }
152   return false;
153 }
154 
155 // Explores from instruction id at string position p looking for a match.
156 // Return true if found (so that caller can stop trying other possibilities).
Visit(int id,const char * p)157 bool Backtracker::Visit(int id, const char* p) {
158   // Check bitmap.  If we've already explored from here,
159   // either it didn't match or it did but we're hoping for a better match.
160   // Either way, don't go down that road again.
161   CHECK(p <= text_.data() + text_.size());
162   int n = id * static_cast<int>(text_.size()+1) +
163           static_cast<int>(p-text_.data());
164   CHECK_LT(n/32, visited_.size());
165   if (visited_[n/32] & (1 << (n&31)))
166     return false;
167   visited_[n/32] |= 1 << (n&31);
168 
169   Prog::Inst* ip = prog_->inst(id);
170   if (Try(id, p)) {
171     if (longest_ && !ip->last())
172       Visit(id+1, p);
173     return true;
174   }
175   if (!ip->last())
176     return Visit(id+1, p);
177   return false;
178 }
179 
180 // Tries instruction id at string position p.
181 // Returns true if a match is found.
Try(int id,const char * p)182 bool Backtracker::Try(int id, const char* p) {
183   // Pick out byte at current position.  If at end of string,
184   // have to explore in hope of finishing a match.  Use impossible byte -1.
185   int c = -1;
186   if (p < text_.data() + text_.size())
187     c = *p & 0xFF;
188 
189   Prog::Inst* ip = prog_->inst(id);
190   switch (ip->opcode()) {
191     default:
192       LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode();
193       return false;  // not reached
194 
195     case kInstAltMatch:
196       // Ignored.
197       return false;
198 
199     case kInstByteRange:
200       if (ip->Matches(c))
201         return Visit(ip->out(), p+1);
202       return false;
203 
204     case kInstCapture:
205       if (0 <= ip->cap() &&
206           ip->cap() < static_cast<int>(arraysize(cap_))) {
207         // Capture p to register, but save old value.
208         const char* q = cap_[ip->cap()];
209         cap_[ip->cap()] = p;
210         bool ret = Visit(ip->out(), p);
211         // Restore old value as we backtrack.
212         cap_[ip->cap()] = q;
213         return ret;
214       }
215       return Visit(ip->out(), p);
216 
217     case kInstEmptyWidth:
218       if (ip->empty() & ~Prog::EmptyFlags(context_, p))
219         return false;
220       return Visit(ip->out(), p);
221 
222     case kInstNop:
223       return Visit(ip->out(), p);
224 
225     case kInstMatch:
226       // We found a match.  If it's the best so far, record the
227       // parameters in the caller's submatch_ array.
228       if (endmatch_ && p != context_.data() + context_.size())
229         return false;
230       cap_[1] = p;
231       if (submatch_[0].data() == NULL ||
232           (longest_ && p > submatch_[0].data() + submatch_[0].size())) {
233         // First match so far - or better match.
234         for (int i = 0; i < nsubmatch_; i++)
235           submatch_[i] = StringPiece(
236               cap_[2 * i], static_cast<size_t>(cap_[2 * i + 1] - cap_[2 * i]));
237       }
238       return true;
239 
240     case kInstFail:
241       return false;
242   }
243 }
244 
245 // Runs a backtracking search.
UnsafeSearchBacktrack(const StringPiece & text,const StringPiece & context,Anchor anchor,MatchKind kind,StringPiece * match,int nmatch)246 bool Prog::UnsafeSearchBacktrack(const StringPiece& text,
247                                  const StringPiece& context,
248                                  Anchor anchor,
249                                  MatchKind kind,
250                                  StringPiece* match,
251                                  int nmatch) {
252   // If full match, we ask for an anchored longest match
253   // and then check that match[0] == text.
254   // So make sure match[0] exists.
255   StringPiece sp0;
256   if (kind == kFullMatch) {
257     anchor = kAnchored;
258     if (nmatch < 1) {
259       match = &sp0;
260       nmatch = 1;
261     }
262   }
263 
264   // Run the search.
265   Backtracker b(this);
266   bool anchored = anchor == kAnchored;
267   bool longest = kind != kFirstMatch;
268   if (!b.Search(text, context, anchored, longest, match, nmatch))
269     return false;
270   if (kind == kFullMatch && match[0].end() != text.end())
271     return false;
272   return true;
273 }
274 
275 }  // namespace re2
276