1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Tested by search_test.cc, exhaustive_test.cc, tester.cc
6 
7 // Prog::SearchBitState is a regular expression search with submatch
8 // tracking for small regular expressions and texts.  Like
9 // testing/backtrack.cc, it allocates a bit vector with (length of
10 // text) * (length of prog) bits, to make sure it never explores the
11 // same (character position, instruction) state multiple times.  This
12 // limits the search to run in time linear in the length of the text.
13 //
14 // Unlike testing/backtrack.cc, SearchBitState is not recursive
15 // on the text.
16 //
17 // SearchBitState is a fast replacement for the NFA code on small
18 // regexps and texts when SearchOnePass cannot be used.
19 
20 #include "re2/prog.h"
21 #include "re2/regexp.h"
22 
23 namespace re2 {
24 
25 struct Job {
26   int id;
27   int arg;
28   const char* p;
29 };
30 
31 class BitState {
32  public:
33   explicit BitState(Prog* prog);
34   ~BitState();
35 
36   // The usual Search prototype.
37   // Can only call Search once per BitState.
38   bool Search(const StringPiece& text, const StringPiece& context,
39               bool anchored, bool longest,
40               StringPiece* submatch, int nsubmatch);
41 
42  private:
43   inline bool ShouldVisit(int id, const char* p);
44   void Push(int id, const char* p, int arg);
45   bool GrowStack();
46   bool TrySearch(int id, const char* p);
47 
48   // Search parameters
49   Prog* prog_;              // program being run
50   StringPiece text_;        // text being searched
51   StringPiece context_;     // greater context of text being searched
52   bool anchored_;           // whether search is anchored at text.begin()
53   bool longest_;            // whether search wants leftmost-longest match
54   bool endmatch_;           // whether match must end at text.end()
55   StringPiece *submatch_;   // submatches to fill in
56   int nsubmatch_;           //   # of submatches to fill in
57 
58   // Search state
59   const char** cap_;        // capture registers
60   int ncap_;
61 
62   static const int VisitedBits = 32;
63   uint32 *visited_;         // bitmap: (Inst*, char*) pairs already backtracked
64   int nvisited_;            //   # of words in bitmap
65 
66   Job *job_;                // stack of text positions to explore
67   int njob_;
68   int maxjob_;
69 };
70 
BitState(Prog * prog)71 BitState::BitState(Prog* prog)
72   : prog_(prog),
73     anchored_(false),
74     longest_(false),
75     endmatch_(false),
76     submatch_(NULL),
77     nsubmatch_(0),
78     cap_(NULL),
79     ncap_(0),
80     visited_(NULL),
81     nvisited_(0),
82     job_(NULL),
83     njob_(0),
84     maxjob_(0) {
85 }
86 
~BitState()87 BitState::~BitState() {
88   delete[] visited_;
89   delete[] job_;
90   delete[] cap_;
91 }
92 
93 // Should the search visit the pair ip, p?
94 // If so, remember that it was visited so that the next time,
95 // we don't repeat the visit.
ShouldVisit(int id,const char * p)96 bool BitState::ShouldVisit(int id, const char* p) {
97   uint n = id * (text_.size() + 1) + (p - text_.begin());
98   if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
99     return false;
100   visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
101   return true;
102 }
103 
104 // Grow the stack.
GrowStack()105 bool BitState::GrowStack() {
106   // VLOG(0) << "Reallocate.";
107   maxjob_ *= 2;
108   Job* newjob = new Job[maxjob_];
109   memmove(newjob, job_, njob_*sizeof job_[0]);
110   delete[] job_;
111   job_ = newjob;
112   if (njob_ >= maxjob_) {
113     LOG(DFATAL) << "Job stack overflow.";
114     return false;
115   }
116   return true;
117 }
118 
119 // Push the triple (id, p, arg) onto the stack, growing it if necessary.
Push(int id,const char * p,int arg)120 void BitState::Push(int id, const char* p, int arg) {
121   if (njob_ >= maxjob_) {
122     if (!GrowStack())
123       return;
124   }
125   int op = prog_->inst(id)->opcode();
126   if (op == kInstFail)
127     return;
128 
129   // Only check ShouldVisit when arg == 0.
130   // When arg > 0, we are continuing a previous visit.
131   if (arg == 0 && !ShouldVisit(id, p))
132     return;
133 
134   Job* j = &job_[njob_++];
135   j->id = id;
136   j->p = p;
137   j->arg = arg;
138 }
139 
140 // Try a search from instruction id0 in state p0.
141 // Return whether it succeeded.
TrySearch(int id0,const char * p0)142 bool BitState::TrySearch(int id0, const char* p0) {
143   bool matched = false;
144   const char* end = text_.end();
145   njob_ = 0;
146   Push(id0, p0, 0);
147   while (njob_ > 0) {
148     // Pop job off stack.
149     --njob_;
150     int id = job_[njob_].id;
151     const char* p = job_[njob_].p;
152     int arg = job_[njob_].arg;
153 
154     // Optimization: rather than push and pop,
155     // code that is going to Push and continue
156     // the loop simply updates ip, p, and arg
157     // and jumps to CheckAndLoop.  We have to
158     // do the ShouldVisit check that Push
159     // would have, but we avoid the stack
160     // manipulation.
161     if (0) {
162     CheckAndLoop:
163       if (!ShouldVisit(id, p))
164         continue;
165     }
166 
167     // Visit ip, p.
168     // VLOG(0) << "Job: " << ip->id() << " "
169     //         << (p - text_.begin()) << " " << arg;
170     Prog::Inst* ip = prog_->inst(id);
171     switch (ip->opcode()) {
172       case kInstFail:
173       default:
174         LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
175         return false;
176 
177       case kInstAlt:
178         // Cannot just
179         //   Push(ip->out1(), p, 0);
180         //   Push(ip->out(), p, 0);
181         // If, during the processing of ip->out(), we encounter
182         // ip->out1() via another path, we want to process it then.
183         // Pushing it here will inhibit that.  Instead, re-push
184         // ip with arg==1 as a reminder to push ip->out1() later.
185         switch (arg) {
186           case 0:
187             Push(id, p, 1);  // come back when we're done
188             id = ip->out();
189             goto CheckAndLoop;
190 
191           case 1:
192             // Finished ip->out(); try ip->out1().
193             arg = 0;
194             id = ip->out1();
195             goto CheckAndLoop;
196         }
197         LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
198         continue;
199 
200       case kInstAltMatch:
201         // One opcode is byte range; the other leads to match.
202         if (ip->greedy(prog_)) {
203           // out1 is the match
204           Push(ip->out1(), p, 0);
205           id = ip->out1();
206           p = end;
207           goto CheckAndLoop;
208         }
209         // out is the match - non-greedy
210         Push(ip->out(), end, 0);
211         id = ip->out();
212         goto CheckAndLoop;
213 
214       case kInstByteRange: {
215         int c = -1;
216         if (p < end)
217           c = *p & 0xFF;
218         if (ip->Matches(c)) {
219           id = ip->out();
220           p++;
221           goto CheckAndLoop;
222         }
223         continue;
224       }
225 
226       case kInstCapture:
227         switch (arg) {
228           case 0:
229             if (0 <= ip->cap() && ip->cap() < ncap_) {
230               // Capture p to register, but save old value.
231               Push(id, cap_[ip->cap()], 1);  // come back when we're done
232               cap_[ip->cap()] = p;
233             }
234             // Continue on.
235             id = ip->out();
236             goto CheckAndLoop;
237           case 1:
238             // Finished ip->out(); restore the old value.
239             cap_[ip->cap()] = p;
240             continue;
241         }
242         LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
243         continue;
244 
245       case kInstEmptyWidth:
246         if (ip->empty() & ~Prog::EmptyFlags(context_, p))
247           continue;
248         id = ip->out();
249         goto CheckAndLoop;
250 
251       case kInstNop:
252         id = ip->out();
253         goto CheckAndLoop;
254 
255       case kInstMatch: {
256         if (endmatch_ && p != text_.end())
257           continue;
258 
259         // VLOG(0) << "Found match.";
260         // We found a match.  If the caller doesn't care
261         // where the match is, no point going further.
262         if (nsubmatch_ == 0)
263           return true;
264 
265         // Record best match so far.
266         // Only need to check end point, because this entire
267         // call is only considering one start position.
268         matched = true;
269         cap_[1] = p;
270         if (submatch_[0].data() == NULL ||
271             (longest_ && p > submatch_[0].end())) {
272           for (int i = 0; i < nsubmatch_; i++)
273             submatch_[i] = StringPiece(cap_[2*i], cap_[2*i+1] - cap_[2*i]);
274         }
275 
276         // If going for first match, we're done.
277         if (!longest_)
278           return true;
279 
280         // If we used the entire text, no longer match is possible.
281         if (p == text_.end())
282           return true;
283 
284         // Otherwise, continue on in hope of a longer match.
285         continue;
286       }
287     }
288   }
289   return matched;
290 }
291 
292 // Search text (within context) for prog_.
Search(const StringPiece & text,const StringPiece & context,bool anchored,bool longest,StringPiece * submatch,int nsubmatch)293 bool BitState::Search(const StringPiece& text, const StringPiece& context,
294                       bool anchored, bool longest,
295                       StringPiece* submatch, int nsubmatch) {
296   // Search parameters.
297   text_ = text;
298   context_ = context;
299   if (context_.begin() == NULL)
300     context_ = text;
301   if (prog_->anchor_start() && context_.begin() != text.begin())
302     return false;
303   if (prog_->anchor_end() && context_.end() != text.end())
304     return false;
305   anchored_ = anchored || prog_->anchor_start();
306   longest_ = longest || prog_->anchor_end();
307   endmatch_ = prog_->anchor_end();
308   submatch_ = submatch;
309   nsubmatch_ = nsubmatch;
310   for (int i = 0; i < nsubmatch_; i++)
311     submatch_[i] = NULL;
312 
313   // Allocate scratch space.
314   nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits;
315   visited_ = new uint32[nvisited_];
316   memset(visited_, 0, nvisited_*sizeof visited_[0]);
317   // VLOG(0) << "nvisited_ = " << nvisited_;
318 
319   ncap_ = 2*nsubmatch;
320   if (ncap_ < 2)
321     ncap_ = 2;
322   cap_ = new const char*[ncap_];
323   memset(cap_, 0, ncap_*sizeof cap_[0]);
324 
325   maxjob_ = 256;
326   job_ = new Job[maxjob_];
327 
328   // Anchored search must start at text.begin().
329   if (anchored_) {
330     cap_[0] = text.begin();
331     return TrySearch(prog_->start(), text.begin());
332   }
333 
334   // Unanchored search, starting from each possible text position.
335   // Notice that we have to try the empty string at the end of
336   // the text, so the loop condition is p <= text.end(), not p < text.end().
337   // This looks like it's quadratic in the size of the text,
338   // but we are not clearing visited_ between calls to TrySearch,
339   // so no work is duplicated and it ends up still being linear.
340   for (const char* p = text.begin(); p <= text.end(); p++) {
341     cap_[0] = p;
342     if (TrySearch(prog_->start(), p))  // Match must be leftmost; done.
343       return true;
344   }
345   return false;
346 }
347 
348 // Bit-state search.
SearchBitState(const StringPiece & text,const StringPiece & context,Anchor anchor,MatchKind kind,StringPiece * match,int nmatch)349 bool Prog::SearchBitState(const StringPiece& text,
350                           const StringPiece& context,
351                           Anchor anchor,
352                           MatchKind kind,
353                           StringPiece* match,
354                           int nmatch) {
355   // If full match, we ask for an anchored longest match
356   // and then check that match[0] == text.
357   // So make sure match[0] exists.
358   StringPiece sp0;
359   if (kind == kFullMatch) {
360     anchor = kAnchored;
361     if (nmatch < 1) {
362       match = &sp0;
363       nmatch = 1;
364     }
365   }
366 
367   // Run the search.
368   BitState b(this);
369   bool anchored = anchor == kAnchored;
370   bool longest = kind != kFirstMatch;
371   if (!b.Search(text, context, anchored, longest, match, nmatch))
372     return false;
373   if (kind == kFullMatch && match[0].end() != text.end())
374     return false;
375   return true;
376 }
377 
378 }  // namespace re2
379