1 // Copyright 2006-2007 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Tested by search_test.cc.
6 //
7 // Prog::SearchNFA, an NFA search.
8 // This is an actual NFA like the theorists talk about,
9 // not the pseudo-NFA found in backtracking regexp implementations.
10 //
11 // IMPLEMENTATION
12 //
13 // This algorithm is a variant of one that appeared in Rob Pike's sam editor,
14 // which is a variant of the one described in Thompson's 1968 CACM paper.
15 // See http://swtch.com/~rsc/regexp/ for various history.  The main feature
16 // over the DFA implementation is that it tracks submatch boundaries.
17 //
18 // When the choice of submatch boundaries is ambiguous, this particular
19 // implementation makes the same choices that traditional backtracking
20 // implementations (in particular, Perl and PCRE) do.
21 // Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
22 // time in the length of the input.
23 //
24 // Like Thompson's original machine and like the DFA implementation, this
25 // implementation notices a match only once it is one byte past it.
26 
27 #include "re2/prog.h"
28 #include "re2/regexp.h"
29 #include "util/sparse_array.h"
30 #include "util/sparse_set.h"
31 
32 namespace re2 {
33 
34 class NFA {
35  public:
36   NFA(Prog* prog);
37   ~NFA();
38 
39   // Searches for a matching string.
40   //   * If anchored is true, only considers matches starting at offset.
41   //     Otherwise finds lefmost match at or after offset.
42   //   * If longest is true, returns the longest match starting
43   //     at the chosen start point.  Otherwise returns the so-called
44   //     left-biased match, the one traditional backtracking engines
45   //     (like Perl and PCRE) find.
46   // Records submatch boundaries in submatch[1..nsubmatch-1].
47   // Submatch[0] is the entire match.  When there is a choice in
48   // which text matches each subexpression, the submatch boundaries
49   // are chosen to match what a backtracking implementation would choose.
50   bool Search(const StringPiece& text, const StringPiece& context,
51               bool anchored, bool longest,
52               StringPiece* submatch, int nsubmatch);
53 
54   static const int Debug = 0;
55 
56  private:
57   struct Thread {
58     union {
59       int id;
60       Thread* next;  // when on free list
61     };
62     const char** capture;
63   };
64 
65   // State for explicit stack in AddToThreadq.
66   struct AddState {
67     int id;           // Inst to process
68     int j;
69     const char* cap_j;  // if j>=0, set capture[j] = cap_j before processing ip
70 
AddStatere2::NFA::AddState71     AddState()
72       : id(0), j(-1), cap_j(NULL) {}
AddStatere2::NFA::AddState73     explicit AddState(int id)
74       : id(id), j(-1), cap_j(NULL) {}
AddStatere2::NFA::AddState75     AddState(int id, const char* cap_j, int j)
76       : id(id), j(j), cap_j(cap_j) {}
77   };
78 
79   // Threadq is a list of threads.  The list is sorted by the order
80   // in which Perl would explore that particular state -- the earlier
81   // choices appear earlier in the list.
82   typedef SparseArray<Thread*> Threadq;
83 
84   inline Thread* AllocThread();
85   inline void FreeThread(Thread*);
86 
87   // Add id (or its children, following unlabeled arrows)
88   // to the workqueue q with associated capture info.
89   void AddToThreadq(Threadq* q, int id, int flag,
90                     const char* p, const char** capture);
91 
92   // Run runq on byte c, appending new states to nextq.
93   // Updates matched_ and match_ as new, better matches are found.
94   // p is position of the next byte (the one after c)
95   // in the input string, used when processing capturing parens.
96   // flag is the bitwise or of Bol, Eol, etc., specifying whether
97   // ^, $ and \b match the current input point (after c).
98   inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
99 
100   // Returns text version of capture information, for debugging.
101   string FormatCapture(const char** capture);
102 
103   inline void CopyCapture(const char** dst, const char** src);
104 
105   // Computes whether all matches must begin with the same first
106   // byte, and if so, returns that byte.  If not, returns -1.
107   int ComputeFirstByte();
108 
109   Prog* prog_;          // underlying program
110   int start_;           // start instruction in program
111   int ncapture_;        // number of submatches to track
112   bool longest_;        // whether searching for longest match
113   bool endmatch_;       // whether match must end at text.end()
114   const char* btext_;   // beginning of text being matched (for FormatSubmatch)
115   const char* etext_;   // end of text being matched (for endmatch_)
116   Threadq q0_, q1_;     // pre-allocated for Search.
117   const char** match_;  // best match so far
118   bool matched_;        // any match so far?
119   AddState* astack_;    // pre-allocated for AddToThreadq
120   int nastack_;
121   int first_byte_;      // required first byte for match, or -1 if none
122 
123   Thread* free_threads_;  // free list
124 
125   DISALLOW_EVIL_CONSTRUCTORS(NFA);
126 };
127 
NFA(Prog * prog)128 NFA::NFA(Prog* prog) {
129   prog_ = prog;
130   start_ = prog->start();
131   ncapture_ = 0;
132   longest_ = false;
133   endmatch_ = false;
134   btext_ = NULL;
135   etext_ = NULL;
136   q0_.resize(prog_->size());
137   q1_.resize(prog_->size());
138   nastack_ = 2*prog_->size();
139   astack_ = new AddState[nastack_];
140   match_ = NULL;
141   matched_ = false;
142   free_threads_ = NULL;
143   first_byte_ = ComputeFirstByte();
144 }
145 
~NFA()146 NFA::~NFA() {
147   delete[] match_;
148   delete[] astack_;
149   Thread* next;
150   for (Thread* t = free_threads_; t; t = next) {
151     next = t->next;
152     delete[] t->capture;
153     delete t;
154   }
155 }
156 
FreeThread(Thread * t)157 void NFA::FreeThread(Thread *t) {
158   if (t == NULL)
159     return;
160   t->next = free_threads_;
161   free_threads_ = t;
162 }
163 
AllocThread()164 NFA::Thread* NFA::AllocThread() {
165   Thread* t = free_threads_;
166   if (t == NULL) {
167     t = new Thread;
168     t->capture = new const char*[ncapture_];
169     return t;
170   }
171   free_threads_ = t->next;
172   return t;
173 }
174 
CopyCapture(const char ** dst,const char ** src)175 void NFA::CopyCapture(const char** dst, const char** src) {
176   for (int i = 0; i < ncapture_; i+=2) {
177     dst[i] = src[i];
178     dst[i+1] = src[i+1];
179   }
180 }
181 
182 // Follows all empty arrows from id0 and enqueues all the states reached.
183 // The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
184 // The pointer p is the current input position, and m is the
185 // current set of match boundaries.
AddToThreadq(Threadq * q,int id0,int flag,const char * p,const char ** capture)186 void NFA::AddToThreadq(Threadq* q, int id0, int flag,
187                        const char* p, const char** capture) {
188   if (id0 == 0)
189     return;
190 
191   // Astack_ is pre-allocated to avoid resize operations.
192   // It has room for 2*prog_->size() entries, which is enough:
193   // Each inst in prog can be processed at most once,
194   // pushing at most two entries on stk.
195 
196   int nstk = 0;
197   AddState* stk = astack_;
198   stk[nstk++] = AddState(id0);
199 
200   while (nstk > 0) {
201     DCHECK_LE(nstk, nastack_);
202     const AddState& a = stk[--nstk];
203     if (a.j >= 0)
204       capture[a.j] = a.cap_j;
205 
206     int id = a.id;
207     if (id == 0)
208       continue;
209     if (q->has_index(id)) {
210       if (Debug)
211         fprintf(stderr, "  [%d%s]\n", id, FormatCapture(capture).c_str());
212       continue;
213     }
214 
215     // Create entry in q no matter what.  We might fill it in below,
216     // or we might not.  Even if not, it is necessary to have it,
217     // so that we don't revisit id0 during the recursion.
218     q->set_new(id, NULL);
219 
220     Thread** tp = &q->find(id)->second;
221     int j;
222     Thread* t;
223     Prog::Inst* ip = prog_->inst(id);
224     switch (ip->opcode()) {
225     default:
226       LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
227       break;
228 
229     case kInstFail:
230       break;
231 
232     case kInstAltMatch:
233       // Save state; will pick up at next byte.
234       t = AllocThread();
235       t->id = id;
236       CopyCapture(t->capture, capture);
237       *tp = t;
238       // fall through
239 
240     case kInstAlt:
241       // Explore alternatives.
242       stk[nstk++] = AddState(ip->out1());
243       stk[nstk++] = AddState(ip->out());
244       break;
245 
246     case kInstNop:
247       // Continue on.
248       stk[nstk++] = AddState(ip->out());
249       break;
250 
251     case kInstCapture:
252       if ((j=ip->cap()) < ncapture_) {
253         // Push a dummy whose only job is to restore capture[j]
254         // once we finish exploring this possibility.
255         stk[nstk++] = AddState(0, capture[j], j);
256 
257         // Record capture.
258         capture[j] = p;
259       }
260       stk[nstk++] = AddState(ip->out());
261       break;
262 
263     case kInstMatch:
264     case kInstByteRange:
265       // Save state; will pick up at next byte.
266       t = AllocThread();
267       t->id = id;
268       CopyCapture(t->capture, capture);
269       *tp = t;
270       if (Debug)
271         fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);
272       break;
273 
274     case kInstEmptyWidth:
275       // Continue on if we have all the right flag bits.
276       if (ip->empty() & ~flag)
277         break;
278       stk[nstk++] = AddState(ip->out());
279       break;
280     }
281   }
282 }
283 
284 // Run runq on byte c, appending new states to nextq.
285 // Updates match as new, better matches are found.
286 // p is position of the byte c in the input string,
287 // used when processing capturing parens.
288 // flag is the bitwise or of Bol, Eol, etc., specifying whether
289 // ^, $ and \b match the current input point (after c).
290 // Frees all the threads on runq.
291 // If there is a shortcut to the end, returns that shortcut.
Step(Threadq * runq,Threadq * nextq,int c,int flag,const char * p)292 int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
293   nextq->clear();
294 
295   for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
296     Thread* t = i->second;
297     if (t == NULL)
298       continue;
299 
300     if (longest_) {
301       // Can skip any threads started after our current best match.
302       if (matched_ && match_[0] < t->capture[0]) {
303         FreeThread(t);
304         continue;
305       }
306     }
307 
308     int id = t->id;
309     Prog::Inst* ip = prog_->inst(id);
310 
311     switch (ip->opcode()) {
312       default:
313         // Should only see the values handled below.
314         LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
315         break;
316 
317       case kInstByteRange:
318         if (ip->Matches(c))
319           AddToThreadq(nextq, ip->out(), flag, p+1, t->capture);
320         break;
321 
322       case kInstAltMatch:
323         if (i != runq->begin())
324           break;
325         // The match is ours if we want it.
326         if (ip->greedy(prog_) || longest_) {
327           CopyCapture((const char**)match_, t->capture);
328           FreeThread(t);
329           for (++i; i != runq->end(); ++i)
330             FreeThread(i->second);
331           runq->clear();
332           matched_ = true;
333           if (ip->greedy(prog_))
334             return ip->out1();
335           return ip->out();
336         }
337         break;
338 
339       case kInstMatch:
340         if (endmatch_ && p != etext_)
341           break;
342 
343         const char* old = t->capture[1];  // previous end pointer
344         t->capture[1] = p;
345         if (longest_) {
346           // Leftmost-longest mode: save this match only if
347           // it is either farther to the left or at the same
348           // point but longer than an existing match.
349           if (!matched_ || t->capture[0] < match_[0] ||
350               (t->capture[0] == match_[0] && t->capture[1] > match_[1]))
351             CopyCapture((const char**)match_, t->capture);
352         } else {
353           // Leftmost-biased mode: this match is by definition
354           // better than what we've already found (see next line).
355           CopyCapture((const char**)match_, t->capture);
356 
357           // Cut off the threads that can only find matches
358           // worse than the one we just found: don't run the
359           // rest of the current Threadq.
360           t->capture[0] = old;
361           FreeThread(t);
362           for (++i; i != runq->end(); ++i)
363             FreeThread(i->second);
364           runq->clear();
365           matched_ = true;
366           return 0;
367         }
368         t->capture[0] = old;
369         matched_ = true;
370         break;
371     }
372     FreeThread(t);
373   }
374   runq->clear();
375   return 0;
376 }
377 
FormatCapture(const char ** capture)378 string NFA::FormatCapture(const char** capture) {
379   string s;
380 
381   for (int i = 0; i < ncapture_; i+=2) {
382     if (capture[i] == NULL)
383       StringAppendF(&s, "(?,?)");
384     else if (capture[i+1] == NULL)
385       StringAppendF(&s, "(%d,?)", (int)(capture[i] - btext_));
386     else
387       StringAppendF(&s, "(%d,%d)",
388                     (int)(capture[i] - btext_),
389                     (int)(capture[i+1] - btext_));
390   }
391   return s;
392 }
393 
394 // Returns whether haystack contains needle's memory.
StringPieceContains(const StringPiece haystack,const StringPiece needle)395 static bool StringPieceContains(const StringPiece haystack, const StringPiece needle) {
396   return haystack.begin() <= needle.begin() &&
397          haystack.end() >= needle.end();
398 }
399 
Search(const StringPiece & text,const StringPiece & const_context,bool anchored,bool longest,StringPiece * submatch,int nsubmatch)400 bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
401             bool anchored, bool longest,
402             StringPiece* submatch, int nsubmatch) {
403   if (start_ == 0)
404     return false;
405 
406   StringPiece context = const_context;
407   if (context.begin() == NULL)
408     context = text;
409 
410   if (!StringPieceContains(context, text)) {
411     LOG(FATAL) << "Bad args: context does not contain text "
412                 << reinterpret_cast<const void*>(context.begin())
413                 << "+" << context.size() << " "
414                 << reinterpret_cast<const void*>(text.begin())
415                 << "+" << text.size();
416     return false;
417   }
418 
419   if (prog_->anchor_start() && context.begin() != text.begin())
420     return false;
421   if (prog_->anchor_end() && context.end() != text.end())
422     return false;
423   anchored |= prog_->anchor_start();
424   if (prog_->anchor_end()) {
425     longest = true;
426     endmatch_ = true;
427     etext_ = text.end();
428   }
429 
430   if (nsubmatch < 0) {
431     LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
432     return false;
433   }
434 
435   // Save search parameters.
436   ncapture_ = 2*nsubmatch;
437   longest_ = longest;
438 
439   if (nsubmatch == 0) {
440     // We need to maintain match[0], both to distinguish the
441     // longest match (if longest is true) and also to tell
442     // whether we've seen any matches at all.
443     ncapture_ = 2;
444   }
445 
446   match_ = new const char*[ncapture_];
447   matched_ = false;
448   memset(match_, 0, ncapture_*sizeof match_[0]);
449 
450   // For debugging prints.
451   btext_ = context.begin();
452 
453   if (Debug) {
454     fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
455             text.as_string().c_str(), context.as_string().c_str(), anchored,
456             longest);
457   }
458 
459   // Set up search.
460   Threadq* runq = &q0_;
461   Threadq* nextq = &q1_;
462   runq->clear();
463   nextq->clear();
464   memset(&match_[0], 0, ncapture_*sizeof match_[0]);
465   const char* bp = context.begin();
466   int c = -1;
467   int wasword = 0;
468 
469   if (text.begin() > context.begin()) {
470     c = text.begin()[-1] & 0xFF;
471     wasword = Prog::IsWordChar(c);
472   }
473 
474   // Loop over the text, stepping the machine.
475   for (const char* p = text.begin();; p++) {
476     // Check for empty-width specials.
477     int flag = 0;
478 
479     // ^ and \A
480     if (p == context.begin())
481       flag |= kEmptyBeginText | kEmptyBeginLine;
482     else if (p <= context.end() && p[-1] == '\n')
483       flag |= kEmptyBeginLine;
484 
485     // $ and \z
486     if (p == context.end())
487       flag |= kEmptyEndText | kEmptyEndLine;
488     else if (p < context.end() && p[0] == '\n')
489       flag |= kEmptyEndLine;
490 
491     // \b and \B
492     int isword = 0;
493     if (p < context.end())
494       isword = Prog::IsWordChar(p[0] & 0xFF);
495 
496     if (isword != wasword)
497       flag |= kEmptyWordBoundary;
498     else
499       flag |= kEmptyNonWordBoundary;
500 
501     if (Debug) {
502       fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword);
503       for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
504         Thread* t = i->second;
505         if (t == NULL)
506           continue;
507         fprintf(stderr, " %d%s", t->id,
508                 FormatCapture((const char**)t->capture).c_str());
509       }
510       fprintf(stderr, "\n");
511     }
512 
513     // Process previous character (waited until now to avoid
514     // repeating the flag computation above).
515     // This is a no-op the first time around the loop, because
516     // runq is empty.
517     int id = Step(runq, nextq, c, flag, p-1);
518     DCHECK_EQ(runq->size(), 0);
519     swap(nextq, runq);
520     nextq->clear();
521     if (id != 0) {
522       // We're done: full match ahead.
523       p = text.end();
524       for (;;) {
525         Prog::Inst* ip = prog_->inst(id);
526         switch (ip->opcode()) {
527           default:
528             LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
529             break;
530 
531           case kInstCapture:
532             match_[ip->cap()] = p;
533             id = ip->out();
534             continue;
535 
536           case kInstNop:
537             id = ip->out();
538             continue;
539 
540           case kInstMatch:
541             match_[1] = p;
542             matched_ = true;
543             break;
544 
545           case kInstEmptyWidth:
546             if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) {
547               LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty();
548               break;
549             }
550             id = ip->out();
551             continue;
552         }
553         break;
554       }
555       break;
556     }
557 
558     if (p > text.end())
559       break;
560 
561     // Start a new thread if there have not been any matches.
562     // (No point in starting a new thread if there have been
563     // matches, since it would be to the right of the match
564     // we already found.)
565     if (!matched_ && (!anchored || p == text.begin())) {
566       // If there's a required first byte for an unanchored search
567       // and we're not in the middle of any possible matches,
568       // use memchr to search for the byte quickly.
569       if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
570           p < text.end() && (p[0] & 0xFF) != first_byte_) {
571         p = reinterpret_cast<const char*>(memchr(p, first_byte_,
572                                                  text.end() - p));
573         if (p == NULL) {
574           p = text.end();
575           isword = 0;
576         } else {
577           isword = Prog::IsWordChar(p[0] & 0xFF);
578         }
579         flag = Prog::EmptyFlags(context, p);
580       }
581 
582       // Steal match storage (cleared but unused as of yet)
583       // temporarily to hold match boundaries for new thread.
584       match_[0] = p;
585       AddToThreadq(runq, start_, flag, p, match_);
586       match_[0] = NULL;
587     }
588 
589     // If all the threads have died, stop early.
590     if (runq->size() == 0) {
591       if (Debug)
592         fprintf(stderr, "dead\n");
593       break;
594     }
595 
596     if (p == text.end())
597       c = 0;
598     else
599       c = *p & 0xFF;
600     wasword = isword;
601 
602     // Will run step(runq, nextq, c, ...) on next iteration.  See above.
603   }
604 
605   for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
606     FreeThread(i->second);
607 
608   if (matched_) {
609     for (int i = 0; i < nsubmatch; i++)
610       submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]);
611     if (Debug)
612       fprintf(stderr, "match (%d,%d)\n",
613               static_cast<int>(match_[0] - btext_),
614               static_cast<int>(match_[1] - btext_));
615     return true;
616   }
617   VLOG(1) << "No matches found";
618   return false;
619 }
620 
621 // Computes whether all successful matches have a common first byte,
622 // and if so, returns that byte.  If not, returns -1.
ComputeFirstByte()623 int NFA::ComputeFirstByte() {
624   if (start_ == 0)
625     return -1;
626 
627   int b = -1;  // first byte, not yet computed
628 
629   typedef SparseSet Workq;
630   Workq q(prog_->size());
631   q.insert(start_);
632   for (Workq::iterator it = q.begin(); it != q.end(); ++it) {
633     int id = *it;
634     Prog::Inst* ip = prog_->inst(id);
635     switch (ip->opcode()) {
636       default:
637         LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
638         break;
639 
640       case kInstMatch:
641         // The empty string matches: no first byte.
642         return -1;
643 
644       case kInstByteRange:
645         // Must match only a single byte
646         if (ip->lo() != ip->hi())
647           return -1;
648         if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
649           return -1;
650         // If we haven't seen any bytes yet, record it;
651         // otherwise must match the one we saw before.
652         if (b == -1)
653           b = ip->lo();
654         else if (b != ip->lo())
655           return -1;
656         break;
657 
658       case kInstNop:
659       case kInstCapture:
660       case kInstEmptyWidth:
661         // Continue on.
662         // Ignore ip->empty() flags for kInstEmptyWidth
663         // in order to be as conservative as possible
664         // (assume all possible empty-width flags are true).
665         if (ip->out())
666           q.insert(ip->out());
667         break;
668 
669       case kInstAlt:
670       case kInstAltMatch:
671         // Explore alternatives.
672         if (ip->out())
673           q.insert(ip->out());
674         if (ip->out1())
675           q.insert(ip->out1());
676         break;
677 
678       case kInstFail:
679         break;
680     }
681   }
682   return b;
683 }
684 
685 bool
SearchNFA(const StringPiece & text,const StringPiece & context,Anchor anchor,MatchKind kind,StringPiece * match,int nmatch)686 Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
687                 Anchor anchor, MatchKind kind,
688                 StringPiece* match, int nmatch) {
689   if (NFA::Debug)
690     Dump();
691 
692   NFA nfa(this);
693   StringPiece sp;
694   if (kind == kFullMatch) {
695     anchor = kAnchored;
696     if (nmatch == 0) {
697       match = &sp;
698       nmatch = 1;
699     }
700   }
701   if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
702     return false;
703   if (kind == kFullMatch && match[0].end() != text.end())
704     return false;
705   return true;
706 }
707 
708 }  // namespace re2
709 
710