1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // A DFA (deterministic finite automaton)-based regular expression search.
6 //
7 // The DFA search has two main parts: the construction of the automaton,
8 // which is represented by a graph of State structures, and the execution
9 // of the automaton over a given input string.
10 //
11 // The basic idea is that the State graph is constructed so that the
12 // execution can simply start with a state s, and then for each byte c in
13 // the input string, execute "s = s->next[c]", checking at each point whether
14 // the current s represents a matching state.
15 //
16 // The simple explanation just given does convey the essence of this code,
17 // but it omits the details of how the State graph gets constructed as well
18 // as some performance-driven optimizations to the execution of the automaton.
19 // All these details are explained in the comments for the code following
20 // the definition of class DFA.
21 //
22 // See http://swtch.com/~rsc/regexp/ for a very bare-bones equivalent.
23 
24 #include "re2/prog.h"
25 #include "re2/stringpiece.h"
26 #include "util/atomicops.h"
27 #include "util/flags.h"
28 #include "util/sparse_set.h"
29 
30 DEFINE_bool(re2_dfa_bail_when_slow, true,
31             "Whether the RE2 DFA should bail out early "
32             "if the NFA would be faster (for testing).");
33 
34 namespace re2 {
35 
36 #if !defined(__linux__)  /* only Linux seems to have memrchr */
memrchr(const void * s,int c,size_t n)37 static void* memrchr(const void* s, int c, size_t n) {
38   const unsigned char* p = (const unsigned char*)s;
39   for (p += n; n > 0; n--)
40     if (*--p == c)
41       return (void*)p;
42 
43   return NULL;
44 }
45 #endif
46 
47 // Changing this to true compiles in prints that trace execution of the DFA.
48 // Generates a lot of output -- only useful for debugging.
49 static const bool DebugDFA = false;
50 
51 // A DFA implementation of a regular expression program.
52 // Since this is entirely a forward declaration mandated by C++,
53 // some of the comments here are better understood after reading
54 // the comments in the sections that follow the DFA definition.
55 class DFA {
56  public:
57   DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem);
58   ~DFA();
ok() const59   bool ok() const { return !init_failed_; }
kind()60   Prog::MatchKind kind() { return kind_; }
61 
62   // Searches for the regular expression in text, which is considered
63   // as a subsection of context for the purposes of interpreting flags
64   // like ^ and $ and \A and \z.
65   // Returns whether a match was found.
66   // If a match is found, sets *ep to the end point of the best match in text.
67   // If "anchored", the match must begin at the start of text.
68   // If "want_earliest_match", the match that ends first is used, not
69   //   necessarily the best one.
70   // If "run_forward" is true, the DFA runs from text.begin() to text.end().
71   //   If it is false, the DFA runs from text.end() to text.begin(),
72   //   returning the leftmost end of the match instead of the rightmost one.
73   // If the DFA cannot complete the search (for example, if it is out of
74   //   memory), it sets *failed and returns false.
75   bool Search(const StringPiece& text, const StringPiece& context,
76               bool anchored, bool want_earliest_match, bool run_forward,
77               bool* failed, const char** ep, vector<int>* matches);
78 
79   // Builds out all states for the entire DFA.  FOR TESTING ONLY
80   // Returns number of states.
81   int BuildAllStates();
82 
83   // Computes min and max for matching strings.  Won't return strings
84   // bigger than maxlen.
85   bool PossibleMatchRange(string* min, string* max, int maxlen);
86 
87   // These data structures are logically private, but C++ makes it too
88   // difficult to mark them as such.
89   class Workq;
90   class RWLocker;
91   class StateSaver;
92 
93   // A single DFA state.  The DFA is represented as a graph of these
94   // States, linked by the next_ pointers.  If in state s and reading
95   // byte c, the next state should be s->next_[c].
96   struct State {
IsMatchre2::DFA::State97     inline bool IsMatch() const { return flag_ & kFlagMatch; }
98     void SaveMatch(vector<int>* v);
99 
100     int* inst_;         // Instruction pointers in the state.
101     int ninst_;         // # of inst_ pointers.
102     uint flag_;         // Empty string bitfield flags in effect on the way
103                         // into this state, along with kFlagMatch if this
104                         // is a matching state.
105     State** next_;      // Outgoing arrows from State,
106                         // one per input byte class
107   };
108 
109   enum {
110     kByteEndText = 256,         // imaginary byte at end of text
111 
112     kFlagEmptyMask = 0xFFF,     // State.flag_: bits holding kEmptyXXX flags
113     kFlagMatch = 0x1000,        // State.flag_: this is a matching state
114     kFlagLastWord = 0x2000,     // State.flag_: last byte was a word char
115     kFlagNeedShift = 16,        // needed kEmpty bits are or'ed in shifted left
116   };
117 
118 #ifndef STL_MSVC
119   // STL function structures for use with unordered_set.
120   struct StateEqual {
operator ()re2::DFA::StateEqual121     bool operator()(const State* a, const State* b) const {
122       if (a == b)
123         return true;
124       if (a == NULL || b == NULL)
125         return false;
126       if (a->ninst_ != b->ninst_)
127         return false;
128       if (a->flag_ != b->flag_)
129         return false;
130       for (int i = 0; i < a->ninst_; i++)
131         if (a->inst_[i] != b->inst_[i])
132           return false;
133       return true;  // they're equal
134     }
135   };
136 #endif  // STL_MSVC
137   struct StateHash {
operator ()re2::DFA::StateHash138     size_t operator()(const State* a) const {
139       if (a == NULL)
140         return 0;
141       const char* s = reinterpret_cast<const char*>(a->inst_);
142       int len = a->ninst_ * sizeof a->inst_[0];
143       if (sizeof(size_t) == sizeof(uint32))
144         return Hash32StringWithSeed(s, len, a->flag_);
145       else
146         return Hash64StringWithSeed(s, len, a->flag_);
147     }
148 #ifdef STL_MSVC
149     // Less than operator.
operator ()re2::DFA::StateHash150     bool operator()(const State* a, const State* b) const {
151       if (a == b)
152         return false;
153       if (a == NULL || b == NULL)
154         return a == NULL;
155       if (a->ninst_ != b->ninst_)
156         return a->ninst_ < b->ninst_;
157       if (a->flag_ != b->flag_)
158         return a->flag_ < b->flag_;
159       for (int i = 0; i < a->ninst_; ++i)
160         if (a->inst_[i] != b->inst_[i])
161           return a->inst_[i] < b->inst_[i];
162       return false;  // they're equal
163     }
164     // The two public members are required by msvc. 4 and 8 are default values.
165     // Reference: http://msdn.microsoft.com/en-us/library/1s1byw77.aspx
166     static const size_t bucket_size = 4;
167     static const size_t min_buckets = 8;
168 #endif  // STL_MSVC
169   };
170 
171 #ifdef STL_MSVC
172   typedef unordered_set<State*, StateHash> StateSet;
173 #else  // !STL_MSVC
174   typedef unordered_set<State*, StateHash, StateEqual> StateSet;
175 #endif  // STL_MSVC
176 
177 
178  private:
179   // Special "firstbyte" values for a state.  (Values >= 0 denote actual bytes.)
180   enum {
181     kFbUnknown = -1,   // No analysis has been performed.
182     kFbMany = -2,      // Many bytes will lead out of this state.
183     kFbNone = -3,      // No bytes lead out of this state.
184   };
185 
186   enum {
187     // Indices into start_ for unanchored searches.
188     // Add kStartAnchored for anchored searches.
189     kStartBeginText = 0,          // text at beginning of context
190     kStartBeginLine = 2,          // text at beginning of line
191     kStartAfterWordChar = 4,      // text follows a word character
192     kStartAfterNonWordChar = 6,   // text follows non-word character
193     kMaxStart = 8,
194 
195     kStartAnchored = 1,
196   };
197 
198   // Resets the DFA State cache, flushing all saved State* information.
199   // Releases and reacquires cache_mutex_ via cache_lock, so any
200   // State* existing before the call are not valid after the call.
201   // Use a StateSaver to preserve important states across the call.
202   // cache_mutex_.r <= L < mutex_
203   // After: cache_mutex_.w <= L < mutex_
204   void ResetCache(RWLocker* cache_lock);
205 
206   // Looks up and returns the State corresponding to a Workq.
207   // L >= mutex_
208   State* WorkqToCachedState(Workq* q, uint flag);
209 
210   // Looks up and returns a State matching the inst, ninst, and flag.
211   // L >= mutex_
212   State* CachedState(int* inst, int ninst, uint flag);
213 
214   // Clear the cache entirely.
215   // Must hold cache_mutex_.w or be in destructor.
216   void ClearCache();
217 
218   // Converts a State into a Workq: the opposite of WorkqToCachedState.
219   // L >= mutex_
220   static void StateToWorkq(State* s, Workq* q);
221 
222   // Runs a State on a given byte, returning the next state.
223   State* RunStateOnByteUnlocked(State*, int);  // cache_mutex_.r <= L < mutex_
224   State* RunStateOnByte(State*, int);          // L >= mutex_
225 
226   // Runs a Workq on a given byte followed by a set of empty-string flags,
227   // producing a new Workq in nq.  If a match instruction is encountered,
228   // sets *ismatch to true.
229   // L >= mutex_
230   void RunWorkqOnByte(Workq* q, Workq* nq,
231                              int c, uint flag, bool* ismatch,
232                              Prog::MatchKind kind,
233                              int new_byte_loop);
234 
235   // Runs a Workq on a set of empty-string flags, producing a new Workq in nq.
236   // L >= mutex_
237   void RunWorkqOnEmptyString(Workq* q, Workq* nq, uint flag);
238 
239   // Adds the instruction id to the Workq, following empty arrows
240   // according to flag.
241   // L >= mutex_
242   void AddToQueue(Workq* q, int id, uint flag);
243 
244   // For debugging, returns a text representation of State.
245   static string DumpState(State* state);
246 
247   // For debugging, returns a text representation of a Workq.
248   static string DumpWorkq(Workq* q);
249 
250   // Search parameters
251   struct SearchParams {
SearchParamsre2::DFA::SearchParams252     SearchParams(const StringPiece& text, const StringPiece& context,
253                  RWLocker* cache_lock)
254       : text(text), context(context),
255         anchored(false),
256         want_earliest_match(false),
257         run_forward(false),
258         start(NULL),
259         firstbyte(kFbUnknown),
260         cache_lock(cache_lock),
261         failed(false),
262         ep(NULL),
263         matches(NULL) { }
264 
265     StringPiece text;
266     StringPiece context;
267     bool anchored;
268     bool want_earliest_match;
269     bool run_forward;
270     State* start;
271     int firstbyte;
272     RWLocker *cache_lock;
273     bool failed;     // "out" parameter: whether search gave up
274     const char* ep;  // "out" parameter: end pointer for match
275     vector<int>* matches;
276 
277    private:
278     DISALLOW_EVIL_CONSTRUCTORS(SearchParams);
279   };
280 
281   // Before each search, the parameters to Search are analyzed by
282   // AnalyzeSearch to determine the state in which to start and the
283   // "firstbyte" for that state, if any.
284   struct StartInfo {
StartInfore2::DFA::StartInfo285     StartInfo() : start(NULL), firstbyte(kFbUnknown) { }
286     State* start;
287     volatile int firstbyte;
288   };
289 
290   // Fills in params->start and params->firstbyte using
291   // the other search parameters.  Returns true on success,
292   // false on failure.
293   // cache_mutex_.r <= L < mutex_
294   bool AnalyzeSearch(SearchParams* params);
295   bool AnalyzeSearchHelper(SearchParams* params, StartInfo* info, uint flags);
296 
297   // The generic search loop, inlined to create specialized versions.
298   // cache_mutex_.r <= L < mutex_
299   // Might unlock and relock cache_mutex_ via params->cache_lock.
300   inline bool InlinedSearchLoop(SearchParams* params,
301                                 bool have_firstbyte,
302                                 bool want_earliest_match,
303                                 bool run_forward);
304 
305   // The specialized versions of InlinedSearchLoop.  The three letters
306   // at the ends of the name denote the true/false values used as the
307   // last three parameters of InlinedSearchLoop.
308   // cache_mutex_.r <= L < mutex_
309   // Might unlock and relock cache_mutex_ via params->cache_lock.
310   bool SearchFFF(SearchParams* params);
311   bool SearchFFT(SearchParams* params);
312   bool SearchFTF(SearchParams* params);
313   bool SearchFTT(SearchParams* params);
314   bool SearchTFF(SearchParams* params);
315   bool SearchTFT(SearchParams* params);
316   bool SearchTTF(SearchParams* params);
317   bool SearchTTT(SearchParams* params);
318 
319   // The main search loop: calls an appropriate specialized version of
320   // InlinedSearchLoop.
321   // cache_mutex_.r <= L < mutex_
322   // Might unlock and relock cache_mutex_ via params->cache_lock.
323   bool FastSearchLoop(SearchParams* params);
324 
325   // For debugging, a slow search loop that calls InlinedSearchLoop
326   // directly -- because the booleans passed are not constants, the
327   // loop is not specialized like the SearchFFF etc. versions, so it
328   // runs much more slowly.  Useful only for debugging.
329   // cache_mutex_.r <= L < mutex_
330   // Might unlock and relock cache_mutex_ via params->cache_lock.
331   bool SlowSearchLoop(SearchParams* params);
332 
333   // Looks up bytes in bytemap_ but handles case c == kByteEndText too.
ByteMap(int c)334   int ByteMap(int c) {
335     if (c == kByteEndText)
336       return prog_->bytemap_range();
337     return prog_->bytemap()[c];
338   }
339 
340   // Constant after initialization.
341   Prog* prog_;              // The regular expression program to run.
342   Prog::MatchKind kind_;    // The kind of DFA.
343   int start_unanchored_;  // start of unanchored program
344   bool init_failed_;        // initialization failed (out of memory)
345 
346   Mutex mutex_;  // mutex_ >= cache_mutex_.r
347 
348   // Scratch areas, protected by mutex_.
349   Workq* q0_;             // Two pre-allocated work queues.
350   Workq* q1_;
351   int* astack_;         // Pre-allocated stack for AddToQueue
352   int nastack_;
353 
354   // State* cache.  Many threads use and add to the cache simultaneously,
355   // holding cache_mutex_ for reading and mutex_ (above) when adding.
356   // If the cache fills and needs to be discarded, the discarding is done
357   // while holding cache_mutex_ for writing, to avoid interrupting other
358   // readers.  Any State* pointers are only valid while cache_mutex_
359   // is held.
360   Mutex cache_mutex_;
361   int64 mem_budget_;       // Total memory budget for all States.
362   int64 state_budget_;     // Amount of memory remaining for new States.
363   StateSet state_cache_;   // All States computed so far.
364   StartInfo start_[kMaxStart];
365   bool cache_warned_;      // have printed to LOG(INFO) about the cache
366 };
367 
368 // Shorthand for casting to uint8*.
BytePtr(const void * v)369 static inline const uint8* BytePtr(const void* v) {
370   return reinterpret_cast<const uint8*>(v);
371 }
372 
373 // Work queues
374 
375 // Marks separate thread groups of different priority
376 // in the work queue when in leftmost-longest matching mode.
377 #define Mark (-1)
378 
379 // Internally, the DFA uses a sparse array of
380 // program instruction pointers as a work queue.
381 // In leftmost longest mode, marks separate sections
382 // of workq that started executing at different
383 // locations in the string (earlier locations first).
384 class DFA::Workq : public SparseSet {
385  public:
386   // Constructor: n is number of normal slots, maxmark number of mark slots.
Workq(int n,int maxmark)387   Workq(int n, int maxmark) :
388     SparseSet(n+maxmark),
389     n_(n),
390     maxmark_(maxmark),
391     nextmark_(n),
392     last_was_mark_(true) {
393   }
394 
is_mark(int i)395   bool is_mark(int i) { return i >= n_; }
396 
maxmark()397   int maxmark() { return maxmark_; }
398 
clear()399   void clear() {
400     SparseSet::clear();
401     nextmark_ = n_;
402   }
403 
mark()404   void mark() {
405     if (last_was_mark_)
406       return;
407     last_was_mark_ = false;
408     SparseSet::insert_new(nextmark_++);
409   }
410 
size()411   int size() {
412     return n_ + maxmark_;
413   }
414 
insert(int id)415   void insert(int id) {
416     if (contains(id))
417       return;
418     insert_new(id);
419   }
420 
insert_new(int id)421   void insert_new(int id) {
422     last_was_mark_ = false;
423     SparseSet::insert_new(id);
424   }
425 
426  private:
427   int n_;                // size excluding marks
428   int maxmark_;          // maximum number of marks
429   int nextmark_;         // id of next mark
430   bool last_was_mark_;   // last inserted was mark
431   DISALLOW_EVIL_CONSTRUCTORS(Workq);
432 };
433 
DFA(Prog * prog,Prog::MatchKind kind,int64 max_mem)434 DFA::DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem)
435   : prog_(prog),
436     kind_(kind),
437     init_failed_(false),
438     q0_(NULL),
439     q1_(NULL),
440     astack_(NULL),
441     mem_budget_(max_mem),
442     cache_warned_(false) {
443   if (DebugDFA)
444     fprintf(stderr, "\nkind %d\n%s\n", (int)kind_, prog_->DumpUnanchored().c_str());
445   int nmark = 0;
446   start_unanchored_ = 0;
447   if (kind_ == Prog::kLongestMatch) {
448     nmark = prog->size();
449     start_unanchored_ = prog->start_unanchored();
450   }
451   nastack_ = 2 * prog->size() + nmark;
452 
453   // Account for space needed for DFA, q0, q1, astack.
454   mem_budget_ -= sizeof(DFA);
455   mem_budget_ -= (prog_->size() + nmark) *
456                  (sizeof(int)+sizeof(int)) * 2;  // q0, q1
457   mem_budget_ -= nastack_ * sizeof(int);  // astack
458   if (mem_budget_ < 0) {
459     LOG(INFO) << StringPrintf("DFA out of memory: prog size %lld mem %lld",
460                               prog_->size(), max_mem);
461     init_failed_ = true;
462     return;
463   }
464 
465   state_budget_ = mem_budget_;
466 
467   // Make sure there is a reasonable amount of working room left.
468   // At minimum, the search requires room for two states in order
469   // to limp along, restarting frequently.  We'll get better performance
470   // if there is room for a larger number of states, say 20.
471   int64 one_state = sizeof(State) + (prog_->size()+nmark)*sizeof(int) +
472                     (prog_->bytemap_range()+1)*sizeof(State*);
473   if (state_budget_ < 20*one_state) {
474     LOG(INFO) << StringPrintf("DFA out of memory: prog size %lld mem %lld",
475                               prog_->size(), max_mem);
476     init_failed_ = true;
477     return;
478   }
479 
480   q0_ = new Workq(prog->size(), nmark);
481   q1_ = new Workq(prog->size(), nmark);
482   astack_ = new int[nastack_];
483 }
484 
~DFA()485 DFA::~DFA() {
486   delete q0_;
487   delete q1_;
488   delete[] astack_;
489   ClearCache();
490 }
491 
492 // In the DFA state graph, s->next[c] == NULL means that the
493 // state has not yet been computed and needs to be.  We need
494 // a different special value to signal that s->next[c] is a
495 // state that can never lead to a match (and thus the search
496 // can be called off).  Hence DeadState.
497 #define DeadState reinterpret_cast<State*>(1)
498 
499 // Signals that the rest of the string matches no matter what it is.
500 #define FullMatchState reinterpret_cast<State*>(2)
501 
502 #define SpecialStateMax FullMatchState
503 
504 // Debugging printouts
505 
506 // For debugging, returns a string representation of the work queue.
DumpWorkq(Workq * q)507 string DFA::DumpWorkq(Workq* q) {
508   string s;
509   const char* sep = "";
510   for (DFA::Workq::iterator it = q->begin(); it != q->end(); ++it) {
511     if (q->is_mark(*it)) {
512       StringAppendF(&s, "|");
513       sep = "";
514     } else {
515       StringAppendF(&s, "%s%d", sep, *it);
516       sep = ",";
517     }
518   }
519   return s;
520 }
521 
522 // For debugging, returns a string representation of the state.
DumpState(State * state)523 string DFA::DumpState(State* state) {
524   if (state == NULL)
525     return "_";
526   if (state == DeadState)
527     return "X";
528   if (state == FullMatchState)
529     return "*";
530   string s;
531   const char* sep = "";
532   StringAppendF(&s, "(%p)", state);
533   for (int i = 0; i < state->ninst_; i++) {
534     if (state->inst_[i] == Mark) {
535       StringAppendF(&s, "|");
536       sep = "";
537     } else {
538       StringAppendF(&s, "%s%d", sep, state->inst_[i]);
539       sep = ",";
540     }
541   }
542   StringAppendF(&s, " flag=%#x", state->flag_);
543   return s;
544 }
545 
546 //////////////////////////////////////////////////////////////////////
547 //
548 // DFA state graph construction.
549 //
550 // The DFA state graph is a heavily-linked collection of State* structures.
551 // The state_cache_ is a set of all the State structures ever allocated,
552 // so that if the same state is reached by two different paths,
553 // the same State structure can be used.  This reduces allocation
554 // requirements and also avoids duplication of effort across the two
555 // identical states.
556 //
557 // A State is defined by an ordered list of instruction ids and a flag word.
558 //
559 // The choice of an ordered list of instructions differs from a typical
560 // textbook DFA implementation, which would use an unordered set.
561 // Textbook descriptions, however, only care about whether
562 // the DFA matches, not where it matches in the text.  To decide where the
563 // DFA matches, we need to mimic the behavior of the dominant backtracking
564 // implementations like PCRE, which try one possible regular expression
565 // execution, then another, then another, stopping when one of them succeeds.
566 // The DFA execution tries these many executions in parallel, representing
567 // each by an instruction id.  These pointers are ordered in the State.inst_
568 // list in the same order that the executions would happen in a backtracking
569 // search: if a match is found during execution of inst_[2], inst_[i] for i>=3
570 // can be discarded.
571 //
572 // Textbooks also typically do not consider context-aware empty string operators
573 // like ^ or $.  These are handled by the flag word, which specifies the set
574 // of empty-string operators that should be matched when executing at the
575 // current text position.  These flag bits are defined in prog.h.
576 // The flag word also contains two DFA-specific bits: kFlagMatch if the state
577 // is a matching state (one that reached a kInstMatch in the program)
578 // and kFlagLastWord if the last processed byte was a word character, for the
579 // implementation of \B and \b.
580 //
581 // The flag word also contains, shifted up 16 bits, the bits looked for by
582 // any kInstEmptyWidth instructions in the state.  These provide a useful
583 // summary indicating when new flags might be useful.
584 //
585 // The permanent representation of a State's instruction ids is just an array,
586 // but while a state is being analyzed, these instruction ids are represented
587 // as a Workq, which is an array that allows iteration in insertion order.
588 
589 // NOTE(rsc): The choice of State construction determines whether the DFA
590 // mimics backtracking implementations (so-called leftmost first matching) or
591 // traditional DFA implementations (so-called leftmost longest matching as
592 // prescribed by POSIX).  This implementation chooses to mimic the
593 // backtracking implementations, because we want to replace PCRE.  To get
594 // POSIX behavior, the states would need to be considered not as a simple
595 // ordered list of instruction ids, but as a list of unordered sets of instruction
596 // ids.  A match by a state in one set would inhibit the running of sets
597 // farther down the list but not other instruction ids in the same set.  Each
598 // set would correspond to matches beginning at a given point in the string.
599 // This is implemented by separating different sets with Mark pointers.
600 
601 // Looks in the State cache for a State matching q, flag.
602 // If one is found, returns it.  If one is not found, allocates one,
603 // inserts it in the cache, and returns it.
WorkqToCachedState(Workq * q,uint flag)604 DFA::State* DFA::WorkqToCachedState(Workq* q, uint flag) {
605   if (DEBUG_MODE)
606     mutex_.AssertHeld();
607 
608   // Construct array of instruction ids for the new state.
609   // Only ByteRange, EmptyWidth, and Match instructions are useful to keep:
610   // those are the only operators with any effect in
611   // RunWorkqOnEmptyString or RunWorkqOnByte.
612   int* inst = new int[q->size()];
613   int n = 0;
614   uint needflags = 0;     // flags needed by kInstEmptyWidth instructions
615   bool sawmatch = false;  // whether queue contains guaranteed kInstMatch
616   bool sawmark = false;  // whether queue contains a Mark
617   if (DebugDFA)
618     fprintf(stderr, "WorkqToCachedState %s [%#x]", DumpWorkq(q).c_str(), flag);
619   for (Workq::iterator it = q->begin(); it != q->end(); ++it) {
620     int id = *it;
621     if (sawmatch && (kind_ == Prog::kFirstMatch || q->is_mark(id)))
622       break;
623     if (q->is_mark(id)) {
624       if (n > 0 && inst[n-1] != Mark) {
625         sawmark = true;
626         inst[n++] = Mark;
627       }
628       continue;
629     }
630     Prog::Inst* ip = prog_->inst(id);
631     switch (ip->opcode()) {
632       case kInstAltMatch:
633         // This state will continue to a match no matter what
634         // the rest of the input is.  If it is the highest priority match
635         // being considered, return the special FullMatchState
636         // to indicate that it's all matches from here out.
637         if (kind_ != Prog::kManyMatch &&
638             (kind_ != Prog::kFirstMatch ||
639              (it == q->begin() && ip->greedy(prog_))) &&
640             (kind_ != Prog::kLongestMatch || !sawmark) &&
641             (flag & kFlagMatch)) {
642           delete[] inst;
643           if (DebugDFA)
644             fprintf(stderr, " -> FullMatchState\n");
645           return FullMatchState;
646         }
647         // Fall through.
648       case kInstByteRange:    // These are useful.
649       case kInstEmptyWidth:
650       case kInstMatch:
651       case kInstAlt:          // Not useful, but necessary [*]
652         inst[n++] = *it;
653         if (ip->opcode() == kInstEmptyWidth)
654           needflags |= ip->empty();
655         if (ip->opcode() == kInstMatch && !prog_->anchor_end())
656           sawmatch = true;
657         break;
658 
659       default:                // The rest are not.
660         break;
661     }
662 
663     // [*] kInstAlt would seem useless to record in a state, since
664     // we've already followed both its arrows and saved all the
665     // interesting states we can reach from there.  The problem
666     // is that one of the empty-width instructions might lead
667     // back to the same kInstAlt (if an empty-width operator is starred),
668     // producing a different evaluation order depending on whether
669     // we keep the kInstAlt to begin with.  Sigh.
670     // A specific case that this affects is /(^|a)+/ matching "a".
671     // If we don't save the kInstAlt, we will match the whole "a" (0,1)
672     // but in fact the correct leftmost-first match is the leading "" (0,0).
673   }
674   DCHECK_LE(n, q->size());
675   if (n > 0 && inst[n-1] == Mark)
676     n--;
677 
678   // If there are no empty-width instructions waiting to execute,
679   // then the extra flag bits will not be used, so there is no
680   // point in saving them.  (Discarding them reduces the number
681   // of distinct states.)
682   if (needflags == 0)
683     flag &= kFlagMatch;
684 
685   // NOTE(rsc): The code above cannot do flag &= needflags,
686   // because if the right flags were present to pass the current
687   // kInstEmptyWidth instructions, new kInstEmptyWidth instructions
688   // might be reached that in turn need different flags.
689   // The only sure thing is that if there are no kInstEmptyWidth
690   // instructions at all, no flags will be needed.
691   // We could do the extra work to figure out the full set of
692   // possibly needed flags by exploring past the kInstEmptyWidth
693   // instructions, but the check above -- are any flags needed
694   // at all? -- handles the most common case.  More fine-grained
695   // analysis can only be justified by measurements showing that
696   // too many redundant states are being allocated.
697 
698   // If there are no Insts in the list, it's a dead state,
699   // which is useful to signal with a special pointer so that
700   // the execution loop can stop early.  This is only okay
701   // if the state is *not* a matching state.
702   if (n == 0 && flag == 0) {
703     delete[] inst;
704     if (DebugDFA)
705       fprintf(stderr, " -> DeadState\n");
706     return DeadState;
707   }
708 
709   // If we're in longest match mode, the state is a sequence of
710   // unordered state sets separated by Marks.  Sort each set
711   // to canonicalize, to reduce the number of distinct sets stored.
712   if (kind_ == Prog::kLongestMatch) {
713     int* ip = inst;
714     int* ep = ip + n;
715     while (ip < ep) {
716       int* markp = ip;
717       while (markp < ep && *markp != Mark)
718         markp++;
719       sort(ip, markp);
720       if (markp < ep)
721         markp++;
722       ip = markp;
723     }
724   }
725 
726   // Save the needed empty-width flags in the top bits for use later.
727   flag |= needflags << kFlagNeedShift;
728 
729   State* state = CachedState(inst, n, flag);
730   delete[] inst;
731   return state;
732 }
733 
734 // Looks in the State cache for a State matching inst, ninst, flag.
735 // If one is found, returns it.  If one is not found, allocates one,
736 // inserts it in the cache, and returns it.
CachedState(int * inst,int ninst,uint flag)737 DFA::State* DFA::CachedState(int* inst, int ninst, uint flag) {
738   if (DEBUG_MODE)
739     mutex_.AssertHeld();
740 
741   // Look in the cache for a pre-existing state.
742   State state = { inst, ninst, flag, NULL };
743   StateSet::iterator it = state_cache_.find(&state);
744   if (it != state_cache_.end()) {
745     if (DebugDFA)
746       fprintf(stderr, " -cached-> %s\n", DumpState(*it).c_str());
747     return *it;
748   }
749 
750   // Must have enough memory for new state.
751   // In addition to what we're going to allocate,
752   // the state cache hash table seems to incur about 32 bytes per
753   // State*, empirically.
754   const int kStateCacheOverhead = 32;
755   int nnext = prog_->bytemap_range() + 1;  // + 1 for kByteEndText slot
756   int mem = sizeof(State) + nnext*sizeof(State*) + ninst*sizeof(int);
757   if (mem_budget_ < mem + kStateCacheOverhead) {
758     mem_budget_ = -1;
759     return NULL;
760   }
761   mem_budget_ -= mem + kStateCacheOverhead;
762 
763   // Allocate new state, along with room for next and inst.
764   char* space = new char[mem];
765   State* s = reinterpret_cast<State*>(space);
766   s->next_ = reinterpret_cast<State**>(s + 1);
767   s->inst_ = reinterpret_cast<int*>(s->next_ + nnext);
768   memset(s->next_, 0, nnext*sizeof s->next_[0]);
769   memmove(s->inst_, inst, ninst*sizeof s->inst_[0]);
770   s->ninst_ = ninst;
771   s->flag_ = flag;
772   if (DebugDFA)
773     fprintf(stderr, " -> %s\n", DumpState(s).c_str());
774 
775   // Put state in cache and return it.
776   state_cache_.insert(s);
777   return s;
778 }
779 
780 // Clear the cache.  Must hold cache_mutex_.w or be in destructor.
ClearCache()781 void DFA::ClearCache() {
782   // In case state_cache_ doesn't support deleting entries
783   // during iteration, copy into a vector and then delete.
784   vector<State*> v;
785   v.reserve(state_cache_.size());
786   for (StateSet::iterator it = state_cache_.begin();
787        it != state_cache_.end(); ++it)
788     v.push_back(*it);
789   state_cache_.clear();
790   for (int i = 0; i < v.size(); i++)
791     delete[] reinterpret_cast<const char*>(v[i]);
792 }
793 
794 // Copies insts in state s to the work queue q.
StateToWorkq(State * s,Workq * q)795 void DFA::StateToWorkq(State* s, Workq* q) {
796   q->clear();
797   for (int i = 0; i < s->ninst_; i++) {
798     if (s->inst_[i] == Mark)
799       q->mark();
800     else
801       q->insert_new(s->inst_[i]);
802   }
803 }
804 
805 // Adds ip to the work queue, following empty arrows according to flag
806 // and expanding kInstAlt instructions (two-target gotos).
AddToQueue(Workq * q,int id,uint flag)807 void DFA::AddToQueue(Workq* q, int id, uint flag) {
808 
809   // Use astack_ to hold our stack of states yet to process.
810   // It is sized to have room for nastack_ == 2*prog->size() + nmark
811   // instructions, which is enough: each instruction can be
812   // processed by the switch below only once, and the processing
813   // pushes at most two instructions plus maybe a mark.
814   // (If we're using marks, nmark == prog->size(); otherwise nmark == 0.)
815   int* stk = astack_;
816   int nstk = 0;
817 
818   stk[nstk++] = id;
819   while (nstk > 0) {
820     DCHECK_LE(nstk, nastack_);
821     id = stk[--nstk];
822 
823     if (id == Mark) {
824       q->mark();
825       continue;
826     }
827 
828     if (id == 0)
829       continue;
830 
831     // If ip is already on the queue, nothing to do.
832     // Otherwise add it.  We don't actually keep all the ones
833     // that get added -- for example, kInstAlt is ignored
834     // when on a work queue -- but adding all ip's here
835     // increases the likelihood of q->contains(id),
836     // reducing the amount of duplicated work.
837     if (q->contains(id))
838       continue;
839     q->insert_new(id);
840 
841     // Process instruction.
842     Prog::Inst* ip = prog_->inst(id);
843     switch (ip->opcode()) {
844       case kInstFail:       // can't happen: discarded above
845         break;
846 
847       case kInstByteRange:  // just save these on the queue
848       case kInstMatch:
849         break;
850 
851       case kInstCapture:    // DFA treats captures as no-ops.
852       case kInstNop:
853         stk[nstk++] = ip->out();
854         break;
855 
856       case kInstAlt:        // two choices: expand both, in order
857       case kInstAltMatch:
858         // Want to visit out then out1, so push on stack in reverse order.
859         // This instruction is the [00-FF]* loop at the beginning of
860         // a leftmost-longest unanchored search, separate out from out1
861         // with a Mark, so that out1's threads (which will start farther
862         // to the right in the string being searched) are lower priority
863         // than the current ones.
864         stk[nstk++] = ip->out1();
865         if (q->maxmark() > 0 &&
866             id == prog_->start_unanchored() && id != prog_->start())
867           stk[nstk++] = Mark;
868         stk[nstk++] = ip->out();
869         break;
870 
871       case kInstEmptyWidth:
872         if ((ip->empty() & flag) == ip->empty())
873           stk[nstk++] = ip->out();
874         break;
875     }
876   }
877 }
878 
879 // Running of work queues.  In the work queue, order matters:
880 // the queue is sorted in priority order.  If instruction i comes before j,
881 // then the instructions that i produces during the run must come before
882 // the ones that j produces.  In order to keep this invariant, all the
883 // work queue runners have to take an old queue to process and then
884 // also a new queue to fill in.  It's not acceptable to add to the end of
885 // an existing queue, because new instructions will not end up in the
886 // correct position.
887 
888 // Runs the work queue, processing the empty strings indicated by flag.
889 // For example, flag == kEmptyBeginLine|kEmptyEndLine means to match
890 // both ^ and $.  It is important that callers pass all flags at once:
891 // processing both ^ and $ is not the same as first processing only ^
892 // and then processing only $.  Doing the two-step sequence won't match
893 // ^$^$^$ but processing ^ and $ simultaneously will (and is the behavior
894 // exhibited by existing implementations).
RunWorkqOnEmptyString(Workq * oldq,Workq * newq,uint flag)895 void DFA::RunWorkqOnEmptyString(Workq* oldq, Workq* newq, uint flag) {
896   newq->clear();
897   for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) {
898     if (oldq->is_mark(*i))
899       AddToQueue(newq, Mark, flag);
900     else
901       AddToQueue(newq, *i, flag);
902   }
903 }
904 
905 // Runs the work queue, processing the single byte c followed by any empty
906 // strings indicated by flag.  For example, c == 'a' and flag == kEmptyEndLine,
907 // means to match c$.  Sets the bool *ismatch to true if the end of the
908 // regular expression program has been reached (the regexp has matched).
RunWorkqOnByte(Workq * oldq,Workq * newq,int c,uint flag,bool * ismatch,Prog::MatchKind kind,int new_byte_loop)909 void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq,
910                          int c, uint flag, bool* ismatch,
911                          Prog::MatchKind kind,
912                          int new_byte_loop) {
913   if (DEBUG_MODE)
914     mutex_.AssertHeld();
915 
916   newq->clear();
917   for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) {
918     if (oldq->is_mark(*i)) {
919       if (*ismatch)
920         return;
921       newq->mark();
922       continue;
923     }
924     int id = *i;
925     Prog::Inst* ip = prog_->inst(id);
926     switch (ip->opcode()) {
927       case kInstFail:        // never succeeds
928       case kInstCapture:     // already followed
929       case kInstNop:         // already followed
930       case kInstAlt:         // already followed
931       case kInstAltMatch:    // already followed
932       case kInstEmptyWidth:  // already followed
933         break;
934 
935       case kInstByteRange:   // can follow if c is in range
936         if (ip->Matches(c))
937           AddToQueue(newq, ip->out(), flag);
938         break;
939 
940       case kInstMatch:
941         if (prog_->anchor_end() && c != kByteEndText)
942           break;
943         *ismatch = true;
944         if (kind == Prog::kFirstMatch) {
945           // Can stop processing work queue since we found a match.
946           return;
947         }
948         break;
949     }
950   }
951 
952   if (DebugDFA)
953     fprintf(stderr, "%s on %d[%#x] -> %s [%d]\n", DumpWorkq(oldq).c_str(),
954             c, flag, DumpWorkq(newq).c_str(), *ismatch);
955 }
956 
957 // Processes input byte c in state, returning new state.
958 // Caller does not hold mutex.
RunStateOnByteUnlocked(State * state,int c)959 DFA::State* DFA::RunStateOnByteUnlocked(State* state, int c) {
960   // Keep only one RunStateOnByte going
961   // even if the DFA is being run by multiple threads.
962   MutexLock l(&mutex_);
963   return RunStateOnByte(state, c);
964 }
965 
966 // Processes input byte c in state, returning new state.
RunStateOnByte(State * state,int c)967 DFA::State* DFA::RunStateOnByte(State* state, int c) {
968   if (DEBUG_MODE)
969     mutex_.AssertHeld();
970   if (state <= SpecialStateMax) {
971     if (state == FullMatchState) {
972       // It is convenient for routines like PossibleMatchRange
973       // if we implement RunStateOnByte for FullMatchState:
974       // once you get into this state you never get out,
975       // so it's pretty easy.
976       return FullMatchState;
977     }
978     if (state == DeadState) {
979       LOG(DFATAL) << "DeadState in RunStateOnByte";
980       return NULL;
981     }
982     if (state == NULL) {
983       LOG(DFATAL) << "NULL state in RunStateOnByte";
984       return NULL;
985     }
986     LOG(DFATAL) << "Unexpected special state in RunStateOnByte";
987     return NULL;
988   }
989 
990   // If someone else already computed this, return it.
991   MaybeReadMemoryBarrier(); // On alpha we need to ensure read ordering
992   State* ns = state->next_[ByteMap(c)];
993   ANNOTATE_HAPPENS_AFTER(ns);
994   if (ns != NULL)
995     return ns;
996 
997   // Convert state into Workq.
998   StateToWorkq(state, q0_);
999 
1000   // Flags marking the kinds of empty-width things (^ $ etc)
1001   // around this byte.  Before the byte we have the flags recorded
1002   // in the State structure itself.  After the byte we have
1003   // nothing yet (but that will change: read on).
1004   uint needflag = state->flag_ >> kFlagNeedShift;
1005   uint beforeflag = state->flag_ & kFlagEmptyMask;
1006   uint oldbeforeflag = beforeflag;
1007   uint afterflag = 0;
1008 
1009   if (c == '\n') {
1010     // Insert implicit $ and ^ around \n
1011     beforeflag |= kEmptyEndLine;
1012     afterflag |= kEmptyBeginLine;
1013   }
1014 
1015   if (c == kByteEndText) {
1016     // Insert implicit $ and \z before the fake "end text" byte.
1017     beforeflag |= kEmptyEndLine | kEmptyEndText;
1018   }
1019 
1020   // The state flag kFlagLastWord says whether the last
1021   // byte processed was a word character.  Use that info to
1022   // insert empty-width (non-)word boundaries.
1023   bool islastword = state->flag_ & kFlagLastWord;
1024   bool isword = (c != kByteEndText && Prog::IsWordChar(c));
1025   if (isword == islastword)
1026     beforeflag |= kEmptyNonWordBoundary;
1027   else
1028     beforeflag |= kEmptyWordBoundary;
1029 
1030   // Okay, finally ready to run.
1031   // Only useful to rerun on empty string if there are new, useful flags.
1032   if (beforeflag & ~oldbeforeflag & needflag) {
1033     RunWorkqOnEmptyString(q0_, q1_, beforeflag);
1034     swap(q0_, q1_);
1035   }
1036   bool ismatch = false;
1037   RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch, kind_, start_unanchored_);
1038 
1039   // Most of the time, we build the state from the output of
1040   // RunWorkqOnByte, so swap q0_ and q1_ here.  However, so that
1041   // RE2::Set can tell exactly which match instructions
1042   // contributed to the match, don't swap if c is kByteEndText.
1043   // The resulting state wouldn't be correct for further processing
1044   // of the string, but we're at the end of the text so that's okay.
1045   // Leaving q0_ alone preseves the match instructions that led to
1046   // the current setting of ismatch.
1047   if (c != kByteEndText || kind_ != Prog::kManyMatch)
1048     swap(q0_, q1_);
1049 
1050   // Save afterflag along with ismatch and isword in new state.
1051   uint flag = afterflag;
1052   if (ismatch)
1053     flag |= kFlagMatch;
1054   if (isword)
1055     flag |= kFlagLastWord;
1056 
1057   ns = WorkqToCachedState(q0_, flag);
1058 
1059   // Write barrier before updating state->next_ so that the
1060   // main search loop can proceed without any locking, for speed.
1061   // (Otherwise it would need one mutex operation per input byte.)
1062   // The annotations below tell race detectors that:
1063   //   a) the access to next_ should be ignored,
1064   //   b) 'ns' is properly published.
1065   WriteMemoryBarrier();  // Flush ns before linking to it.
1066 
1067   ANNOTATE_IGNORE_WRITES_BEGIN();
1068   ANNOTATE_HAPPENS_BEFORE(ns);
1069   state->next_[ByteMap(c)] = ns;
1070   ANNOTATE_IGNORE_WRITES_END();
1071   return ns;
1072 }
1073 
1074 
1075 //////////////////////////////////////////////////////////////////////
1076 // DFA cache reset.
1077 
1078 // Reader-writer lock helper.
1079 //
1080 // The DFA uses a reader-writer mutex to protect the state graph itself.
1081 // Traversing the state graph requires holding the mutex for reading,
1082 // and discarding the state graph and starting over requires holding the
1083 // lock for writing.  If a search needs to expand the graph but is out
1084 // of memory, it will need to drop its read lock and then acquire the
1085 // write lock.  Since it cannot then atomically downgrade from write lock
1086 // to read lock, it runs the rest of the search holding the write lock.
1087 // (This probably helps avoid repeated contention, but really the decision
1088 // is forced by the Mutex interface.)  It's a bit complicated to keep
1089 // track of whether the lock is held for reading or writing and thread
1090 // that through the search, so instead we encapsulate it in the RWLocker
1091 // and pass that around.
1092 
1093 class DFA::RWLocker {
1094  public:
1095   explicit RWLocker(Mutex* mu);
1096   ~RWLocker();
1097 
1098   // If the lock is only held for reading right now,
1099   // drop the read lock and re-acquire for writing.
1100   // Subsequent calls to LockForWriting are no-ops.
1101   // Notice that the lock is *released* temporarily.
1102   void LockForWriting();
1103 
1104   // Returns whether the lock is already held for writing.
IsLockedForWriting()1105   bool IsLockedForWriting() {
1106     return writing_;
1107   }
1108 
1109  private:
1110   Mutex* mu_;
1111   bool writing_;
1112 
1113   DISALLOW_EVIL_CONSTRUCTORS(RWLocker);
1114 };
1115 
RWLocker(Mutex * mu)1116 DFA::RWLocker::RWLocker(Mutex* mu)
1117   : mu_(mu), writing_(false) {
1118 
1119   mu_->ReaderLock();
1120 }
1121 
1122 // This function is marked as NO_THREAD_SAFETY_ANALYSIS because the annotations
1123 // does not support lock upgrade.
LockForWriting()1124 void DFA::RWLocker::LockForWriting() NO_THREAD_SAFETY_ANALYSIS {
1125   if (!writing_) {
1126     mu_->ReaderUnlock();
1127     mu_->Lock();
1128     writing_ = true;
1129   }
1130 }
1131 
~RWLocker()1132 DFA::RWLocker::~RWLocker() {
1133   if (writing_)
1134     mu_->WriterUnlock();
1135   else
1136     mu_->ReaderUnlock();
1137 }
1138 
1139 
1140 // When the DFA's State cache fills, we discard all the states in the
1141 // cache and start over.  Many threads can be using and adding to the
1142 // cache at the same time, so we synchronize using the cache_mutex_
1143 // to keep from stepping on other threads.  Specifically, all the
1144 // threads using the current cache hold cache_mutex_ for reading.
1145 // When a thread decides to flush the cache, it drops cache_mutex_
1146 // and then re-acquires it for writing.  That ensures there are no
1147 // other threads accessing the cache anymore.  The rest of the search
1148 // runs holding cache_mutex_ for writing, avoiding any contention
1149 // with or cache pollution caused by other threads.
1150 
ResetCache(RWLocker * cache_lock)1151 void DFA::ResetCache(RWLocker* cache_lock) {
1152   // Re-acquire the cache_mutex_ for writing (exclusive use).
1153   bool was_writing = cache_lock->IsLockedForWriting();
1154   cache_lock->LockForWriting();
1155 
1156   // If we already held cache_mutex_ for writing, it means
1157   // this invocation of Search() has already reset the
1158   // cache once already.  That's a pretty clear indication
1159   // that the cache is too small.  Warn about that, once.
1160   // TODO(rsc): Only warn if state_cache_.size() < some threshold.
1161   if (was_writing && !cache_warned_) {
1162     LOG(INFO) << "DFA memory cache could be too small: "
1163               << "only room for " << state_cache_.size() << " states.";
1164     cache_warned_ = true;
1165   }
1166 
1167   // Clear the cache, reset the memory budget.
1168   for (int i = 0; i < kMaxStart; i++) {
1169     start_[i].start = NULL;
1170     start_[i].firstbyte = kFbUnknown;
1171   }
1172   ClearCache();
1173   mem_budget_ = state_budget_;
1174 }
1175 
1176 // Typically, a couple States do need to be preserved across a cache
1177 // reset, like the State at the current point in the search.
1178 // The StateSaver class helps keep States across cache resets.
1179 // It makes a copy of the state's guts outside the cache (before the reset)
1180 // and then can be asked, after the reset, to recreate the State
1181 // in the new cache.  For example, in a DFA method ("this" is a DFA):
1182 //
1183 //   StateSaver saver(this, s);
1184 //   ResetCache(cache_lock);
1185 //   s = saver.Restore();
1186 //
1187 // The saver should always have room in the cache to re-create the state,
1188 // because resetting the cache locks out all other threads, and the cache
1189 // is known to have room for at least a couple states (otherwise the DFA
1190 // constructor fails).
1191 
1192 class DFA::StateSaver {
1193  public:
1194   explicit StateSaver(DFA* dfa, State* state);
1195   ~StateSaver();
1196 
1197   // Recreates and returns a state equivalent to the
1198   // original state passed to the constructor.
1199   // Returns NULL if the cache has filled, but
1200   // since the DFA guarantees to have room in the cache
1201   // for a couple states, should never return NULL
1202   // if used right after ResetCache.
1203   State* Restore();
1204 
1205  private:
1206   DFA* dfa_;         // the DFA to use
1207   int* inst_;        // saved info from State
1208   int ninst_;
1209   uint flag_;
1210   bool is_special_;  // whether original state was special
1211   State* special_;   // if is_special_, the original state
1212 
1213   DISALLOW_EVIL_CONSTRUCTORS(StateSaver);
1214 };
1215 
StateSaver(DFA * dfa,State * state)1216 DFA::StateSaver::StateSaver(DFA* dfa, State* state) {
1217   dfa_ = dfa;
1218   if (state <= SpecialStateMax) {
1219     inst_ = NULL;
1220     ninst_ = 0;
1221     flag_ = 0;
1222     is_special_ = true;
1223     special_ = state;
1224     return;
1225   }
1226   is_special_ = false;
1227   special_ = NULL;
1228   flag_ = state->flag_;
1229   ninst_ = state->ninst_;
1230   inst_ = new int[ninst_];
1231   memmove(inst_, state->inst_, ninst_*sizeof inst_[0]);
1232 }
1233 
~StateSaver()1234 DFA::StateSaver::~StateSaver() {
1235   if (!is_special_)
1236     delete[] inst_;
1237 }
1238 
Restore()1239 DFA::State* DFA::StateSaver::Restore() {
1240   if (is_special_)
1241     return special_;
1242   MutexLock l(&dfa_->mutex_);
1243   State* s = dfa_->CachedState(inst_, ninst_, flag_);
1244   if (s == NULL)
1245     LOG(DFATAL) << "StateSaver failed to restore state.";
1246   return s;
1247 }
1248 
1249 
1250 //////////////////////////////////////////////////////////////////////
1251 //
1252 // DFA execution.
1253 //
1254 // The basic search loop is easy: start in a state s and then for each
1255 // byte c in the input, s = s->next[c].
1256 //
1257 // This simple description omits a few efficiency-driven complications.
1258 //
1259 // First, the State graph is constructed incrementally: it is possible
1260 // that s->next[c] is null, indicating that that state has not been
1261 // fully explored.  In this case, RunStateOnByte must be invoked to
1262 // determine the next state, which is cached in s->next[c] to save
1263 // future effort.  An alternative reason for s->next[c] to be null is
1264 // that the DFA has reached a so-called "dead state", in which any match
1265 // is no longer possible.  In this case RunStateOnByte will return NULL
1266 // and the processing of the string can stop early.
1267 //
1268 // Second, a 256-element pointer array for s->next_ makes each State
1269 // quite large (2kB on 64-bit machines).  Instead, dfa->bytemap_[]
1270 // maps from bytes to "byte classes" and then next_ only needs to have
1271 // as many pointers as there are byte classes.  A byte class is simply a
1272 // range of bytes that the regexp never distinguishes between.
1273 // A regexp looking for a[abc] would have four byte ranges -- 0 to 'a'-1,
1274 // 'a', 'b' to 'c', and 'c' to 0xFF.  The bytemap slows us a little bit
1275 // but in exchange we typically cut the size of a State (and thus our
1276 // memory footprint) by about 5-10x.  The comments still refer to
1277 // s->next[c] for simplicity, but code should refer to s->next_[bytemap_[c]].
1278 //
1279 // Third, it is common for a DFA for an unanchored match to begin in a
1280 // state in which only one particular byte value can take the DFA to a
1281 // different state.  That is, s->next[c] != s for only one c.  In this
1282 // situation, the DFA can do better than executing the simple loop.
1283 // Instead, it can call memchr to search very quickly for the byte c.
1284 // Whether the start state has this property is determined during a
1285 // pre-compilation pass, and if so, the byte b is passed to the search
1286 // loop as the "firstbyte" argument, along with a boolean "have_firstbyte".
1287 //
1288 // Fourth, the desired behavior is to search for the leftmost-best match
1289 // (approximately, the same one that Perl would find), which is not
1290 // necessarily the match ending earliest in the string.  Each time a
1291 // match is found, it must be noted, but the DFA must continue on in
1292 // hope of finding a higher-priority match.  In some cases, the caller only
1293 // cares whether there is any match at all, not which one is found.
1294 // The "want_earliest_match" flag causes the search to stop at the first
1295 // match found.
1296 //
1297 // Fifth, one algorithm that uses the DFA needs it to run over the
1298 // input string backward, beginning at the end and ending at the beginning.
1299 // Passing false for the "run_forward" flag causes the DFA to run backward.
1300 //
1301 // The checks for these last three cases, which in a naive implementation
1302 // would be performed once per input byte, slow the general loop enough
1303 // to merit specialized versions of the search loop for each of the
1304 // eight possible settings of the three booleans.  Rather than write
1305 // eight different functions, we write one general implementation and then
1306 // inline it to create the specialized ones.
1307 //
1308 // Note that matches are delayed by one byte, to make it easier to
1309 // accomodate match conditions depending on the next input byte (like $ and \b).
1310 // When s->next[c]->IsMatch(), it means that there is a match ending just
1311 // *before* byte c.
1312 
1313 // The generic search loop.  Searches text for a match, returning
1314 // the pointer to the end of the chosen match, or NULL if no match.
1315 // The bools are equal to the same-named variables in params, but
1316 // making them function arguments lets the inliner specialize
1317 // this function to each combination (see two paragraphs above).
InlinedSearchLoop(SearchParams * params,bool have_firstbyte,bool want_earliest_match,bool run_forward)1318 inline bool DFA::InlinedSearchLoop(SearchParams* params,
1319                                    bool have_firstbyte,
1320                                    bool want_earliest_match,
1321                                    bool run_forward) {
1322   State* start = params->start;
1323   const uint8* bp = BytePtr(params->text.begin());  // start of text
1324   const uint8* p = bp;                              // text scanning point
1325   const uint8* ep = BytePtr(params->text.end());    // end of text
1326   const uint8* resetp = NULL;                       // p at last cache reset
1327   if (!run_forward)
1328     swap(p, ep);
1329 
1330   const uint8* bytemap = prog_->bytemap();
1331   const uint8* lastmatch = NULL;   // most recent matching position in text
1332   bool matched = false;
1333   State* s = start;
1334 
1335   if (s->IsMatch()) {
1336     matched = true;
1337     lastmatch = p;
1338     if (want_earliest_match) {
1339       params->ep = reinterpret_cast<const char*>(lastmatch);
1340       return true;
1341     }
1342   }
1343 
1344   while (p != ep) {
1345     if (DebugDFA)
1346       fprintf(stderr, "@%d: %s\n", static_cast<int>(p - bp),
1347               DumpState(s).c_str());
1348     if (have_firstbyte && s == start) {
1349       // In start state, only way out is to find firstbyte,
1350       // so use optimized assembly in memchr to skip ahead.
1351       // If firstbyte isn't found, we can skip to the end
1352       // of the string.
1353       if (run_forward) {
1354         if ((p = BytePtr(memchr(p, params->firstbyte, ep - p))) == NULL) {
1355           p = ep;
1356           break;
1357         }
1358       } else {
1359         if ((p = BytePtr(memrchr(ep, params->firstbyte, p - ep))) == NULL) {
1360           p = ep;
1361           break;
1362         }
1363         p++;
1364       }
1365     }
1366 
1367     int c;
1368     if (run_forward)
1369       c = *p++;
1370     else
1371       c = *--p;
1372 
1373     // Note that multiple threads might be consulting
1374     // s->next_[bytemap[c]] simultaneously.
1375     // RunStateOnByte takes care of the appropriate locking,
1376     // including a memory barrier so that the unlocked access
1377     // (sometimes known as "double-checked locking") is safe.
1378     // The alternative would be either one DFA per thread
1379     // or one mutex operation per input byte.
1380     //
1381     // ns == DeadState means the state is known to be dead
1382     // (no more matches are possible).
1383     // ns == NULL means the state has not yet been computed
1384     // (need to call RunStateOnByteUnlocked).
1385     // RunStateOnByte returns ns == NULL if it is out of memory.
1386     // ns == FullMatchState means the rest of the string matches.
1387     //
1388     // Okay to use bytemap[] not ByteMap() here, because
1389     // c is known to be an actual byte and not kByteEndText.
1390 
1391     MaybeReadMemoryBarrier(); // On alpha we need to ensure read ordering
1392     State* ns = s->next_[bytemap[c]];
1393     ANNOTATE_HAPPENS_AFTER(ns);
1394     if (ns == NULL) {
1395       ns = RunStateOnByteUnlocked(s, c);
1396       if (ns == NULL) {
1397         // After we reset the cache, we hold cache_mutex exclusively,
1398         // so if resetp != NULL, it means we filled the DFA state
1399         // cache with this search alone (without any other threads).
1400         // Benchmarks show that doing a state computation on every
1401         // byte runs at about 0.2 MB/s, while the NFA (nfa.cc) can do the
1402         // same at about 2 MB/s.  Unless we're processing an average
1403         // of 10 bytes per state computation, fail so that RE2 can
1404         // fall back to the NFA.
1405         if (FLAGS_re2_dfa_bail_when_slow && resetp != NULL &&
1406             (p - resetp) < 10*state_cache_.size()) {
1407           params->failed = true;
1408           return false;
1409         }
1410         resetp = p;
1411 
1412         // Prepare to save start and s across the reset.
1413         StateSaver save_start(this, start);
1414         StateSaver save_s(this, s);
1415 
1416         // Discard all the States in the cache.
1417         ResetCache(params->cache_lock);
1418 
1419         // Restore start and s so we can continue.
1420         if ((start = save_start.Restore()) == NULL ||
1421             (s = save_s.Restore()) == NULL) {
1422           // Restore already did LOG(DFATAL).
1423           params->failed = true;
1424           return false;
1425         }
1426         ns = RunStateOnByteUnlocked(s, c);
1427         if (ns == NULL) {
1428           LOG(DFATAL) << "RunStateOnByteUnlocked failed after ResetCache";
1429           params->failed = true;
1430           return false;
1431         }
1432       }
1433     }
1434     if (ns <= SpecialStateMax) {
1435       if (ns == DeadState) {
1436         params->ep = reinterpret_cast<const char*>(lastmatch);
1437         return matched;
1438       }
1439       // FullMatchState
1440       params->ep = reinterpret_cast<const char*>(ep);
1441       return true;
1442     }
1443     s = ns;
1444 
1445     if (s->IsMatch()) {
1446       matched = true;
1447       // The DFA notices the match one byte late,
1448       // so adjust p before using it in the match.
1449       if (run_forward)
1450         lastmatch = p - 1;
1451       else
1452         lastmatch = p + 1;
1453       if (DebugDFA)
1454         fprintf(stderr, "match @%d! [%s]\n",
1455                 static_cast<int>(lastmatch - bp),
1456                 DumpState(s).c_str());
1457 
1458       if (want_earliest_match) {
1459         params->ep = reinterpret_cast<const char*>(lastmatch);
1460         return true;
1461       }
1462     }
1463   }
1464 
1465   // Process one more byte to see if it triggers a match.
1466   // (Remember, matches are delayed one byte.)
1467   int lastbyte;
1468   if (run_forward) {
1469     if (params->text.end() == params->context.end())
1470       lastbyte = kByteEndText;
1471     else
1472       lastbyte = params->text.end()[0] & 0xFF;
1473   } else {
1474     if (params->text.begin() == params->context.begin())
1475       lastbyte = kByteEndText;
1476     else
1477       lastbyte = params->text.begin()[-1] & 0xFF;
1478   }
1479 
1480   MaybeReadMemoryBarrier(); // On alpha we need to ensure read ordering
1481   State* ns = s->next_[ByteMap(lastbyte)];
1482   ANNOTATE_HAPPENS_AFTER(ns);
1483   if (ns == NULL) {
1484     ns = RunStateOnByteUnlocked(s, lastbyte);
1485     if (ns == NULL) {
1486       StateSaver save_s(this, s);
1487       ResetCache(params->cache_lock);
1488       if ((s = save_s.Restore()) == NULL) {
1489         params->failed = true;
1490         return false;
1491       }
1492       ns = RunStateOnByteUnlocked(s, lastbyte);
1493       if (ns == NULL) {
1494         LOG(DFATAL) << "RunStateOnByteUnlocked failed after Reset";
1495         params->failed = true;
1496         return false;
1497       }
1498     }
1499   }
1500   s = ns;
1501   if (DebugDFA)
1502     fprintf(stderr, "@_: %s\n", DumpState(s).c_str());
1503   if (s == FullMatchState) {
1504     params->ep = reinterpret_cast<const char*>(ep);
1505     return true;
1506   }
1507   if (s > SpecialStateMax && s->IsMatch()) {
1508     matched = true;
1509     lastmatch = p;
1510     if (params->matches && kind_ == Prog::kManyMatch) {
1511       vector<int>* v = params->matches;
1512       v->clear();
1513       for (int i = 0; i < s->ninst_; i++) {
1514         Prog::Inst* ip = prog_->inst(s->inst_[i]);
1515         if (ip->opcode() == kInstMatch)
1516           v->push_back(ip->match_id());
1517       }
1518     }
1519     if (DebugDFA)
1520       fprintf(stderr, "match @%d! [%s]\n", static_cast<int>(lastmatch - bp),
1521               DumpState(s).c_str());
1522   }
1523   params->ep = reinterpret_cast<const char*>(lastmatch);
1524   return matched;
1525 }
1526 
1527 // Inline specializations of the general loop.
SearchFFF(SearchParams * params)1528 bool DFA::SearchFFF(SearchParams* params) {
1529   return InlinedSearchLoop(params, 0, 0, 0);
1530 }
SearchFFT(SearchParams * params)1531 bool DFA::SearchFFT(SearchParams* params) {
1532   return InlinedSearchLoop(params, 0, 0, 1);
1533 }
SearchFTF(SearchParams * params)1534 bool DFA::SearchFTF(SearchParams* params) {
1535   return InlinedSearchLoop(params, 0, 1, 0);
1536 }
SearchFTT(SearchParams * params)1537 bool DFA::SearchFTT(SearchParams* params) {
1538   return InlinedSearchLoop(params, 0, 1, 1);
1539 }
SearchTFF(SearchParams * params)1540 bool DFA::SearchTFF(SearchParams* params) {
1541   return InlinedSearchLoop(params, 1, 0, 0);
1542 }
SearchTFT(SearchParams * params)1543 bool DFA::SearchTFT(SearchParams* params) {
1544   return InlinedSearchLoop(params, 1, 0, 1);
1545 }
SearchTTF(SearchParams * params)1546 bool DFA::SearchTTF(SearchParams* params) {
1547   return InlinedSearchLoop(params, 1, 1, 0);
1548 }
SearchTTT(SearchParams * params)1549 bool DFA::SearchTTT(SearchParams* params) {
1550   return InlinedSearchLoop(params, 1, 1, 1);
1551 }
1552 
1553 // For debugging, calls the general code directly.
SlowSearchLoop(SearchParams * params)1554 bool DFA::SlowSearchLoop(SearchParams* params) {
1555   return InlinedSearchLoop(params,
1556                            params->firstbyte >= 0,
1557                            params->want_earliest_match,
1558                            params->run_forward);
1559 }
1560 
1561 // For performance, calls the appropriate specialized version
1562 // of InlinedSearchLoop.
FastSearchLoop(SearchParams * params)1563 bool DFA::FastSearchLoop(SearchParams* params) {
1564   // Because the methods are private, the Searches array
1565   // cannot be declared at top level.
1566   static bool (DFA::*Searches[])(SearchParams*) = {
1567     &DFA::SearchFFF,
1568     &DFA::SearchFFT,
1569     &DFA::SearchFTF,
1570     &DFA::SearchFTT,
1571     &DFA::SearchTFF,
1572     &DFA::SearchTFT,
1573     &DFA::SearchTTF,
1574     &DFA::SearchTTT,
1575   };
1576 
1577   bool have_firstbyte = (params->firstbyte >= 0);
1578   int index = 4 * have_firstbyte +
1579               2 * params->want_earliest_match +
1580               1 * params->run_forward;
1581   return (this->*Searches[index])(params);
1582 }
1583 
1584 
1585 // The discussion of DFA execution above ignored the question of how
1586 // to determine the initial state for the search loop.  There are two
1587 // factors that influence the choice of start state.
1588 //
1589 // The first factor is whether the search is anchored or not.
1590 // The regexp program (Prog*) itself has
1591 // two different entry points: one for anchored searches and one for
1592 // unanchored searches.  (The unanchored version starts with a leading ".*?"
1593 // and then jumps to the anchored one.)
1594 //
1595 // The second factor is where text appears in the larger context, which
1596 // determines which empty-string operators can be matched at the beginning
1597 // of execution.  If text is at the very beginning of context, \A and ^ match.
1598 // Otherwise if text is at the beginning of a line, then ^ matches.
1599 // Otherwise it matters whether the character before text is a word character
1600 // or a non-word character.
1601 //
1602 // The two cases (unanchored vs not) and four cases (empty-string flags)
1603 // combine to make the eight cases recorded in the DFA's begin_text_[2],
1604 // begin_line_[2], after_wordchar_[2], and after_nonwordchar_[2] cached
1605 // StartInfos.  The start state for each is filled in the first time it
1606 // is used for an actual search.
1607 
1608 // Examines text, context, and anchored to determine the right start
1609 // state for the DFA search loop.  Fills in params and returns true on success.
1610 // Returns false on failure.
AnalyzeSearch(SearchParams * params)1611 bool DFA::AnalyzeSearch(SearchParams* params) {
1612   const StringPiece& text = params->text;
1613   const StringPiece& context = params->context;
1614 
1615   // Sanity check: make sure that text lies within context.
1616   if (text.begin() < context.begin() || text.end() > context.end()) {
1617     LOG(DFATAL) << "Text is not inside context.";
1618     params->start = DeadState;
1619     return true;
1620   }
1621 
1622   // Determine correct search type.
1623   int start;
1624   uint flags;
1625   if (params->run_forward) {
1626     if (text.begin() == context.begin()) {
1627       start = kStartBeginText;
1628       flags = kEmptyBeginText|kEmptyBeginLine;
1629     } else if (text.begin()[-1] == '\n') {
1630       start = kStartBeginLine;
1631       flags = kEmptyBeginLine;
1632     } else if (Prog::IsWordChar(text.begin()[-1] & 0xFF)) {
1633       start = kStartAfterWordChar;
1634       flags = kFlagLastWord;
1635     } else {
1636       start = kStartAfterNonWordChar;
1637       flags = 0;
1638     }
1639   } else {
1640     if (text.end() == context.end()) {
1641       start = kStartBeginText;
1642       flags = kEmptyBeginText|kEmptyBeginLine;
1643     } else if (text.end()[0] == '\n') {
1644       start = kStartBeginLine;
1645       flags = kEmptyBeginLine;
1646     } else if (Prog::IsWordChar(text.end()[0] & 0xFF)) {
1647       start = kStartAfterWordChar;
1648       flags = kFlagLastWord;
1649     } else {
1650       start = kStartAfterNonWordChar;
1651       flags = 0;
1652     }
1653   }
1654   if (params->anchored || prog_->anchor_start())
1655     start |= kStartAnchored;
1656   StartInfo* info = &start_[start];
1657 
1658   // Try once without cache_lock for writing.
1659   // Try again after resetting the cache
1660   // (ResetCache will relock cache_lock for writing).
1661   if (!AnalyzeSearchHelper(params, info, flags)) {
1662     ResetCache(params->cache_lock);
1663     if (!AnalyzeSearchHelper(params, info, flags)) {
1664       LOG(DFATAL) << "Failed to analyze start state.";
1665       params->failed = true;
1666       return false;
1667     }
1668   }
1669 
1670   if (DebugDFA)
1671     fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s firstbyte=%d\n",
1672             params->anchored, params->run_forward, flags,
1673             DumpState(info->start).c_str(), info->firstbyte);
1674 
1675   params->start = info->start;
1676   params->firstbyte = ANNOTATE_UNPROTECTED_READ(info->firstbyte);
1677 
1678   return true;
1679 }
1680 
1681 // Fills in info if needed.  Returns true on success, false on failure.
AnalyzeSearchHelper(SearchParams * params,StartInfo * info,uint flags)1682 bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info,
1683                               uint flags) {
1684   // Quick check; okay because of memory barriers below.
1685   if (ANNOTATE_UNPROTECTED_READ(info->firstbyte) != kFbUnknown) {
1686     ANNOTATE_HAPPENS_AFTER(&info->firstbyte);
1687     return true;
1688   }
1689 
1690   MutexLock l(&mutex_);
1691   if (info->firstbyte != kFbUnknown) {
1692     ANNOTATE_HAPPENS_AFTER(&info->firstbyte);
1693     return true;
1694   }
1695 
1696   q0_->clear();
1697   AddToQueue(q0_,
1698              params->anchored ? prog_->start() : prog_->start_unanchored(),
1699              flags);
1700   info->start = WorkqToCachedState(q0_, flags);
1701   if (info->start == NULL)
1702     return false;
1703 
1704   if (info->start == DeadState) {
1705     ANNOTATE_HAPPENS_BEFORE(&info->firstbyte);
1706     WriteMemoryBarrier();  // Synchronize with "quick check" above.
1707     info->firstbyte = kFbNone;
1708     return true;
1709   }
1710 
1711   if (info->start == FullMatchState) {
1712     ANNOTATE_HAPPENS_BEFORE(&info->firstbyte);
1713     WriteMemoryBarrier();  // Synchronize with "quick check" above.
1714     info->firstbyte = kFbNone;	// will be ignored
1715     return true;
1716   }
1717 
1718   // Compute info->firstbyte by running state on all
1719   // possible byte values, looking for a single one that
1720   // leads to a different state.
1721   int firstbyte = kFbNone;
1722   for (int i = 0; i < 256; i++) {
1723     State* s = RunStateOnByte(info->start, i);
1724     if (s == NULL) {
1725       ANNOTATE_HAPPENS_BEFORE(&info->firstbyte);
1726       WriteMemoryBarrier();  // Synchronize with "quick check" above.
1727       info->firstbyte = firstbyte;
1728       return false;
1729     }
1730     if (s == info->start)
1731       continue;
1732     // Goes to new state...
1733     if (firstbyte == kFbNone) {
1734       firstbyte = i;        // ... first one
1735     } else {
1736       firstbyte = kFbMany;  // ... too many
1737       break;
1738     }
1739   }
1740   ANNOTATE_HAPPENS_BEFORE(&info->firstbyte);
1741   WriteMemoryBarrier();  // Synchronize with "quick check" above.
1742   info->firstbyte = firstbyte;
1743   return true;
1744 }
1745 
1746 // The actual DFA search: calls AnalyzeSearch and then FastSearchLoop.
Search(const StringPiece & text,const StringPiece & context,bool anchored,bool want_earliest_match,bool run_forward,bool * failed,const char ** epp,vector<int> * matches)1747 bool DFA::Search(const StringPiece& text,
1748                  const StringPiece& context,
1749                  bool anchored,
1750                  bool want_earliest_match,
1751                  bool run_forward,
1752                  bool* failed,
1753                  const char** epp,
1754                  vector<int>* matches) {
1755   *epp = NULL;
1756   if (!ok()) {
1757     *failed = true;
1758     return false;
1759   }
1760   *failed = false;
1761 
1762   if (DebugDFA) {
1763     fprintf(stderr, "\nprogram:\n%s\n", prog_->DumpUnanchored().c_str());
1764     fprintf(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n",
1765             text.as_string().c_str(), anchored, want_earliest_match,
1766             run_forward, kind_);
1767   }
1768 
1769   RWLocker l(&cache_mutex_);
1770   SearchParams params(text, context, &l);
1771   params.anchored = anchored;
1772   params.want_earliest_match = want_earliest_match;
1773   params.run_forward = run_forward;
1774   params.matches = matches;
1775 
1776   if (!AnalyzeSearch(&params)) {
1777     *failed = true;
1778     return false;
1779   }
1780   if (params.start == DeadState)
1781     return false;
1782   if (params.start == FullMatchState) {
1783     if (run_forward == want_earliest_match)
1784       *epp = text.begin();
1785     else
1786       *epp = text.end();
1787     return true;
1788   }
1789   if (DebugDFA)
1790     fprintf(stderr, "start %s\n", DumpState(params.start).c_str());
1791   bool ret = FastSearchLoop(&params);
1792   if (params.failed) {
1793     *failed = true;
1794     return false;
1795   }
1796   *epp = params.ep;
1797   return ret;
1798 }
1799 
1800 // Deletes dfa.
1801 //
1802 // This is a separate function so that
1803 // prog.h can be used without moving the definition of
1804 // class DFA out of this file.  If you set
1805 //   prog->dfa_ = dfa;
1806 // then you also have to set
1807 //   prog->delete_dfa_ = DeleteDFA;
1808 // so that ~Prog can delete the dfa.
DeleteDFA(DFA * dfa)1809 static void DeleteDFA(DFA* dfa) {
1810   delete dfa;
1811 }
1812 
GetDFA(MatchKind kind)1813 DFA* Prog::GetDFA(MatchKind kind) {
1814   DFA*volatile* pdfa;
1815   if (kind == kFirstMatch || kind == kManyMatch) {
1816     pdfa = &dfa_first_;
1817   } else {
1818     kind = kLongestMatch;
1819     pdfa = &dfa_longest_;
1820   }
1821 
1822   // Quick check; okay because of memory barrier below.
1823   DFA *dfa = ANNOTATE_UNPROTECTED_READ(*pdfa);
1824   if (dfa != NULL) {
1825     ANNOTATE_HAPPENS_AFTER(dfa);
1826     return dfa;
1827   }
1828 
1829   MutexLock l(&dfa_mutex_);
1830   dfa = *pdfa;
1831   if (dfa != NULL) {
1832     ANNOTATE_HAPPENS_AFTER(dfa);
1833     return dfa;
1834   }
1835 
1836   // For a forward DFA, half the memory goes to each DFA.
1837   // For a reverse DFA, all the memory goes to the
1838   // "longest match" DFA, because RE2 never does reverse
1839   // "first match" searches.
1840   int64 m = dfa_mem_/2;
1841   if (reversed_) {
1842     if (kind == kLongestMatch || kind == kManyMatch)
1843       m = dfa_mem_;
1844     else
1845       m = 0;
1846   }
1847   dfa = new DFA(this, kind, m);
1848   delete_dfa_ = DeleteDFA;
1849 
1850   // Synchronize with "quick check" above.
1851   ANNOTATE_HAPPENS_BEFORE(dfa);
1852   WriteMemoryBarrier();
1853   *pdfa = dfa;
1854 
1855   return dfa;
1856 }
1857 
1858 
1859 // Executes the regexp program to search in text,
1860 // which itself is inside the larger context.  (As a convenience,
1861 // passing a NULL context is equivalent to passing text.)
1862 // Returns true if a match is found, false if not.
1863 // If a match is found, fills in match0->end() to point at the end of the match
1864 // and sets match0->begin() to text.begin(), since the DFA can't track
1865 // where the match actually began.
1866 //
1867 // This is the only external interface (class DFA only exists in this file).
1868 //
SearchDFA(const StringPiece & text,const StringPiece & const_context,Anchor anchor,MatchKind kind,StringPiece * match0,bool * failed,vector<int> * matches)1869 bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context,
1870                      Anchor anchor, MatchKind kind,
1871                      StringPiece* match0, bool* failed, vector<int>* matches) {
1872   *failed = false;
1873 
1874   StringPiece context = const_context;
1875   if (context.begin() == NULL)
1876     context = text;
1877   bool carat = anchor_start();
1878   bool dollar = anchor_end();
1879   if (reversed_) {
1880     bool t = carat;
1881     carat = dollar;
1882     dollar = t;
1883   }
1884   if (carat && context.begin() != text.begin())
1885     return false;
1886   if (dollar && context.end() != text.end())
1887     return false;
1888 
1889   // Handle full match by running an anchored longest match
1890   // and then checking if it covers all of text.
1891   bool anchored = anchor == kAnchored || anchor_start() || kind == kFullMatch;
1892   bool endmatch = false;
1893   if (kind == kManyMatch) {
1894     endmatch = true;
1895   } else if (kind == kFullMatch || anchor_end()) {
1896     endmatch = true;
1897     kind = kLongestMatch;
1898   }
1899 
1900   // If the caller doesn't care where the match is (just whether one exists),
1901   // then we can stop at the very first match we find, the so-called
1902   // "shortest match".
1903   bool want_shortest_match = false;
1904   if (match0 == NULL && !endmatch) {
1905     want_shortest_match = true;
1906     kind = kLongestMatch;
1907   }
1908 
1909   DFA* dfa = GetDFA(kind);
1910   const char* ep;
1911   bool matched = dfa->Search(text, context, anchored,
1912                              want_shortest_match, !reversed_,
1913                              failed, &ep, matches);
1914   if (*failed)
1915     return false;
1916   if (!matched)
1917     return false;
1918   if (endmatch && ep != (reversed_ ? text.begin() : text.end()))
1919     return false;
1920 
1921   // If caller cares, record the boundary of the match.
1922   // We only know where it ends, so use the boundary of text
1923   // as the beginning.
1924   if (match0) {
1925     if (reversed_)
1926       *match0 = StringPiece(ep, text.end() - ep);
1927     else
1928       *match0 = StringPiece(text.begin(), ep - text.begin());
1929   }
1930   return true;
1931 }
1932 
1933 // Build out all states in DFA.  Returns number of states.
BuildAllStates()1934 int DFA::BuildAllStates() {
1935   if (!ok())
1936     return 0;
1937 
1938   // Pick out start state for unanchored search
1939   // at beginning of text.
1940   RWLocker l(&cache_mutex_);
1941   SearchParams params(NULL, NULL, &l);
1942   params.anchored = false;
1943   if (!AnalyzeSearch(&params) || params.start <= SpecialStateMax)
1944     return 0;
1945 
1946   // Add start state to work queue.
1947   StateSet queued;
1948   vector<State*> q;
1949   queued.insert(params.start);
1950   q.push_back(params.start);
1951 
1952   // Flood to expand every state.
1953   for (int i = 0; i < q.size(); i++) {
1954     State* s = q[i];
1955     for (int c = 0; c < 257; c++) {
1956       State* ns = RunStateOnByteUnlocked(s, c);
1957       if (ns > SpecialStateMax && queued.find(ns) == queued.end()) {
1958         queued.insert(ns);
1959         q.push_back(ns);
1960       }
1961     }
1962   }
1963 
1964   return q.size();
1965 }
1966 
1967 // Build out all states in DFA for kind.  Returns number of states.
BuildEntireDFA(MatchKind kind)1968 int Prog::BuildEntireDFA(MatchKind kind) {
1969   //LOG(ERROR) << "BuildEntireDFA is only for testing.";
1970   return GetDFA(kind)->BuildAllStates();
1971 }
1972 
1973 // Computes min and max for matching string.
1974 // Won't return strings bigger than maxlen.
PossibleMatchRange(string * min,string * max,int maxlen)1975 bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) {
1976   if (!ok())
1977     return false;
1978 
1979   // NOTE: if future users of PossibleMatchRange want more precision when
1980   // presented with infinitely repeated elements, consider making this a
1981   // parameter to PossibleMatchRange.
1982   static int kMaxEltRepetitions = 0;
1983 
1984   // Keep track of the number of times we've visited states previously. We only
1985   // revisit a given state if it's part of a repeated group, so if the value
1986   // portion of the map tuple exceeds kMaxEltRepetitions we bail out and set
1987   // |*max| to |PrefixSuccessor(*max)|.
1988   //
1989   // Also note that previously_visited_states[UnseenStatePtr] will, in the STL
1990   // tradition, implicitly insert a '0' value at first use. We take advantage
1991   // of that property below.
1992   map<State*, int> previously_visited_states;
1993 
1994   // Pick out start state for anchored search at beginning of text.
1995   RWLocker l(&cache_mutex_);
1996   SearchParams params(NULL, NULL, &l);
1997   params.anchored = true;
1998   if (!AnalyzeSearch(&params))
1999     return false;
2000   if (params.start == DeadState) {  // No matching strings
2001     *min = "";
2002     *max = "";
2003     return true;
2004   }
2005   if (params.start == FullMatchState)  // Every string matches: no max
2006     return false;
2007 
2008   // The DFA is essentially a big graph rooted at params.start,
2009   // and paths in the graph correspond to accepted strings.
2010   // Each node in the graph has potentially 256+1 arrows
2011   // coming out, one for each byte plus the magic end of
2012   // text character kByteEndText.
2013 
2014   // To find the smallest possible prefix of an accepted
2015   // string, we just walk the graph preferring to follow
2016   // arrows with the lowest bytes possible.  To find the
2017   // largest possible prefix, we follow the largest bytes
2018   // possible.
2019 
2020   // The test for whether there is an arrow from s on byte j is
2021   //    ns = RunStateOnByteUnlocked(s, j);
2022   //    if (ns == NULL)
2023   //      return false;
2024   //    if (ns != DeadState && ns->ninst > 0)
2025   // The RunStateOnByteUnlocked call asks the DFA to build out the graph.
2026   // It returns NULL only if the DFA has run out of memory,
2027   // in which case we can't be sure of anything.
2028   // The second check sees whether there was graph built
2029   // and whether it is interesting graph.  Nodes might have
2030   // ns->ninst == 0 if they exist only to represent the fact
2031   // that a match was found on the previous byte.
2032 
2033   // Build minimum prefix.
2034   State* s = params.start;
2035   min->clear();
2036   for (int i = 0; i < maxlen; i++) {
2037     if (previously_visited_states[s] > kMaxEltRepetitions) {
2038       VLOG(2) << "Hit kMaxEltRepetitions=" << kMaxEltRepetitions
2039         << " for state s=" << s << " and min=" << CEscape(*min);
2040       break;
2041     }
2042     previously_visited_states[s]++;
2043 
2044     // Stop if min is a match.
2045     State* ns = RunStateOnByteUnlocked(s, kByteEndText);
2046     if (ns == NULL)  // DFA out of memory
2047       return false;
2048     if (ns != DeadState && (ns == FullMatchState || ns->IsMatch()))
2049       break;
2050 
2051     // Try to extend the string with low bytes.
2052     bool extended = false;
2053     for (int j = 0; j < 256; j++) {
2054       ns = RunStateOnByteUnlocked(s, j);
2055       if (ns == NULL)  // DFA out of memory
2056         return false;
2057       if (ns == FullMatchState ||
2058           (ns > SpecialStateMax && ns->ninst_ > 0)) {
2059         extended = true;
2060         min->append(1, j);
2061         s = ns;
2062         break;
2063       }
2064     }
2065     if (!extended)
2066       break;
2067   }
2068 
2069   // Build maximum prefix.
2070   previously_visited_states.clear();
2071   s = params.start;
2072   max->clear();
2073   for (int i = 0; i < maxlen; i++) {
2074     if (previously_visited_states[s] > kMaxEltRepetitions) {
2075       VLOG(2) << "Hit kMaxEltRepetitions=" << kMaxEltRepetitions
2076         << " for state s=" << s << " and max=" << CEscape(*max);
2077       break;
2078     }
2079     previously_visited_states[s] += 1;
2080 
2081     // Try to extend the string with high bytes.
2082     bool extended = false;
2083     for (int j = 255; j >= 0; j--) {
2084       State* ns = RunStateOnByteUnlocked(s, j);
2085       if (ns == NULL)
2086         return false;
2087       if (ns == FullMatchState ||
2088           (ns > SpecialStateMax && ns->ninst_ > 0)) {
2089         extended = true;
2090         max->append(1, j);
2091         s = ns;
2092         break;
2093       }
2094     }
2095     if (!extended) {
2096       // Done, no need for PrefixSuccessor.
2097       return true;
2098     }
2099   }
2100 
2101   // Stopped while still adding to *max - round aaaaaaaaaa... to aaaa...b
2102   *max = PrefixSuccessor(*max);
2103 
2104   // If there are no bytes left, we have no way to say "there is no maximum
2105   // string".  We could make the interface more complicated and be able to
2106   // return "there is no maximum but here is a minimum", but that seems like
2107   // overkill -- the most common no-max case is all possible strings, so not
2108   // telling the caller that the empty string is the minimum match isn't a
2109   // great loss.
2110   if (max->empty())
2111     return false;
2112 
2113   return true;
2114 }
2115 
2116 // PossibleMatchRange for a Prog.
PossibleMatchRange(string * min,string * max,int maxlen)2117 bool Prog::PossibleMatchRange(string* min, string* max, int maxlen) {
2118   DFA* dfa = NULL;
2119   {
2120     MutexLock l(&dfa_mutex_);
2121     // Have to use dfa_longest_ to get all strings for full matches.
2122     // For example, (a|aa) never matches aa in first-match mode.
2123     if (dfa_longest_ == NULL) {
2124       dfa_longest_ = new DFA(this, Prog::kLongestMatch, dfa_mem_/2);
2125       delete_dfa_ = DeleteDFA;
2126     }
2127     dfa = dfa_longest_;
2128   }
2129   return dfa->PossibleMatchRange(min, max, maxlen);
2130 }
2131 
2132 }  // namespace re2
2133