1 // Copyright 2003-2009 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Regular expression interface RE2.
6 //
7 // Originally the PCRE C++ wrapper, but adapted to use
8 // the new automata-based regular expression engines.
9 
10 #include "re2/re2.h"
11 
12 #include <stdio.h>
13 #include <string>
14 #include <pthread.h>
15 #include <errno.h>
16 #include "util/util.h"
17 #include "util/flags.h"
18 #include "re2/prog.h"
19 #include "re2/regexp.h"
20 
21 DEFINE_bool(trace_re2, false, "trace RE2 execution");
22 
23 namespace re2 {
24 
25 // Maximum number of args we can set
26 static const int kMaxArgs = 16;
27 static const int kVecSize = 1+kMaxArgs;
28 
29 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::FullMatchN> RE2::FullMatch;
30 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::PartialMatchN> RE2::PartialMatch;
31 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume;
32 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndConsumeN> RE2::FindAndConsume;
33 
34 // This will trigger LNK2005 error in MSVC.
35 #ifndef COMPILER_MSVC
36 const int RE2::Options::kDefaultMaxMem;  // initialized in re2.h
37 #endif  // COMPILER_MSVC
38 
Options(RE2::CannedOptions opt)39 RE2::Options::Options(RE2::CannedOptions opt)
40   : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
41     posix_syntax_(opt == RE2::POSIX),
42     longest_match_(opt == RE2::POSIX),
43     log_errors_(opt != RE2::Quiet),
44     max_mem_(kDefaultMaxMem),
45     literal_(false),
46     never_nl_(false),
47     never_capture_(false),
48     case_sensitive_(true),
49     perl_classes_(false),
50     word_boundary_(false),
51     one_line_(false) {
52 }
53 
54 // static empty things for use as const references.
55 // To avoid global constructors, initialized on demand.
56 GLOBAL_MUTEX(empty_mutex);
57 static const string *empty_string;
58 static const map<string, int> *empty_named_groups;
59 static const map<int, string> *empty_group_names;
60 
InitEmpty()61 static void InitEmpty() {
62   GLOBAL_MUTEX_LOCK(empty_mutex);
63   if (empty_string == NULL) {
64     empty_string = new string;
65     empty_named_groups = new map<string, int>;
66     empty_group_names = new map<int, string>;
67   }
68   GLOBAL_MUTEX_UNLOCK(empty_mutex);
69 }
70 
71 // Converts from Regexp error code to RE2 error code.
72 // Maybe some day they will diverge.  In any event, this
73 // hides the existence of Regexp from RE2 users.
RegexpErrorToRE2(re2::RegexpStatusCode code)74 static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
75   switch (code) {
76     case re2::kRegexpSuccess:
77       return RE2::NoError;
78     case re2::kRegexpInternalError:
79       return RE2::ErrorInternal;
80     case re2::kRegexpBadEscape:
81       return RE2::ErrorBadEscape;
82     case re2::kRegexpBadCharClass:
83       return RE2::ErrorBadCharClass;
84     case re2::kRegexpBadCharRange:
85       return RE2::ErrorBadCharRange;
86     case re2::kRegexpMissingBracket:
87       return RE2::ErrorMissingBracket;
88     case re2::kRegexpMissingParen:
89       return RE2::ErrorMissingParen;
90     case re2::kRegexpTrailingBackslash:
91       return RE2::ErrorTrailingBackslash;
92     case re2::kRegexpRepeatArgument:
93       return RE2::ErrorRepeatArgument;
94     case re2::kRegexpRepeatSize:
95       return RE2::ErrorRepeatSize;
96     case re2::kRegexpRepeatOp:
97       return RE2::ErrorRepeatOp;
98     case re2::kRegexpBadPerlOp:
99       return RE2::ErrorBadPerlOp;
100     case re2::kRegexpBadUTF8:
101       return RE2::ErrorBadUTF8;
102     case re2::kRegexpBadNamedCapture:
103       return RE2::ErrorBadNamedCapture;
104   }
105   return RE2::ErrorInternal;
106 }
107 
trunc(const StringPiece & pattern)108 static string trunc(const StringPiece& pattern) {
109   if (pattern.size() < 100)
110     return pattern.as_string();
111   return pattern.substr(0, 100).as_string() + "...";
112 }
113 
114 
RE2(const char * pattern)115 RE2::RE2(const char* pattern) {
116   Init(pattern, DefaultOptions);
117 }
118 
RE2(const string & pattern)119 RE2::RE2(const string& pattern) {
120   Init(pattern, DefaultOptions);
121 }
122 
RE2(const StringPiece & pattern)123 RE2::RE2(const StringPiece& pattern) {
124   Init(pattern, DefaultOptions);
125 }
126 
RE2(const StringPiece & pattern,const Options & options)127 RE2::RE2(const StringPiece& pattern, const Options& options) {
128   Init(pattern, options);
129 }
130 
ParseFlags() const131 int RE2::Options::ParseFlags() const {
132   int flags = Regexp::ClassNL;
133   switch (encoding()) {
134     default:
135       if (log_errors())
136         LOG(ERROR) << "Unknown encoding " << encoding();
137       break;
138     case RE2::Options::EncodingUTF8:
139       break;
140     case RE2::Options::EncodingLatin1:
141       flags |= Regexp::Latin1;
142       break;
143   }
144 
145   if (!posix_syntax())
146     flags |= Regexp::LikePerl;
147 
148   if (literal())
149     flags |= Regexp::Literal;
150 
151   if (never_nl())
152     flags |= Regexp::NeverNL;
153 
154   if (never_capture())
155     flags |= Regexp::NeverCapture;
156 
157   if (!case_sensitive())
158     flags |= Regexp::FoldCase;
159 
160   if (perl_classes())
161     flags |= Regexp::PerlClasses;
162 
163   if (word_boundary())
164     flags |= Regexp::PerlB;
165 
166   if (one_line())
167     flags |= Regexp::OneLine;
168 
169   return flags;
170 }
171 
Init(const StringPiece & pattern,const Options & options)172 void RE2::Init(const StringPiece& pattern, const Options& options) {
173   mutex_ = new Mutex;
174   pattern_ = pattern.as_string();
175   options_.Copy(options);
176   InitEmpty();
177   error_ = empty_string;
178   error_code_ = NoError;
179   suffix_regexp_ = NULL;
180   entire_regexp_ = NULL;
181   prog_ = NULL;
182   rprog_ = NULL;
183   named_groups_ = NULL;
184   group_names_ = NULL;
185   num_captures_ = -1;
186 
187   RegexpStatus status;
188   entire_regexp_ = Regexp::Parse(
189     pattern_,
190     static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
191     &status);
192   if (entire_regexp_ == NULL) {
193     if (error_ == empty_string)
194       error_ = new string(status.Text());
195     if (options_.log_errors()) {
196       LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
197                  << status.Text();
198     }
199     error_arg_ = status.error_arg().as_string();
200     error_code_ = RegexpErrorToRE2(status.code());
201     return;
202   }
203 
204   prefix_.clear();
205   prefix_foldcase_ = false;
206   re2::Regexp* suffix;
207   if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
208     suffix_regexp_ = suffix;
209   else
210     suffix_regexp_ = entire_regexp_->Incref();
211 
212   // Two thirds of the memory goes to the forward Prog,
213   // one third to the reverse prog, because the forward
214   // Prog has two DFAs but the reverse prog has one.
215   prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
216   if (prog_ == NULL) {
217     if (options_.log_errors())
218       LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
219     error_ = new string("pattern too large - compile failed");
220     error_code_ = RE2::ErrorPatternTooLarge;
221     return;
222   }
223 
224   // Could delay this until the first match call that
225   // cares about submatch information, but the one-pass
226   // machine's memory gets cut from the DFA memory budget,
227   // and that is harder to do if the DFA has already
228   // been built.
229   is_one_pass_ = prog_->IsOnePass();
230 }
231 
232 // Returns rprog_, computing it if needed.
ReverseProg() const233 re2::Prog* RE2::ReverseProg() const {
234   MutexLock l(mutex_);
235   if (rprog_ == NULL && error_ == empty_string) {
236     rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3);
237     if (rprog_ == NULL) {
238       if (options_.log_errors())
239         LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'";
240       error_ = new string("pattern too large - reverse compile failed");
241       error_code_ = RE2::ErrorPatternTooLarge;
242       return NULL;
243     }
244   }
245   return rprog_;
246 }
247 
~RE2()248 RE2::~RE2() {
249   if (suffix_regexp_)
250     suffix_regexp_->Decref();
251   if (entire_regexp_)
252     entire_regexp_->Decref();
253   delete mutex_;
254   delete prog_;
255   delete rprog_;
256   if (error_ != empty_string)
257     delete error_;
258   if (named_groups_ != NULL && named_groups_ != empty_named_groups)
259     delete named_groups_;
260   if (group_names_ != NULL &&  group_names_ != empty_group_names)
261     delete group_names_;
262 }
263 
ProgramSize() const264 int RE2::ProgramSize() const {
265   if (prog_ == NULL)
266     return -1;
267   return prog_->size();
268 }
269 
270 // Returns named_groups_, computing it if needed.
NamedCapturingGroups() const271 const map<string, int>&  RE2::NamedCapturingGroups() const {
272   MutexLock l(mutex_);
273   if (!ok())
274     return *empty_named_groups;
275   if (named_groups_ == NULL) {
276     named_groups_ = suffix_regexp_->NamedCaptures();
277     if (named_groups_ == NULL)
278       named_groups_ = empty_named_groups;
279   }
280   return *named_groups_;
281 }
282 
283 // Returns group_names_, computing it if needed.
CapturingGroupNames() const284 const map<int, string>&  RE2::CapturingGroupNames() const {
285   MutexLock l(mutex_);
286   if (!ok())
287     return *empty_group_names;
288   if (group_names_ == NULL) {
289     group_names_ = suffix_regexp_->CaptureNames();
290     if (group_names_ == NULL)
291       group_names_ = empty_group_names;
292   }
293   return *group_names_;
294 }
295 
296 /***** Convenience interfaces *****/
297 
FullMatchN(const StringPiece & text,const RE2 & re,const Arg * const args[],int n)298 bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
299                      const Arg* const args[], int n) {
300   return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
301 }
302 
PartialMatchN(const StringPiece & text,const RE2 & re,const Arg * const args[],int n)303 bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
304                         const Arg* const args[], int n) {
305   return re.DoMatch(text, UNANCHORED, NULL, args, n);
306 }
307 
ConsumeN(StringPiece * input,const RE2 & re,const Arg * const args[],int n)308 bool RE2::ConsumeN(StringPiece* input, const RE2& re,
309                    const Arg* const args[], int n) {
310   int consumed;
311   if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
312     input->remove_prefix(consumed);
313     return true;
314   } else {
315     return false;
316   }
317 }
318 
FindAndConsumeN(StringPiece * input,const RE2 & re,const Arg * const args[],int n)319 bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
320                           const Arg* const args[], int n) {
321   int consumed;
322   if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
323     input->remove_prefix(consumed);
324     return true;
325   } else {
326     return false;
327   }
328 }
329 
330 // Returns the maximum submatch needed for the rewrite to be done by Replace().
331 // E.g. if rewrite == "foo \\2,\\1", returns 2.
MaxSubmatch(const StringPiece & rewrite)332 int RE2::MaxSubmatch(const StringPiece& rewrite) {
333   int max = 0;
334   for (const char *s = rewrite.data(), *end = s + rewrite.size();
335        s < end; s++) {
336     if (*s == '\\') {
337       s++;
338       int c = (s < end) ? *s : -1;
339       if (isdigit(c)) {
340         int n = (c - '0');
341         if (n > max)
342           max = n;
343       }
344     }
345   }
346   return max;
347 }
348 
Replace(string * str,const RE2 & re,const StringPiece & rewrite)349 bool RE2::Replace(string *str,
350                  const RE2& re,
351                  const StringPiece& rewrite) {
352   StringPiece vec[kVecSize];
353   int nvec = 1 + MaxSubmatch(rewrite);
354   if (nvec > arraysize(vec))
355     return false;
356   if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
357     return false;
358 
359   string s;
360   if (!re.Rewrite(&s, rewrite, vec, nvec))
361     return false;
362 
363   assert(vec[0].begin() >= str->data());
364   assert(vec[0].end() <= str->data()+str->size());
365   str->replace(vec[0].data() - str->data(), vec[0].size(), s);
366   return true;
367 }
368 
GlobalReplace(string * str,const RE2 & re,const StringPiece & rewrite)369 int RE2::GlobalReplace(string *str,
370                       const RE2& re,
371                       const StringPiece& rewrite) {
372   StringPiece vec[kVecSize];
373   int nvec = 1 + MaxSubmatch(rewrite);
374   if (nvec > arraysize(vec))
375     return false;
376 
377   const char* p = str->data();
378   const char* ep = p + str->size();
379   const char* lastend = NULL;
380   string out;
381   int count = 0;
382   while (p <= ep) {
383     if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec))
384       break;
385     if (p < vec[0].begin())
386       out.append(p, vec[0].begin() - p);
387     if (vec[0].begin() == lastend && vec[0].size() == 0) {
388       // Disallow empty match at end of last match: skip ahead.
389       if (p < ep)
390         out.append(p, 1);
391       p++;
392       continue;
393     }
394     re.Rewrite(&out, rewrite, vec, nvec);
395     p = vec[0].end();
396     lastend = p;
397     count++;
398   }
399 
400   if (count == 0)
401     return 0;
402 
403   if (p < ep)
404     out.append(p, ep - p);
405   swap(out, *str);
406   return count;
407 }
408 
Extract(const StringPiece & text,const RE2 & re,const StringPiece & rewrite,string * out)409 bool RE2::Extract(const StringPiece &text,
410                  const RE2& re,
411                  const StringPiece &rewrite,
412                  string *out) {
413   StringPiece vec[kVecSize];
414   int nvec = 1 + MaxSubmatch(rewrite);
415   if (nvec > arraysize(vec))
416     return false;
417 
418   if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
419     return false;
420 
421   out->clear();
422   return re.Rewrite(out, rewrite, vec, nvec);
423 }
424 
QuoteMeta(const StringPiece & unquoted)425 string RE2::QuoteMeta(const StringPiece& unquoted) {
426   string result;
427   result.reserve(unquoted.size() << 1);
428 
429   // Escape any ascii character not in [A-Za-z_0-9].
430   //
431   // Note that it's legal to escape a character even if it has no
432   // special meaning in a regular expression -- so this function does
433   // that.  (This also makes it identical to the perl function of the
434   // same name except for the null-character special case;
435   // see `perldoc -f quotemeta`.)
436   for (int ii = 0; ii < unquoted.length(); ++ii) {
437     // Note that using 'isalnum' here raises the benchmark time from
438     // 32ns to 58ns:
439     if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
440         (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
441         (unquoted[ii] < '0' || unquoted[ii] > '9') &&
442         unquoted[ii] != '_' &&
443         // If this is the part of a UTF8 or Latin1 character, we need
444         // to copy this byte without escaping.  Experimentally this is
445         // what works correctly with the regexp library.
446         !(unquoted[ii] & 128)) {
447       if (unquoted[ii] == '\0') {  // Special handling for null chars.
448         // Note that this special handling is not strictly required for RE2,
449         // but this quoting is required for other regexp libraries such as
450         // PCRE.
451         // Can't use "\\0" since the next character might be a digit.
452         result += "\\x00";
453         continue;
454       }
455       result += '\\';
456     }
457     result += unquoted[ii];
458   }
459 
460   return result;
461 }
462 
PossibleMatchRange(string * min,string * max,int maxlen) const463 bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {
464   if (prog_ == NULL)
465     return false;
466 
467   int n = prefix_.size();
468   if (n > maxlen)
469     n = maxlen;
470 
471   // Determine initial min max from prefix_ literal.
472   string pmin, pmax;
473   pmin = prefix_.substr(0, n);
474   pmax = prefix_.substr(0, n);
475   if (prefix_foldcase_) {
476     // prefix is ASCII lowercase; change pmin to uppercase.
477     for (int i = 0; i < n; i++) {
478       if ('a' <= pmin[i] && pmin[i] <= 'z')
479         pmin[i] += 'A' - 'a';
480     }
481   }
482 
483   // Add to prefix min max using PossibleMatchRange on regexp.
484   string dmin, dmax;
485   maxlen -= n;
486   if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
487     pmin += dmin;
488     pmax += dmax;
489   } else if (pmax.size() > 0) {
490     // prog_->PossibleMatchRange has failed us,
491     // but we still have useful information from prefix_.
492     // Round up pmax to allow any possible suffix.
493     pmax = PrefixSuccessor(pmax);
494   } else {
495     // Nothing useful.
496     *min = "";
497     *max = "";
498     return false;
499   }
500 
501   *min = pmin;
502   *max = pmax;
503   return true;
504 }
505 
506 // Avoid possible locale nonsense in standard strcasecmp.
507 // The string a is known to be all lowercase.
ascii_strcasecmp(const char * a,const char * b,int len)508 static int ascii_strcasecmp(const char* a, const char* b, int len) {
509   const char *ae = a + len;
510 
511   for (; a < ae; a++, b++) {
512     uint8 x = *a;
513     uint8 y = *b;
514     if ('A' <= y && y <= 'Z')
515       y += 'a' - 'A';
516     if (x != y)
517       return x - y;
518   }
519   return 0;
520 }
521 
522 
523 /***** Actual matching and rewriting code *****/
524 
Match(const StringPiece & text,int startpos,int endpos,Anchor re_anchor,StringPiece * submatch,int nsubmatch) const525 bool RE2::Match(const StringPiece& text,
526                 int startpos,
527                 int endpos,
528                 Anchor re_anchor,
529                 StringPiece* submatch,
530                 int nsubmatch) const {
531   if (!ok() || suffix_regexp_ == NULL) {
532     if (options_.log_errors())
533       LOG(ERROR) << "Invalid RE2: " << *error_;
534     return false;
535   }
536 
537   if (startpos < 0 || startpos > endpos || endpos > text.size()) {
538     if (options_.log_errors())
539       LOG(ERROR) << "RE2: invalid startpos, endpos pair.";
540     return false;
541   }
542 
543   StringPiece subtext = text;
544   subtext.remove_prefix(startpos);
545   subtext.remove_suffix(text.size() - endpos);
546 
547   // Use DFAs to find exact location of match, filter out non-matches.
548 
549   // Don't ask for the location if we won't use it.
550   // SearchDFA can do extra optimizations in that case.
551   StringPiece match;
552   StringPiece* matchp = &match;
553   if (nsubmatch == 0)
554     matchp = NULL;
555 
556   int ncap = 1 + NumberOfCapturingGroups();
557   if (ncap > nsubmatch)
558     ncap = nsubmatch;
559 
560   // If the regexp is anchored explicitly, must not be in middle of text.
561   if (prog_->anchor_start() && startpos != 0)
562     return false;
563 
564   // If the regexp is anchored explicitly, update re_anchor
565   // so that we can potentially fall into a faster case below.
566   if (prog_->anchor_start() && prog_->anchor_end())
567     re_anchor = ANCHOR_BOTH;
568   else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
569     re_anchor = ANCHOR_START;
570 
571   // Check for the required prefix, if any.
572   int prefixlen = 0;
573   if (!prefix_.empty()) {
574     if (startpos != 0)
575       return false;
576     prefixlen = prefix_.size();
577     if (prefixlen > subtext.size())
578       return false;
579     if (prefix_foldcase_) {
580       if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
581         return false;
582     } else {
583       if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
584         return false;
585     }
586     subtext.remove_prefix(prefixlen);
587     // If there is a required prefix, the anchor must be at least ANCHOR_START.
588     if (re_anchor != ANCHOR_BOTH)
589       re_anchor = ANCHOR_START;
590   }
591 
592   Prog::Anchor anchor = Prog::kUnanchored;
593   Prog::MatchKind kind = Prog::kFirstMatch;
594   if (options_.longest_match())
595     kind = Prog::kLongestMatch;
596   bool skipped_test = false;
597 
598   bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
599 
600   // SearchBitState allocates a bit vector of size prog_->size() * text.size().
601   // It also allocates a stack of 3-word structures which could potentially
602   // grow as large as prog_->size() * text.size() but in practice is much
603   // smaller.
604   // Conditions for using SearchBitState:
605   const int MaxBitStateProg = 500;   // prog_->size() <= Max.
606   const int MaxBitStateVector = 256*1024;  // bit vector size <= Max (bits)
607   bool can_bit_state = prog_->size() <= MaxBitStateProg;
608   int bit_state_text_max = MaxBitStateVector / prog_->size();
609 
610   bool dfa_failed = false;
611   switch (re_anchor) {
612     default:
613     case UNANCHORED: {
614       if (!prog_->SearchDFA(subtext, text, anchor, kind,
615                             matchp, &dfa_failed, NULL)) {
616         if (dfa_failed) {
617           // Fall back to NFA below.
618           skipped_test = true;
619           if (FLAGS_trace_re2)
620             LOG(INFO) << "Match " << trunc(pattern_)
621                       << " [" << CEscape(subtext) << "]"
622                       << " DFA failed.";
623           break;
624         }
625         if (FLAGS_trace_re2)
626           LOG(INFO) << "Match " << trunc(pattern_)
627                     << " [" << CEscape(subtext) << "]"
628                     << " used DFA - no match.";
629         return false;
630       }
631       if (FLAGS_trace_re2)
632         LOG(INFO) << "Match " << trunc(pattern_)
633                   << " [" << CEscape(subtext) << "]"
634                   << " used DFA - match";
635       if (matchp == NULL)  // Matched.  Don't care where
636         return true;
637       // SearchDFA set match[0].end() but didn't know where the
638       // match started.  Run the regexp backward from match[0].end()
639       // to find the longest possible match -- that's where it started.
640       Prog* prog = ReverseProg();
641       if (prog == NULL)
642         return false;
643       if (!prog->SearchDFA(match, text, Prog::kAnchored,
644                            Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
645         if (dfa_failed) {
646           // Fall back to NFA below.
647           skipped_test = true;
648           if (FLAGS_trace_re2)
649             LOG(INFO) << "Match " << trunc(pattern_)
650                       << " [" << CEscape(subtext) << "]"
651                       << " reverse DFA failed.";
652           break;
653         }
654         if (FLAGS_trace_re2)
655           LOG(INFO) << "Match " << trunc(pattern_)
656                     << " [" << CEscape(subtext) << "]"
657                     << " DFA inconsistency.";
658         if (options_.log_errors())
659           LOG(ERROR) << "DFA inconsistency";
660         return false;
661       }
662       if (FLAGS_trace_re2)
663         LOG(INFO) << "Match " << trunc(pattern_)
664                   << " [" << CEscape(subtext) << "]"
665                   << " used reverse DFA.";
666       break;
667     }
668 
669     case ANCHOR_BOTH:
670     case ANCHOR_START:
671       if (re_anchor == ANCHOR_BOTH)
672         kind = Prog::kFullMatch;
673       anchor = Prog::kAnchored;
674 
675       // If only a small amount of text and need submatch
676       // information anyway and we're going to use OnePass or BitState
677       // to get it, we might as well not even bother with the DFA:
678       // OnePass or BitState will be fast enough.
679       // On tiny texts, OnePass outruns even the DFA, and
680       // it doesn't have the shared state and occasional mutex that
681       // the DFA does.
682       if (can_one_pass && text.size() <= 4096 &&
683           (ncap > 1 || text.size() <= 8)) {
684         if (FLAGS_trace_re2)
685           LOG(INFO) << "Match " << trunc(pattern_)
686                     << " [" << CEscape(subtext) << "]"
687                     << " skipping DFA for OnePass.";
688         skipped_test = true;
689         break;
690       }
691       if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
692         if (FLAGS_trace_re2)
693           LOG(INFO) << "Match " << trunc(pattern_)
694                     << " [" << CEscape(subtext) << "]"
695                     << " skipping DFA for BitState.";
696         skipped_test = true;
697         break;
698       }
699       if (!prog_->SearchDFA(subtext, text, anchor, kind,
700                             &match, &dfa_failed, NULL)) {
701         if (dfa_failed) {
702           if (FLAGS_trace_re2)
703             LOG(INFO) << "Match " << trunc(pattern_)
704                       << " [" << CEscape(subtext) << "]"
705                       << " DFA failed.";
706           skipped_test = true;
707           break;
708         }
709         if (FLAGS_trace_re2)
710           LOG(INFO) << "Match " << trunc(pattern_)
711                     << " [" << CEscape(subtext) << "]"
712                     << " used DFA - no match.";
713         return false;
714       }
715       break;
716   }
717 
718   if (!skipped_test && ncap <= 1) {
719     // We know exactly where it matches.  That's enough.
720     if (ncap == 1)
721       submatch[0] = match;
722   } else {
723     StringPiece subtext1;
724     if (skipped_test) {
725       // DFA ran out of memory or was skipped:
726       // need to search in entire original text.
727       subtext1 = subtext;
728     } else {
729       // DFA found the exact match location:
730       // let NFA run an anchored, full match search
731       // to find submatch locations.
732       subtext1 = match;
733       anchor = Prog::kAnchored;
734       kind = Prog::kFullMatch;
735     }
736 
737     if (can_one_pass && anchor != Prog::kUnanchored) {
738       if (FLAGS_trace_re2)
739         LOG(INFO) << "Match " << trunc(pattern_)
740                   << " [" << CEscape(subtext) << "]"
741                   << " using OnePass.";
742       if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
743         if (!skipped_test && options_.log_errors())
744           LOG(ERROR) << "SearchOnePass inconsistency";
745         return false;
746       }
747     } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
748       if (FLAGS_trace_re2)
749         LOG(INFO) << "Match " << trunc(pattern_)
750                   << " [" << CEscape(subtext) << "]"
751                   << " using BitState.";
752       if (!prog_->SearchBitState(subtext1, text, anchor,
753                                  kind, submatch, ncap)) {
754         if (!skipped_test && options_.log_errors())
755           LOG(ERROR) << "SearchBitState inconsistency";
756         return false;
757       }
758     } else {
759       if (FLAGS_trace_re2)
760         LOG(INFO) << "Match " << trunc(pattern_)
761                   << " [" << CEscape(subtext) << "]"
762                   << " using NFA.";
763       if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
764         if (!skipped_test && options_.log_errors())
765           LOG(ERROR) << "SearchNFA inconsistency";
766         return false;
767       }
768     }
769   }
770 
771   // Adjust overall match for required prefix that we stripped off.
772   if (prefixlen > 0 && nsubmatch > 0)
773     submatch[0] = StringPiece(submatch[0].begin() - prefixlen,
774                               submatch[0].size() + prefixlen);
775 
776   // Zero submatches that don't exist in the regexp.
777   for (int i = ncap; i < nsubmatch; i++)
778     submatch[i] = NULL;
779   return true;
780 }
781 
782 // Internal matcher - like Match() but takes Args not StringPieces.
DoMatch(const StringPiece & text,Anchor anchor,int * consumed,const Arg * const * args,int n) const783 bool RE2::DoMatch(const StringPiece& text,
784                   Anchor anchor,
785                   int* consumed,
786                   const Arg* const* args,
787                   int n) const {
788   if (!ok()) {
789     if (options_.log_errors())
790       LOG(ERROR) << "Invalid RE2: " << *error_;
791     return false;
792   }
793 
794   // Count number of capture groups needed.
795   int nvec;
796   if (n == 0 && consumed == NULL)
797     nvec = 0;
798   else
799     nvec = n+1;
800 
801   StringPiece* vec;
802   StringPiece stkvec[kVecSize];
803   StringPiece* heapvec = NULL;
804 
805   if (nvec <= arraysize(stkvec)) {
806     vec = stkvec;
807   } else {
808     vec = new StringPiece[nvec];
809     heapvec = vec;
810   }
811 
812   if (!Match(text, 0, text.size(), anchor, vec, nvec)) {
813     delete[] heapvec;
814     return false;
815   }
816 
817   if(consumed != NULL)
818     *consumed = vec[0].end() - text.begin();
819 
820   if (n == 0 || args == NULL) {
821     // We are not interested in results
822     delete[] heapvec;
823     return true;
824   }
825 
826   int ncap = NumberOfCapturingGroups();
827   if (ncap < n) {
828     // RE has fewer capturing groups than number of arg pointers passed in
829     VLOG(1) << "Asked for " << n << " but only have " << ncap;
830     delete[] heapvec;
831     return false;
832   }
833 
834   // If we got here, we must have matched the whole pattern.
835   for (int i = 0; i < n; i++) {
836     const StringPiece& s = vec[i+1];
837     if (!args[i]->Parse(s.data(), s.size())) {
838       // TODO: Should we indicate what the error was?
839       VLOG(1) << "Parse error on #" << i << " " << s << " "
840 	      << (void*)s.data() << "/" << s.size();
841       delete[] heapvec;
842       return false;
843     }
844   }
845 
846   delete[] heapvec;
847   return true;
848 }
849 
850 // Append the "rewrite" string, with backslash subsitutions from "vec",
851 // to string "out".
Rewrite(string * out,const StringPiece & rewrite,const StringPiece * vec,int veclen) const852 bool RE2::Rewrite(string *out, const StringPiece &rewrite,
853                  const StringPiece *vec, int veclen) const {
854   for (const char *s = rewrite.data(), *end = s + rewrite.size();
855        s < end; s++) {
856     int c = *s;
857     if (c == '\\') {
858       s++;
859       c = (s < end) ? *s : -1;
860       if (isdigit(c)) {
861         int n = (c - '0');
862         if (n >= veclen) {
863           if (options_.log_errors()) {
864             LOG(ERROR) << "requested group " << n
865                        << " in regexp " << rewrite.data();
866           }
867           return false;
868         }
869         StringPiece snip = vec[n];
870         if (snip.size() > 0)
871           out->append(snip.data(), snip.size());
872       } else if (c == '\\') {
873         out->push_back('\\');
874       } else {
875         if (options_.log_errors())
876           LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
877         return false;
878       }
879     } else {
880       out->push_back(c);
881     }
882   }
883   return true;
884 }
885 
886 // Return the number of capturing subpatterns, or -1 if the
887 // regexp wasn't valid on construction.
NumberOfCapturingGroups() const888 int RE2::NumberOfCapturingGroups() const {
889   if (suffix_regexp_ == NULL)
890     return -1;
891   ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case"
892     " multiple threads end up doing the same work in parallel.");
893   if (num_captures_ == -1)
894     num_captures_ = suffix_regexp_->NumCaptures();
895   return num_captures_;
896 }
897 
898 // Checks that the rewrite string is well-formed with respect to this
899 // regular expression.
CheckRewriteString(const StringPiece & rewrite,string * error) const900 bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {
901   int max_token = -1;
902   for (const char *s = rewrite.data(), *end = s + rewrite.size();
903        s < end; s++) {
904     int c = *s;
905     if (c != '\\') {
906       continue;
907     }
908     if (++s == end) {
909       *error = "Rewrite schema error: '\\' not allowed at end.";
910       return false;
911     }
912     c = *s;
913     if (c == '\\') {
914       continue;
915     }
916     if (!isdigit(c)) {
917       *error = "Rewrite schema error: "
918                "'\\' must be followed by a digit or '\\'.";
919       return false;
920     }
921     int n = (c - '0');
922     if (max_token < n) {
923       max_token = n;
924     }
925   }
926 
927   if (max_token > NumberOfCapturingGroups()) {
928     SStringPrintf(error, "Rewrite schema requests %d matches, "
929                   "but the regexp only has %d parenthesized subexpressions.",
930                   max_token, NumberOfCapturingGroups());
931     return false;
932   }
933   return true;
934 }
935 
936 /***** Parsers for various types *****/
937 
parse_null(const char * str,int n,void * dest)938 bool RE2::Arg::parse_null(const char* str, int n, void* dest) {
939   // We fail if somebody asked us to store into a non-NULL void* pointer
940   return (dest == NULL);
941 }
942 
parse_string(const char * str,int n,void * dest)943 bool RE2::Arg::parse_string(const char* str, int n, void* dest) {
944   if (dest == NULL) return true;
945   reinterpret_cast<string*>(dest)->assign(str, n);
946   return true;
947 }
948 
parse_stringpiece(const char * str,int n,void * dest)949 bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) {
950   if (dest == NULL) return true;
951   reinterpret_cast<StringPiece*>(dest)->set(str, n);
952   return true;
953 }
954 
parse_char(const char * str,int n,void * dest)955 bool RE2::Arg::parse_char(const char* str, int n, void* dest) {
956   if (n != 1) return false;
957   if (dest == NULL) return true;
958   *(reinterpret_cast<char*>(dest)) = str[0];
959   return true;
960 }
961 
parse_uchar(const char * str,int n,void * dest)962 bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) {
963   if (n != 1) return false;
964   if (dest == NULL) return true;
965   *(reinterpret_cast<unsigned char*>(dest)) = str[0];
966   return true;
967 }
968 
969 // Largest number spec that we are willing to parse
970 static const int kMaxNumberLength = 32;
971 
972 // REQUIRES "buf" must have length at least kMaxNumberLength+1
973 // Copies "str" into "buf" and null-terminates.
974 // Overwrites *np with the new length.
TerminateNumber(char * buf,const char * str,int * np)975 static const char* TerminateNumber(char* buf, const char* str, int* np) {
976   int n = *np;
977   if (n <= 0) return "";
978   if (n > 0 && isspace(*str)) {
979     // We are less forgiving than the strtoxxx() routines and do not
980     // allow leading spaces.
981     return "";
982   }
983 
984   // Although buf has a fixed maximum size, we can still handle
985   // arbitrarily large integers correctly by omitting leading zeros.
986   // (Numbers that are still too long will be out of range.)
987   // Before deciding whether str is too long,
988   // remove leading zeros with s/000+/00/.
989   // Leaving the leading two zeros in place means that
990   // we don't change 0000x123 (invalid) into 0x123 (valid).
991   // Skip over leading - before replacing.
992   bool neg = false;
993   if (n >= 1 && str[0] == '-') {
994     neg = true;
995     n--;
996     str++;
997   }
998 
999   if (n >= 3 && str[0] == '0' && str[1] == '0') {
1000     while (n >= 3 && str[2] == '0') {
1001       n--;
1002       str++;
1003     }
1004   }
1005 
1006   if (neg) {  // make room in buf for -
1007     n++;
1008     str--;
1009   }
1010 
1011   if (n > kMaxNumberLength) return "";
1012 
1013   memmove(buf, str, n);
1014   if (neg) {
1015     buf[0] = '-';
1016   }
1017   buf[n] = '\0';
1018   *np = n;
1019   return buf;
1020 }
1021 
parse_long_radix(const char * str,int n,void * dest,int radix)1022 bool RE2::Arg::parse_long_radix(const char* str,
1023                                int n,
1024                                void* dest,
1025                                int radix) {
1026   if (n == 0) return false;
1027   char buf[kMaxNumberLength+1];
1028   str = TerminateNumber(buf, str, &n);
1029   char* end;
1030   errno = 0;
1031   long r = strtol(str, &end, radix);
1032   if (end != str + n) return false;   // Leftover junk
1033   if (errno) return false;
1034   if (dest == NULL) return true;
1035   *(reinterpret_cast<long*>(dest)) = r;
1036   return true;
1037 }
1038 
parse_ulong_radix(const char * str,int n,void * dest,int radix)1039 bool RE2::Arg::parse_ulong_radix(const char* str,
1040                                 int n,
1041                                 void* dest,
1042                                 int radix) {
1043   if (n == 0) return false;
1044   char buf[kMaxNumberLength+1];
1045   str = TerminateNumber(buf, str, &n);
1046   if (str[0] == '-') {
1047    // strtoul() will silently accept negative numbers and parse
1048    // them.  This module is more strict and treats them as errors.
1049    return false;
1050   }
1051 
1052   char* end;
1053   errno = 0;
1054   unsigned long r = strtoul(str, &end, radix);
1055   if (end != str + n) return false;   // Leftover junk
1056   if (errno) return false;
1057   if (dest == NULL) return true;
1058   *(reinterpret_cast<unsigned long*>(dest)) = r;
1059   return true;
1060 }
1061 
parse_short_radix(const char * str,int n,void * dest,int radix)1062 bool RE2::Arg::parse_short_radix(const char* str,
1063                                 int n,
1064                                 void* dest,
1065                                 int radix) {
1066   long r;
1067   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1068   if ((short)r != r) return false;       // Out of range
1069   if (dest == NULL) return true;
1070   *(reinterpret_cast<short*>(dest)) = r;
1071   return true;
1072 }
1073 
parse_ushort_radix(const char * str,int n,void * dest,int radix)1074 bool RE2::Arg::parse_ushort_radix(const char* str,
1075                                  int n,
1076                                  void* dest,
1077                                  int radix) {
1078   unsigned long r;
1079   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1080   if ((ushort)r != r) return false;                      // Out of range
1081   if (dest == NULL) return true;
1082   *(reinterpret_cast<unsigned short*>(dest)) = r;
1083   return true;
1084 }
1085 
parse_int_radix(const char * str,int n,void * dest,int radix)1086 bool RE2::Arg::parse_int_radix(const char* str,
1087                               int n,
1088                               void* dest,
1089                               int radix) {
1090   long r;
1091   if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1092   if ((int)r != r) return false;         // Out of range
1093   if (dest == NULL) return true;
1094   *(reinterpret_cast<int*>(dest)) = r;
1095   return true;
1096 }
1097 
parse_uint_radix(const char * str,int n,void * dest,int radix)1098 bool RE2::Arg::parse_uint_radix(const char* str,
1099                                int n,
1100                                void* dest,
1101                                int radix) {
1102   unsigned long r;
1103   if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1104   if ((uint)r != r) return false;                       // Out of range
1105   if (dest == NULL) return true;
1106   *(reinterpret_cast<unsigned int*>(dest)) = r;
1107   return true;
1108 }
1109 
parse_longlong_radix(const char * str,int n,void * dest,int radix)1110 bool RE2::Arg::parse_longlong_radix(const char* str,
1111                                    int n,
1112                                    void* dest,
1113                                    int radix) {
1114   if (n == 0) return false;
1115   char buf[kMaxNumberLength+1];
1116   str = TerminateNumber(buf, str, &n);
1117   char* end;
1118   errno = 0;
1119   int64 r = strtoll(str, &end, radix);
1120   if (end != str + n) return false;   // Leftover junk
1121   if (errno) return false;
1122   if (dest == NULL) return true;
1123   *(reinterpret_cast<int64*>(dest)) = r;
1124   return true;
1125 }
1126 
parse_ulonglong_radix(const char * str,int n,void * dest,int radix)1127 bool RE2::Arg::parse_ulonglong_radix(const char* str,
1128                                     int n,
1129                                     void* dest,
1130                                     int radix) {
1131   if (n == 0) return false;
1132   char buf[kMaxNumberLength+1];
1133   str = TerminateNumber(buf, str, &n);
1134   if (str[0] == '-') {
1135     // strtoull() will silently accept negative numbers and parse
1136     // them.  This module is more strict and treats them as errors.
1137     return false;
1138   }
1139   char* end;
1140   errno = 0;
1141   uint64 r = strtoull(str, &end, radix);
1142   if (end != str + n) return false;   // Leftover junk
1143   if (errno) return false;
1144   if (dest == NULL) return true;
1145   *(reinterpret_cast<uint64*>(dest)) = r;
1146   return true;
1147 }
1148 
parse_double_float(const char * str,int n,bool isfloat,void * dest)1149 static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) {
1150   if (n == 0) return false;
1151   static const int kMaxLength = 200;
1152   char buf[kMaxLength];
1153   if (n >= kMaxLength) return false;
1154   memcpy(buf, str, n);
1155   buf[n] = '\0';
1156   errno = 0;
1157   char* end;
1158   double r;
1159   if (isfloat) {
1160     r = strtof(buf, &end);
1161   } else {
1162     r = strtod(buf, &end);
1163   }
1164   if (end != buf + n) return false;   // Leftover junk
1165   if (errno) return false;
1166   if (dest == NULL) return true;
1167   if (isfloat) {
1168     *(reinterpret_cast<float*>(dest)) = r;
1169   } else {
1170     *(reinterpret_cast<double*>(dest)) = r;
1171   }
1172   return true;
1173 }
1174 
parse_double(const char * str,int n,void * dest)1175 bool RE2::Arg::parse_double(const char* str, int n, void* dest) {
1176   return parse_double_float(str, n, false, dest);
1177 }
1178 
parse_float(const char * str,int n,void * dest)1179 bool RE2::Arg::parse_float(const char* str, int n, void* dest) {
1180   return parse_double_float(str, n, true, dest);
1181 }
1182 
1183 
1184 #define DEFINE_INTEGER_PARSERS(name)                                        \
1185   bool RE2::Arg::parse_##name(const char* str, int n, void* dest) {          \
1186     return parse_##name##_radix(str, n, dest, 10);                          \
1187   }                                                                         \
1188   bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) {    \
1189     return parse_##name##_radix(str, n, dest, 16);                          \
1190   }                                                                         \
1191   bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) {  \
1192     return parse_##name##_radix(str, n, dest, 8);                           \
1193   }                                                                         \
1194   bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
1195     return parse_##name##_radix(str, n, dest, 0);                           \
1196   }
1197 
1198 DEFINE_INTEGER_PARSERS(short);
1199 DEFINE_INTEGER_PARSERS(ushort);
1200 DEFINE_INTEGER_PARSERS(int);
1201 DEFINE_INTEGER_PARSERS(uint);
1202 DEFINE_INTEGER_PARSERS(long);
1203 DEFINE_INTEGER_PARSERS(ulong);
1204 DEFINE_INTEGER_PARSERS(longlong);
1205 DEFINE_INTEGER_PARSERS(ulonglong);
1206 
1207 #undef DEFINE_INTEGER_PARSERS
1208 
1209 }  // namespace re2
1210