1 // Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Regular expression interface RE2.
6 //
7 // Originally the PCRE C++ wrapper, but adapted to use
8 // the new automata-based regular expression engines.
9
10 #include "re2/re2.h"
11
12 #include <stdio.h>
13 #include <string>
14 #include <pthread.h>
15 #include <errno.h>
16 #include "util/util.h"
17 #include "util/flags.h"
18 #include "re2/prog.h"
19 #include "re2/regexp.h"
20
21 DEFINE_bool(trace_re2, false, "trace RE2 execution");
22
23 namespace re2 {
24
25 // Maximum number of args we can set
26 static const int kMaxArgs = 16;
27 static const int kVecSize = 1+kMaxArgs;
28
29 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::FullMatchN> RE2::FullMatch;
30 const VariadicFunction2<bool, const StringPiece&, const RE2&, RE2::Arg, RE2::PartialMatchN> RE2::PartialMatch;
31 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::ConsumeN> RE2::Consume;
32 const VariadicFunction2<bool, StringPiece*, const RE2&, RE2::Arg, RE2::FindAndConsumeN> RE2::FindAndConsume;
33
34 // This will trigger LNK2005 error in MSVC.
35 #ifndef COMPILER_MSVC
36 const int RE2::Options::kDefaultMaxMem; // initialized in re2.h
37 #endif // COMPILER_MSVC
38
Options(RE2::CannedOptions opt)39 RE2::Options::Options(RE2::CannedOptions opt)
40 : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8),
41 posix_syntax_(opt == RE2::POSIX),
42 longest_match_(opt == RE2::POSIX),
43 log_errors_(opt != RE2::Quiet),
44 max_mem_(kDefaultMaxMem),
45 literal_(false),
46 never_nl_(false),
47 never_capture_(false),
48 case_sensitive_(true),
49 perl_classes_(false),
50 word_boundary_(false),
51 one_line_(false) {
52 }
53
54 // static empty things for use as const references.
55 // To avoid global constructors, initialized on demand.
56 GLOBAL_MUTEX(empty_mutex);
57 static const string *empty_string;
58 static const map<string, int> *empty_named_groups;
59 static const map<int, string> *empty_group_names;
60
InitEmpty()61 static void InitEmpty() {
62 GLOBAL_MUTEX_LOCK(empty_mutex);
63 if (empty_string == NULL) {
64 empty_string = new string;
65 empty_named_groups = new map<string, int>;
66 empty_group_names = new map<int, string>;
67 }
68 GLOBAL_MUTEX_UNLOCK(empty_mutex);
69 }
70
71 // Converts from Regexp error code to RE2 error code.
72 // Maybe some day they will diverge. In any event, this
73 // hides the existence of Regexp from RE2 users.
RegexpErrorToRE2(re2::RegexpStatusCode code)74 static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) {
75 switch (code) {
76 case re2::kRegexpSuccess:
77 return RE2::NoError;
78 case re2::kRegexpInternalError:
79 return RE2::ErrorInternal;
80 case re2::kRegexpBadEscape:
81 return RE2::ErrorBadEscape;
82 case re2::kRegexpBadCharClass:
83 return RE2::ErrorBadCharClass;
84 case re2::kRegexpBadCharRange:
85 return RE2::ErrorBadCharRange;
86 case re2::kRegexpMissingBracket:
87 return RE2::ErrorMissingBracket;
88 case re2::kRegexpMissingParen:
89 return RE2::ErrorMissingParen;
90 case re2::kRegexpTrailingBackslash:
91 return RE2::ErrorTrailingBackslash;
92 case re2::kRegexpRepeatArgument:
93 return RE2::ErrorRepeatArgument;
94 case re2::kRegexpRepeatSize:
95 return RE2::ErrorRepeatSize;
96 case re2::kRegexpRepeatOp:
97 return RE2::ErrorRepeatOp;
98 case re2::kRegexpBadPerlOp:
99 return RE2::ErrorBadPerlOp;
100 case re2::kRegexpBadUTF8:
101 return RE2::ErrorBadUTF8;
102 case re2::kRegexpBadNamedCapture:
103 return RE2::ErrorBadNamedCapture;
104 }
105 return RE2::ErrorInternal;
106 }
107
trunc(const StringPiece & pattern)108 static string trunc(const StringPiece& pattern) {
109 if (pattern.size() < 100)
110 return pattern.as_string();
111 return pattern.substr(0, 100).as_string() + "...";
112 }
113
114
RE2(const char * pattern)115 RE2::RE2(const char* pattern) {
116 Init(pattern, DefaultOptions);
117 }
118
RE2(const string & pattern)119 RE2::RE2(const string& pattern) {
120 Init(pattern, DefaultOptions);
121 }
122
RE2(const StringPiece & pattern)123 RE2::RE2(const StringPiece& pattern) {
124 Init(pattern, DefaultOptions);
125 }
126
RE2(const StringPiece & pattern,const Options & options)127 RE2::RE2(const StringPiece& pattern, const Options& options) {
128 Init(pattern, options);
129 }
130
ParseFlags() const131 int RE2::Options::ParseFlags() const {
132 int flags = Regexp::ClassNL;
133 switch (encoding()) {
134 default:
135 if (log_errors())
136 LOG(ERROR) << "Unknown encoding " << encoding();
137 break;
138 case RE2::Options::EncodingUTF8:
139 break;
140 case RE2::Options::EncodingLatin1:
141 flags |= Regexp::Latin1;
142 break;
143 }
144
145 if (!posix_syntax())
146 flags |= Regexp::LikePerl;
147
148 if (literal())
149 flags |= Regexp::Literal;
150
151 if (never_nl())
152 flags |= Regexp::NeverNL;
153
154 if (never_capture())
155 flags |= Regexp::NeverCapture;
156
157 if (!case_sensitive())
158 flags |= Regexp::FoldCase;
159
160 if (perl_classes())
161 flags |= Regexp::PerlClasses;
162
163 if (word_boundary())
164 flags |= Regexp::PerlB;
165
166 if (one_line())
167 flags |= Regexp::OneLine;
168
169 return flags;
170 }
171
Init(const StringPiece & pattern,const Options & options)172 void RE2::Init(const StringPiece& pattern, const Options& options) {
173 mutex_ = new Mutex;
174 pattern_ = pattern.as_string();
175 options_.Copy(options);
176 InitEmpty();
177 error_ = empty_string;
178 error_code_ = NoError;
179 suffix_regexp_ = NULL;
180 entire_regexp_ = NULL;
181 prog_ = NULL;
182 rprog_ = NULL;
183 named_groups_ = NULL;
184 group_names_ = NULL;
185 num_captures_ = -1;
186
187 RegexpStatus status;
188 entire_regexp_ = Regexp::Parse(
189 pattern_,
190 static_cast<Regexp::ParseFlags>(options_.ParseFlags()),
191 &status);
192 if (entire_regexp_ == NULL) {
193 if (error_ == empty_string)
194 error_ = new string(status.Text());
195 if (options_.log_errors()) {
196 LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': "
197 << status.Text();
198 }
199 error_arg_ = status.error_arg().as_string();
200 error_code_ = RegexpErrorToRE2(status.code());
201 return;
202 }
203
204 prefix_.clear();
205 prefix_foldcase_ = false;
206 re2::Regexp* suffix;
207 if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix))
208 suffix_regexp_ = suffix;
209 else
210 suffix_regexp_ = entire_regexp_->Incref();
211
212 // Two thirds of the memory goes to the forward Prog,
213 // one third to the reverse prog, because the forward
214 // Prog has two DFAs but the reverse prog has one.
215 prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3);
216 if (prog_ == NULL) {
217 if (options_.log_errors())
218 LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'";
219 error_ = new string("pattern too large - compile failed");
220 error_code_ = RE2::ErrorPatternTooLarge;
221 return;
222 }
223
224 // Could delay this until the first match call that
225 // cares about submatch information, but the one-pass
226 // machine's memory gets cut from the DFA memory budget,
227 // and that is harder to do if the DFA has already
228 // been built.
229 is_one_pass_ = prog_->IsOnePass();
230 }
231
232 // Returns rprog_, computing it if needed.
ReverseProg() const233 re2::Prog* RE2::ReverseProg() const {
234 MutexLock l(mutex_);
235 if (rprog_ == NULL && error_ == empty_string) {
236 rprog_ = suffix_regexp_->CompileToReverseProg(options_.max_mem()/3);
237 if (rprog_ == NULL) {
238 if (options_.log_errors())
239 LOG(ERROR) << "Error reverse compiling '" << trunc(pattern_) << "'";
240 error_ = new string("pattern too large - reverse compile failed");
241 error_code_ = RE2::ErrorPatternTooLarge;
242 return NULL;
243 }
244 }
245 return rprog_;
246 }
247
~RE2()248 RE2::~RE2() {
249 if (suffix_regexp_)
250 suffix_regexp_->Decref();
251 if (entire_regexp_)
252 entire_regexp_->Decref();
253 delete mutex_;
254 delete prog_;
255 delete rprog_;
256 if (error_ != empty_string)
257 delete error_;
258 if (named_groups_ != NULL && named_groups_ != empty_named_groups)
259 delete named_groups_;
260 if (group_names_ != NULL && group_names_ != empty_group_names)
261 delete group_names_;
262 }
263
ProgramSize() const264 int RE2::ProgramSize() const {
265 if (prog_ == NULL)
266 return -1;
267 return prog_->size();
268 }
269
270 // Returns named_groups_, computing it if needed.
NamedCapturingGroups() const271 const map<string, int>& RE2::NamedCapturingGroups() const {
272 MutexLock l(mutex_);
273 if (!ok())
274 return *empty_named_groups;
275 if (named_groups_ == NULL) {
276 named_groups_ = suffix_regexp_->NamedCaptures();
277 if (named_groups_ == NULL)
278 named_groups_ = empty_named_groups;
279 }
280 return *named_groups_;
281 }
282
283 // Returns group_names_, computing it if needed.
CapturingGroupNames() const284 const map<int, string>& RE2::CapturingGroupNames() const {
285 MutexLock l(mutex_);
286 if (!ok())
287 return *empty_group_names;
288 if (group_names_ == NULL) {
289 group_names_ = suffix_regexp_->CaptureNames();
290 if (group_names_ == NULL)
291 group_names_ = empty_group_names;
292 }
293 return *group_names_;
294 }
295
296 /***** Convenience interfaces *****/
297
FullMatchN(const StringPiece & text,const RE2 & re,const Arg * const args[],int n)298 bool RE2::FullMatchN(const StringPiece& text, const RE2& re,
299 const Arg* const args[], int n) {
300 return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n);
301 }
302
PartialMatchN(const StringPiece & text,const RE2 & re,const Arg * const args[],int n)303 bool RE2::PartialMatchN(const StringPiece& text, const RE2& re,
304 const Arg* const args[], int n) {
305 return re.DoMatch(text, UNANCHORED, NULL, args, n);
306 }
307
ConsumeN(StringPiece * input,const RE2 & re,const Arg * const args[],int n)308 bool RE2::ConsumeN(StringPiece* input, const RE2& re,
309 const Arg* const args[], int n) {
310 int consumed;
311 if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) {
312 input->remove_prefix(consumed);
313 return true;
314 } else {
315 return false;
316 }
317 }
318
FindAndConsumeN(StringPiece * input,const RE2 & re,const Arg * const args[],int n)319 bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re,
320 const Arg* const args[], int n) {
321 int consumed;
322 if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) {
323 input->remove_prefix(consumed);
324 return true;
325 } else {
326 return false;
327 }
328 }
329
330 // Returns the maximum submatch needed for the rewrite to be done by Replace().
331 // E.g. if rewrite == "foo \\2,\\1", returns 2.
MaxSubmatch(const StringPiece & rewrite)332 int RE2::MaxSubmatch(const StringPiece& rewrite) {
333 int max = 0;
334 for (const char *s = rewrite.data(), *end = s + rewrite.size();
335 s < end; s++) {
336 if (*s == '\\') {
337 s++;
338 int c = (s < end) ? *s : -1;
339 if (isdigit(c)) {
340 int n = (c - '0');
341 if (n > max)
342 max = n;
343 }
344 }
345 }
346 return max;
347 }
348
Replace(string * str,const RE2 & re,const StringPiece & rewrite)349 bool RE2::Replace(string *str,
350 const RE2& re,
351 const StringPiece& rewrite) {
352 StringPiece vec[kVecSize];
353 int nvec = 1 + MaxSubmatch(rewrite);
354 if (nvec > arraysize(vec))
355 return false;
356 if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec))
357 return false;
358
359 string s;
360 if (!re.Rewrite(&s, rewrite, vec, nvec))
361 return false;
362
363 assert(vec[0].begin() >= str->data());
364 assert(vec[0].end() <= str->data()+str->size());
365 str->replace(vec[0].data() - str->data(), vec[0].size(), s);
366 return true;
367 }
368
GlobalReplace(string * str,const RE2 & re,const StringPiece & rewrite)369 int RE2::GlobalReplace(string *str,
370 const RE2& re,
371 const StringPiece& rewrite) {
372 StringPiece vec[kVecSize];
373 int nvec = 1 + MaxSubmatch(rewrite);
374 if (nvec > arraysize(vec))
375 return false;
376
377 const char* p = str->data();
378 const char* ep = p + str->size();
379 const char* lastend = NULL;
380 string out;
381 int count = 0;
382 while (p <= ep) {
383 if (!re.Match(*str, p - str->data(), str->size(), UNANCHORED, vec, nvec))
384 break;
385 if (p < vec[0].begin())
386 out.append(p, vec[0].begin() - p);
387 if (vec[0].begin() == lastend && vec[0].size() == 0) {
388 // Disallow empty match at end of last match: skip ahead.
389 if (p < ep)
390 out.append(p, 1);
391 p++;
392 continue;
393 }
394 re.Rewrite(&out, rewrite, vec, nvec);
395 p = vec[0].end();
396 lastend = p;
397 count++;
398 }
399
400 if (count == 0)
401 return 0;
402
403 if (p < ep)
404 out.append(p, ep - p);
405 swap(out, *str);
406 return count;
407 }
408
Extract(const StringPiece & text,const RE2 & re,const StringPiece & rewrite,string * out)409 bool RE2::Extract(const StringPiece &text,
410 const RE2& re,
411 const StringPiece &rewrite,
412 string *out) {
413 StringPiece vec[kVecSize];
414 int nvec = 1 + MaxSubmatch(rewrite);
415 if (nvec > arraysize(vec))
416 return false;
417
418 if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec))
419 return false;
420
421 out->clear();
422 return re.Rewrite(out, rewrite, vec, nvec);
423 }
424
QuoteMeta(const StringPiece & unquoted)425 string RE2::QuoteMeta(const StringPiece& unquoted) {
426 string result;
427 result.reserve(unquoted.size() << 1);
428
429 // Escape any ascii character not in [A-Za-z_0-9].
430 //
431 // Note that it's legal to escape a character even if it has no
432 // special meaning in a regular expression -- so this function does
433 // that. (This also makes it identical to the perl function of the
434 // same name except for the null-character special case;
435 // see `perldoc -f quotemeta`.)
436 for (int ii = 0; ii < unquoted.length(); ++ii) {
437 // Note that using 'isalnum' here raises the benchmark time from
438 // 32ns to 58ns:
439 if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') &&
440 (unquoted[ii] < 'A' || unquoted[ii] > 'Z') &&
441 (unquoted[ii] < '0' || unquoted[ii] > '9') &&
442 unquoted[ii] != '_' &&
443 // If this is the part of a UTF8 or Latin1 character, we need
444 // to copy this byte without escaping. Experimentally this is
445 // what works correctly with the regexp library.
446 !(unquoted[ii] & 128)) {
447 if (unquoted[ii] == '\0') { // Special handling for null chars.
448 // Note that this special handling is not strictly required for RE2,
449 // but this quoting is required for other regexp libraries such as
450 // PCRE.
451 // Can't use "\\0" since the next character might be a digit.
452 result += "\\x00";
453 continue;
454 }
455 result += '\\';
456 }
457 result += unquoted[ii];
458 }
459
460 return result;
461 }
462
PossibleMatchRange(string * min,string * max,int maxlen) const463 bool RE2::PossibleMatchRange(string* min, string* max, int maxlen) const {
464 if (prog_ == NULL)
465 return false;
466
467 int n = prefix_.size();
468 if (n > maxlen)
469 n = maxlen;
470
471 // Determine initial min max from prefix_ literal.
472 string pmin, pmax;
473 pmin = prefix_.substr(0, n);
474 pmax = prefix_.substr(0, n);
475 if (prefix_foldcase_) {
476 // prefix is ASCII lowercase; change pmin to uppercase.
477 for (int i = 0; i < n; i++) {
478 if ('a' <= pmin[i] && pmin[i] <= 'z')
479 pmin[i] += 'A' - 'a';
480 }
481 }
482
483 // Add to prefix min max using PossibleMatchRange on regexp.
484 string dmin, dmax;
485 maxlen -= n;
486 if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) {
487 pmin += dmin;
488 pmax += dmax;
489 } else if (pmax.size() > 0) {
490 // prog_->PossibleMatchRange has failed us,
491 // but we still have useful information from prefix_.
492 // Round up pmax to allow any possible suffix.
493 pmax = PrefixSuccessor(pmax);
494 } else {
495 // Nothing useful.
496 *min = "";
497 *max = "";
498 return false;
499 }
500
501 *min = pmin;
502 *max = pmax;
503 return true;
504 }
505
506 // Avoid possible locale nonsense in standard strcasecmp.
507 // The string a is known to be all lowercase.
ascii_strcasecmp(const char * a,const char * b,int len)508 static int ascii_strcasecmp(const char* a, const char* b, int len) {
509 const char *ae = a + len;
510
511 for (; a < ae; a++, b++) {
512 uint8 x = *a;
513 uint8 y = *b;
514 if ('A' <= y && y <= 'Z')
515 y += 'a' - 'A';
516 if (x != y)
517 return x - y;
518 }
519 return 0;
520 }
521
522
523 /***** Actual matching and rewriting code *****/
524
Match(const StringPiece & text,int startpos,int endpos,Anchor re_anchor,StringPiece * submatch,int nsubmatch) const525 bool RE2::Match(const StringPiece& text,
526 int startpos,
527 int endpos,
528 Anchor re_anchor,
529 StringPiece* submatch,
530 int nsubmatch) const {
531 if (!ok() || suffix_regexp_ == NULL) {
532 if (options_.log_errors())
533 LOG(ERROR) << "Invalid RE2: " << *error_;
534 return false;
535 }
536
537 if (startpos < 0 || startpos > endpos || endpos > text.size()) {
538 if (options_.log_errors())
539 LOG(ERROR) << "RE2: invalid startpos, endpos pair.";
540 return false;
541 }
542
543 StringPiece subtext = text;
544 subtext.remove_prefix(startpos);
545 subtext.remove_suffix(text.size() - endpos);
546
547 // Use DFAs to find exact location of match, filter out non-matches.
548
549 // Don't ask for the location if we won't use it.
550 // SearchDFA can do extra optimizations in that case.
551 StringPiece match;
552 StringPiece* matchp = &match;
553 if (nsubmatch == 0)
554 matchp = NULL;
555
556 int ncap = 1 + NumberOfCapturingGroups();
557 if (ncap > nsubmatch)
558 ncap = nsubmatch;
559
560 // If the regexp is anchored explicitly, must not be in middle of text.
561 if (prog_->anchor_start() && startpos != 0)
562 return false;
563
564 // If the regexp is anchored explicitly, update re_anchor
565 // so that we can potentially fall into a faster case below.
566 if (prog_->anchor_start() && prog_->anchor_end())
567 re_anchor = ANCHOR_BOTH;
568 else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
569 re_anchor = ANCHOR_START;
570
571 // Check for the required prefix, if any.
572 int prefixlen = 0;
573 if (!prefix_.empty()) {
574 if (startpos != 0)
575 return false;
576 prefixlen = prefix_.size();
577 if (prefixlen > subtext.size())
578 return false;
579 if (prefix_foldcase_) {
580 if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
581 return false;
582 } else {
583 if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
584 return false;
585 }
586 subtext.remove_prefix(prefixlen);
587 // If there is a required prefix, the anchor must be at least ANCHOR_START.
588 if (re_anchor != ANCHOR_BOTH)
589 re_anchor = ANCHOR_START;
590 }
591
592 Prog::Anchor anchor = Prog::kUnanchored;
593 Prog::MatchKind kind = Prog::kFirstMatch;
594 if (options_.longest_match())
595 kind = Prog::kLongestMatch;
596 bool skipped_test = false;
597
598 bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);
599
600 // SearchBitState allocates a bit vector of size prog_->size() * text.size().
601 // It also allocates a stack of 3-word structures which could potentially
602 // grow as large as prog_->size() * text.size() but in practice is much
603 // smaller.
604 // Conditions for using SearchBitState:
605 const int MaxBitStateProg = 500; // prog_->size() <= Max.
606 const int MaxBitStateVector = 256*1024; // bit vector size <= Max (bits)
607 bool can_bit_state = prog_->size() <= MaxBitStateProg;
608 int bit_state_text_max = MaxBitStateVector / prog_->size();
609
610 bool dfa_failed = false;
611 switch (re_anchor) {
612 default:
613 case UNANCHORED: {
614 if (!prog_->SearchDFA(subtext, text, anchor, kind,
615 matchp, &dfa_failed, NULL)) {
616 if (dfa_failed) {
617 // Fall back to NFA below.
618 skipped_test = true;
619 if (FLAGS_trace_re2)
620 LOG(INFO) << "Match " << trunc(pattern_)
621 << " [" << CEscape(subtext) << "]"
622 << " DFA failed.";
623 break;
624 }
625 if (FLAGS_trace_re2)
626 LOG(INFO) << "Match " << trunc(pattern_)
627 << " [" << CEscape(subtext) << "]"
628 << " used DFA - no match.";
629 return false;
630 }
631 if (FLAGS_trace_re2)
632 LOG(INFO) << "Match " << trunc(pattern_)
633 << " [" << CEscape(subtext) << "]"
634 << " used DFA - match";
635 if (matchp == NULL) // Matched. Don't care where
636 return true;
637 // SearchDFA set match[0].end() but didn't know where the
638 // match started. Run the regexp backward from match[0].end()
639 // to find the longest possible match -- that's where it started.
640 Prog* prog = ReverseProg();
641 if (prog == NULL)
642 return false;
643 if (!prog->SearchDFA(match, text, Prog::kAnchored,
644 Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
645 if (dfa_failed) {
646 // Fall back to NFA below.
647 skipped_test = true;
648 if (FLAGS_trace_re2)
649 LOG(INFO) << "Match " << trunc(pattern_)
650 << " [" << CEscape(subtext) << "]"
651 << " reverse DFA failed.";
652 break;
653 }
654 if (FLAGS_trace_re2)
655 LOG(INFO) << "Match " << trunc(pattern_)
656 << " [" << CEscape(subtext) << "]"
657 << " DFA inconsistency.";
658 if (options_.log_errors())
659 LOG(ERROR) << "DFA inconsistency";
660 return false;
661 }
662 if (FLAGS_trace_re2)
663 LOG(INFO) << "Match " << trunc(pattern_)
664 << " [" << CEscape(subtext) << "]"
665 << " used reverse DFA.";
666 break;
667 }
668
669 case ANCHOR_BOTH:
670 case ANCHOR_START:
671 if (re_anchor == ANCHOR_BOTH)
672 kind = Prog::kFullMatch;
673 anchor = Prog::kAnchored;
674
675 // If only a small amount of text and need submatch
676 // information anyway and we're going to use OnePass or BitState
677 // to get it, we might as well not even bother with the DFA:
678 // OnePass or BitState will be fast enough.
679 // On tiny texts, OnePass outruns even the DFA, and
680 // it doesn't have the shared state and occasional mutex that
681 // the DFA does.
682 if (can_one_pass && text.size() <= 4096 &&
683 (ncap > 1 || text.size() <= 8)) {
684 if (FLAGS_trace_re2)
685 LOG(INFO) << "Match " << trunc(pattern_)
686 << " [" << CEscape(subtext) << "]"
687 << " skipping DFA for OnePass.";
688 skipped_test = true;
689 break;
690 }
691 if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
692 if (FLAGS_trace_re2)
693 LOG(INFO) << "Match " << trunc(pattern_)
694 << " [" << CEscape(subtext) << "]"
695 << " skipping DFA for BitState.";
696 skipped_test = true;
697 break;
698 }
699 if (!prog_->SearchDFA(subtext, text, anchor, kind,
700 &match, &dfa_failed, NULL)) {
701 if (dfa_failed) {
702 if (FLAGS_trace_re2)
703 LOG(INFO) << "Match " << trunc(pattern_)
704 << " [" << CEscape(subtext) << "]"
705 << " DFA failed.";
706 skipped_test = true;
707 break;
708 }
709 if (FLAGS_trace_re2)
710 LOG(INFO) << "Match " << trunc(pattern_)
711 << " [" << CEscape(subtext) << "]"
712 << " used DFA - no match.";
713 return false;
714 }
715 break;
716 }
717
718 if (!skipped_test && ncap <= 1) {
719 // We know exactly where it matches. That's enough.
720 if (ncap == 1)
721 submatch[0] = match;
722 } else {
723 StringPiece subtext1;
724 if (skipped_test) {
725 // DFA ran out of memory or was skipped:
726 // need to search in entire original text.
727 subtext1 = subtext;
728 } else {
729 // DFA found the exact match location:
730 // let NFA run an anchored, full match search
731 // to find submatch locations.
732 subtext1 = match;
733 anchor = Prog::kAnchored;
734 kind = Prog::kFullMatch;
735 }
736
737 if (can_one_pass && anchor != Prog::kUnanchored) {
738 if (FLAGS_trace_re2)
739 LOG(INFO) << "Match " << trunc(pattern_)
740 << " [" << CEscape(subtext) << "]"
741 << " using OnePass.";
742 if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
743 if (!skipped_test && options_.log_errors())
744 LOG(ERROR) << "SearchOnePass inconsistency";
745 return false;
746 }
747 } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
748 if (FLAGS_trace_re2)
749 LOG(INFO) << "Match " << trunc(pattern_)
750 << " [" << CEscape(subtext) << "]"
751 << " using BitState.";
752 if (!prog_->SearchBitState(subtext1, text, anchor,
753 kind, submatch, ncap)) {
754 if (!skipped_test && options_.log_errors())
755 LOG(ERROR) << "SearchBitState inconsistency";
756 return false;
757 }
758 } else {
759 if (FLAGS_trace_re2)
760 LOG(INFO) << "Match " << trunc(pattern_)
761 << " [" << CEscape(subtext) << "]"
762 << " using NFA.";
763 if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
764 if (!skipped_test && options_.log_errors())
765 LOG(ERROR) << "SearchNFA inconsistency";
766 return false;
767 }
768 }
769 }
770
771 // Adjust overall match for required prefix that we stripped off.
772 if (prefixlen > 0 && nsubmatch > 0)
773 submatch[0] = StringPiece(submatch[0].begin() - prefixlen,
774 submatch[0].size() + prefixlen);
775
776 // Zero submatches that don't exist in the regexp.
777 for (int i = ncap; i < nsubmatch; i++)
778 submatch[i] = NULL;
779 return true;
780 }
781
782 // Internal matcher - like Match() but takes Args not StringPieces.
DoMatch(const StringPiece & text,Anchor anchor,int * consumed,const Arg * const * args,int n) const783 bool RE2::DoMatch(const StringPiece& text,
784 Anchor anchor,
785 int* consumed,
786 const Arg* const* args,
787 int n) const {
788 if (!ok()) {
789 if (options_.log_errors())
790 LOG(ERROR) << "Invalid RE2: " << *error_;
791 return false;
792 }
793
794 // Count number of capture groups needed.
795 int nvec;
796 if (n == 0 && consumed == NULL)
797 nvec = 0;
798 else
799 nvec = n+1;
800
801 StringPiece* vec;
802 StringPiece stkvec[kVecSize];
803 StringPiece* heapvec = NULL;
804
805 if (nvec <= arraysize(stkvec)) {
806 vec = stkvec;
807 } else {
808 vec = new StringPiece[nvec];
809 heapvec = vec;
810 }
811
812 if (!Match(text, 0, text.size(), anchor, vec, nvec)) {
813 delete[] heapvec;
814 return false;
815 }
816
817 if(consumed != NULL)
818 *consumed = vec[0].end() - text.begin();
819
820 if (n == 0 || args == NULL) {
821 // We are not interested in results
822 delete[] heapvec;
823 return true;
824 }
825
826 int ncap = NumberOfCapturingGroups();
827 if (ncap < n) {
828 // RE has fewer capturing groups than number of arg pointers passed in
829 VLOG(1) << "Asked for " << n << " but only have " << ncap;
830 delete[] heapvec;
831 return false;
832 }
833
834 // If we got here, we must have matched the whole pattern.
835 for (int i = 0; i < n; i++) {
836 const StringPiece& s = vec[i+1];
837 if (!args[i]->Parse(s.data(), s.size())) {
838 // TODO: Should we indicate what the error was?
839 VLOG(1) << "Parse error on #" << i << " " << s << " "
840 << (void*)s.data() << "/" << s.size();
841 delete[] heapvec;
842 return false;
843 }
844 }
845
846 delete[] heapvec;
847 return true;
848 }
849
850 // Append the "rewrite" string, with backslash subsitutions from "vec",
851 // to string "out".
Rewrite(string * out,const StringPiece & rewrite,const StringPiece * vec,int veclen) const852 bool RE2::Rewrite(string *out, const StringPiece &rewrite,
853 const StringPiece *vec, int veclen) const {
854 for (const char *s = rewrite.data(), *end = s + rewrite.size();
855 s < end; s++) {
856 int c = *s;
857 if (c == '\\') {
858 s++;
859 c = (s < end) ? *s : -1;
860 if (isdigit(c)) {
861 int n = (c - '0');
862 if (n >= veclen) {
863 if (options_.log_errors()) {
864 LOG(ERROR) << "requested group " << n
865 << " in regexp " << rewrite.data();
866 }
867 return false;
868 }
869 StringPiece snip = vec[n];
870 if (snip.size() > 0)
871 out->append(snip.data(), snip.size());
872 } else if (c == '\\') {
873 out->push_back('\\');
874 } else {
875 if (options_.log_errors())
876 LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data();
877 return false;
878 }
879 } else {
880 out->push_back(c);
881 }
882 }
883 return true;
884 }
885
886 // Return the number of capturing subpatterns, or -1 if the
887 // regexp wasn't valid on construction.
NumberOfCapturingGroups() const888 int RE2::NumberOfCapturingGroups() const {
889 if (suffix_regexp_ == NULL)
890 return -1;
891 ANNOTATE_BENIGN_RACE(&num_captures_, "benign race: in the worst case"
892 " multiple threads end up doing the same work in parallel.");
893 if (num_captures_ == -1)
894 num_captures_ = suffix_regexp_->NumCaptures();
895 return num_captures_;
896 }
897
898 // Checks that the rewrite string is well-formed with respect to this
899 // regular expression.
CheckRewriteString(const StringPiece & rewrite,string * error) const900 bool RE2::CheckRewriteString(const StringPiece& rewrite, string* error) const {
901 int max_token = -1;
902 for (const char *s = rewrite.data(), *end = s + rewrite.size();
903 s < end; s++) {
904 int c = *s;
905 if (c != '\\') {
906 continue;
907 }
908 if (++s == end) {
909 *error = "Rewrite schema error: '\\' not allowed at end.";
910 return false;
911 }
912 c = *s;
913 if (c == '\\') {
914 continue;
915 }
916 if (!isdigit(c)) {
917 *error = "Rewrite schema error: "
918 "'\\' must be followed by a digit or '\\'.";
919 return false;
920 }
921 int n = (c - '0');
922 if (max_token < n) {
923 max_token = n;
924 }
925 }
926
927 if (max_token > NumberOfCapturingGroups()) {
928 SStringPrintf(error, "Rewrite schema requests %d matches, "
929 "but the regexp only has %d parenthesized subexpressions.",
930 max_token, NumberOfCapturingGroups());
931 return false;
932 }
933 return true;
934 }
935
936 /***** Parsers for various types *****/
937
parse_null(const char * str,int n,void * dest)938 bool RE2::Arg::parse_null(const char* str, int n, void* dest) {
939 // We fail if somebody asked us to store into a non-NULL void* pointer
940 return (dest == NULL);
941 }
942
parse_string(const char * str,int n,void * dest)943 bool RE2::Arg::parse_string(const char* str, int n, void* dest) {
944 if (dest == NULL) return true;
945 reinterpret_cast<string*>(dest)->assign(str, n);
946 return true;
947 }
948
parse_stringpiece(const char * str,int n,void * dest)949 bool RE2::Arg::parse_stringpiece(const char* str, int n, void* dest) {
950 if (dest == NULL) return true;
951 reinterpret_cast<StringPiece*>(dest)->set(str, n);
952 return true;
953 }
954
parse_char(const char * str,int n,void * dest)955 bool RE2::Arg::parse_char(const char* str, int n, void* dest) {
956 if (n != 1) return false;
957 if (dest == NULL) return true;
958 *(reinterpret_cast<char*>(dest)) = str[0];
959 return true;
960 }
961
parse_uchar(const char * str,int n,void * dest)962 bool RE2::Arg::parse_uchar(const char* str, int n, void* dest) {
963 if (n != 1) return false;
964 if (dest == NULL) return true;
965 *(reinterpret_cast<unsigned char*>(dest)) = str[0];
966 return true;
967 }
968
969 // Largest number spec that we are willing to parse
970 static const int kMaxNumberLength = 32;
971
972 // REQUIRES "buf" must have length at least kMaxNumberLength+1
973 // Copies "str" into "buf" and null-terminates.
974 // Overwrites *np with the new length.
TerminateNumber(char * buf,const char * str,int * np)975 static const char* TerminateNumber(char* buf, const char* str, int* np) {
976 int n = *np;
977 if (n <= 0) return "";
978 if (n > 0 && isspace(*str)) {
979 // We are less forgiving than the strtoxxx() routines and do not
980 // allow leading spaces.
981 return "";
982 }
983
984 // Although buf has a fixed maximum size, we can still handle
985 // arbitrarily large integers correctly by omitting leading zeros.
986 // (Numbers that are still too long will be out of range.)
987 // Before deciding whether str is too long,
988 // remove leading zeros with s/000+/00/.
989 // Leaving the leading two zeros in place means that
990 // we don't change 0000x123 (invalid) into 0x123 (valid).
991 // Skip over leading - before replacing.
992 bool neg = false;
993 if (n >= 1 && str[0] == '-') {
994 neg = true;
995 n--;
996 str++;
997 }
998
999 if (n >= 3 && str[0] == '0' && str[1] == '0') {
1000 while (n >= 3 && str[2] == '0') {
1001 n--;
1002 str++;
1003 }
1004 }
1005
1006 if (neg) { // make room in buf for -
1007 n++;
1008 str--;
1009 }
1010
1011 if (n > kMaxNumberLength) return "";
1012
1013 memmove(buf, str, n);
1014 if (neg) {
1015 buf[0] = '-';
1016 }
1017 buf[n] = '\0';
1018 *np = n;
1019 return buf;
1020 }
1021
parse_long_radix(const char * str,int n,void * dest,int radix)1022 bool RE2::Arg::parse_long_radix(const char* str,
1023 int n,
1024 void* dest,
1025 int radix) {
1026 if (n == 0) return false;
1027 char buf[kMaxNumberLength+1];
1028 str = TerminateNumber(buf, str, &n);
1029 char* end;
1030 errno = 0;
1031 long r = strtol(str, &end, radix);
1032 if (end != str + n) return false; // Leftover junk
1033 if (errno) return false;
1034 if (dest == NULL) return true;
1035 *(reinterpret_cast<long*>(dest)) = r;
1036 return true;
1037 }
1038
parse_ulong_radix(const char * str,int n,void * dest,int radix)1039 bool RE2::Arg::parse_ulong_radix(const char* str,
1040 int n,
1041 void* dest,
1042 int radix) {
1043 if (n == 0) return false;
1044 char buf[kMaxNumberLength+1];
1045 str = TerminateNumber(buf, str, &n);
1046 if (str[0] == '-') {
1047 // strtoul() will silently accept negative numbers and parse
1048 // them. This module is more strict and treats them as errors.
1049 return false;
1050 }
1051
1052 char* end;
1053 errno = 0;
1054 unsigned long r = strtoul(str, &end, radix);
1055 if (end != str + n) return false; // Leftover junk
1056 if (errno) return false;
1057 if (dest == NULL) return true;
1058 *(reinterpret_cast<unsigned long*>(dest)) = r;
1059 return true;
1060 }
1061
parse_short_radix(const char * str,int n,void * dest,int radix)1062 bool RE2::Arg::parse_short_radix(const char* str,
1063 int n,
1064 void* dest,
1065 int radix) {
1066 long r;
1067 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1068 if ((short)r != r) return false; // Out of range
1069 if (dest == NULL) return true;
1070 *(reinterpret_cast<short*>(dest)) = r;
1071 return true;
1072 }
1073
parse_ushort_radix(const char * str,int n,void * dest,int radix)1074 bool RE2::Arg::parse_ushort_radix(const char* str,
1075 int n,
1076 void* dest,
1077 int radix) {
1078 unsigned long r;
1079 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1080 if ((ushort)r != r) return false; // Out of range
1081 if (dest == NULL) return true;
1082 *(reinterpret_cast<unsigned short*>(dest)) = r;
1083 return true;
1084 }
1085
parse_int_radix(const char * str,int n,void * dest,int radix)1086 bool RE2::Arg::parse_int_radix(const char* str,
1087 int n,
1088 void* dest,
1089 int radix) {
1090 long r;
1091 if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse
1092 if ((int)r != r) return false; // Out of range
1093 if (dest == NULL) return true;
1094 *(reinterpret_cast<int*>(dest)) = r;
1095 return true;
1096 }
1097
parse_uint_radix(const char * str,int n,void * dest,int radix)1098 bool RE2::Arg::parse_uint_radix(const char* str,
1099 int n,
1100 void* dest,
1101 int radix) {
1102 unsigned long r;
1103 if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse
1104 if ((uint)r != r) return false; // Out of range
1105 if (dest == NULL) return true;
1106 *(reinterpret_cast<unsigned int*>(dest)) = r;
1107 return true;
1108 }
1109
parse_longlong_radix(const char * str,int n,void * dest,int radix)1110 bool RE2::Arg::parse_longlong_radix(const char* str,
1111 int n,
1112 void* dest,
1113 int radix) {
1114 if (n == 0) return false;
1115 char buf[kMaxNumberLength+1];
1116 str = TerminateNumber(buf, str, &n);
1117 char* end;
1118 errno = 0;
1119 int64 r = strtoll(str, &end, radix);
1120 if (end != str + n) return false; // Leftover junk
1121 if (errno) return false;
1122 if (dest == NULL) return true;
1123 *(reinterpret_cast<int64*>(dest)) = r;
1124 return true;
1125 }
1126
parse_ulonglong_radix(const char * str,int n,void * dest,int radix)1127 bool RE2::Arg::parse_ulonglong_radix(const char* str,
1128 int n,
1129 void* dest,
1130 int radix) {
1131 if (n == 0) return false;
1132 char buf[kMaxNumberLength+1];
1133 str = TerminateNumber(buf, str, &n);
1134 if (str[0] == '-') {
1135 // strtoull() will silently accept negative numbers and parse
1136 // them. This module is more strict and treats them as errors.
1137 return false;
1138 }
1139 char* end;
1140 errno = 0;
1141 uint64 r = strtoull(str, &end, radix);
1142 if (end != str + n) return false; // Leftover junk
1143 if (errno) return false;
1144 if (dest == NULL) return true;
1145 *(reinterpret_cast<uint64*>(dest)) = r;
1146 return true;
1147 }
1148
parse_double_float(const char * str,int n,bool isfloat,void * dest)1149 static bool parse_double_float(const char* str, int n, bool isfloat, void *dest) {
1150 if (n == 0) return false;
1151 static const int kMaxLength = 200;
1152 char buf[kMaxLength];
1153 if (n >= kMaxLength) return false;
1154 memcpy(buf, str, n);
1155 buf[n] = '\0';
1156 errno = 0;
1157 char* end;
1158 double r;
1159 if (isfloat) {
1160 r = strtof(buf, &end);
1161 } else {
1162 r = strtod(buf, &end);
1163 }
1164 if (end != buf + n) return false; // Leftover junk
1165 if (errno) return false;
1166 if (dest == NULL) return true;
1167 if (isfloat) {
1168 *(reinterpret_cast<float*>(dest)) = r;
1169 } else {
1170 *(reinterpret_cast<double*>(dest)) = r;
1171 }
1172 return true;
1173 }
1174
parse_double(const char * str,int n,void * dest)1175 bool RE2::Arg::parse_double(const char* str, int n, void* dest) {
1176 return parse_double_float(str, n, false, dest);
1177 }
1178
parse_float(const char * str,int n,void * dest)1179 bool RE2::Arg::parse_float(const char* str, int n, void* dest) {
1180 return parse_double_float(str, n, true, dest);
1181 }
1182
1183
1184 #define DEFINE_INTEGER_PARSERS(name) \
1185 bool RE2::Arg::parse_##name(const char* str, int n, void* dest) { \
1186 return parse_##name##_radix(str, n, dest, 10); \
1187 } \
1188 bool RE2::Arg::parse_##name##_hex(const char* str, int n, void* dest) { \
1189 return parse_##name##_radix(str, n, dest, 16); \
1190 } \
1191 bool RE2::Arg::parse_##name##_octal(const char* str, int n, void* dest) { \
1192 return parse_##name##_radix(str, n, dest, 8); \
1193 } \
1194 bool RE2::Arg::parse_##name##_cradix(const char* str, int n, void* dest) { \
1195 return parse_##name##_radix(str, n, dest, 0); \
1196 }
1197
1198 DEFINE_INTEGER_PARSERS(short);
1199 DEFINE_INTEGER_PARSERS(ushort);
1200 DEFINE_INTEGER_PARSERS(int);
1201 DEFINE_INTEGER_PARSERS(uint);
1202 DEFINE_INTEGER_PARSERS(long);
1203 DEFINE_INTEGER_PARSERS(ulong);
1204 DEFINE_INTEGER_PARSERS(longlong);
1205 DEFINE_INTEGER_PARSERS(ulonglong);
1206
1207 #undef DEFINE_INTEGER_PARSERS
1208
1209 } // namespace re2
1210