1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Regular expression engine tester -- test all the implementations against each other.
6 
7 #include <stddef.h>
8 #include <stdint.h>
9 #include <string.h>
10 #include <string>
11 
12 #include "util/util.h"
13 #include "util/flags.h"
14 #include "util/logging.h"
15 #include "util/strutil.h"
16 #include "re2/testing/tester.h"
17 #include "re2/prog.h"
18 #include "re2/re2.h"
19 #include "re2/regexp.h"
20 
21 DEFINE_FLAG(bool, dump_prog, false, "dump regexp program");
22 DEFINE_FLAG(bool, log_okay, false, "log successful runs");
23 DEFINE_FLAG(bool, dump_rprog, false, "dump reversed regexp program");
24 
25 DEFINE_FLAG(int, max_regexp_failures, 100,
26             "maximum number of regexp test failures (-1 = unlimited)");
27 
28 DEFINE_FLAG(std::string, regexp_engines, "",
29             "pattern to select regexp engines to test");
30 
31 namespace re2 {
32 
33 enum {
34   kMaxSubmatch = 1+16,  // $0...$16
35 };
36 
37 const char* engine_names[kEngineMax] = {
38   "Backtrack",
39   "NFA",
40   "DFA",
41   "DFA1",
42   "OnePass",
43   "BitState",
44   "RE2",
45   "RE2a",
46   "RE2b",
47   "PCRE",
48 };
49 
50 // Returns the name of the engine.
EngineName(Engine e)51 static const char* EngineName(Engine e) {
52   CHECK_GE(e, 0);
53   CHECK_LT(e, arraysize(engine_names));
54   CHECK(engine_names[e] != NULL);
55   return engine_names[e];
56 }
57 
58 // Returns bit mask of engines to use.
Engines()59 static uint32_t Engines() {
60   static bool did_parse = false;
61   static uint32_t cached_engines = 0;
62 
63   if (did_parse)
64     return cached_engines;
65 
66   if (GetFlag(FLAGS_regexp_engines).empty()) {
67     cached_engines = ~0;
68   } else {
69     for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++)
70       if (GetFlag(FLAGS_regexp_engines).find(EngineName(i)) != std::string::npos)
71         cached_engines |= 1<<i;
72   }
73 
74   if (cached_engines == 0)
75     LOG(INFO) << "Warning: no engines enabled.";
76   if (!UsingPCRE)
77     cached_engines &= ~(1<<kEnginePCRE);
78   for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) {
79     if (cached_engines & (1<<i))
80       LOG(INFO) << EngineName(i) << " enabled";
81   }
82 
83   did_parse = true;
84   return cached_engines;
85 }
86 
87 // The result of running a match.
88 struct TestInstance::Result {
89   bool skipped;         // test skipped: wasn't applicable
90   bool matched;         // found a match
91   bool untrusted;       // don't really trust the answer
92   bool have_submatch;   // computed all submatch info
93   bool have_submatch0;  // computed just submatch[0]
94   StringPiece submatch[kMaxSubmatch];
95 };
96 
97 typedef TestInstance::Result Result;
98 
99 // Formats a single capture range s in text in the form (a,b)
100 // where a and b are the starting and ending offsets of s in text.
FormatCapture(const StringPiece & text,const StringPiece & s)101 static std::string FormatCapture(const StringPiece& text,
102                                  const StringPiece& s) {
103   if (s.data() == NULL)
104     return "(?,?)";
105   return StringPrintf("(%td,%td)",
106                       s.begin() - text.begin(),
107                       s.end() - text.begin());
108 }
109 
110 // Returns whether text contains non-ASCII (>= 0x80) bytes.
NonASCII(const StringPiece & text)111 static bool NonASCII(const StringPiece& text) {
112   for (size_t i = 0; i < text.size(); i++)
113     if ((uint8_t)text[i] >= 0x80)
114       return true;
115   return false;
116 }
117 
118 // Returns string representation of match kind.
FormatKind(Prog::MatchKind kind)119 static std::string FormatKind(Prog::MatchKind kind) {
120   switch (kind) {
121     case Prog::kFullMatch:
122       return "full match";
123     case Prog::kLongestMatch:
124       return "longest match";
125     case Prog::kFirstMatch:
126       return "first match";
127     case Prog::kManyMatch:
128       return "many match";
129   }
130   return "???";
131 }
132 
133 // Returns string representation of anchor kind.
FormatAnchor(Prog::Anchor anchor)134 static std::string FormatAnchor(Prog::Anchor anchor) {
135   switch (anchor) {
136     case Prog::kAnchored:
137       return "anchored";
138     case Prog::kUnanchored:
139       return "unanchored";
140   }
141   return "???";
142 }
143 
144 struct ParseMode {
145   Regexp::ParseFlags parse_flags;
146   std::string desc;
147 };
148 
149 static const Regexp::ParseFlags single_line =
150   Regexp::LikePerl;
151 static const Regexp::ParseFlags multi_line =
152   static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);
153 
154 static ParseMode parse_modes[] = {
155   { single_line,                   "single-line"          },
156   { single_line|Regexp::Latin1,    "single-line, latin1"  },
157   { multi_line,                    "multiline"            },
158   { multi_line|Regexp::NonGreedy,  "multiline, nongreedy" },
159   { multi_line|Regexp::Latin1,     "multiline, latin1"    },
160 };
161 
FormatMode(Regexp::ParseFlags flags)162 static std::string FormatMode(Regexp::ParseFlags flags) {
163   for (size_t i = 0; i < arraysize(parse_modes); i++)
164     if (parse_modes[i].parse_flags == flags)
165       return parse_modes[i].desc;
166   return StringPrintf("%#x", static_cast<uint32_t>(flags));
167 }
168 
169 // Constructs and saves all the matching engines that
170 // will be required for the given tests.
TestInstance(const StringPiece & regexp_str,Prog::MatchKind kind,Regexp::ParseFlags flags)171 TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
172                            Regexp::ParseFlags flags)
173   : regexp_str_(regexp_str),
174     kind_(kind),
175     flags_(flags),
176     error_(false),
177     regexp_(NULL),
178     num_captures_(0),
179     prog_(NULL),
180     rprog_(NULL),
181     re_(NULL),
182     re2_(NULL) {
183 
184   VLOG(1) << CEscape(regexp_str);
185 
186   // Compile regexp to prog.
187   // Always required - needed for backtracking (reference implementation).
188   RegexpStatus status;
189   regexp_ = Regexp::Parse(regexp_str, flags, &status);
190   if (regexp_ == NULL) {
191     LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
192               << " mode: " << FormatMode(flags);
193     error_ = true;
194     return;
195   }
196   num_captures_ = regexp_->NumCaptures();
197   prog_ = regexp_->CompileToProg(0);
198   if (prog_ == NULL) {
199     LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
200     error_ = true;
201     return;
202   }
203   if (GetFlag(FLAGS_dump_prog)) {
204     LOG(INFO) << "Prog for "
205               << " regexp "
206               << CEscape(regexp_str_)
207               << " (" << FormatKind(kind_)
208               << ", " << FormatMode(flags_)
209               << ")\n"
210               << prog_->Dump();
211   }
212 
213   // Compile regexp to reversed prog.  Only needed for DFA engines.
214   if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {
215     rprog_ = regexp_->CompileToReverseProg(0);
216     if (rprog_ == NULL) {
217       LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
218       error_ = true;
219       return;
220     }
221     if (GetFlag(FLAGS_dump_rprog))
222       LOG(INFO) << rprog_->Dump();
223   }
224 
225   // Create re string that will be used for RE and RE2.
226   std::string re = std::string(regexp_str);
227   // Accomodate flags.
228   // Regexp::Latin1 will be accomodated below.
229   if (!(flags & Regexp::OneLine))
230     re = "(?m)" + re;
231   if (flags & Regexp::NonGreedy)
232     re = "(?U)" + re;
233   if (flags & Regexp::DotNL)
234     re = "(?s)" + re;
235 
236   // Compile regexp to RE2.
237   if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {
238     RE2::Options options;
239     if (flags & Regexp::Latin1)
240       options.set_encoding(RE2::Options::EncodingLatin1);
241     if (kind_ == Prog::kLongestMatch)
242       options.set_longest_match(true);
243     re2_ = new RE2(re, options);
244     if (!re2_->error().empty()) {
245       LOG(INFO) << "Cannot RE2: " << CEscape(re);
246       error_ = true;
247       return;
248     }
249   }
250 
251   // Compile regexp to RE.
252   // PCRE as exposed by the RE interface isn't always usable.
253   // 1. It disagrees about handling of empty-string reptitions
254   //    like matching (a*)* against "b".  PCRE treats the (a*) as
255   //    occurring once, while we treat it as occurring not at all.
256   // 2. It treats $ as this weird thing meaning end of string
257   //    or before the \n at the end of the string.
258   // 3. It doesn't implement POSIX leftmost-longest matching.
259   // 4. It lets \s match vertical tab.
260   // MimicsPCRE() detects 1 and 2.
261   if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
262       kind_ != Prog::kLongestMatch) {
263     PCRE_Options o;
264     o.set_option(PCRE::UTF8);
265     if (flags & Regexp::Latin1)
266       o.set_option(PCRE::None);
267     // PCRE has interface bug keeping us from finding $0, so
268     // add one more layer of parens.
269     re_ = new PCRE("("+re+")", o);
270     if (!re_->error().empty()) {
271       LOG(INFO) << "Cannot PCRE: " << CEscape(re);
272       error_ = true;
273       return;
274     }
275   }
276 }
277 
~TestInstance()278 TestInstance::~TestInstance() {
279   if (regexp_)
280     regexp_->Decref();
281   delete prog_;
282   delete rprog_;
283   delete re_;
284   delete re2_;
285 }
286 
287 // Runs a single search using the named engine type.
288 // This interface hides all the irregularities of the various
289 // engine interfaces from the rest of this file.
RunSearch(Engine type,const StringPiece & orig_text,const StringPiece & orig_context,Prog::Anchor anchor,Result * result)290 void TestInstance::RunSearch(Engine type,
291                              const StringPiece& orig_text,
292                              const StringPiece& orig_context,
293                              Prog::Anchor anchor,
294                              Result* result) {
295   // Result is not trivial, so we cannot freely clear it with memset(3),
296   // but zeroing objects like so is safe and expedient for our purposes.
297   memset(reinterpret_cast<void*>(result), 0, sizeof *result);
298   if (regexp_ == NULL) {
299     result->skipped = true;
300     return;
301   }
302   int nsubmatch = 1 + num_captures_;  // NumCaptures doesn't count $0
303   if (nsubmatch > kMaxSubmatch)
304     nsubmatch = kMaxSubmatch;
305 
306   StringPiece text = orig_text;
307   StringPiece context = orig_context;
308 
309   switch (type) {
310     default:
311       LOG(FATAL) << "Bad RunSearch type: " << (int)type;
312 
313     case kEngineBacktrack:
314       if (prog_ == NULL) {
315         result->skipped = true;
316         break;
317       }
318       result->matched =
319         prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
320                                      result->submatch, nsubmatch);
321       result->have_submatch = true;
322       break;
323 
324     case kEngineNFA:
325       if (prog_ == NULL) {
326         result->skipped = true;
327         break;
328       }
329       result->matched =
330         prog_->SearchNFA(text, context, anchor, kind_,
331                         result->submatch, nsubmatch);
332       result->have_submatch = true;
333       break;
334 
335     case kEngineDFA:
336       if (prog_ == NULL) {
337         result->skipped = true;
338         break;
339       }
340       result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
341                                          &result->skipped, NULL);
342       break;
343 
344     case kEngineDFA1:
345       if (prog_ == NULL || rprog_ == NULL) {
346         result->skipped = true;
347         break;
348       }
349       result->matched =
350         prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
351                          &result->skipped, NULL);
352       // If anchored, no need for second run,
353       // but do it anyway to find more bugs.
354       if (result->matched) {
355         if (!rprog_->SearchDFA(result->submatch[0], context,
356                                Prog::kAnchored, Prog::kLongestMatch,
357                                result->submatch,
358                                &result->skipped, NULL)) {
359           LOG(ERROR) << "Reverse DFA inconsistency: "
360                      << CEscape(regexp_str_)
361                      << " on " << CEscape(text);
362           result->matched = false;
363         }
364       }
365       result->have_submatch0 = true;
366       break;
367 
368     case kEngineOnePass:
369       if (prog_ == NULL ||
370           !prog_->IsOnePass() ||
371           anchor == Prog::kUnanchored ||
372           nsubmatch > Prog::kMaxOnePassCapture) {
373         result->skipped = true;
374         break;
375       }
376       result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
377                                       result->submatch, nsubmatch);
378       result->have_submatch = true;
379       break;
380 
381     case kEngineBitState:
382       if (prog_ == NULL ||
383           !prog_->CanBitState()) {
384         result->skipped = true;
385         break;
386       }
387       result->matched = prog_->SearchBitState(text, context, anchor, kind_,
388                                               result->submatch, nsubmatch);
389       result->have_submatch = true;
390       break;
391 
392     case kEngineRE2:
393     case kEngineRE2a:
394     case kEngineRE2b: {
395       if (!re2_ || text.end() != context.end()) {
396         result->skipped = true;
397         break;
398       }
399 
400       RE2::Anchor re_anchor;
401       if (anchor == Prog::kAnchored)
402         re_anchor = RE2::ANCHOR_START;
403       else
404         re_anchor = RE2::UNANCHORED;
405       if (kind_ == Prog::kFullMatch)
406         re_anchor = RE2::ANCHOR_BOTH;
407 
408       result->matched = re2_->Match(
409           context,
410           static_cast<size_t>(text.begin() - context.begin()),
411           static_cast<size_t>(text.end() - context.begin()),
412           re_anchor,
413           result->submatch,
414           nsubmatch);
415       result->have_submatch = nsubmatch > 0;
416       break;
417     }
418 
419     case kEnginePCRE: {
420       if (!re_ || text.begin() != context.begin() ||
421           text.end() != context.end()) {
422         result->skipped = true;
423         break;
424       }
425 
426       // In Perl/PCRE, \v matches any character considered vertical
427       // whitespace, not just vertical tab. Regexp::MimicsPCRE() is
428       // unable to handle all cases of this, unfortunately, so just
429       // catch them here. :(
430       if (regexp_str_.find("\\v") != StringPiece::npos &&
431           (text.find('\n') != StringPiece::npos ||
432            text.find('\f') != StringPiece::npos ||
433            text.find('\r') != StringPiece::npos)) {
434         result->skipped = true;
435         break;
436       }
437 
438       // PCRE 8.34 or so started allowing vertical tab to match \s,
439       // following a change made in Perl 5.18. RE2 does not.
440       if ((regexp_str_.find("\\s") != StringPiece::npos ||
441            regexp_str_.find("\\S") != StringPiece::npos) &&
442           text.find('\v') != StringPiece::npos) {
443         result->skipped = true;
444         break;
445       }
446 
447       const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];
448       PCRE::Arg *a = new PCRE::Arg[nsubmatch];
449       for (int i = 0; i < nsubmatch; i++) {
450         a[i] = PCRE::Arg(&result->submatch[i]);
451         argptr[i] = &a[i];
452       }
453       size_t consumed;
454       PCRE::Anchor pcre_anchor;
455       if (anchor == Prog::kAnchored)
456         pcre_anchor = PCRE::ANCHOR_START;
457       else
458         pcre_anchor = PCRE::UNANCHORED;
459       if (kind_ == Prog::kFullMatch)
460         pcre_anchor = PCRE::ANCHOR_BOTH;
461       re_->ClearHitLimit();
462       result->matched =
463         re_->DoMatch(text,
464                      pcre_anchor,
465                      &consumed,
466                      argptr, nsubmatch);
467       if (re_->HitLimit()) {
468         result->untrusted = true;
469         delete[] argptr;
470         delete[] a;
471         break;
472       }
473       result->have_submatch = true;
474       delete[] argptr;
475       delete[] a;
476       break;
477     }
478   }
479 
480   if (!result->matched)
481     memset(result->submatch, 0, sizeof result->submatch);
482 }
483 
484 // Checks whether r is okay given that correct is the right answer.
485 // Specifically, r's answers have to match (but it doesn't have to
486 // claim to have all the answers).
ResultOkay(const Result & r,const Result & correct)487 static bool ResultOkay(const Result& r, const Result& correct) {
488   if (r.skipped)
489     return true;
490   if (r.matched != correct.matched)
491     return false;
492   if (r.have_submatch || r.have_submatch0) {
493     for (int i = 0; i < kMaxSubmatch; i++) {
494       if (correct.submatch[i].data() != r.submatch[i].data() ||
495           correct.submatch[i].size() != r.submatch[i].size())
496         return false;
497       if (!r.have_submatch)
498         break;
499     }
500   }
501   return true;
502 }
503 
504 // Runs a single test.
RunCase(const StringPiece & text,const StringPiece & context,Prog::Anchor anchor)505 bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
506                            Prog::Anchor anchor) {
507   // Backtracking is the gold standard.
508   Result correct;
509   RunSearch(kEngineBacktrack, text, context, anchor, &correct);
510   if (correct.skipped) {
511     if (regexp_ == NULL)
512       return true;
513     LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
514                << " " << FormatMode(flags_);
515     return false;
516   }
517   VLOG(1) << "Try: regexp " << CEscape(regexp_str_)
518           << " text " << CEscape(text)
519           << " (" << FormatKind(kind_)
520           << ", " << FormatAnchor(anchor)
521           << ", " << FormatMode(flags_)
522           << ")";
523 
524   // Compare the others.
525   bool all_okay = true;
526   for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) {
527     if (!(Engines() & (1<<i)))
528       continue;
529 
530     Result r;
531     RunSearch(i, text, context, anchor, &r);
532     if (ResultOkay(r, correct)) {
533       if (GetFlag(FLAGS_log_okay))
534         LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
535       continue;
536     }
537 
538     // We disagree with PCRE on the meaning of some Unicode matches.
539     // In particular, we treat non-ASCII UTF-8 as non-word characters.
540     // We also treat "empty" character sets like [^\w\W] as being
541     // impossible to match, while PCRE apparently excludes some code
542     // points (e.g., 0x0080) from both \w and \W.
543     if (i == kEnginePCRE && NonASCII(text))
544       continue;
545 
546     if (!r.untrusted)
547       all_okay = false;
548 
549     LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,
550              context, anchor);
551     if (r.matched != correct.matched) {
552       if (r.matched) {
553         LOG(INFO) << "   Should not match (but does).";
554       } else {
555         LOG(INFO) << "   Should match (but does not).";
556         continue;
557       }
558     }
559     for (int i = 0; i < 1+num_captures_; i++) {
560       if (r.submatch[i].data() != correct.submatch[i].data() ||
561           r.submatch[i].size() != correct.submatch[i].size()) {
562         LOG(INFO) <<
563           StringPrintf("   $%d: should be %s is %s",
564                        i,
565                        FormatCapture(text, correct.submatch[i]).c_str(),
566                        FormatCapture(text, r.submatch[i]).c_str());
567       } else {
568         LOG(INFO) <<
569           StringPrintf("   $%d: %s ok", i,
570                        FormatCapture(text, r.submatch[i]).c_str());
571       }
572     }
573   }
574 
575   if (!all_okay) {
576     // This will be initialised once (after flags have been initialised)
577     // and that is desirable because we want to enforce a global limit.
578     static int max_regexp_failures = GetFlag(FLAGS_max_regexp_failures);
579     if (max_regexp_failures > 0 && --max_regexp_failures == 0)
580       LOG(QFATAL) << "Too many regexp failures.";
581   }
582 
583   return all_okay;
584 }
585 
LogMatch(const char * prefix,Engine e,const StringPiece & text,const StringPiece & context,Prog::Anchor anchor)586 void TestInstance::LogMatch(const char* prefix, Engine e,
587                             const StringPiece& text, const StringPiece& context,
588                             Prog::Anchor anchor) {
589   LOG(INFO) << prefix
590     << EngineName(e)
591     << " regexp "
592     << CEscape(regexp_str_)
593     << " "
594     << CEscape(regexp_->ToString())
595     << " text "
596     << CEscape(text)
597     << " ("
598     << text.begin() - context.begin()
599     << ","
600     << text.end() - context.begin()
601     << ") of context "
602     << CEscape(context)
603     << " (" << FormatKind(kind_)
604     << ", " << FormatAnchor(anchor)
605     << ", " << FormatMode(flags_)
606     << ")";
607 }
608 
609 static Prog::MatchKind kinds[] = {
610   Prog::kFirstMatch,
611   Prog::kLongestMatch,
612   Prog::kFullMatch,
613 };
614 
615 // Test all possible match kinds and parse modes.
Tester(const StringPiece & regexp)616 Tester::Tester(const StringPiece& regexp) {
617   error_ = false;
618   for (size_t i = 0; i < arraysize(kinds); i++) {
619     for (size_t j = 0; j < arraysize(parse_modes); j++) {
620       TestInstance* t = new TestInstance(regexp, kinds[i],
621                                          parse_modes[j].parse_flags);
622       error_ |= t->error();
623       v_.push_back(t);
624     }
625   }
626 }
627 
~Tester()628 Tester::~Tester() {
629   for (size_t i = 0; i < v_.size(); i++)
630     delete v_[i];
631 }
632 
TestCase(const StringPiece & text,const StringPiece & context,Prog::Anchor anchor)633 bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
634                          Prog::Anchor anchor) {
635   bool okay = true;
636   for (size_t i = 0; i < v_.size(); i++)
637     okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));
638   return okay;
639 }
640 
641 static Prog::Anchor anchors[] = {
642   Prog::kAnchored,
643   Prog::kUnanchored
644 };
645 
TestInput(const StringPiece & text)646 bool Tester::TestInput(const StringPiece& text) {
647   bool okay = TestInputInContext(text, text);
648   if (!text.empty()) {
649     StringPiece sp;
650     sp = text;
651     sp.remove_prefix(1);
652     okay &= TestInputInContext(sp, text);
653     sp = text;
654     sp.remove_suffix(1);
655     okay &= TestInputInContext(sp, text);
656   }
657   return okay;
658 }
659 
TestInputInContext(const StringPiece & text,const StringPiece & context)660 bool Tester::TestInputInContext(const StringPiece& text,
661                                 const StringPiece& context) {
662   bool okay = true;
663   for (size_t i = 0; i < arraysize(anchors); i++)
664     okay &= TestCase(text, context, anchors[i]);
665   return okay;
666 }
667 
TestRegexpOnText(const StringPiece & regexp,const StringPiece & text)668 bool TestRegexpOnText(const StringPiece& regexp,
669                       const StringPiece& text) {
670   Tester t(regexp);
671   return t.TestInput(text);
672 }
673 
674 }  // namespace re2
675