1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Regular expression engine tester -- test all the implementations against each other.
6 
7 #include "util/util.h"
8 #include "util/flags.h"
9 #include "re2/testing/tester.h"
10 #include "re2/prog.h"
11 #include "re2/re2.h"
12 #include "re2/regexp.h"
13 
14 DEFINE_bool(dump_prog, false, "dump regexp program");
15 DEFINE_bool(log_okay, false, "log successful runs");
16 DEFINE_bool(dump_rprog, false, "dump reversed regexp program");
17 
18 DEFINE_int32(max_regexp_failures, 100,
19              "maximum number of regexp test failures (-1 = unlimited)");
20 
21 DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test");
22 
23 namespace re2 {
24 
25 enum {
26   kMaxSubmatch = 1+16,  // $0...$16
27 };
28 
29 const char* engine_types[kEngineMax] = {
30   "Backtrack",
31   "NFA",
32   "DFA",
33   "DFA1",
34   "OnePass",
35   "BitState",
36   "RE2",
37   "RE2a",
38   "RE2b",
39   "PCRE",
40 };
41 
42 // Returns the name string for the type t.
EngineString(Engine t)43 static string EngineString(Engine t) {
44   if (t < 0 || t >= arraysize(engine_types) || engine_types[t] == NULL) {
45     return StringPrintf("type%d", static_cast<int>(t));
46   }
47   return engine_types[t];
48 }
49 
50 // Returns bit mask of engines to use.
Engines()51 static uint32 Engines() {
52   static uint32 cached_engines;
53   static bool did_parse;
54 
55   if (did_parse)
56     return cached_engines;
57 
58   if (FLAGS_regexp_engines.empty()) {
59     cached_engines = ~0;
60   } else {
61     for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++)
62       if (strstr(EngineString(i).c_str(), FLAGS_regexp_engines.c_str()))
63         cached_engines |= 1<<i;
64   }
65 
66   if (cached_engines == 0)
67     LOG(INFO) << "Warning: no engines enabled.";
68   if (!UsingPCRE)
69     cached_engines &= ~(1<<kEnginePCRE);
70   for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) {
71     if (cached_engines & (1<<i))
72       LOG(INFO) << EngineString(i) << " enabled";
73   }
74   did_parse = true;
75   return cached_engines;
76 }
77 
78 // The result of running a match.
79 struct TestInstance::Result {
80   bool skipped;         // test skipped: wasn't applicable
81   bool matched;         // found a match
82   bool untrusted;       // don't really trust the answer
83   bool have_submatch;   // computed all submatch info
84   bool have_submatch0;  // computed just submatch[0]
85   StringPiece submatch[kMaxSubmatch];
86 };
87 
88 typedef TestInstance::Result Result;
89 
90 // Formats a single capture range s in text in the form (a,b)
91 // where a and b are the starting and ending offsets of s in text.
FormatCapture(const StringPiece & text,const StringPiece & s)92 static string FormatCapture(const StringPiece& text, const StringPiece& s) {
93   if (s.begin() == NULL)
94     return "(?,?)";
95   return StringPrintf("(%d,%d)",
96                       static_cast<int>(s.begin() - text.begin()),
97                       static_cast<int>(s.end() - text.begin()));
98 }
99 
100 // Returns whether text contains non-ASCII (>= 0x80) bytes.
NonASCII(const StringPiece & text)101 static bool NonASCII(const StringPiece& text) {
102   for (int i = 0; i < text.size(); i++)
103     if ((uint8)text[i] >= 0x80)
104       return true;
105   return false;
106 }
107 
108 // Returns string representation of match kind.
FormatKind(Prog::MatchKind kind)109 static string FormatKind(Prog::MatchKind kind) {
110   switch (kind) {
111     case Prog::kFullMatch:
112       return "full match";
113     case Prog::kLongestMatch:
114       return "longest match";
115     case Prog::kFirstMatch:
116       return "first match";
117     case Prog::kManyMatch:
118       return "many match";
119   }
120   return "???";
121 }
122 
123 // Returns string representation of anchor kind.
FormatAnchor(Prog::Anchor anchor)124 static string FormatAnchor(Prog::Anchor anchor) {
125   switch (anchor) {
126     case Prog::kAnchored:
127       return "anchored";
128     case Prog::kUnanchored:
129       return "unanchored";
130   }
131   return "???";
132 }
133 
134 struct ParseMode {
135   Regexp::ParseFlags parse_flags;
136   string desc;
137 };
138 
139 static const Regexp::ParseFlags single_line =
140   Regexp::LikePerl;
141 static const Regexp::ParseFlags multi_line =
142   static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);
143 
144 static ParseMode parse_modes[] = {
145   { single_line,                   "single-line"          },
146   { single_line|Regexp::Latin1,    "single-line, latin1"  },
147   { multi_line,                    "multiline"            },
148   { multi_line|Regexp::NonGreedy,  "multiline, nongreedy" },
149   { multi_line|Regexp::Latin1,     "multiline, latin1"    },
150 };
151 
FormatMode(Regexp::ParseFlags flags)152 static string FormatMode(Regexp::ParseFlags flags) {
153   for (int i = 0; i < arraysize(parse_modes); i++)
154     if (parse_modes[i].parse_flags == flags)
155       return parse_modes[i].desc;
156   return StringPrintf("%#x", static_cast<uint>(flags));
157 }
158 
159 // Constructs and saves all the matching engines that
160 // will be required for the given tests.
TestInstance(const StringPiece & regexp_str,Prog::MatchKind kind,Regexp::ParseFlags flags)161 TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
162                            Regexp::ParseFlags flags)
163   : regexp_str_(regexp_str),
164     kind_(kind),
165     flags_(flags),
166     error_(false),
167     regexp_(NULL),
168     num_captures_(0),
169     prog_(NULL),
170     rprog_(NULL),
171     re_(NULL),
172     re2_(NULL) {
173 
174   VLOG(1) << CEscape(regexp_str);
175 
176   // Compile regexp to prog.
177   // Always required - needed for backtracking (reference implementation).
178   RegexpStatus status;
179   regexp_ = Regexp::Parse(regexp_str, flags, &status);
180   if (regexp_ == NULL) {
181     LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
182               << " mode: " << FormatMode(flags);
183     error_ = true;
184     return;
185   }
186   num_captures_ = regexp_->NumCaptures();
187   prog_ = regexp_->CompileToProg(0);
188   if (prog_ == NULL) {
189     LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
190     error_ = true;
191     return;
192   }
193   if (FLAGS_dump_prog) {
194     LOG(INFO) << "Prog for "
195               << " regexp "
196               << CEscape(regexp_str_)
197               << " (" << FormatKind(kind_)
198               << ", " << FormatMode(flags_)
199               << ")\n"
200               << prog_->Dump();
201   }
202 
203   // Compile regexp to reversed prog.  Only needed for DFA engines.
204   if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {
205     rprog_ = regexp_->CompileToReverseProg(0);
206     if (rprog_ == NULL) {
207       LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
208       error_ = true;
209       return;
210     }
211     if (FLAGS_dump_rprog)
212       LOG(INFO) << rprog_->Dump();
213   }
214 
215   // Create re string that will be used for RE and RE2.
216   string re = regexp_str.as_string();
217   // Accomodate flags.
218   // Regexp::Latin1 will be accomodated below.
219   if (!(flags & Regexp::OneLine))
220     re = "(?m)" + re;
221   if (flags & Regexp::NonGreedy)
222     re = "(?U)" + re;
223   if (flags & Regexp::DotNL)
224     re = "(?s)" + re;
225 
226   // Compile regexp to RE2.
227   if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {
228     RE2::Options options;
229     if (flags & Regexp::Latin1)
230       options.set_encoding(RE2::Options::EncodingLatin1);
231     if (kind_ == Prog::kLongestMatch)
232       options.set_longest_match(true);
233     re2_ = new RE2(re, options);
234     if (!re2_->error().empty()) {
235       LOG(INFO) << "Cannot RE2: " << CEscape(re);
236       error_ = true;
237       return;
238     }
239   }
240 
241   // Compile regexp to RE.
242   // PCRE as exposed by the RE interface isn't always usable.
243   // 1. It disagrees about handling of empty-string reptitions
244   //    like matching (a*)* against "b".  PCRE treats the (a*) as
245   //    occurring once, while we treat it as occurring not at all.
246   // 2. It treats $ as this weird thing meaning end of string
247   //    or before the \n at the end of the string.
248   // 3. It doesn't implement POSIX leftmost-longest matching.
249   // MimicsPCRE() detects 1 and 2.
250   if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
251       kind_ != Prog::kLongestMatch) {
252     PCRE_Options o;
253     o.set_option(PCRE::UTF8);
254     if (flags & Regexp::Latin1)
255       o.set_option(PCRE::None);
256     // PCRE has interface bug keeping us from finding $0, so
257     // add one more layer of parens.
258     re_ = new PCRE("("+re+")", o);
259     if (!re_->error().empty()) {
260       LOG(INFO) << "Cannot PCRE: " << CEscape(re);
261       error_ = true;
262       return;
263     }
264   }
265 }
266 
~TestInstance()267 TestInstance::~TestInstance() {
268   if (regexp_)
269     regexp_->Decref();
270   delete prog_;
271   delete rprog_;
272   delete re_;
273   delete re2_;
274 }
275 
276 // Runs a single search using the named engine type.
277 // This interface hides all the irregularities of the various
278 // engine interfaces from the rest of this file.
RunSearch(Engine type,const StringPiece & orig_text,const StringPiece & orig_context,Prog::Anchor anchor,Result * result)279 void TestInstance::RunSearch(Engine type,
280                              const StringPiece& orig_text,
281                              const StringPiece& orig_context,
282                              Prog::Anchor anchor,
283                              Result *result) {
284   memset(result, 0, sizeof *result);
285   if (regexp_ == NULL) {
286     result->skipped = true;
287     return;
288   }
289   int nsubmatch = 1 + num_captures_;  // NumCaptures doesn't count $0
290   if (nsubmatch > kMaxSubmatch)
291     nsubmatch = kMaxSubmatch;
292 
293   StringPiece text = orig_text;
294   StringPiece context = orig_context;
295 
296   switch (type) {
297     default:
298       LOG(FATAL) << "Bad RunSearch type: " << (int)type;
299 
300     case kEngineBacktrack:
301       if (prog_ == NULL) {
302         result->skipped = true;
303         break;
304       }
305       result->matched =
306         prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
307                                      result->submatch, nsubmatch);
308       result->have_submatch = true;
309       break;
310 
311     case kEngineNFA:
312       if (prog_ == NULL) {
313         result->skipped = true;
314         break;
315       }
316       result->matched =
317         prog_->SearchNFA(text, context, anchor, kind_,
318                         result->submatch, nsubmatch);
319       result->have_submatch = true;
320       break;
321 
322     case kEngineDFA:
323       if (prog_ == NULL) {
324         result->skipped = true;
325         break;
326       }
327       result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
328                                          &result->skipped, NULL);
329       break;
330 
331     case kEngineDFA1:
332       if (prog_ == NULL || rprog_ == NULL) {
333         result->skipped = true;
334         break;
335       }
336       result->matched =
337         prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
338                          &result->skipped, NULL);
339       // If anchored, no need for second run,
340       // but do it anyway to find more bugs.
341       if (result->matched) {
342         if (!rprog_->SearchDFA(result->submatch[0], context,
343                                Prog::kAnchored, Prog::kLongestMatch,
344                                result->submatch,
345                                &result->skipped, NULL)) {
346           LOG(ERROR) << "Reverse DFA inconsistency: " << CEscape(regexp_str_)
347                      << " on " << CEscape(text);
348           result->matched = false;
349         }
350       }
351       result->have_submatch0 = true;
352       break;
353 
354     case kEngineOnePass:
355       if (prog_ == NULL ||
356           anchor == Prog::kUnanchored ||
357           !prog_->IsOnePass() ||
358           nsubmatch > Prog::kMaxOnePassCapture) {
359         result->skipped = true;
360         break;
361       }
362       result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
363                                       result->submatch, nsubmatch);
364       result->have_submatch = true;
365       break;
366 
367     case kEngineBitState:
368       if (prog_ == NULL) {
369         result->skipped = true;
370         break;
371       }
372       result->matched = prog_->SearchBitState(text, context, anchor, kind_,
373                                               result->submatch, nsubmatch);
374       result->have_submatch = true;
375       break;
376 
377     case kEngineRE2:
378     case kEngineRE2a:
379     case kEngineRE2b: {
380       if (!re2_ || text.end() != context.end()) {
381         result->skipped = true;
382         break;
383       }
384 
385       RE2::Anchor re_anchor;
386       if (anchor == Prog::kAnchored)
387         re_anchor = RE2::ANCHOR_START;
388       else
389         re_anchor = RE2::UNANCHORED;
390       if (kind_ == Prog::kFullMatch)
391         re_anchor = RE2::ANCHOR_BOTH;
392 
393       result->matched = re2_->Match(context,
394                                     text.begin() - context.begin(),
395                                     text.end() - context.begin(),
396                                     re_anchor, result->submatch, nsubmatch);
397       result->have_submatch = nsubmatch > 0;
398       break;
399     }
400 
401     case kEnginePCRE: {
402       if (!re_ || text.begin() != context.begin() ||
403           text.end() != context.end()) {
404         result->skipped = true;
405         break;
406       }
407 
408       const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];
409       PCRE::Arg *a = new PCRE::Arg[nsubmatch];
410       for (int i = 0; i < nsubmatch; i++) {
411         a[i] = PCRE::Arg(&result->submatch[i]);
412         argptr[i] = &a[i];
413       }
414       int consumed;
415       PCRE::Anchor pcre_anchor;
416       if (anchor == Prog::kAnchored)
417         pcre_anchor = PCRE::ANCHOR_START;
418       else
419         pcre_anchor = PCRE::UNANCHORED;
420       if (kind_ == Prog::kFullMatch)
421         pcre_anchor = PCRE::ANCHOR_BOTH;
422       re_->ClearHitLimit();
423       result->matched =
424         re_->DoMatch(text,
425                      pcre_anchor,
426                      &consumed,
427                      argptr, nsubmatch);
428       if (re_->HitLimit()) {
429         result->untrusted = true;
430         delete[] argptr;
431         delete[] a;
432         break;
433       }
434       result->have_submatch = true;
435 
436       // Work around RE interface bug: PCRE returns -1 as the
437       // offsets for an unmatched subexpression, and RE should
438       // turn that into StringPiece(NULL) but in fact it uses
439       // StringPiece(text.begin() - 1, 0).  Oops.
440       for (int i = 0; i < nsubmatch; i++)
441         if (result->submatch[i].begin() == text.begin() - 1)
442           result->submatch[i] = NULL;
443       delete[] argptr;
444       delete[] a;
445       break;
446     }
447   }
448 
449   if (!result->matched)
450     memset(result->submatch, 0, sizeof result->submatch);
451 }
452 
453 // Checks whether r is okay given that correct is the right answer.
454 // Specifically, r's answers have to match (but it doesn't have to
455 // claim to have all the answers).
ResultOkay(const Result & r,const Result & correct)456 static bool ResultOkay(const Result& r, const Result& correct) {
457   if (r.skipped)
458     return true;
459   if (r.matched != correct.matched)
460     return false;
461   if (r.have_submatch || r.have_submatch0) {
462     for (int i = 0; i < kMaxSubmatch; i++) {
463       if (correct.submatch[i].begin() != r.submatch[i].begin() ||
464           correct.submatch[i].size() != r.submatch[i].size())
465         return false;
466       if (!r.have_submatch)
467         break;
468     }
469   }
470   return true;
471 }
472 
473 // Runs a single test.
RunCase(const StringPiece & text,const StringPiece & context,Prog::Anchor anchor)474 bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
475                            Prog::Anchor anchor) {
476   // Backtracking is the gold standard.
477   Result correct;
478   RunSearch(kEngineBacktrack, text, context, anchor, &correct);
479   if (correct.skipped) {
480     if (regexp_ == NULL)
481       return true;
482     LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
483                << " " << FormatMode(flags_);
484     return false;
485   }
486   VLOG(1) << "Try: regexp " << CEscape(regexp_str_)
487           << " text " << CEscape(text)
488           << " (" << FormatKind(kind_)
489           << ", " << FormatAnchor(anchor)
490           << ", " << FormatMode(flags_)
491           << ")";
492 
493   // Compare the others.
494   bool all_okay = true;
495   for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) {
496     if (!(Engines() & (1<<i)))
497       continue;
498 
499     Result r;
500     RunSearch(i, text, context, anchor, &r);
501     if (ResultOkay(r, correct)) {
502       if (FLAGS_log_okay)
503         LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
504       continue;
505     }
506 
507     // We disagree with PCRE on the meaning of some Unicode matches.
508     // In particular, we treat all non-ASCII UTF-8 as word characters.
509     // We also treat "empty" character sets like [^\w\W] as being
510     // impossible to match, while PCRE apparently excludes some code
511     // points (e.g., 0x0080) from both \w and \W.
512     if (i == kEnginePCRE && NonASCII(text))
513       continue;
514 
515     if (!r.untrusted)
516       all_okay = false;
517 
518     LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,
519              context, anchor);
520     if (r.matched != correct.matched) {
521       if (r.matched) {
522         LOG(INFO) << "   Should not match (but does).";
523       } else {
524         LOG(INFO) << "   Should match (but does not).";
525         continue;
526       }
527     }
528     for (int i = 0; i < 1+num_captures_; i++) {
529       if (r.submatch[i].begin() != correct.submatch[i].begin() ||
530           r.submatch[i].end() != correct.submatch[i].end()) {
531         LOG(INFO) <<
532           StringPrintf("   $%d: should be %s is %s",
533                        i,
534                        FormatCapture(text, correct.submatch[i]).c_str(),
535                        FormatCapture(text, r.submatch[i]).c_str());
536       } else {
537         LOG(INFO) <<
538           StringPrintf("   $%d: %s ok", i,
539                        FormatCapture(text, r.submatch[i]).c_str());
540       }
541     }
542   }
543 
544   if (!all_okay) {
545     if (FLAGS_max_regexp_failures > 0 && --FLAGS_max_regexp_failures == 0)
546       LOG(QFATAL) << "Too many regexp failures.";
547   }
548 
549   return all_okay;
550 }
551 
LogMatch(const char * prefix,Engine e,const StringPiece & text,const StringPiece & context,Prog::Anchor anchor)552 void TestInstance::LogMatch(const char* prefix, Engine e,
553                             const StringPiece& text, const StringPiece& context,
554                             Prog::Anchor anchor) {
555   LOG(INFO) << prefix
556     << EngineString(e)
557     << " regexp "
558     << CEscape(regexp_str_)
559     << " "
560     << CEscape(regexp_->ToString())
561     << " text "
562     << CEscape(text)
563     << " ("
564     << text.begin() - context.begin()
565     << ","
566     << text.end() - context.begin()
567     << ") of context "
568     << CEscape(context)
569     << " (" << FormatKind(kind_)
570     << ", " << FormatAnchor(anchor)
571     << ", " << FormatMode(flags_)
572     << ")";
573 }
574 
575 static Prog::MatchKind kinds[] = {
576   Prog::kFirstMatch,
577   Prog::kLongestMatch,
578   Prog::kFullMatch,
579 };
580 
581 // Test all possible match kinds and parse modes.
Tester(const StringPiece & regexp)582 Tester::Tester(const StringPiece& regexp) {
583   error_ = false;
584   for (int i = 0; i < arraysize(kinds); i++) {
585     for (int j = 0; j < arraysize(parse_modes); j++) {
586       TestInstance* t = new TestInstance(regexp, kinds[i],
587                                          parse_modes[j].parse_flags);
588       error_ |= t->error();
589       v_.push_back(t);
590     }
591   }
592 }
593 
~Tester()594 Tester::~Tester() {
595   for (int i = 0; i < v_.size(); i++)
596     delete v_[i];
597 }
598 
TestCase(const StringPiece & text,const StringPiece & context,Prog::Anchor anchor)599 bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
600                          Prog::Anchor anchor) {
601   bool okay = true;
602   for (int i = 0; i < v_.size(); i++)
603     okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));
604   return okay;
605 }
606 
607 static Prog::Anchor anchors[] = {
608   Prog::kAnchored,
609   Prog::kUnanchored
610 };
611 
TestInput(const StringPiece & text)612 bool Tester::TestInput(const StringPiece& text) {
613   bool okay = TestInputInContext(text, text);
614   if (text.size() > 0) {
615     StringPiece sp;
616     sp = text;
617     sp.remove_prefix(1);
618     okay &= TestInputInContext(sp, text);
619     sp = text;
620     sp.remove_suffix(1);
621     okay &= TestInputInContext(sp, text);
622   }
623   return okay;
624 }
625 
TestInputInContext(const StringPiece & text,const StringPiece & context)626 bool Tester::TestInputInContext(const StringPiece& text,
627                                 const StringPiece& context) {
628   bool okay = true;
629   for (int i = 0; i < arraysize(anchors); i++)
630     okay &= TestCase(text, context, anchors[i]);
631   return okay;
632 }
633 
TestRegexpOnText(const StringPiece & regexp,const StringPiece & text)634 bool TestRegexpOnText(const StringPiece& regexp,
635                       const StringPiece& text) {
636   Tester t(regexp);
637   return t.TestInput(text);
638 }
639 
640 }  // namespace re2
641