1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Determine whether this library should match PCRE exactly
6 // for a particular Regexp.  (If so, the testing framework can
7 // check that it does.)
8 //
9 // This library matches PCRE except in these cases:
10 //   * the regexp contains a repetition of an empty string,
11 //     like (a*)* or (a*)+.  In this case, PCRE will treat
12 //     the repetition sequence as ending with an empty string,
13 //     while this library does not.
14 //   * Perl and PCRE differ on whether \v matches \n.
15 //     For historical reasons, this library implements the Perl behavior.
16 //   * Perl and PCRE allow $ in one-line mode to match either the very
17 //     end of the text or just before a \n at the end of the text.
18 //     This library requires it to match only the end of the text.
19 //   * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
20 //     match the end of the text if the last character is a \n.
21 //     This library does allow it.
22 //
23 // Regexp::MimicsPCRE checks for any of these conditions.
24 
25 #include "util/util.h"
26 #include "re2/regexp.h"
27 #include "re2/walker-inl.h"
28 
29 namespace re2 {
30 
31 // Returns whether re might match an empty string.
32 static bool CanBeEmptyString(Regexp *re);
33 
34 // Walker class to compute whether library handles a regexp
35 // exactly as PCRE would.  See comment at top for conditions.
36 
37 class PCREWalker : public Regexp::Walker<bool> {
38  public:
PCREWalker()39   PCREWalker() {}
40   bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
41                  int nchild_args);
42 
ShortVisit(Regexp * re,bool a)43   bool ShortVisit(Regexp* re, bool a) {
44     // Should never be called: we use Walk not WalkExponential.
45     LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
46     return a;
47   }
48 };
49 
50 // Called after visiting each of re's children and accumulating
51 // the return values in child_args.  So child_args contains whether
52 // this library mimics PCRE for those subexpressions.
PostVisit(Regexp * re,bool parent_arg,bool pre_arg,bool * child_args,int nchild_args)53 bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
54                            bool* child_args, int nchild_args) {
55   // If children failed, so do we.
56   for (int i = 0; i < nchild_args; i++)
57     if (!child_args[i])
58       return false;
59 
60   // Otherwise look for other reasons to fail.
61   switch (re->op()) {
62     // Look for repeated empty string.
63     case kRegexpStar:
64     case kRegexpPlus:
65     case kRegexpQuest:
66       if (CanBeEmptyString(re->sub()[0]))
67         return false;
68       break;
69     case kRegexpRepeat:
70       if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
71         return false;
72       break;
73 
74     // Look for \v
75     case kRegexpLiteral:
76       if (re->rune() == '\v')
77         return false;
78       break;
79 
80     // Look for $ in single-line mode.
81     case kRegexpEndText:
82     case kRegexpEmptyMatch:
83       if (re->parse_flags() & Regexp::WasDollar)
84         return false;
85       break;
86 
87     // Look for ^ in multi-line mode.
88     case kRegexpBeginLine:
89       // No condition: in single-line mode ^ becomes kRegexpBeginText.
90       return false;
91 
92     default:
93       break;
94   }
95 
96   // Not proven guilty.
97   return true;
98 }
99 
100 // Returns whether this regexp's behavior will mimic PCRE's exactly.
MimicsPCRE()101 bool Regexp::MimicsPCRE() {
102   PCREWalker w;
103   return w.Walk(this, true);
104 }
105 
106 
107 // Walker class to compute whether a Regexp can match an empty string.
108 // It is okay to overestimate.  For example, \b\B cannot match an empty
109 // string, because \b and \B are mutually exclusive, but this isn't
110 // that smart and will say it can.  Spurious empty strings
111 // will reduce the number of regexps we sanity check against PCRE,
112 // but they won't break anything.
113 
114 class EmptyStringWalker : public Regexp::Walker<bool> {
115  public:
EmptyStringWalker()116   EmptyStringWalker() { }
117   bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
118                  bool* child_args, int nchild_args);
119 
ShortVisit(Regexp * re,bool a)120   bool ShortVisit(Regexp* re, bool a) {
121     // Should never be called: we use Walk not WalkExponential.
122     LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
123     return a;
124   }
125 
126  private:
127   DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker);
128 };
129 
130 // Called after visiting re's children.  child_args contains the return
131 // value from each of the children's PostVisits (i.e., whether each child
132 // can match an empty string).  Returns whether this clause can match an
133 // empty string.
PostVisit(Regexp * re,bool parent_arg,bool pre_arg,bool * child_args,int nchild_args)134 bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
135                                   bool* child_args, int nchild_args) {
136   switch (re->op()) {
137     case kRegexpNoMatch:               // never empty
138     case kRegexpLiteral:
139     case kRegexpAnyChar:
140     case kRegexpAnyByte:
141     case kRegexpCharClass:
142     case kRegexpLiteralString:
143       return false;
144 
145     case kRegexpEmptyMatch:            // always empty
146     case kRegexpBeginLine:             // always empty, when they match
147     case kRegexpEndLine:
148     case kRegexpNoWordBoundary:
149     case kRegexpWordBoundary:
150     case kRegexpBeginText:
151     case kRegexpEndText:
152     case kRegexpStar:                  // can always be empty
153     case kRegexpQuest:
154     case kRegexpHaveMatch:
155       return true;
156 
157     case kRegexpConcat:                // can be empty if all children can
158       for (int i = 0; i < nchild_args; i++)
159         if (!child_args[i])
160           return false;
161       return true;
162 
163     case kRegexpAlternate:             // can be empty if any child can
164       for (int i = 0; i < nchild_args; i++)
165         if (child_args[i])
166           return true;
167       return false;
168 
169     case kRegexpPlus:                  // can be empty if the child can
170     case kRegexpCapture:
171       return child_args[0];
172 
173     case kRegexpRepeat:                // can be empty if child can or is x{0}
174       return child_args[0] || re->min() == 0;
175   }
176   return false;
177 }
178 
179 // Returns whether re can match an empty string.
CanBeEmptyString(Regexp * re)180 static bool CanBeEmptyString(Regexp* re) {
181   EmptyStringWalker w;
182   return w.Walk(re, true);
183 }
184 
185 }  // namespace re2
186