1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Determine whether this library should match PCRE exactly
6 // for a particular Regexp.  (If so, the testing framework can
7 // check that it does.)
8 //
9 // This library matches PCRE except in these cases:
10 //   * the regexp contains a repetition of an empty string,
11 //     like (a*)* or (a*)+.  In this case, PCRE will treat
12 //     the repetition sequence as ending with an empty string,
13 //     while this library does not.
14 //   * Perl and PCRE differ on whether \v matches \n.
15 //     For historical reasons, this library implements the Perl behavior.
16 //   * Perl and PCRE allow $ in one-line mode to match either the very
17 //     end of the text or just before a \n at the end of the text.
18 //     This library requires it to match only the end of the text.
19 //   * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
20 //     match the end of the text if the last character is a \n.
21 //     This library does allow it.
22 //
23 // Regexp::MimicsPCRE checks for any of these conditions.
24 
25 #include "util/util.h"
26 #include "util/logging.h"
27 #include "re2/regexp.h"
28 #include "re2/walker-inl.h"
29 
30 namespace re2 {
31 
32 // Returns whether re might match an empty string.
33 static bool CanBeEmptyString(Regexp *re);
34 
35 // Walker class to compute whether library handles a regexp
36 // exactly as PCRE would.  See comment at top for conditions.
37 
38 class PCREWalker : public Regexp::Walker<bool> {
39  public:
PCREWalker()40   PCREWalker() {}
41 
42   virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
43                          bool* child_args, int nchild_args);
44 
ShortVisit(Regexp * re,bool a)45   virtual bool ShortVisit(Regexp* re, bool a) {
46     // Should never be called: we use Walk(), not WalkExponential().
47 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
48     LOG(DFATAL) << "PCREWalker::ShortVisit called";
49 #endif
50     return a;
51   }
52 
53  private:
54   PCREWalker(const PCREWalker&) = delete;
55   PCREWalker& operator=(const PCREWalker&) = delete;
56 };
57 
58 // Called after visiting each of re's children and accumulating
59 // the return values in child_args.  So child_args contains whether
60 // this library mimics PCRE for those subexpressions.
PostVisit(Regexp * re,bool parent_arg,bool pre_arg,bool * child_args,int nchild_args)61 bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
62                            bool* child_args, int nchild_args) {
63   // If children failed, so do we.
64   for (int i = 0; i < nchild_args; i++)
65     if (!child_args[i])
66       return false;
67 
68   // Otherwise look for other reasons to fail.
69   switch (re->op()) {
70     // Look for repeated empty string.
71     case kRegexpStar:
72     case kRegexpPlus:
73     case kRegexpQuest:
74       if (CanBeEmptyString(re->sub()[0]))
75         return false;
76       break;
77     case kRegexpRepeat:
78       if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
79         return false;
80       break;
81 
82     // Look for \v
83     case kRegexpLiteral:
84       if (re->rune() == '\v')
85         return false;
86       break;
87 
88     // Look for $ in single-line mode.
89     case kRegexpEndText:
90     case kRegexpEmptyMatch:
91       if (re->parse_flags() & Regexp::WasDollar)
92         return false;
93       break;
94 
95     // Look for ^ in multi-line mode.
96     case kRegexpBeginLine:
97       // No condition: in single-line mode ^ becomes kRegexpBeginText.
98       return false;
99 
100     default:
101       break;
102   }
103 
104   // Not proven guilty.
105   return true;
106 }
107 
108 // Returns whether this regexp's behavior will mimic PCRE's exactly.
MimicsPCRE()109 bool Regexp::MimicsPCRE() {
110   PCREWalker w;
111   return w.Walk(this, true);
112 }
113 
114 
115 // Walker class to compute whether a Regexp can match an empty string.
116 // It is okay to overestimate.  For example, \b\B cannot match an empty
117 // string, because \b and \B are mutually exclusive, but this isn't
118 // that smart and will say it can.  Spurious empty strings
119 // will reduce the number of regexps we sanity check against PCRE,
120 // but they won't break anything.
121 
122 class EmptyStringWalker : public Regexp::Walker<bool> {
123  public:
EmptyStringWalker()124   EmptyStringWalker() {}
125 
126   virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
127                          bool* child_args, int nchild_args);
128 
ShortVisit(Regexp * re,bool a)129   virtual bool ShortVisit(Regexp* re, bool a) {
130     // Should never be called: we use Walk(), not WalkExponential().
131 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
132     LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
133 #endif
134     return a;
135   }
136 
137  private:
138   EmptyStringWalker(const EmptyStringWalker&) = delete;
139   EmptyStringWalker& operator=(const EmptyStringWalker&) = delete;
140 };
141 
142 // Called after visiting re's children.  child_args contains the return
143 // value from each of the children's PostVisits (i.e., whether each child
144 // can match an empty string).  Returns whether this clause can match an
145 // empty string.
PostVisit(Regexp * re,bool parent_arg,bool pre_arg,bool * child_args,int nchild_args)146 bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
147                                   bool* child_args, int nchild_args) {
148   switch (re->op()) {
149     case kRegexpNoMatch:               // never empty
150     case kRegexpLiteral:
151     case kRegexpAnyChar:
152     case kRegexpAnyByte:
153     case kRegexpCharClass:
154     case kRegexpLiteralString:
155       return false;
156 
157     case kRegexpEmptyMatch:            // always empty
158     case kRegexpBeginLine:             // always empty, when they match
159     case kRegexpEndLine:
160     case kRegexpNoWordBoundary:
161     case kRegexpWordBoundary:
162     case kRegexpBeginText:
163     case kRegexpEndText:
164     case kRegexpStar:                  // can always be empty
165     case kRegexpQuest:
166     case kRegexpHaveMatch:
167       return true;
168 
169     case kRegexpConcat:                // can be empty if all children can
170       for (int i = 0; i < nchild_args; i++)
171         if (!child_args[i])
172           return false;
173       return true;
174 
175     case kRegexpAlternate:             // can be empty if any child can
176       for (int i = 0; i < nchild_args; i++)
177         if (child_args[i])
178           return true;
179       return false;
180 
181     case kRegexpPlus:                  // can be empty if the child can
182     case kRegexpCapture:
183       return child_args[0];
184 
185     case kRegexpRepeat:                // can be empty if child can or is x{0}
186       return child_args[0] || re->min() == 0;
187   }
188   return false;
189 }
190 
191 // Returns whether re can match an empty string.
CanBeEmptyString(Regexp * re)192 static bool CanBeEmptyString(Regexp* re) {
193   EmptyStringWalker w;
194   return w.Walk(re, true);
195 }
196 
197 }  // namespace re2
198