1 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Format a regular expression structure as a string.
6 // Tested by parse_test.cc
7 
8 #include <string.h>
9 #include <string>
10 
11 #include "util/util.h"
12 #include "util/logging.h"
13 #include "util/strutil.h"
14 #include "util/utf.h"
15 #include "re2/regexp.h"
16 #include "re2/walker-inl.h"
17 
18 namespace re2 {
19 
20 enum {
21   PrecAtom,
22   PrecUnary,
23   PrecConcat,
24   PrecAlternate,
25   PrecEmpty,
26   PrecParen,
27   PrecToplevel,
28 };
29 
30 // Helper function.  See description below.
31 static void AppendCCRange(std::string* t, Rune lo, Rune hi);
32 
33 // Walker to generate string in s_.
34 // The arg pointers are actually integers giving the
35 // context precedence.
36 // The child_args are always NULL.
37 class ToStringWalker : public Regexp::Walker<int> {
38  public:
ToStringWalker(std::string * t)39   explicit ToStringWalker(std::string* t) : t_(t) {}
40 
41   virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
42   virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
43                         int* child_args, int nchild_args);
ShortVisit(Regexp * re,int parent_arg)44   virtual int ShortVisit(Regexp* re, int parent_arg) {
45     return 0;
46   }
47 
48  private:
49   std::string* t_;  // The string the walker appends to.
50 
51   ToStringWalker(const ToStringWalker&) = delete;
52   ToStringWalker& operator=(const ToStringWalker&) = delete;
53 };
54 
ToString()55 std::string Regexp::ToString() {
56   std::string t;
57   ToStringWalker w(&t);
58   w.WalkExponential(this, PrecToplevel, 100000);
59   if (w.stopped_early())
60     t += " [truncated]";
61   return t;
62 }
63 
64 #define ToString DontCallToString  // Avoid accidental recursion.
65 
66 // Visits re before children are processed.
67 // Appends ( if needed and passes new precedence to children.
PreVisit(Regexp * re,int parent_arg,bool * stop)68 int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
69   int prec = parent_arg;
70   int nprec = PrecAtom;
71 
72   switch (re->op()) {
73     case kRegexpNoMatch:
74     case kRegexpEmptyMatch:
75     case kRegexpLiteral:
76     case kRegexpAnyChar:
77     case kRegexpAnyByte:
78     case kRegexpBeginLine:
79     case kRegexpEndLine:
80     case kRegexpBeginText:
81     case kRegexpEndText:
82     case kRegexpWordBoundary:
83     case kRegexpNoWordBoundary:
84     case kRegexpCharClass:
85     case kRegexpHaveMatch:
86       nprec = PrecAtom;
87       break;
88 
89     case kRegexpConcat:
90     case kRegexpLiteralString:
91       if (prec < PrecConcat)
92         t_->append("(?:");
93       nprec = PrecConcat;
94       break;
95 
96     case kRegexpAlternate:
97       if (prec < PrecAlternate)
98         t_->append("(?:");
99       nprec = PrecAlternate;
100       break;
101 
102     case kRegexpCapture:
103       t_->append("(");
104       if (re->cap() == 0)
105         LOG(DFATAL) << "kRegexpCapture cap() == 0";
106       if (re->name()) {
107         t_->append("?P<");
108         t_->append(*re->name());
109         t_->append(">");
110       }
111       nprec = PrecParen;
112       break;
113 
114     case kRegexpStar:
115     case kRegexpPlus:
116     case kRegexpQuest:
117     case kRegexpRepeat:
118       if (prec < PrecUnary)
119         t_->append("(?:");
120       // The subprecedence here is PrecAtom instead of PrecUnary
121       // because PCRE treats two unary ops in a row as a parse error.
122       nprec = PrecAtom;
123       break;
124   }
125 
126   return nprec;
127 }
128 
AppendLiteral(std::string * t,Rune r,bool foldcase)129 static void AppendLiteral(std::string *t, Rune r, bool foldcase) {
130   if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
131     t->append(1, '\\');
132     t->append(1, static_cast<char>(r));
133   } else if (foldcase && 'a' <= r && r <= 'z') {
134     r -= 'a' - 'A';
135     t->append(1, '[');
136     t->append(1, static_cast<char>(r));
137     t->append(1, static_cast<char>(r) + 'a' - 'A');
138     t->append(1, ']');
139   } else {
140     AppendCCRange(t, r, r);
141   }
142 }
143 
144 // Visits re after children are processed.
145 // For childless regexps, all the work is done here.
146 // For regexps with children, append any unary suffixes or ).
PostVisit(Regexp * re,int parent_arg,int pre_arg,int * child_args,int nchild_args)147 int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
148                               int* child_args, int nchild_args) {
149   int prec = parent_arg;
150   switch (re->op()) {
151     case kRegexpNoMatch:
152       // There's no simple symbol for "no match", but
153       // [^0-Runemax] excludes everything.
154       t_->append("[^\\x00-\\x{10ffff}]");
155       break;
156 
157     case kRegexpEmptyMatch:
158       // Append (?:) to make empty string visible,
159       // unless this is already being parenthesized.
160       if (prec < PrecEmpty)
161         t_->append("(?:)");
162       break;
163 
164     case kRegexpLiteral:
165       AppendLiteral(t_, re->rune(),
166                     (re->parse_flags() & Regexp::FoldCase) != 0);
167       break;
168 
169     case kRegexpLiteralString:
170       for (int i = 0; i < re->nrunes(); i++)
171         AppendLiteral(t_, re->runes()[i],
172                       (re->parse_flags() & Regexp::FoldCase) != 0);
173       if (prec < PrecConcat)
174         t_->append(")");
175       break;
176 
177     case kRegexpConcat:
178       if (prec < PrecConcat)
179         t_->append(")");
180       break;
181 
182     case kRegexpAlternate:
183       // Clumsy but workable: the children all appended |
184       // at the end of their strings, so just remove the last one.
185       if ((*t_)[t_->size()-1] == '|')
186         t_->erase(t_->size()-1);
187       else
188         LOG(DFATAL) << "Bad final char: " << t_;
189       if (prec < PrecAlternate)
190         t_->append(")");
191       break;
192 
193     case kRegexpStar:
194       t_->append("*");
195       if (re->parse_flags() & Regexp::NonGreedy)
196         t_->append("?");
197       if (prec < PrecUnary)
198         t_->append(")");
199       break;
200 
201     case kRegexpPlus:
202       t_->append("+");
203       if (re->parse_flags() & Regexp::NonGreedy)
204         t_->append("?");
205       if (prec < PrecUnary)
206         t_->append(")");
207       break;
208 
209     case kRegexpQuest:
210       t_->append("?");
211       if (re->parse_flags() & Regexp::NonGreedy)
212         t_->append("?");
213       if (prec < PrecUnary)
214         t_->append(")");
215       break;
216 
217     case kRegexpRepeat:
218       if (re->max() == -1)
219         t_->append(StringPrintf("{%d,}", re->min()));
220       else if (re->min() == re->max())
221         t_->append(StringPrintf("{%d}", re->min()));
222       else
223         t_->append(StringPrintf("{%d,%d}", re->min(), re->max()));
224       if (re->parse_flags() & Regexp::NonGreedy)
225         t_->append("?");
226       if (prec < PrecUnary)
227         t_->append(")");
228       break;
229 
230     case kRegexpAnyChar:
231       t_->append(".");
232       break;
233 
234     case kRegexpAnyByte:
235       t_->append("\\C");
236       break;
237 
238     case kRegexpBeginLine:
239       t_->append("^");
240       break;
241 
242     case kRegexpEndLine:
243       t_->append("$");
244       break;
245 
246     case kRegexpBeginText:
247       t_->append("(?-m:^)");
248       break;
249 
250     case kRegexpEndText:
251       if (re->parse_flags() & Regexp::WasDollar)
252         t_->append("(?-m:$)");
253       else
254         t_->append("\\z");
255       break;
256 
257     case kRegexpWordBoundary:
258       t_->append("\\b");
259       break;
260 
261     case kRegexpNoWordBoundary:
262       t_->append("\\B");
263       break;
264 
265     case kRegexpCharClass: {
266       if (re->cc()->size() == 0) {
267         t_->append("[^\\x00-\\x{10ffff}]");
268         break;
269       }
270       t_->append("[");
271       // Heuristic: show class as negated if it contains the
272       // non-character 0xFFFE and yet somehow isn't full.
273       CharClass* cc = re->cc();
274       if (cc->Contains(0xFFFE) && !cc->full()) {
275         cc = cc->Negate();
276         t_->append("^");
277       }
278       for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
279         AppendCCRange(t_, i->lo, i->hi);
280       if (cc != re->cc())
281         cc->Delete();
282       t_->append("]");
283       break;
284     }
285 
286     case kRegexpCapture:
287       t_->append(")");
288       break;
289 
290     case kRegexpHaveMatch:
291       // There's no syntax accepted by the parser to generate
292       // this node (it is generated by RE2::Set) so make something
293       // up that is readable but won't compile.
294       t_->append("(?HaveMatch:%d)", re->match_id());
295       break;
296   }
297 
298   // If the parent is an alternation, append the | for it.
299   if (prec == PrecAlternate)
300     t_->append("|");
301 
302   return 0;
303 }
304 
305 // Appends a rune for use in a character class to the string t.
AppendCCChar(std::string * t,Rune r)306 static void AppendCCChar(std::string* t, Rune r) {
307   if (0x20 <= r && r <= 0x7E) {
308     if (strchr("[]^-\\", r))
309       t->append("\\");
310     t->append(1, static_cast<char>(r));
311     return;
312   }
313   switch (r) {
314     default:
315       break;
316 
317     case '\r':
318       t->append("\\r");
319       return;
320 
321     case '\t':
322       t->append("\\t");
323       return;
324 
325     case '\n':
326       t->append("\\n");
327       return;
328 
329     case '\f':
330       t->append("\\f");
331       return;
332   }
333 
334   if (r < 0x100) {
335     *t += StringPrintf("\\x%02x", static_cast<int>(r));
336     return;
337   }
338   *t += StringPrintf("\\x{%x}", static_cast<int>(r));
339 }
340 
AppendCCRange(std::string * t,Rune lo,Rune hi)341 static void AppendCCRange(std::string* t, Rune lo, Rune hi) {
342   if (lo > hi)
343     return;
344   AppendCCChar(t, lo);
345   if (lo < hi) {
346     t->append("-");
347     AppendCCChar(t, hi);
348   }
349 }
350 
351 }  // namespace re2
352