1 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Format a regular expression structure as a string.
6 // Tested by parse_test.cc
7 
8 #include "util/util.h"
9 #include "re2/regexp.h"
10 #include "re2/walker-inl.h"
11 
12 namespace re2 {
13 
14 enum {
15   PrecAtom,
16   PrecUnary,
17   PrecConcat,
18   PrecAlternate,
19   PrecEmpty,
20   PrecParen,
21   PrecToplevel,
22 };
23 
24 // Helper function.  See description below.
25 static void AppendCCRange(string* t, Rune lo, Rune hi);
26 
27 // Walker to generate string in s_.
28 // The arg pointers are actually integers giving the
29 // context precedence.
30 // The child_args are always NULL.
31 class ToStringWalker : public Regexp::Walker<int> {
32  public:
ToStringWalker(string * t)33   explicit ToStringWalker(string* t) : t_(t) {}
34 
35   virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
36   virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
37                         int* child_args, int nchild_args);
ShortVisit(Regexp * re,int parent_arg)38   virtual int ShortVisit(Regexp* re, int parent_arg) {
39     return 0;
40   }
41 
42  private:
43   string* t_;  // The string the walker appends to.
44 
45   DISALLOW_EVIL_CONSTRUCTORS(ToStringWalker);
46 };
47 
ToString()48 string Regexp::ToString() {
49   string t;
50   ToStringWalker w(&t);
51   w.WalkExponential(this, PrecToplevel, 100000);
52   if (w.stopped_early())
53     t += " [truncated]";
54   return t;
55 }
56 
57 #define ToString DontCallToString  // Avoid accidental recursion.
58 
59 // Visits re before children are processed.
60 // Appends ( if needed and passes new precedence to children.
PreVisit(Regexp * re,int parent_arg,bool * stop)61 int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
62   int prec = parent_arg;
63   int nprec = PrecAtom;
64 
65   switch (re->op()) {
66     case kRegexpNoMatch:
67     case kRegexpEmptyMatch:
68     case kRegexpLiteral:
69     case kRegexpAnyChar:
70     case kRegexpAnyByte:
71     case kRegexpBeginLine:
72     case kRegexpEndLine:
73     case kRegexpBeginText:
74     case kRegexpEndText:
75     case kRegexpWordBoundary:
76     case kRegexpNoWordBoundary:
77     case kRegexpCharClass:
78     case kRegexpHaveMatch:
79       nprec = PrecAtom;
80       break;
81 
82     case kRegexpConcat:
83     case kRegexpLiteralString:
84       if (prec < PrecConcat)
85         t_->append("(?:");
86       nprec = PrecConcat;
87       break;
88 
89     case kRegexpAlternate:
90       if (prec < PrecAlternate)
91         t_->append("(?:");
92       nprec = PrecAlternate;
93       break;
94 
95     case kRegexpCapture:
96       t_->append("(");
97       if (re->name()) {
98         t_->append("?P<");
99         t_->append(*re->name());
100         t_->append(">");
101       }
102       nprec = PrecParen;
103       break;
104 
105     case kRegexpStar:
106     case kRegexpPlus:
107     case kRegexpQuest:
108     case kRegexpRepeat:
109       if (prec < PrecUnary)
110         t_->append("(?:");
111       // The subprecedence here is PrecAtom instead of PrecUnary
112       // because PCRE treats two unary ops in a row as a parse error.
113       nprec = PrecAtom;
114       break;
115   }
116 
117   return nprec;
118 }
119 
AppendLiteral(string * t,Rune r,bool foldcase)120 static void AppendLiteral(string *t, Rune r, bool foldcase) {
121   if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
122     t->append(1, '\\');
123     t->append(1, r);
124   } else if (foldcase && 'a' <= r && r <= 'z') {
125     if ('a' <= r && r <= 'z')
126       r += 'A' - 'a';
127     t->append(1, '[');
128     t->append(1, r);
129     t->append(1, r + 'a' - 'A');
130     t->append(1, ']');
131   } else {
132     AppendCCRange(t, r, r);
133   }
134 }
135 
136 // Visits re after children are processed.
137 // For childless regexps, all the work is done here.
138 // For regexps with children, append any unary suffixes or ).
PostVisit(Regexp * re,int parent_arg,int pre_arg,int * child_args,int nchild_args)139 int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
140                               int* child_args, int nchild_args) {
141   int prec = parent_arg;
142   switch (re->op()) {
143     case kRegexpNoMatch:
144       // There's no simple symbol for "no match", but
145       // [^0-Runemax] excludes everything.
146       t_->append("[^\\x00-\\x{10ffff}]");
147       break;
148 
149     case kRegexpEmptyMatch:
150       // Append (?:) to make empty string visible,
151       // unless this is already being parenthesized.
152       if (prec < PrecEmpty)
153         t_->append("(?:)");
154       break;
155 
156     case kRegexpLiteral:
157       AppendLiteral(t_, re->rune(), re->parse_flags() & Regexp::FoldCase);
158       break;
159 
160     case kRegexpLiteralString:
161       for (int i = 0; i < re->nrunes(); i++)
162         AppendLiteral(t_, re->runes()[i], re->parse_flags() & Regexp::FoldCase);
163       if (prec < PrecConcat)
164         t_->append(")");
165       break;
166 
167     case kRegexpConcat:
168       if (prec < PrecConcat)
169         t_->append(")");
170       break;
171 
172     case kRegexpAlternate:
173       // Clumsy but workable: the children all appended |
174       // at the end of their strings, so just remove the last one.
175       if ((*t_)[t_->size()-1] == '|')
176         t_->erase(t_->size()-1);
177       else
178         LOG(DFATAL) << "Bad final char: " << t_;
179       if (prec < PrecAlternate)
180         t_->append(")");
181       break;
182 
183     case kRegexpStar:
184       t_->append("*");
185       if (re->parse_flags() & Regexp::NonGreedy)
186         t_->append("?");
187       if (prec < PrecUnary)
188         t_->append(")");
189       break;
190 
191     case kRegexpPlus:
192       t_->append("+");
193       if (re->parse_flags() & Regexp::NonGreedy)
194         t_->append("?");
195       if (prec < PrecUnary)
196         t_->append(")");
197       break;
198 
199     case kRegexpQuest:
200       t_->append("?");
201       if (re->parse_flags() & Regexp::NonGreedy)
202         t_->append("?");
203       if (prec < PrecUnary)
204         t_->append(")");
205       break;
206 
207     case kRegexpRepeat:
208       if (re->max() == -1)
209         t_->append(StringPrintf("{%d,}", re->min()));
210       else if (re->min() == re->max())
211         t_->append(StringPrintf("{%d}", re->min()));
212       else
213         t_->append(StringPrintf("{%d,%d}", re->min(), re->max()));
214       if (re->parse_flags() & Regexp::NonGreedy)
215         t_->append("?");
216       if (prec < PrecUnary)
217         t_->append(")");
218       break;
219 
220     case kRegexpAnyChar:
221       t_->append(".");
222       break;
223 
224     case kRegexpAnyByte:
225       t_->append("\\C");
226       break;
227 
228     case kRegexpBeginLine:
229       t_->append("^");
230       break;
231 
232     case kRegexpEndLine:
233       t_->append("$");
234       break;
235 
236     case kRegexpBeginText:
237       t_->append("(?-m:^)");
238       break;
239 
240     case kRegexpEndText:
241       if (re->parse_flags() & Regexp::WasDollar)
242         t_->append("(?-m:$)");
243       else
244         t_->append("\\z");
245       break;
246 
247     case kRegexpWordBoundary:
248       t_->append("\\b");
249       break;
250 
251     case kRegexpNoWordBoundary:
252       t_->append("\\B");
253       break;
254 
255     case kRegexpCharClass: {
256       if (re->cc()->size() == 0) {
257         t_->append("[^\\x00-\\x{10ffff}]");
258         break;
259       }
260       t_->append("[");
261       // Heuristic: show class as negated if it contains the
262       // non-character 0xFFFE.
263       CharClass* cc = re->cc();
264       if (cc->Contains(0xFFFE)) {
265         cc = cc->Negate();
266         t_->append("^");
267       }
268       for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
269         AppendCCRange(t_, i->lo, i->hi);
270       if (cc != re->cc())
271         cc->Delete();
272       t_->append("]");
273       break;
274     }
275 
276     case kRegexpCapture:
277       t_->append(")");
278       break;
279 
280     case kRegexpHaveMatch:
281       // There's no syntax accepted by the parser to generate
282       // this node (it is generated by RE2::Set) so make something
283       // up that is readable but won't compile.
284       t_->append("(?HaveMatch:%d)", re->match_id());
285       break;
286   }
287 
288   // If the parent is an alternation, append the | for it.
289   if (prec == PrecAlternate)
290     t_->append("|");
291 
292   return 0;
293 }
294 
295 // Appends a rune for use in a character class to the string t.
AppendCCChar(string * t,Rune r)296 static void AppendCCChar(string* t, Rune r) {
297   if (0x20 <= r && r <= 0x7E) {
298     if (strchr("[]^-\\", r))
299       t->append("\\");
300     t->append(1, r);
301     return;
302   }
303   switch (r) {
304     default:
305       break;
306 
307     case '\r':
308       t->append("\\r");
309       return;
310 
311     case '\t':
312       t->append("\\t");
313       return;
314 
315     case '\n':
316       t->append("\\n");
317       return;
318 
319     case '\f':
320       t->append("\\f");
321       return;
322   }
323 
324   if (r < 0x100) {
325     StringAppendF(t, "\\x%02x", static_cast<int>(r));
326     return;
327   }
328   StringAppendF(t, "\\x{%x}", static_cast<int>(r));
329 }
330 
AppendCCRange(string * t,Rune lo,Rune hi)331 static void AppendCCRange(string* t, Rune lo, Rune hi) {
332   if (lo > hi)
333     return;
334   AppendCCChar(t, lo);
335   if (lo < hi) {
336     t->append("-");
337     AppendCCChar(t, hi);
338   }
339 }
340 
341 }  // namespace re2
342