1 // Copyright 2006 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Format a regular expression structure as a string.
6 // Tested by parse_test.cc
7
8 #include "util/util.h"
9 #include "re2/regexp.h"
10 #include "re2/walker-inl.h"
11
12 namespace re2 {
13
14 enum {
15 PrecAtom,
16 PrecUnary,
17 PrecConcat,
18 PrecAlternate,
19 PrecEmpty,
20 PrecParen,
21 PrecToplevel,
22 };
23
24 // Helper function. See description below.
25 static void AppendCCRange(string* t, Rune lo, Rune hi);
26
27 // Walker to generate string in s_.
28 // The arg pointers are actually integers giving the
29 // context precedence.
30 // The child_args are always NULL.
31 class ToStringWalker : public Regexp::Walker<int> {
32 public:
ToStringWalker(string * t)33 explicit ToStringWalker(string* t) : t_(t) {}
34
35 virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
36 virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
37 int* child_args, int nchild_args);
ShortVisit(Regexp * re,int parent_arg)38 virtual int ShortVisit(Regexp* re, int parent_arg) {
39 return 0;
40 }
41
42 private:
43 string* t_; // The string the walker appends to.
44
45 DISALLOW_EVIL_CONSTRUCTORS(ToStringWalker);
46 };
47
ToString()48 string Regexp::ToString() {
49 string t;
50 ToStringWalker w(&t);
51 w.WalkExponential(this, PrecToplevel, 100000);
52 if (w.stopped_early())
53 t += " [truncated]";
54 return t;
55 }
56
57 #define ToString DontCallToString // Avoid accidental recursion.
58
59 // Visits re before children are processed.
60 // Appends ( if needed and passes new precedence to children.
PreVisit(Regexp * re,int parent_arg,bool * stop)61 int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
62 int prec = parent_arg;
63 int nprec = PrecAtom;
64
65 switch (re->op()) {
66 case kRegexpNoMatch:
67 case kRegexpEmptyMatch:
68 case kRegexpLiteral:
69 case kRegexpAnyChar:
70 case kRegexpAnyByte:
71 case kRegexpBeginLine:
72 case kRegexpEndLine:
73 case kRegexpBeginText:
74 case kRegexpEndText:
75 case kRegexpWordBoundary:
76 case kRegexpNoWordBoundary:
77 case kRegexpCharClass:
78 case kRegexpHaveMatch:
79 nprec = PrecAtom;
80 break;
81
82 case kRegexpConcat:
83 case kRegexpLiteralString:
84 if (prec < PrecConcat)
85 t_->append("(?:");
86 nprec = PrecConcat;
87 break;
88
89 case kRegexpAlternate:
90 if (prec < PrecAlternate)
91 t_->append("(?:");
92 nprec = PrecAlternate;
93 break;
94
95 case kRegexpCapture:
96 t_->append("(");
97 if (re->name()) {
98 t_->append("?P<");
99 t_->append(*re->name());
100 t_->append(">");
101 }
102 nprec = PrecParen;
103 break;
104
105 case kRegexpStar:
106 case kRegexpPlus:
107 case kRegexpQuest:
108 case kRegexpRepeat:
109 if (prec < PrecUnary)
110 t_->append("(?:");
111 // The subprecedence here is PrecAtom instead of PrecUnary
112 // because PCRE treats two unary ops in a row as a parse error.
113 nprec = PrecAtom;
114 break;
115 }
116
117 return nprec;
118 }
119
AppendLiteral(string * t,Rune r,bool foldcase)120 static void AppendLiteral(string *t, Rune r, bool foldcase) {
121 if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) {
122 t->append(1, '\\');
123 t->append(1, r);
124 } else if (foldcase && 'a' <= r && r <= 'z') {
125 if ('a' <= r && r <= 'z')
126 r += 'A' - 'a';
127 t->append(1, '[');
128 t->append(1, r);
129 t->append(1, r + 'a' - 'A');
130 t->append(1, ']');
131 } else {
132 AppendCCRange(t, r, r);
133 }
134 }
135
136 // Visits re after children are processed.
137 // For childless regexps, all the work is done here.
138 // For regexps with children, append any unary suffixes or ).
PostVisit(Regexp * re,int parent_arg,int pre_arg,int * child_args,int nchild_args)139 int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
140 int* child_args, int nchild_args) {
141 int prec = parent_arg;
142 switch (re->op()) {
143 case kRegexpNoMatch:
144 // There's no simple symbol for "no match", but
145 // [^0-Runemax] excludes everything.
146 t_->append("[^\\x00-\\x{10ffff}]");
147 break;
148
149 case kRegexpEmptyMatch:
150 // Append (?:) to make empty string visible,
151 // unless this is already being parenthesized.
152 if (prec < PrecEmpty)
153 t_->append("(?:)");
154 break;
155
156 case kRegexpLiteral:
157 AppendLiteral(t_, re->rune(), re->parse_flags() & Regexp::FoldCase);
158 break;
159
160 case kRegexpLiteralString:
161 for (int i = 0; i < re->nrunes(); i++)
162 AppendLiteral(t_, re->runes()[i], re->parse_flags() & Regexp::FoldCase);
163 if (prec < PrecConcat)
164 t_->append(")");
165 break;
166
167 case kRegexpConcat:
168 if (prec < PrecConcat)
169 t_->append(")");
170 break;
171
172 case kRegexpAlternate:
173 // Clumsy but workable: the children all appended |
174 // at the end of their strings, so just remove the last one.
175 if ((*t_)[t_->size()-1] == '|')
176 t_->erase(t_->size()-1);
177 else
178 LOG(DFATAL) << "Bad final char: " << t_;
179 if (prec < PrecAlternate)
180 t_->append(")");
181 break;
182
183 case kRegexpStar:
184 t_->append("*");
185 if (re->parse_flags() & Regexp::NonGreedy)
186 t_->append("?");
187 if (prec < PrecUnary)
188 t_->append(")");
189 break;
190
191 case kRegexpPlus:
192 t_->append("+");
193 if (re->parse_flags() & Regexp::NonGreedy)
194 t_->append("?");
195 if (prec < PrecUnary)
196 t_->append(")");
197 break;
198
199 case kRegexpQuest:
200 t_->append("?");
201 if (re->parse_flags() & Regexp::NonGreedy)
202 t_->append("?");
203 if (prec < PrecUnary)
204 t_->append(")");
205 break;
206
207 case kRegexpRepeat:
208 if (re->max() == -1)
209 t_->append(StringPrintf("{%d,}", re->min()));
210 else if (re->min() == re->max())
211 t_->append(StringPrintf("{%d}", re->min()));
212 else
213 t_->append(StringPrintf("{%d,%d}", re->min(), re->max()));
214 if (re->parse_flags() & Regexp::NonGreedy)
215 t_->append("?");
216 if (prec < PrecUnary)
217 t_->append(")");
218 break;
219
220 case kRegexpAnyChar:
221 t_->append(".");
222 break;
223
224 case kRegexpAnyByte:
225 t_->append("\\C");
226 break;
227
228 case kRegexpBeginLine:
229 t_->append("^");
230 break;
231
232 case kRegexpEndLine:
233 t_->append("$");
234 break;
235
236 case kRegexpBeginText:
237 t_->append("(?-m:^)");
238 break;
239
240 case kRegexpEndText:
241 if (re->parse_flags() & Regexp::WasDollar)
242 t_->append("(?-m:$)");
243 else
244 t_->append("\\z");
245 break;
246
247 case kRegexpWordBoundary:
248 t_->append("\\b");
249 break;
250
251 case kRegexpNoWordBoundary:
252 t_->append("\\B");
253 break;
254
255 case kRegexpCharClass: {
256 if (re->cc()->size() == 0) {
257 t_->append("[^\\x00-\\x{10ffff}]");
258 break;
259 }
260 t_->append("[");
261 // Heuristic: show class as negated if it contains the
262 // non-character 0xFFFE.
263 CharClass* cc = re->cc();
264 if (cc->Contains(0xFFFE)) {
265 cc = cc->Negate();
266 t_->append("^");
267 }
268 for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i)
269 AppendCCRange(t_, i->lo, i->hi);
270 if (cc != re->cc())
271 cc->Delete();
272 t_->append("]");
273 break;
274 }
275
276 case kRegexpCapture:
277 t_->append(")");
278 break;
279
280 case kRegexpHaveMatch:
281 // There's no syntax accepted by the parser to generate
282 // this node (it is generated by RE2::Set) so make something
283 // up that is readable but won't compile.
284 t_->append("(?HaveMatch:%d)", re->match_id());
285 break;
286 }
287
288 // If the parent is an alternation, append the | for it.
289 if (prec == PrecAlternate)
290 t_->append("|");
291
292 return 0;
293 }
294
295 // Appends a rune for use in a character class to the string t.
AppendCCChar(string * t,Rune r)296 static void AppendCCChar(string* t, Rune r) {
297 if (0x20 <= r && r <= 0x7E) {
298 if (strchr("[]^-\\", r))
299 t->append("\\");
300 t->append(1, r);
301 return;
302 }
303 switch (r) {
304 default:
305 break;
306
307 case '\r':
308 t->append("\\r");
309 return;
310
311 case '\t':
312 t->append("\\t");
313 return;
314
315 case '\n':
316 t->append("\\n");
317 return;
318
319 case '\f':
320 t->append("\\f");
321 return;
322 }
323
324 if (r < 0x100) {
325 StringAppendF(t, "\\x%02x", static_cast<int>(r));
326 return;
327 }
328 StringAppendF(t, "\\x{%x}", static_cast<int>(r));
329 }
330
AppendCCRange(string * t,Rune lo,Rune hi)331 static void AppendCCRange(string* t, Rune lo, Rune hi) {
332 if (lo > hi)
333 return;
334 AppendCCChar(t, lo);
335 if (lo < hi) {
336 t->append("-");
337 AppendCCChar(t, hi);
338 }
339 }
340
341 } // namespace re2
342