1 /*
2  * Copyright 2017 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "RegexParser.h"
9 
10 #include "LexUtil.h"
11 
12 RegexNode RegexParser::parse(std::string source) {
13     fSource = source;
14     fIndex = 0;
15     ASSERT(fStack.size() == 0);
16     this->regex();
17     ASSERT(fStack.size() == 1);
18     ASSERT(fIndex == source.size());
19     return this->pop();
20 }
21 
22 char RegexParser::peek() {
23     if (fIndex >= fSource.size()) {
24         return END;
25     }
26     return fSource[fIndex];
27 }
28 
29 void RegexParser::expect(char c) {
30     if (this->peek() != c) {
31         printf("expected '%c' at index %d, but found '%c'", c, (int) fIndex, this->peek());
32         exit(1);
33     }
34     ++fIndex;
35 }
36 
37 RegexNode RegexParser::pop() {
38     RegexNode result = fStack.top();
39     fStack.pop();
40     return result;
41 }
42 
43 void RegexParser::term() {
44     switch (this->peek()) {
45         case '(': this->group();  break;
46         case '[': this->set();    break;
47         case '.': this->dot();    break;
48         default: this->literal();
49     }
50 }
51 
52 void RegexParser::quantifiedTerm() {
53     this->term();
54     switch (this->peek()) {
55         case '*': fStack.push(RegexNode(RegexNode::kStar_Kind,     this->pop())); ++fIndex; break;
56         case '+': fStack.push(RegexNode(RegexNode::kPlus_Kind,     this->pop())); ++fIndex; break;
57         case '?': fStack.push(RegexNode(RegexNode::kQuestion_Kind, this->pop())); ++fIndex; break;
58         default:  break;
59     }
60 }
61 
62 void RegexParser::sequence() {
63     this->quantifiedTerm();
64     for (;;) {
65         switch (this->peek()) {
66             case END: // fall through
67             case '|': // fall through
68             case ')': return;
69             default:
70                 this->sequence();
71                 RegexNode right = this->pop();
72                 RegexNode left = this->pop();
73                 fStack.emplace(RegexNode::kConcat_Kind, std::move(left), std::move(right));
74         }
75     }
76 }
77 
78 RegexNode RegexParser::escapeSequence(char c) {
79     switch (c) {
80         case 'n': return RegexNode(RegexNode::kChar_Kind, '\n');
81         case 'r': return RegexNode(RegexNode::kChar_Kind, '\r');
82         case 't': return RegexNode(RegexNode::kChar_Kind, '\t');
83         case 's': return RegexNode(RegexNode::kCharset_Kind, " \t\n\r");
84         default:  return RegexNode(RegexNode::kChar_Kind, c);
85     }
86 }
87 
88 void RegexParser::literal() {
89     char c = this->peek();
90     if (c == '\\') {
91         ++fIndex;
92         fStack.push(this->escapeSequence(peek()));
93         ++fIndex;
94     }
95     else {
96         fStack.push(RegexNode(RegexNode::kChar_Kind, c));
97         ++fIndex;
98     }
99 }
100 
101 void RegexParser::dot() {
102     this->expect('.');
103     fStack.push(RegexNode(RegexNode::kDot_Kind));
104 }
105 
106 void RegexParser::group() {
107     this->expect('(');
108     this->regex();
109     this->expect(')');
110 }
111 
112 void RegexParser::setItem() {
113     this->literal();
114     if (this->peek() == '-') {
115         ++fIndex;
116         if (peek() == ']') {
117             fStack.push(RegexNode(RegexNode::kChar_Kind, '-'));
118         }
119         else {
120             literal();
121             RegexNode end = this->pop();
122             ASSERT(end.fKind == RegexNode::kChar_Kind);
123             RegexNode start = this->pop();
124             ASSERT(start.fKind == RegexNode::kChar_Kind);
125             fStack.push(RegexNode(RegexNode::kRange_Kind, std::move(start), std::move(end)));
126         }
127     }
128 }
129 
130 void RegexParser::set() {
131     expect('[');
132     size_t depth = fStack.size();
133     RegexNode set(RegexNode::kCharset_Kind);
134     if (this->peek() == '^') {
135         ++fIndex;
136         set.fPayload.fBool = true;
137     }
138     else {
139         set.fPayload.fBool = false;
140     }
141     for (;;) {
142         switch (this->peek()) {
143             case ']':
144                 ++fIndex;
145                 while (fStack.size() > depth) {
146                     set.fChildren.push_back(this->pop());
147                 }
148                 fStack.push(std::move(set));
149                 return;
150             case END:
151                 printf("unterminated character set\n");
152                 exit(1);
153             default:
154                 this->setItem();
155         }
156     }
157 }
158 
159 void RegexParser::regex() {
160     this->sequence();
161     switch (this->peek()) {
162         case '|': {
163             ++fIndex;
164             this->regex();
165             RegexNode right = this->pop();
166             RegexNode left = this->pop();
167             fStack.push(RegexNode(RegexNode::kOr_Kind, left, right));
168             break;
169         }
170         case END: // fall through
171         case ')':
172             return;
173         default:
174             ASSERT(false);
175     }
176 }
177