1 /* 2 * Copyright 2017 Google Inc. 3 * 4 * Use of this source code is governed by a BSD-style license that can be 5 * found in the LICENSE file. 6 */ 7 8 #include "RegexParser.h" 9 10 #include "LexUtil.h" 11 12 RegexNode RegexParser::parse(std::string source) { 13 fSource = source; 14 fIndex = 0; 15 ASSERT(fStack.size() == 0); 16 this->regex(); 17 ASSERT(fStack.size() == 1); 18 ASSERT(fIndex == source.size()); 19 return this->pop(); 20 } 21 22 char RegexParser::peek() { 23 if (fIndex >= fSource.size()) { 24 return END; 25 } 26 return fSource[fIndex]; 27 } 28 29 void RegexParser::expect(char c) { 30 if (this->peek() != c) { 31 printf("expected '%c' at index %d, but found '%c'", c, (int) fIndex, this->peek()); 32 exit(1); 33 } 34 ++fIndex; 35 } 36 37 RegexNode RegexParser::pop() { 38 RegexNode result = fStack.top(); 39 fStack.pop(); 40 return result; 41 } 42 43 void RegexParser::term() { 44 switch (this->peek()) { 45 case '(': this->group(); break; 46 case '[': this->set(); break; 47 case '.': this->dot(); break; 48 default: this->literal(); 49 } 50 } 51 52 void RegexParser::quantifiedTerm() { 53 this->term(); 54 switch (this->peek()) { 55 case '*': fStack.push(RegexNode(RegexNode::kStar_Kind, this->pop())); ++fIndex; break; 56 case '+': fStack.push(RegexNode(RegexNode::kPlus_Kind, this->pop())); ++fIndex; break; 57 case '?': fStack.push(RegexNode(RegexNode::kQuestion_Kind, this->pop())); ++fIndex; break; 58 default: break; 59 } 60 } 61 62 void RegexParser::sequence() { 63 this->quantifiedTerm(); 64 for (;;) { 65 switch (this->peek()) { 66 case END: // fall through 67 case '|': // fall through 68 case ')': return; 69 default: 70 this->sequence(); 71 RegexNode right = this->pop(); 72 RegexNode left = this->pop(); 73 fStack.emplace(RegexNode::kConcat_Kind, std::move(left), std::move(right)); 74 } 75 } 76 } 77 78 RegexNode RegexParser::escapeSequence(char c) { 79 switch (c) { 80 case 'n': return RegexNode(RegexNode::kChar_Kind, '\n'); 81 case 'r': return RegexNode(RegexNode::kChar_Kind, '\r'); 82 case 't': return RegexNode(RegexNode::kChar_Kind, '\t'); 83 case 's': return RegexNode(RegexNode::kCharset_Kind, " \t\n\r"); 84 default: return RegexNode(RegexNode::kChar_Kind, c); 85 } 86 } 87 88 void RegexParser::literal() { 89 char c = this->peek(); 90 if (c == '\\') { 91 ++fIndex; 92 fStack.push(this->escapeSequence(peek())); 93 ++fIndex; 94 } 95 else { 96 fStack.push(RegexNode(RegexNode::kChar_Kind, c)); 97 ++fIndex; 98 } 99 } 100 101 void RegexParser::dot() { 102 this->expect('.'); 103 fStack.push(RegexNode(RegexNode::kDot_Kind)); 104 } 105 106 void RegexParser::group() { 107 this->expect('('); 108 this->regex(); 109 this->expect(')'); 110 } 111 112 void RegexParser::setItem() { 113 this->literal(); 114 if (this->peek() == '-') { 115 ++fIndex; 116 if (peek() == ']') { 117 fStack.push(RegexNode(RegexNode::kChar_Kind, '-')); 118 } 119 else { 120 literal(); 121 RegexNode end = this->pop(); 122 ASSERT(end.fKind == RegexNode::kChar_Kind); 123 RegexNode start = this->pop(); 124 ASSERT(start.fKind == RegexNode::kChar_Kind); 125 fStack.push(RegexNode(RegexNode::kRange_Kind, std::move(start), std::move(end))); 126 } 127 } 128 } 129 130 void RegexParser::set() { 131 expect('['); 132 size_t depth = fStack.size(); 133 RegexNode set(RegexNode::kCharset_Kind); 134 if (this->peek() == '^') { 135 ++fIndex; 136 set.fPayload.fBool = true; 137 } 138 else { 139 set.fPayload.fBool = false; 140 } 141 for (;;) { 142 switch (this->peek()) { 143 case ']': 144 ++fIndex; 145 while (fStack.size() > depth) { 146 set.fChildren.push_back(this->pop()); 147 } 148 fStack.push(std::move(set)); 149 return; 150 case END: 151 printf("unterminated character set\n"); 152 exit(1); 153 default: 154 this->setItem(); 155 } 156 } 157 } 158 159 void RegexParser::regex() { 160 this->sequence(); 161 switch (this->peek()) { 162 case '|': { 163 ++fIndex; 164 this->regex(); 165 RegexNode right = this->pop(); 166 RegexNode left = this->pop(); 167 fStack.push(RegexNode(RegexNode::kOr_Kind, left, right)); 168 break; 169 } 170 case END: // fall through 171 case ')': 172 return; 173 default: 174 ASSERT(false); 175 } 176 } 177