1 // Copyright 2018 The Amber Authors.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/tokenizer.h"
16 
17 #include <cctype>
18 #include <cstdlib>
19 #include <limits>
20 #include <sstream>
21 
22 #include "src/make_unique.h"
23 
24 namespace amber {
25 
Token(TokenType type)26 Token::Token(TokenType type) : type_(type) {}
27 
28 Token::~Token() = default;
29 
ConvertToDouble()30 Result Token::ConvertToDouble() {
31   if (IsDouble())
32     return {};
33 
34   if (IsIdentifier() || IsEOL() || IsEOS())
35     return Result("Invalid conversion to double");
36 
37   if (IsInteger()) {
38     if (is_negative_ ||
39         uint_value_ <=
40             static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) {
41       double_value_ = static_cast<double>(AsInt64());
42     } else {
43       return Result("uint64_t value too big to fit in double");
44     }
45 
46     uint_value_ = 0;
47   } else if (IsHex()) {
48     double_value_ = static_cast<double>(AsHex());
49     string_value_ = "";
50   }
51   type_ = TokenType::kDouble;
52   return {};
53 }
54 
Tokenizer(const std::string & data)55 Tokenizer::Tokenizer(const std::string& data) : data_(data) {}
56 
57 Tokenizer::~Tokenizer() = default;
58 
NextToken()59 std::unique_ptr<Token> Tokenizer::NextToken() {
60   SkipWhitespace();
61   if (current_position_ >= data_.length())
62     return MakeUnique<Token>(TokenType::kEOS);
63 
64   if (data_[current_position_] == '#') {
65     SkipComment();
66     SkipWhitespace();
67   }
68   if (current_position_ >= data_.length())
69     return MakeUnique<Token>(TokenType::kEOS);
70 
71   if (data_[current_position_] == '\n') {
72     ++current_line_;
73     ++current_position_;
74     return MakeUnique<Token>(TokenType::kEOL);
75   }
76 
77   if (data_[current_position_] == '"') {
78     current_position_++;  // Skip opening quote
79     std::string tok_str;
80     bool escape = false;
81     for (; current_position_ < data_.length(); current_position_++) {
82       auto c = data_[current_position_];
83       switch (c) {
84         case '\\':
85           if (!escape) {
86             escape = true;
87             continue;
88           }
89           break;
90         case '"':
91           if (!escape) {
92             current_position_++;  // Skip closing quote
93             auto tok = MakeUnique<Token>(TokenType::kString);
94             tok->SetStringValue(tok_str);
95             return tok;
96           }
97           break;
98         case 'a':
99           if (escape) {
100             tok_str += '\a';
101             escape = false;
102             continue;
103           }
104           break;
105         case 'b':
106           if (escape) {
107             tok_str += '\b';
108             escape = false;
109             continue;
110           }
111           break;
112         case 't':
113           if (escape) {
114             tok_str += '\t';
115             escape = false;
116             continue;
117           }
118           break;
119         case 'n':
120           if (escape) {
121             tok_str += '\n';
122             escape = false;
123             continue;
124           }
125           break;
126         case 'v':
127           if (escape) {
128             tok_str += '\v';
129             escape = false;
130             continue;
131           }
132           break;
133         case 'f':
134           if (escape) {
135             tok_str += '\f';
136             escape = false;
137             continue;
138           }
139           break;
140         case 'r':
141           if (escape) {
142             tok_str += '\r';
143             escape = false;
144             continue;
145           }
146           break;
147       }
148       escape = false;
149       tok_str += c;
150     }
151 
152     auto tok = MakeUnique<Token>(TokenType::kString);
153     tok->SetStringValue(tok_str);
154     return tok;
155   }
156 
157   // If the current position is a , ( or ) then handle it specially as we don't
158   // want to consume any other characters.
159   if (data_[current_position_] == ',' || data_[current_position_] == '(' ||
160       data_[current_position_] == ')') {
161     auto tok = MakeUnique<Token>(TokenType::kIdentifier);
162     std::string str(1, data_[current_position_]);
163     tok->SetStringValue(str);
164     ++current_position_;
165     return tok;
166   }
167 
168   size_t end_pos = current_position_;
169   while (end_pos < data_.length()) {
170     if (data_[end_pos] == ' ' || data_[end_pos] == '\r' ||
171         data_[end_pos] == '\n' || data_[end_pos] == ')' ||
172         data_[end_pos] == ',' || data_[end_pos] == '(') {
173       break;
174     }
175     ++end_pos;
176   }
177 
178   std::string tok_str =
179       data_.substr(current_position_, end_pos - current_position_);
180   current_position_ = end_pos;
181 
182   // Check for "NaN" explicitly.
183   bool is_nan =
184       (tok_str.size() == 3 && std::tolower(tok_str[0]) == 'n' &&
185        std::tolower(tok_str[1]) == 'a' && std::tolower(tok_str[2]) == 'n');
186 
187   // Starts with an alpha is a string.
188   if (!is_nan && !std::isdigit(tok_str[0]) &&
189       !(tok_str[0] == '-' && tok_str.size() >= 2 && std::isdigit(tok_str[1])) &&
190       !(tok_str[0] == '.' && tok_str.size() >= 2 && std::isdigit(tok_str[1]))) {
191     // If we've got a continuation, skip over the end of line and get the next
192     // token.
193     if (tok_str == "\\") {
194       if ((current_position_ < data_.length() &&
195            data_[current_position_] == '\n')) {
196         ++current_line_;
197         ++current_position_;
198         return NextToken();
199       } else if (current_position_ + 1 < data_.length() &&
200                  data_[current_position_] == '\r' &&
201                  data_[current_position_ + 1] == '\n') {
202         ++current_line_;
203         current_position_ += 2;
204         return NextToken();
205       }
206     }
207 
208     auto tok = MakeUnique<Token>(TokenType::kIdentifier);
209     tok->SetStringValue(tok_str);
210     return tok;
211   }
212 
213   // Handle hex strings
214   if (!is_nan && tok_str.size() > 2 && tok_str[0] == '0' && tok_str[1] == 'x') {
215     auto tok = MakeUnique<Token>(TokenType::kHex);
216     tok->SetStringValue(tok_str);
217     return tok;
218   }
219 
220   bool is_double = false;
221   if (is_nan) {
222     is_double = true;
223   } else {
224     for (const char ch : tok_str) {
225       if (ch == '.') {
226         is_double = true;
227         break;
228       }
229     }
230   }
231 
232   std::unique_ptr<Token> tok;
233 
234   char* final_pos = nullptr;
235   if (is_double) {
236     tok = MakeUnique<Token>(TokenType::kDouble);
237 
238     double val = strtod(tok_str.c_str(), &final_pos);
239     tok->SetDoubleValue(val);
240   } else {
241     tok = MakeUnique<Token>(TokenType::kInteger);
242 
243     uint64_t val = uint64_t(std::strtoull(tok_str.c_str(), &final_pos, 10));
244     tok->SetUint64Value(static_cast<uint64_t>(val));
245   }
246   if (tok_str.size() > 1 && tok_str[0] == '-')
247     tok->SetNegative();
248 
249   tok->SetOriginalString(
250       tok_str.substr(0, static_cast<size_t>(final_pos - tok_str.c_str())));
251 
252   // If the number isn't the whole token then move back so we can then parse
253   // the string portion.
254   auto diff = size_t(final_pos - tok_str.c_str());
255   if (diff > 0)
256     current_position_ -= tok_str.length() - diff;
257 
258   return tok;
259 }
260 
PeekNextToken()261 std::unique_ptr<Token> Tokenizer::PeekNextToken() {
262   // Use NextToken() and restore location pointers.
263   auto orig_position = current_position_;
264   auto orig_line = current_line_;
265   std::unique_ptr<Token> tok = NextToken();
266   current_position_ = orig_position;
267   current_line_ = orig_line;
268 
269   return tok;
270 }
271 
ExtractToNext(const std::string & str)272 std::string Tokenizer::ExtractToNext(const std::string& str) {
273   size_t pos = data_.find(str, current_position_);
274   std::string ret;
275   if (pos == std::string::npos) {
276     ret = data_.substr(current_position_);
277     current_position_ = data_.length();
278   } else {
279     ret = data_.substr(current_position_, pos - current_position_);
280     current_position_ = pos;
281   }
282 
283   // Account for any new lines in the extracted text so our current line
284   // number stays correct.
285   for (const char c : ret) {
286     if (c == '\n')
287       ++current_line_;
288   }
289 
290   return ret;
291 }
292 
IsWhitespace(char ch)293 bool Tokenizer::IsWhitespace(char ch) {
294   return ch == '\0' || ch == '\t' || ch == '\r' || ch == 0x0c /* ff */ ||
295          ch == ' ';
296 }
297 
SkipWhitespace()298 void Tokenizer::SkipWhitespace() {
299   while (current_position_ < data_.size() &&
300          IsWhitespace(data_[current_position_])) {
301     ++current_position_;
302   }
303 }
304 
SkipComment()305 void Tokenizer::SkipComment() {
306   while (current_position_ < data_.length() &&
307          data_[current_position_] != '\n') {
308     ++current_position_;
309   }
310 }
311 
312 }  // namespace amber
313