1 // Copyright 2017 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
#include "src/asmjs/asm-scanner.h"

#include <cmath>

#include "src/char-predicates-inl.h"
#include "src/conversions.h"
#include "src/flags.h"
#include "src/parsing/scanner.h"
#include "src/unicode-cache.h"
12
13 namespace v8 {
14 namespace internal {
15
namespace {
// Cap number of identifiers to ensure we can assign both global and
// local ones a token id in the range of an int32_t.
// (Entities in an unnamed namespace already have internal linkage, so no
// `static` is needed.)
constexpr int kMaxIdentifierCount = 0xF000000;
}  // namespace
21
AsmJsScanner(Utf16CharacterStream * stream)22 AsmJsScanner::AsmJsScanner(Utf16CharacterStream* stream)
23 : stream_(stream),
24 token_(kUninitialized),
25 preceding_token_(kUninitialized),
26 next_token_(kUninitialized),
27 position_(0),
28 preceding_position_(0),
29 next_position_(0),
30 rewind_(false),
31 in_local_scope_(false),
32 global_count_(0),
33 double_value_(0.0),
34 unsigned_value_(0),
35 preceded_by_newline_(false) {
36 #define V(name, _junk1, _junk2, _junk3) property_names_[#name] = kToken_##name;
37 STDLIB_MATH_FUNCTION_LIST(V)
38 STDLIB_ARRAY_TYPE_LIST(V)
39 #undef V
40 #define V(name, _junk1) property_names_[#name] = kToken_##name;
41 STDLIB_MATH_VALUE_LIST(V)
42 #undef V
43 #define V(name) property_names_[#name] = kToken_##name;
44 STDLIB_OTHER_LIST(V)
45 #undef V
46 #define V(name) global_names_[#name] = kToken_##name;
47 KEYWORD_NAME_LIST(V)
48 #undef V
49 Next();
50 }
51
Next()52 void AsmJsScanner::Next() {
53 if (rewind_) {
54 preceding_token_ = token_;
55 preceding_position_ = position_;
56 token_ = next_token_;
57 position_ = next_position_;
58 next_token_ = kUninitialized;
59 next_position_ = 0;
60 rewind_ = false;
61 return;
62 }
63
64 if (token_ == kEndOfInput || token_ == kParseError) {
65 return;
66 }
67
68 #if DEBUG
69 if (FLAG_trace_asm_scanner) {
70 if (Token() == kDouble) {
71 PrintF("%lf ", AsDouble());
72 } else if (Token() == kUnsigned) {
73 PrintF("%" PRIu32 " ", AsUnsigned());
74 } else {
75 std::string name = Name(Token());
76 PrintF("%s ", name.c_str());
77 }
78 }
79 #endif
80
81 preceded_by_newline_ = false;
82 preceding_token_ = token_;
83 preceding_position_ = position_;
84
85 for (;;) {
86 position_ = stream_->pos();
87 uc32 ch = stream_->Advance();
88 switch (ch) {
89 case ' ':
90 case '\t':
91 case '\r':
92 // Ignore whitespace.
93 break;
94
95 case '\n':
96 // Track when we've passed a newline for optional semicolon support,
97 // but keep scanning.
98 preceded_by_newline_ = true;
99 break;
100
101 case kEndOfInput:
102 token_ = kEndOfInput;
103 return;
104
105 case '\'':
106 case '"':
107 ConsumeString(ch);
108 return;
109
110 case '/':
111 ch = stream_->Advance();
112 if (ch == '/') {
113 ConsumeCPPComment();
114 } else if (ch == '*') {
115 if (!ConsumeCComment()) {
116 token_ = kParseError;
117 return;
118 }
119 } else {
120 stream_->Back();
121 token_ = '/';
122 return;
123 }
124 // Breaks out of switch, but loops again (i.e. the case when we parsed
125 // a comment, but need to continue to look for the next token).
126 break;
127
128 case '<':
129 case '>':
130 case '=':
131 case '!':
132 ConsumeCompareOrShift(ch);
133 return;
134
135 #define V(single_char_token) case single_char_token:
136 SIMPLE_SINGLE_TOKEN_LIST(V)
137 #undef V
138 // Use fixed token IDs for ASCII.
139 token_ = ch;
140 return;
141
142 default:
143 if (IsIdentifierStart(ch)) {
144 ConsumeIdentifier(ch);
145 } else if (IsNumberStart(ch)) {
146 ConsumeNumber(ch);
147 } else {
148 // TODO(bradnelson): Support unicode (probably via UnicodeCache).
149 token_ = kParseError;
150 }
151 return;
152 }
153 }
154 }
155
Rewind()156 void AsmJsScanner::Rewind() {
157 DCHECK_NE(kUninitialized, preceding_token_);
158 // TODO(bradnelson): Currently rewinding needs to leave in place the
159 // preceding newline state (in case a |0 ends a line).
160 // This is weird and stateful, fix me.
161 DCHECK(!rewind_);
162 next_token_ = token_;
163 next_position_ = position_;
164 token_ = preceding_token_;
165 position_ = preceding_position_;
166 preceding_token_ = kUninitialized;
167 preceding_position_ = 0;
168 rewind_ = true;
169 identifier_string_.clear();
170 }
171
ResetLocals()172 void AsmJsScanner::ResetLocals() { local_names_.clear(); }
173
#if DEBUG
// Only used for debugging.
// Maps |token| back to a printable string. Printable ASCII tokens map to the
// character itself; otherwise the local, global and property name tables are
// searched (in that order) for an entry carrying this token id, and finally
// the long-symbol and special-token lists are consulted.
std::string AsmJsScanner::Name(token_t token) const {
  const bool is_printable_ascii = token >= 32 && token < 127;
  if (is_printable_ascii) {
    return std::string(1, static_cast<char>(token));
  }

  // Reverse lookups: the tables are keyed by name, so scan for the value.
  for (const auto& entry : local_names_) {
    if (entry.second == token) return entry.first;
  }
  for (const auto& entry : global_names_) {
    if (entry.second == token) return entry.first;
  }
  for (const auto& entry : property_names_) {
    if (entry.second == token) return entry.first;
  }

  switch (token) {
#define V(rawname, name) \
  case kToken_##name:    \
    return rawname;
    LONG_SYMBOL_NAME_LIST(V)
#undef V
#define V(name, value, string_name) \
  case name:                        \
    return string_name;
    SPECIAL_TOKEN_LIST(V)
#undef V
    default:
      break;
  }
  UNREACHABLE();
}
#endif
212
Seek(size_t pos)213 void AsmJsScanner::Seek(size_t pos) {
214 stream_->Seek(pos);
215 preceding_token_ = kUninitialized;
216 token_ = kUninitialized;
217 next_token_ = kUninitialized;
218 preceding_position_ = 0;
219 position_ = 0;
220 next_position_ = 0;
221 rewind_ = false;
222 Next();
223 }
224
ConsumeIdentifier(uc32 ch)225 void AsmJsScanner::ConsumeIdentifier(uc32 ch) {
226 // Consume characters while still part of the identifier.
227 identifier_string_.clear();
228 while (IsIdentifierPart(ch)) {
229 identifier_string_ += ch;
230 ch = stream_->Advance();
231 }
232 // Go back one for next time.
233 stream_->Back();
234
235 // Decode what the identifier means.
236 if (preceding_token_ == '.') {
237 auto i = property_names_.find(identifier_string_);
238 if (i != property_names_.end()) {
239 token_ = i->second;
240 return;
241 }
242 } else {
243 {
244 auto i = local_names_.find(identifier_string_);
245 if (i != local_names_.end()) {
246 token_ = i->second;
247 return;
248 }
249 }
250 if (!in_local_scope_) {
251 auto i = global_names_.find(identifier_string_);
252 if (i != global_names_.end()) {
253 token_ = i->second;
254 return;
255 }
256 }
257 }
258 if (preceding_token_ == '.') {
259 CHECK_LT(global_count_, kMaxIdentifierCount);
260 token_ = kGlobalsStart + global_count_++;
261 property_names_[identifier_string_] = token_;
262 } else if (in_local_scope_) {
263 CHECK_LT(local_names_.size(), kMaxIdentifierCount);
264 token_ = kLocalsStart - static_cast<token_t>(local_names_.size());
265 local_names_[identifier_string_] = token_;
266 } else {
267 CHECK_LT(global_count_, kMaxIdentifierCount);
268 token_ = kGlobalsStart + global_count_++;
269 global_names_[identifier_string_] = token_;
270 }
271 }
272
ConsumeNumber(uc32 ch)273 void AsmJsScanner::ConsumeNumber(uc32 ch) {
274 std::string number;
275 number = ch;
276 bool has_dot = ch == '.';
277 bool has_prefix = false;
278 for (;;) {
279 ch = stream_->Advance();
280 if ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') ||
281 (ch >= 'A' && ch <= 'F') || ch == '.' || ch == 'b' || ch == 'o' ||
282 ch == 'x' ||
283 ((ch == '-' || ch == '+') && !has_prefix &&
284 (number[number.size() - 1] == 'e' ||
285 number[number.size() - 1] == 'E'))) {
286 // TODO(bradnelson): Test weird cases ending in -.
287 if (ch == '.') {
288 has_dot = true;
289 }
290 if (ch == 'b' || ch == 'o' || ch == 'x') {
291 has_prefix = true;
292 }
293 number.push_back(ch);
294 } else {
295 break;
296 }
297 }
298 stream_->Back();
299 // Special case the most common number.
300 if (number.size() == 1 && number[0] == '0') {
301 unsigned_value_ = 0;
302 token_ = kUnsigned;
303 return;
304 }
305 // Pick out dot.
306 if (number.size() == 1 && number[0] == '.') {
307 token_ = '.';
308 return;
309 }
310 // Decode numbers.
311 UnicodeCache cache;
312 double_value_ = StringToDouble(
313 &cache,
314 Vector<const uint8_t>(reinterpret_cast<const uint8_t*>(number.data()),
315 static_cast<int>(number.size())),
316 ALLOW_HEX | ALLOW_OCTAL | ALLOW_BINARY | ALLOW_IMPLICIT_OCTAL);
317 if (std::isnan(double_value_)) {
318 // Check if string to number conversion didn't consume all the characters.
319 // This happens if the character filter let through something invalid
320 // like: 0123ef for example.
321 // TODO(bradnelson): Check if this happens often enough to be a perf
322 // problem.
323 if (number[0] == '.') {
324 for (size_t k = 1; k < number.size(); ++k) {
325 stream_->Back();
326 }
327 token_ = '.';
328 return;
329 }
330 // Anything else that doesn't parse is an error.
331 token_ = kParseError;
332 return;
333 }
334 if (has_dot) {
335 token_ = kDouble;
336 } else {
337 // Exceeding safe integer range is an error.
338 if (double_value_ > static_cast<double>(kMaxUInt32)) {
339 token_ = kParseError;
340 return;
341 }
342 unsigned_value_ = static_cast<uint32_t>(double_value_);
343 token_ = kUnsigned;
344 }
345 }
346
ConsumeCComment()347 bool AsmJsScanner::ConsumeCComment() {
348 for (;;) {
349 uc32 ch = stream_->Advance();
350 while (ch == '*') {
351 ch = stream_->Advance();
352 if (ch == '/') {
353 return true;
354 }
355 }
356 if (ch == kEndOfInput) {
357 return false;
358 }
359 }
360 }
361
ConsumeCPPComment()362 void AsmJsScanner::ConsumeCPPComment() {
363 for (;;) {
364 uc32 ch = stream_->Advance();
365 if (ch == '\n' || ch == kEndOfInput) {
366 return;
367 }
368 }
369 }
370
ConsumeString(uc32 quote)371 void AsmJsScanner::ConsumeString(uc32 quote) {
372 // Only string allowed is 'use asm' / "use asm".
373 const char* expected = "use asm";
374 for (; *expected != '\0'; ++expected) {
375 if (stream_->Advance() != *expected) {
376 token_ = kParseError;
377 return;
378 }
379 }
380 if (stream_->Advance() != quote) {
381 token_ = kParseError;
382 return;
383 }
384 token_ = kToken_UseAsm;
385 }
386
ConsumeCompareOrShift(uc32 ch)387 void AsmJsScanner::ConsumeCompareOrShift(uc32 ch) {
388 uc32 next_ch = stream_->Advance();
389 if (next_ch == '=') {
390 switch (ch) {
391 case '<':
392 token_ = kToken_LE;
393 break;
394 case '>':
395 token_ = kToken_GE;
396 break;
397 case '=':
398 token_ = kToken_EQ;
399 break;
400 case '!':
401 token_ = kToken_NE;
402 break;
403 default:
404 UNREACHABLE();
405 }
406 } else if (ch == '<' && next_ch == '<') {
407 token_ = kToken_SHL;
408 } else if (ch == '>' && next_ch == '>') {
409 if (stream_->Advance() == '>') {
410 token_ = kToken_SHR;
411 } else {
412 token_ = kToken_SAR;
413 stream_->Back();
414 }
415 } else {
416 stream_->Back();
417 token_ = ch;
418 }
419 }
420
IsIdentifierStart(uc32 ch)421 bool AsmJsScanner::IsIdentifierStart(uc32 ch) {
422 return IsInRange(AsciiAlphaToLower(ch), 'a', 'z') || ch == '_' || ch == '$';
423 }
424
IsIdentifierPart(uc32 ch)425 bool AsmJsScanner::IsIdentifierPart(uc32 ch) { return IsAsciiIdentifier(ch); }
426
IsNumberStart(uc32 ch)427 bool AsmJsScanner::IsNumberStart(uc32 ch) {
428 return ch == '.' || IsDecimalDigit(ch);
429 }
430
431 } // namespace internal
432 } // namespace v8
433