1 // Copyright 2017 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/asmjs/asm-scanner.h"
6 
7 #include "src/char-predicates-inl.h"
8 #include "src/conversions.h"
9 #include "src/flags.h"
10 #include "src/parsing/scanner.h"
11 #include "src/unicode-cache.h"
12 
13 namespace v8 {
14 namespace internal {
15 
namespace {
// Upper bound on the number of identifiers the scanner will hand out, chosen
// so that both global and local identifiers can be assigned a token id that
// fits in the range of an int32_t.
constexpr int kMaxIdentifierCount = 0xF000000;
}  // namespace
21 
AsmJsScanner(Utf16CharacterStream * stream)22 AsmJsScanner::AsmJsScanner(Utf16CharacterStream* stream)
23     : stream_(stream),
24       token_(kUninitialized),
25       preceding_token_(kUninitialized),
26       next_token_(kUninitialized),
27       position_(0),
28       preceding_position_(0),
29       next_position_(0),
30       rewind_(false),
31       in_local_scope_(false),
32       global_count_(0),
33       double_value_(0.0),
34       unsigned_value_(0),
35       preceded_by_newline_(false) {
36 #define V(name, _junk1, _junk2, _junk3) property_names_[#name] = kToken_##name;
37   STDLIB_MATH_FUNCTION_LIST(V)
38   STDLIB_ARRAY_TYPE_LIST(V)
39 #undef V
40 #define V(name, _junk1) property_names_[#name] = kToken_##name;
41   STDLIB_MATH_VALUE_LIST(V)
42 #undef V
43 #define V(name) property_names_[#name] = kToken_##name;
44   STDLIB_OTHER_LIST(V)
45 #undef V
46 #define V(name) global_names_[#name] = kToken_##name;
47   KEYWORD_NAME_LIST(V)
48 #undef V
49   Next();
50 }
51 
Next()52 void AsmJsScanner::Next() {
53   if (rewind_) {
54     preceding_token_ = token_;
55     preceding_position_ = position_;
56     token_ = next_token_;
57     position_ = next_position_;
58     next_token_ = kUninitialized;
59     next_position_ = 0;
60     rewind_ = false;
61     return;
62   }
63 
64   if (token_ == kEndOfInput || token_ == kParseError) {
65     return;
66   }
67 
68 #if DEBUG
69   if (FLAG_trace_asm_scanner) {
70     if (Token() == kDouble) {
71       PrintF("%lf ", AsDouble());
72     } else if (Token() == kUnsigned) {
73       PrintF("%" PRIu32 " ", AsUnsigned());
74     } else {
75       std::string name = Name(Token());
76       PrintF("%s ", name.c_str());
77     }
78   }
79 #endif
80 
81   preceded_by_newline_ = false;
82   preceding_token_ = token_;
83   preceding_position_ = position_;
84 
85   for (;;) {
86     position_ = stream_->pos();
87     uc32 ch = stream_->Advance();
88     switch (ch) {
89       case ' ':
90       case '\t':
91       case '\r':
92         // Ignore whitespace.
93         break;
94 
95       case '\n':
96         // Track when we've passed a newline for optional semicolon support,
97         // but keep scanning.
98         preceded_by_newline_ = true;
99         break;
100 
101       case kEndOfInput:
102         token_ = kEndOfInput;
103         return;
104 
105       case '\'':
106       case '"':
107         ConsumeString(ch);
108         return;
109 
110       case '/':
111         ch = stream_->Advance();
112         if (ch == '/') {
113           ConsumeCPPComment();
114         } else if (ch == '*') {
115           if (!ConsumeCComment()) {
116             token_ = kParseError;
117             return;
118           }
119         } else {
120           stream_->Back();
121           token_ = '/';
122           return;
123         }
124         // Breaks out of switch, but loops again (i.e. the case when we parsed
125         // a comment, but need to continue to look for the next token).
126         break;
127 
128       case '<':
129       case '>':
130       case '=':
131       case '!':
132         ConsumeCompareOrShift(ch);
133         return;
134 
135 #define V(single_char_token) case single_char_token:
136         SIMPLE_SINGLE_TOKEN_LIST(V)
137 #undef V
138         // Use fixed token IDs for ASCII.
139         token_ = ch;
140         return;
141 
142       default:
143         if (IsIdentifierStart(ch)) {
144           ConsumeIdentifier(ch);
145         } else if (IsNumberStart(ch)) {
146           ConsumeNumber(ch);
147         } else {
148           // TODO(bradnelson): Support unicode (probably via UnicodeCache).
149           token_ = kParseError;
150         }
151         return;
152     }
153   }
154 }
155 
Rewind()156 void AsmJsScanner::Rewind() {
157   DCHECK_NE(kUninitialized, preceding_token_);
158   // TODO(bradnelson): Currently rewinding needs to leave in place the
159   // preceding newline state (in case a |0 ends a line).
160   // This is weird and stateful, fix me.
161   DCHECK(!rewind_);
162   next_token_ = token_;
163   next_position_ = position_;
164   token_ = preceding_token_;
165   position_ = preceding_position_;
166   preceding_token_ = kUninitialized;
167   preceding_position_ = 0;
168   rewind_ = true;
169   identifier_string_.clear();
170 }
171 
ResetLocals()172 void AsmJsScanner::ResetLocals() { local_names_.clear(); }
173 
#if DEBUG
// Debug-only helper: maps |token| back to a printable name. Printable ASCII
// tokens map to their character; otherwise the local, global and property
// name tables are searched (in that order), followed by the fixed lists of
// long symbols and special tokens. Hitting none of them is a bug.
std::string AsmJsScanner::Name(token_t token) const {
  const bool printable_ascii = token >= 32 && token < 127;
  if (printable_ascii) {
    return std::string(1, static_cast<char>(token));
  }

  for (const auto& entry : local_names_) {
    if (entry.second == token) return entry.first;
  }
  for (const auto& entry : global_names_) {
    if (entry.second == token) return entry.first;
  }
  for (const auto& entry : property_names_) {
    if (entry.second == token) return entry.first;
  }

  switch (token) {
#define LONG_SYMBOL_CASE(rawname, name) \
  case kToken_##name:                   \
    return rawname;
    LONG_SYMBOL_NAME_LIST(LONG_SYMBOL_CASE)
#undef LONG_SYMBOL_CASE
#define SPECIAL_TOKEN_CASE(name, value, string_name) \
  case name:                                         \
    return string_name;
    SPECIAL_TOKEN_LIST(SPECIAL_TOKEN_CASE)
#undef SPECIAL_TOKEN_CASE
    default:
      break;
  }
  UNREACHABLE();
}
#endif
212 
Seek(size_t pos)213 void AsmJsScanner::Seek(size_t pos) {
214   stream_->Seek(pos);
215   preceding_token_ = kUninitialized;
216   token_ = kUninitialized;
217   next_token_ = kUninitialized;
218   preceding_position_ = 0;
219   position_ = 0;
220   next_position_ = 0;
221   rewind_ = false;
222   Next();
223 }
224 
ConsumeIdentifier(uc32 ch)225 void AsmJsScanner::ConsumeIdentifier(uc32 ch) {
226   // Consume characters while still part of the identifier.
227   identifier_string_.clear();
228   while (IsIdentifierPart(ch)) {
229     identifier_string_ += ch;
230     ch = stream_->Advance();
231   }
232   // Go back one for next time.
233   stream_->Back();
234 
235   // Decode what the identifier means.
236   if (preceding_token_ == '.') {
237     auto i = property_names_.find(identifier_string_);
238     if (i != property_names_.end()) {
239       token_ = i->second;
240       return;
241     }
242   } else {
243     {
244       auto i = local_names_.find(identifier_string_);
245       if (i != local_names_.end()) {
246         token_ = i->second;
247         return;
248       }
249     }
250     if (!in_local_scope_) {
251       auto i = global_names_.find(identifier_string_);
252       if (i != global_names_.end()) {
253         token_ = i->second;
254         return;
255       }
256     }
257   }
258   if (preceding_token_ == '.') {
259     CHECK_LT(global_count_, kMaxIdentifierCount);
260     token_ = kGlobalsStart + global_count_++;
261     property_names_[identifier_string_] = token_;
262   } else if (in_local_scope_) {
263     CHECK_LT(local_names_.size(), kMaxIdentifierCount);
264     token_ = kLocalsStart - static_cast<token_t>(local_names_.size());
265     local_names_[identifier_string_] = token_;
266   } else {
267     CHECK_LT(global_count_, kMaxIdentifierCount);
268     token_ = kGlobalsStart + global_count_++;
269     global_names_[identifier_string_] = token_;
270   }
271 }
272 
ConsumeNumber(uc32 ch)273 void AsmJsScanner::ConsumeNumber(uc32 ch) {
274   std::string number;
275   number = ch;
276   bool has_dot = ch == '.';
277   bool has_prefix = false;
278   for (;;) {
279     ch = stream_->Advance();
280     if ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') ||
281         (ch >= 'A' && ch <= 'F') || ch == '.' || ch == 'b' || ch == 'o' ||
282         ch == 'x' ||
283         ((ch == '-' || ch == '+') && !has_prefix &&
284          (number[number.size() - 1] == 'e' ||
285           number[number.size() - 1] == 'E'))) {
286       // TODO(bradnelson): Test weird cases ending in -.
287       if (ch == '.') {
288         has_dot = true;
289       }
290       if (ch == 'b' || ch == 'o' || ch == 'x') {
291         has_prefix = true;
292       }
293       number.push_back(ch);
294     } else {
295       break;
296     }
297   }
298   stream_->Back();
299   // Special case the most common number.
300   if (number.size() == 1 && number[0] == '0') {
301     unsigned_value_ = 0;
302     token_ = kUnsigned;
303     return;
304   }
305   // Pick out dot.
306   if (number.size() == 1 && number[0] == '.') {
307     token_ = '.';
308     return;
309   }
310   // Decode numbers.
311   UnicodeCache cache;
312   double_value_ = StringToDouble(
313       &cache,
314       Vector<const uint8_t>(reinterpret_cast<const uint8_t*>(number.data()),
315                             static_cast<int>(number.size())),
316       ALLOW_HEX | ALLOW_OCTAL | ALLOW_BINARY | ALLOW_IMPLICIT_OCTAL);
317   if (std::isnan(double_value_)) {
318     // Check if string to number conversion didn't consume all the characters.
319     // This happens if the character filter let through something invalid
320     // like: 0123ef for example.
321     // TODO(bradnelson): Check if this happens often enough to be a perf
322     // problem.
323     if (number[0] == '.') {
324       for (size_t k = 1; k < number.size(); ++k) {
325         stream_->Back();
326       }
327       token_ = '.';
328       return;
329     }
330     // Anything else that doesn't parse is an error.
331     token_ = kParseError;
332     return;
333   }
334   if (has_dot) {
335     token_ = kDouble;
336   } else {
337     // Exceeding safe integer range is an error.
338     if (double_value_ > static_cast<double>(kMaxUInt32)) {
339       token_ = kParseError;
340       return;
341     }
342     unsigned_value_ = static_cast<uint32_t>(double_value_);
343     token_ = kUnsigned;
344   }
345 }
346 
ConsumeCComment()347 bool AsmJsScanner::ConsumeCComment() {
348   for (;;) {
349     uc32 ch = stream_->Advance();
350     while (ch == '*') {
351       ch = stream_->Advance();
352       if (ch == '/') {
353         return true;
354       }
355     }
356     if (ch == kEndOfInput) {
357       return false;
358     }
359   }
360 }
361 
ConsumeCPPComment()362 void AsmJsScanner::ConsumeCPPComment() {
363   for (;;) {
364     uc32 ch = stream_->Advance();
365     if (ch == '\n' || ch == kEndOfInput) {
366       return;
367     }
368   }
369 }
370 
ConsumeString(uc32 quote)371 void AsmJsScanner::ConsumeString(uc32 quote) {
372   // Only string allowed is 'use asm' / "use asm".
373   const char* expected = "use asm";
374   for (; *expected != '\0'; ++expected) {
375     if (stream_->Advance() != *expected) {
376       token_ = kParseError;
377       return;
378     }
379   }
380   if (stream_->Advance() != quote) {
381     token_ = kParseError;
382     return;
383   }
384   token_ = kToken_UseAsm;
385 }
386 
ConsumeCompareOrShift(uc32 ch)387 void AsmJsScanner::ConsumeCompareOrShift(uc32 ch) {
388   uc32 next_ch = stream_->Advance();
389   if (next_ch == '=') {
390     switch (ch) {
391       case '<':
392         token_ = kToken_LE;
393         break;
394       case '>':
395         token_ = kToken_GE;
396         break;
397       case '=':
398         token_ = kToken_EQ;
399         break;
400       case '!':
401         token_ = kToken_NE;
402         break;
403       default:
404         UNREACHABLE();
405     }
406   } else if (ch == '<' && next_ch == '<') {
407     token_ = kToken_SHL;
408   } else if (ch == '>' && next_ch == '>') {
409     if (stream_->Advance() == '>') {
410       token_ = kToken_SHR;
411     } else {
412       token_ = kToken_SAR;
413       stream_->Back();
414     }
415   } else {
416     stream_->Back();
417     token_ = ch;
418   }
419 }
420 
IsIdentifierStart(uc32 ch)421 bool AsmJsScanner::IsIdentifierStart(uc32 ch) {
422   return IsInRange(AsciiAlphaToLower(ch), 'a', 'z') || ch == '_' || ch == '$';
423 }
424 
IsIdentifierPart(uc32 ch)425 bool AsmJsScanner::IsIdentifierPart(uc32 ch) { return IsAsciiIdentifier(ch); }
426 
IsNumberStart(uc32 ch)427 bool AsmJsScanner::IsNumberStart(uc32 ch) {
428   return ch == '.' || IsDecimalDigit(ch);
429 }
430 
431 }  // namespace internal
432 }  // namespace v8
433