1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Features shared by parsing and pre-parsing scanners.
6 
7 #include "src/parsing/scanner.h"
8 
9 #include <stdint.h>
10 
11 #include <cmath>
12 
13 #include "src/ast/ast-value-factory.h"
14 #include "src/char-predicates-inl.h"
15 #include "src/conversions-inl.h"
16 #include "src/list-inl.h"
17 #include "src/parsing/duplicate-finder.h"  // For Scanner::FindSymbol
18 
19 namespace v8 {
20 namespace internal {
21 
Internalize(Isolate * isolate) const22 Handle<String> Scanner::LiteralBuffer::Internalize(Isolate* isolate) const {
23   if (is_one_byte()) {
24     return isolate->factory()->InternalizeOneByteString(one_byte_literal());
25   }
26   return isolate->factory()->InternalizeTwoByteString(two_byte_literal());
27 }
28 
29 // ----------------------------------------------------------------------------
30 // Scanner::BookmarkScope
31 
32 const size_t Scanner::BookmarkScope::kBookmarkAtFirstPos =
33     std::numeric_limits<size_t>::max() - 2;
34 const size_t Scanner::BookmarkScope::kNoBookmark =
35     std::numeric_limits<size_t>::max() - 1;
36 const size_t Scanner::BookmarkScope::kBookmarkWasApplied =
37     std::numeric_limits<size_t>::max();
38 
Set()39 void Scanner::BookmarkScope::Set() {
40   DCHECK_EQ(bookmark_, kNoBookmark);
41   DCHECK_EQ(scanner_->next_next_.token, Token::UNINITIALIZED);
42 
43   // The first token is a bit special, since current_ will still be
44   // uninitialized. In this case, store kBookmarkAtFirstPos and special-case it
45   // when
46   // applying the bookmark.
47   DCHECK_IMPLIES(
48       scanner_->current_.token == Token::UNINITIALIZED,
49       scanner_->current_.location.beg_pos == scanner_->next_.location.beg_pos);
50   bookmark_ = (scanner_->current_.token == Token::UNINITIALIZED)
51                   ? kBookmarkAtFirstPos
52                   : scanner_->location().beg_pos;
53 }
54 
Apply()55 void Scanner::BookmarkScope::Apply() {
56   DCHECK(HasBeenSet());  // Caller hasn't called SetBookmark.
57   if (bookmark_ == kBookmarkAtFirstPos) {
58     scanner_->SeekNext(0);
59   } else {
60     scanner_->SeekNext(bookmark_);
61     scanner_->Next();
62     DCHECK_EQ(scanner_->location().beg_pos, static_cast<int>(bookmark_));
63   }
64   bookmark_ = kBookmarkWasApplied;
65 }
66 
HasBeenSet()67 bool Scanner::BookmarkScope::HasBeenSet() {
68   return bookmark_ != kNoBookmark && bookmark_ != kBookmarkWasApplied;
69 }
70 
HasBeenApplied()71 bool Scanner::BookmarkScope::HasBeenApplied() {
72   return bookmark_ == kBookmarkWasApplied;
73 }
74 
75 // ----------------------------------------------------------------------------
76 // Scanner
77 
Scanner(UnicodeCache * unicode_cache)78 Scanner::Scanner(UnicodeCache* unicode_cache)
79     : unicode_cache_(unicode_cache),
80       octal_pos_(Location::invalid()),
81       decimal_with_leading_zero_pos_(Location::invalid()),
82       found_html_comment_(false) {
83 }
84 
85 
Initialize(Utf16CharacterStream * source)86 void Scanner::Initialize(Utf16CharacterStream* source) {
87   source_ = source;
88   // Need to capture identifiers in order to recognize "get" and "set"
89   // in object literals.
90   Init();
91   // Skip initial whitespace allowing HTML comment ends just like
92   // after a newline and scan first token.
93   has_line_terminator_before_next_ = true;
94   SkipWhiteSpace();
95   Scan();
96 }
97 
98 template <bool capture_raw, bool unicode>
ScanHexNumber(int expected_length)99 uc32 Scanner::ScanHexNumber(int expected_length) {
100   DCHECK(expected_length <= 4);  // prevent overflow
101 
102   int begin = source_pos() - 2;
103   uc32 x = 0;
104   for (int i = 0; i < expected_length; i++) {
105     int d = HexValue(c0_);
106     if (d < 0) {
107       ReportScannerError(Location(begin, begin + expected_length + 2),
108                          unicode
109                              ? MessageTemplate::kInvalidUnicodeEscapeSequence
110                              : MessageTemplate::kInvalidHexEscapeSequence);
111       return -1;
112     }
113     x = x * 16 + d;
114     Advance<capture_raw>();
115   }
116 
117   return x;
118 }
119 
120 template <bool capture_raw>
ScanUnlimitedLengthHexNumber(int max_value,int beg_pos)121 uc32 Scanner::ScanUnlimitedLengthHexNumber(int max_value, int beg_pos) {
122   uc32 x = 0;
123   int d = HexValue(c0_);
124   if (d < 0) return -1;
125 
126   while (d >= 0) {
127     x = x * 16 + d;
128     if (x > max_value) {
129       ReportScannerError(Location(beg_pos, source_pos() + 1),
130                          MessageTemplate::kUndefinedUnicodeCodePoint);
131       return -1;
132     }
133     Advance<capture_raw>();
134     d = HexValue(c0_);
135   }
136 
137   return x;
138 }
139 
140 
141 // Ensure that tokens can be stored in a byte.
142 STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
143 
144 // Table of one-character tokens, by character (0x00..0x7f only).
145 static const byte one_char_tokens[] = {
146   Token::ILLEGAL,
147   Token::ILLEGAL,
148   Token::ILLEGAL,
149   Token::ILLEGAL,
150   Token::ILLEGAL,
151   Token::ILLEGAL,
152   Token::ILLEGAL,
153   Token::ILLEGAL,
154   Token::ILLEGAL,
155   Token::ILLEGAL,
156   Token::ILLEGAL,
157   Token::ILLEGAL,
158   Token::ILLEGAL,
159   Token::ILLEGAL,
160   Token::ILLEGAL,
161   Token::ILLEGAL,
162   Token::ILLEGAL,
163   Token::ILLEGAL,
164   Token::ILLEGAL,
165   Token::ILLEGAL,
166   Token::ILLEGAL,
167   Token::ILLEGAL,
168   Token::ILLEGAL,
169   Token::ILLEGAL,
170   Token::ILLEGAL,
171   Token::ILLEGAL,
172   Token::ILLEGAL,
173   Token::ILLEGAL,
174   Token::ILLEGAL,
175   Token::ILLEGAL,
176   Token::ILLEGAL,
177   Token::ILLEGAL,
178   Token::ILLEGAL,
179   Token::ILLEGAL,
180   Token::ILLEGAL,
181   Token::ILLEGAL,
182   Token::ILLEGAL,
183   Token::ILLEGAL,
184   Token::ILLEGAL,
185   Token::ILLEGAL,
186   Token::LPAREN,       // 0x28
187   Token::RPAREN,       // 0x29
188   Token::ILLEGAL,
189   Token::ILLEGAL,
190   Token::COMMA,        // 0x2c
191   Token::ILLEGAL,
192   Token::ILLEGAL,
193   Token::ILLEGAL,
194   Token::ILLEGAL,
195   Token::ILLEGAL,
196   Token::ILLEGAL,
197   Token::ILLEGAL,
198   Token::ILLEGAL,
199   Token::ILLEGAL,
200   Token::ILLEGAL,
201   Token::ILLEGAL,
202   Token::ILLEGAL,
203   Token::ILLEGAL,
204   Token::COLON,        // 0x3a
205   Token::SEMICOLON,    // 0x3b
206   Token::ILLEGAL,
207   Token::ILLEGAL,
208   Token::ILLEGAL,
209   Token::CONDITIONAL,  // 0x3f
210   Token::ILLEGAL,
211   Token::ILLEGAL,
212   Token::ILLEGAL,
213   Token::ILLEGAL,
214   Token::ILLEGAL,
215   Token::ILLEGAL,
216   Token::ILLEGAL,
217   Token::ILLEGAL,
218   Token::ILLEGAL,
219   Token::ILLEGAL,
220   Token::ILLEGAL,
221   Token::ILLEGAL,
222   Token::ILLEGAL,
223   Token::ILLEGAL,
224   Token::ILLEGAL,
225   Token::ILLEGAL,
226   Token::ILLEGAL,
227   Token::ILLEGAL,
228   Token::ILLEGAL,
229   Token::ILLEGAL,
230   Token::ILLEGAL,
231   Token::ILLEGAL,
232   Token::ILLEGAL,
233   Token::ILLEGAL,
234   Token::ILLEGAL,
235   Token::ILLEGAL,
236   Token::ILLEGAL,
237   Token::LBRACK,     // 0x5b
238   Token::ILLEGAL,
239   Token::RBRACK,     // 0x5d
240   Token::ILLEGAL,
241   Token::ILLEGAL,
242   Token::ILLEGAL,
243   Token::ILLEGAL,
244   Token::ILLEGAL,
245   Token::ILLEGAL,
246   Token::ILLEGAL,
247   Token::ILLEGAL,
248   Token::ILLEGAL,
249   Token::ILLEGAL,
250   Token::ILLEGAL,
251   Token::ILLEGAL,
252   Token::ILLEGAL,
253   Token::ILLEGAL,
254   Token::ILLEGAL,
255   Token::ILLEGAL,
256   Token::ILLEGAL,
257   Token::ILLEGAL,
258   Token::ILLEGAL,
259   Token::ILLEGAL,
260   Token::ILLEGAL,
261   Token::ILLEGAL,
262   Token::ILLEGAL,
263   Token::ILLEGAL,
264   Token::ILLEGAL,
265   Token::ILLEGAL,
266   Token::ILLEGAL,
267   Token::ILLEGAL,
268   Token::ILLEGAL,
269   Token::LBRACE,       // 0x7b
270   Token::ILLEGAL,
271   Token::RBRACE,       // 0x7d
272   Token::BIT_NOT,      // 0x7e
273   Token::ILLEGAL
274 };
275 
276 
Next()277 Token::Value Scanner::Next() {
278   if (next_.token == Token::EOS) {
279     next_.location.beg_pos = current_.location.beg_pos;
280     next_.location.end_pos = current_.location.end_pos;
281   }
282   current_ = next_;
283   if (V8_UNLIKELY(next_next_.token != Token::UNINITIALIZED)) {
284     next_ = next_next_;
285     next_next_.token = Token::UNINITIALIZED;
286     has_line_terminator_before_next_ = has_line_terminator_after_next_;
287     return current_.token;
288   }
289   has_line_terminator_before_next_ = false;
290   has_multiline_comment_before_next_ = false;
291   if (static_cast<unsigned>(c0_) <= 0x7f) {
292     Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
293     if (token != Token::ILLEGAL) {
294       int pos = source_pos();
295       next_.token = token;
296       next_.location.beg_pos = pos;
297       next_.location.end_pos = pos + 1;
298       next_.literal_chars = nullptr;
299       next_.raw_literal_chars = nullptr;
300       Advance();
301       return current_.token;
302     }
303   }
304   Scan();
305   return current_.token;
306 }
307 
308 
PeekAhead()309 Token::Value Scanner::PeekAhead() {
310   DCHECK(next_.token != Token::DIV);
311   DCHECK(next_.token != Token::ASSIGN_DIV);
312 
313   if (next_next_.token != Token::UNINITIALIZED) {
314     return next_next_.token;
315   }
316   TokenDesc prev = current_;
317   bool has_line_terminator_before_next =
318       has_line_terminator_before_next_ || has_multiline_comment_before_next_;
319   Next();
320   has_line_terminator_after_next_ =
321       has_line_terminator_before_next_ || has_multiline_comment_before_next_;
322   has_line_terminator_before_next_ = has_line_terminator_before_next;
323   Token::Value ret = next_.token;
324   next_next_ = next_;
325   next_ = current_;
326   current_ = prev;
327   return ret;
328 }
329 
330 
331 // TODO(yangguo): check whether this is actually necessary.
IsLittleEndianByteOrderMark(uc32 c)332 static inline bool IsLittleEndianByteOrderMark(uc32 c) {
333   // The Unicode value U+FFFE is guaranteed never to be assigned as a
334   // Unicode character; this implies that in a Unicode context the
335   // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
336   // character expressed in little-endian byte order (since it could
337   // not be a U+FFFE character expressed in big-endian byte
338   // order). Nevertheless, we check for it to be compatible with
339   // Spidermonkey.
340   return c == 0xFFFE;
341 }
342 
SkipWhiteSpace()343 bool Scanner::SkipWhiteSpace() {
344   int start_position = source_pos();
345 
346   while (true) {
347     while (true) {
348       // Don't skip behind the end of input.
349       if (c0_ == kEndOfInput) break;
350 
351       // Advance as long as character is a WhiteSpace or LineTerminator.
352       // Remember if the latter is the case.
353       if (unicode_cache_->IsLineTerminator(c0_)) {
354         has_line_terminator_before_next_ = true;
355       } else if (!unicode_cache_->IsWhiteSpace(c0_) &&
356                  !IsLittleEndianByteOrderMark(c0_)) {
357         break;
358       }
359       Advance();
360     }
361 
362     // If there is an HTML comment end '-->' at the beginning of a
363     // line (with only whitespace in front of it), we treat the rest
364     // of the line as a comment. This is in line with the way
365     // SpiderMonkey handles it.
366     if (c0_ != '-' || !has_line_terminator_before_next_) break;
367 
368     Advance();
369     if (c0_ != '-') {
370       PushBack('-');  // undo Advance()
371       break;
372     }
373 
374     Advance();
375     if (c0_ != '>') {
376       PushBack2('-', '-');  // undo 2x Advance();
377       break;
378     }
379 
380     // Treat the rest of the line as a comment.
381     SkipSingleLineComment();
382   }
383 
384   // Return whether or not we skipped any characters.
385   return source_pos() != start_position;
386 }
387 
SkipSingleLineComment()388 Token::Value Scanner::SkipSingleLineComment() {
389   Advance();
390 
391   // The line terminator at the end of the line is not considered
392   // to be part of the single-line comment; it is recognized
393   // separately by the lexical grammar and becomes part of the
394   // stream of input elements for the syntactic grammar (see
395   // ECMA-262, section 7.4).
396   while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) {
397     Advance();
398   }
399 
400   return Token::WHITESPACE;
401 }
402 
403 
SkipSourceURLComment()404 Token::Value Scanner::SkipSourceURLComment() {
405   TryToParseSourceURLComment();
406   while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) {
407     Advance();
408   }
409 
410   return Token::WHITESPACE;
411 }
412 
413 
TryToParseSourceURLComment()414 void Scanner::TryToParseSourceURLComment() {
415   // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this
416   // function will just return if it cannot parse a magic comment.
417   if (c0_ == kEndOfInput || !unicode_cache_->IsWhiteSpace(c0_)) return;
418   Advance();
419   LiteralBuffer name;
420   while (c0_ != kEndOfInput &&
421          !unicode_cache_->IsWhiteSpaceOrLineTerminator(c0_) && c0_ != '=') {
422     name.AddChar(c0_);
423     Advance();
424   }
425   if (!name.is_one_byte()) return;
426   Vector<const uint8_t> name_literal = name.one_byte_literal();
427   LiteralBuffer* value;
428   if (name_literal == STATIC_CHAR_VECTOR("sourceURL")) {
429     value = &source_url_;
430   } else if (name_literal == STATIC_CHAR_VECTOR("sourceMappingURL")) {
431     value = &source_mapping_url_;
432   } else {
433     return;
434   }
435   if (c0_ != '=')
436     return;
437   Advance();
438   value->Reset();
439   while (c0_ != kEndOfInput && unicode_cache_->IsWhiteSpace(c0_)) {
440     Advance();
441   }
442   while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) {
443     // Disallowed characters.
444     if (c0_ == '"' || c0_ == '\'') {
445       value->Reset();
446       return;
447     }
448     if (unicode_cache_->IsWhiteSpace(c0_)) {
449       break;
450     }
451     value->AddChar(c0_);
452     Advance();
453   }
454   // Allow whitespace at the end.
455   while (c0_ != kEndOfInput && !unicode_cache_->IsLineTerminator(c0_)) {
456     if (!unicode_cache_->IsWhiteSpace(c0_)) {
457       value->Reset();
458       break;
459     }
460     Advance();
461   }
462 }
463 
464 
SkipMultiLineComment()465 Token::Value Scanner::SkipMultiLineComment() {
466   DCHECK(c0_ == '*');
467   Advance();
468 
469   while (c0_ != kEndOfInput) {
470     uc32 ch = c0_;
471     Advance();
472     if (c0_ != kEndOfInput && unicode_cache_->IsLineTerminator(ch)) {
473       // Following ECMA-262, section 7.4, a comment containing
474       // a newline will make the comment count as a line-terminator.
475       has_multiline_comment_before_next_ = true;
476     }
477     // If we have reached the end of the multi-line comment, we
478     // consume the '/' and insert a whitespace. This way all
479     // multi-line comments are treated as whitespace.
480     if (ch == '*' && c0_ == '/') {
481       c0_ = ' ';
482       return Token::WHITESPACE;
483     }
484   }
485 
486   // Unterminated multi-line comment.
487   return Token::ILLEGAL;
488 }
489 
ScanHtmlComment()490 Token::Value Scanner::ScanHtmlComment() {
491   // Check for <!-- comments.
492   DCHECK(c0_ == '!');
493   Advance();
494   if (c0_ != '-') {
495     PushBack('!');  // undo Advance()
496     return Token::LT;
497   }
498 
499   Advance();
500   if (c0_ != '-') {
501     PushBack2('-', '!');  // undo 2x Advance()
502     return Token::LT;
503   }
504 
505   found_html_comment_ = true;
506   return SkipSingleLineComment();
507 }
508 
Scan()509 void Scanner::Scan() {
510   next_.literal_chars = NULL;
511   next_.raw_literal_chars = NULL;
512   Token::Value token;
513   do {
514     // Remember the position of the next token
515     next_.location.beg_pos = source_pos();
516 
517     switch (c0_) {
518       case ' ':
519       case '\t':
520         Advance();
521         token = Token::WHITESPACE;
522         break;
523 
524       case '\n':
525         Advance();
526         has_line_terminator_before_next_ = true;
527         token = Token::WHITESPACE;
528         break;
529 
530       case '"': case '\'':
531         token = ScanString();
532         break;
533 
534       case '<':
535         // < <= << <<= <!--
536         Advance();
537         if (c0_ == '=') {
538           token = Select(Token::LTE);
539         } else if (c0_ == '<') {
540           token = Select('=', Token::ASSIGN_SHL, Token::SHL);
541         } else if (c0_ == '!') {
542           token = ScanHtmlComment();
543         } else {
544           token = Token::LT;
545         }
546         break;
547 
548       case '>':
549         // > >= >> >>= >>> >>>=
550         Advance();
551         if (c0_ == '=') {
552           token = Select(Token::GTE);
553         } else if (c0_ == '>') {
554           // >> >>= >>> >>>=
555           Advance();
556           if (c0_ == '=') {
557             token = Select(Token::ASSIGN_SAR);
558           } else if (c0_ == '>') {
559             token = Select('=', Token::ASSIGN_SHR, Token::SHR);
560           } else {
561             token = Token::SAR;
562           }
563         } else {
564           token = Token::GT;
565         }
566         break;
567 
568       case '=':
569         // = == === =>
570         Advance();
571         if (c0_ == '=') {
572           token = Select('=', Token::EQ_STRICT, Token::EQ);
573         } else if (c0_ == '>') {
574           token = Select(Token::ARROW);
575         } else {
576           token = Token::ASSIGN;
577         }
578         break;
579 
580       case '!':
581         // ! != !==
582         Advance();
583         if (c0_ == '=') {
584           token = Select('=', Token::NE_STRICT, Token::NE);
585         } else {
586           token = Token::NOT;
587         }
588         break;
589 
590       case '+':
591         // + ++ +=
592         Advance();
593         if (c0_ == '+') {
594           token = Select(Token::INC);
595         } else if (c0_ == '=') {
596           token = Select(Token::ASSIGN_ADD);
597         } else {
598           token = Token::ADD;
599         }
600         break;
601 
602       case '-':
603         // - -- --> -=
604         Advance();
605         if (c0_ == '-') {
606           Advance();
607           if (c0_ == '>' && HasAnyLineTerminatorBeforeNext()) {
608             // For compatibility with SpiderMonkey, we skip lines that
609             // start with an HTML comment end '-->'.
610             token = SkipSingleLineComment();
611           } else {
612             token = Token::DEC;
613           }
614         } else if (c0_ == '=') {
615           token = Select(Token::ASSIGN_SUB);
616         } else {
617           token = Token::SUB;
618         }
619         break;
620 
621       case '*':
622         // * *=
623         Advance();
624         if (c0_ == '*') {
625           token = Select('=', Token::ASSIGN_EXP, Token::EXP);
626         } else if (c0_ == '=') {
627           token = Select(Token::ASSIGN_MUL);
628         } else {
629           token = Token::MUL;
630         }
631         break;
632 
633       case '%':
634         // % %=
635         token = Select('=', Token::ASSIGN_MOD, Token::MOD);
636         break;
637 
638       case '/':
639         // /  // /* /=
640         Advance();
641         if (c0_ == '/') {
642           Advance();
643           if (c0_ == '#' || c0_ == '@') {
644             Advance();
645             token = SkipSourceURLComment();
646           } else {
647             PushBack(c0_);
648             token = SkipSingleLineComment();
649           }
650         } else if (c0_ == '*') {
651           token = SkipMultiLineComment();
652         } else if (c0_ == '=') {
653           token = Select(Token::ASSIGN_DIV);
654         } else {
655           token = Token::DIV;
656         }
657         break;
658 
659       case '&':
660         // & && &=
661         Advance();
662         if (c0_ == '&') {
663           token = Select(Token::AND);
664         } else if (c0_ == '=') {
665           token = Select(Token::ASSIGN_BIT_AND);
666         } else {
667           token = Token::BIT_AND;
668         }
669         break;
670 
671       case '|':
672         // | || |=
673         Advance();
674         if (c0_ == '|') {
675           token = Select(Token::OR);
676         } else if (c0_ == '=') {
677           token = Select(Token::ASSIGN_BIT_OR);
678         } else {
679           token = Token::BIT_OR;
680         }
681         break;
682 
683       case '^':
684         // ^ ^=
685         token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
686         break;
687 
688       case '.':
689         // . Number
690         Advance();
691         if (IsDecimalDigit(c0_)) {
692           token = ScanNumber(true);
693         } else {
694           token = Token::PERIOD;
695           if (c0_ == '.') {
696             Advance();
697             if (c0_ == '.') {
698               Advance();
699               token = Token::ELLIPSIS;
700             } else {
701               PushBack('.');
702             }
703           }
704         }
705         break;
706 
707       case ':':
708         token = Select(Token::COLON);
709         break;
710 
711       case ';':
712         token = Select(Token::SEMICOLON);
713         break;
714 
715       case ',':
716         token = Select(Token::COMMA);
717         break;
718 
719       case '(':
720         token = Select(Token::LPAREN);
721         break;
722 
723       case ')':
724         token = Select(Token::RPAREN);
725         break;
726 
727       case '[':
728         token = Select(Token::LBRACK);
729         break;
730 
731       case ']':
732         token = Select(Token::RBRACK);
733         break;
734 
735       case '{':
736         token = Select(Token::LBRACE);
737         break;
738 
739       case '}':
740         token = Select(Token::RBRACE);
741         break;
742 
743       case '?':
744         token = Select(Token::CONDITIONAL);
745         break;
746 
747       case '~':
748         token = Select(Token::BIT_NOT);
749         break;
750 
751       case '`':
752         token = ScanTemplateStart();
753         break;
754 
755       default:
756         if (c0_ == kEndOfInput) {
757           token = Token::EOS;
758         } else if (unicode_cache_->IsIdentifierStart(c0_)) {
759           token = ScanIdentifierOrKeyword();
760         } else if (IsDecimalDigit(c0_)) {
761           token = ScanNumber(false);
762         } else if (SkipWhiteSpace()) {
763           token = Token::WHITESPACE;
764         } else {
765           token = Select(Token::ILLEGAL);
766         }
767         break;
768     }
769 
770     // Continue scanning for tokens as long as we're just skipping
771     // whitespace.
772   } while (token == Token::WHITESPACE);
773 
774   next_.location.end_pos = source_pos();
775   next_.token = token;
776 
777 #ifdef DEBUG
778   SanityCheckTokenDesc(current_);
779   SanityCheckTokenDesc(next_);
780   SanityCheckTokenDesc(next_next_);
781 #endif
782 }
783 
#ifdef DEBUG
// Debug-only consistency check of the literal buffers attached to |token|:
// - UNINITIALIZED: nothing is checked.
// - TEMPLATE_SPAN / TEMPLATE_TAIL: both the literal and the raw literal must
//   be present.
// - identifiers, numbers, strings, regexps and escaped / reserved words: a
//   literal must be present, a raw literal must not.
// - every other token: neither may be present.
void Scanner::SanityCheckTokenDesc(const TokenDesc& token) const {
  switch (token.token) {
    case Token::UNINITIALIZED:
      // The literal pointers and other members may hold garbage here.
      return;

    case Token::TEMPLATE_SPAN:
    case Token::TEMPLATE_TAIL:
      DCHECK_NOT_NULL(token.raw_literal_chars);
      DCHECK_NOT_NULL(token.literal_chars);
      return;

    case Token::ESCAPED_KEYWORD:
    case Token::ESCAPED_STRICT_RESERVED_WORD:
    case Token::FUTURE_STRICT_RESERVED_WORD:
    case Token::IDENTIFIER:
    case Token::NUMBER:
    case Token::REGEXP_LITERAL:
    case Token::SMI:
    case Token::STRING:
      DCHECK_NOT_NULL(token.literal_chars);
      DCHECK_NULL(token.raw_literal_chars);
      return;

    default:
      DCHECK_NULL(token.literal_chars);
      DCHECK_NULL(token.raw_literal_chars);
      return;
  }
}
#endif  // DEBUG
820 
SeekForward(int pos)821 void Scanner::SeekForward(int pos) {
822   // After this call, we will have the token at the given position as
823   // the "next" token. The "current" token will be invalid.
824   if (pos == next_.location.beg_pos) return;
825   int current_pos = source_pos();
826   DCHECK_EQ(next_.location.end_pos, current_pos);
827   // Positions inside the lookahead token aren't supported.
828   DCHECK(pos >= current_pos);
829   if (pos != current_pos) {
830     source_->Seek(pos);
831     Advance();
832     // This function is only called to seek to the location
833     // of the end of a function (at the "}" token). It doesn't matter
834     // whether there was a line terminator in the part we skip.
835     has_line_terminator_before_next_ = false;
836     has_multiline_comment_before_next_ = false;
837   }
838   Scan();
839 }
840 
841 
842 template <bool capture_raw, bool in_template_literal>
ScanEscape()843 bool Scanner::ScanEscape() {
844   uc32 c = c0_;
845   Advance<capture_raw>();
846 
847   // Skip escaped newlines.
848   if (!in_template_literal && c0_ != kEndOfInput &&
849       unicode_cache_->IsLineTerminator(c)) {
850     // Allow CR+LF newlines in multiline string literals.
851     if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance<capture_raw>();
852     // Allow LF+CR newlines in multiline string literals.
853     if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance<capture_raw>();
854     return true;
855   }
856 
857   switch (c) {
858     case '\'':  // fall through
859     case '"' :  // fall through
860     case '\\': break;
861     case 'b' : c = '\b'; break;
862     case 'f' : c = '\f'; break;
863     case 'n' : c = '\n'; break;
864     case 'r' : c = '\r'; break;
865     case 't' : c = '\t'; break;
866     case 'u' : {
867       c = ScanUnicodeEscape<capture_raw>();
868       if (c < 0) return false;
869       break;
870     }
871     case 'v':
872       c = '\v';
873       break;
874     case 'x': {
875       c = ScanHexNumber<capture_raw>(2);
876       if (c < 0) return false;
877       break;
878     }
879     case '0':  // Fall through.
880     case '1':  // fall through
881     case '2':  // fall through
882     case '3':  // fall through
883     case '4':  // fall through
884     case '5':  // fall through
885     case '6':  // fall through
886     case '7':
887       c = ScanOctalEscape<capture_raw>(c, 2);
888       break;
889   }
890 
891   // According to ECMA-262, section 7.8.4, characters not covered by the
892   // above cases should be illegal, but they are commonly handled as
893   // non-escaped characters by JS VMs.
894   AddLiteralChar(c);
895   return true;
896 }
897 
898 
899 // Octal escapes of the forms '\0xx' and '\xxx' are not a part of
900 // ECMA-262. Other JS VMs support them.
901 template <bool capture_raw>
ScanOctalEscape(uc32 c,int length)902 uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
903   uc32 x = c - '0';
904   int i = 0;
905   for (; i < length; i++) {
906     int d = c0_ - '0';
907     if (d < 0 || d > 7) break;
908     int nx = x * 8 + d;
909     if (nx >= 256) break;
910     x = nx;
911     Advance<capture_raw>();
912   }
913   // Anything except '\0' is an octal escape sequence, illegal in strict mode.
914   // Remember the position of octal escape sequences so that an error
915   // can be reported later (in strict mode).
916   // We don't report the error immediately, because the octal escape can
917   // occur before the "use strict" directive.
918   if (c != '0' || i > 0) {
919     octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
920   }
921   return x;
922 }
923 
924 
ScanString()925 Token::Value Scanner::ScanString() {
926   uc32 quote = c0_;
927   Advance<false, false>();  // consume quote
928 
929   LiteralScope literal(this);
930   while (true) {
931     if (c0_ > kMaxAscii) {
932       HandleLeadSurrogate();
933       break;
934     }
935     if (c0_ == kEndOfInput || c0_ == '\n' || c0_ == '\r') return Token::ILLEGAL;
936     if (c0_ == quote) {
937       literal.Complete();
938       Advance<false, false>();
939       return Token::STRING;
940     }
941     char c = static_cast<char>(c0_);
942     if (c == '\\') break;
943     Advance<false, false>();
944     AddLiteralChar(c);
945   }
946 
947   while (c0_ != quote && c0_ != kEndOfInput &&
948          !unicode_cache_->IsLineTerminator(c0_)) {
949     uc32 c = c0_;
950     Advance();
951     if (c == '\\') {
952       if (c0_ == kEndOfInput || !ScanEscape<false, false>()) {
953         return Token::ILLEGAL;
954       }
955     } else {
956       AddLiteralChar(c);
957     }
958   }
959   if (c0_ != quote) return Token::ILLEGAL;
960   literal.Complete();
961 
962   Advance();  // consume quote
963   return Token::STRING;
964 }
965 
966 
ScanTemplateSpan()967 Token::Value Scanner::ScanTemplateSpan() {
968   // When scanning a TemplateSpan, we are looking for the following construct:
969   // TEMPLATE_SPAN ::
970   //     ` LiteralChars* ${
971   //   | } LiteralChars* ${
972   //
973   // TEMPLATE_TAIL ::
974   //     ` LiteralChars* `
975   //   | } LiteralChar* `
976   //
977   // A TEMPLATE_SPAN should always be followed by an Expression, while a
978   // TEMPLATE_TAIL terminates a TemplateLiteral and does not need to be
979   // followed by an Expression.
980 
981   Token::Value result = Token::TEMPLATE_SPAN;
982   LiteralScope literal(this);
983   StartRawLiteral();
984   const bool capture_raw = true;
985   const bool in_template_literal = true;
986   while (true) {
987     uc32 c = c0_;
988     Advance<capture_raw>();
989     if (c == '`') {
990       result = Token::TEMPLATE_TAIL;
991       ReduceRawLiteralLength(1);
992       break;
993     } else if (c == '$' && c0_ == '{') {
994       Advance<capture_raw>();  // Consume '{'
995       ReduceRawLiteralLength(2);
996       break;
997     } else if (c == '\\') {
998       if (c0_ != kEndOfInput && unicode_cache_->IsLineTerminator(c0_)) {
999         // The TV of LineContinuation :: \ LineTerminatorSequence is the empty
1000         // code unit sequence.
1001         uc32 lastChar = c0_;
1002         Advance<capture_raw>();
1003         if (lastChar == '\r') {
1004           ReduceRawLiteralLength(1);  // Remove \r
1005           if (c0_ == '\n') {
1006             Advance<capture_raw>();  // Adds \n
1007           } else {
1008             AddRawLiteralChar('\n');
1009           }
1010         }
1011       } else if (!ScanEscape<capture_raw, in_template_literal>()) {
1012         return Token::ILLEGAL;
1013       }
1014     } else if (c < 0) {
1015       // Unterminated template literal
1016       PushBack(c);
1017       break;
1018     } else {
1019       // The TRV of LineTerminatorSequence :: <CR> is the CV 0x000A.
1020       // The TRV of LineTerminatorSequence :: <CR><LF> is the sequence
1021       // consisting of the CV 0x000A.
1022       if (c == '\r') {
1023         ReduceRawLiteralLength(1);  // Remove \r
1024         if (c0_ == '\n') {
1025           Advance<capture_raw>();  // Adds \n
1026         } else {
1027           AddRawLiteralChar('\n');
1028         }
1029         c = '\n';
1030       }
1031       AddLiteralChar(c);
1032     }
1033   }
1034   literal.Complete();
1035   next_.location.end_pos = source_pos();
1036   next_.token = result;
1037   return result;
1038 }
1039 
1040 
ScanTemplateStart()1041 Token::Value Scanner::ScanTemplateStart() {
1042   DCHECK(next_next_.token == Token::UNINITIALIZED);
1043   DCHECK(c0_ == '`');
1044   next_.location.beg_pos = source_pos();
1045   Advance();  // Consume `
1046   return ScanTemplateSpan();
1047 }
1048 
1049 
ScanTemplateContinuation()1050 Token::Value Scanner::ScanTemplateContinuation() {
1051   DCHECK_EQ(next_.token, Token::RBRACE);
1052   next_.location.beg_pos = source_pos() - 1;  // We already consumed }
1053   return ScanTemplateSpan();
1054 }
1055 
1056 
ScanDecimalDigits()1057 void Scanner::ScanDecimalDigits() {
1058   while (IsDecimalDigit(c0_))
1059     AddLiteralCharAdvance();
1060 }
1061 
1062 
ScanNumber(bool seen_period)1063 Token::Value Scanner::ScanNumber(bool seen_period) {
1064   DCHECK(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
1065 
1066   enum {
1067     DECIMAL,
1068     DECIMAL_WITH_LEADING_ZERO,
1069     HEX,
1070     OCTAL,
1071     IMPLICIT_OCTAL,
1072     BINARY
1073   } kind = DECIMAL;
1074 
1075   LiteralScope literal(this);
1076   bool at_start = !seen_period;
1077   int start_pos = source_pos();  // For reporting octal positions.
1078   if (seen_period) {
1079     // we have already seen a decimal point of the float
1080     AddLiteralChar('.');
1081     ScanDecimalDigits();  // we know we have at least one digit
1082 
1083   } else {
1084     // if the first character is '0' we must check for octals and hex
1085     if (c0_ == '0') {
1086       AddLiteralCharAdvance();
1087 
1088       // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or
1089       // an octal number.
1090       if (c0_ == 'x' || c0_ == 'X') {
1091         // hex number
1092         kind = HEX;
1093         AddLiteralCharAdvance();
1094         if (!IsHexDigit(c0_)) {
1095           // we must have at least one hex digit after 'x'/'X'
1096           return Token::ILLEGAL;
1097         }
1098         while (IsHexDigit(c0_)) {
1099           AddLiteralCharAdvance();
1100         }
1101       } else if (c0_ == 'o' || c0_ == 'O') {
1102         kind = OCTAL;
1103         AddLiteralCharAdvance();
1104         if (!IsOctalDigit(c0_)) {
1105           // we must have at least one octal digit after 'o'/'O'
1106           return Token::ILLEGAL;
1107         }
1108         while (IsOctalDigit(c0_)) {
1109           AddLiteralCharAdvance();
1110         }
1111       } else if (c0_ == 'b' || c0_ == 'B') {
1112         kind = BINARY;
1113         AddLiteralCharAdvance();
1114         if (!IsBinaryDigit(c0_)) {
1115           // we must have at least one binary digit after 'b'/'B'
1116           return Token::ILLEGAL;
1117         }
1118         while (IsBinaryDigit(c0_)) {
1119           AddLiteralCharAdvance();
1120         }
1121       } else if ('0' <= c0_ && c0_ <= '7') {
1122         // (possible) octal number
1123         kind = IMPLICIT_OCTAL;
1124         while (true) {
1125           if (c0_ == '8' || c0_ == '9') {
1126             at_start = false;
1127             kind = DECIMAL_WITH_LEADING_ZERO;
1128             break;
1129           }
1130           if (c0_  < '0' || '7'  < c0_) {
1131             // Octal literal finished.
1132             octal_pos_ = Location(start_pos, source_pos());
1133             break;
1134           }
1135           AddLiteralCharAdvance();
1136         }
1137       } else if (c0_ == '8' || c0_ == '9') {
1138         kind = DECIMAL_WITH_LEADING_ZERO;
1139       }
1140     }
1141 
1142     // Parse decimal digits and allow trailing fractional part.
1143     if (kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO) {
1144       if (at_start) {
1145         uint64_t value = 0;
1146         while (IsDecimalDigit(c0_)) {
1147           value = 10 * value + (c0_ - '0');
1148 
1149           uc32 first_char = c0_;
1150           Advance<false, false>();
1151           AddLiteralChar(first_char);
1152         }
1153 
1154         if (next_.literal_chars->one_byte_literal().length() <= 10 &&
1155             value <= Smi::kMaxValue && c0_ != '.' && c0_ != 'e' && c0_ != 'E') {
1156           next_.smi_value_ = static_cast<uint32_t>(value);
1157           literal.Complete();
1158           HandleLeadSurrogate();
1159 
1160           if (kind == DECIMAL_WITH_LEADING_ZERO)
1161             decimal_with_leading_zero_pos_ = Location(start_pos, source_pos());
1162           return Token::SMI;
1163         }
1164         HandleLeadSurrogate();
1165       }
1166 
1167       ScanDecimalDigits();  // optional
1168       if (c0_ == '.') {
1169         AddLiteralCharAdvance();
1170         ScanDecimalDigits();  // optional
1171       }
1172     }
1173   }
1174 
1175   // scan exponent, if any
1176   if (c0_ == 'e' || c0_ == 'E') {
1177     DCHECK(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
1178     if (!(kind == DECIMAL || kind == DECIMAL_WITH_LEADING_ZERO))
1179       return Token::ILLEGAL;
1180     // scan exponent
1181     AddLiteralCharAdvance();
1182     if (c0_ == '+' || c0_ == '-')
1183       AddLiteralCharAdvance();
1184     if (!IsDecimalDigit(c0_)) {
1185       // we must have at least one decimal digit after 'e'/'E'
1186       return Token::ILLEGAL;
1187     }
1188     ScanDecimalDigits();
1189   }
1190 
1191   // The source character immediately following a numeric literal must
1192   // not be an identifier start or a decimal digit; see ECMA-262
1193   // section 7.8.3, page 17 (note that we read only one decimal digit
1194   // if the value is 0).
1195   if (IsDecimalDigit(c0_) ||
1196       (c0_ != kEndOfInput && unicode_cache_->IsIdentifierStart(c0_)))
1197     return Token::ILLEGAL;
1198 
1199   literal.Complete();
1200 
1201   if (kind == DECIMAL_WITH_LEADING_ZERO)
1202     decimal_with_leading_zero_pos_ = Location(start_pos, source_pos());
1203   return Token::NUMBER;
1204 }
1205 
1206 
ScanIdentifierUnicodeEscape()1207 uc32 Scanner::ScanIdentifierUnicodeEscape() {
1208   Advance();
1209   if (c0_ != 'u') return -1;
1210   Advance();
1211   return ScanUnicodeEscape<false>();
1212 }
1213 
1214 
1215 template <bool capture_raw>
ScanUnicodeEscape()1216 uc32 Scanner::ScanUnicodeEscape() {
1217   // Accept both \uxxxx and \u{xxxxxx}. In the latter case, the number of
1218   // hex digits between { } is arbitrary. \ and u have already been read.
1219   if (c0_ == '{') {
1220     int begin = source_pos() - 2;
1221     Advance<capture_raw>();
1222     uc32 cp = ScanUnlimitedLengthHexNumber<capture_raw>(0x10ffff, begin);
1223     if (cp < 0 || c0_ != '}') {
1224       ReportScannerError(source_pos(),
1225                          MessageTemplate::kInvalidUnicodeEscapeSequence);
1226       return -1;
1227     }
1228     Advance<capture_raw>();
1229     return cp;
1230   }
1231   const bool unicode = true;
1232   return ScanHexNumber<capture_raw, unicode>(4);
1233 }
1234 
1235 
1236 // ----------------------------------------------------------------------------
1237 // Keyword Matcher
1238 
// X-macro table of the keyword spellings recognized by
// KeywordOrIdentifierToken(), grouped by first character.
//   KEYWORD_GROUP(ch)        opens the group of keywords starting with ch.
//   KEYWORD(spelling, token) maps one spelling to the token it produces.
// Several spellings share Token::FUTURE_STRICT_RESERVED_WORD. Every spelling
// must be between 2 and 10 characters long; KeywordOrIdentifierToken()
// checks this with STATIC_ASSERTs.
#define KEYWORDS(KEYWORD_GROUP, KEYWORD)                    \
  KEYWORD_GROUP('a')                                        \
  KEYWORD("async", Token::ASYNC)                            \
  KEYWORD("await", Token::AWAIT)                            \
  KEYWORD_GROUP('b')                                        \
  KEYWORD("break", Token::BREAK)                            \
  KEYWORD_GROUP('c')                                        \
  KEYWORD("case", Token::CASE)                              \
  KEYWORD("catch", Token::CATCH)                            \
  KEYWORD("class", Token::CLASS)                            \
  KEYWORD("const", Token::CONST)                            \
  KEYWORD("continue", Token::CONTINUE)                      \
  KEYWORD_GROUP('d')                                        \
  KEYWORD("debugger", Token::DEBUGGER)                      \
  KEYWORD("default", Token::DEFAULT)                        \
  KEYWORD("delete", Token::DELETE)                          \
  KEYWORD("do", Token::DO)                                  \
  KEYWORD_GROUP('e')                                        \
  KEYWORD("else", Token::ELSE)                              \
  KEYWORD("enum", Token::ENUM)                              \
  KEYWORD("export", Token::EXPORT)                          \
  KEYWORD("extends", Token::EXTENDS)                        \
  KEYWORD_GROUP('f')                                        \
  KEYWORD("false", Token::FALSE_LITERAL)                    \
  KEYWORD("finally", Token::FINALLY)                        \
  KEYWORD("for", Token::FOR)                                \
  KEYWORD("function", Token::FUNCTION)                      \
  KEYWORD_GROUP('i')                                        \
  KEYWORD("if", Token::IF)                                  \
  KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD("import", Token::IMPORT)                          \
  KEYWORD("in", Token::IN)                                  \
  KEYWORD("instanceof", Token::INSTANCEOF)                  \
  KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)  \
  KEYWORD_GROUP('l')                                        \
  KEYWORD("let", Token::LET)                                \
  KEYWORD_GROUP('n')                                        \
  KEYWORD("new", Token::NEW)                                \
  KEYWORD("null", Token::NULL_LITERAL)                      \
  KEYWORD_GROUP('p')                                        \
  KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)    \
  KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)    \
  KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)  \
  KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)     \
  KEYWORD_GROUP('r')                                        \
  KEYWORD("return", Token::RETURN)                          \
  KEYWORD_GROUP('s')                                        \
  KEYWORD("static", Token::STATIC)                          \
  KEYWORD("super", Token::SUPER)                            \
  KEYWORD("switch", Token::SWITCH)                          \
  KEYWORD_GROUP('t')                                        \
  KEYWORD("this", Token::THIS)                              \
  KEYWORD("throw", Token::THROW)                            \
  KEYWORD("true", Token::TRUE_LITERAL)                      \
  KEYWORD("try", Token::TRY)                                \
  KEYWORD("typeof", Token::TYPEOF)                          \
  KEYWORD_GROUP('v')                                        \
  KEYWORD("var", Token::VAR)                                \
  KEYWORD("void", Token::VOID)                              \
  KEYWORD_GROUP('w')                                        \
  KEYWORD("while", Token::WHILE)                            \
  KEYWORD("with", Token::WITH)                              \
  KEYWORD_GROUP('y')                                        \
  KEYWORD("yield", Token::YIELD)
1303 
KeywordOrIdentifierToken(const uint8_t * input,int input_length)1304 static Token::Value KeywordOrIdentifierToken(const uint8_t* input,
1305                                              int input_length) {
1306   DCHECK(input_length >= 1);
1307   const int kMinLength = 2;
1308   const int kMaxLength = 10;
1309   if (input_length < kMinLength || input_length > kMaxLength) {
1310     return Token::IDENTIFIER;
1311   }
1312   switch (input[0]) {
1313     default:
1314 #define KEYWORD_GROUP_CASE(ch)                                \
1315       break;                                                  \
1316     case ch:
1317 #define KEYWORD(keyword, token)                                     \
1318   {                                                                 \
1319     /* 'keyword' is a char array, so sizeof(keyword) is */          \
1320     /* strlen(keyword) plus 1 for the NUL char. */                  \
1321     const int keyword_length = sizeof(keyword) - 1;                 \
1322     STATIC_ASSERT(keyword_length >= kMinLength);                    \
1323     STATIC_ASSERT(keyword_length <= kMaxLength);                    \
1324     if (input_length == keyword_length && input[1] == keyword[1] && \
1325         (keyword_length <= 2 || input[2] == keyword[2]) &&          \
1326         (keyword_length <= 3 || input[3] == keyword[3]) &&          \
1327         (keyword_length <= 4 || input[4] == keyword[4]) &&          \
1328         (keyword_length <= 5 || input[5] == keyword[5]) &&          \
1329         (keyword_length <= 6 || input[6] == keyword[6]) &&          \
1330         (keyword_length <= 7 || input[7] == keyword[7]) &&          \
1331         (keyword_length <= 8 || input[8] == keyword[8]) &&          \
1332         (keyword_length <= 9 || input[9] == keyword[9])) {          \
1333       return token;                                                 \
1334     }                                                               \
1335   }
1336     KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
1337   }
1338   return Token::IDENTIFIER;
1339 }
1340 
1341 
IdentifierIsFutureStrictReserved(const AstRawString * string) const1342 bool Scanner::IdentifierIsFutureStrictReserved(
1343     const AstRawString* string) const {
1344   // Keywords are always 1-byte strings.
1345   if (!string->is_one_byte()) return false;
1346   if (string->IsOneByteEqualTo("let") || string->IsOneByteEqualTo("static") ||
1347       string->IsOneByteEqualTo("yield")) {
1348     return true;
1349   }
1350   return Token::FUTURE_STRICT_RESERVED_WORD ==
1351          KeywordOrIdentifierToken(string->raw_data(), string->length());
1352 }
1353 
1354 
ScanIdentifierOrKeyword()1355 Token::Value Scanner::ScanIdentifierOrKeyword() {
1356   DCHECK(unicode_cache_->IsIdentifierStart(c0_));
1357   LiteralScope literal(this);
1358   if (IsInRange(c0_, 'a', 'z')) {
1359     do {
1360       char first_char = static_cast<char>(c0_);
1361       Advance<false, false>();
1362       AddLiteralChar(first_char);
1363     } while (IsInRange(c0_, 'a', 'z'));
1364 
1365     if (IsDecimalDigit(c0_) || IsInRange(c0_, 'A', 'Z') || c0_ == '_' ||
1366         c0_ == '$') {
1367       // Identifier starting with lowercase.
1368       char first_char = static_cast<char>(c0_);
1369       Advance<false, false>();
1370       AddLiteralChar(first_char);
1371       while (IsAsciiIdentifier(c0_)) {
1372         char first_char = static_cast<char>(c0_);
1373         Advance<false, false>();
1374         AddLiteralChar(first_char);
1375       }
1376       if (c0_ <= kMaxAscii && c0_ != '\\') {
1377         literal.Complete();
1378         return Token::IDENTIFIER;
1379       }
1380     } else if (c0_ <= kMaxAscii && c0_ != '\\') {
1381       // Only a-z+: could be a keyword or identifier.
1382       Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
1383       Token::Value token =
1384           KeywordOrIdentifierToken(chars.start(), chars.length());
1385       if (token == Token::IDENTIFIER ||
1386           token == Token::FUTURE_STRICT_RESERVED_WORD)
1387         literal.Complete();
1388       return token;
1389     }
1390 
1391     HandleLeadSurrogate();
1392   } else if (IsInRange(c0_, 'A', 'Z') || c0_ == '_' || c0_ == '$') {
1393     do {
1394       char first_char = static_cast<char>(c0_);
1395       Advance<false, false>();
1396       AddLiteralChar(first_char);
1397     } while (IsAsciiIdentifier(c0_));
1398 
1399     if (c0_ <= kMaxAscii && c0_ != '\\') {
1400       literal.Complete();
1401       return Token::IDENTIFIER;
1402     }
1403 
1404     HandleLeadSurrogate();
1405   } else if (c0_ == '\\') {
1406     // Scan identifier start character.
1407     uc32 c = ScanIdentifierUnicodeEscape();
1408     // Only allow legal identifier start characters.
1409     if (c < 0 ||
1410         c == '\\' ||  // No recursive escapes.
1411         !unicode_cache_->IsIdentifierStart(c)) {
1412       return Token::ILLEGAL;
1413     }
1414     AddLiteralChar(c);
1415     return ScanIdentifierSuffix(&literal, true);
1416   } else {
1417     uc32 first_char = c0_;
1418     Advance();
1419     AddLiteralChar(first_char);
1420   }
1421 
1422   // Scan the rest of the identifier characters.
1423   while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
1424     if (c0_ != '\\') {
1425       uc32 next_char = c0_;
1426       Advance();
1427       AddLiteralChar(next_char);
1428       continue;
1429     }
1430     // Fallthrough if no longer able to complete keyword.
1431     return ScanIdentifierSuffix(&literal, false);
1432   }
1433 
1434   if (next_.literal_chars->is_one_byte()) {
1435     Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
1436     Token::Value token =
1437         KeywordOrIdentifierToken(chars.start(), chars.length());
1438     if (token == Token::IDENTIFIER) literal.Complete();
1439     return token;
1440   }
1441   literal.Complete();
1442   return Token::IDENTIFIER;
1443 }
1444 
1445 
ScanIdentifierSuffix(LiteralScope * literal,bool escaped)1446 Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal,
1447                                            bool escaped) {
1448   // Scan the rest of the identifier characters.
1449   while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
1450     if (c0_ == '\\') {
1451       uc32 c = ScanIdentifierUnicodeEscape();
1452       escaped = true;
1453       // Only allow legal identifier part characters.
1454       if (c < 0 ||
1455           c == '\\' ||
1456           !unicode_cache_->IsIdentifierPart(c)) {
1457         return Token::ILLEGAL;
1458       }
1459       AddLiteralChar(c);
1460     } else {
1461       AddLiteralChar(c0_);
1462       Advance();
1463     }
1464   }
1465   literal->Complete();
1466 
1467   if (escaped && next_.literal_chars->is_one_byte()) {
1468     Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal();
1469     Token::Value token =
1470         KeywordOrIdentifierToken(chars.start(), chars.length());
1471     /* TODO(adamk): YIELD should be handled specially. */
1472     if (token == Token::IDENTIFIER) {
1473       return Token::IDENTIFIER;
1474     } else if (token == Token::FUTURE_STRICT_RESERVED_WORD ||
1475                token == Token::LET || token == Token::STATIC) {
1476       return Token::ESCAPED_STRICT_RESERVED_WORD;
1477     } else {
1478       return Token::ESCAPED_KEYWORD;
1479     }
1480   }
1481   return Token::IDENTIFIER;
1482 }
1483 
ScanRegExpPattern()1484 bool Scanner::ScanRegExpPattern() {
1485   DCHECK(next_next_.token == Token::UNINITIALIZED);
1486   DCHECK(next_.token == Token::DIV || next_.token == Token::ASSIGN_DIV);
1487 
1488   // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
1489   bool in_character_class = false;
1490   bool seen_equal = (next_.token == Token::ASSIGN_DIV);
1491 
1492   // Previous token is either '/' or '/=', in the second case, the
1493   // pattern starts at =.
1494   next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
1495   next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1496 
1497   // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1498   // the scanner should pass uninterpreted bodies to the RegExp
1499   // constructor.
1500   LiteralScope literal(this);
1501   if (seen_equal) {
1502     AddLiteralChar('=');
1503   }
1504 
1505   while (c0_ != '/' || in_character_class) {
1506     if (c0_ == kEndOfInput || unicode_cache_->IsLineTerminator(c0_))
1507       return false;
1508     if (c0_ == '\\') {  // Escape sequence.
1509       AddLiteralCharAdvance();
1510       if (c0_ == kEndOfInput || unicode_cache_->IsLineTerminator(c0_))
1511         return false;
1512       AddLiteralCharAdvance();
1513       // If the escape allows more characters, i.e., \x??, \u????, or \c?,
1514       // only "safe" characters are allowed (letters, digits, underscore),
1515       // otherwise the escape isn't valid and the invalid character has
1516       // its normal meaning. I.e., we can just continue scanning without
1517       // worrying whether the following characters are part of the escape
1518       // or not, since any '/', '\\' or '[' is guaranteed to not be part
1519       // of the escape sequence.
1520 
1521       // TODO(896): At some point, parse RegExps more throughly to capture
1522       // octal esacpes in strict mode.
1523     } else {  // Unescaped character.
1524       if (c0_ == '[') in_character_class = true;
1525       if (c0_ == ']') in_character_class = false;
1526       AddLiteralCharAdvance();
1527     }
1528   }
1529   Advance();  // consume '/'
1530 
1531   literal.Complete();
1532   next_.token = Token::REGEXP_LITERAL;
1533   return true;
1534 }
1535 
1536 
ScanRegExpFlags()1537 Maybe<RegExp::Flags> Scanner::ScanRegExpFlags() {
1538   DCHECK(next_.token == Token::REGEXP_LITERAL);
1539 
1540   // Scan regular expression flags.
1541   int flags = 0;
1542   while (c0_ != kEndOfInput && unicode_cache_->IsIdentifierPart(c0_)) {
1543     RegExp::Flags flag = RegExp::kNone;
1544     switch (c0_) {
1545       case 'g':
1546         flag = RegExp::kGlobal;
1547         break;
1548       case 'i':
1549         flag = RegExp::kIgnoreCase;
1550         break;
1551       case 'm':
1552         flag = RegExp::kMultiline;
1553         break;
1554       case 'u':
1555         flag = RegExp::kUnicode;
1556         break;
1557       case 'y':
1558         flag = RegExp::kSticky;
1559         break;
1560       default:
1561         return Nothing<RegExp::Flags>();
1562     }
1563     if (flags & flag) {
1564       return Nothing<RegExp::Flags>();
1565     }
1566     Advance();
1567     flags |= flag;
1568   }
1569 
1570   next_.location.end_pos = source_pos();
1571   return Just(RegExp::Flags(flags));
1572 }
1573 
1574 
CurrentSymbol(AstValueFactory * ast_value_factory)1575 const AstRawString* Scanner::CurrentSymbol(AstValueFactory* ast_value_factory) {
1576   if (is_literal_one_byte()) {
1577     return ast_value_factory->GetOneByteString(literal_one_byte_string());
1578   }
1579   return ast_value_factory->GetTwoByteString(literal_two_byte_string());
1580 }
1581 
1582 
NextSymbol(AstValueFactory * ast_value_factory)1583 const AstRawString* Scanner::NextSymbol(AstValueFactory* ast_value_factory) {
1584   if (is_next_literal_one_byte()) {
1585     return ast_value_factory->GetOneByteString(next_literal_one_byte_string());
1586   }
1587   return ast_value_factory->GetTwoByteString(next_literal_two_byte_string());
1588 }
1589 
1590 
CurrentRawSymbol(AstValueFactory * ast_value_factory)1591 const AstRawString* Scanner::CurrentRawSymbol(
1592     AstValueFactory* ast_value_factory) {
1593   if (is_raw_literal_one_byte()) {
1594     return ast_value_factory->GetOneByteString(raw_literal_one_byte_string());
1595   }
1596   return ast_value_factory->GetTwoByteString(raw_literal_two_byte_string());
1597 }
1598 
1599 
DoubleValue()1600 double Scanner::DoubleValue() {
1601   DCHECK(is_literal_one_byte());
1602   return StringToDouble(
1603       unicode_cache_,
1604       literal_one_byte_string(),
1605       ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY);
1606 }
1607 
1608 
ContainsDot()1609 bool Scanner::ContainsDot() {
1610   DCHECK(is_literal_one_byte());
1611   Vector<const uint8_t> str = literal_one_byte_string();
1612   return std::find(str.begin(), str.end(), '.') != str.end();
1613 }
1614 
1615 
FindSymbol(DuplicateFinder * finder,int value)1616 int Scanner::FindSymbol(DuplicateFinder* finder, int value) {
1617   // TODO(vogelheim): Move this logic into the calling class; this can be fully
1618   //                  implemented using the public interface.
1619   if (is_literal_one_byte()) {
1620     return finder->AddOneByteSymbol(literal_one_byte_string(), value);
1621   }
1622   return finder->AddTwoByteSymbol(literal_two_byte_string(), value);
1623 }
1624 
SeekNext(size_t position)1625 void Scanner::SeekNext(size_t position) {
1626   // Use with care: This cleanly resets most, but not all scanner state.
1627   // TODO(vogelheim): Fix this, or at least DCHECK the relevant conditions.
1628 
1629   // To re-scan from a given character position, we need to:
1630   // 1, Reset the current_, next_ and next_next_ tokens
1631   //    (next_ + next_next_ will be overwrittem by Next(),
1632   //     current_ will remain unchanged, so overwrite it fully.)
1633   current_ = {{0, 0}, nullptr, nullptr, 0, Token::UNINITIALIZED};
1634   next_.token = Token::UNINITIALIZED;
1635   next_next_.token = Token::UNINITIALIZED;
1636   // 2, reset the source to the desired position,
1637   source_->Seek(position);
1638   // 3, re-scan, by scanning the look-ahead char + 1 token (next_).
1639   c0_ = source_->Advance();
1640   Next();
1641   DCHECK_EQ(next_.location.beg_pos, static_cast<int>(position));
1642 }
1643 
1644 }  // namespace internal
1645 }  // namespace v8
1646