1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Features shared by parsing and pre-parsing scanners.
6 
7 #ifndef V8_PARSING_SCANNER_H_
8 #define V8_PARSING_SCANNER_H_
9 
10 #include "src/allocation.h"
11 #include "src/base/logging.h"
12 #include "src/char-predicates.h"
13 #include "src/globals.h"
14 #include "src/messages.h"
15 #include "src/parsing/token.h"
16 #include "src/unicode-decoder.h"
17 #include "src/unicode.h"
18 
19 namespace v8 {
20 namespace internal {
21 
22 
23 class AstRawString;
24 class AstValueFactory;
25 class DuplicateFinder;
26 class ExternalOneByteString;
27 class ExternalTwoByteString;
28 class ParserRecorder;
29 class UnicodeCache;
30 
31 // ---------------------------------------------------------------------
32 // Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.
33 // A code unit is a 16 bit value representing either a 16 bit code point
34 // or one part of a surrogate pair that make a single 21 bit code point.
35 class Utf16CharacterStream {
36  public:
37   static const uc32 kEndOfInput = -1;
38 
~Utf16CharacterStream()39   virtual ~Utf16CharacterStream() { }
40 
41   // Returns and advances past the next UTF-16 code unit in the input
42   // stream. If there are no more code units it returns kEndOfInput.
Advance()43   inline uc32 Advance() {
44     if (V8_LIKELY(buffer_cursor_ < buffer_end_)) {
45       return static_cast<uc32>(*(buffer_cursor_++));
46     } else if (ReadBlock()) {
47       return static_cast<uc32>(*(buffer_cursor_++));
48     } else {
49       // Note: currently the following increment is necessary to avoid a
50       // parser problem! The scanner treats the final kEndOfInput as
51       // a code unit with a position, and does math relative to that
52       // position.
53       buffer_cursor_++;
54       return kEndOfInput;
55     }
56   }
57 
58   // Go back one by one character in the input stream.
59   // This undoes the most recent Advance().
Back()60   inline void Back() {
61     // The common case - if the previous character is within
62     // buffer_start_ .. buffer_end_ will be handles locally.
63     // Otherwise, a new block is requested.
64     if (V8_LIKELY(buffer_cursor_ > buffer_start_)) {
65       buffer_cursor_--;
66     } else {
67       ReadBlockAt(pos() - 1);
68     }
69   }
70 
71   // Go back one by two characters in the input stream. (This is the same as
72   // calling Back() twice. But Back() may - in some instances - do substantial
73   // work. Back2() guarantees this work will be done only once.)
Back2()74   inline void Back2() {
75     if (V8_LIKELY(buffer_cursor_ - 2 >= buffer_start_)) {
76       buffer_cursor_ -= 2;
77     } else {
78       ReadBlockAt(pos() - 2);
79     }
80   }
81 
pos()82   inline size_t pos() const {
83     return buffer_pos_ + (buffer_cursor_ - buffer_start_);
84   }
85 
Seek(size_t pos)86   inline void Seek(size_t pos) {
87     if (V8_LIKELY(pos >= buffer_pos_ &&
88                   pos < (buffer_pos_ + (buffer_end_ - buffer_start_)))) {
89       buffer_cursor_ = buffer_start_ + (pos - buffer_pos_);
90     } else {
91       ReadBlockAt(pos);
92     }
93   }
94 
95  protected:
Utf16CharacterStream(const uint16_t * buffer_start,const uint16_t * buffer_cursor,const uint16_t * buffer_end,size_t buffer_pos)96   Utf16CharacterStream(const uint16_t* buffer_start,
97                        const uint16_t* buffer_cursor,
98                        const uint16_t* buffer_end, size_t buffer_pos)
99       : buffer_start_(buffer_start),
100         buffer_cursor_(buffer_cursor),
101         buffer_end_(buffer_end),
102         buffer_pos_(buffer_pos) {}
Utf16CharacterStream()103   Utf16CharacterStream() : Utf16CharacterStream(nullptr, nullptr, nullptr, 0) {}
104 
ReadBlockAt(size_t new_pos)105   void ReadBlockAt(size_t new_pos) {
106     // The callers of this method (Back/Back2/Seek) should handle the easy
107     // case (seeking within the current buffer), and we should only get here
108     // if we actually require new data.
109     // (This is really an efficiency check, not a correctness invariant.)
110     DCHECK(new_pos < buffer_pos_ ||
111            new_pos >= buffer_pos_ + (buffer_end_ - buffer_start_));
112 
113     // Change pos() to point to new_pos.
114     buffer_pos_ = new_pos;
115     buffer_cursor_ = buffer_start_;
116     bool success = ReadBlock();
117     USE(success);
118 
119     // Post-conditions: 1, on success, we should be at the right position.
120     //                  2, success == we should have more characters available.
121     DCHECK_IMPLIES(success, pos() == new_pos);
122     DCHECK_EQ(success, buffer_cursor_ < buffer_end_);
123     DCHECK_EQ(success, buffer_start_ < buffer_end_);
124   }
125 
126   // Read more data, and update buffer_*_ to point to it.
127   // Returns true if more data was available.
128   //
129   // ReadBlock() may modify any of the buffer_*_ members, but must sure that
130   // the result of pos() remains unaffected.
131   //
132   // Examples:
133   // - a stream could either fill a separate buffer. Then buffer_start_ and
134   //   buffer_cursor_ would point to the beginning of the buffer, and
135   //   buffer_pos would be the old pos().
136   // - a stream with existing buffer chunks would set buffer_start_ and
137   //   buffer_end_ to cover the full chunk, and then buffer_cursor_ would
138   //   point into the middle of the buffer, while buffer_pos_ would describe
139   //   the start of the buffer.
140   virtual bool ReadBlock() = 0;
141 
142   const uint16_t* buffer_start_;
143   const uint16_t* buffer_cursor_;
144   const uint16_t* buffer_end_;
145   size_t buffer_pos_;
146 };
147 
148 
149 // ----------------------------------------------------------------------------
150 // JavaScript Scanner.
151 
152 class Scanner {
153  public:
154   // Scoped helper for a re-settable bookmark.
155   class BookmarkScope {
156    public:
BookmarkScope(Scanner * scanner)157     explicit BookmarkScope(Scanner* scanner)
158         : scanner_(scanner), bookmark_(kNoBookmark) {
159       DCHECK_NOT_NULL(scanner_);
160     }
~BookmarkScope()161     ~BookmarkScope() {}
162 
163     void Set();
164     void Apply();
165     bool HasBeenSet();
166     bool HasBeenApplied();
167 
168    private:
169     static const size_t kNoBookmark;
170     static const size_t kBookmarkWasApplied;
171     static const size_t kBookmarkAtFirstPos;
172 
173     Scanner* scanner_;
174     size_t bookmark_;
175 
176     DISALLOW_COPY_AND_ASSIGN(BookmarkScope);
177   };
178 
179   // Representation of an interval of source positions.
180   struct Location {
LocationLocation181     Location(int b, int e) : beg_pos(b), end_pos(e) { }
LocationLocation182     Location() : beg_pos(0), end_pos(0) { }
183 
IsValidLocation184     bool IsValid() const {
185       return beg_pos >= 0 && end_pos >= beg_pos;
186     }
187 
invalidLocation188     static Location invalid() { return Location(-1, -1); }
189 
190     int beg_pos;
191     int end_pos;
192   };
193 
194   // -1 is outside of the range of any real source code.
195   static const int kNoOctalLocation = -1;
196   static const uc32 kEndOfInput = Utf16CharacterStream::kEndOfInput;
197 
198   explicit Scanner(UnicodeCache* scanner_contants);
199 
200   void Initialize(Utf16CharacterStream* source);
201 
202   // Returns the next token and advances input.
203   Token::Value Next();
204   // Returns the token following peek()
205   Token::Value PeekAhead();
206   // Returns the current token again.
current_token()207   Token::Value current_token() { return current_.token; }
208   // Returns the location information for the current token
209   // (the token last returned by Next()).
location()210   Location location() const { return current_.location; }
211 
has_error()212   bool has_error() const { return scanner_error_ != MessageTemplate::kNone; }
error()213   MessageTemplate::Template error() const { return scanner_error_; }
error_location()214   Location error_location() const { return scanner_error_location_; }
215 
216   // Similar functions for the upcoming token.
217 
218   // One token look-ahead (past the token returned by Next()).
peek()219   Token::Value peek() const { return next_.token; }
220 
peek_location()221   Location peek_location() const { return next_.location; }
222 
literal_contains_escapes()223   bool literal_contains_escapes() const {
224     return LiteralContainsEscapes(current_);
225   }
is_literal_contextual_keyword(Vector<const char> keyword)226   bool is_literal_contextual_keyword(Vector<const char> keyword) {
227     DCHECK(current_.token == Token::IDENTIFIER ||
228            current_.token == Token::ESCAPED_STRICT_RESERVED_WORD);
229     DCHECK_NOT_NULL(current_.literal_chars);
230     return current_.literal_chars->is_contextual_keyword(keyword);
231   }
is_next_contextual_keyword(Vector<const char> keyword)232   bool is_next_contextual_keyword(Vector<const char> keyword) {
233     DCHECK_NOT_NULL(next_.literal_chars);
234     return next_.literal_chars->is_contextual_keyword(keyword);
235   }
236 
237   const AstRawString* CurrentSymbol(AstValueFactory* ast_value_factory);
238   const AstRawString* NextSymbol(AstValueFactory* ast_value_factory);
239   const AstRawString* CurrentRawSymbol(AstValueFactory* ast_value_factory);
240 
241   double DoubleValue();
242   bool ContainsDot();
243   bool LiteralMatches(const char* data, int length, bool allow_escapes = true) {
244     if (!current_.literal_chars) {
245       return !strncmp(Token::Name(current_.token), data, length);
246     } else if (is_literal_one_byte() && literal_length() == length &&
247                (allow_escapes || !literal_contains_escapes())) {
248       const char* token =
249           reinterpret_cast<const char*>(literal_one_byte_string().start());
250       return !strncmp(token, data, length);
251     }
252     return false;
253   }
UnescapedLiteralMatches(const char * data,int length)254   inline bool UnescapedLiteralMatches(const char* data, int length) {
255     return LiteralMatches(data, length, false);
256   }
257 
IsGetOrSet(bool * is_get,bool * is_set)258   bool IsGetOrSet(bool* is_get, bool* is_set) {
259     if (is_literal_one_byte() &&
260         literal_length() == 3 &&
261         !literal_contains_escapes()) {
262       const char* token =
263           reinterpret_cast<const char*>(literal_one_byte_string().start());
264       *is_get = strncmp(token, "get", 3) == 0;
265       *is_set = !*is_get && strncmp(token, "set", 3) == 0;
266       return *is_get || *is_set;
267     }
268     return false;
269   }
270 
271   int FindSymbol(DuplicateFinder* finder, int value);
272 
unicode_cache()273   UnicodeCache* unicode_cache() { return unicode_cache_; }
274 
275   // Returns the location of the last seen octal literal.
octal_position()276   Location octal_position() const { return octal_pos_; }
clear_octal_position()277   void clear_octal_position() { octal_pos_ = Location::invalid(); }
278   // Returns the location of the last seen decimal literal with a leading zero.
decimal_with_leading_zero_position()279   Location decimal_with_leading_zero_position() const {
280     return decimal_with_leading_zero_pos_;
281   }
clear_decimal_with_leading_zero_position()282   void clear_decimal_with_leading_zero_position() {
283     decimal_with_leading_zero_pos_ = Location::invalid();
284   }
285 
286   // Returns the value of the last smi that was scanned.
smi_value()287   uint32_t smi_value() const { return current_.smi_value_; }
288 
289   // Seek forward to the given position.  This operation does not
290   // work in general, for instance when there are pushed back
291   // characters, but works for seeking forward until simple delimiter
292   // tokens, which is what it is used for.
293   void SeekForward(int pos);
294 
295   // Returns true if there was a line terminator before the peek'ed token,
296   // possibly inside a multi-line comment.
HasAnyLineTerminatorBeforeNext()297   bool HasAnyLineTerminatorBeforeNext() const {
298     return has_line_terminator_before_next_ ||
299            has_multiline_comment_before_next_;
300   }
301 
HasAnyLineTerminatorAfterNext()302   bool HasAnyLineTerminatorAfterNext() {
303     Token::Value ensure_next_next = PeekAhead();
304     USE(ensure_next_next);
305     return has_line_terminator_after_next_;
306   }
307 
308   // Scans the input as a regular expression pattern, next token must be /(=).
309   // Returns true if a pattern is scanned.
310   bool ScanRegExpPattern();
311   // Scans the input as regular expression flags. Returns the flags on success.
312   Maybe<RegExp::Flags> ScanRegExpFlags();
313 
314   // Scans the input as a template literal
315   Token::Value ScanTemplateStart();
316   Token::Value ScanTemplateContinuation();
317 
SourceUrl(Isolate * isolate)318   Handle<String> SourceUrl(Isolate* isolate) const {
319     Handle<String> tmp;
320     if (source_url_.length() > 0) tmp = source_url_.Internalize(isolate);
321     return tmp;
322   }
323 
SourceMappingUrl(Isolate * isolate)324   Handle<String> SourceMappingUrl(Isolate* isolate) const {
325     Handle<String> tmp;
326     if (source_mapping_url_.length() > 0)
327       tmp = source_mapping_url_.Internalize(isolate);
328     return tmp;
329   }
330 
331   bool IdentifierIsFutureStrictReserved(const AstRawString* string) const;
332 
FoundHtmlComment()333   bool FoundHtmlComment() const { return found_html_comment_; }
334 
335  private:
336   // Scoped helper for literal recording. Automatically drops the literal
337   // if aborting the scanning before it's complete.
338   class LiteralScope {
339    public:
LiteralScope(Scanner * self)340     explicit LiteralScope(Scanner* self) : scanner_(self), complete_(false) {
341       scanner_->StartLiteral();
342     }
~LiteralScope()343     ~LiteralScope() {
344       if (!complete_) scanner_->DropLiteral();
345     }
Complete()346     void Complete() { complete_ = true; }
347 
348    private:
349     Scanner* scanner_;
350     bool complete_;
351   };
352 
353   // LiteralBuffer -  Collector of chars of literals.
354   class LiteralBuffer {
355    public:
LiteralBuffer()356     LiteralBuffer() : is_one_byte_(true), position_(0), backing_store_() {}
357 
~LiteralBuffer()358     ~LiteralBuffer() { backing_store_.Dispose(); }
359 
INLINE(void AddChar (char code_unit))360     INLINE(void AddChar(char code_unit)) {
361       if (position_ >= backing_store_.length()) ExpandBuffer();
362       DCHECK(is_one_byte_);
363       DCHECK(IsValidAscii(code_unit));
364       backing_store_[position_] = static_cast<byte>(code_unit);
365       position_ += kOneByteSize;
366       return;
367     }
368 
INLINE(void AddChar (uc32 code_unit))369     INLINE(void AddChar(uc32 code_unit)) {
370       if (position_ >= backing_store_.length()) ExpandBuffer();
371       if (is_one_byte_) {
372         if (code_unit <= static_cast<uc32>(unibrow::Latin1::kMaxChar)) {
373           backing_store_[position_] = static_cast<byte>(code_unit);
374           position_ += kOneByteSize;
375           return;
376         }
377         ConvertToTwoByte();
378       }
379       if (code_unit <=
380           static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
381         *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit;
382         position_ += kUC16Size;
383       } else {
384         *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
385             unibrow::Utf16::LeadSurrogate(code_unit);
386         position_ += kUC16Size;
387         if (position_ >= backing_store_.length()) ExpandBuffer();
388         *reinterpret_cast<uint16_t*>(&backing_store_[position_]) =
389             unibrow::Utf16::TrailSurrogate(code_unit);
390         position_ += kUC16Size;
391       }
392     }
393 
is_one_byte()394     bool is_one_byte() const { return is_one_byte_; }
395 
is_contextual_keyword(Vector<const char> keyword)396     bool is_contextual_keyword(Vector<const char> keyword) const {
397       return is_one_byte() && keyword.length() == position_ &&
398              (memcmp(keyword.start(), backing_store_.start(), position_) == 0);
399     }
400 
two_byte_literal()401     Vector<const uint16_t> two_byte_literal() const {
402       DCHECK(!is_one_byte_);
403       DCHECK((position_ & 0x1) == 0);
404       return Vector<const uint16_t>(
405           reinterpret_cast<const uint16_t*>(backing_store_.start()),
406           position_ >> 1);
407     }
408 
one_byte_literal()409     Vector<const uint8_t> one_byte_literal() const {
410       DCHECK(is_one_byte_);
411       return Vector<const uint8_t>(
412           reinterpret_cast<const uint8_t*>(backing_store_.start()), position_);
413     }
414 
length()415     int length() const { return is_one_byte_ ? position_ : (position_ >> 1); }
416 
ReduceLength(int delta)417     void ReduceLength(int delta) {
418       position_ -= delta * (is_one_byte_ ? kOneByteSize : kUC16Size);
419     }
420 
Reset()421     void Reset() {
422       position_ = 0;
423       is_one_byte_ = true;
424     }
425 
426     Handle<String> Internalize(Isolate* isolate) const;
427 
428    private:
429     static const int kInitialCapacity = 16;
430     static const int kGrowthFactory = 4;
431     static const int kMinConversionSlack = 256;
432     static const int kMaxGrowth = 1 * MB;
433 
IsValidAscii(char code_unit)434     inline bool IsValidAscii(char code_unit) {
435       // Control characters and printable characters span the range of
436       // valid ASCII characters (0-127). Chars are unsigned on some
437       // platforms which causes compiler warnings if the validity check
438       // tests the lower bound >= 0 as it's always true.
439       return iscntrl(code_unit) || isprint(code_unit);
440     }
441 
NewCapacity(int min_capacity)442     inline int NewCapacity(int min_capacity) {
443       int capacity = Max(min_capacity, backing_store_.length());
444       int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth);
445       return new_capacity;
446     }
447 
ExpandBuffer()448     void ExpandBuffer() {
449       Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity));
450       MemCopy(new_store.start(), backing_store_.start(), position_);
451       backing_store_.Dispose();
452       backing_store_ = new_store;
453     }
454 
ConvertToTwoByte()455     void ConvertToTwoByte() {
456       DCHECK(is_one_byte_);
457       Vector<byte> new_store;
458       int new_content_size = position_ * kUC16Size;
459       if (new_content_size >= backing_store_.length()) {
460         // Ensure room for all currently read code units as UC16 as well
461         // as the code unit about to be stored.
462         new_store = Vector<byte>::New(NewCapacity(new_content_size));
463       } else {
464         new_store = backing_store_;
465       }
466       uint8_t* src = backing_store_.start();
467       uint16_t* dst = reinterpret_cast<uint16_t*>(new_store.start());
468       for (int i = position_ - 1; i >= 0; i--) {
469         dst[i] = src[i];
470       }
471       if (new_store.start() != backing_store_.start()) {
472         backing_store_.Dispose();
473         backing_store_ = new_store;
474       }
475       position_ = new_content_size;
476       is_one_byte_ = false;
477     }
478 
479     bool is_one_byte_;
480     int position_;
481     Vector<byte> backing_store_;
482 
483     DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);
484   };
485 
486   // The current and look-ahead token.
487   struct TokenDesc {
488     Location location;
489     LiteralBuffer* literal_chars;
490     LiteralBuffer* raw_literal_chars;
491     uint32_t smi_value_;
492     Token::Value token;
493   };
494 
495   static const int kCharacterLookaheadBufferSize = 1;
496   const int kMaxAscii = 127;
497 
498   // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
499   template <bool capture_raw>
500   uc32 ScanOctalEscape(uc32 c, int length);
501 
502   // Call this after setting source_ to the input.
Init()503   void Init() {
504     // Set c0_ (one character ahead)
505     STATIC_ASSERT(kCharacterLookaheadBufferSize == 1);
506     Advance();
507     // Initialize current_ to not refer to a literal.
508     current_.token = Token::UNINITIALIZED;
509     current_.literal_chars = NULL;
510     current_.raw_literal_chars = NULL;
511     next_.token = Token::UNINITIALIZED;
512     next_.literal_chars = NULL;
513     next_.raw_literal_chars = NULL;
514     next_next_.token = Token::UNINITIALIZED;
515     next_next_.literal_chars = NULL;
516     next_next_.raw_literal_chars = NULL;
517     found_html_comment_ = false;
518     scanner_error_ = MessageTemplate::kNone;
519   }
520 
ReportScannerError(const Location & location,MessageTemplate::Template error)521   void ReportScannerError(const Location& location,
522                           MessageTemplate::Template error) {
523     if (has_error()) return;
524     scanner_error_ = error;
525     scanner_error_location_ = location;
526   }
527 
ReportScannerError(int pos,MessageTemplate::Template error)528   void ReportScannerError(int pos, MessageTemplate::Template error) {
529     if (has_error()) return;
530     scanner_error_ = error;
531     scanner_error_location_ = Location(pos, pos + 1);
532   }
533 
534   // Seek to the next_ token at the given position.
535   void SeekNext(size_t position);
536 
537   // Literal buffer support
StartLiteral()538   inline void StartLiteral() {
539     LiteralBuffer* free_buffer =
540         (current_.literal_chars == &literal_buffer0_)
541             ? &literal_buffer1_
542             : (current_.literal_chars == &literal_buffer1_) ? &literal_buffer2_
543                                                             : &literal_buffer0_;
544     free_buffer->Reset();
545     next_.literal_chars = free_buffer;
546   }
547 
StartRawLiteral()548   inline void StartRawLiteral() {
549     LiteralBuffer* free_buffer =
550         (current_.raw_literal_chars == &raw_literal_buffer0_)
551             ? &raw_literal_buffer1_
552             : (current_.raw_literal_chars == &raw_literal_buffer1_)
553                   ? &raw_literal_buffer2_
554                   : &raw_literal_buffer0_;
555     free_buffer->Reset();
556     next_.raw_literal_chars = free_buffer;
557   }
558 
INLINE(void AddLiteralChar (uc32 c))559   INLINE(void AddLiteralChar(uc32 c)) {
560     DCHECK_NOT_NULL(next_.literal_chars);
561     next_.literal_chars->AddChar(c);
562   }
563 
INLINE(void AddLiteralChar (char c))564   INLINE(void AddLiteralChar(char c)) {
565     DCHECK_NOT_NULL(next_.literal_chars);
566     next_.literal_chars->AddChar(c);
567   }
568 
INLINE(void AddRawLiteralChar (uc32 c))569   INLINE(void AddRawLiteralChar(uc32 c)) {
570     DCHECK_NOT_NULL(next_.raw_literal_chars);
571     next_.raw_literal_chars->AddChar(c);
572   }
573 
INLINE(void ReduceRawLiteralLength (int delta))574   INLINE(void ReduceRawLiteralLength(int delta)) {
575     DCHECK_NOT_NULL(next_.raw_literal_chars);
576     next_.raw_literal_chars->ReduceLength(delta);
577   }
578 
579   // Stops scanning of a literal and drop the collected characters,
580   // e.g., due to an encountered error.
DropLiteral()581   inline void DropLiteral() {
582     next_.literal_chars = NULL;
583     next_.raw_literal_chars = NULL;
584   }
585 
AddLiteralCharAdvance()586   inline void AddLiteralCharAdvance() {
587     AddLiteralChar(c0_);
588     Advance();
589   }
590 
591   // Low-level scanning support.
592   template <bool capture_raw = false, bool check_surrogate = true>
Advance()593   void Advance() {
594     if (capture_raw) {
595       AddRawLiteralChar(c0_);
596     }
597     c0_ = source_->Advance();
598     if (check_surrogate) HandleLeadSurrogate();
599   }
600 
HandleLeadSurrogate()601   void HandleLeadSurrogate() {
602     if (unibrow::Utf16::IsLeadSurrogate(c0_)) {
603       uc32 c1 = source_->Advance();
604       if (!unibrow::Utf16::IsTrailSurrogate(c1)) {
605         source_->Back();
606       } else {
607         c0_ = unibrow::Utf16::CombineSurrogatePair(c0_, c1);
608       }
609     }
610   }
611 
PushBack(uc32 ch)612   void PushBack(uc32 ch) {
613     if (c0_ > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
614       source_->Back2();
615     } else {
616       source_->Back();
617     }
618     c0_ = ch;
619   }
620 
621   // Same as PushBack(ch1); PushBack(ch2).
622   // - Potentially more efficient as it uses Back2() on the stream.
623   // - Uses char as parameters, since we're only calling it with ASCII chars in
624   //   practice. This way, we can avoid a few edge cases.
PushBack2(char ch1,char ch2)625   void PushBack2(char ch1, char ch2) {
626     source_->Back2();
627     c0_ = ch2;
628   }
629 
Select(Token::Value tok)630   inline Token::Value Select(Token::Value tok) {
631     Advance();
632     return tok;
633   }
634 
Select(uc32 next,Token::Value then,Token::Value else_)635   inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
636     Advance();
637     if (c0_ == next) {
638       Advance();
639       return then;
640     } else {
641       return else_;
642     }
643   }
644 
645   // Returns the literal string, if any, for the current token (the
646   // token last returned by Next()). The string is 0-terminated.
647   // Literal strings are collected for identifiers, strings, numbers as well
648   // as for template literals. For template literals we also collect the raw
649   // form.
650   // These functions only give the correct result if the literal was scanned
651   // when a LiteralScope object is alive.
652   //
653   // Current usage of these functions is unfortunately a little undisciplined,
654   // and is_literal_one_byte() + is_literal_one_byte_string() is also
655   // requested for tokens that do not have a literal. Hence, we treat any
656   // token as a one-byte literal. E.g. Token::FUNCTION pretends to have a
657   // literal "function".
literal_one_byte_string()658   Vector<const uint8_t> literal_one_byte_string() {
659     if (current_.literal_chars)
660       return current_.literal_chars->one_byte_literal();
661     const char* str = Token::String(current_.token);
662     const uint8_t* str_as_uint8 = reinterpret_cast<const uint8_t*>(str);
663     return Vector<const uint8_t>(str_as_uint8,
664                                  Token::StringLength(current_.token));
665   }
literal_two_byte_string()666   Vector<const uint16_t> literal_two_byte_string() {
667     DCHECK_NOT_NULL(current_.literal_chars);
668     return current_.literal_chars->two_byte_literal();
669   }
is_literal_one_byte()670   bool is_literal_one_byte() {
671     return !current_.literal_chars || current_.literal_chars->is_one_byte();
672   }
literal_length()673   int literal_length() const {
674     if (current_.literal_chars) return current_.literal_chars->length();
675     return Token::StringLength(current_.token);
676   }
677   // Returns the literal string for the next token (the token that
678   // would be returned if Next() were called).
next_literal_one_byte_string()679   Vector<const uint8_t> next_literal_one_byte_string() {
680     DCHECK_NOT_NULL(next_.literal_chars);
681     return next_.literal_chars->one_byte_literal();
682   }
next_literal_two_byte_string()683   Vector<const uint16_t> next_literal_two_byte_string() {
684     DCHECK_NOT_NULL(next_.literal_chars);
685     return next_.literal_chars->two_byte_literal();
686   }
is_next_literal_one_byte()687   bool is_next_literal_one_byte() {
688     DCHECK_NOT_NULL(next_.literal_chars);
689     return next_.literal_chars->is_one_byte();
690   }
raw_literal_one_byte_string()691   Vector<const uint8_t> raw_literal_one_byte_string() {
692     DCHECK_NOT_NULL(current_.raw_literal_chars);
693     return current_.raw_literal_chars->one_byte_literal();
694   }
raw_literal_two_byte_string()695   Vector<const uint16_t> raw_literal_two_byte_string() {
696     DCHECK_NOT_NULL(current_.raw_literal_chars);
697     return current_.raw_literal_chars->two_byte_literal();
698   }
is_raw_literal_one_byte()699   bool is_raw_literal_one_byte() {
700     DCHECK_NOT_NULL(current_.raw_literal_chars);
701     return current_.raw_literal_chars->is_one_byte();
702   }
703 
704   template <bool capture_raw, bool unicode = false>
705   uc32 ScanHexNumber(int expected_length);
706   // Scan a number of any length but not bigger than max_value. For example, the
707   // number can be 000000001, so it's very long in characters but its value is
708   // small.
709   template <bool capture_raw>
710   uc32 ScanUnlimitedLengthHexNumber(int max_value, int beg_pos);
711 
712   // Scans a single JavaScript token.
713   void Scan();
714 
715   bool SkipWhiteSpace();
716   Token::Value SkipSingleLineComment();
717   Token::Value SkipSourceURLComment();
718   void TryToParseSourceURLComment();
719   Token::Value SkipMultiLineComment();
720   // Scans a possible HTML comment -- begins with '<!'.
721   Token::Value ScanHtmlComment();
722 
723   void ScanDecimalDigits();
724   Token::Value ScanNumber(bool seen_period);
725   Token::Value ScanIdentifierOrKeyword();
726   Token::Value ScanIdentifierSuffix(LiteralScope* literal, bool escaped);
727 
728   Token::Value ScanString();
729 
730   // Scans an escape-sequence which is part of a string and adds the
731   // decoded character to the current literal. Returns true if a pattern
732   // is scanned.
733   template <bool capture_raw, bool in_template_literal>
734   bool ScanEscape();
735 
736   // Decodes a Unicode escape-sequence which is part of an identifier.
737   // If the escape sequence cannot be decoded the result is kBadChar.
738   uc32 ScanIdentifierUnicodeEscape();
739   // Helper for the above functions.
740   template <bool capture_raw>
741   uc32 ScanUnicodeEscape();
742 
743   Token::Value ScanTemplateSpan();
744 
745   // Return the current source position.
source_pos()746   int source_pos() {
747     return static_cast<int>(source_->pos()) - kCharacterLookaheadBufferSize;
748   }
749 
LiteralContainsEscapes(const TokenDesc & token)750   static bool LiteralContainsEscapes(const TokenDesc& token) {
751     Location location = token.location;
752     int source_length = (location.end_pos - location.beg_pos);
753     if (token.token == Token::STRING) {
754       // Subtract delimiters.
755       source_length -= 2;
756     }
757     return token.literal_chars &&
758            (token.literal_chars->length() != source_length);
759   }
760 
761 #ifdef DEBUG
762   void SanityCheckTokenDesc(const TokenDesc&) const;
763 #endif
764 
765   UnicodeCache* unicode_cache_;
766 
767   // Buffers collecting literal strings, numbers, etc.
768   LiteralBuffer literal_buffer0_;
769   LiteralBuffer literal_buffer1_;
770   LiteralBuffer literal_buffer2_;
771 
772   // Values parsed from magic comments.
773   LiteralBuffer source_url_;
774   LiteralBuffer source_mapping_url_;
775 
776   // Buffer to store raw string values
777   LiteralBuffer raw_literal_buffer0_;
778   LiteralBuffer raw_literal_buffer1_;
779   LiteralBuffer raw_literal_buffer2_;
780 
781   TokenDesc current_;    // desc for current token (as returned by Next())
782   TokenDesc next_;       // desc for next token (one token look-ahead)
783   TokenDesc next_next_;  // desc for the token after next (after PeakAhead())
784 
785   // Input stream. Must be initialized to an Utf16CharacterStream.
786   Utf16CharacterStream* source_;
787 
788   // Last-seen positions of potentially problematic tokens.
789   Location octal_pos_;
790   Location decimal_with_leading_zero_pos_;
791 
792   // One Unicode character look-ahead; c0_ < 0 at the end of the input.
793   uc32 c0_;
794 
795   // Whether there is a line terminator whitespace character after
796   // the current token, and  before the next. Does not count newlines
797   // inside multiline comments.
798   bool has_line_terminator_before_next_;
799   // Whether there is a multi-line comment that contains a
800   // line-terminator after the current token, and before the next.
801   bool has_multiline_comment_before_next_;
802   bool has_line_terminator_after_next_;
803 
804   // Whether this scanner encountered an HTML comment.
805   bool found_html_comment_;
806 
807   MessageTemplate::Template scanner_error_;
808   Location scanner_error_location_;
809 };
810 
811 }  // namespace internal
812 }  // namespace v8
813 
814 #endif  // V8_PARSING_SCANNER_H_
815