1 // Copyright 2011 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Features shared by parsing and pre-parsing scanners. 6 7 #ifndef V8_PARSING_SCANNER_H_ 8 #define V8_PARSING_SCANNER_H_ 9 10 #include "src/allocation.h" 11 #include "src/base/logging.h" 12 #include "src/char-predicates.h" 13 #include "src/globals.h" 14 #include "src/messages.h" 15 #include "src/parsing/token.h" 16 #include "src/unicode-decoder.h" 17 #include "src/unicode.h" 18 19 namespace v8 { 20 namespace internal { 21 22 23 class AstRawString; 24 class AstValueFactory; 25 class DuplicateFinder; 26 class ExternalOneByteString; 27 class ExternalTwoByteString; 28 class ParserRecorder; 29 class UnicodeCache; 30 31 // --------------------------------------------------------------------- 32 // Buffered stream of UTF-16 code units, using an internal UTF-16 buffer. 33 // A code unit is a 16 bit value representing either a 16 bit code point 34 // or one part of a surrogate pair that make a single 21 bit code point. 35 class Utf16CharacterStream { 36 public: 37 static const uc32 kEndOfInput = -1; 38 ~Utf16CharacterStream()39 virtual ~Utf16CharacterStream() { } 40 41 // Returns and advances past the next UTF-16 code unit in the input 42 // stream. If there are no more code units it returns kEndOfInput. Advance()43 inline uc32 Advance() { 44 if (V8_LIKELY(buffer_cursor_ < buffer_end_)) { 45 return static_cast<uc32>(*(buffer_cursor_++)); 46 } else if (ReadBlock()) { 47 return static_cast<uc32>(*(buffer_cursor_++)); 48 } else { 49 // Note: currently the following increment is necessary to avoid a 50 // parser problem! The scanner treats the final kEndOfInput as 51 // a code unit with a position, and does math relative to that 52 // position. 53 buffer_cursor_++; 54 return kEndOfInput; 55 } 56 } 57 58 // Go back one by one character in the input stream. 59 // This undoes the most recent Advance(). Back()60 inline void Back() { 61 // The common case - if the previous character is within 62 // buffer_start_ .. buffer_end_ will be handles locally. 63 // Otherwise, a new block is requested. 64 if (V8_LIKELY(buffer_cursor_ > buffer_start_)) { 65 buffer_cursor_--; 66 } else { 67 ReadBlockAt(pos() - 1); 68 } 69 } 70 71 // Go back one by two characters in the input stream. (This is the same as 72 // calling Back() twice. But Back() may - in some instances - do substantial 73 // work. Back2() guarantees this work will be done only once.) Back2()74 inline void Back2() { 75 if (V8_LIKELY(buffer_cursor_ - 2 >= buffer_start_)) { 76 buffer_cursor_ -= 2; 77 } else { 78 ReadBlockAt(pos() - 2); 79 } 80 } 81 pos()82 inline size_t pos() const { 83 return buffer_pos_ + (buffer_cursor_ - buffer_start_); 84 } 85 Seek(size_t pos)86 inline void Seek(size_t pos) { 87 if (V8_LIKELY(pos >= buffer_pos_ && 88 pos < (buffer_pos_ + (buffer_end_ - buffer_start_)))) { 89 buffer_cursor_ = buffer_start_ + (pos - buffer_pos_); 90 } else { 91 ReadBlockAt(pos); 92 } 93 } 94 95 protected: Utf16CharacterStream(const uint16_t * buffer_start,const uint16_t * buffer_cursor,const uint16_t * buffer_end,size_t buffer_pos)96 Utf16CharacterStream(const uint16_t* buffer_start, 97 const uint16_t* buffer_cursor, 98 const uint16_t* buffer_end, size_t buffer_pos) 99 : buffer_start_(buffer_start), 100 buffer_cursor_(buffer_cursor), 101 buffer_end_(buffer_end), 102 buffer_pos_(buffer_pos) {} Utf16CharacterStream()103 Utf16CharacterStream() : Utf16CharacterStream(nullptr, nullptr, nullptr, 0) {} 104 ReadBlockAt(size_t new_pos)105 void ReadBlockAt(size_t new_pos) { 106 // The callers of this method (Back/Back2/Seek) should handle the easy 107 // case (seeking within the current buffer), and we should only get here 108 // if we actually require new data. 109 // (This is really an efficiency check, not a correctness invariant.) 110 DCHECK(new_pos < buffer_pos_ || 111 new_pos >= buffer_pos_ + (buffer_end_ - buffer_start_)); 112 113 // Change pos() to point to new_pos. 114 buffer_pos_ = new_pos; 115 buffer_cursor_ = buffer_start_; 116 bool success = ReadBlock(); 117 USE(success); 118 119 // Post-conditions: 1, on success, we should be at the right position. 120 // 2, success == we should have more characters available. 121 DCHECK_IMPLIES(success, pos() == new_pos); 122 DCHECK_EQ(success, buffer_cursor_ < buffer_end_); 123 DCHECK_EQ(success, buffer_start_ < buffer_end_); 124 } 125 126 // Read more data, and update buffer_*_ to point to it. 127 // Returns true if more data was available. 128 // 129 // ReadBlock() may modify any of the buffer_*_ members, but must sure that 130 // the result of pos() remains unaffected. 131 // 132 // Examples: 133 // - a stream could either fill a separate buffer. Then buffer_start_ and 134 // buffer_cursor_ would point to the beginning of the buffer, and 135 // buffer_pos would be the old pos(). 136 // - a stream with existing buffer chunks would set buffer_start_ and 137 // buffer_end_ to cover the full chunk, and then buffer_cursor_ would 138 // point into the middle of the buffer, while buffer_pos_ would describe 139 // the start of the buffer. 140 virtual bool ReadBlock() = 0; 141 142 const uint16_t* buffer_start_; 143 const uint16_t* buffer_cursor_; 144 const uint16_t* buffer_end_; 145 size_t buffer_pos_; 146 }; 147 148 149 // ---------------------------------------------------------------------------- 150 // JavaScript Scanner. 151 152 class Scanner { 153 public: 154 // Scoped helper for a re-settable bookmark. 155 class BookmarkScope { 156 public: BookmarkScope(Scanner * scanner)157 explicit BookmarkScope(Scanner* scanner) 158 : scanner_(scanner), bookmark_(kNoBookmark) { 159 DCHECK_NOT_NULL(scanner_); 160 } ~BookmarkScope()161 ~BookmarkScope() {} 162 163 void Set(); 164 void Apply(); 165 bool HasBeenSet(); 166 bool HasBeenApplied(); 167 168 private: 169 static const size_t kNoBookmark; 170 static const size_t kBookmarkWasApplied; 171 static const size_t kBookmarkAtFirstPos; 172 173 Scanner* scanner_; 174 size_t bookmark_; 175 176 DISALLOW_COPY_AND_ASSIGN(BookmarkScope); 177 }; 178 179 // Representation of an interval of source positions. 180 struct Location { LocationLocation181 Location(int b, int e) : beg_pos(b), end_pos(e) { } LocationLocation182 Location() : beg_pos(0), end_pos(0) { } 183 IsValidLocation184 bool IsValid() const { 185 return beg_pos >= 0 && end_pos >= beg_pos; 186 } 187 invalidLocation188 static Location invalid() { return Location(-1, -1); } 189 190 int beg_pos; 191 int end_pos; 192 }; 193 194 // -1 is outside of the range of any real source code. 195 static const int kNoOctalLocation = -1; 196 static const uc32 kEndOfInput = Utf16CharacterStream::kEndOfInput; 197 198 explicit Scanner(UnicodeCache* scanner_contants); 199 200 void Initialize(Utf16CharacterStream* source); 201 202 // Returns the next token and advances input. 203 Token::Value Next(); 204 // Returns the token following peek() 205 Token::Value PeekAhead(); 206 // Returns the current token again. current_token()207 Token::Value current_token() { return current_.token; } 208 // Returns the location information for the current token 209 // (the token last returned by Next()). location()210 Location location() const { return current_.location; } 211 has_error()212 bool has_error() const { return scanner_error_ != MessageTemplate::kNone; } error()213 MessageTemplate::Template error() const { return scanner_error_; } error_location()214 Location error_location() const { return scanner_error_location_; } 215 216 // Similar functions for the upcoming token. 217 218 // One token look-ahead (past the token returned by Next()). peek()219 Token::Value peek() const { return next_.token; } 220 peek_location()221 Location peek_location() const { return next_.location; } 222 literal_contains_escapes()223 bool literal_contains_escapes() const { 224 return LiteralContainsEscapes(current_); 225 } is_literal_contextual_keyword(Vector<const char> keyword)226 bool is_literal_contextual_keyword(Vector<const char> keyword) { 227 DCHECK(current_.token == Token::IDENTIFIER || 228 current_.token == Token::ESCAPED_STRICT_RESERVED_WORD); 229 DCHECK_NOT_NULL(current_.literal_chars); 230 return current_.literal_chars->is_contextual_keyword(keyword); 231 } is_next_contextual_keyword(Vector<const char> keyword)232 bool is_next_contextual_keyword(Vector<const char> keyword) { 233 DCHECK_NOT_NULL(next_.literal_chars); 234 return next_.literal_chars->is_contextual_keyword(keyword); 235 } 236 237 const AstRawString* CurrentSymbol(AstValueFactory* ast_value_factory); 238 const AstRawString* NextSymbol(AstValueFactory* ast_value_factory); 239 const AstRawString* CurrentRawSymbol(AstValueFactory* ast_value_factory); 240 241 double DoubleValue(); 242 bool ContainsDot(); 243 bool LiteralMatches(const char* data, int length, bool allow_escapes = true) { 244 if (!current_.literal_chars) { 245 return !strncmp(Token::Name(current_.token), data, length); 246 } else if (is_literal_one_byte() && literal_length() == length && 247 (allow_escapes || !literal_contains_escapes())) { 248 const char* token = 249 reinterpret_cast<const char*>(literal_one_byte_string().start()); 250 return !strncmp(token, data, length); 251 } 252 return false; 253 } UnescapedLiteralMatches(const char * data,int length)254 inline bool UnescapedLiteralMatches(const char* data, int length) { 255 return LiteralMatches(data, length, false); 256 } 257 IsGetOrSet(bool * is_get,bool * is_set)258 bool IsGetOrSet(bool* is_get, bool* is_set) { 259 if (is_literal_one_byte() && 260 literal_length() == 3 && 261 !literal_contains_escapes()) { 262 const char* token = 263 reinterpret_cast<const char*>(literal_one_byte_string().start()); 264 *is_get = strncmp(token, "get", 3) == 0; 265 *is_set = !*is_get && strncmp(token, "set", 3) == 0; 266 return *is_get || *is_set; 267 } 268 return false; 269 } 270 271 int FindSymbol(DuplicateFinder* finder, int value); 272 unicode_cache()273 UnicodeCache* unicode_cache() { return unicode_cache_; } 274 275 // Returns the location of the last seen octal literal. octal_position()276 Location octal_position() const { return octal_pos_; } clear_octal_position()277 void clear_octal_position() { octal_pos_ = Location::invalid(); } 278 // Returns the location of the last seen decimal literal with a leading zero. decimal_with_leading_zero_position()279 Location decimal_with_leading_zero_position() const { 280 return decimal_with_leading_zero_pos_; 281 } clear_decimal_with_leading_zero_position()282 void clear_decimal_with_leading_zero_position() { 283 decimal_with_leading_zero_pos_ = Location::invalid(); 284 } 285 286 // Returns the value of the last smi that was scanned. smi_value()287 uint32_t smi_value() const { return current_.smi_value_; } 288 289 // Seek forward to the given position. This operation does not 290 // work in general, for instance when there are pushed back 291 // characters, but works for seeking forward until simple delimiter 292 // tokens, which is what it is used for. 293 void SeekForward(int pos); 294 295 // Returns true if there was a line terminator before the peek'ed token, 296 // possibly inside a multi-line comment. HasAnyLineTerminatorBeforeNext()297 bool HasAnyLineTerminatorBeforeNext() const { 298 return has_line_terminator_before_next_ || 299 has_multiline_comment_before_next_; 300 } 301 HasAnyLineTerminatorAfterNext()302 bool HasAnyLineTerminatorAfterNext() { 303 Token::Value ensure_next_next = PeekAhead(); 304 USE(ensure_next_next); 305 return has_line_terminator_after_next_; 306 } 307 308 // Scans the input as a regular expression pattern, next token must be /(=). 309 // Returns true if a pattern is scanned. 310 bool ScanRegExpPattern(); 311 // Scans the input as regular expression flags. Returns the flags on success. 312 Maybe<RegExp::Flags> ScanRegExpFlags(); 313 314 // Scans the input as a template literal 315 Token::Value ScanTemplateStart(); 316 Token::Value ScanTemplateContinuation(); 317 SourceUrl(Isolate * isolate)318 Handle<String> SourceUrl(Isolate* isolate) const { 319 Handle<String> tmp; 320 if (source_url_.length() > 0) tmp = source_url_.Internalize(isolate); 321 return tmp; 322 } 323 SourceMappingUrl(Isolate * isolate)324 Handle<String> SourceMappingUrl(Isolate* isolate) const { 325 Handle<String> tmp; 326 if (source_mapping_url_.length() > 0) 327 tmp = source_mapping_url_.Internalize(isolate); 328 return tmp; 329 } 330 331 bool IdentifierIsFutureStrictReserved(const AstRawString* string) const; 332 FoundHtmlComment()333 bool FoundHtmlComment() const { return found_html_comment_; } 334 335 private: 336 // Scoped helper for literal recording. Automatically drops the literal 337 // if aborting the scanning before it's complete. 338 class LiteralScope { 339 public: LiteralScope(Scanner * self)340 explicit LiteralScope(Scanner* self) : scanner_(self), complete_(false) { 341 scanner_->StartLiteral(); 342 } ~LiteralScope()343 ~LiteralScope() { 344 if (!complete_) scanner_->DropLiteral(); 345 } Complete()346 void Complete() { complete_ = true; } 347 348 private: 349 Scanner* scanner_; 350 bool complete_; 351 }; 352 353 // LiteralBuffer - Collector of chars of literals. 354 class LiteralBuffer { 355 public: LiteralBuffer()356 LiteralBuffer() : is_one_byte_(true), position_(0), backing_store_() {} 357 ~LiteralBuffer()358 ~LiteralBuffer() { backing_store_.Dispose(); } 359 INLINE(void AddChar (char code_unit))360 INLINE(void AddChar(char code_unit)) { 361 if (position_ >= backing_store_.length()) ExpandBuffer(); 362 DCHECK(is_one_byte_); 363 DCHECK(IsValidAscii(code_unit)); 364 backing_store_[position_] = static_cast<byte>(code_unit); 365 position_ += kOneByteSize; 366 return; 367 } 368 INLINE(void AddChar (uc32 code_unit))369 INLINE(void AddChar(uc32 code_unit)) { 370 if (position_ >= backing_store_.length()) ExpandBuffer(); 371 if (is_one_byte_) { 372 if (code_unit <= static_cast<uc32>(unibrow::Latin1::kMaxChar)) { 373 backing_store_[position_] = static_cast<byte>(code_unit); 374 position_ += kOneByteSize; 375 return; 376 } 377 ConvertToTwoByte(); 378 } 379 if (code_unit <= 380 static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) { 381 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = code_unit; 382 position_ += kUC16Size; 383 } else { 384 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = 385 unibrow::Utf16::LeadSurrogate(code_unit); 386 position_ += kUC16Size; 387 if (position_ >= backing_store_.length()) ExpandBuffer(); 388 *reinterpret_cast<uint16_t*>(&backing_store_[position_]) = 389 unibrow::Utf16::TrailSurrogate(code_unit); 390 position_ += kUC16Size; 391 } 392 } 393 is_one_byte()394 bool is_one_byte() const { return is_one_byte_; } 395 is_contextual_keyword(Vector<const char> keyword)396 bool is_contextual_keyword(Vector<const char> keyword) const { 397 return is_one_byte() && keyword.length() == position_ && 398 (memcmp(keyword.start(), backing_store_.start(), position_) == 0); 399 } 400 two_byte_literal()401 Vector<const uint16_t> two_byte_literal() const { 402 DCHECK(!is_one_byte_); 403 DCHECK((position_ & 0x1) == 0); 404 return Vector<const uint16_t>( 405 reinterpret_cast<const uint16_t*>(backing_store_.start()), 406 position_ >> 1); 407 } 408 one_byte_literal()409 Vector<const uint8_t> one_byte_literal() const { 410 DCHECK(is_one_byte_); 411 return Vector<const uint8_t>( 412 reinterpret_cast<const uint8_t*>(backing_store_.start()), position_); 413 } 414 length()415 int length() const { return is_one_byte_ ? position_ : (position_ >> 1); } 416 ReduceLength(int delta)417 void ReduceLength(int delta) { 418 position_ -= delta * (is_one_byte_ ? kOneByteSize : kUC16Size); 419 } 420 Reset()421 void Reset() { 422 position_ = 0; 423 is_one_byte_ = true; 424 } 425 426 Handle<String> Internalize(Isolate* isolate) const; 427 428 private: 429 static const int kInitialCapacity = 16; 430 static const int kGrowthFactory = 4; 431 static const int kMinConversionSlack = 256; 432 static const int kMaxGrowth = 1 * MB; 433 IsValidAscii(char code_unit)434 inline bool IsValidAscii(char code_unit) { 435 // Control characters and printable characters span the range of 436 // valid ASCII characters (0-127). Chars are unsigned on some 437 // platforms which causes compiler warnings if the validity check 438 // tests the lower bound >= 0 as it's always true. 439 return iscntrl(code_unit) || isprint(code_unit); 440 } 441 NewCapacity(int min_capacity)442 inline int NewCapacity(int min_capacity) { 443 int capacity = Max(min_capacity, backing_store_.length()); 444 int new_capacity = Min(capacity * kGrowthFactory, capacity + kMaxGrowth); 445 return new_capacity; 446 } 447 ExpandBuffer()448 void ExpandBuffer() { 449 Vector<byte> new_store = Vector<byte>::New(NewCapacity(kInitialCapacity)); 450 MemCopy(new_store.start(), backing_store_.start(), position_); 451 backing_store_.Dispose(); 452 backing_store_ = new_store; 453 } 454 ConvertToTwoByte()455 void ConvertToTwoByte() { 456 DCHECK(is_one_byte_); 457 Vector<byte> new_store; 458 int new_content_size = position_ * kUC16Size; 459 if (new_content_size >= backing_store_.length()) { 460 // Ensure room for all currently read code units as UC16 as well 461 // as the code unit about to be stored. 462 new_store = Vector<byte>::New(NewCapacity(new_content_size)); 463 } else { 464 new_store = backing_store_; 465 } 466 uint8_t* src = backing_store_.start(); 467 uint16_t* dst = reinterpret_cast<uint16_t*>(new_store.start()); 468 for (int i = position_ - 1; i >= 0; i--) { 469 dst[i] = src[i]; 470 } 471 if (new_store.start() != backing_store_.start()) { 472 backing_store_.Dispose(); 473 backing_store_ = new_store; 474 } 475 position_ = new_content_size; 476 is_one_byte_ = false; 477 } 478 479 bool is_one_byte_; 480 int position_; 481 Vector<byte> backing_store_; 482 483 DISALLOW_COPY_AND_ASSIGN(LiteralBuffer); 484 }; 485 486 // The current and look-ahead token. 487 struct TokenDesc { 488 Location location; 489 LiteralBuffer* literal_chars; 490 LiteralBuffer* raw_literal_chars; 491 uint32_t smi_value_; 492 Token::Value token; 493 }; 494 495 static const int kCharacterLookaheadBufferSize = 1; 496 const int kMaxAscii = 127; 497 498 // Scans octal escape sequence. Also accepts "\0" decimal escape sequence. 499 template <bool capture_raw> 500 uc32 ScanOctalEscape(uc32 c, int length); 501 502 // Call this after setting source_ to the input. Init()503 void Init() { 504 // Set c0_ (one character ahead) 505 STATIC_ASSERT(kCharacterLookaheadBufferSize == 1); 506 Advance(); 507 // Initialize current_ to not refer to a literal. 508 current_.token = Token::UNINITIALIZED; 509 current_.literal_chars = NULL; 510 current_.raw_literal_chars = NULL; 511 next_.token = Token::UNINITIALIZED; 512 next_.literal_chars = NULL; 513 next_.raw_literal_chars = NULL; 514 next_next_.token = Token::UNINITIALIZED; 515 next_next_.literal_chars = NULL; 516 next_next_.raw_literal_chars = NULL; 517 found_html_comment_ = false; 518 scanner_error_ = MessageTemplate::kNone; 519 } 520 ReportScannerError(const Location & location,MessageTemplate::Template error)521 void ReportScannerError(const Location& location, 522 MessageTemplate::Template error) { 523 if (has_error()) return; 524 scanner_error_ = error; 525 scanner_error_location_ = location; 526 } 527 ReportScannerError(int pos,MessageTemplate::Template error)528 void ReportScannerError(int pos, MessageTemplate::Template error) { 529 if (has_error()) return; 530 scanner_error_ = error; 531 scanner_error_location_ = Location(pos, pos + 1); 532 } 533 534 // Seek to the next_ token at the given position. 535 void SeekNext(size_t position); 536 537 // Literal buffer support StartLiteral()538 inline void StartLiteral() { 539 LiteralBuffer* free_buffer = 540 (current_.literal_chars == &literal_buffer0_) 541 ? &literal_buffer1_ 542 : (current_.literal_chars == &literal_buffer1_) ? &literal_buffer2_ 543 : &literal_buffer0_; 544 free_buffer->Reset(); 545 next_.literal_chars = free_buffer; 546 } 547 StartRawLiteral()548 inline void StartRawLiteral() { 549 LiteralBuffer* free_buffer = 550 (current_.raw_literal_chars == &raw_literal_buffer0_) 551 ? &raw_literal_buffer1_ 552 : (current_.raw_literal_chars == &raw_literal_buffer1_) 553 ? &raw_literal_buffer2_ 554 : &raw_literal_buffer0_; 555 free_buffer->Reset(); 556 next_.raw_literal_chars = free_buffer; 557 } 558 INLINE(void AddLiteralChar (uc32 c))559 INLINE(void AddLiteralChar(uc32 c)) { 560 DCHECK_NOT_NULL(next_.literal_chars); 561 next_.literal_chars->AddChar(c); 562 } 563 INLINE(void AddLiteralChar (char c))564 INLINE(void AddLiteralChar(char c)) { 565 DCHECK_NOT_NULL(next_.literal_chars); 566 next_.literal_chars->AddChar(c); 567 } 568 INLINE(void AddRawLiteralChar (uc32 c))569 INLINE(void AddRawLiteralChar(uc32 c)) { 570 DCHECK_NOT_NULL(next_.raw_literal_chars); 571 next_.raw_literal_chars->AddChar(c); 572 } 573 INLINE(void ReduceRawLiteralLength (int delta))574 INLINE(void ReduceRawLiteralLength(int delta)) { 575 DCHECK_NOT_NULL(next_.raw_literal_chars); 576 next_.raw_literal_chars->ReduceLength(delta); 577 } 578 579 // Stops scanning of a literal and drop the collected characters, 580 // e.g., due to an encountered error. DropLiteral()581 inline void DropLiteral() { 582 next_.literal_chars = NULL; 583 next_.raw_literal_chars = NULL; 584 } 585 AddLiteralCharAdvance()586 inline void AddLiteralCharAdvance() { 587 AddLiteralChar(c0_); 588 Advance(); 589 } 590 591 // Low-level scanning support. 592 template <bool capture_raw = false, bool check_surrogate = true> Advance()593 void Advance() { 594 if (capture_raw) { 595 AddRawLiteralChar(c0_); 596 } 597 c0_ = source_->Advance(); 598 if (check_surrogate) HandleLeadSurrogate(); 599 } 600 HandleLeadSurrogate()601 void HandleLeadSurrogate() { 602 if (unibrow::Utf16::IsLeadSurrogate(c0_)) { 603 uc32 c1 = source_->Advance(); 604 if (!unibrow::Utf16::IsTrailSurrogate(c1)) { 605 source_->Back(); 606 } else { 607 c0_ = unibrow::Utf16::CombineSurrogatePair(c0_, c1); 608 } 609 } 610 } 611 PushBack(uc32 ch)612 void PushBack(uc32 ch) { 613 if (c0_ > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) { 614 source_->Back2(); 615 } else { 616 source_->Back(); 617 } 618 c0_ = ch; 619 } 620 621 // Same as PushBack(ch1); PushBack(ch2). 622 // - Potentially more efficient as it uses Back2() on the stream. 623 // - Uses char as parameters, since we're only calling it with ASCII chars in 624 // practice. This way, we can avoid a few edge cases. PushBack2(char ch1,char ch2)625 void PushBack2(char ch1, char ch2) { 626 source_->Back2(); 627 c0_ = ch2; 628 } 629 Select(Token::Value tok)630 inline Token::Value Select(Token::Value tok) { 631 Advance(); 632 return tok; 633 } 634 Select(uc32 next,Token::Value then,Token::Value else_)635 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) { 636 Advance(); 637 if (c0_ == next) { 638 Advance(); 639 return then; 640 } else { 641 return else_; 642 } 643 } 644 645 // Returns the literal string, if any, for the current token (the 646 // token last returned by Next()). The string is 0-terminated. 647 // Literal strings are collected for identifiers, strings, numbers as well 648 // as for template literals. For template literals we also collect the raw 649 // form. 650 // These functions only give the correct result if the literal was scanned 651 // when a LiteralScope object is alive. 652 // 653 // Current usage of these functions is unfortunately a little undisciplined, 654 // and is_literal_one_byte() + is_literal_one_byte_string() is also 655 // requested for tokens that do not have a literal. Hence, we treat any 656 // token as a one-byte literal. E.g. Token::FUNCTION pretends to have a 657 // literal "function". literal_one_byte_string()658 Vector<const uint8_t> literal_one_byte_string() { 659 if (current_.literal_chars) 660 return current_.literal_chars->one_byte_literal(); 661 const char* str = Token::String(current_.token); 662 const uint8_t* str_as_uint8 = reinterpret_cast<const uint8_t*>(str); 663 return Vector<const uint8_t>(str_as_uint8, 664 Token::StringLength(current_.token)); 665 } literal_two_byte_string()666 Vector<const uint16_t> literal_two_byte_string() { 667 DCHECK_NOT_NULL(current_.literal_chars); 668 return current_.literal_chars->two_byte_literal(); 669 } is_literal_one_byte()670 bool is_literal_one_byte() { 671 return !current_.literal_chars || current_.literal_chars->is_one_byte(); 672 } literal_length()673 int literal_length() const { 674 if (current_.literal_chars) return current_.literal_chars->length(); 675 return Token::StringLength(current_.token); 676 } 677 // Returns the literal string for the next token (the token that 678 // would be returned if Next() were called). next_literal_one_byte_string()679 Vector<const uint8_t> next_literal_one_byte_string() { 680 DCHECK_NOT_NULL(next_.literal_chars); 681 return next_.literal_chars->one_byte_literal(); 682 } next_literal_two_byte_string()683 Vector<const uint16_t> next_literal_two_byte_string() { 684 DCHECK_NOT_NULL(next_.literal_chars); 685 return next_.literal_chars->two_byte_literal(); 686 } is_next_literal_one_byte()687 bool is_next_literal_one_byte() { 688 DCHECK_NOT_NULL(next_.literal_chars); 689 return next_.literal_chars->is_one_byte(); 690 } raw_literal_one_byte_string()691 Vector<const uint8_t> raw_literal_one_byte_string() { 692 DCHECK_NOT_NULL(current_.raw_literal_chars); 693 return current_.raw_literal_chars->one_byte_literal(); 694 } raw_literal_two_byte_string()695 Vector<const uint16_t> raw_literal_two_byte_string() { 696 DCHECK_NOT_NULL(current_.raw_literal_chars); 697 return current_.raw_literal_chars->two_byte_literal(); 698 } is_raw_literal_one_byte()699 bool is_raw_literal_one_byte() { 700 DCHECK_NOT_NULL(current_.raw_literal_chars); 701 return current_.raw_literal_chars->is_one_byte(); 702 } 703 704 template <bool capture_raw, bool unicode = false> 705 uc32 ScanHexNumber(int expected_length); 706 // Scan a number of any length but not bigger than max_value. For example, the 707 // number can be 000000001, so it's very long in characters but its value is 708 // small. 709 template <bool capture_raw> 710 uc32 ScanUnlimitedLengthHexNumber(int max_value, int beg_pos); 711 712 // Scans a single JavaScript token. 713 void Scan(); 714 715 bool SkipWhiteSpace(); 716 Token::Value SkipSingleLineComment(); 717 Token::Value SkipSourceURLComment(); 718 void TryToParseSourceURLComment(); 719 Token::Value SkipMultiLineComment(); 720 // Scans a possible HTML comment -- begins with '<!'. 721 Token::Value ScanHtmlComment(); 722 723 void ScanDecimalDigits(); 724 Token::Value ScanNumber(bool seen_period); 725 Token::Value ScanIdentifierOrKeyword(); 726 Token::Value ScanIdentifierSuffix(LiteralScope* literal, bool escaped); 727 728 Token::Value ScanString(); 729 730 // Scans an escape-sequence which is part of a string and adds the 731 // decoded character to the current literal. Returns true if a pattern 732 // is scanned. 733 template <bool capture_raw, bool in_template_literal> 734 bool ScanEscape(); 735 736 // Decodes a Unicode escape-sequence which is part of an identifier. 737 // If the escape sequence cannot be decoded the result is kBadChar. 738 uc32 ScanIdentifierUnicodeEscape(); 739 // Helper for the above functions. 740 template <bool capture_raw> 741 uc32 ScanUnicodeEscape(); 742 743 Token::Value ScanTemplateSpan(); 744 745 // Return the current source position. source_pos()746 int source_pos() { 747 return static_cast<int>(source_->pos()) - kCharacterLookaheadBufferSize; 748 } 749 LiteralContainsEscapes(const TokenDesc & token)750 static bool LiteralContainsEscapes(const TokenDesc& token) { 751 Location location = token.location; 752 int source_length = (location.end_pos - location.beg_pos); 753 if (token.token == Token::STRING) { 754 // Subtract delimiters. 755 source_length -= 2; 756 } 757 return token.literal_chars && 758 (token.literal_chars->length() != source_length); 759 } 760 761 #ifdef DEBUG 762 void SanityCheckTokenDesc(const TokenDesc&) const; 763 #endif 764 765 UnicodeCache* unicode_cache_; 766 767 // Buffers collecting literal strings, numbers, etc. 768 LiteralBuffer literal_buffer0_; 769 LiteralBuffer literal_buffer1_; 770 LiteralBuffer literal_buffer2_; 771 772 // Values parsed from magic comments. 773 LiteralBuffer source_url_; 774 LiteralBuffer source_mapping_url_; 775 776 // Buffer to store raw string values 777 LiteralBuffer raw_literal_buffer0_; 778 LiteralBuffer raw_literal_buffer1_; 779 LiteralBuffer raw_literal_buffer2_; 780 781 TokenDesc current_; // desc for current token (as returned by Next()) 782 TokenDesc next_; // desc for next token (one token look-ahead) 783 TokenDesc next_next_; // desc for the token after next (after PeakAhead()) 784 785 // Input stream. Must be initialized to an Utf16CharacterStream. 786 Utf16CharacterStream* source_; 787 788 // Last-seen positions of potentially problematic tokens. 789 Location octal_pos_; 790 Location decimal_with_leading_zero_pos_; 791 792 // One Unicode character look-ahead; c0_ < 0 at the end of the input. 793 uc32 c0_; 794 795 // Whether there is a line terminator whitespace character after 796 // the current token, and before the next. Does not count newlines 797 // inside multiline comments. 798 bool has_line_terminator_before_next_; 799 // Whether there is a multi-line comment that contains a 800 // line-terminator after the current token, and before the next. 801 bool has_multiline_comment_before_next_; 802 bool has_line_terminator_after_next_; 803 804 // Whether this scanner encountered an HTML comment. 805 bool found_html_comment_; 806 807 MessageTemplate::Template scanner_error_; 808 Location scanner_error_location_; 809 }; 810 811 } // namespace internal 812 } // namespace v8 813 814 #endif // V8_PARSING_SCANNER_H_ 815