1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 // Author: kenton@google.com (Kenton Varda)
32 //  Based on original Protocol Buffers design by
33 //  Sanjay Ghemawat, Jeff Dean, and others.
34 //
35 // Here we have a hand-written lexer.  At first you might ask yourself,
36 // "Hand-written text processing?  Is Kenton crazy?!"  Well, first of all,
37 // yes I am crazy, but that's beside the point.  There are actually reasons
38 // why I ended up writing this this way.
39 //
40 // The traditional approach to lexing is to use lex to generate a lexer for
41 // you.  Unfortunately, lex's output is ridiculously ugly and difficult to
42 // integrate cleanly with C++ code, especially abstract code or code meant
43 // as a library.  Better parser-generators exist but would add dependencies
44 // which most users won't already have, which we'd like to avoid.  (GNU flex
45 // has a C++ output option, but it's still ridiculously ugly, non-abstract,
46 // and not library-friendly.)
47 //
48 // The next approach that any good software engineer should look at is to
49 // use regular expressions.  And, indeed, I did.  I have code which
50 // implements this same class using regular expressions.  It's about 200
51 // lines shorter.  However:
52 // - Rather than error messages telling you "This string has an invalid
53 //   escape sequence at line 5, column 45", you get error messages like
54 //   "Parse error on line 5".  Giving more precise errors requires adding
55 //   a lot of code that ends up basically as complex as the hand-coded
56 //   version anyway.
57 // - The regular expression to match a string literal looks like this:
58 //     kString  = new RE("(\"([^\"\\\\]|"              // non-escaped
59 //                       "\\\\[abfnrtv?\"'\\\\0-7]|"   // normal escape
60 //                       "\\\\x[0-9a-fA-F])*\"|"       // hex escape
61 //                       "\'([^\'\\\\]|"        // Also support single-quotes.
62 //                       "\\\\[abfnrtv?\"'\\\\0-7]|"
63 //                       "\\\\x[0-9a-fA-F])*\')");
64 //   Verifying the correctness of this line noise is actually harder than
65 //   verifying the correctness of ConsumeString(), defined below.  I'm not
66 //   even confident that the above is correct, after staring at it for some
67 //   time.
68 // - PCRE is fast, but there's still more overhead involved than the code
69 //   below.
70 // - Sadly, regular expressions are not part of the C standard library, so
71 //   using them would require depending on some other library.  For the
72 //   open source release, this could be really annoying.  Nobody likes
73 //   downloading one piece of software just to find that they need to
74 //   download something else to make it work, and in all likelihood
75 //   people downloading Protocol Buffers will already be doing so just
76 //   to make something else work.  We could include a copy of PCRE with
77 //   our code, but that obligates us to keep it up-to-date and just seems
78 //   like a big waste just to save 200 lines of code.
79 //
80 // On a similar but unrelated note, I'm even scared to use ctype.h.
81 // Apparently functions like isalpha() are locale-dependent.  So, if we used
82 // that, then if this code is being called from some program that doesn't
83 // have its locale set to "C", it would behave strangely.  We can't just set
84 // the locale to "C" ourselves since we might break the calling program that
85 // way, particularly if it is multi-threaded.  WTF?  Someone please let me
86 // (Kenton) know if I'm missing something here...
87 //
88 // I'd love to hear about other alternatives, though, as this code isn't
89 // exactly pretty.
90 
91 #include <google/protobuf/io/tokenizer.h>
92 #include <google/protobuf/stubs/common.h>
93 #include <google/protobuf/stubs/stringprintf.h>
94 #include <google/protobuf/io/strtod.h>
95 #include <google/protobuf/io/zero_copy_stream.h>
96 #include <google/protobuf/stubs/strutil.h>
97 #include <google/protobuf/stubs/stl_util.h>
98 
99 namespace google {
100 namespace protobuf {
101 namespace io {
102 namespace {
103 
104 // As mentioned above, I don't trust ctype.h due to the presence of "locales".
105 // So, I have written replacement functions here.  Someone please smack me if
106 // this is a bad idea or if there is some way around this.
107 //
108 // These "character classes" are designed to be used in template methods.
109 // For instance, Tokenizer::ConsumeZeroOrMore<Whitespace>() will eat
110 // whitespace.
111 
112 // Note:  No class is allowed to contain '\0', since this is used to mark end-
113 //   of-input and is handled specially.
114 
115 #define CHARACTER_CLASS(NAME, EXPRESSION)      \
116   class NAME {                                 \
117    public:                                     \
118     static inline bool InClass(char c) {       \
119       return EXPRESSION;                       \
120     }                                          \
121   }
122 
123 CHARACTER_CLASS(Whitespace, c == ' ' || c == '\n' || c == '\t' ||
124                             c == '\r' || c == '\v' || c == '\f');
125 CHARACTER_CLASS(WhitespaceNoNewline, c == ' ' || c == '\t' ||
126                                      c == '\r' || c == '\v' || c == '\f');
127 
128 CHARACTER_CLASS(Unprintable, c < ' ' && c > '\0');
129 
130 CHARACTER_CLASS(Digit, '0' <= c && c <= '9');
131 CHARACTER_CLASS(OctalDigit, '0' <= c && c <= '7');
132 CHARACTER_CLASS(HexDigit, ('0' <= c && c <= '9') ||
133                           ('a' <= c && c <= 'f') ||
134                           ('A' <= c && c <= 'F'));
135 
136 CHARACTER_CLASS(Letter, ('a' <= c && c <= 'z') ||
137                         ('A' <= c && c <= 'Z') ||
138                         (c == '_'));
139 
140 CHARACTER_CLASS(Alphanumeric, ('a' <= c && c <= 'z') ||
141                               ('A' <= c && c <= 'Z') ||
142                               ('0' <= c && c <= '9') ||
143                               (c == '_'));
144 
145 CHARACTER_CLASS(Escape, c == 'a' || c == 'b' || c == 'f' || c == 'n' ||
146                         c == 'r' || c == 't' || c == 'v' || c == '\\' ||
147                         c == '?' || c == '\'' || c == '\"');
148 
149 #undef CHARACTER_CLASS
150 
151 // Given a char, interpret it as a numeric digit and return its value.
152 // This supports any number base up to 36.
DigitValue(char digit)153 inline int DigitValue(char digit) {
154   if ('0' <= digit && digit <= '9') return digit - '0';
155   if ('a' <= digit && digit <= 'z') return digit - 'a' + 10;
156   if ('A' <= digit && digit <= 'Z') return digit - 'A' + 10;
157   return -1;
158 }
159 
160 // Inline because it's only used in one place.
TranslateEscape(char c)161 inline char TranslateEscape(char c) {
162   switch (c) {
163     case 'a':  return '\a';
164     case 'b':  return '\b';
165     case 'f':  return '\f';
166     case 'n':  return '\n';
167     case 'r':  return '\r';
168     case 't':  return '\t';
169     case 'v':  return '\v';
170     case '\\': return '\\';
171     case '?':  return '\?';    // Trigraphs = :(
172     case '\'': return '\'';
173     case '"':  return '\"';
174 
175     // We expect escape sequences to have been validated separately.
176     default:   return '?';
177   }
178 }
179 
180 }  // anonymous namespace
181 
~ErrorCollector()182 ErrorCollector::~ErrorCollector() {}
183 
184 // ===================================================================
185 
Tokenizer(ZeroCopyInputStream * input,ErrorCollector * error_collector)186 Tokenizer::Tokenizer(ZeroCopyInputStream* input,
187                      ErrorCollector* error_collector)
188   : input_(input),
189     error_collector_(error_collector),
190     buffer_(NULL),
191     buffer_size_(0),
192     buffer_pos_(0),
193     read_error_(false),
194     line_(0),
195     column_(0),
196     record_target_(NULL),
197     record_start_(-1),
198     allow_f_after_float_(false),
199     comment_style_(CPP_COMMENT_STYLE),
200     require_space_after_number_(true),
201     allow_multiline_strings_(false) {
202 
203   current_.line = 0;
204   current_.column = 0;
205   current_.end_column = 0;
206   current_.type = TYPE_START;
207 
208   Refresh();
209 }
210 
~Tokenizer()211 Tokenizer::~Tokenizer() {
212   // If we had any buffer left unread, return it to the underlying stream
213   // so that someone else can read it.
214   if (buffer_size_ > buffer_pos_) {
215     input_->BackUp(buffer_size_ - buffer_pos_);
216   }
217 }
218 
219 // -------------------------------------------------------------------
220 // Internal helpers.
221 
NextChar()222 void Tokenizer::NextChar() {
223   // Update our line and column counters based on the character being
224   // consumed.
225   if (current_char_ == '\n') {
226     ++line_;
227     column_ = 0;
228   } else if (current_char_ == '\t') {
229     column_ += kTabWidth - column_ % kTabWidth;
230   } else {
231     ++column_;
232   }
233 
234   // Advance to the next character.
235   ++buffer_pos_;
236   if (buffer_pos_ < buffer_size_) {
237     current_char_ = buffer_[buffer_pos_];
238   } else {
239     Refresh();
240   }
241 }
242 
Refresh()243 void Tokenizer::Refresh() {
244   if (read_error_) {
245     current_char_ = '\0';
246     return;
247   }
248 
249   // If we're in a token, append the rest of the buffer to it.
250   if (record_target_ != NULL && record_start_ < buffer_size_) {
251     record_target_->append(buffer_ + record_start_, buffer_size_ - record_start_);
252     record_start_ = 0;
253   }
254 
255   const void* data = NULL;
256   buffer_ = NULL;
257   buffer_pos_ = 0;
258   do {
259     if (!input_->Next(&data, &buffer_size_)) {
260       // end of stream (or read error)
261       buffer_size_ = 0;
262       read_error_ = true;
263       current_char_ = '\0';
264       return;
265     }
266   } while (buffer_size_ == 0);
267 
268   buffer_ = static_cast<const char*>(data);
269 
270   current_char_ = buffer_[0];
271 }
272 
RecordTo(string * target)273 inline void Tokenizer::RecordTo(string* target) {
274   record_target_ = target;
275   record_start_ = buffer_pos_;
276 }
277 
StopRecording()278 inline void Tokenizer::StopRecording() {
279   // Note:  The if() is necessary because some STL implementations crash when
280   //   you call string::append(NULL, 0), presumably because they are trying to
281   //   be helpful by detecting the NULL pointer, even though there's nothing
282   //   wrong with reading zero bytes from NULL.
283   if (buffer_pos_ != record_start_) {
284     record_target_->append(buffer_ + record_start_, buffer_pos_ - record_start_);
285   }
286   record_target_ = NULL;
287   record_start_ = -1;
288 }
289 
StartToken()290 inline void Tokenizer::StartToken() {
291   current_.type = TYPE_START;    // Just for the sake of initializing it.
292   current_.text.clear();
293   current_.line = line_;
294   current_.column = column_;
295   RecordTo(&current_.text);
296 }
297 
EndToken()298 inline void Tokenizer::EndToken() {
299   StopRecording();
300   current_.end_column = column_;
301 }
302 
303 // -------------------------------------------------------------------
304 // Helper methods that consume characters.
305 
306 template<typename CharacterClass>
LookingAt()307 inline bool Tokenizer::LookingAt() {
308   return CharacterClass::InClass(current_char_);
309 }
310 
311 template<typename CharacterClass>
TryConsumeOne()312 inline bool Tokenizer::TryConsumeOne() {
313   if (CharacterClass::InClass(current_char_)) {
314     NextChar();
315     return true;
316   } else {
317     return false;
318   }
319 }
320 
TryConsume(char c)321 inline bool Tokenizer::TryConsume(char c) {
322   if (current_char_ == c) {
323     NextChar();
324     return true;
325   } else {
326     return false;
327   }
328 }
329 
330 template<typename CharacterClass>
ConsumeZeroOrMore()331 inline void Tokenizer::ConsumeZeroOrMore() {
332   while (CharacterClass::InClass(current_char_)) {
333     NextChar();
334   }
335 }
336 
337 template<typename CharacterClass>
ConsumeOneOrMore(const char * error)338 inline void Tokenizer::ConsumeOneOrMore(const char* error) {
339   if (!CharacterClass::InClass(current_char_)) {
340     AddError(error);
341   } else {
342     do {
343       NextChar();
344     } while (CharacterClass::InClass(current_char_));
345   }
346 }
347 
348 // -------------------------------------------------------------------
349 // Methods that read whole patterns matching certain kinds of tokens
350 // or comments.
351 
ConsumeString(char delimiter)352 void Tokenizer::ConsumeString(char delimiter) {
353   while (true) {
354     switch (current_char_) {
355       case '\0':
356         AddError("Unexpected end of string.");
357         return;
358 
359       case '\n': {
360         if (!allow_multiline_strings_) {
361           AddError("String literals cannot cross line boundaries.");
362           return;
363         }
364         NextChar();
365         break;
366       }
367 
368       case '\\': {
369         // An escape sequence.
370         NextChar();
371         if (TryConsumeOne<Escape>()) {
372           // Valid escape sequence.
373         } else if (TryConsumeOne<OctalDigit>()) {
374           // Possibly followed by two more octal digits, but these will
375           // just be consumed by the main loop anyway so we don't need
376           // to do so explicitly here.
377         } else if (TryConsume('x') || TryConsume('X')) {
378           if (!TryConsumeOne<HexDigit>()) {
379             AddError("Expected hex digits for escape sequence.");
380           }
381           // Possibly followed by another hex digit, but again we don't care.
382         } else if (TryConsume('u')) {
383           if (!TryConsumeOne<HexDigit>() ||
384               !TryConsumeOne<HexDigit>() ||
385               !TryConsumeOne<HexDigit>() ||
386               !TryConsumeOne<HexDigit>()) {
387             AddError("Expected four hex digits for \\u escape sequence.");
388           }
389         } else if (TryConsume('U')) {
390           // We expect 8 hex digits; but only the range up to 0x10ffff is
391           // legal.
392           if (!TryConsume('0') ||
393               !TryConsume('0') ||
394               !(TryConsume('0') || TryConsume('1')) ||
395               !TryConsumeOne<HexDigit>() ||
396               !TryConsumeOne<HexDigit>() ||
397               !TryConsumeOne<HexDigit>() ||
398               !TryConsumeOne<HexDigit>() ||
399               !TryConsumeOne<HexDigit>()) {
400             AddError("Expected eight hex digits up to 10ffff for \\U escape "
401                      "sequence");
402           }
403         } else {
404           AddError("Invalid escape sequence in string literal.");
405         }
406         break;
407       }
408 
409       default: {
410         if (current_char_ == delimiter) {
411           NextChar();
412           return;
413         }
414         NextChar();
415         break;
416       }
417     }
418   }
419 }
420 
ConsumeNumber(bool started_with_zero,bool started_with_dot)421 Tokenizer::TokenType Tokenizer::ConsumeNumber(bool started_with_zero,
422                                               bool started_with_dot) {
423   bool is_float = false;
424 
425   if (started_with_zero && (TryConsume('x') || TryConsume('X'))) {
426     // A hex number (started with "0x").
427     ConsumeOneOrMore<HexDigit>("\"0x\" must be followed by hex digits.");
428 
429   } else if (started_with_zero && LookingAt<Digit>()) {
430     // An octal number (had a leading zero).
431     ConsumeZeroOrMore<OctalDigit>();
432     if (LookingAt<Digit>()) {
433       AddError("Numbers starting with leading zero must be in octal.");
434       ConsumeZeroOrMore<Digit>();
435     }
436 
437   } else {
438     // A decimal number.
439     if (started_with_dot) {
440       is_float = true;
441       ConsumeZeroOrMore<Digit>();
442     } else {
443       ConsumeZeroOrMore<Digit>();
444 
445       if (TryConsume('.')) {
446         is_float = true;
447         ConsumeZeroOrMore<Digit>();
448       }
449     }
450 
451     if (TryConsume('e') || TryConsume('E')) {
452       is_float = true;
453       TryConsume('-') || TryConsume('+');
454       ConsumeOneOrMore<Digit>("\"e\" must be followed by exponent.");
455     }
456 
457     if (allow_f_after_float_ && (TryConsume('f') || TryConsume('F'))) {
458       is_float = true;
459     }
460   }
461 
462   if (LookingAt<Letter>() && require_space_after_number_) {
463     AddError("Need space between number and identifier.");
464   } else if (current_char_ == '.') {
465     if (is_float) {
466       AddError(
467         "Already saw decimal point or exponent; can't have another one.");
468     } else {
469       AddError("Hex and octal numbers must be integers.");
470     }
471   }
472 
473   return is_float ? TYPE_FLOAT : TYPE_INTEGER;
474 }
475 
ConsumeLineComment(string * content)476 void Tokenizer::ConsumeLineComment(string* content) {
477   if (content != NULL) RecordTo(content);
478 
479   while (current_char_ != '\0' && current_char_ != '\n') {
480     NextChar();
481   }
482   TryConsume('\n');
483 
484   if (content != NULL) StopRecording();
485 }
486 
ConsumeBlockComment(string * content)487 void Tokenizer::ConsumeBlockComment(string* content) {
488   int start_line = line_;
489   int start_column = column_ - 2;
490 
491   if (content != NULL) RecordTo(content);
492 
493   while (true) {
494     while (current_char_ != '\0' &&
495            current_char_ != '*' &&
496            current_char_ != '/' &&
497            current_char_ != '\n') {
498       NextChar();
499     }
500 
501     if (TryConsume('\n')) {
502       if (content != NULL) StopRecording();
503 
504       // Consume leading whitespace and asterisk;
505       ConsumeZeroOrMore<WhitespaceNoNewline>();
506       if (TryConsume('*')) {
507         if (TryConsume('/')) {
508           // End of comment.
509           break;
510         }
511       }
512 
513       if (content != NULL) RecordTo(content);
514     } else if (TryConsume('*') && TryConsume('/')) {
515       // End of comment.
516       if (content != NULL) {
517         StopRecording();
518         // Strip trailing "*/".
519         content->erase(content->size() - 2);
520       }
521       break;
522     } else if (TryConsume('/') && current_char_ == '*') {
523       // Note:  We didn't consume the '*' because if there is a '/' after it
524       //   we want to interpret that as the end of the comment.
525       AddError(
526         "\"/*\" inside block comment.  Block comments cannot be nested.");
527     } else if (current_char_ == '\0') {
528       AddError("End-of-file inside block comment.");
529       error_collector_->AddError(
530         start_line, start_column, "  Comment started here.");
531       if (content != NULL) StopRecording();
532       break;
533     }
534   }
535 }
536 
TryConsumeCommentStart()537 Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() {
538   if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) {
539     if (TryConsume('/')) {
540       return LINE_COMMENT;
541     } else if (TryConsume('*')) {
542       return BLOCK_COMMENT;
543     } else {
544       // Oops, it was just a slash.  Return it.
545       current_.type = TYPE_SYMBOL;
546       current_.text = "/";
547       current_.line = line_;
548       current_.column = column_ - 1;
549       current_.end_column = column_;
550       return SLASH_NOT_COMMENT;
551     }
552   } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) {
553     return LINE_COMMENT;
554   } else {
555     return NO_COMMENT;
556   }
557 }
558 
559 // -------------------------------------------------------------------
560 
Next()561 bool Tokenizer::Next() {
562   previous_ = current_;
563 
564   while (!read_error_) {
565     ConsumeZeroOrMore<Whitespace>();
566 
567     switch (TryConsumeCommentStart()) {
568       case LINE_COMMENT:
569         ConsumeLineComment(NULL);
570         continue;
571       case BLOCK_COMMENT:
572         ConsumeBlockComment(NULL);
573         continue;
574       case SLASH_NOT_COMMENT:
575         return true;
576       case NO_COMMENT:
577         break;
578     }
579 
580     // Check for EOF before continuing.
581     if (read_error_) break;
582 
583     if (LookingAt<Unprintable>() || current_char_ == '\0') {
584       AddError("Invalid control characters encountered in text.");
585       NextChar();
586       // Skip more unprintable characters, too.  But, remember that '\0' is
587       // also what current_char_ is set to after EOF / read error.  We have
588       // to be careful not to go into an infinite loop of trying to consume
589       // it, so make sure to check read_error_ explicitly before consuming
590       // '\0'.
591       while (TryConsumeOne<Unprintable>() ||
592              (!read_error_ && TryConsume('\0'))) {
593         // Ignore.
594       }
595 
596     } else {
597       // Reading some sort of token.
598       StartToken();
599 
600       if (TryConsumeOne<Letter>()) {
601         ConsumeZeroOrMore<Alphanumeric>();
602         current_.type = TYPE_IDENTIFIER;
603       } else if (TryConsume('0')) {
604         current_.type = ConsumeNumber(true, false);
605       } else if (TryConsume('.')) {
606         // This could be the beginning of a floating-point number, or it could
607         // just be a '.' symbol.
608 
609         if (TryConsumeOne<Digit>()) {
610           // It's a floating-point number.
611           if (previous_.type == TYPE_IDENTIFIER &&
612               current_.line == previous_.line &&
613               current_.column == previous_.end_column) {
614             // We don't accept syntax like "blah.123".
615             error_collector_->AddError(line_, column_ - 2,
616               "Need space between identifier and decimal point.");
617           }
618           current_.type = ConsumeNumber(false, true);
619         } else {
620           current_.type = TYPE_SYMBOL;
621         }
622       } else if (TryConsumeOne<Digit>()) {
623         current_.type = ConsumeNumber(false, false);
624       } else if (TryConsume('\"')) {
625         ConsumeString('\"');
626         current_.type = TYPE_STRING;
627       } else if (TryConsume('\'')) {
628         ConsumeString('\'');
629         current_.type = TYPE_STRING;
630       } else {
631         // Check if the high order bit is set.
632         if (current_char_ & 0x80) {
633           error_collector_->AddError(line_, column_,
634               StringPrintf("Interpreting non ascii codepoint %d.",
635                            static_cast<unsigned char>(current_char_)));
636         }
637         NextChar();
638         current_.type = TYPE_SYMBOL;
639       }
640 
641       EndToken();
642       return true;
643     }
644   }
645 
646   // EOF
647   current_.type = TYPE_END;
648   current_.text.clear();
649   current_.line = line_;
650   current_.column = column_;
651   current_.end_column = column_;
652   return false;
653 }
654 
655 namespace {
656 
657 // Helper class for collecting comments and putting them in the right places.
658 //
659 // This basically just buffers the most recent comment until it can be decided
660 // exactly where that comment should be placed.  When Flush() is called, the
661 // current comment goes into either prev_trailing_comments or detached_comments.
662 // When the CommentCollector is destroyed, the last buffered comment goes into
663 // next_leading_comments.
664 class CommentCollector {
665  public:
CommentCollector(string * prev_trailing_comments,vector<string> * detached_comments,string * next_leading_comments)666   CommentCollector(string* prev_trailing_comments,
667                    vector<string>* detached_comments,
668                    string* next_leading_comments)
669       : prev_trailing_comments_(prev_trailing_comments),
670         detached_comments_(detached_comments),
671         next_leading_comments_(next_leading_comments),
672         has_comment_(false),
673         is_line_comment_(false),
674         can_attach_to_prev_(true) {
675     if (prev_trailing_comments != NULL) prev_trailing_comments->clear();
676     if (detached_comments != NULL) detached_comments->clear();
677     if (next_leading_comments != NULL) next_leading_comments->clear();
678   }
679 
~CommentCollector()680   ~CommentCollector() {
681     // Whatever is in the buffer is a leading comment.
682     if (next_leading_comments_ != NULL && has_comment_) {
683       comment_buffer_.swap(*next_leading_comments_);
684     }
685   }
686 
687   // About to read a line comment.  Get the comment buffer pointer in order to
688   // read into it.
GetBufferForLineComment()689   string* GetBufferForLineComment() {
690     // We want to combine with previous line comments, but not block comments.
691     if (has_comment_ && !is_line_comment_) {
692       Flush();
693     }
694     has_comment_ = true;
695     is_line_comment_ = true;
696     return &comment_buffer_;
697   }
698 
699   // About to read a block comment.  Get the comment buffer pointer in order to
700   // read into it.
GetBufferForBlockComment()701   string* GetBufferForBlockComment() {
702     if (has_comment_) {
703       Flush();
704     }
705     has_comment_ = true;
706     is_line_comment_ = false;
707     return &comment_buffer_;
708   }
709 
ClearBuffer()710   void ClearBuffer() {
711     comment_buffer_.clear();
712     has_comment_ = false;
713   }
714 
715   // Called once we know that the comment buffer is complete and is *not*
716   // connected to the next token.
Flush()717   void Flush() {
718     if (has_comment_) {
719       if (can_attach_to_prev_) {
720         if (prev_trailing_comments_ != NULL) {
721           prev_trailing_comments_->append(comment_buffer_);
722         }
723         can_attach_to_prev_ = false;
724       } else {
725         if (detached_comments_ != NULL) {
726           detached_comments_->push_back(comment_buffer_);
727         }
728       }
729       ClearBuffer();
730     }
731   }
732 
DetachFromPrev()733   void DetachFromPrev() {
734     can_attach_to_prev_ = false;
735   }
736 
737  private:
738   string* prev_trailing_comments_;
739   vector<string>* detached_comments_;
740   string* next_leading_comments_;
741 
742   string comment_buffer_;
743 
744   // True if any comments were read into comment_buffer_.  This can be true even
745   // if comment_buffer_ is empty, namely if the comment was "/**/".
746   bool has_comment_;
747 
748   // Is the comment in the comment buffer a line comment?
749   bool is_line_comment_;
750 
751   // Is it still possible that we could be reading a comment attached to the
752   // previous token?
753   bool can_attach_to_prev_;
754 };
755 
756 } // namespace
757 
NextWithComments(string * prev_trailing_comments,vector<string> * detached_comments,string * next_leading_comments)758 bool Tokenizer::NextWithComments(string* prev_trailing_comments,
759                                  vector<string>* detached_comments,
760                                  string* next_leading_comments) {
761   CommentCollector collector(prev_trailing_comments, detached_comments,
762                              next_leading_comments);
763 
764   if (current_.type == TYPE_START) {
765     collector.DetachFromPrev();
766   } else {
767     // A comment appearing on the same line must be attached to the previous
768     // declaration.
769     ConsumeZeroOrMore<WhitespaceNoNewline>();
770     switch (TryConsumeCommentStart()) {
771       case LINE_COMMENT:
772         ConsumeLineComment(collector.GetBufferForLineComment());
773 
774         // Don't allow comments on subsequent lines to be attached to a trailing
775         // comment.
776         collector.Flush();
777         break;
778       case BLOCK_COMMENT:
779         ConsumeBlockComment(collector.GetBufferForBlockComment());
780 
781         ConsumeZeroOrMore<WhitespaceNoNewline>();
782         if (!TryConsume('\n')) {
783           // Oops, the next token is on the same line.  If we recorded a comment
784           // we really have no idea which token it should be attached to.
785           collector.ClearBuffer();
786           return Next();
787         }
788 
789         // Don't allow comments on subsequent lines to be attached to a trailing
790         // comment.
791         collector.Flush();
792         break;
793       case SLASH_NOT_COMMENT:
794         return true;
795       case NO_COMMENT:
796         if (!TryConsume('\n')) {
797           // The next token is on the same line.  There are no comments.
798           return Next();
799         }
800         break;
801     }
802   }
803 
804   // OK, we are now on the line *after* the previous token.
805   while (true) {
806     ConsumeZeroOrMore<WhitespaceNoNewline>();
807 
808     switch (TryConsumeCommentStart()) {
809       case LINE_COMMENT:
810         ConsumeLineComment(collector.GetBufferForLineComment());
811         break;
812       case BLOCK_COMMENT:
813         ConsumeBlockComment(collector.GetBufferForBlockComment());
814 
815         // Consume the rest of the line so that we don't interpret it as a
816         // blank line the next time around the loop.
817         ConsumeZeroOrMore<WhitespaceNoNewline>();
818         TryConsume('\n');
819         break;
820       case SLASH_NOT_COMMENT:
821         return true;
822       case NO_COMMENT:
823         if (TryConsume('\n')) {
824           // Completely blank line.
825           collector.Flush();
826           collector.DetachFromPrev();
827         } else {
828           bool result = Next();
829           if (!result ||
830               current_.text == "}" ||
831               current_.text == "]" ||
832               current_.text == ")") {
833             // It looks like we're at the end of a scope.  In this case it
834             // makes no sense to attach a comment to the following token.
835             collector.Flush();
836           }
837           return result;
838         }
839         break;
840     }
841   }
842 }
843 
844 // -------------------------------------------------------------------
845 // Token-parsing helpers.  Remember that these don't need to report
846 // errors since any errors should already have been reported while
847 // tokenizing.  Also, these can assume that whatever text they
848 // are given is text that the tokenizer actually parsed as a token
849 // of the given type.
850 
ParseInteger(const string & text,uint64 max_value,uint64 * output)851 bool Tokenizer::ParseInteger(const string& text, uint64 max_value,
852                              uint64* output) {
853   // Sadly, we can't just use strtoul() since it is only 32-bit and strtoull()
854   // is non-standard.  I hate the C standard library.  :(
855 
856 //  return strtoull(text.c_str(), NULL, 0);
857 
858   const char* ptr = text.c_str();
859   int base = 10;
860   if (ptr[0] == '0') {
861     if (ptr[1] == 'x' || ptr[1] == 'X') {
862       // This is hex.
863       base = 16;
864       ptr += 2;
865     } else {
866       // This is octal.
867       base = 8;
868     }
869   }
870 
871   uint64 result = 0;
872   for (; *ptr != '\0'; ptr++) {
873     int digit = DigitValue(*ptr);
874     GOOGLE_LOG_IF(DFATAL, digit < 0 || digit >= base)
875       << " Tokenizer::ParseInteger() passed text that could not have been"
876          " tokenized as an integer: " << CEscape(text);
877     if (digit > max_value || result > (max_value - digit) / base) {
878       // Overflow.
879       return false;
880     }
881     result = result * base + digit;
882   }
883 
884   *output = result;
885   return true;
886 }
887 
ParseFloat(const string & text)888 double Tokenizer::ParseFloat(const string& text) {
889   const char* start = text.c_str();
890   char* end;
891   double result = NoLocaleStrtod(start, &end);
892 
893   // "1e" is not a valid float, but if the tokenizer reads it, it will
894   // report an error but still return it as a valid token.  We need to
895   // accept anything the tokenizer could possibly return, error or not.
896   if (*end == 'e' || *end == 'E') {
897     ++end;
898     if (*end == '-' || *end == '+') ++end;
899   }
900 
901   // If the Tokenizer had allow_f_after_float_ enabled, the float may be
902   // suffixed with the letter 'f'.
903   if (*end == 'f' || *end == 'F') {
904     ++end;
905   }
906 
907   GOOGLE_LOG_IF(DFATAL, end - start != text.size() || *start == '-')
908     << " Tokenizer::ParseFloat() passed text that could not have been"
909        " tokenized as a float: " << CEscape(text);
910   return result;
911 }
912 
913 // Helper to append a Unicode code point to a string as UTF8, without bringing
914 // in any external dependencies.
AppendUTF8(uint32 code_point,string * output)915 static void AppendUTF8(uint32 code_point, string* output) {
916   uint32 tmp = 0;
917   int len = 0;
918   if (code_point <= 0x7f) {
919     tmp = code_point;
920     len = 1;
921   } else if (code_point <= 0x07ff) {
922     tmp = 0x0000c080 |
923         ((code_point & 0x07c0) << 2) |
924         (code_point & 0x003f);
925     len = 2;
926   } else if (code_point <= 0xffff) {
927     tmp = 0x00e08080 |
928         ((code_point & 0xf000) << 4) |
929         ((code_point & 0x0fc0) << 2) |
930         (code_point & 0x003f);
931     len = 3;
932   } else if (code_point <= 0x1fffff) {
933     tmp = 0xf0808080 |
934         ((code_point & 0x1c0000) << 6) |
935         ((code_point & 0x03f000) << 4) |
936         ((code_point & 0x000fc0) << 2) |
937         (code_point & 0x003f);
938     len = 4;
939   } else {
940     // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
941     // normally only defined up to there as well.
942     StringAppendF(output, "\\U%08x", code_point);
943     return;
944   }
945   tmp = ghtonl(tmp);
946   output->append(reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
947 }
948 
949 // Try to read <len> hex digits from ptr, and stuff the numeric result into
950 // *result. Returns true if that many digits were successfully consumed.
ReadHexDigits(const char * ptr,int len,uint32 * result)951 static bool ReadHexDigits(const char* ptr, int len, uint32* result) {
952   *result = 0;
953   if (len == 0) return false;
954   for (const char* end = ptr + len; ptr < end; ++ptr) {
955     if (*ptr == '\0') return false;
956     *result = (*result << 4) + DigitValue(*ptr);
957   }
958   return true;
959 }
960 
961 // Handling UTF-16 surrogate pairs. UTF-16 encodes code points in the range
962 // 0x10000...0x10ffff as a pair of numbers, a head surrogate followed by a trail
963 // surrogate. These numbers are in a reserved range of Unicode code points, so
964 // if we encounter such a pair we know how to parse it and convert it into a
965 // single code point.
966 static const uint32 kMinHeadSurrogate = 0xd800;
967 static const uint32 kMaxHeadSurrogate = 0xdc00;
968 static const uint32 kMinTrailSurrogate = 0xdc00;
969 static const uint32 kMaxTrailSurrogate = 0xe000;
970 
IsHeadSurrogate(uint32 code_point)971 static inline bool IsHeadSurrogate(uint32 code_point) {
972   return (code_point >= kMinHeadSurrogate) && (code_point < kMaxHeadSurrogate);
973 }
974 
IsTrailSurrogate(uint32 code_point)975 static inline bool IsTrailSurrogate(uint32 code_point) {
976   return (code_point >= kMinTrailSurrogate) &&
977       (code_point < kMaxTrailSurrogate);
978 }
979 
980 // Combine a head and trail surrogate into a single Unicode code point.
AssembleUTF16(uint32 head_surrogate,uint32 trail_surrogate)981 static uint32 AssembleUTF16(uint32 head_surrogate, uint32 trail_surrogate) {
982   GOOGLE_DCHECK(IsHeadSurrogate(head_surrogate));
983   GOOGLE_DCHECK(IsTrailSurrogate(trail_surrogate));
984   return 0x10000 + (((head_surrogate - kMinHeadSurrogate) << 10) |
985       (trail_surrogate - kMinTrailSurrogate));
986 }
987 
988 // Convert the escape sequence parameter to a number of expected hex digits.
UnicodeLength(char key)989 static inline int UnicodeLength(char key) {
990   if (key == 'u') return 4;
991   if (key == 'U') return 8;
992   return 0;
993 }
994 
995 // Given a pointer to the 'u' or 'U' starting a Unicode escape sequence, attempt
996 // to parse that sequence. On success, returns a pointer to the first char
997 // beyond that sequence, and fills in *code_point. On failure, returns ptr
998 // itself.
FetchUnicodePoint(const char * ptr,uint32 * code_point)999 static const char* FetchUnicodePoint(const char* ptr, uint32* code_point) {
1000   const char* p = ptr;
1001   // Fetch the code point.
1002   const int len = UnicodeLength(*p++);
1003   if (!ReadHexDigits(p, len, code_point))
1004     return ptr;
1005   p += len;
1006 
1007   // Check if the code point we read is a "head surrogate." If so, then we
1008   // expect it to be immediately followed by another code point which is a valid
1009   // "trail surrogate," and together they form a UTF-16 pair which decodes into
1010   // a single Unicode point. Trail surrogates may only use \u, not \U.
1011   if (IsHeadSurrogate(*code_point) && *p == '\\' && *(p + 1) == 'u') {
1012     uint32 trail_surrogate;
1013     if (ReadHexDigits(p + 2, 4, &trail_surrogate) &&
1014         IsTrailSurrogate(trail_surrogate)) {
1015       *code_point = AssembleUTF16(*code_point, trail_surrogate);
1016       p += 6;
1017     }
1018     // If this failed, then we just emit the head surrogate as a code point.
1019     // It's bogus, but so is the string.
1020   }
1021 
1022   return p;
1023 }
1024 
1025 // The text string must begin and end with single or double quote
1026 // characters.
ParseStringAppend(const string & text,string * output)1027 void Tokenizer::ParseStringAppend(const string& text, string* output) {
1028   // Reminder: text[0] is always a quote character.  (If text is
1029   // empty, it's invalid, so we'll just return).
1030   const size_t text_size = text.size();
1031   if (text_size == 0) {
1032     GOOGLE_LOG(DFATAL)
1033       << " Tokenizer::ParseStringAppend() passed text that could not"
1034          " have been tokenized as a string: " << CEscape(text);
1035     return;
1036   }
1037 
1038   // Reserve room for new string. The branch is necessary because if
1039   // there is already space available the reserve() call might
1040   // downsize the output.
1041   const size_t new_len = text_size + output->size();
1042   if (new_len > output->capacity()) {
1043     output->reserve(new_len);
1044   }
1045 
1046   // Loop through the string copying characters to "output" and
1047   // interpreting escape sequences.  Note that any invalid escape
1048   // sequences or other errors were already reported while tokenizing.
1049   // In this case we do not need to produce valid results.
1050   for (const char* ptr = text.c_str() + 1; *ptr != '\0'; ptr++) {
1051     if (*ptr == '\\' && ptr[1] != '\0') {
1052       // An escape sequence.
1053       ++ptr;
1054 
1055       if (OctalDigit::InClass(*ptr)) {
1056         // An octal escape.  May one, two, or three digits.
1057         int code = DigitValue(*ptr);
1058         if (OctalDigit::InClass(ptr[1])) {
1059           ++ptr;
1060           code = code * 8 + DigitValue(*ptr);
1061         }
1062         if (OctalDigit::InClass(ptr[1])) {
1063           ++ptr;
1064           code = code * 8 + DigitValue(*ptr);
1065         }
1066         output->push_back(static_cast<char>(code));
1067 
1068       } else if (*ptr == 'x') {
1069         // A hex escape.  May zero, one, or two digits.  (The zero case
1070         // will have been caught as an error earlier.)
1071         int code = 0;
1072         if (HexDigit::InClass(ptr[1])) {
1073           ++ptr;
1074           code = DigitValue(*ptr);
1075         }
1076         if (HexDigit::InClass(ptr[1])) {
1077           ++ptr;
1078           code = code * 16 + DigitValue(*ptr);
1079         }
1080         output->push_back(static_cast<char>(code));
1081 
1082       } else if (*ptr == 'u' || *ptr == 'U') {
1083         uint32 unicode;
1084         const char* end = FetchUnicodePoint(ptr, &unicode);
1085         if (end == ptr) {
1086           // Failure: Just dump out what we saw, don't try to parse it.
1087           output->push_back(*ptr);
1088         } else {
1089           AppendUTF8(unicode, output);
1090           ptr = end - 1;  // Because we're about to ++ptr.
1091         }
1092       } else {
1093         // Some other escape code.
1094         output->push_back(TranslateEscape(*ptr));
1095       }
1096 
1097     } else if (*ptr == text[0] && ptr[1] == '\0') {
1098       // Ignore final quote matching the starting quote.
1099     } else {
1100       output->push_back(*ptr);
1101     }
1102   }
1103 }
1104 
1105 template<typename CharacterClass>
AllInClass(const string & s)1106 static bool AllInClass(const string& s) {
1107   for (int i = 0; i < s.size(); ++i) {
1108     if (!CharacterClass::InClass(s[i]))
1109       return false;
1110   }
1111   return true;
1112 }
1113 
IsIdentifier(const string & text)1114 bool Tokenizer::IsIdentifier(const string& text) {
1115   // Mirrors IDENTIFIER definition in Tokenizer::Next() above.
1116   if (text.size() == 0)
1117     return false;
1118   if (!Letter::InClass(text.at(0)))
1119     return false;
1120   if (!AllInClass<Alphanumeric>(text.substr(1)))
1121     return false;
1122   return true;
1123 }
1124 
1125 }  // namespace io
1126 }  // namespace protobuf
1127 }  // namespace google
1128