1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 // Author: kenton@google.com (Kenton Varda)
32 //  Based on original Protocol Buffers design by
33 //  Sanjay Ghemawat, Jeff Dean, and others.
34 //
35 // Here we have a hand-written lexer.  At first you might ask yourself,
36 // "Hand-written text processing?  Is Kenton crazy?!"  Well, first of all,
37 // yes I am crazy, but that's beside the point.  There are actually reasons
38 // why I ended up writing this this way.
39 //
40 // The traditional approach to lexing is to use lex to generate a lexer for
41 // you.  Unfortunately, lex's output is ridiculously ugly and difficult to
42 // integrate cleanly with C++ code, especially abstract code or code meant
43 // as a library.  Better parser-generators exist but would add dependencies
44 // which most users won't already have, which we'd like to avoid.  (GNU flex
45 // has a C++ output option, but it's still ridiculously ugly, non-abstract,
46 // and not library-friendly.)
47 //
48 // The next approach that any good software engineer should look at is to
49 // use regular expressions.  And, indeed, I did.  I have code which
50 // implements this same class using regular expressions.  It's about 200
51 // lines shorter.  However:
52 // - Rather than error messages telling you "This string has an invalid
53 //   escape sequence at line 5, column 45", you get error messages like
54 //   "Parse error on line 5".  Giving more precise errors requires adding
55 //   a lot of code that ends up basically as complex as the hand-coded
56 //   version anyway.
57 // - The regular expression to match a string literal looks like this:
58 //     kString  = new RE("(\"([^\"\\\\]|"              // non-escaped
59 //                       "\\\\[abfnrtv?\"'\\\\0-7]|"   // normal escape
60 //                       "\\\\x[0-9a-fA-F])*\"|"       // hex escape
61 //                       "\'([^\'\\\\]|"        // Also support single-quotes.
62 //                       "\\\\[abfnrtv?\"'\\\\0-7]|"
63 //                       "\\\\x[0-9a-fA-F])*\')");
64 //   Verifying the correctness of this line noise is actually harder than
65 //   verifying the correctness of ConsumeString(), defined below.  I'm not
66 //   even confident that the above is correct, after staring at it for some
67 //   time.
68 // - PCRE is fast, but there's still more overhead involved than the code
69 //   below.
70 // - Sadly, regular expressions are not part of the C standard library, so
71 //   using them would require depending on some other library.  For the
72 //   open source release, this could be really annoying.  Nobody likes
73 //   downloading one piece of software just to find that they need to
74 //   download something else to make it work, and in all likelihood
75 //   people downloading Protocol Buffers will already be doing so just
76 //   to make something else work.  We could include a copy of PCRE with
77 //   our code, but that obligates us to keep it up-to-date and just seems
78 //   like a big waste just to save 200 lines of code.
79 //
80 // On a similar but unrelated note, I'm even scared to use ctype.h.
81 // Apparently functions like isalpha() are locale-dependent.  So, if we used
82 // that, then if this code is being called from some program that doesn't
83 // have its locale set to "C", it would behave strangely.  We can't just set
84 // the locale to "C" ourselves since we might break the calling program that
85 // way, particularly if it is multi-threaded.  WTF?  Someone please let me
86 // (Kenton) know if I'm missing something here...
87 //
88 // I'd love to hear about other alternatives, though, as this code isn't
89 // exactly pretty.
90 
91 #include <google/protobuf/io/tokenizer.h>
92 #include <google/protobuf/stubs/common.h>
93 #include <google/protobuf/stubs/logging.h>
94 #include <google/protobuf/stubs/stringprintf.h>
95 #include <google/protobuf/io/strtod.h>
96 #include <google/protobuf/io/zero_copy_stream.h>
97 #include <google/protobuf/stubs/strutil.h>
98 #include <google/protobuf/stubs/stl_util.h>
99 
100 namespace google {
101 namespace protobuf {
102 namespace io {
103 namespace {
104 
105 // As mentioned above, I don't trust ctype.h due to the presence of "locales".
106 // So, I have written replacement functions here.  Someone please smack me if
107 // this is a bad idea or if there is some way around this.
108 //
109 // These "character classes" are designed to be used in template methods.
110 // For instance, Tokenizer::ConsumeZeroOrMore<Whitespace>() will eat
111 // whitespace.
112 
113 // Note:  No class is allowed to contain '\0', since this is used to mark end-
114 //   of-input and is handled specially.
115 
116 #define CHARACTER_CLASS(NAME, EXPRESSION)      \
117   class NAME {                                 \
118    public:                                     \
119     static inline bool InClass(char c) {       \
120       return EXPRESSION;                       \
121     }                                          \
122   }
123 
124 CHARACTER_CLASS(Whitespace, c == ' ' || c == '\n' || c == '\t' ||
125                             c == '\r' || c == '\v' || c == '\f');
126 CHARACTER_CLASS(WhitespaceNoNewline, c == ' ' || c == '\t' ||
127                                      c == '\r' || c == '\v' || c == '\f');
128 
129 CHARACTER_CLASS(Unprintable, c < ' ' && c > '\0');
130 
131 CHARACTER_CLASS(Digit, '0' <= c && c <= '9');
132 CHARACTER_CLASS(OctalDigit, '0' <= c && c <= '7');
133 CHARACTER_CLASS(HexDigit, ('0' <= c && c <= '9') ||
134                           ('a' <= c && c <= 'f') ||
135                           ('A' <= c && c <= 'F'));
136 
137 CHARACTER_CLASS(Letter, ('a' <= c && c <= 'z') ||
138                         ('A' <= c && c <= 'Z') ||
139                         (c == '_'));
140 
141 CHARACTER_CLASS(Alphanumeric, ('a' <= c && c <= 'z') ||
142                               ('A' <= c && c <= 'Z') ||
143                               ('0' <= c && c <= '9') ||
144                               (c == '_'));
145 
146 CHARACTER_CLASS(Escape, c == 'a' || c == 'b' || c == 'f' || c == 'n' ||
147                         c == 'r' || c == 't' || c == 'v' || c == '\\' ||
148                         c == '?' || c == '\'' || c == '\"');
149 
150 #undef CHARACTER_CLASS
151 
152 // Given a char, interpret it as a numeric digit and return its value.
153 // This supports any number base up to 36.
DigitValue(char digit)154 inline int DigitValue(char digit) {
155   if ('0' <= digit && digit <= '9') return digit - '0';
156   if ('a' <= digit && digit <= 'z') return digit - 'a' + 10;
157   if ('A' <= digit && digit <= 'Z') return digit - 'A' + 10;
158   return -1;
159 }
160 
161 // Inline because it's only used in one place.
TranslateEscape(char c)162 inline char TranslateEscape(char c) {
163   switch (c) {
164     case 'a':  return '\a';
165     case 'b':  return '\b';
166     case 'f':  return '\f';
167     case 'n':  return '\n';
168     case 'r':  return '\r';
169     case 't':  return '\t';
170     case 'v':  return '\v';
171     case '\\': return '\\';
172     case '?':  return '\?';    // Trigraphs = :(
173     case '\'': return '\'';
174     case '"':  return '\"';
175 
176     // We expect escape sequences to have been validated separately.
177     default:   return '?';
178   }
179 }
180 
181 }  // anonymous namespace
182 
~ErrorCollector()183 ErrorCollector::~ErrorCollector() {}
184 
185 // ===================================================================
186 
Tokenizer(ZeroCopyInputStream * input,ErrorCollector * error_collector)187 Tokenizer::Tokenizer(ZeroCopyInputStream* input,
188                      ErrorCollector* error_collector)
189   : input_(input),
190     error_collector_(error_collector),
191     buffer_(NULL),
192     buffer_size_(0),
193     buffer_pos_(0),
194     read_error_(false),
195     line_(0),
196     column_(0),
197     record_target_(NULL),
198     record_start_(-1),
199     allow_f_after_float_(false),
200     comment_style_(CPP_COMMENT_STYLE),
201     require_space_after_number_(true),
202     allow_multiline_strings_(false) {
203 
204   current_.line = 0;
205   current_.column = 0;
206   current_.end_column = 0;
207   current_.type = TYPE_START;
208 
209   Refresh();
210 }
211 
~Tokenizer()212 Tokenizer::~Tokenizer() {
213   // If we had any buffer left unread, return it to the underlying stream
214   // so that someone else can read it.
215   if (buffer_size_ > buffer_pos_) {
216     input_->BackUp(buffer_size_ - buffer_pos_);
217   }
218 }
219 
220 // -------------------------------------------------------------------
221 // Internal helpers.
222 
NextChar()223 void Tokenizer::NextChar() {
224   // Update our line and column counters based on the character being
225   // consumed.
226   if (current_char_ == '\n') {
227     ++line_;
228     column_ = 0;
229   } else if (current_char_ == '\t') {
230     column_ += kTabWidth - column_ % kTabWidth;
231   } else {
232     ++column_;
233   }
234 
235   // Advance to the next character.
236   ++buffer_pos_;
237   if (buffer_pos_ < buffer_size_) {
238     current_char_ = buffer_[buffer_pos_];
239   } else {
240     Refresh();
241   }
242 }
243 
Refresh()244 void Tokenizer::Refresh() {
245   if (read_error_) {
246     current_char_ = '\0';
247     return;
248   }
249 
250   // If we're in a token, append the rest of the buffer to it.
251   if (record_target_ != NULL && record_start_ < buffer_size_) {
252     record_target_->append(buffer_ + record_start_, buffer_size_ - record_start_);
253     record_start_ = 0;
254   }
255 
256   const void* data = NULL;
257   buffer_ = NULL;
258   buffer_pos_ = 0;
259   do {
260     if (!input_->Next(&data, &buffer_size_)) {
261       // end of stream (or read error)
262       buffer_size_ = 0;
263       read_error_ = true;
264       current_char_ = '\0';
265       return;
266     }
267   } while (buffer_size_ == 0);
268 
269   buffer_ = static_cast<const char*>(data);
270 
271   current_char_ = buffer_[0];
272 }
273 
RecordTo(string * target)274 inline void Tokenizer::RecordTo(string* target) {
275   record_target_ = target;
276   record_start_ = buffer_pos_;
277 }
278 
StopRecording()279 inline void Tokenizer::StopRecording() {
280   // Note:  The if() is necessary because some STL implementations crash when
281   //   you call string::append(NULL, 0), presumably because they are trying to
282   //   be helpful by detecting the NULL pointer, even though there's nothing
283   //   wrong with reading zero bytes from NULL.
284   if (buffer_pos_ != record_start_) {
285     record_target_->append(buffer_ + record_start_, buffer_pos_ - record_start_);
286   }
287   record_target_ = NULL;
288   record_start_ = -1;
289 }
290 
StartToken()291 inline void Tokenizer::StartToken() {
292   current_.type = TYPE_START;    // Just for the sake of initializing it.
293   current_.text.clear();
294   current_.line = line_;
295   current_.column = column_;
296   RecordTo(&current_.text);
297 }
298 
EndToken()299 inline void Tokenizer::EndToken() {
300   StopRecording();
301   current_.end_column = column_;
302 }
303 
304 // -------------------------------------------------------------------
305 // Helper methods that consume characters.
306 
307 template<typename CharacterClass>
LookingAt()308 inline bool Tokenizer::LookingAt() {
309   return CharacterClass::InClass(current_char_);
310 }
311 
312 template<typename CharacterClass>
TryConsumeOne()313 inline bool Tokenizer::TryConsumeOne() {
314   if (CharacterClass::InClass(current_char_)) {
315     NextChar();
316     return true;
317   } else {
318     return false;
319   }
320 }
321 
TryConsume(char c)322 inline bool Tokenizer::TryConsume(char c) {
323   if (current_char_ == c) {
324     NextChar();
325     return true;
326   } else {
327     return false;
328   }
329 }
330 
331 template<typename CharacterClass>
ConsumeZeroOrMore()332 inline void Tokenizer::ConsumeZeroOrMore() {
333   while (CharacterClass::InClass(current_char_)) {
334     NextChar();
335   }
336 }
337 
338 template<typename CharacterClass>
ConsumeOneOrMore(const char * error)339 inline void Tokenizer::ConsumeOneOrMore(const char* error) {
340   if (!CharacterClass::InClass(current_char_)) {
341     AddError(error);
342   } else {
343     do {
344       NextChar();
345     } while (CharacterClass::InClass(current_char_));
346   }
347 }
348 
349 // -------------------------------------------------------------------
350 // Methods that read whole patterns matching certain kinds of tokens
351 // or comments.
352 
ConsumeString(char delimiter)353 void Tokenizer::ConsumeString(char delimiter) {
354   while (true) {
355     switch (current_char_) {
356       case '\0':
357         AddError("Unexpected end of string.");
358         return;
359 
360       case '\n': {
361         if (!allow_multiline_strings_) {
362           AddError("String literals cannot cross line boundaries.");
363           return;
364         }
365         NextChar();
366         break;
367       }
368 
369       case '\\': {
370         // An escape sequence.
371         NextChar();
372         if (TryConsumeOne<Escape>()) {
373           // Valid escape sequence.
374         } else if (TryConsumeOne<OctalDigit>()) {
375           // Possibly followed by two more octal digits, but these will
376           // just be consumed by the main loop anyway so we don't need
377           // to do so explicitly here.
378         } else if (TryConsume('x')) {
379           if (!TryConsumeOne<HexDigit>()) {
380             AddError("Expected hex digits for escape sequence.");
381           }
382           // Possibly followed by another hex digit, but again we don't care.
383         } else if (TryConsume('u')) {
384           if (!TryConsumeOne<HexDigit>() ||
385               !TryConsumeOne<HexDigit>() ||
386               !TryConsumeOne<HexDigit>() ||
387               !TryConsumeOne<HexDigit>()) {
388             AddError("Expected four hex digits for \\u escape sequence.");
389           }
390         } else if (TryConsume('U')) {
391           // We expect 8 hex digits; but only the range up to 0x10ffff is
392           // legal.
393           if (!TryConsume('0') ||
394               !TryConsume('0') ||
395               !(TryConsume('0') || TryConsume('1')) ||
396               !TryConsumeOne<HexDigit>() ||
397               !TryConsumeOne<HexDigit>() ||
398               !TryConsumeOne<HexDigit>() ||
399               !TryConsumeOne<HexDigit>() ||
400               !TryConsumeOne<HexDigit>()) {
401             AddError("Expected eight hex digits up to 10ffff for \\U escape "
402                      "sequence");
403           }
404         } else {
405           AddError("Invalid escape sequence in string literal.");
406         }
407         break;
408       }
409 
410       default: {
411         if (current_char_ == delimiter) {
412           NextChar();
413           return;
414         }
415         NextChar();
416         break;
417       }
418     }
419   }
420 }
421 
ConsumeNumber(bool started_with_zero,bool started_with_dot)422 Tokenizer::TokenType Tokenizer::ConsumeNumber(bool started_with_zero,
423                                               bool started_with_dot) {
424   bool is_float = false;
425 
426   if (started_with_zero && (TryConsume('x') || TryConsume('X'))) {
427     // A hex number (started with "0x").
428     ConsumeOneOrMore<HexDigit>("\"0x\" must be followed by hex digits.");
429 
430   } else if (started_with_zero && LookingAt<Digit>()) {
431     // An octal number (had a leading zero).
432     ConsumeZeroOrMore<OctalDigit>();
433     if (LookingAt<Digit>()) {
434       AddError("Numbers starting with leading zero must be in octal.");
435       ConsumeZeroOrMore<Digit>();
436     }
437 
438   } else {
439     // A decimal number.
440     if (started_with_dot) {
441       is_float = true;
442       ConsumeZeroOrMore<Digit>();
443     } else {
444       ConsumeZeroOrMore<Digit>();
445 
446       if (TryConsume('.')) {
447         is_float = true;
448         ConsumeZeroOrMore<Digit>();
449       }
450     }
451 
452     if (TryConsume('e') || TryConsume('E')) {
453       is_float = true;
454       TryConsume('-') || TryConsume('+');
455       ConsumeOneOrMore<Digit>("\"e\" must be followed by exponent.");
456     }
457 
458     if (allow_f_after_float_ && (TryConsume('f') || TryConsume('F'))) {
459       is_float = true;
460     }
461   }
462 
463   if (LookingAt<Letter>() && require_space_after_number_) {
464     AddError("Need space between number and identifier.");
465   } else if (current_char_ == '.') {
466     if (is_float) {
467       AddError(
468         "Already saw decimal point or exponent; can't have another one.");
469     } else {
470       AddError("Hex and octal numbers must be integers.");
471     }
472   }
473 
474   return is_float ? TYPE_FLOAT : TYPE_INTEGER;
475 }
476 
ConsumeLineComment(string * content)477 void Tokenizer::ConsumeLineComment(string* content) {
478   if (content != NULL) RecordTo(content);
479 
480   while (current_char_ != '\0' && current_char_ != '\n') {
481     NextChar();
482   }
483   TryConsume('\n');
484 
485   if (content != NULL) StopRecording();
486 }
487 
ConsumeBlockComment(string * content)488 void Tokenizer::ConsumeBlockComment(string* content) {
489   int start_line = line_;
490   int start_column = column_ - 2;
491 
492   if (content != NULL) RecordTo(content);
493 
494   while (true) {
495     while (current_char_ != '\0' &&
496            current_char_ != '*' &&
497            current_char_ != '/' &&
498            current_char_ != '\n') {
499       NextChar();
500     }
501 
502     if (TryConsume('\n')) {
503       if (content != NULL) StopRecording();
504 
505       // Consume leading whitespace and asterisk;
506       ConsumeZeroOrMore<WhitespaceNoNewline>();
507       if (TryConsume('*')) {
508         if (TryConsume('/')) {
509           // End of comment.
510           break;
511         }
512       }
513 
514       if (content != NULL) RecordTo(content);
515     } else if (TryConsume('*') && TryConsume('/')) {
516       // End of comment.
517       if (content != NULL) {
518         StopRecording();
519         // Strip trailing "*/".
520         content->erase(content->size() - 2);
521       }
522       break;
523     } else if (TryConsume('/') && current_char_ == '*') {
524       // Note:  We didn't consume the '*' because if there is a '/' after it
525       //   we want to interpret that as the end of the comment.
526       AddError(
527         "\"/*\" inside block comment.  Block comments cannot be nested.");
528     } else if (current_char_ == '\0') {
529       AddError("End-of-file inside block comment.");
530       error_collector_->AddError(
531         start_line, start_column, "  Comment started here.");
532       if (content != NULL) StopRecording();
533       break;
534     }
535   }
536 }
537 
TryConsumeCommentStart()538 Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() {
539   if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) {
540     if (TryConsume('/')) {
541       return LINE_COMMENT;
542     } else if (TryConsume('*')) {
543       return BLOCK_COMMENT;
544     } else {
545       // Oops, it was just a slash.  Return it.
546       current_.type = TYPE_SYMBOL;
547       current_.text = "/";
548       current_.line = line_;
549       current_.column = column_ - 1;
550       current_.end_column = column_;
551       return SLASH_NOT_COMMENT;
552     }
553   } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) {
554     return LINE_COMMENT;
555   } else {
556     return NO_COMMENT;
557   }
558 }
559 
560 // -------------------------------------------------------------------
561 
Next()562 bool Tokenizer::Next() {
563   previous_ = current_;
564 
565   while (!read_error_) {
566     ConsumeZeroOrMore<Whitespace>();
567 
568     switch (TryConsumeCommentStart()) {
569       case LINE_COMMENT:
570         ConsumeLineComment(NULL);
571         continue;
572       case BLOCK_COMMENT:
573         ConsumeBlockComment(NULL);
574         continue;
575       case SLASH_NOT_COMMENT:
576         return true;
577       case NO_COMMENT:
578         break;
579     }
580 
581     // Check for EOF before continuing.
582     if (read_error_) break;
583 
584     if (LookingAt<Unprintable>() || current_char_ == '\0') {
585       AddError("Invalid control characters encountered in text.");
586       NextChar();
587       // Skip more unprintable characters, too.  But, remember that '\0' is
588       // also what current_char_ is set to after EOF / read error.  We have
589       // to be careful not to go into an infinite loop of trying to consume
590       // it, so make sure to check read_error_ explicitly before consuming
591       // '\0'.
592       while (TryConsumeOne<Unprintable>() ||
593              (!read_error_ && TryConsume('\0'))) {
594         // Ignore.
595       }
596 
597     } else {
598       // Reading some sort of token.
599       StartToken();
600 
601       if (TryConsumeOne<Letter>()) {
602         ConsumeZeroOrMore<Alphanumeric>();
603         current_.type = TYPE_IDENTIFIER;
604       } else if (TryConsume('0')) {
605         current_.type = ConsumeNumber(true, false);
606       } else if (TryConsume('.')) {
607         // This could be the beginning of a floating-point number, or it could
608         // just be a '.' symbol.
609 
610         if (TryConsumeOne<Digit>()) {
611           // It's a floating-point number.
612           if (previous_.type == TYPE_IDENTIFIER &&
613               current_.line == previous_.line &&
614               current_.column == previous_.end_column) {
615             // We don't accept syntax like "blah.123".
616             error_collector_->AddError(line_, column_ - 2,
617               "Need space between identifier and decimal point.");
618           }
619           current_.type = ConsumeNumber(false, true);
620         } else {
621           current_.type = TYPE_SYMBOL;
622         }
623       } else if (TryConsumeOne<Digit>()) {
624         current_.type = ConsumeNumber(false, false);
625       } else if (TryConsume('\"')) {
626         ConsumeString('\"');
627         current_.type = TYPE_STRING;
628       } else if (TryConsume('\'')) {
629         ConsumeString('\'');
630         current_.type = TYPE_STRING;
631       } else {
632         // Check if the high order bit is set.
633         if (current_char_ & 0x80) {
634           error_collector_->AddError(line_, column_,
635               StringPrintf("Interpreting non ascii codepoint %d.",
636                            static_cast<unsigned char>(current_char_)));
637         }
638         NextChar();
639         current_.type = TYPE_SYMBOL;
640       }
641 
642       EndToken();
643       return true;
644     }
645   }
646 
647   // EOF
648   current_.type = TYPE_END;
649   current_.text.clear();
650   current_.line = line_;
651   current_.column = column_;
652   current_.end_column = column_;
653   return false;
654 }
655 
656 namespace {
657 
658 // Helper class for collecting comments and putting them in the right places.
659 //
660 // This basically just buffers the most recent comment until it can be decided
661 // exactly where that comment should be placed.  When Flush() is called, the
662 // current comment goes into either prev_trailing_comments or detached_comments.
663 // When the CommentCollector is destroyed, the last buffered comment goes into
664 // next_leading_comments.
665 class CommentCollector {
666  public:
CommentCollector(string * prev_trailing_comments,vector<string> * detached_comments,string * next_leading_comments)667   CommentCollector(string* prev_trailing_comments,
668                    vector<string>* detached_comments,
669                    string* next_leading_comments)
670       : prev_trailing_comments_(prev_trailing_comments),
671         detached_comments_(detached_comments),
672         next_leading_comments_(next_leading_comments),
673         has_comment_(false),
674         is_line_comment_(false),
675         can_attach_to_prev_(true) {
676     if (prev_trailing_comments != NULL) prev_trailing_comments->clear();
677     if (detached_comments != NULL) detached_comments->clear();
678     if (next_leading_comments != NULL) next_leading_comments->clear();
679   }
680 
~CommentCollector()681   ~CommentCollector() {
682     // Whatever is in the buffer is a leading comment.
683     if (next_leading_comments_ != NULL && has_comment_) {
684       comment_buffer_.swap(*next_leading_comments_);
685     }
686   }
687 
688   // About to read a line comment.  Get the comment buffer pointer in order to
689   // read into it.
GetBufferForLineComment()690   string* GetBufferForLineComment() {
691     // We want to combine with previous line comments, but not block comments.
692     if (has_comment_ && !is_line_comment_) {
693       Flush();
694     }
695     has_comment_ = true;
696     is_line_comment_ = true;
697     return &comment_buffer_;
698   }
699 
700   // About to read a block comment.  Get the comment buffer pointer in order to
701   // read into it.
GetBufferForBlockComment()702   string* GetBufferForBlockComment() {
703     if (has_comment_) {
704       Flush();
705     }
706     has_comment_ = true;
707     is_line_comment_ = false;
708     return &comment_buffer_;
709   }
710 
ClearBuffer()711   void ClearBuffer() {
712     comment_buffer_.clear();
713     has_comment_ = false;
714   }
715 
716   // Called once we know that the comment buffer is complete and is *not*
717   // connected to the next token.
Flush()718   void Flush() {
719     if (has_comment_) {
720       if (can_attach_to_prev_) {
721         if (prev_trailing_comments_ != NULL) {
722           prev_trailing_comments_->append(comment_buffer_);
723         }
724         can_attach_to_prev_ = false;
725       } else {
726         if (detached_comments_ != NULL) {
727           detached_comments_->push_back(comment_buffer_);
728         }
729       }
730       ClearBuffer();
731     }
732   }
733 
DetachFromPrev()734   void DetachFromPrev() {
735     can_attach_to_prev_ = false;
736   }
737 
738  private:
739   string* prev_trailing_comments_;
740   vector<string>* detached_comments_;
741   string* next_leading_comments_;
742 
743   string comment_buffer_;
744 
745   // True if any comments were read into comment_buffer_.  This can be true even
746   // if comment_buffer_ is empty, namely if the comment was "/**/".
747   bool has_comment_;
748 
749   // Is the comment in the comment buffer a line comment?
750   bool is_line_comment_;
751 
752   // Is it still possible that we could be reading a comment attached to the
753   // previous token?
754   bool can_attach_to_prev_;
755 };
756 
757 } // namespace
758 
NextWithComments(string * prev_trailing_comments,vector<string> * detached_comments,string * next_leading_comments)759 bool Tokenizer::NextWithComments(string* prev_trailing_comments,
760                                  vector<string>* detached_comments,
761                                  string* next_leading_comments) {
762   CommentCollector collector(prev_trailing_comments, detached_comments,
763                              next_leading_comments);
764 
765   if (current_.type == TYPE_START) {
766     // Ignore unicode byte order mark(BOM) if it appears at the file
767     // beginning. Only UTF-8 BOM (0xEF 0xBB 0xBF) is accepted.
768     if (TryConsume((char)0xEF)) {
769       if (!TryConsume((char)0xBB) || !TryConsume((char)0xBF)) {
770         AddError("Proto file starts with 0xEF but not UTF-8 BOM. "
771                  "Only UTF-8 is accepted for proto file.");
772         return false;
773       }
774     }
775     collector.DetachFromPrev();
776   } else {
777     // A comment appearing on the same line must be attached to the previous
778     // declaration.
779     ConsumeZeroOrMore<WhitespaceNoNewline>();
780     switch (TryConsumeCommentStart()) {
781       case LINE_COMMENT:
782         ConsumeLineComment(collector.GetBufferForLineComment());
783 
784         // Don't allow comments on subsequent lines to be attached to a trailing
785         // comment.
786         collector.Flush();
787         break;
788       case BLOCK_COMMENT:
789         ConsumeBlockComment(collector.GetBufferForBlockComment());
790 
791         ConsumeZeroOrMore<WhitespaceNoNewline>();
792         if (!TryConsume('\n')) {
793           // Oops, the next token is on the same line.  If we recorded a comment
794           // we really have no idea which token it should be attached to.
795           collector.ClearBuffer();
796           return Next();
797         }
798 
799         // Don't allow comments on subsequent lines to be attached to a trailing
800         // comment.
801         collector.Flush();
802         break;
803       case SLASH_NOT_COMMENT:
804         return true;
805       case NO_COMMENT:
806         if (!TryConsume('\n')) {
807           // The next token is on the same line.  There are no comments.
808           return Next();
809         }
810         break;
811     }
812   }
813 
814   // OK, we are now on the line *after* the previous token.
815   while (true) {
816     ConsumeZeroOrMore<WhitespaceNoNewline>();
817 
818     switch (TryConsumeCommentStart()) {
819       case LINE_COMMENT:
820         ConsumeLineComment(collector.GetBufferForLineComment());
821         break;
822       case BLOCK_COMMENT:
823         ConsumeBlockComment(collector.GetBufferForBlockComment());
824 
825         // Consume the rest of the line so that we don't interpret it as a
826         // blank line the next time around the loop.
827         ConsumeZeroOrMore<WhitespaceNoNewline>();
828         TryConsume('\n');
829         break;
830       case SLASH_NOT_COMMENT:
831         return true;
832       case NO_COMMENT:
833         if (TryConsume('\n')) {
834           // Completely blank line.
835           collector.Flush();
836           collector.DetachFromPrev();
837         } else {
838           bool result = Next();
839           if (!result ||
840               current_.text == "}" ||
841               current_.text == "]" ||
842               current_.text == ")") {
843             // It looks like we're at the end of a scope.  In this case it
844             // makes no sense to attach a comment to the following token.
845             collector.Flush();
846           }
847           return result;
848         }
849         break;
850     }
851   }
852 }
853 
854 // -------------------------------------------------------------------
855 // Token-parsing helpers.  Remember that these don't need to report
856 // errors since any errors should already have been reported while
857 // tokenizing.  Also, these can assume that whatever text they
858 // are given is text that the tokenizer actually parsed as a token
859 // of the given type.
860 
ParseInteger(const string & text,uint64 max_value,uint64 * output)861 bool Tokenizer::ParseInteger(const string& text, uint64 max_value,
862                              uint64* output) {
863   // Sadly, we can't just use strtoul() since it is only 32-bit and strtoull()
864   // is non-standard.  I hate the C standard library.  :(
865 
866 //  return strtoull(text.c_str(), NULL, 0);
867 
868   const char* ptr = text.c_str();
869   int base = 10;
870   if (ptr[0] == '0') {
871     if (ptr[1] == 'x' || ptr[1] == 'X') {
872       // This is hex.
873       base = 16;
874       ptr += 2;
875     } else {
876       // This is octal.
877       base = 8;
878     }
879   }
880 
881   uint64 result = 0;
882   for (; *ptr != '\0'; ptr++) {
883     int digit = DigitValue(*ptr);
884     if (digit < 0 || digit >= base) {
885       // The token provided by Tokenizer is invalid. i.e., 099 is an invalid
886       // token, but Tokenizer still think it's integer.
887       return false;
888     }
889     if (digit > max_value || result > (max_value - digit) / base) {
890       // Overflow.
891       return false;
892     }
893     result = result * base + digit;
894   }
895 
896   *output = result;
897   return true;
898 }
899 
ParseFloat(const string & text)900 double Tokenizer::ParseFloat(const string& text) {
901   const char* start = text.c_str();
902   char* end;
903   double result = NoLocaleStrtod(start, &end);
904 
905   // "1e" is not a valid float, but if the tokenizer reads it, it will
906   // report an error but still return it as a valid token.  We need to
907   // accept anything the tokenizer could possibly return, error or not.
908   if (*end == 'e' || *end == 'E') {
909     ++end;
910     if (*end == '-' || *end == '+') ++end;
911   }
912 
913   // If the Tokenizer had allow_f_after_float_ enabled, the float may be
914   // suffixed with the letter 'f'.
915   if (*end == 'f' || *end == 'F') {
916     ++end;
917   }
918 
919   GOOGLE_LOG_IF(DFATAL, end - start != text.size() || *start == '-')
920     << " Tokenizer::ParseFloat() passed text that could not have been"
921        " tokenized as a float: " << CEscape(text);
922   return result;
923 }
924 
925 // Helper to append a Unicode code point to a string as UTF8, without bringing
926 // in any external dependencies.
AppendUTF8(uint32 code_point,string * output)927 static void AppendUTF8(uint32 code_point, string* output) {
928   uint32 tmp = 0;
929   int len = 0;
930   if (code_point <= 0x7f) {
931     tmp = code_point;
932     len = 1;
933   } else if (code_point <= 0x07ff) {
934     tmp = 0x0000c080 |
935         ((code_point & 0x07c0) << 2) |
936         (code_point & 0x003f);
937     len = 2;
938   } else if (code_point <= 0xffff) {
939     tmp = 0x00e08080 |
940         ((code_point & 0xf000) << 4) |
941         ((code_point & 0x0fc0) << 2) |
942         (code_point & 0x003f);
943     len = 3;
944   } else if (code_point <= 0x1fffff) {
945     tmp = 0xf0808080 |
946         ((code_point & 0x1c0000) << 6) |
947         ((code_point & 0x03f000) << 4) |
948         ((code_point & 0x000fc0) << 2) |
949         (code_point & 0x003f);
950     len = 4;
951   } else {
952     // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
953     // normally only defined up to there as well.
954     StringAppendF(output, "\\U%08x", code_point);
955     return;
956   }
957   tmp = ghtonl(tmp);
958   output->append(reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
959 }
960 
961 // Try to read <len> hex digits from ptr, and stuff the numeric result into
962 // *result. Returns true if that many digits were successfully consumed.
ReadHexDigits(const char * ptr,int len,uint32 * result)963 static bool ReadHexDigits(const char* ptr, int len, uint32* result) {
964   *result = 0;
965   if (len == 0) return false;
966   for (const char* end = ptr + len; ptr < end; ++ptr) {
967     if (*ptr == '\0') return false;
968     *result = (*result << 4) + DigitValue(*ptr);
969   }
970   return true;
971 }
972 
973 // Handling UTF-16 surrogate pairs. UTF-16 encodes code points in the range
974 // 0x10000...0x10ffff as a pair of numbers, a head surrogate followed by a trail
975 // surrogate. These numbers are in a reserved range of Unicode code points, so
976 // if we encounter such a pair we know how to parse it and convert it into a
977 // single code point.
978 static const uint32 kMinHeadSurrogate = 0xd800;
979 static const uint32 kMaxHeadSurrogate = 0xdc00;
980 static const uint32 kMinTrailSurrogate = 0xdc00;
981 static const uint32 kMaxTrailSurrogate = 0xe000;
982 
IsHeadSurrogate(uint32 code_point)983 static inline bool IsHeadSurrogate(uint32 code_point) {
984   return (code_point >= kMinHeadSurrogate) && (code_point < kMaxHeadSurrogate);
985 }
986 
IsTrailSurrogate(uint32 code_point)987 static inline bool IsTrailSurrogate(uint32 code_point) {
988   return (code_point >= kMinTrailSurrogate) &&
989       (code_point < kMaxTrailSurrogate);
990 }
991 
992 // Combine a head and trail surrogate into a single Unicode code point.
AssembleUTF16(uint32 head_surrogate,uint32 trail_surrogate)993 static uint32 AssembleUTF16(uint32 head_surrogate, uint32 trail_surrogate) {
994   GOOGLE_DCHECK(IsHeadSurrogate(head_surrogate));
995   GOOGLE_DCHECK(IsTrailSurrogate(trail_surrogate));
996   return 0x10000 + (((head_surrogate - kMinHeadSurrogate) << 10) |
997       (trail_surrogate - kMinTrailSurrogate));
998 }
999 
1000 // Convert the escape sequence parameter to a number of expected hex digits.
UnicodeLength(char key)1001 static inline int UnicodeLength(char key) {
1002   if (key == 'u') return 4;
1003   if (key == 'U') return 8;
1004   return 0;
1005 }
1006 
1007 // Given a pointer to the 'u' or 'U' starting a Unicode escape sequence, attempt
1008 // to parse that sequence. On success, returns a pointer to the first char
1009 // beyond that sequence, and fills in *code_point. On failure, returns ptr
1010 // itself.
FetchUnicodePoint(const char * ptr,uint32 * code_point)1011 static const char* FetchUnicodePoint(const char* ptr, uint32* code_point) {
1012   const char* p = ptr;
1013   // Fetch the code point.
1014   const int len = UnicodeLength(*p++);
1015   if (!ReadHexDigits(p, len, code_point))
1016     return ptr;
1017   p += len;
1018 
1019   // Check if the code point we read is a "head surrogate." If so, then we
1020   // expect it to be immediately followed by another code point which is a valid
1021   // "trail surrogate," and together they form a UTF-16 pair which decodes into
1022   // a single Unicode point. Trail surrogates may only use \u, not \U.
1023   if (IsHeadSurrogate(*code_point) && *p == '\\' && *(p + 1) == 'u') {
1024     uint32 trail_surrogate;
1025     if (ReadHexDigits(p + 2, 4, &trail_surrogate) &&
1026         IsTrailSurrogate(trail_surrogate)) {
1027       *code_point = AssembleUTF16(*code_point, trail_surrogate);
1028       p += 6;
1029     }
1030     // If this failed, then we just emit the head surrogate as a code point.
1031     // It's bogus, but so is the string.
1032   }
1033 
1034   return p;
1035 }
1036 
1037 // The text string must begin and end with single or double quote
1038 // characters.
ParseStringAppend(const string & text,string * output)1039 void Tokenizer::ParseStringAppend(const string& text, string* output) {
1040   // Reminder: text[0] is always a quote character.  (If text is
1041   // empty, it's invalid, so we'll just return).
1042   const size_t text_size = text.size();
1043   if (text_size == 0) {
1044     GOOGLE_LOG(DFATAL)
1045       << " Tokenizer::ParseStringAppend() passed text that could not"
1046          " have been tokenized as a string: " << CEscape(text);
1047     return;
1048   }
1049 
1050   // Reserve room for new string. The branch is necessary because if
1051   // there is already space available the reserve() call might
1052   // downsize the output.
1053   const size_t new_len = text_size + output->size();
1054   if (new_len > output->capacity()) {
1055     output->reserve(new_len);
1056   }
1057 
1058   // Loop through the string copying characters to "output" and
1059   // interpreting escape sequences.  Note that any invalid escape
1060   // sequences or other errors were already reported while tokenizing.
1061   // In this case we do not need to produce valid results.
1062   for (const char* ptr = text.c_str() + 1; *ptr != '\0'; ptr++) {
1063     if (*ptr == '\\' && ptr[1] != '\0') {
1064       // An escape sequence.
1065       ++ptr;
1066 
1067       if (OctalDigit::InClass(*ptr)) {
1068         // An octal escape.  May one, two, or three digits.
1069         int code = DigitValue(*ptr);
1070         if (OctalDigit::InClass(ptr[1])) {
1071           ++ptr;
1072           code = code * 8 + DigitValue(*ptr);
1073         }
1074         if (OctalDigit::InClass(ptr[1])) {
1075           ++ptr;
1076           code = code * 8 + DigitValue(*ptr);
1077         }
1078         output->push_back(static_cast<char>(code));
1079 
1080       } else if (*ptr == 'x') {
1081         // A hex escape.  May zero, one, or two digits.  (The zero case
1082         // will have been caught as an error earlier.)
1083         int code = 0;
1084         if (HexDigit::InClass(ptr[1])) {
1085           ++ptr;
1086           code = DigitValue(*ptr);
1087         }
1088         if (HexDigit::InClass(ptr[1])) {
1089           ++ptr;
1090           code = code * 16 + DigitValue(*ptr);
1091         }
1092         output->push_back(static_cast<char>(code));
1093 
1094       } else if (*ptr == 'u' || *ptr == 'U') {
1095         uint32 unicode;
1096         const char* end = FetchUnicodePoint(ptr, &unicode);
1097         if (end == ptr) {
1098           // Failure: Just dump out what we saw, don't try to parse it.
1099           output->push_back(*ptr);
1100         } else {
1101           AppendUTF8(unicode, output);
1102           ptr = end - 1;  // Because we're about to ++ptr.
1103         }
1104       } else {
1105         // Some other escape code.
1106         output->push_back(TranslateEscape(*ptr));
1107       }
1108 
1109     } else if (*ptr == text[0] && ptr[1] == '\0') {
1110       // Ignore final quote matching the starting quote.
1111     } else {
1112       output->push_back(*ptr);
1113     }
1114   }
1115 }
1116 
1117 template<typename CharacterClass>
AllInClass(const string & s)1118 static bool AllInClass(const string& s) {
1119   for (int i = 0; i < s.size(); ++i) {
1120     if (!CharacterClass::InClass(s[i]))
1121       return false;
1122   }
1123   return true;
1124 }
1125 
IsIdentifier(const string & text)1126 bool Tokenizer::IsIdentifier(const string& text) {
1127   // Mirrors IDENTIFIER definition in Tokenizer::Next() above.
1128   if (text.size() == 0)
1129     return false;
1130   if (!Letter::InClass(text.at(0)))
1131     return false;
1132   if (!AllInClass<Alphanumeric>(text.substr(1)))
1133     return false;
1134   return true;
1135 }
1136 
1137 }  // namespace io
1138 }  // namespace protobuf
1139 }  // namespace google
1140