1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 // Author: kenton@google.com (Kenton Varda)
32 //  Based on original Protocol Buffers design by
33 //  Sanjay Ghemawat, Jeff Dean, and others.
34 
35 #include <limits.h>
36 #include <math.h>
37 
38 #include <vector>
39 
40 #include <google/protobuf/io/tokenizer.h>
41 #include <google/protobuf/io/zero_copy_stream_impl.h>
42 
43 #include <google/protobuf/stubs/common.h>
44 #include <google/protobuf/stubs/logging.h>
45 #include <google/protobuf/stubs/strutil.h>
46 #include <google/protobuf/stubs/substitute.h>
47 #include <google/protobuf/testing/googletest.h>
48 #include <gtest/gtest.h>
49 
50 namespace google {
51 namespace protobuf {
52 namespace io {
53 namespace {
54 
55 // ===================================================================
56 // Data-Driven Test Infrastructure
57 
58 // TODO(kenton):  This is copied from coded_stream_unittest.  This is
//   temporary until these features are integrated into gTest itself.
60 
61 // TEST_1D and TEST_2D are macros I'd eventually like to see added to
62 // gTest.  These macros can be used to declare tests which should be
63 // run multiple times, once for each item in some input array.  TEST_1D
64 // tests all cases in a single input array.  TEST_2D tests all
65 // combinations of cases from two arrays.  The arrays must be statically
66 // defined such that the GOOGLE_ARRAYSIZE() macro works on them.  Example:
67 //
// int kCases[] = {1, 2, 3, 4};
69 // TEST_1D(MyFixture, MyTest, kCases) {
70 //   EXPECT_GT(kCases_case, 0);
71 // }
72 //
73 // This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero.  In case of failure, the exact case
75 // which failed will be printed.  The case type must be printable using
76 // ostream::operator<<.
77 
// TEST_1D(FIXTURE, NAME, CASES) expands to:
//   * a class FIXTURE_NAME_DD deriving from FIXTURE that declares a templated
//     DoSingleCase() member,
//   * a TEST_F on that class which loops over every element of CASES, sets a
//     SCOPED_TRACE naming the case index and value, and calls DoSingleCase()
//     with the element,
//   * the out-of-line header of DoSingleCase(), so that the braces written
//     after the macro invocation become its body.  Inside that body the
//     current element is available as `<CASES>_case`.
#define TEST_1D(FIXTURE, NAME, CASES)                                      \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType>                                           \
    void DoSingleCase(const CaseType& CASES##_case);                       \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) {                           \
      SCOPED_TRACE(testing::Message()                                      \
        << #CASES " case #" << i << ": " << CASES[i]);                     \
      DoSingleCase(CASES[i]);                                              \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType>                                             \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)
95 
// TEST_2D(FIXTURE, NAME, CASES1, CASES2) is the two-array form of TEST_1D.
// The generated TEST_F runs DoSingleCase() once for every (CASES1[i],
// CASES2[j]) pair, with CASES1 as the outer loop, and sets a SCOPED_TRACE
// that names both indices and both values.  The braces written after the
// macro invocation become the body of DoSingleCase(), where the current
// elements are available as `<CASES1>_case` and `<CASES2>_case`.
#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                             \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType1, typename CaseType2>                      \
    void DoSingleCase(const CaseType1& CASES1##_case,                      \
                      const CaseType2& CASES2##_case);                     \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) {                          \
      for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) {                        \
        SCOPED_TRACE(testing::Message()                                    \
          << #CASES1 " case #" << i << ": " << CASES1[i] << ", "           \
          << #CASES2 " case #" << j << ": " << CASES2[j]);                 \
        DoSingleCase(CASES1[i], CASES2[j]);                                \
      }                                                                    \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType1, typename CaseType2>                        \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case, \
                                           const CaseType2& CASES2##_case)
118 
119 // -------------------------------------------------------------------
120 
121 // An input stream that is basically like an ArrayInputStream but sometimes
122 // returns empty buffers, just to throw us off.
123 class TestInputStream : public ZeroCopyInputStream {
124  public:
TestInputStream(const void * data,int size,int block_size)125   TestInputStream(const void* data, int size, int block_size)
126     : array_stream_(data, size, block_size), counter_(0) {}
~TestInputStream()127   ~TestInputStream() {}
128 
129   // implements ZeroCopyInputStream ----------------------------------
Next(const void ** data,int * size)130   bool Next(const void** data, int* size) {
131     // We'll return empty buffers starting with the first buffer, and every
132     // 3 and 5 buffers after that.
133     if (counter_ % 3 == 0 || counter_ % 5 == 0) {
134       *data = NULL;
135       *size = 0;
136       ++counter_;
137       return true;
138     } else {
139       ++counter_;
140       return array_stream_.Next(data, size);
141     }
142   }
143 
BackUp(int count)144   void BackUp(int count)  { return array_stream_.BackUp(count); }
Skip(int count)145   bool Skip(int count)    { return array_stream_.Skip(count);   }
ByteCount() const146   int64 ByteCount() const { return array_stream_.ByteCount();   }
147 
148  private:
149   ArrayInputStream array_stream_;
150   int counter_;
151 };
152 
153 // -------------------------------------------------------------------
154 
155 // An error collector which simply concatenates all its errors into a big
156 // block of text which can be checked.
157 class TestErrorCollector : public ErrorCollector {
158  public:
TestErrorCollector()159   TestErrorCollector() {}
~TestErrorCollector()160   ~TestErrorCollector() {}
161 
162   string text_;
163 
164   // implements ErrorCollector ---------------------------------------
AddError(int line,int column,const string & message)165   void AddError(int line, int column, const string& message) {
166     strings::SubstituteAndAppend(&text_, "$0:$1: $2\n",
167                                  line, column, message);
168   }
169 };
170 
171 // -------------------------------------------------------------------
172 
// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't.  This is sort of a brute-force approach to this,
// but it's easy to write and easy to understand.
// Each value is passed as the block_size argument of TestInputStream.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};
178 
179 class TokenizerTest : public testing::Test {
180  protected:
181   // For easy testing.
ParseInteger(const string & text)182   uint64 ParseInteger(const string& text) {
183     uint64 result;
184     EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
185     return result;
186   }
187 };
188 
189 // ===================================================================
190 
// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
192 //   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
193 #if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
194 
195 // In each test case, the entire input text should parse as a single token
196 // of the given type.
// One row of the SimpleTokens test table.  Rows are written as aggregate
// initializers, so the member order below must not change.
struct SimpleTokenCase {
  string input;               // Complete text handed to the tokenizer.
  Tokenizer::TokenType type;  // Type expected for the single resulting token.
};
201 
operator <<(ostream & out,const SimpleTokenCase & test_case)202 inline ostream& operator<<(ostream& out,
203                            const SimpleTokenCase& test_case) {
204   return out << CEscape(test_case.input);
205 }
206 
207 SimpleTokenCase kSimpleTokenCases[] = {
208   // Test identifiers.
209   { "hello",       Tokenizer::TYPE_IDENTIFIER },
210 
211   // Test integers.
212   { "123",         Tokenizer::TYPE_INTEGER },
213   { "0xab6",       Tokenizer::TYPE_INTEGER },
214   { "0XAB6",       Tokenizer::TYPE_INTEGER },
215   { "0X1234567",   Tokenizer::TYPE_INTEGER },
216   { "0x89abcdef",  Tokenizer::TYPE_INTEGER },
217   { "0x89ABCDEF",  Tokenizer::TYPE_INTEGER },
218   { "01234567",    Tokenizer::TYPE_INTEGER },
219 
220   // Test floats.
221   { "123.45",      Tokenizer::TYPE_FLOAT },
222   { "1.",          Tokenizer::TYPE_FLOAT },
223   { "1e3",         Tokenizer::TYPE_FLOAT },
224   { "1E3",         Tokenizer::TYPE_FLOAT },
225   { "1e-3",        Tokenizer::TYPE_FLOAT },
226   { "1e+3",        Tokenizer::TYPE_FLOAT },
227   { "1.e3",        Tokenizer::TYPE_FLOAT },
228   { "1.2e3",       Tokenizer::TYPE_FLOAT },
229   { ".1",          Tokenizer::TYPE_FLOAT },
230   { ".1e3",        Tokenizer::TYPE_FLOAT },
231   { ".1e-3",       Tokenizer::TYPE_FLOAT },
232   { ".1e+3",       Tokenizer::TYPE_FLOAT },
233 
234   // Test strings.
235   { "'hello'",     Tokenizer::TYPE_STRING },
236   { "\"foo\"",     Tokenizer::TYPE_STRING },
237   { "'a\"b'",      Tokenizer::TYPE_STRING },
238   { "\"a'b\"",     Tokenizer::TYPE_STRING },
239   { "'a\\'b'",     Tokenizer::TYPE_STRING },
240   { "\"a\\\"b\"",  Tokenizer::TYPE_STRING },
241   { "'\\xf'",      Tokenizer::TYPE_STRING },
242   { "'\\0'",       Tokenizer::TYPE_STRING },
243 
244   // Test symbols.
245   { "+",           Tokenizer::TYPE_SYMBOL },
246   { ".",           Tokenizer::TYPE_SYMBOL },
247 };
248 
TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  // The whole input is expected to come back as exactly one token.
  const string& source_text = kSimpleTokenCases_case.input;
  const string::size_type source_length = source_text.size();

  // Build a tokenizer over the input, chunked by the current block size.
  TestInputStream input(source_text.data(), source_length, kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Until Next() has been called the current token is an empty TYPE_START
  // located at the origin.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Read the one and only token.
  ASSERT_TRUE(tokenizer.Next());

  // It must carry the expected type ...
  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
  // ... hold the entire input text ...
  EXPECT_EQ(source_text, tokenizer.current().text);
  // ... and span the input from its first column to its last.
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(source_length, tokenizer.current().end_column);

  // Nothing may follow it.
  EXPECT_FALSE(tokenizer.Next());

  // Once Next() has returned false the current token is an empty TYPE_END
  // positioned just past the input.
  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(source_length, tokenizer.current().column);
  EXPECT_EQ(source_length, tokenizer.current().end_column);

  // No errors may have been reported along the way.
  EXPECT_TRUE(error_collector.text_.empty());
}
291 
TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Exercises the "allow_f_after_float" option: with it enabled, each of the
  // space-separated words below must come back as one TYPE_FLOAT token.
  const char* text = "1f 2.5f 6e3f 7F";
  const char* const kExpectedFloats[] = {"1f", "2.5f", "6e3f", "7F"};

  // Build the tokenizer and switch the option on.
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_allow_f_after_float(true);

  // Walk the tokens, checking text and type of each.  A missing token aborts
  // the case.
  for (int k = 0; k < GOOGLE_ARRAYSIZE(kExpectedFloats); k++) {
    ASSERT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, kExpectedFloats[k]);
    EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  }

  // The input must now be exhausted ...
  EXPECT_FALSE(tokenizer.Next());
  // ... and no errors may have been reported.
  EXPECT_TRUE(error_collector.text_.empty());
}
321 
322 #endif
323 
324 // -------------------------------------------------------------------
325 
326 // In each case, the input is parsed to produce a list of tokens.  The
327 // last token in "output" must have type TYPE_END.
// One row of the MultipleTokens test table.  Rows are written as aggregate
// initializers, so the member order below must not change.
struct MultiTokenCase {
  string input;                 // Complete text handed to the tokenizer.
  // Expected tokens, in order.  The test stops at the first TYPE_END entry.
  Tokenizer::Token output[10];  // The compiler wants a constant array
                                // size for initialization to work.  There
                                // is no reason this can't be increased if
                                // needed.
};
335 
operator <<(ostream & out,const MultiTokenCase & test_case)336 inline ostream& operator<<(ostream& out,
337                            const MultiTokenCase& test_case) {
338   return out << CEscape(test_case.input);
339 }
340 
341 MultiTokenCase kMultiTokenCases[] = {
342   // Test empty input.
343   { "", {
344     { Tokenizer::TYPE_END       , ""     , 0,  0 },
345   }},
346 
347   // Test all token types at the same time.
348   { "foo 1 1.2 + 'bar'", {
349     { Tokenizer::TYPE_IDENTIFIER, "foo"  , 0,  0,  3 },
350     { Tokenizer::TYPE_INTEGER   , "1"    , 0,  4,  5 },
351     { Tokenizer::TYPE_FLOAT     , "1.2"  , 0,  6,  9 },
352     { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 10, 11 },
353     { Tokenizer::TYPE_STRING    , "'bar'", 0, 12, 17 },
354     { Tokenizer::TYPE_END       , ""     , 0, 17, 17 },
355   }},
356 
357   // Test that consecutive symbols are parsed as separate tokens.
358   { "!@+%", {
359     { Tokenizer::TYPE_SYMBOL    , "!"    , 0, 0, 1 },
360     { Tokenizer::TYPE_SYMBOL    , "@"    , 0, 1, 2 },
361     { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 2, 3 },
362     { Tokenizer::TYPE_SYMBOL    , "%"    , 0, 3, 4 },
363     { Tokenizer::TYPE_END       , ""     , 0, 4, 4 },
364   }},
365 
366   // Test that newlines affect line numbers correctly.
367   { "foo bar\nrab oof", {
368     { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0, 3 },
369     { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  4, 7 },
370     { Tokenizer::TYPE_IDENTIFIER, "rab", 1,  0, 3 },
371     { Tokenizer::TYPE_IDENTIFIER, "oof", 1,  4, 7 },
372     { Tokenizer::TYPE_END       , ""   , 1,  7, 7 },
373   }},
374 
375   // Test that tabs affect column numbers correctly.
376   { "foo\tbar  \tbaz", {
377     { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
378     { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  8, 11 },
379     { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16, 19 },
380     { Tokenizer::TYPE_END       , ""   , 0, 19, 19 },
381   }},
382 
383   // Test that tabs in string literals affect column numbers correctly.
384   { "\"foo\tbar\" baz", {
385     { Tokenizer::TYPE_STRING    , "\"foo\tbar\"", 0,  0, 12 },
386     { Tokenizer::TYPE_IDENTIFIER, "baz"         , 0, 13, 16 },
387     { Tokenizer::TYPE_END       , ""            , 0, 16, 16 },
388   }},
389 
390   // Test that line comments are ignored.
391   { "foo // This is a comment\n"
392     "bar // This is another comment", {
393     { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
394     { Tokenizer::TYPE_IDENTIFIER, "bar", 1,  0,  3 },
395     { Tokenizer::TYPE_END       , ""   , 1, 30, 30 },
396   }},
397 
398   // Test that block comments are ignored.
399   { "foo /* This is a block comment */ bar", {
400     { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
401     { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34, 37 },
402     { Tokenizer::TYPE_END       , ""   , 0, 37, 37 },
403   }},
404 
405   // Test that sh-style comments are not ignored by default.
406   { "foo # bar\n"
407     "baz", {
408     { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
409     { Tokenizer::TYPE_SYMBOL    , "#"  , 0, 4, 5 },
410     { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6, 9 },
411     { Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0, 3 },
412     { Tokenizer::TYPE_END       , ""   , 1, 3, 3 },
413   }},
414 
415   // Test all whitespace chars
416   { "foo\n\t\r\v\fbar", {
417     { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
418     { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11, 14 },
419     { Tokenizer::TYPE_END       , ""   , 1, 14, 14 },
420   }},
421 };
422 
TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Build a tokenizer over the input, chunked by the current block size.
  TestInputStream input(kMultiTokenCases_case.input.data(),
                        kMultiTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Until Next() has been called the current token is an empty TYPE_START
  // located at the origin.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Consume the expected tokens one by one, up to and including the
  // TYPE_END entry.
  for (int position = 0;; ++position) {
    const Tokenizer::Token expected = kMultiTokenCases_case.output[position];
    const bool expect_end = (expected.type == Tokenizer::TYPE_END);

    // Tokens are numbered from 1 in the trace output.
    SCOPED_TRACE(testing::Message()
                 << "Token #" << (position + 1) << ": " << expected.text);

    // Snapshot the current token; after Next() it must be reported as the
    // previous one.
    const Tokenizer::Token before = tokenizer.current();

    // Next() may only return false when the end token is reached.
    if (expect_end) {
      ASSERT_FALSE(tokenizer.Next());
    } else {
      ASSERT_TRUE(tokenizer.Next());
    }

    // previous() must equal what current() was before advancing.
    EXPECT_EQ(before.type, tokenizer.previous().type);
    EXPECT_EQ(before.text, tokenizer.previous().text);
    EXPECT_EQ(before.line, tokenizer.previous().line);
    EXPECT_EQ(before.column, tokenizer.previous().column);
    EXPECT_EQ(before.end_column, tokenizer.previous().end_column);

    // current() must equal the expected table entry.
    EXPECT_EQ(expected.type, tokenizer.current().type);
    EXPECT_EQ(expected.text, tokenizer.current().text);
    EXPECT_EQ(expected.line, tokenizer.current().line);
    EXPECT_EQ(expected.column, tokenizer.current().column);
    EXPECT_EQ(expected.end_column, tokenizer.current().end_column);

    if (expect_end) {
      break;
    }
  }

  // No errors may have been reported along the way.
  EXPECT_TRUE(error_collector.text_.empty());
}
474 
475 // This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
476 //   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
477 #if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
478 
TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Exercises the "comment_style" option: with SH_COMMENT_STYLE selected,
  // '#' starts a comment while "//" and "/* */" are tokenized as symbols.
  const char* text =
      "foo # bar\n"
      "baz // qux\n"
      "corge /* grault */\n"
      "garply";
  const char* const kTokens[] = {
      "foo",  // "# bar" is ignored
      "baz", "/", "/", "qux",
      "corge", "/", "*", "grault", "*", "/",
      "garply"};
  const int token_count = GOOGLE_ARRAYSIZE(kTokens);

  // Build the tokenizer and select the comment style under test.
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);

  // Walk the tokens and compare each one's text with the expected list.
  int index = 0;
  while (index < token_count) {
    EXPECT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, kTokens[index]);
    ++index;
  }

  // The input must now be exhausted ...
  EXPECT_FALSE(tokenizer.Next());
  // ... and no errors may have been reported.
  EXPECT_TRUE(error_collector.text_.empty());
}
508 
509 #endif
510 
511 // -------------------------------------------------------------------
512 
513 // In each case, the input is expected to have two tokens named "prev" and
514 // "next" with comments in between.
// One row of the DocComments test table.  Rows are written as aggregate
// initializers, so the member order below must not change.
struct DocCommentCase {
  string input;  // Complete text handed to the tokenizer.

  // Expected value of the prev_trailing_comments output of
  // NextWithComments().
  const char* prev_trailing_comments;
  // Expected detached comments, in order.  Slots left out of the initializer
  // are NULL, and the test treats the first NULL as the end of the list.
  const char* detached_comments[10];
  // Expected value of the next_leading_comments output of
  // NextWithComments().
  const char* next_leading_comments;
};
522 
operator <<(ostream & out,const DocCommentCase & test_case)523 inline ostream& operator<<(ostream& out,
524                            const DocCommentCase& test_case) {
525   return out << CEscape(test_case.input);
526 }
527 
528 DocCommentCase kDocCommentCases[] = {
529   {
530     "prev next",
531 
532     "",
533     {},
534     ""
535       },
536 
537         {
538       "prev /* ignored */ next",
539 
540       "",
541       {},
542       ""
543         },
544 
545           {
546         "prev // trailing comment\n"
547             "next",
548 
549             " trailing comment\n",
550             {},
551             ""
552           },
553 
554             {
555           "prev\n"
556               "// leading comment\n"
557               "// line 2\n"
558               "next",
559 
560               "",
561               {},
562               " leading comment\n"
563               " line 2\n"
564             },
565 
566               {
567             "prev\n"
568                 "// trailing comment\n"
569                 "// line 2\n"
570                 "\n"
571                 "next",
572 
573                 " trailing comment\n"
574                 " line 2\n",
575                 {},
576                 ""
577               },
578 
579                 {
580               "prev // trailing comment\n"
581                   "// leading comment\n"
582                   "// line 2\n"
583                   "next",
584 
585                   " trailing comment\n",
586                   {},
587                   " leading comment\n"
588                   " line 2\n"
589                 },
590 
591                   {
592                 "prev /* trailing block comment */\n"
593                     "/* leading block comment\n"
594                     " * line 2\n"
595                     " * line 3 */"
596                     "next",
597 
598                     " trailing block comment ",
599                     {},
600                     " leading block comment\n"
601                     " line 2\n"
602                     " line 3 "
603                   },
604 
605                     {
606                   "prev\n"
607                       "/* trailing block comment\n"
608                       " * line 2\n"
609                       " * line 3\n"
610                       " */\n"
611                       "/* leading block comment\n"
612                       " * line 2\n"
613                       " * line 3 */"
614                       "next",
615 
616                       " trailing block comment\n"
617                       " line 2\n"
618                       " line 3\n",
619                       {},
620                       " leading block comment\n"
621                       " line 2\n"
622                       " line 3 "
623                     },
624 
625                       {
626                     "prev\n"
627                         "// trailing comment\n"
628                         "\n"
629                         "// detached comment\n"
630                         "// line 2\n"
631                         "\n"
632                         "// second detached comment\n"
633                         "/* third detached comment\n"
634                         " * line 2 */\n"
635                         "// leading comment\n"
636                         "next",
637 
638                         " trailing comment\n",
639                         {
640                       " detached comment\n"
641                           " line 2\n",
642                           " second detached comment\n",
643                           " third detached comment\n"
644                           " line 2 "
645                         },
646                           " leading comment\n"
647                         },
648 
649                           {
650                         "prev /**/\n"
651                             "\n"
652                             "// detached comment\n"
653                             "\n"
654                             "// leading comment\n"
655                             "next",
656 
657                             "",
658                             {
659                           " detached comment\n"
660                             },
661                               " leading comment\n"
662                             },
663 
664                               {
665                             "prev /**/\n"
666                                 "// leading comment\n"
667                                 "next",
668 
669                                 "",
670                                 {},
671                                 " leading comment\n"
672                               },
673                               };
674 
TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kDocCommentCases_case.input.data(),
                        kDocCommentCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Set up a second tokenizer where we'll pass all NULLs to NextWithComments().
  TestInputStream input2(kDocCommentCases_case.input.data(),
                         kDocCommentCases_case.input.size(),
                         kBlockSizes_case);
  Tokenizer tokenizer2(&input2, &error_collector);

  // Every case starts with the token "prev"; stop right away if the
  // tokenizers cannot even produce a first token.
  ASSERT_TRUE(tokenizer.Next());
  ASSERT_TRUE(tokenizer2.Next());

  EXPECT_EQ("prev", tokenizer.current().text);
  EXPECT_EQ("prev", tokenizer2.current().text);

  // Advance to "next".  The first tokenizer collects the comments found on
  // the way; the second one is told to discard them.
  string prev_trailing_comments;
  vector<string> detached_comments;
  string next_leading_comments;
  tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
                             &next_leading_comments);
  tokenizer2.NextWithComments(NULL, NULL, NULL);
  EXPECT_EQ("next", tokenizer.current().text);
  EXPECT_EQ("next", tokenizer2.current().text);

  EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
            prev_trailing_comments);

  // The expected list is a fixed-size array terminated by a NULL entry, so
  // the tokenizer must have produced strictly fewer comments than the array
  // has slots.  Checking this first keeps every index used below, including
  // the terminator lookup, inside the array.
  const size_t kMaxDetachedComments =
      GOOGLE_ARRAYSIZE(kDocCommentCases_case.detached_comments);
  ASSERT_LT(detached_comments.size(), kMaxDetachedComments);

  for (size_t i = 0; i < detached_comments.size(); i++) {
    // Running into the NULL terminator here means the tokenizer produced
    // more detached comments than the case expects.
    ASSERT_TRUE(kDocCommentCases_case.detached_comments[i] != NULL);
    EXPECT_EQ(kDocCommentCases_case.detached_comments[i],
              detached_comments[i]);
  }

  // Verify that we matched all the detached comments.
  EXPECT_TRUE(
      kDocCommentCases_case.detached_comments[detached_comments.size()] ==
      NULL);

  EXPECT_EQ(kDocCommentCases_case.next_leading_comments,
            next_leading_comments);
}
721 
722 // -------------------------------------------------------------------
723 
724 // Test parse helpers.  It's not really worth setting up a full data-driven
725 // test here.
TEST_F(TokenizerTest, ParseInteger) {
  // Receives the output of the direct Tokenizer::ParseInteger() calls below;
  // only their return value is checked.
  uint64 scratch;

  // Well-formed decimal, hexadecimal and octal input.
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  // Invalid integers that will never be tokenized as integers.
  const char* const kNeverIntegers[] = {"zxy", "1.2", "08", "0xg", "-1"};
  for (int k = 0; k < GOOGLE_ARRAYSIZE(kNeverIntegers); k++) {
    EXPECT_FALSE(
        Tokenizer::ParseInteger(kNeverIntegers[k], kuint64max, &scratch));
  }

  // Overflow: a value is accepted only if it does not exceed the given
  // maximum.
  EXPECT_TRUE(Tokenizer::ParseInteger("0", 0, &scratch));
  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &scratch));
  EXPECT_TRUE(Tokenizer::ParseInteger("1", 1, &scratch));
  EXPECT_TRUE(Tokenizer::ParseInteger("12345", 12345, &scratch));
  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &scratch));
  EXPECT_TRUE(
      Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF", kuint64max, &scratch));
  EXPECT_FALSE(
      Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &scratch));
}
756 
// Checks Tokenizer::ParseFloat() on well-formed floats, on malformed text
// that can still be tokenized as a float, on the 'f' suffix, on out-of-range
// exponents and (where death tests are available) on text that could never
// be a float token.
TEST_F(TokenizerTest, ParseFloat) {
  // Well-formed floats: optional integer part, optional fraction, optional
  // exponent in either case and with either sign.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1."));
  EXPECT_DOUBLE_EQ(1e3, Tokenizer::ParseFloat("1e3"));
  EXPECT_DOUBLE_EQ(1e3, Tokenizer::ParseFloat("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
  EXPECT_DOUBLE_EQ(.1, Tokenizer::ParseFloat(".1"));
  EXPECT_DOUBLE_EQ(.25, Tokenizer::ParseFloat(".25"));
  EXPECT_DOUBLE_EQ(.1e3, Tokenizer::ParseFloat(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
  EXPECT_DOUBLE_EQ(5, Tokenizer::ParseFloat("5"));
  EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
  EXPECT_DOUBLE_EQ(1.2, Tokenizer::ParseFloat("1.2"));
  EXPECT_DOUBLE_EQ(1.e2, Tokenizer::ParseFloat("1.e2"));

  // Invalid floats that may still be tokenized as floats: a dangling
  // exponent marker is expected to be ignored.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));

  // A trailing 'f' or 'F' suffix does not change the parsed value.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));

  // Out-of-range exponents should still parse successfully: underflow
  // becomes zero and overflow becomes infinity.
  EXPECT_EQ(0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL,
            Tokenizer::ParseFloat("1e+9999999999999999999999999999"));

#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Invalid floats that will never be tokenized as floats: in debug builds
  // these are expected to die with a diagnostic.
  EXPECT_DEBUG_DEATH(
      Tokenizer::ParseFloat("zxy"),
      "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(
      Tokenizer::ParseFloat("1-e0"),
      "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(
      Tokenizer::ParseFloat("-1.0"),
      "passed text that could not have been tokenized as a float");
#endif  // PROTOBUF_HAS_DEATH_TEST
}
798 
// Checks Tokenizer::ParseString() on ordinary escapes, on malformed text
// that can still be tokenized as a string, on Unicode escapes and (where
// death tests are available) on text that could never be a string token.
TEST_F(TokenizerTest, ParseString) {
  string output;

  // Quoted text with simple, octal and hex escapes.
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
  EXPECT_EQ("blah\nblah2", output);
  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
  Tokenizer::ParseString("'\\x20\\x4'", &output);
  EXPECT_EQ("\x20\x4", output);

  // Test invalid strings that may still be tokenized as strings.
  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
  EXPECT_EQ("\a?\v\t", output);
  Tokenizer::ParseString("'", &output);
  EXPECT_EQ("", output);
  Tokenizer::ParseString("'\\", &output);
  EXPECT_EQ("\\", output);

  // Experiment with Unicode escapes.  Here are one-, two-, three- and
  // four-byte Unicode characters: U+0024, U+00A2, U+20AC and U+24B62.  The
  // expected UTF-8 bytes are spelled as hex escapes so the assertion does not
  // depend on how this source file is encoded.
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\U00024b62XX'", &output);
  EXPECT_EQ("$\xc2\xa2\xe2\x82\xac\xf0\xa4\xad\xa2XX", output);
  // Same thing, with U+24B62 written as the UTF-16 surrogate pair
  // D852 DF62.
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", &output);
  EXPECT_EQ("$\xc2\xa2\xe2\x82\xac\xf0\xa4\xad\xa2XX", output);
  // Here's some broken UTF16; there's a head surrogate with no tail surrogate.
  // We just output this as if it were UTF8; it's not a defined code point, but
  // it has a defined encoding.
  Tokenizer::ParseString("'\\ud852XX'", &output);
  EXPECT_EQ("\xed\xa1\x92XX", output);
  // Malformed escape: Demons may fly out of the nose.
  Tokenizer::ParseString("\\u0", &output);
  EXPECT_EQ("u0", output);

  // Test invalid strings that will never be tokenized as strings.
#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(
      Tokenizer::ParseString("", &output),
      "passed text that could not have been tokenized as a string");
#endif  // PROTOBUF_HAS_DEATH_TEST
}
840 
// Checks that ParseStringAppend() keeps the existing contents of its output
// string while ParseString() replaces them.
TEST_F(TokenizerTest, ParseStringAppend) {
  string result = "stuff+";

  // Appending keeps the prefix already in the string.
  Tokenizer::ParseStringAppend("'hello'", &result);
  EXPECT_EQ("stuff+hello", result);

  // Plain parsing discards whatever was there before.
  Tokenizer::ParseString("'hello'", &result);
  EXPECT_EQ("hello", result);
}
849 
850 // -------------------------------------------------------------------
851 
852 // Each case parses some input text, ignoring the tokens produced, and
853 // checks that the error output matches what is expected.
854 struct ErrorCase {
855   string input;
856   bool recoverable;  // True if the tokenizer should be able to recover and
857                      // parse more tokens after seeing this error.  Cases
858                      // for which this is true must end with "foo" as
859                      // the last token, which the test will check for.
860   const char* errors;
861 };
862 
operator <<(ostream & out,const ErrorCase & test_case)863 inline ostream& operator<<(ostream& out,
864                            const ErrorCase& test_case) {
865   return out << CEscape(test_case.input);
866 }
867 
868 ErrorCase kErrorCases[] = {
869   // String errors.
870   { "'\\l' foo", true,
871     "0:2: Invalid escape sequence in string literal.\n" },
872   { "'\\X' foo", true,
873     "0:2: Invalid escape sequence in string literal.\n" },
874   { "'\\x' foo", true,
875     "0:3: Expected hex digits for escape sequence.\n" },
876   { "'foo", false,
877     "0:4: Unexpected end of string.\n" },
878   { "'bar\nfoo", true,
879     "0:4: String literals cannot cross line boundaries.\n" },
880   { "'\\u01' foo", true,
881     "0:5: Expected four hex digits for \\u escape sequence.\n" },
882   { "'\\u01' foo", true,
883     "0:5: Expected four hex digits for \\u escape sequence.\n" },
884   { "'\\uXYZ' foo", true,
885     "0:3: Expected four hex digits for \\u escape sequence.\n" },
886 
887   // Integer errors.
888   { "123foo", true,
889     "0:3: Need space between number and identifier.\n" },
890 
891   // Hex/octal errors.
892   { "0x foo", true,
893     "0:2: \"0x\" must be followed by hex digits.\n" },
894   { "0541823 foo", true,
895     "0:4: Numbers starting with leading zero must be in octal.\n" },
896   { "0x123z foo", true,
897     "0:5: Need space between number and identifier.\n" },
898   { "0x123.4 foo", true,
899     "0:5: Hex and octal numbers must be integers.\n" },
900   { "0123.4 foo", true,
901     "0:4: Hex and octal numbers must be integers.\n" },
902 
903   // Float errors.
904   { "1e foo", true,
905     "0:2: \"e\" must be followed by exponent.\n" },
906   { "1e- foo", true,
907     "0:3: \"e\" must be followed by exponent.\n" },
908   { "1.2.3 foo", true,
909     "0:3: Already saw decimal point or exponent; can't have another one.\n" },
910   { "1e2.3 foo", true,
911     "0:3: Already saw decimal point or exponent; can't have another one.\n" },
912   { "a.1 foo", true,
913     "0:1: Need space between identifier and decimal point.\n" },
914   // allow_f_after_float not enabled, so this should be an error.
915   { "1.0f foo", true,
916     "0:3: Need space between number and identifier.\n" },
917 
918   // Block comment errors.
919   { "/*", false,
920     "0:2: End-of-file inside block comment.\n"
921     "0:0:   Comment started here.\n"},
922   { "/*/*/ foo", true,
923     "0:3: \"/*\" inside block comment.  Block comments cannot be nested.\n"},
924 
925   // Control characters.  Multiple consecutive control characters should only
926   // produce one error.
927   { "\b foo", true,
928     "0:0: Invalid control characters encountered in text.\n" },
929   { "\b\b foo", true,
930     "0:0: Invalid control characters encountered in text.\n" },
931 
932   // Check that control characters at end of input don't result in an
933   // infinite loop.
934   { "\b", false,
935     "0:0: Invalid control characters encountered in text.\n" },
936 
937   // Check recovery from '\0'.  We have to explicitly specify the length of
938   // these strings because otherwise the string constructor will just call
939   // strlen() which will see the first '\0' and think that is the end of the
940   // string.
941   { string("\0foo", 4), true,
942     "0:0: Invalid control characters encountered in text.\n" },
943   { string("\0\0foo", 5), true,
944     "0:0: Invalid control characters encountered in text.\n" },
945 
946   // Check error from high order bits set
947   { "\300foo", true,
948     "0:0: Interpreting non ascii codepoint 192.\n" },
949 };
950 
// Runs every entry of kErrorCases at every block size in kBlockSizes and
// checks the reported errors and, for recoverable cases, that tokenizing
// continued through to a final "foo" token.
TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Feed the case's input to a tokenizer in blocks of the current size.
  const string& text = kErrorCases_case.input;
  TestInputStream input(text.data(), text.size(), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Drain the token stream; the only thing worth remembering is whether the
  // most recent token was "foo".
  bool ended_with_foo = false;
  while (tokenizer.Next()) {
    ended_with_foo = (tokenizer.current().text == "foo");
  }

  // The collected error text must match the expectation exactly.
  EXPECT_EQ(kErrorCases_case.errors, error_collector.text_);

  // A recoverable error must not stop the tokenizer from reaching "foo".
  if (kErrorCases_case.recoverable) {
    EXPECT_TRUE(ended_with_foo);
  }
}
973 
974 // -------------------------------------------------------------------
975 
// Checks, for every block size in kBlockSizes, that after a tokenizer has
// read one token and been destroyed the input stream reports only that
// token's bytes as consumed.
TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  string source_text = "foo bar";
  TestInputStream input(source_text.data(), source_text.size(),
                        kBlockSizes_case);

  // The tokenizer lives only inside this scope: it reads a single token and
  // is destroyed at the closing brace, before the byte count is checked.
  {
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);

    tokenizer.Next();
  }

  // Only the first token, "foo", should count as read.
  EXPECT_EQ(strlen("foo"), input.ByteCount());
}
991 
992 
993 }  // namespace
994 }  // namespace io
995 }  // namespace protobuf
996 }  // namespace google
997