1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc.  All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 //     * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 //     * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 //     * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 
31 #include <google/protobuf/util/internal/json_stream_parser.h>
32 
33 #include <algorithm>
34 #include <cctype>
35 #include <cerrno>
36 #include <cstdlib>
37 #include <cstring>
38 #include <memory>
39 #ifndef _SHARED_PTR_H
40 #include <google/protobuf/stubs/shared_ptr.h>
41 #endif
42 
43 #include <google/protobuf/stubs/logging.h>
44 #include <google/protobuf/stubs/common.h>
45 #include <google/protobuf/util/internal/object_writer.h>
46 #include <google/protobuf/util/internal/json_escaping.h>
47 #include <google/protobuf/stubs/strutil.h>
48 
49 namespace google {
50 namespace protobuf {
51 namespace util {
52 
53 // Allow these symbols to be referenced as util::Status, util::error::* in
54 // this file.
55 using util::Status;
56 namespace error {
57 using util::error::INTERNAL;
58 using util::error::INVALID_ARGUMENT;
59 }  // namespace error
60 
61 namespace converter {
62 
// Number of characters in an escaped UTF-16 code unit ('\\' 'u' X X X X).
static const int kUnicodeEscapedLength = 6;

// Lengths of the true, false, and null literals. sizeof on a string literal
// includes the terminating NUL, hence the "- 1". Using sizeof instead of
// strlen() makes these compile-time constants, so they do not depend on
// dynamic initialization at program start-up.
static const int true_len = sizeof("true") - 1;
static const int false_len = sizeof("false") - 1;
static const int null_len = sizeof("null") - 1;
70 
// Returns true if c may start a bare (unquoted) key: an ASCII letter, '_' or
// '$'.
inline bool IsLetter(char c) {
  if (c == '_' || c == '$') {
    return true;
  }
  const bool is_lower = c >= 'a' && c <= 'z';
  const bool is_upper = c >= 'A' && c <= 'Z';
  return is_lower || is_upper;
}
75 
IsAlphanumeric(char c)76 inline bool IsAlphanumeric(char c) {
77   return IsLetter(c) || ('0' <= c && c <= '9');
78 }
79 
ConsumeKey(StringPiece * input,StringPiece * key)80 static bool ConsumeKey(StringPiece* input, StringPiece* key) {
81   if (input->empty() || !IsLetter((*input)[0])) return false;
82   int len = 1;
83   for (; len < input->size(); ++len) {
84     if (!IsAlphanumeric((*input)[len])) {
85       break;
86     }
87   }
88   *key = StringPiece(input->data(), len);
89   *input = StringPiece(input->data() + len, input->size() - len);
90   return true;
91 }
92 
MatchKey(StringPiece input)93 static bool MatchKey(StringPiece input) {
94   return !input.empty() && IsLetter(input[0]);
95 }
96 
JsonStreamParser(ObjectWriter * ow)97 JsonStreamParser::JsonStreamParser(ObjectWriter* ow)
98     : ow_(ow),
99       stack_(),
100       leftover_(),
101       json_(),
102       p_(),
103       key_(),
104       key_storage_(),
105       finishing_(false),
106       parsed_(),
107       parsed_storage_(),
108       string_open_(0),
109       chunk_storage_(),
110       coerce_to_utf8_(false) {
111   // Initialize the stack with a single value to be parsed.
112   stack_.push(VALUE);
113 }
114 
~JsonStreamParser()115 JsonStreamParser::~JsonStreamParser() {}
116 
117 
Parse(StringPiece json)118 util::Status JsonStreamParser::Parse(StringPiece json) {
119   StringPiece chunk = json;
120   // If we have leftovers from a previous chunk, append the new chunk to it
121   // and create a new StringPiece pointing at the string's data. This could
122   // be large but we rely on the chunks to be small, assuming they are
123   // fragments of a Cord.
124   if (!leftover_.empty()) {
125     // Don't point chunk to leftover_ because leftover_ will be updated in
126     // ParseChunk(chunk).
127     chunk_storage_.swap(leftover_);
128     json.AppendToString(&chunk_storage_);
129     chunk = StringPiece(chunk_storage_);
130   }
131 
132   // Find the structurally valid UTF8 prefix and parse only that.
133   int n = internal::UTF8SpnStructurallyValid(chunk);
134   if (n > 0) {
135     util::Status status = ParseChunk(chunk.substr(0, n));
136 
137     // Any leftover characters are stashed in leftover_ for later parsing when
138     // there is more data available.
139     chunk.substr(n).AppendToString(&leftover_);
140     return status;
141   } else {
142     chunk.CopyToString(&leftover_);
143     return util::Status::OK;
144   }
145 }
146 
FinishParse()147 util::Status JsonStreamParser::FinishParse() {
148   // If we do not expect anything and there is nothing left to parse we're all
149   // done.
150   if (stack_.empty() && leftover_.empty()) {
151     return util::Status::OK;
152   }
153 
154   // Storage for UTF8-coerced string.
155   google::protobuf::scoped_array<char> utf8;
156   if (coerce_to_utf8_) {
157     utf8.reset(new char[leftover_.size()]);
158     char* coerced = internal::UTF8CoerceToStructurallyValid(leftover_, utf8.get(), ' ');
159     p_ = json_ = StringPiece(coerced, leftover_.size());
160   } else {
161     p_ = json_ = leftover_;
162     if (!internal::IsStructurallyValidUTF8(leftover_)) {
163       return ReportFailure("Encountered non UTF-8 code points.");
164     }
165   }
166 
167   // Parse the remainder in finishing mode, which reports errors for things like
168   // unterminated strings or unknown tokens that would normally be retried.
169   finishing_ = true;
170   util::Status result = RunParser();
171   if (result.ok()) {
172     SkipWhitespace();
173     if (!p_.empty()) {
174       result = ReportFailure("Parsing terminated before end of input.");
175     }
176   }
177   return result;
178 }
179 
ParseChunk(StringPiece chunk)180 util::Status JsonStreamParser::ParseChunk(StringPiece chunk) {
181   // Do not do any work if the chunk is empty.
182   if (chunk.empty()) return util::Status::OK;
183 
184   p_ = json_ = chunk;
185 
186   finishing_ = false;
187   util::Status result = RunParser();
188   if (!result.ok()) return result;
189 
190   SkipWhitespace();
191   if (p_.empty()) {
192     // If we parsed everything we had, clear the leftover.
193     leftover_.clear();
194   } else {
195     // If we do not expect anything i.e. stack is empty, and we have non-empty
196     // string left to parse, we report an error.
197     if (stack_.empty()) {
198       return ReportFailure("Parsing terminated before end of input.");
199     }
200     // If we expect future data i.e. stack is non-empty, and we have some
201     // unparsed data left, we save it for later parse.
202     leftover_ = p_.ToString();
203   }
204   return util::Status::OK;
205 }
206 
RunParser()207 util::Status JsonStreamParser::RunParser() {
208   while (!stack_.empty()) {
209     ParseType type = stack_.top();
210     TokenType t = (string_open_ == 0) ? GetNextTokenType() : BEGIN_STRING;
211     stack_.pop();
212     util::Status result;
213     switch (type) {
214       case VALUE:
215         result = ParseValue(t);
216         break;
217 
218       case OBJ_MID:
219         result = ParseObjectMid(t);
220         break;
221 
222       case ENTRY:
223         result = ParseEntry(t);
224         break;
225 
226       case ENTRY_MID:
227         result = ParseEntryMid(t);
228         break;
229 
230       case ARRAY_VALUE:
231         result = ParseArrayValue(t);
232         break;
233 
234       case ARRAY_MID:
235         result = ParseArrayMid(t);
236         break;
237 
238       default:
239         result = util::Status(util::error::INTERNAL,
240                               StrCat("Unknown parse type: ", type));
241         break;
242     }
243     if (!result.ok()) {
244       // If we were cancelled, save our state and try again later.
245       if (!finishing_ && result == util::Status::CANCELLED) {
246         stack_.push(type);
247         // If we have a key we still need to render, make sure to save off the
248         // contents in our own storage.
249         if (!key_.empty() && key_storage_.empty()) {
250           key_.AppendToString(&key_storage_);
251           key_ = StringPiece(key_storage_);
252         }
253         result = util::Status::OK;
254       }
255       return result;
256     }
257   }
258   return util::Status::OK;
259 }
260 
ParseValue(TokenType type)261 util::Status JsonStreamParser::ParseValue(TokenType type) {
262   switch (type) {
263     case BEGIN_OBJECT:
264       return HandleBeginObject();
265     case BEGIN_ARRAY:
266       return HandleBeginArray();
267     case BEGIN_STRING:
268       return ParseString();
269     case BEGIN_NUMBER:
270       return ParseNumber();
271     case BEGIN_TRUE:
272       return ParseTrue();
273     case BEGIN_FALSE:
274       return ParseFalse();
275     case BEGIN_NULL:
276       return ParseNull();
277     case UNKNOWN:
278       return ReportUnknown("Expected a value.");
279     default: {
280       // Special case for having been cut off while parsing, wait for more data.
281       // This handles things like 'fals' being at the end of the string, we
282       // don't know if the next char would be e, completing it, or something
283       // else, making it invalid.
284       if (!finishing_ && p_.length() < false_len) {
285         return util::Status::CANCELLED;
286       }
287       return ReportFailure("Unexpected token.");
288     }
289   }
290 }
291 
ParseString()292 util::Status JsonStreamParser::ParseString() {
293   util::Status result = ParseStringHelper();
294   if (result.ok()) {
295     ow_->RenderString(key_, parsed_);
296     key_.clear();
297     parsed_.clear();
298     parsed_storage_.clear();
299   }
300   return result;
301 }
302 
ParseStringHelper()303 util::Status JsonStreamParser::ParseStringHelper() {
304   // If we haven't seen the start quote, grab it and remember it for later.
305   if (string_open_ == 0) {
306     string_open_ = *p_.data();
307     GOOGLE_DCHECK(string_open_ == '\"' || string_open_ == '\'');
308     Advance();
309   }
310   // Track where we last copied data from so we can minimize copying.
311   const char* last = p_.data();
312   while (!p_.empty()) {
313     const char* data = p_.data();
314     if (*data == '\\') {
315       // We're about to handle an escape, copy all bytes from last to data.
316       if (last < data) {
317         parsed_storage_.append(last, data - last);
318         last = data;
319       }
320       // If we ran out of string after the \, cancel or report an error
321       // depending on if we expect more data later.
322       if (p_.length() == 1) {
323         if (!finishing_) {
324           return util::Status::CANCELLED;
325         }
326         return ReportFailure("Closing quote expected in string.");
327       }
328       // Parse a unicode escape if we found \u in the string.
329       if (data[1] == 'u') {
330         util::Status result = ParseUnicodeEscape();
331         if (!result.ok()) {
332           return result;
333         }
334         // Move last pointer past the unicode escape and continue.
335         last = p_.data();
336         continue;
337       }
338       // Handle the standard set of backslash-escaped characters.
339       switch (data[1]) {
340         case 'b':
341           parsed_storage_.push_back('\b');
342           break;
343         case 'f':
344           parsed_storage_.push_back('\f');
345           break;
346         case 'n':
347           parsed_storage_.push_back('\n');
348           break;
349         case 'r':
350           parsed_storage_.push_back('\r');
351           break;
352         case 't':
353           parsed_storage_.push_back('\t');
354           break;
355         case 'v':
356           parsed_storage_.push_back('\v');
357           break;
358         default:
359           parsed_storage_.push_back(data[1]);
360       }
361       // We handled two characters, so advance past them and continue.
362       p_.remove_prefix(2);
363       last = p_.data();
364       continue;
365     }
366     // If we found the closing quote note it, advance past it, and return.
367     if (*data == string_open_) {
368       // If we didn't copy anything, reuse the input buffer.
369       if (parsed_storage_.empty()) {
370         parsed_ = StringPiece(last, data - last);
371       } else {
372         if (last < data) {
373           parsed_storage_.append(last, data - last);
374           last = data;
375         }
376         parsed_ = StringPiece(parsed_storage_);
377       }
378       // Clear the quote char so next time we try to parse a string we'll
379       // start fresh.
380       string_open_ = 0;
381       Advance();
382       return util::Status::OK;
383     }
384     // Normal character, just advance past it.
385     Advance();
386   }
387   // If we ran out of characters, copy over what we have so far.
388   if (last < p_.data()) {
389     parsed_storage_.append(last, p_.data() - last);
390   }
391   // If we didn't find the closing quote but we expect more data, cancel for now
392   if (!finishing_) {
393     return util::Status::CANCELLED;
394   }
395   // End of string reached without a closing quote, report an error.
396   string_open_ = 0;
397   return ReportFailure("Closing quote expected in string.");
398 }
399 
400 // Converts a unicode escaped character to a decimal value stored in a char32
401 // for use in UTF8 encoding utility.  We assume that str begins with \uhhhh and
402 // convert that from the hex number to a decimal value.
403 //
404 // There are some security exploits with UTF-8 that we should be careful of:
405 //   - http://www.unicode.org/reports/tr36/#UTF-8_Exploit
406 //   - http://sites/intl-eng/design-guide/core-application
ParseUnicodeEscape()407 util::Status JsonStreamParser::ParseUnicodeEscape() {
408   if (p_.length() < kUnicodeEscapedLength) {
409     if (!finishing_) {
410       return util::Status::CANCELLED;
411     }
412     return ReportFailure("Illegal hex string.");
413   }
414   GOOGLE_DCHECK_EQ('\\', p_.data()[0]);
415   GOOGLE_DCHECK_EQ('u', p_.data()[1]);
416   uint32 code = 0;
417   for (int i = 2; i < kUnicodeEscapedLength; ++i) {
418     if (!isxdigit(p_.data()[i])) {
419       return ReportFailure("Invalid escape sequence.");
420     }
421     code = (code << 4) + hex_digit_to_int(p_.data()[i]);
422   }
423   if (code >= JsonEscaping::kMinHighSurrogate &&
424       code <= JsonEscaping::kMaxHighSurrogate) {
425     if (p_.length() < 2 * kUnicodeEscapedLength) {
426       if (!finishing_) {
427         return util::Status::CANCELLED;
428       }
429       if (!coerce_to_utf8_) {
430         return ReportFailure("Missing low surrogate.");
431       }
432     } else if (p_.data()[kUnicodeEscapedLength] == '\\' &&
433                p_.data()[kUnicodeEscapedLength + 1] == 'u') {
434       uint32 low_code = 0;
435       for (int i = kUnicodeEscapedLength + 2; i < 2 * kUnicodeEscapedLength;
436            ++i) {
437         if (!isxdigit(p_.data()[i])) {
438           return ReportFailure("Invalid escape sequence.");
439         }
440         low_code = (low_code << 4) + hex_digit_to_int(p_.data()[i]);
441       }
442       if (low_code >= JsonEscaping::kMinLowSurrogate &&
443           low_code <= JsonEscaping::kMaxLowSurrogate) {
444         // Convert UTF-16 surrogate pair to 21-bit Unicode codepoint.
445         code = (((code & 0x3FF) << 10) | (low_code & 0x3FF)) +
446                JsonEscaping::kMinSupplementaryCodePoint;
447         // Advance past the first code unit escape.
448         p_.remove_prefix(kUnicodeEscapedLength);
449       } else if (!coerce_to_utf8_) {
450         return ReportFailure("Invalid low surrogate.");
451       }
452     } else if (!coerce_to_utf8_) {
453       return ReportFailure("Missing low surrogate.");
454     }
455   }
456   if (!coerce_to_utf8_ && !IsValidCodePoint(code)) {
457     return ReportFailure("Invalid unicode code point.");
458   }
459   char buf[UTFmax];
460   int len = EncodeAsUTF8Char(code, buf);
461   // Advance past the [final] code unit escape.
462   p_.remove_prefix(kUnicodeEscapedLength);
463   parsed_storage_.append(buf, len);
464   return util::Status::OK;
465 }
466 
ParseNumber()467 util::Status JsonStreamParser::ParseNumber() {
468   NumberResult number;
469   util::Status result = ParseNumberHelper(&number);
470   if (result.ok()) {
471     switch (number.type) {
472       case NumberResult::DOUBLE:
473         ow_->RenderDouble(key_, number.double_val);
474         key_.clear();
475         break;
476 
477       case NumberResult::INT:
478         ow_->RenderInt64(key_, number.int_val);
479         key_.clear();
480         break;
481 
482       case NumberResult::UINT:
483         ow_->RenderUint64(key_, number.uint_val);
484         key_.clear();
485         break;
486 
487       default:
488         return ReportFailure("Unable to parse number.");
489     }
490   }
491   return result;
492 }
493 
ParseNumberHelper(NumberResult * result)494 util::Status JsonStreamParser::ParseNumberHelper(NumberResult* result) {
495   const char* data = p_.data();
496   int length = p_.length();
497 
498   // Look for the first non-numeric character, or the end of the string.
499   int index = 0;
500   bool floating = false;
501   bool negative = data[index] == '-';
502   // Find the first character that cannot be part of the number. Along the way
503   // detect if the number needs to be parsed as a double.
504   // Note that this restricts numbers to the JSON specification, so for example
505   // we do not support hex or octal notations.
506   for (; index < length; ++index) {
507     char c = data[index];
508     if (isdigit(c)) continue;
509     if (c == '.' || c == 'e' || c == 'E') {
510       floating = true;
511       continue;
512     }
513     if (c == '+' || c == '-' || c == 'x') continue;
514     // Not a valid number character, break out.
515     break;
516   }
517 
518   // If the entire input is a valid number, and we may have more content in the
519   // future, we abort for now and resume when we know more.
520   if (index == length && !finishing_) {
521     return util::Status::CANCELLED;
522   }
523 
524   // Create a string containing just the number, so we can use safe_strtoX
525   string number = p_.substr(0, index).ToString();
526 
527   // Floating point number, parse as a double.
528   if (floating) {
529     if (!safe_strtod(number, &result->double_val)) {
530       return ReportFailure("Unable to parse number.");
531     }
532     result->type = NumberResult::DOUBLE;
533     p_.remove_prefix(index);
534     return util::Status::OK;
535   }
536 
537   // Positive non-floating point number, parse as a uint64.
538   if (!negative) {
539     // Octal/Hex numbers are not valid JSON values.
540     if (number.length() >= 2 && number[0] == '0') {
541       return ReportFailure("Octal/hex numbers are not valid JSON values.");
542     }
543     if (!safe_strtou64(number, &result->uint_val)) {
544       return ReportFailure("Unable to parse number.");
545     }
546     result->type = NumberResult::UINT;
547     p_.remove_prefix(index);
548     return util::Status::OK;
549   }
550 
551   // Octal/Hex numbers are not valid JSON values.
552   if (number.length() >= 3 && number[1] == '0') {
553     return ReportFailure("Octal/hex numbers are not valid JSON values.");
554   }
555   // Negative non-floating point number, parse as an int64.
556   if (!safe_strto64(number, &result->int_val)) {
557     return ReportFailure("Unable to parse number.");
558   }
559   result->type = NumberResult::INT;
560   p_.remove_prefix(index);
561   return util::Status::OK;
562 }
563 
HandleBeginObject()564 util::Status JsonStreamParser::HandleBeginObject() {
565   GOOGLE_DCHECK_EQ('{', *p_.data());
566   Advance();
567   ow_->StartObject(key_);
568   key_.clear();
569   stack_.push(ENTRY);
570   return util::Status::OK;
571 }
572 
ParseObjectMid(TokenType type)573 util::Status JsonStreamParser::ParseObjectMid(TokenType type) {
574   if (type == UNKNOWN) {
575     return ReportUnknown("Expected , or } after key:value pair.");
576   }
577 
578   // Object is complete, advance past the comma and render the EndObject.
579   if (type == END_OBJECT) {
580     Advance();
581     ow_->EndObject();
582     return util::Status::OK;
583   }
584   // Found a comma, advance past it and get ready for an entry.
585   if (type == VALUE_SEPARATOR) {
586     Advance();
587     stack_.push(ENTRY);
588     return util::Status::OK;
589   }
590   // Illegal token after key:value pair.
591   return ReportFailure("Expected , or } after key:value pair.");
592 }
593 
ParseEntry(TokenType type)594 util::Status JsonStreamParser::ParseEntry(TokenType type) {
595   if (type == UNKNOWN) {
596     return ReportUnknown("Expected an object key or }.");
597   }
598 
599   // Close the object and return. This allows for trailing commas.
600   if (type == END_OBJECT) {
601     ow_->EndObject();
602     Advance();
603     return util::Status::OK;
604   }
605 
606   util::Status result;
607   if (type == BEGIN_STRING) {
608     // Key is a string (standard JSON), parse it and store the string.
609     result = ParseStringHelper();
610     if (result.ok()) {
611       key_storage_.clear();
612       if (!parsed_storage_.empty()) {
613         parsed_storage_.swap(key_storage_);
614         key_ = StringPiece(key_storage_);
615       } else {
616         key_ = parsed_;
617       }
618       parsed_.clear();
619     }
620   } else if (type == BEGIN_KEY) {
621     // Key is a bare key (back compat), create a StringPiece pointing to it.
622     result = ParseKey();
623   } else {
624     // Unknown key type, report an error.
625     result = ReportFailure("Expected an object key or }.");
626   }
627   // On success we next expect an entry mid ':' then an object mid ',' or '}'
628   if (result.ok()) {
629     stack_.push(OBJ_MID);
630     stack_.push(ENTRY_MID);
631   }
632   return result;
633 }
634 
ParseEntryMid(TokenType type)635 util::Status JsonStreamParser::ParseEntryMid(TokenType type) {
636   if (type == UNKNOWN) {
637     return ReportUnknown("Expected : between key:value pair.");
638   }
639   if (type == ENTRY_SEPARATOR) {
640     Advance();
641     stack_.push(VALUE);
642     return util::Status::OK;
643   }
644   return ReportFailure("Expected : between key:value pair.");
645 }
646 
HandleBeginArray()647 util::Status JsonStreamParser::HandleBeginArray() {
648   GOOGLE_DCHECK_EQ('[', *p_.data());
649   Advance();
650   ow_->StartList(key_);
651   key_.clear();
652   stack_.push(ARRAY_VALUE);
653   return util::Status::OK;
654 }
655 
ParseArrayValue(TokenType type)656 util::Status JsonStreamParser::ParseArrayValue(TokenType type) {
657   if (type == UNKNOWN) {
658     return ReportUnknown("Expected a value or ] within an array.");
659   }
660 
661   if (type == END_ARRAY) {
662     ow_->EndList();
663     Advance();
664     return util::Status::OK;
665   }
666 
667   // The ParseValue call may push something onto the stack so we need to make
668   // sure an ARRAY_MID is after it, so we push it on now.
669   stack_.push(ARRAY_MID);
670   util::Status result = ParseValue(type);
671   if (result == util::Status::CANCELLED) {
672     // If we were cancelled, pop back off the ARRAY_MID so we don't try to
673     // push it on again when we try over.
674     stack_.pop();
675   }
676   return result;
677 }
678 
ParseArrayMid(TokenType type)679 util::Status JsonStreamParser::ParseArrayMid(TokenType type) {
680   if (type == UNKNOWN) {
681     return ReportUnknown("Expected , or ] after array value.");
682   }
683 
684   if (type == END_ARRAY) {
685     ow_->EndList();
686     Advance();
687     return util::Status::OK;
688   }
689 
690   // Found a comma, advance past it and expect an array value next.
691   if (type == VALUE_SEPARATOR) {
692     Advance();
693     stack_.push(ARRAY_VALUE);
694     return util::Status::OK;
695   }
696   // Illegal token after array value.
697   return ReportFailure("Expected , or ] after array value.");
698 }
699 
ParseTrue()700 util::Status JsonStreamParser::ParseTrue() {
701   ow_->RenderBool(key_, true);
702   key_.clear();
703   p_.remove_prefix(true_len);
704   return util::Status::OK;
705 }
706 
ParseFalse()707 util::Status JsonStreamParser::ParseFalse() {
708   ow_->RenderBool(key_, false);
709   key_.clear();
710   p_.remove_prefix(false_len);
711   return util::Status::OK;
712 }
713 
ParseNull()714 util::Status JsonStreamParser::ParseNull() {
715   ow_->RenderNull(key_);
716   key_.clear();
717   p_.remove_prefix(null_len);
718   return util::Status::OK;
719 }
720 
ReportFailure(StringPiece message)721 util::Status JsonStreamParser::ReportFailure(StringPiece message) {
722   static const int kContextLength = 20;
723   const char* p_start = p_.data();
724   const char* json_start = json_.data();
725   const char* begin = std::max(p_start - kContextLength, json_start);
726   const char* end =
727       std::min(p_start + kContextLength, json_start + json_.size());
728   StringPiece segment(begin, end - begin);
729   string location(p_start - begin, ' ');
730   location.push_back('^');
731   return util::Status(util::error::INVALID_ARGUMENT,
732                       StrCat(message, "\n", segment, "\n", location));
733 }
734 
ReportUnknown(StringPiece message)735 util::Status JsonStreamParser::ReportUnknown(StringPiece message) {
736   // If we aren't finishing the parse, cancel parsing and try later.
737   if (!finishing_) {
738     return util::Status::CANCELLED;
739   }
740   if (p_.empty()) {
741     return ReportFailure(StrCat("Unexpected end of string. ", message));
742   }
743   return ReportFailure(message);
744 }
745 
SkipWhitespace()746 void JsonStreamParser::SkipWhitespace() {
747   while (!p_.empty() && ascii_isspace(*p_.data())) {
748     Advance();
749   }
750 }
751 
Advance()752 void JsonStreamParser::Advance() {
753   // Advance by moving one UTF8 character while making sure we don't go beyond
754   // the length of StringPiece.
755   p_.remove_prefix(std::min<int>(
756       p_.length(), UTF8FirstLetterNumBytes(p_.data(), p_.length())));
757 }
758 
ParseKey()759 util::Status JsonStreamParser::ParseKey() {
760   StringPiece original = p_;
761   if (!ConsumeKey(&p_, &key_)) {
762     return ReportFailure("Invalid key or variable name.");
763   }
764   // If we consumed everything but expect more data, reset p_ and cancel since
765   // we can't know if the key was complete or not.
766   if (!finishing_ && p_.empty()) {
767     p_ = original;
768     return util::Status::CANCELLED;
769   }
770   // Since we aren't using the key storage, clear it out.
771   key_storage_.clear();
772   return util::Status::OK;
773 }
774 
GetNextTokenType()775 JsonStreamParser::TokenType JsonStreamParser::GetNextTokenType() {
776   SkipWhitespace();
777 
778   int size = p_.size();
779   if (size == 0) {
780     // If we ran out of data, report unknown and we'll place the previous parse
781     // type onto the stack and try again when we have more data.
782     return UNKNOWN;
783   }
784   // TODO(sven): Split this method based on context since different contexts
785   // support different tokens. Would slightly speed up processing?
786   const char* data = p_.data();
787   if (*data == '\"' || *data == '\'') return BEGIN_STRING;
788   if (*data == '-' || ('0' <= *data && *data <= '9')) {
789     return BEGIN_NUMBER;
790   }
791   if (size >= true_len && !strncmp(data, "true", true_len)) {
792     return BEGIN_TRUE;
793   }
794   if (size >= false_len && !strncmp(data, "false", false_len)) {
795     return BEGIN_FALSE;
796   }
797   if (size >= null_len && !strncmp(data, "null", null_len)) {
798     return BEGIN_NULL;
799   }
800   if (*data == '{') return BEGIN_OBJECT;
801   if (*data == '}') return END_OBJECT;
802   if (*data == '[') return BEGIN_ARRAY;
803   if (*data == ']') return END_ARRAY;
804   if (*data == ':') return ENTRY_SEPARATOR;
805   if (*data == ',') return VALUE_SEPARATOR;
806   if (MatchKey(p_)) {
807     return BEGIN_KEY;
808   }
809 
810   // We don't know that we necessarily have an invalid token here, just that we
811   // can't parse what we have so far. So we don't report an error and just
812   // return UNKNOWN so we can try again later when we have more data, or if we
813   // finish and we have leftovers.
814   return UNKNOWN;
815 }
816 
817 }  // namespace converter
818 }  // namespace util
819 }  // namespace protobuf
820 }  // namespace google
821