1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/json/json_parser.h"
6 
7 #include <cmath>
8 #include <utility>
9 #include <vector>
10 
11 #include "base/logging.h"
12 #include "base/macros.h"
13 #include "base/numerics/safe_conversions.h"
14 #include "base/strings/string_number_conversions.h"
15 #include "base/strings/string_piece.h"
16 #include "base/strings/string_util.h"
17 #include "base/strings/stringprintf.h"
18 #include "base/strings/utf_string_conversion_utils.h"
19 #include "base/strings/utf_string_conversions.h"
20 #include "base/third_party/icu/icu_utf.h"
21 #include "base/values.h"
22 
23 namespace base {
24 namespace internal {
25 
26 namespace {
27 
// First code point that is no longer plain 7-bit ASCII. Code points below
// this value are the only ones StringBuilder::Append() may record without
// switching to an owned std::string.
constexpr int32_t kExtendedASCIIStart = 0x80;
29 
30 // Simple class that checks for maximum recursion/"stack overflow."
31 class StackMarker {
32  public:
StackMarker(int max_depth,int * depth)33   StackMarker(int max_depth, int* depth)
34       : max_depth_(max_depth), depth_(depth) {
35     ++(*depth_);
36     DCHECK_LE(*depth_, max_depth_);
37   }
~StackMarker()38   ~StackMarker() {
39     --(*depth_);
40   }
41 
IsTooDeep() const42   bool IsTooDeep() const { return *depth_ >= max_depth_; }
43 
44  private:
45   const int max_depth_;
46   int* const depth_;
47 
48   DISALLOW_COPY_AND_ASSIGN(StackMarker);
49 };
50 
// Code point U+FFFD, which the parser substitutes for invalid characters when
// replacement is enabled.
constexpr uint32_t kUnicodeReplacementPoint{0xFFFD};
52 
53 }  // namespace
54 
// UTF-8 encoding (bytes EF BF BD) of U+FFFD, stored as a NUL-terminated
// string so it can be appended directly to a std::string.
const char kUnicodeReplacementString[] = "\xEF\xBF\xBD";
57 
// Creates a parser that uses the |options| bit flags and allows at most
// |max_depth| levels of nesting. All cursor and error-tracking members start
// at zero / JSON_NO_ERROR; Parse() re-initialises them for each input.
JSONParser::JSONParser(int options, int max_depth)
    : options_(options),
      max_depth_(max_depth),
      index_(0),
      stack_depth_(0),
      line_number_(0),
      index_last_line_(0),
      error_code_(JSONReader::JSON_NO_ERROR),
      error_line_(0),
      error_column_(0) {
  // A caller may lower the nesting limit but never raise it above the
  // reader-wide maximum.
  CHECK_LE(max_depth, JSONReader::kStackMaxDepth);
}

// Nothing to release explicitly; member destructors do all the work.
JSONParser::~JSONParser() = default;
72 
Parse(StringPiece input)73 Optional<Value> JSONParser::Parse(StringPiece input) {
74   input_ = input;
75   index_ = 0;
76   line_number_ = 1;
77   index_last_line_ = 0;
78 
79   error_code_ = JSONReader::JSON_NO_ERROR;
80   error_line_ = 0;
81   error_column_ = 0;
82 
83   // ICU and ReadUnicodeCharacter() use int32_t for lengths, so ensure
84   // that the index_ will not overflow when parsing.
85   if (!base::IsValueInRangeForNumericType<int32_t>(input.length())) {
86     ReportError(JSONReader::JSON_TOO_LARGE, 0);
87     return nullopt;
88   }
89 
90   // When the input JSON string starts with a UTF-8 Byte-Order-Mark,
91   // advance the start position to avoid the ParseNextToken function mis-
92   // treating a Unicode BOM as an invalid character and returning NULL.
93   ConsumeIfMatch("\xEF\xBB\xBF");
94 
95   // Parse the first and any nested tokens.
96   Optional<Value> root(ParseNextToken());
97   if (!root)
98     return nullopt;
99 
100   // Make sure the input stream is at an end.
101   if (GetNextToken() != T_END_OF_INPUT) {
102     ReportError(JSONReader::JSON_UNEXPECTED_DATA_AFTER_ROOT, 1);
103     return nullopt;
104   }
105 
106   return root;
107 }
108 
error_code() const109 JSONReader::JsonParseError JSONParser::error_code() const {
110   return error_code_;
111 }
112 
GetErrorMessage() const113 std::string JSONParser::GetErrorMessage() const {
114   return FormatErrorMessage(error_line_, error_column_,
115       JSONReader::ErrorCodeToString(error_code_));
116 }
117 
error_line() const118 int JSONParser::error_line() const {
119   return error_line_;
120 }
121 
error_column() const122 int JSONParser::error_column() const {
123   return error_column_;
124 }
125 
126 // StringBuilder ///////////////////////////////////////////////////////////////
127 
StringBuilder()128 JSONParser::StringBuilder::StringBuilder() : StringBuilder(nullptr) {}
129 
StringBuilder(const char * pos)130 JSONParser::StringBuilder::StringBuilder(const char* pos)
131     : pos_(pos), length_(0) {}
132 
133 JSONParser::StringBuilder::~StringBuilder() = default;
134 
135 JSONParser::StringBuilder& JSONParser::StringBuilder::operator=(
136     StringBuilder&& other) = default;
137 
Append(uint32_t point)138 void JSONParser::StringBuilder::Append(uint32_t point) {
139   DCHECK(IsValidCharacter(point));
140 
141   if (point < kExtendedASCIIStart && !string_) {
142     DCHECK_EQ(static_cast<char>(point), pos_[length_]);
143     ++length_;
144   } else {
145     Convert();
146     if (UNLIKELY(point == kUnicodeReplacementPoint)) {
147       string_->append(kUnicodeReplacementString);
148     } else {
149       WriteUnicodeCharacter(point, &*string_);
150     }
151   }
152 }
153 
Convert()154 void JSONParser::StringBuilder::Convert() {
155   if (string_)
156     return;
157   string_.emplace(pos_, length_);
158 }
159 
DestructiveAsString()160 std::string JSONParser::StringBuilder::DestructiveAsString() {
161   if (string_)
162     return std::move(*string_);
163   return std::string(pos_, length_);
164 }
165 
166 // JSONParser private //////////////////////////////////////////////////////////
167 
PeekChars(int count)168 Optional<StringPiece> JSONParser::PeekChars(int count) {
169   if (static_cast<size_t>(index_) + count > input_.length())
170     return nullopt;
171   // Using StringPiece::substr() is significantly slower (according to
172   // base_perftests) than constructing a substring manually.
173   return StringPiece(input_.data() + index_, count);
174 }
175 
PeekChar()176 Optional<char> JSONParser::PeekChar() {
177   Optional<StringPiece> chars = PeekChars(1);
178   if (chars)
179     return (*chars)[0];
180   return nullopt;
181 }
182 
ConsumeChars(int count)183 Optional<StringPiece> JSONParser::ConsumeChars(int count) {
184   Optional<StringPiece> chars = PeekChars(count);
185   if (chars)
186     index_ += count;
187   return chars;
188 }
189 
ConsumeChar()190 Optional<char> JSONParser::ConsumeChar() {
191   Optional<StringPiece> chars = ConsumeChars(1);
192   if (chars)
193     return (*chars)[0];
194   return nullopt;
195 }
196 
// Returns a pointer to the byte at the current cursor position. The cursor
// may sit one past the last byte (end of input) but never beyond that; the
// CHECK enforces this before the pointer is formed.
const char* JSONParser::pos() {
  CHECK_LE(static_cast<size_t>(index_), input_.length());
  return input_.data() + index_;
}
201 
GetNextToken()202 JSONParser::Token JSONParser::GetNextToken() {
203   EatWhitespaceAndComments();
204 
205   Optional<char> c = PeekChar();
206   if (!c)
207     return T_END_OF_INPUT;
208 
209   switch (*c) {
210     case '{':
211       return T_OBJECT_BEGIN;
212     case '}':
213       return T_OBJECT_END;
214     case '[':
215       return T_ARRAY_BEGIN;
216     case ']':
217       return T_ARRAY_END;
218     case '"':
219       return T_STRING;
220     case '0':
221     case '1':
222     case '2':
223     case '3':
224     case '4':
225     case '5':
226     case '6':
227     case '7':
228     case '8':
229     case '9':
230     case '-':
231       return T_NUMBER;
232     case 't':
233       return T_BOOL_TRUE;
234     case 'f':
235       return T_BOOL_FALSE;
236     case 'n':
237       return T_NULL;
238     case ',':
239       return T_LIST_SEPARATOR;
240     case ':':
241       return T_OBJECT_PAIR_SEPARATOR;
242     default:
243       return T_INVALID_TOKEN;
244   }
245 }
246 
EatWhitespaceAndComments()247 void JSONParser::EatWhitespaceAndComments() {
248   while (Optional<char> c = PeekChar()) {
249     switch (*c) {
250       case '\r':
251       case '\n':
252         index_last_line_ = index_;
253         // Don't increment line_number_ twice for "\r\n".
254         if (!(c == '\n' && index_ > 0 && input_[index_ - 1] == '\r')) {
255           ++line_number_;
256         }
257         FALLTHROUGH;
258       case ' ':
259       case '\t':
260         ConsumeChar();
261         break;
262       case '/':
263         if (!EatComment())
264           return;
265         break;
266       default:
267         return;
268     }
269   }
270 }
271 
EatComment()272 bool JSONParser::EatComment() {
273   Optional<StringPiece> comment_start = ConsumeChars(2);
274   if (!comment_start)
275     return false;
276 
277   if (comment_start == "//") {
278     // Single line comment, read to newline.
279     while (Optional<char> c = PeekChar()) {
280       if (c == '\n' || c == '\r')
281         return true;
282       ConsumeChar();
283     }
284   } else if (comment_start == "/*") {
285     char previous_char = '\0';
286     // Block comment, read until end marker.
287     while (Optional<char> c = PeekChar()) {
288       if (previous_char == '*' && c == '/') {
289         // EatWhitespaceAndComments will inspect pos(), which will still be on
290         // the last / of the comment, so advance once more (which may also be
291         // end of input).
292         ConsumeChar();
293         return true;
294       }
295       previous_char = *ConsumeChar();
296     }
297 
298     // If the comment is unterminated, GetNextToken will report T_END_OF_INPUT.
299   }
300 
301   return false;
302 }
303 
ParseNextToken()304 Optional<Value> JSONParser::ParseNextToken() {
305   return ParseToken(GetNextToken());
306 }
307 
ParseToken(Token token)308 Optional<Value> JSONParser::ParseToken(Token token) {
309   switch (token) {
310     case T_OBJECT_BEGIN:
311       return ConsumeDictionary();
312     case T_ARRAY_BEGIN:
313       return ConsumeList();
314     case T_STRING:
315       return ConsumeString();
316     case T_NUMBER:
317       return ConsumeNumber();
318     case T_BOOL_TRUE:
319     case T_BOOL_FALSE:
320     case T_NULL:
321       return ConsumeLiteral();
322     default:
323       ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);
324       return nullopt;
325   }
326 }
327 
ConsumeDictionary()328 Optional<Value> JSONParser::ConsumeDictionary() {
329   if (ConsumeChar() != '{') {
330     ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);
331     return nullopt;
332   }
333 
334   StackMarker depth_check(max_depth_, &stack_depth_);
335   if (depth_check.IsTooDeep()) {
336     ReportError(JSONReader::JSON_TOO_MUCH_NESTING, 0);
337     return nullopt;
338   }
339 
340   std::vector<Value::DictStorage::value_type> dict_storage;
341 
342   Token token = GetNextToken();
343   while (token != T_OBJECT_END) {
344     if (token != T_STRING) {
345       ReportError(JSONReader::JSON_UNQUOTED_DICTIONARY_KEY, 1);
346       return nullopt;
347     }
348 
349     // First consume the key.
350     StringBuilder key;
351     if (!ConsumeStringRaw(&key)) {
352       return nullopt;
353     }
354 
355     // Read the separator.
356     token = GetNextToken();
357     if (token != T_OBJECT_PAIR_SEPARATOR) {
358       ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
359       return nullopt;
360     }
361 
362     // The next token is the value. Ownership transfers to |dict|.
363     ConsumeChar();
364     Optional<Value> value = ParseNextToken();
365     if (!value) {
366       // ReportError from deeper level.
367       return nullopt;
368     }
369 
370     dict_storage.emplace_back(key.DestructiveAsString(),
371                               std::make_unique<Value>(std::move(*value)));
372 
373     token = GetNextToken();
374     if (token == T_LIST_SEPARATOR) {
375       ConsumeChar();
376       token = GetNextToken();
377       if (token == T_OBJECT_END && !(options_ & JSON_ALLOW_TRAILING_COMMAS)) {
378         ReportError(JSONReader::JSON_TRAILING_COMMA, 1);
379         return nullopt;
380       }
381     } else if (token != T_OBJECT_END) {
382       ReportError(JSONReader::JSON_SYNTAX_ERROR, 0);
383       return nullopt;
384     }
385   }
386 
387   ConsumeChar();  // Closing '}'.
388 
389   return Value(Value::DictStorage(std::move(dict_storage), KEEP_LAST_OF_DUPES));
390 }
391 
ConsumeList()392 Optional<Value> JSONParser::ConsumeList() {
393   if (ConsumeChar() != '[') {
394     ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);
395     return nullopt;
396   }
397 
398   StackMarker depth_check(max_depth_, &stack_depth_);
399   if (depth_check.IsTooDeep()) {
400     ReportError(JSONReader::JSON_TOO_MUCH_NESTING, 0);
401     return nullopt;
402   }
403 
404   Value::ListStorage list_storage;
405 
406   Token token = GetNextToken();
407   while (token != T_ARRAY_END) {
408     Optional<Value> item = ParseToken(token);
409     if (!item) {
410       // ReportError from deeper level.
411       return nullopt;
412     }
413 
414     list_storage.push_back(std::move(*item));
415 
416     token = GetNextToken();
417     if (token == T_LIST_SEPARATOR) {
418       ConsumeChar();
419       token = GetNextToken();
420       if (token == T_ARRAY_END && !(options_ & JSON_ALLOW_TRAILING_COMMAS)) {
421         ReportError(JSONReader::JSON_TRAILING_COMMA, 1);
422         return nullopt;
423       }
424     } else if (token != T_ARRAY_END) {
425       ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
426       return nullopt;
427     }
428   }
429 
430   ConsumeChar();  // Closing ']'.
431 
432   return Value(std::move(list_storage));
433 }
434 
ConsumeString()435 Optional<Value> JSONParser::ConsumeString() {
436   StringBuilder string;
437   if (!ConsumeStringRaw(&string))
438     return nullopt;
439 
440   return Value(string.DestructiveAsString());
441 }
442 
ConsumeStringRaw(StringBuilder * out)443 bool JSONParser::ConsumeStringRaw(StringBuilder* out) {
444   if (ConsumeChar() != '"') {
445     ReportError(JSONReader::JSON_UNEXPECTED_TOKEN, 1);
446     return false;
447   }
448 
449   // StringBuilder will internally build a StringPiece unless a UTF-16
450   // conversion occurs, at which point it will perform a copy into a
451   // std::string.
452   StringBuilder string(pos());
453 
454   while (PeekChar()) {
455     uint32_t next_char = 0;
456     if (!ReadUnicodeCharacter(input_.data(),
457                               static_cast<int32_t>(input_.length()),
458                               &index_,
459                               &next_char) ||
460         !IsValidCharacter(next_char)) {
461       if ((options_ & JSON_REPLACE_INVALID_CHARACTERS) == 0) {
462         ReportError(JSONReader::JSON_UNSUPPORTED_ENCODING, 1);
463         return false;
464       }
465       ConsumeChar();
466       string.Append(kUnicodeReplacementPoint);
467       continue;
468     }
469 
470     if (next_char == '"') {
471       ConsumeChar();
472       *out = std::move(string);
473       return true;
474     } else if (next_char != '\\') {
475       // If this character is not an escape sequence...
476       ConsumeChar();
477       string.Append(next_char);
478     } else {
479       // And if it is an escape sequence, the input string will be adjusted
480       // (either by combining the two characters of an encoded escape sequence,
481       // or with a UTF conversion), so using StringPiece isn't possible -- force
482       // a conversion.
483       string.Convert();
484 
485       // Read past the escape '\' and ensure there's a character following.
486       Optional<StringPiece> escape_sequence = ConsumeChars(2);
487       if (!escape_sequence) {
488         ReportError(JSONReader::JSON_INVALID_ESCAPE, 0);
489         return false;
490       }
491 
492       switch ((*escape_sequence)[1]) {
493         // Allowed esape sequences:
494         case 'x': {  // UTF-8 sequence.
495           // UTF-8 \x escape sequences are not allowed in the spec, but they
496           // are supported here for backwards-compatiblity with the old parser.
497           escape_sequence = ConsumeChars(2);
498           if (!escape_sequence) {
499             ReportError(JSONReader::JSON_INVALID_ESCAPE, -2);
500             return false;
501           }
502 
503           int hex_digit = 0;
504           if (!HexStringToInt(*escape_sequence, &hex_digit) ||
505               !IsValidCharacter(hex_digit)) {
506             ReportError(JSONReader::JSON_INVALID_ESCAPE, -2);
507             return false;
508           }
509 
510           string.Append(hex_digit);
511           break;
512         }
513         case 'u': {  // UTF-16 sequence.
514           // UTF units are of the form \uXXXX.
515           uint32_t code_point;
516           if (!DecodeUTF16(&code_point)) {
517             ReportError(JSONReader::JSON_INVALID_ESCAPE, 0);
518             return false;
519           }
520           string.Append(code_point);
521           break;
522         }
523         case '"':
524           string.Append('"');
525           break;
526         case '\\':
527           string.Append('\\');
528           break;
529         case '/':
530           string.Append('/');
531           break;
532         case 'b':
533           string.Append('\b');
534           break;
535         case 'f':
536           string.Append('\f');
537           break;
538         case 'n':
539           string.Append('\n');
540           break;
541         case 'r':
542           string.Append('\r');
543           break;
544         case 't':
545           string.Append('\t');
546           break;
547         case 'v':  // Not listed as valid escape sequence in the RFC.
548           string.Append('\v');
549           break;
550         // All other escape squences are illegal.
551         default:
552           ReportError(JSONReader::JSON_INVALID_ESCAPE, 0);
553           return false;
554       }
555     }
556   }
557 
558   ReportError(JSONReader::JSON_SYNTAX_ERROR, 0);
559   return false;
560 }
561 
562 // Entry is at the first X in \uXXXX.
DecodeUTF16(uint32_t * out_code_point)563 bool JSONParser::DecodeUTF16(uint32_t* out_code_point) {
564   Optional<StringPiece> escape_sequence = ConsumeChars(4);
565   if (!escape_sequence)
566     return false;
567 
568   // Consume the UTF-16 code unit, which may be a high surrogate.
569   int code_unit16_high = 0;
570   if (!HexStringToInt(*escape_sequence, &code_unit16_high))
571     return false;
572 
573   // If this is a high surrogate, consume the next code unit to get the
574   // low surrogate.
575   if (CBU16_IS_SURROGATE(code_unit16_high)) {
576     // Make sure this is the high surrogate. If not, it's an encoding
577     // error.
578     if (!CBU16_IS_SURROGATE_LEAD(code_unit16_high))
579       return false;
580 
581     // Make sure that the token has more characters to consume the
582     // lower surrogate.
583     if (!ConsumeIfMatch("\\u"))
584       return false;
585 
586     escape_sequence = ConsumeChars(4);
587     if (!escape_sequence)
588       return false;
589 
590     int code_unit16_low = 0;
591     if (!HexStringToInt(*escape_sequence, &code_unit16_low))
592       return false;
593 
594     if (!CBU16_IS_TRAIL(code_unit16_low))
595       return false;
596 
597     uint32_t code_point =
598         CBU16_GET_SUPPLEMENTARY(code_unit16_high, code_unit16_low);
599     if (!IsValidCharacter(code_point))
600       return false;
601 
602     *out_code_point = code_point;
603   } else {
604     // Not a surrogate.
605     DCHECK(CBU16_IS_SINGLE(code_unit16_high));
606     if (!IsValidCharacter(code_unit16_high)) {
607       if ((options_ & JSON_REPLACE_INVALID_CHARACTERS) == 0) {
608         return false;
609       }
610       *out_code_point = kUnicodeReplacementPoint;
611       return true;
612     }
613 
614     *out_code_point = code_unit16_high;
615   }
616 
617   return true;
618 }
619 
ConsumeNumber()620 Optional<Value> JSONParser::ConsumeNumber() {
621   const char* num_start = pos();
622   const int start_index = index_;
623   int end_index = start_index;
624 
625   if (PeekChar() == '-')
626     ConsumeChar();
627 
628   if (!ReadInt(false)) {
629     ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
630     return nullopt;
631   }
632   end_index = index_;
633 
634   // The optional fraction part.
635   if (PeekChar() == '.') {
636     ConsumeChar();
637     if (!ReadInt(true)) {
638       ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
639       return nullopt;
640     }
641     end_index = index_;
642   }
643 
644   // Optional exponent part.
645   Optional<char> c = PeekChar();
646   if (c == 'e' || c == 'E') {
647     ConsumeChar();
648     if (PeekChar() == '-' || PeekChar() == '+') {
649       ConsumeChar();
650     }
651     if (!ReadInt(true)) {
652       ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
653       return nullopt;
654     }
655     end_index = index_;
656   }
657 
658   // ReadInt is greedy because numbers have no easily detectable sentinel,
659   // so save off where the parser should be on exit (see Consume invariant at
660   // the top of the header), then make sure the next token is one which is
661   // valid.
662   int exit_index = index_;
663 
664   switch (GetNextToken()) {
665     case T_OBJECT_END:
666     case T_ARRAY_END:
667     case T_LIST_SEPARATOR:
668     case T_END_OF_INPUT:
669       break;
670     default:
671       ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
672       return nullopt;
673   }
674 
675   index_ = exit_index;
676 
677   StringPiece num_string(num_start, end_index - start_index);
678 
679   int num_int;
680   if (StringToInt(num_string, &num_int))
681     return Value(num_int);
682 
683   double num_double;
684   if (StringToDouble(num_string.as_string(), &num_double) &&
685       std::isfinite(num_double)) {
686     return Value(num_double);
687   }
688 
689   return nullopt;
690 }
691 
ReadInt(bool allow_leading_zeros)692 bool JSONParser::ReadInt(bool allow_leading_zeros) {
693   size_t len = 0;
694   char first = 0;
695 
696   while (Optional<char> c = PeekChar()) {
697     if (!IsAsciiDigit(c))
698       break;
699 
700     if (len == 0)
701       first = *c;
702 
703     ++len;
704     ConsumeChar();
705   }
706 
707   if (len == 0)
708     return false;
709 
710   if (!allow_leading_zeros && len > 1 && first == '0')
711     return false;
712 
713   return true;
714 }
715 
ConsumeLiteral()716 Optional<Value> JSONParser::ConsumeLiteral() {
717   if (ConsumeIfMatch("true")) {
718     return Value(true);
719   } else if (ConsumeIfMatch("false")) {
720     return Value(false);
721   } else if (ConsumeIfMatch("null")) {
722     return Value(Value::Type::NONE);
723   } else {
724     ReportError(JSONReader::JSON_SYNTAX_ERROR, 1);
725     return nullopt;
726   }
727 }
728 
ConsumeIfMatch(StringPiece match)729 bool JSONParser::ConsumeIfMatch(StringPiece match) {
730   if (match == PeekChars(match.size())) {
731     ConsumeChars(match.size());
732     return true;
733   }
734   return false;
735 }
736 
ReportError(JSONReader::JsonParseError code,int column_adjust)737 void JSONParser::ReportError(JSONReader::JsonParseError code,
738                              int column_adjust) {
739   error_code_ = code;
740   error_line_ = line_number_;
741   error_column_ = index_ - index_last_line_ + column_adjust;
742 }
743 
744 // static
FormatErrorMessage(int line,int column,const std::string & description)745 std::string JSONParser::FormatErrorMessage(int line, int column,
746                                            const std::string& description) {
747   if (line || column) {
748     return StringPrintf("Line: %i, column: %i, %s",
749         line, column, description.c_str());
750   }
751   return description;
752 }
753 
754 }  // namespace internal
755 }  // namespace base
756