json_reader.cc revision ddb351dbec246cf1fab5ec20d2d5520909041de1
1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "base/json/json_reader.h"
6
7#include "base/float_util.h"
8#include "base/logging.h"
9#include "base/memory/scoped_ptr.h"
10#include "base/string_number_conversions.h"
11#include "base/string_util.h"
12#include "base/utf_string_conversions.h"
13#include "base/values.h"
14
15namespace base {
16
17static const JSONReader::Token kInvalidToken(JSONReader::Token::INVALID_TOKEN,
18                                             0, 0);
19static const int kStackLimit = 100;
20
21namespace {
22
23// A helper method for ParseNumberToken.  It reads an int from the end of
24// token.  The method returns false if there is no valid integer at the end of
25// the token.
26bool ReadInt(JSONReader::Token& token, bool can_have_leading_zeros) {
27  wchar_t first = token.NextChar();
28  int len = 0;
29
30  // Read in more digits
31  wchar_t c = first;
32  while ('\0' != c && '0' <= c && c <= '9') {
33    ++token.length;
34    ++len;
35    c = token.NextChar();
36  }
37  // We need at least 1 digit.
38  if (len == 0)
39    return false;
40
41  if (!can_have_leading_zeros && len > 1 && '0' == first)
42    return false;
43
44  return true;
45}
46
47// A helper method for ParseStringToken.  It reads |digits| hex digits from the
48// token. If the sequence if digits is not valid (contains other characters),
49// the method returns false.
50bool ReadHexDigits(JSONReader::Token& token, int digits) {
51  for (int i = 1; i <= digits; ++i) {
52    wchar_t c = *(token.begin + token.length + i);
53    if ('\0' == c)
54      return false;
55    if (!(('0' <= c && c <= '9') || ('a' <= c && c <= 'f') ||
56          ('A' <= c && c <= 'F'))) {
57      return false;
58    }
59  }
60
61  token.length += digits;
62  return true;
63}
64
65}  // anonymous namespace
66
67const char* JSONReader::kBadRootElementType =
68    "Root value must be an array or object.";
69const char* JSONReader::kInvalidEscape =
70    "Invalid escape sequence.";
71const char* JSONReader::kSyntaxError =
72    "Syntax error.";
73const char* JSONReader::kTrailingComma =
74    "Trailing comma not allowed.";
75const char* JSONReader::kTooMuchNesting =
76    "Too much nesting.";
77const char* JSONReader::kUnexpectedDataAfterRoot =
78    "Unexpected data after root element.";
79const char* JSONReader::kUnsupportedEncoding =
80    "Unsupported encoding. JSON must be UTF-8.";
81const char* JSONReader::kUnquotedDictionaryKey =
82    "Dictionary keys must be quoted.";
83
84JSONReader::JSONReader()
85    : start_pos_(NULL), json_pos_(NULL), stack_depth_(0),
86      allow_trailing_comma_(false),
87      error_code_(JSON_NO_ERROR), error_line_(0), error_col_(0) {}
88
89/* static */
90Value* JSONReader::Read(const std::string& json,
91                        bool allow_trailing_comma) {
92  return ReadAndReturnError(json, allow_trailing_comma, NULL, NULL);
93}
94
95/* static */
96Value* JSONReader::ReadAndReturnError(const std::string& json,
97                                      bool allow_trailing_comma,
98                                      int* error_code_out,
99                                      std::string* error_msg_out) {
100  JSONReader reader = JSONReader();
101  Value* root = reader.JsonToValue(json, true, allow_trailing_comma);
102  if (root)
103    return root;
104
105  if (error_code_out)
106    *error_code_out = reader.error_code();
107  if (error_msg_out)
108    *error_msg_out = reader.GetErrorMessage();
109
110  return NULL;
111}
112
113/* static */
114std::string JSONReader::ErrorCodeToString(JsonParseError error_code) {
115  switch (error_code) {
116    case JSON_NO_ERROR:
117      return std::string();
118    case JSON_BAD_ROOT_ELEMENT_TYPE:
119      return kBadRootElementType;
120    case JSON_INVALID_ESCAPE:
121      return kInvalidEscape;
122    case JSON_SYNTAX_ERROR:
123      return kSyntaxError;
124    case JSON_TRAILING_COMMA:
125      return kTrailingComma;
126    case JSON_TOO_MUCH_NESTING:
127      return kTooMuchNesting;
128    case JSON_UNEXPECTED_DATA_AFTER_ROOT:
129      return kUnexpectedDataAfterRoot;
130    case JSON_UNSUPPORTED_ENCODING:
131      return kUnsupportedEncoding;
132    case JSON_UNQUOTED_DICTIONARY_KEY:
133      return kUnquotedDictionaryKey;
134    default:
135      NOTREACHED();
136      return std::string();
137  }
138}
139
140std::string JSONReader::GetErrorMessage() const {
141  return FormatErrorMessage(error_line_, error_col_,
142                            ErrorCodeToString(error_code_));
143}
144
145Value* JSONReader::JsonToValue(const std::string& json, bool check_root,
146                               bool allow_trailing_comma) {
147  // The input must be in UTF-8.
148  if (!IsStringUTF8(json.c_str())) {
149    error_code_ = JSON_UNSUPPORTED_ENCODING;
150    return NULL;
151  }
152
153  // The conversion from UTF8 to wstring removes null bytes for us
154  // (a good thing).
155  std::wstring json_wide(UTF8ToWide(json));
156  start_pos_ = json_wide.c_str();
157
158  // When the input JSON string starts with a UTF-8 Byte-Order-Mark
159  // (0xEF, 0xBB, 0xBF), the UTF8ToWide() function converts it to a Unicode
160  // BOM (U+FEFF). To avoid the JSONReader::BuildValue() function from
161  // mis-treating a Unicode BOM as an invalid character and returning NULL,
162  // skip a converted Unicode BOM if it exists.
163  if (!json_wide.empty() && start_pos_[0] == 0xFEFF) {
164    ++start_pos_;
165  }
166
167  json_pos_ = start_pos_;
168  allow_trailing_comma_ = allow_trailing_comma;
169  stack_depth_ = 0;
170  error_code_ = JSON_NO_ERROR;
171
172  scoped_ptr<Value> root(BuildValue(check_root));
173  if (root.get()) {
174    if (ParseToken().type == Token::END_OF_INPUT) {
175      return root.release();
176    } else {
177      SetErrorCode(JSON_UNEXPECTED_DATA_AFTER_ROOT, json_pos_);
178    }
179  }
180
181  // Default to calling errors "syntax errors".
182  if (error_code_ == 0)
183    SetErrorCode(JSON_SYNTAX_ERROR, json_pos_);
184
185  return NULL;
186}
187
188/* static */
189std::string JSONReader::FormatErrorMessage(int line, int column,
190                                           const std::string& description) {
191  if (line || column) {
192    return StringPrintf("Line: %i, column: %i, %s",
193                        line, column, description.c_str());
194  }
195  return description;
196}
197
198Value* JSONReader::BuildValue(bool is_root) {
199  ++stack_depth_;
200  if (stack_depth_ > kStackLimit) {
201    SetErrorCode(JSON_TOO_MUCH_NESTING, json_pos_);
202    return NULL;
203  }
204
205  Token token = ParseToken();
206  // The root token must be an array or an object.
207  if (is_root && token.type != Token::OBJECT_BEGIN &&
208      token.type != Token::ARRAY_BEGIN) {
209    SetErrorCode(JSON_BAD_ROOT_ELEMENT_TYPE, json_pos_);
210    return NULL;
211  }
212
213  scoped_ptr<Value> node;
214
215  switch (token.type) {
216    case Token::END_OF_INPUT:
217    case Token::INVALID_TOKEN:
218      return NULL;
219
220    case Token::NULL_TOKEN:
221      node.reset(Value::CreateNullValue());
222      break;
223
224    case Token::BOOL_TRUE:
225      node.reset(Value::CreateBooleanValue(true));
226      break;
227
228    case Token::BOOL_FALSE:
229      node.reset(Value::CreateBooleanValue(false));
230      break;
231
232    case Token::NUMBER:
233      node.reset(DecodeNumber(token));
234      if (!node.get())
235        return NULL;
236      break;
237
238    case Token::STRING:
239      node.reset(DecodeString(token));
240      if (!node.get())
241        return NULL;
242      break;
243
244    case Token::ARRAY_BEGIN:
245      {
246        json_pos_ += token.length;
247        token = ParseToken();
248
249        node.reset(new ListValue());
250        while (token.type != Token::ARRAY_END) {
251          Value* array_node = BuildValue(false);
252          if (!array_node)
253            return NULL;
254          static_cast<ListValue*>(node.get())->Append(array_node);
255
256          // After a list value, we expect a comma or the end of the list.
257          token = ParseToken();
258          if (token.type == Token::LIST_SEPARATOR) {
259            json_pos_ += token.length;
260            token = ParseToken();
261            // Trailing commas are invalid according to the JSON RFC, but some
262            // consumers need the parsing leniency, so handle accordingly.
263            if (token.type == Token::ARRAY_END) {
264              if (!allow_trailing_comma_) {
265                SetErrorCode(JSON_TRAILING_COMMA, json_pos_);
266                return NULL;
267              }
268              // Trailing comma OK, stop parsing the Array.
269              break;
270            }
271          } else if (token.type != Token::ARRAY_END) {
272            // Unexpected value after list value.  Bail out.
273            return NULL;
274          }
275        }
276        if (token.type != Token::ARRAY_END) {
277          return NULL;
278        }
279        break;
280      }
281
282    case Token::OBJECT_BEGIN:
283      {
284        json_pos_ += token.length;
285        token = ParseToken();
286
287        node.reset(new DictionaryValue);
288        while (token.type != Token::OBJECT_END) {
289          if (token.type != Token::STRING) {
290            SetErrorCode(JSON_UNQUOTED_DICTIONARY_KEY, json_pos_);
291            return NULL;
292          }
293          scoped_ptr<Value> dict_key_value(DecodeString(token));
294          if (!dict_key_value.get())
295            return NULL;
296
297          // Convert the key into a wstring.
298          std::string dict_key;
299          bool success = dict_key_value->GetAsString(&dict_key);
300          DCHECK(success);
301
302          json_pos_ += token.length;
303          token = ParseToken();
304          if (token.type != Token::OBJECT_PAIR_SEPARATOR)
305            return NULL;
306
307          json_pos_ += token.length;
308          token = ParseToken();
309          Value* dict_value = BuildValue(false);
310          if (!dict_value)
311            return NULL;
312          static_cast<DictionaryValue*>(node.get())->SetWithoutPathExpansion(
313              dict_key, dict_value);
314
315          // After a key/value pair, we expect a comma or the end of the
316          // object.
317          token = ParseToken();
318          if (token.type == Token::LIST_SEPARATOR) {
319            json_pos_ += token.length;
320            token = ParseToken();
321            // Trailing commas are invalid according to the JSON RFC, but some
322            // consumers need the parsing leniency, so handle accordingly.
323            if (token.type == Token::OBJECT_END) {
324              if (!allow_trailing_comma_) {
325                SetErrorCode(JSON_TRAILING_COMMA, json_pos_);
326                return NULL;
327              }
328              // Trailing comma OK, stop parsing the Object.
329              break;
330            }
331          } else if (token.type != Token::OBJECT_END) {
332            // Unexpected value after last object value.  Bail out.
333            return NULL;
334          }
335        }
336        if (token.type != Token::OBJECT_END)
337          return NULL;
338
339        break;
340      }
341
342    default:
343      // We got a token that's not a value.
344      return NULL;
345  }
346  json_pos_ += token.length;
347
348  --stack_depth_;
349  return node.release();
350}
351
352JSONReader::Token JSONReader::ParseNumberToken() {
353  // We just grab the number here.  We validate the size in DecodeNumber.
354  // According   to RFC4627, a valid number is: [minus] int [frac] [exp]
355  Token token(Token::NUMBER, json_pos_, 0);
356  wchar_t c = *json_pos_;
357  if ('-' == c) {
358    ++token.length;
359    c = token.NextChar();
360  }
361
362  if (!ReadInt(token, false))
363    return kInvalidToken;
364
365  // Optional fraction part
366  c = token.NextChar();
367  if ('.' == c) {
368    ++token.length;
369    if (!ReadInt(token, true))
370      return kInvalidToken;
371    c = token.NextChar();
372  }
373
374  // Optional exponent part
375  if ('e' == c || 'E' == c) {
376    ++token.length;
377    c = token.NextChar();
378    if ('-' == c || '+' == c) {
379      ++token.length;
380      c = token.NextChar();
381    }
382    if (!ReadInt(token, true))
383      return kInvalidToken;
384  }
385
386  return token;
387}
388
389Value* JSONReader::DecodeNumber(const Token& token) {
390  const std::wstring num_string(token.begin, token.length);
391
392  int num_int;
393  if (StringToInt(WideToUTF8(num_string), &num_int))
394    return Value::CreateIntegerValue(num_int);
395
396  double num_double;
397  if (StringToDouble(WideToUTF8(num_string), &num_double) &&
398      base::IsFinite(num_double))
399    return Value::CreateDoubleValue(num_double);
400
401  return NULL;
402}
403
404JSONReader::Token JSONReader::ParseStringToken() {
405  Token token(Token::STRING, json_pos_, 1);
406  wchar_t c = token.NextChar();
407  while ('\0' != c) {
408    if ('\\' == c) {
409      ++token.length;
410      c = token.NextChar();
411      // Make sure the escaped char is valid.
412      switch (c) {
413        case 'x':
414          if (!ReadHexDigits(token, 2)) {
415            SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length);
416            return kInvalidToken;
417          }
418          break;
419        case 'u':
420          if (!ReadHexDigits(token, 4)) {
421            SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length);
422            return kInvalidToken;
423          }
424          break;
425        case '\\':
426        case '/':
427        case 'b':
428        case 'f':
429        case 'n':
430        case 'r':
431        case 't':
432        case 'v':
433        case '"':
434          break;
435        default:
436          SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length);
437          return kInvalidToken;
438      }
439    } else if ('"' == c) {
440      ++token.length;
441      return token;
442    }
443    ++token.length;
444    c = token.NextChar();
445  }
446  return kInvalidToken;
447}
448
449Value* JSONReader::DecodeString(const Token& token) {
450  std::wstring decoded_str;
451  decoded_str.reserve(token.length - 2);
452
453  for (int i = 1; i < token.length - 1; ++i) {
454    wchar_t c = *(token.begin + i);
455    if ('\\' == c) {
456      ++i;
457      c = *(token.begin + i);
458      switch (c) {
459        case '"':
460        case '/':
461        case '\\':
462          decoded_str.push_back(c);
463          break;
464        case 'b':
465          decoded_str.push_back('\b');
466          break;
467        case 'f':
468          decoded_str.push_back('\f');
469          break;
470        case 'n':
471          decoded_str.push_back('\n');
472          break;
473        case 'r':
474          decoded_str.push_back('\r');
475          break;
476        case 't':
477          decoded_str.push_back('\t');
478          break;
479        case 'v':
480          decoded_str.push_back('\v');
481          break;
482
483        case 'x':
484          decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 4) +
485                                HexDigitToInt(*(token.begin + i + 2)));
486          i += 2;
487          break;
488        case 'u':
489          decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 12 ) +
490                                (HexDigitToInt(*(token.begin + i + 2)) << 8) +
491                                (HexDigitToInt(*(token.begin + i + 3)) << 4) +
492                                HexDigitToInt(*(token.begin + i + 4)));
493          i += 4;
494          break;
495
496        default:
497          // We should only have valid strings at this point.  If not,
498          // ParseStringToken didn't do it's job.
499          NOTREACHED();
500          return NULL;
501      }
502    } else {
503      // Not escaped
504      decoded_str.push_back(c);
505    }
506  }
507  return Value::CreateStringValue(WideToUTF16Hack(decoded_str));
508}
509
510JSONReader::Token JSONReader::ParseToken() {
511  static const std::wstring kNullString(L"null");
512  static const std::wstring kTrueString(L"true");
513  static const std::wstring kFalseString(L"false");
514
515  EatWhitespaceAndComments();
516
517  Token token(Token::INVALID_TOKEN, 0, 0);
518  switch (*json_pos_) {
519    case '\0':
520      token.type = Token::END_OF_INPUT;
521      break;
522
523    case 'n':
524      if (NextStringMatch(kNullString))
525        token = Token(Token::NULL_TOKEN, json_pos_, 4);
526      break;
527
528    case 't':
529      if (NextStringMatch(kTrueString))
530        token = Token(Token::BOOL_TRUE, json_pos_, 4);
531      break;
532
533    case 'f':
534      if (NextStringMatch(kFalseString))
535        token = Token(Token::BOOL_FALSE, json_pos_, 5);
536      break;
537
538    case '[':
539      token = Token(Token::ARRAY_BEGIN, json_pos_, 1);
540      break;
541
542    case ']':
543      token = Token(Token::ARRAY_END, json_pos_, 1);
544      break;
545
546    case ',':
547      token = Token(Token::LIST_SEPARATOR, json_pos_, 1);
548      break;
549
550    case '{':
551      token = Token(Token::OBJECT_BEGIN, json_pos_, 1);
552      break;
553
554    case '}':
555      token = Token(Token::OBJECT_END, json_pos_, 1);
556      break;
557
558    case ':':
559      token = Token(Token::OBJECT_PAIR_SEPARATOR, json_pos_, 1);
560      break;
561
562    case '0':
563    case '1':
564    case '2':
565    case '3':
566    case '4':
567    case '5':
568    case '6':
569    case '7':
570    case '8':
571    case '9':
572    case '-':
573      token = ParseNumberToken();
574      break;
575
576    case '"':
577      token = ParseStringToken();
578      break;
579  }
580  return token;
581}
582
583void JSONReader::EatWhitespaceAndComments() {
584  while ('\0' != *json_pos_) {
585    switch (*json_pos_) {
586      case ' ':
587      case '\n':
588      case '\r':
589      case '\t':
590        ++json_pos_;
591        break;
592      case '/':
593        // TODO(tc): This isn't in the RFC so it should be a parser flag.
594        if (!EatComment())
595          return;
596        break;
597      default:
598        // Not a whitespace char, just exit.
599        return;
600    }
601  }
602}
603
604bool JSONReader::EatComment() {
605  if ('/' != *json_pos_)
606    return false;
607
608  wchar_t next_char = *(json_pos_ + 1);
609  if ('/' == next_char) {
610    // Line comment, read until \n or \r
611    json_pos_ += 2;
612    while ('\0' != *json_pos_) {
613      switch (*json_pos_) {
614        case '\n':
615        case '\r':
616          ++json_pos_;
617          return true;
618        default:
619          ++json_pos_;
620      }
621    }
622  } else if ('*' == next_char) {
623    // Block comment, read until */
624    json_pos_ += 2;
625    while ('\0' != *json_pos_) {
626      if ('*' == *json_pos_ && '/' == *(json_pos_ + 1)) {
627        json_pos_ += 2;
628        return true;
629      }
630      ++json_pos_;
631    }
632  } else {
633    return false;
634  }
635  return true;
636}
637
638bool JSONReader::NextStringMatch(const std::wstring& str) {
639  for (size_t i = 0; i < str.length(); ++i) {
640    if ('\0' == *json_pos_)
641      return false;
642    if (*(json_pos_ + i) != str[i])
643      return false;
644  }
645  return true;
646}
647
648void JSONReader::SetErrorCode(JsonParseError error,
649                              const wchar_t* error_pos) {
650  int line_number = 1;
651  int column_number = 1;
652
653  // Figure out the line and column the error occured at.
654  for (const wchar_t* pos = start_pos_; pos != error_pos; ++pos) {
655    if (*pos == '\0') {
656      NOTREACHED();
657      return;
658    }
659
660    if (*pos == '\n') {
661      ++line_number;
662      column_number = 1;
663    } else {
664      ++column_number;
665    }
666  }
667
668  error_line_ = line_number;
669  error_col_ = column_number;
670  error_code_ = error;
671}
672
673}  // namespace base
674