json_reader.cc revision c7f5f8508d98d5952d42ed7648c2a8f30a4da156
1// Copyright (c) 2009 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "base/json/json_reader.h"
6
7#include "base/float_util.h"
8#include "base/logging.h"
9#include "base/scoped_ptr.h"
10#include "base/string_util.h"
11#include "base/utf_string_conversions.h"
12#include "base/values.h"
13
14namespace base {
15
16static const JSONReader::Token kInvalidToken(JSONReader::Token::INVALID_TOKEN,
17                                             0, 0);
18static const int kStackLimit = 100;
19
20namespace {
21
22inline int HexToInt(wchar_t c) {
23  if ('0' <= c && c <= '9') {
24    return c - '0';
25  } else if ('A' <= c && c <= 'F') {
26    return c - 'A' + 10;
27  } else if ('a' <= c && c <= 'f') {
28    return c - 'a' + 10;
29  }
30  NOTREACHED();
31  return 0;
32}
33
34// A helper method for ParseNumberToken.  It reads an int from the end of
35// token.  The method returns false if there is no valid integer at the end of
36// the token.
37bool ReadInt(JSONReader::Token& token, bool can_have_leading_zeros) {
38  wchar_t first = token.NextChar();
39  int len = 0;
40
41  // Read in more digits
42  wchar_t c = first;
43  while ('\0' != c && '0' <= c && c <= '9') {
44    ++token.length;
45    ++len;
46    c = token.NextChar();
47  }
48  // We need at least 1 digit.
49  if (len == 0)
50    return false;
51
52  if (!can_have_leading_zeros && len > 1 && '0' == first)
53    return false;
54
55  return true;
56}
57
58// A helper method for ParseStringToken.  It reads |digits| hex digits from the
59// token. If the sequence if digits is not valid (contains other characters),
60// the method returns false.
61bool ReadHexDigits(JSONReader::Token& token, int digits) {
62  for (int i = 1; i <= digits; ++i) {
63    wchar_t c = *(token.begin + token.length + i);
64    if ('\0' == c)
65      return false;
66    if (!(('0' <= c && c <= '9') || ('a' <= c && c <= 'f') ||
67          ('A' <= c && c <= 'F'))) {
68      return false;
69    }
70  }
71
72  token.length += digits;
73  return true;
74}
75
76}  // anonymous namespace
77
78const char* JSONReader::kBadRootElementType =
79    "Root value must be an array or object.";
80const char* JSONReader::kInvalidEscape =
81    "Invalid escape sequence.";
82const char* JSONReader::kSyntaxError =
83    "Syntax error.";
84const char* JSONReader::kTrailingComma =
85    "Trailing comma not allowed.";
86const char* JSONReader::kTooMuchNesting =
87    "Too much nesting.";
88const char* JSONReader::kUnexpectedDataAfterRoot =
89    "Unexpected data after root element.";
90const char* JSONReader::kUnsupportedEncoding =
91    "Unsupported encoding. JSON must be UTF-8.";
92const char* JSONReader::kUnquotedDictionaryKey =
93    "Dictionary keys must be quoted.";
94
95/* static */
96Value* JSONReader::Read(const std::string& json,
97                        bool allow_trailing_comma) {
98  return ReadAndReturnError(json, allow_trailing_comma, NULL);
99}
100
101/* static */
102Value* JSONReader::ReadAndReturnError(const std::string& json,
103                                      bool allow_trailing_comma,
104                                      std::string *error_message_out) {
105  JSONReader reader = JSONReader();
106  Value* root = reader.JsonToValue(json, true, allow_trailing_comma);
107  if (root)
108    return root;
109
110  if (error_message_out)
111    *error_message_out = reader.error_message();
112
113  return NULL;
114}
115
116/* static */
117std::string JSONReader::FormatErrorMessage(int line, int column,
118                                           const char* description) {
119  return StringPrintf("Line: %i, column: %i, %s",
120                      line, column, description);
121}
122
123JSONReader::JSONReader()
124    : start_pos_(NULL), json_pos_(NULL), stack_depth_(0),
125      allow_trailing_comma_(false) {}
126
127Value* JSONReader::JsonToValue(const std::string& json, bool check_root,
128                               bool allow_trailing_comma) {
129  // The input must be in UTF-8.
130  if (!IsStringUTF8(json.c_str())) {
131    error_message_ = kUnsupportedEncoding;
132    return NULL;
133  }
134
135  // The conversion from UTF8 to wstring removes null bytes for us
136  // (a good thing).
137  std::wstring json_wide(UTF8ToWide(json));
138  start_pos_ = json_wide.c_str();
139
140  // When the input JSON string starts with a UTF-8 Byte-Order-Mark
141  // (0xEF, 0xBB, 0xBF), the UTF8ToWide() function converts it to a Unicode
142  // BOM (U+FEFF). To avoid the JSONReader::BuildValue() function from
143  // mis-treating a Unicode BOM as an invalid character and returning NULL,
144  // skip a converted Unicode BOM if it exists.
145  if (!json_wide.empty() && start_pos_[0] == 0xFEFF) {
146    ++start_pos_;
147  }
148
149  json_pos_ = start_pos_;
150  allow_trailing_comma_ = allow_trailing_comma;
151  stack_depth_ = 0;
152  error_message_.clear();
153
154  scoped_ptr<Value> root(BuildValue(check_root));
155  if (root.get()) {
156    if (ParseToken().type == Token::END_OF_INPUT) {
157      return root.release();
158    } else {
159      SetErrorMessage(kUnexpectedDataAfterRoot, json_pos_);
160    }
161  }
162
163  // Default to calling errors "syntax errors".
164  if (error_message_.empty())
165    SetErrorMessage(kSyntaxError, json_pos_);
166
167  return NULL;
168}
169
170Value* JSONReader::BuildValue(bool is_root) {
171  ++stack_depth_;
172  if (stack_depth_ > kStackLimit) {
173    SetErrorMessage(kTooMuchNesting, json_pos_);
174    return NULL;
175  }
176
177  Token token = ParseToken();
178  // The root token must be an array or an object.
179  if (is_root && token.type != Token::OBJECT_BEGIN &&
180      token.type != Token::ARRAY_BEGIN) {
181    SetErrorMessage(kBadRootElementType, json_pos_);
182    return NULL;
183  }
184
185  scoped_ptr<Value> node;
186
187  switch (token.type) {
188    case Token::END_OF_INPUT:
189    case Token::INVALID_TOKEN:
190      return NULL;
191
192    case Token::NULL_TOKEN:
193      node.reset(Value::CreateNullValue());
194      break;
195
196    case Token::BOOL_TRUE:
197      node.reset(Value::CreateBooleanValue(true));
198      break;
199
200    case Token::BOOL_FALSE:
201      node.reset(Value::CreateBooleanValue(false));
202      break;
203
204    case Token::NUMBER:
205      node.reset(DecodeNumber(token));
206      if (!node.get())
207        return NULL;
208      break;
209
210    case Token::STRING:
211      node.reset(DecodeString(token));
212      if (!node.get())
213        return NULL;
214      break;
215
216    case Token::ARRAY_BEGIN:
217      {
218        json_pos_ += token.length;
219        token = ParseToken();
220
221        node.reset(new ListValue());
222        while (token.type != Token::ARRAY_END) {
223          Value* array_node = BuildValue(false);
224          if (!array_node)
225            return NULL;
226          static_cast<ListValue*>(node.get())->Append(array_node);
227
228          // After a list value, we expect a comma or the end of the list.
229          token = ParseToken();
230          if (token.type == Token::LIST_SEPARATOR) {
231            json_pos_ += token.length;
232            token = ParseToken();
233            // Trailing commas are invalid according to the JSON RFC, but some
234            // consumers need the parsing leniency, so handle accordingly.
235            if (token.type == Token::ARRAY_END) {
236              if (!allow_trailing_comma_) {
237                SetErrorMessage(kTrailingComma, json_pos_);
238                return NULL;
239              }
240              // Trailing comma OK, stop parsing the Array.
241              break;
242            }
243          } else if (token.type != Token::ARRAY_END) {
244            // Unexpected value after list value.  Bail out.
245            return NULL;
246          }
247        }
248        if (token.type != Token::ARRAY_END) {
249          return NULL;
250        }
251        break;
252      }
253
254    case Token::OBJECT_BEGIN:
255      {
256        json_pos_ += token.length;
257        token = ParseToken();
258
259        node.reset(new DictionaryValue);
260        while (token.type != Token::OBJECT_END) {
261          if (token.type != Token::STRING) {
262            SetErrorMessage(kUnquotedDictionaryKey, json_pos_);
263            return NULL;
264          }
265          scoped_ptr<Value> dict_key_value(DecodeString(token));
266          if (!dict_key_value.get())
267            return NULL;
268
269          // Convert the key into a wstring.
270          std::wstring dict_key;
271          bool success = dict_key_value->GetAsString(&dict_key);
272          DCHECK(success);
273
274          json_pos_ += token.length;
275          token = ParseToken();
276          if (token.type != Token::OBJECT_PAIR_SEPARATOR)
277            return NULL;
278
279          json_pos_ += token.length;
280          token = ParseToken();
281          Value* dict_value = BuildValue(false);
282          if (!dict_value)
283            return NULL;
284          static_cast<DictionaryValue*>(node.get())->SetWithoutPathExpansion(
285              dict_key, dict_value);
286
287          // After a key/value pair, we expect a comma or the end of the
288          // object.
289          token = ParseToken();
290          if (token.type == Token::LIST_SEPARATOR) {
291            json_pos_ += token.length;
292            token = ParseToken();
293            // Trailing commas are invalid according to the JSON RFC, but some
294            // consumers need the parsing leniency, so handle accordingly.
295            if (token.type == Token::OBJECT_END) {
296              if (!allow_trailing_comma_) {
297                SetErrorMessage(kTrailingComma, json_pos_);
298                return NULL;
299              }
300              // Trailing comma OK, stop parsing the Object.
301              break;
302            }
303          } else if (token.type != Token::OBJECT_END) {
304            // Unexpected value after last object value.  Bail out.
305            return NULL;
306          }
307        }
308        if (token.type != Token::OBJECT_END)
309          return NULL;
310
311        break;
312      }
313
314    default:
315      // We got a token that's not a value.
316      return NULL;
317  }
318  json_pos_ += token.length;
319
320  --stack_depth_;
321  return node.release();
322}
323
324JSONReader::Token JSONReader::ParseNumberToken() {
325  // We just grab the number here.  We validate the size in DecodeNumber.
326  // According   to RFC4627, a valid number is: [minus] int [frac] [exp]
327  Token token(Token::NUMBER, json_pos_, 0);
328  wchar_t c = *json_pos_;
329  if ('-' == c) {
330    ++token.length;
331    c = token.NextChar();
332  }
333
334  if (!ReadInt(token, false))
335    return kInvalidToken;
336
337  // Optional fraction part
338  c = token.NextChar();
339  if ('.' == c) {
340    ++token.length;
341    if (!ReadInt(token, true))
342      return kInvalidToken;
343    c = token.NextChar();
344  }
345
346  // Optional exponent part
347  if ('e' == c || 'E' == c) {
348    ++token.length;
349    c = token.NextChar();
350    if ('-' == c || '+' == c) {
351      ++token.length;
352      c = token.NextChar();
353    }
354    if (!ReadInt(token, true))
355      return kInvalidToken;
356  }
357
358  return token;
359}
360
361Value* JSONReader::DecodeNumber(const Token& token) {
362  const std::wstring num_string(token.begin, token.length);
363
364  int num_int;
365  if (StringToInt(WideToUTF16Hack(num_string), &num_int))
366    return Value::CreateIntegerValue(num_int);
367
368  double num_double;
369  if (StringToDouble(WideToUTF16Hack(num_string), &num_double) &&
370      base::IsFinite(num_double))
371    return Value::CreateRealValue(num_double);
372
373  return NULL;
374}
375
376JSONReader::Token JSONReader::ParseStringToken() {
377  Token token(Token::STRING, json_pos_, 1);
378  wchar_t c = token.NextChar();
379  while ('\0' != c) {
380    if ('\\' == c) {
381      ++token.length;
382      c = token.NextChar();
383      // Make sure the escaped char is valid.
384      switch (c) {
385        case 'x':
386          if (!ReadHexDigits(token, 2)) {
387            SetErrorMessage(kInvalidEscape, json_pos_ + token.length);
388            return kInvalidToken;
389          }
390          break;
391        case 'u':
392          if (!ReadHexDigits(token, 4)) {
393            SetErrorMessage(kInvalidEscape, json_pos_ + token.length);
394            return kInvalidToken;
395          }
396          break;
397        case '\\':
398        case '/':
399        case 'b':
400        case 'f':
401        case 'n':
402        case 'r':
403        case 't':
404        case 'v':
405        case '"':
406          break;
407        default:
408          SetErrorMessage(kInvalidEscape, json_pos_ + token.length);
409          return kInvalidToken;
410      }
411    } else if ('"' == c) {
412      ++token.length;
413      return token;
414    }
415    ++token.length;
416    c = token.NextChar();
417  }
418  return kInvalidToken;
419}
420
421Value* JSONReader::DecodeString(const Token& token) {
422  std::wstring decoded_str;
423  decoded_str.reserve(token.length - 2);
424
425  for (int i = 1; i < token.length - 1; ++i) {
426    wchar_t c = *(token.begin + i);
427    if ('\\' == c) {
428      ++i;
429      c = *(token.begin + i);
430      switch (c) {
431        case '"':
432        case '/':
433        case '\\':
434          decoded_str.push_back(c);
435          break;
436        case 'b':
437          decoded_str.push_back('\b');
438          break;
439        case 'f':
440          decoded_str.push_back('\f');
441          break;
442        case 'n':
443          decoded_str.push_back('\n');
444          break;
445        case 'r':
446          decoded_str.push_back('\r');
447          break;
448        case 't':
449          decoded_str.push_back('\t');
450          break;
451        case 'v':
452          decoded_str.push_back('\v');
453          break;
454
455        case 'x':
456          decoded_str.push_back((HexToInt(*(token.begin + i + 1)) << 4) +
457                                HexToInt(*(token.begin + i + 2)));
458          i += 2;
459          break;
460        case 'u':
461          decoded_str.push_back((HexToInt(*(token.begin + i + 1)) << 12 ) +
462                                (HexToInt(*(token.begin + i + 2)) << 8) +
463                                (HexToInt(*(token.begin + i + 3)) << 4) +
464                                HexToInt(*(token.begin + i + 4)));
465          i += 4;
466          break;
467
468        default:
469          // We should only have valid strings at this point.  If not,
470          // ParseStringToken didn't do it's job.
471          NOTREACHED();
472          return NULL;
473      }
474    } else {
475      // Not escaped
476      decoded_str.push_back(c);
477    }
478  }
479  return Value::CreateStringValue(decoded_str);
480}
481
482JSONReader::Token JSONReader::ParseToken() {
483  static const std::wstring kNullString(L"null");
484  static const std::wstring kTrueString(L"true");
485  static const std::wstring kFalseString(L"false");
486
487  EatWhitespaceAndComments();
488
489  Token token(Token::INVALID_TOKEN, 0, 0);
490  switch (*json_pos_) {
491    case '\0':
492      token.type = Token::END_OF_INPUT;
493      break;
494
495    case 'n':
496      if (NextStringMatch(kNullString))
497        token = Token(Token::NULL_TOKEN, json_pos_, 4);
498      break;
499
500    case 't':
501      if (NextStringMatch(kTrueString))
502        token = Token(Token::BOOL_TRUE, json_pos_, 4);
503      break;
504
505    case 'f':
506      if (NextStringMatch(kFalseString))
507        token = Token(Token::BOOL_FALSE, json_pos_, 5);
508      break;
509
510    case '[':
511      token = Token(Token::ARRAY_BEGIN, json_pos_, 1);
512      break;
513
514    case ']':
515      token = Token(Token::ARRAY_END, json_pos_, 1);
516      break;
517
518    case ',':
519      token = Token(Token::LIST_SEPARATOR, json_pos_, 1);
520      break;
521
522    case '{':
523      token = Token(Token::OBJECT_BEGIN, json_pos_, 1);
524      break;
525
526    case '}':
527      token = Token(Token::OBJECT_END, json_pos_, 1);
528      break;
529
530    case ':':
531      token = Token(Token::OBJECT_PAIR_SEPARATOR, json_pos_, 1);
532      break;
533
534    case '0':
535    case '1':
536    case '2':
537    case '3':
538    case '4':
539    case '5':
540    case '6':
541    case '7':
542    case '8':
543    case '9':
544    case '-':
545      token = ParseNumberToken();
546      break;
547
548    case '"':
549      token = ParseStringToken();
550      break;
551  }
552  return token;
553}
554
555bool JSONReader::NextStringMatch(const std::wstring& str) {
556  for (size_t i = 0; i < str.length(); ++i) {
557    if ('\0' == *json_pos_)
558      return false;
559    if (*(json_pos_ + i) != str[i])
560      return false;
561  }
562  return true;
563}
564
565void JSONReader::EatWhitespaceAndComments() {
566  while ('\0' != *json_pos_) {
567    switch (*json_pos_) {
568      case ' ':
569      case '\n':
570      case '\r':
571      case '\t':
572        ++json_pos_;
573        break;
574      case '/':
575        // TODO(tc): This isn't in the RFC so it should be a parser flag.
576        if (!EatComment())
577          return;
578        break;
579      default:
580        // Not a whitespace char, just exit.
581        return;
582    }
583  }
584}
585
586bool JSONReader::EatComment() {
587  if ('/' != *json_pos_)
588    return false;
589
590  wchar_t next_char = *(json_pos_ + 1);
591  if ('/' == next_char) {
592    // Line comment, read until \n or \r
593    json_pos_ += 2;
594    while ('\0' != *json_pos_) {
595      switch (*json_pos_) {
596        case '\n':
597        case '\r':
598          ++json_pos_;
599          return true;
600        default:
601          ++json_pos_;
602      }
603    }
604  } else if ('*' == next_char) {
605    // Block comment, read until */
606    json_pos_ += 2;
607    while ('\0' != *json_pos_) {
608      if ('*' == *json_pos_ && '/' == *(json_pos_ + 1)) {
609        json_pos_ += 2;
610        return true;
611      }
612      ++json_pos_;
613    }
614  } else {
615    return false;
616  }
617  return true;
618}
619
620void JSONReader::SetErrorMessage(const char* description,
621                                 const wchar_t* error_pos) {
622  int line_number = 1;
623  int column_number = 1;
624
625  // Figure out the line and column the error occured at.
626  for (const wchar_t* pos = start_pos_; pos != error_pos; ++pos) {
627    if (*pos == '\0') {
628      NOTREACHED();
629      return;
630    }
631
632    if (*pos == '\n') {
633      ++line_number;
634      column_number = 1;
635    } else {
636      ++column_number;
637    }
638  }
639
640  error_message_ = FormatErrorMessage(line_number, column_number, description);
641}
642
643}  // namespace base
644