1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4//
5// A JSON parser.  Converts strings of JSON into a Value object (see
6// base/values.h).
7// http://www.ietf.org/rfc/rfc4627.txt?number=4627
8//
9// Known limitations/deviations from the RFC:
10// - Only knows how to parse ints within the range of a signed 32 bit int and
11//   decimal numbers within a double.
12// - Assumes input is encoded as UTF8.  The spec says we should allow UTF-16
13//   (BE or LE) and UTF-32 (BE or LE) as well.
14// - We limit nesting to 100 levels to prevent stack overflow (this is allowed
15//   by the RFC).
// - The Unicode FAQ ("http://unicode.org/faq/utf_bom.html") notes that a data
//   stream may start with a Unicode Byte-Order-Mark (U+FEFF), i.e. the input
//   UTF-8 string for the JSONReader::JsonToValue() function may start with a
//   UTF-8 BOM (0xEF, 0xBB, 0xBF).
//   To keep the function from mistaking a UTF-8 BOM for an invalid
//   character, the function skips a Unicode BOM at the beginning of the
//   Unicode string (converted from the input UTF-8 string) before parsing it.
23//
// TODO(tc): Add a parsing option to relax the requirement that object keys be
//   wrapped in double quotes
26// TODO(tc): Add an option to disable comment stripping
27// TODO(aa): Consider making the constructor public and the static Read() method
28// only a convenience for the common uses with more complex configuration going
29// on the instance.
30
31#ifndef BASE_JSON_JSON_READER_H_
32#define BASE_JSON_JSON_READER_H_
33#pragma once
34
35#include <string>
36
37#include "base/base_api.h"
38#include "base/basictypes.h"
39
// gtest_prod.h cannot be included here: Chromium and Chromium OS check gtest
// out into different locations, so no single include path builds on both.
// The header's sole content is replicated below instead; it has to be kept in
// sync by hand should the upstream macro ever change.
#define FRIEND_TEST(test_case_name, test_name) \
  friend class test_case_name##_##test_name##_Test
45
46class Value;
47
48namespace base {
49
50class BASE_API JSONReader {
51 public:
52  // A struct to hold a JS token.
53  class Token {
54   public:
55    enum Type {
56     OBJECT_BEGIN,           // {
57     OBJECT_END,             // }
58     ARRAY_BEGIN,            // [
59     ARRAY_END,              // ]
60     STRING,
61     NUMBER,
62     BOOL_TRUE,              // true
63     BOOL_FALSE,             // false
64     NULL_TOKEN,             // null
65     LIST_SEPARATOR,         // ,
66     OBJECT_PAIR_SEPARATOR,  // :
67     END_OF_INPUT,
68     INVALID_TOKEN,
69    };
70    Token(Type t, const wchar_t* b, int len)
71      : type(t), begin(b), length(len) {}
72
73    // Get the character that's one past the end of this token.
74    wchar_t NextChar() {
75      return *(begin + length);
76    }
77
78    Type type;
79
80    // A pointer into JSONReader::json_pos_ that's the beginning of this token.
81    const wchar_t* begin;
82
83    // End should be one char past the end of the token.
84    int length;
85  };
86
87  // Error codes during parsing.
88  enum JsonParseError {
89    JSON_NO_ERROR = 0,
90    JSON_BAD_ROOT_ELEMENT_TYPE,
91    JSON_INVALID_ESCAPE,
92    JSON_SYNTAX_ERROR,
93    JSON_TRAILING_COMMA,
94    JSON_TOO_MUCH_NESTING,
95    JSON_UNEXPECTED_DATA_AFTER_ROOT,
96    JSON_UNSUPPORTED_ENCODING,
97    JSON_UNQUOTED_DICTIONARY_KEY,
98  };
99
100  // String versions of parse error codes.
101  static const char* kBadRootElementType;
102  static const char* kInvalidEscape;
103  static const char* kSyntaxError;
104  static const char* kTrailingComma;
105  static const char* kTooMuchNesting;
106  static const char* kUnexpectedDataAfterRoot;
107  static const char* kUnsupportedEncoding;
108  static const char* kUnquotedDictionaryKey;
109
110  JSONReader();
111
112  // Reads and parses |json|, returning a Value. The caller owns the returned
113  // instance. If |json| is not a properly formed JSON string, returns NULL.
114  // If |allow_trailing_comma| is true, we will ignore trailing commas in
115  // objects and arrays even though this goes against the RFC.
116  static Value* Read(const std::string& json, bool allow_trailing_comma);
117
118  // Reads and parses |json| like Read(). |error_code_out| and |error_msg_out|
119  // are optional. If specified and NULL is returned, they will be populated
120  // an error code and a formatted error message (including error location if
121  // appropriate). Otherwise, they will be unmodified.
122  static Value* ReadAndReturnError(const std::string& json,
123                                   bool allow_trailing_comma,
124                                   int* error_code_out,
125                                   std::string* error_msg_out);
126
127  // Converts a JSON parse error code into a human readable message.
128  // Returns an empty string if error_code is JSON_NO_ERROR.
129  static std::string ErrorCodeToString(JsonParseError error_code);
130
131  // Returns the error code if the last call to JsonToValue() failed.
132  // Returns JSON_NO_ERROR otherwise.
133  JsonParseError error_code() const { return error_code_; }
134
135  // Converts error_code_ to a human-readable string, including line and column
136  // numbers if appropriate.
137  std::string GetErrorMessage() const;
138
139  // Reads and parses |json|, returning a Value. The caller owns the returned
140  // instance. If |json| is not a properly formed JSON string, returns NULL and
141  // a detailed error can be retrieved from |error_message()|.
142  // If |check_root| is true, we require that the root object be an object or
143  // array. Otherwise, it can be any valid JSON type.
144  // If |allow_trailing_comma| is true, we will ignore trailing commas in
145  // objects and arrays even though this goes against the RFC.
146  Value* JsonToValue(const std::string& json, bool check_root,
147                     bool allow_trailing_comma);
148
149 private:
150  FRIEND_TEST(JSONReaderTest, Reading);
151  FRIEND_TEST(JSONReaderTest, ErrorMessages);
152
153  static std::string FormatErrorMessage(int line, int column,
154                                        const std::string& description);
155
156  // Recursively build Value.  Returns NULL if we don't have a valid JSON
157  // string.  If |is_root| is true, we verify that the root element is either
158  // an object or an array.
159  Value* BuildValue(bool is_root);
160
161  // Parses a sequence of characters into a Token::NUMBER. If the sequence of
162  // characters is not a valid number, returns a Token::INVALID_TOKEN. Note
163  // that DecodeNumber is used to actually convert from a string to an
164  // int/double.
165  Token ParseNumberToken();
166
167  // Try and convert the substring that token holds into an int or a double. If
168  // we can (ie., no overflow), return the value, else return NULL.
169  Value* DecodeNumber(const Token& token);
170
171  // Parses a sequence of characters into a Token::STRING. If the sequence of
172  // characters is not a valid string, returns a Token::INVALID_TOKEN. Note
173  // that DecodeString is used to actually decode the escaped string into an
174  // actual wstring.
175  Token ParseStringToken();
176
177  // Convert the substring into a value string.  This should always succeed
178  // (otherwise ParseStringToken would have failed).
179  Value* DecodeString(const Token& token);
180
181  // Grabs the next token in the JSON stream.  This does not increment the
182  // stream so it can be used to look ahead at the next token.
183  Token ParseToken();
184
185  // Increments |json_pos_| past leading whitespace and comments.
186  void EatWhitespaceAndComments();
187
188  // If |json_pos_| is at the start of a comment, eat it, otherwise, returns
189  // false.
190  bool EatComment();
191
192  // Checks if |json_pos_| matches str.
193  bool NextStringMatch(const std::wstring& str);
194
195  // Sets the error code that will be returned to the caller. The current
196  // line and column are determined and added into the final message.
197  void SetErrorCode(const JsonParseError error, const wchar_t* error_pos);
198
199  // Pointer to the starting position in the input string.
200  const wchar_t* start_pos_;
201
202  // Pointer to the current position in the input string.
203  const wchar_t* json_pos_;
204
205  // Used to keep track of how many nested lists/dicts there are.
206  int stack_depth_;
207
208  // A parser flag that allows trailing commas in objects and arrays.
209  bool allow_trailing_comma_;
210
211  // Contains the error code for the last call to JsonToValue(), if any.
212  JsonParseError error_code_;
213  int error_line_;
214  int error_col_;
215
216  DISALLOW_COPY_AND_ASSIGN(JSONReader);
217};
218
219}  // namespace base
220
221#endif  // BASE_JSON_JSON_READER_H_
222