1// Copyright (c) 2012 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#ifndef BASE_JSON_JSON_PARSER_H_
6#define BASE_JSON_JSON_PARSER_H_
7
8#include <stddef.h>
9#include <stdint.h>
10
11#include <string>
12
13#include "base/base_export.h"
14#include "base/compiler_specific.h"
15#include "base/gtest_prod_util.h"
16#include "base/json/json_reader.h"
17#include "base/macros.h"
18#include "base/strings/string_piece.h"
19
20namespace base {
21
22class Value;
23
24namespace internal {
25
26class JSONParserTest;
27
28// The implementation behind the JSONReader interface. This class is not meant
29// to be used directly; it encapsulates logic that need not be exposed publicly.
30//
31// This parser guarantees O(n) time through the input string. It also optimizes
32// base::StringValue by using StringPiece where possible when returning Value
33// objects, which avoids std::string copies; this relies on "hidden roots,"
// which are discussed in the implementation.
34//
35// Iteration happens on the byte level, with the functions CanConsume and
36// NextChar. The conversion from byte to JSON token happens without advancing
37// the parser in GetNextToken/ParseToken; that is, tokenization operates on
38// the current parser position without advancing.
39//
40// Built on top of these are a family of Consume functions that iterate
41// internally. Invariant: on entry of a Consume function, the parser is wound
42// to the first byte of a valid JSON token. On exit, it is on the last byte
43// of a token, such that the next iteration of the parser will be at the byte
44// immediately following the token, which would likely be the first byte of the
45// next token.
46class BASE_EXPORT JSONParser {
47 public:
  // |options| holds the base::JSONParserOptions that control parsing.
48  explicit JSONParser(int options);
49  ~JSONParser();
50
51  // Parses the input string according to the set options and returns the
52  // result as a Value owned by the caller.
  // NOTE(review): the return value on a parse failure is not stated in this
  // header (presumably NULL, with the error accessors below describing the
  // failure) -- confirm against the implementation.
53  Value* Parse(const StringPiece& input);
54
55  // Returns the error code.
56  JSONReader::JsonParseError error_code() const;
57
58  // Returns the human-friendly error message.
59  std::string GetErrorMessage() const;
60
61  // Returns the error line number if a parse error happened. Otherwise always
62  // returns 0.
63  int error_line() const;
64
65  // Returns the error column number if a parse error happened. Otherwise
66  // always returns 0.
67  int error_column() const;
68
69 private:
  // The kinds of token the parser distinguishes. Values of this type are
  // returned by GetNextToken() and accepted by ParseToken().
70  enum Token {
71    T_OBJECT_BEGIN,           // {
72    T_OBJECT_END,             // }
73    T_ARRAY_BEGIN,            // [
74    T_ARRAY_END,              // ]
75    T_STRING,
76    T_NUMBER,
77    T_BOOL_TRUE,              // true
78    T_BOOL_FALSE,             // false
79    T_NULL,                   // null
80    T_LIST_SEPARATOR,         // ,
81    T_OBJECT_PAIR_SEPARATOR,  // :
82    T_END_OF_INPUT,
83    T_INVALID_TOKEN,
84  };
85
86  // A helper class used for parsing strings. One optimization performed is to
87  // create base::Value with a StringPiece to avoid unnecessary std::string
88  // copies. This is not possible if the input string needs to be decoded from
89  // UTF-16 to UTF-8, or if an escape sequence causes characters to be skipped.
90  // This class centralizes that logic.
91  class StringBuilder {
92   public:
93    // Empty constructor. Used for creating a builder with which to Swap().
94    StringBuilder();
95
96    // |pos| is the beginning of an input string, excluding the |"|.
97    explicit StringBuilder(const char* pos);
98
99    ~StringBuilder();
100
101    // Swaps the contents of |other| with this.
102    void Swap(StringBuilder* other);
103
104    // Either increases the |length_| of the string or copies the character if
105    // the StringBuilder has been converted. |c| must be in the basic ASCII
106    // plane; all other characters need to be in UTF-8 units, appended with
107    // AppendString below.
108    void Append(const char& c);
109
110    // Appends |str| to the std::string. The builder must already have been
111    // Convert()ed for this to be used.
    void AppendString(const std::string& str);
112
113    // Converts the builder from its default StringPiece to a full std::string,
114    // performing a copy. Once a builder is converted, it cannot be made a
115    // StringPiece again.
116    void Convert();
117
118    // Returns whether the builder can be converted to a StringPiece.
119    bool CanBeStringPiece() const;
120
121    // Returns the StringPiece representation. Returns an empty piece if it
122    // cannot be converted.
123    StringPiece AsStringPiece();
124
125    // Returns the builder as a std::string.
126    const std::string& AsString();
127
128   private:
129    // The beginning of the input string.
130    const char* pos_;
131
132    // Number of bytes in |pos_| that make up the string being built.
133    size_t length_;
134
135    // The copied string representation. NULL until Convert() is called.
136    // Strong. scoped_ptr<T> has too much of an overhead here.
137    std::string* string_;
138  };
139
140  // Quick check that the stream has capacity to consume |length| more bytes.
141  bool CanConsume(int length);
142
143  // The basic way to consume a single character in the stream. Consumes one
144  // byte of the input stream and returns a pointer to the rest of it.
145  const char* NextChar();
146
147  // Performs the equivalent of NextChar N times.
148  void NextNChars(int n);
149
150  // Skips over whitespace and comments to find the next token in the stream.
151  // Only whitespace and comments are consumed; the token itself is not.
152  Token GetNextToken();
153
154  // Consumes whitespace characters and comments until a character that is
155  // neither is encountered.
156  void EatWhitespaceAndComments();
157  // Helper function that consumes a comment, assuming that the parser is
158  // currently wound to a '/'.
159  bool EatComment();
160
161  // Calls GetNextToken() and then ParseToken(). Caller owns the result.
162  Value* ParseNextToken();
163
164  // Takes a token that represents the start of a Value ("a structural token"
165  // in RFC terms) and consumes it, returning the result as an object the
166  // caller owns.
167  Value* ParseToken(Token token);
168
169  // Assuming that the parser is currently wound to '{', this parses a JSON
170  // object into a DictionaryValue.
171  Value* ConsumeDictionary();
172
173  // Assuming that the parser is wound to '[', this parses a JSON list into a
174  // ListValue.
175  Value* ConsumeList();
176
177  // Calls through ConsumeStringRaw and wraps it in a value.
178  Value* ConsumeString();
179
180  // Assuming that the parser is wound to a double quote, this parses a string,
181  // decoding any escape sequences and converts UTF-16 to UTF-8. Returns true on
182  // success and Swap()s the result into |out|. Returns false on failure with
183  // error information set.
184  bool ConsumeStringRaw(StringBuilder* out);
185  // Helper function for ConsumeStringRaw() that consumes the next four or 10
186  // bytes (parser is wound to the first character of a HEX sequence, with the
187  // potential for consuming another \uXXXX for a surrogate). Returns true on
188  // success and places the UTF8 code units in |dest_string|, and false on
189  // failure.
  // The two sizes are the four hex digits alone, or those four plus the six
  // bytes of a following \uXXXX.
190  bool DecodeUTF16(std::string* dest_string);
191  // Helper function for ConsumeStringRaw() that takes a single code point,
192  // decodes it into UTF-8 units, and appends it to the given builder. The
193  // point must be valid.
194  void DecodeUTF8(const int32_t& point, StringBuilder* dest);
195
196  // Assuming that the parser is wound to the start of a valid JSON number,
197  // this parses and converts it to either an int or double value.
198  Value* ConsumeNumber();
199  // Helper that reads characters that are ints. Returns true if a number was
200  // read and false on error.
201  bool ReadInt(bool allow_leading_zeros);
202
203  // Consumes the literal values of |true|, |false|, and |null|, assuming the
204  // parser is wound to the first character of any of those.
205  Value* ConsumeLiteral();
206
207  // Compares two string buffers, |left| and |right|, over |len| bytes.
208  static bool StringsAreEqual(const char* left, const char* right, size_t len);
209
210  // Sets the error information to |code| at the current column, based on
211  // |index_| and |index_last_line_|, with an optional positive/negative
212  // adjustment by |column_adjust|.
213  void ReportError(JSONReader::JsonParseError code, int column_adjust);
214
215  // Given the line and column number of an error, formats one of the error
216  // message constants from json_reader.h for human display.
217  static std::string FormatErrorMessage(int line, int column,
218                                        const std::string& description);
219
220  // base::JSONParserOptions that control parsing.
221  int options_;
222
223  // Pointer to the start of the input data.
224  const char* start_pos_;
225
226  // Pointer to the current position in the input data. Equivalent to
227  // |start_pos_ + index_|.
228  const char* pos_;
229
230  // Pointer to the last character of the input data.
  // NOTE(review): confirm against the implementation whether this addresses
  // the last byte itself or one past it.
231  const char* end_pos_;
232
233  // The index in the input stream to which the parser is wound.
234  int index_;
235
236  // The number of times the parser has recursed (current stack depth).
237  int stack_depth_;
238
239  // The line number that the parser is at currently.
240  int line_number_;
241
242  // The last value of |index_| on the previous line.
243  int index_last_line_;
244
245  // Error information.
246  JSONReader::JsonParseError error_code_;
247  int error_line_;
248  int error_column_;
249
  // Friend declarations for the JSONParserTest unit tests, which exercise the
  // private members declared above.
250  friend class JSONParserTest;
251  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, NextChar);
252  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeDictionary);
253  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeList);
254  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeString);
255  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeLiterals);
256  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeNumbers);
257  FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ErrorMessages);
258
259  DISALLOW_COPY_AND_ASSIGN(JSONParser);
260};
261
262}  // namespace internal
263}  // namespace base
264
265#endif  // BASE_JSON_JSON_PARSER_H_
266