// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

31#ifndef GOOGLE_PROTOBUF_UTIL_CONVERTER_JSON_STREAM_PARSER_H__
32#define GOOGLE_PROTOBUF_UTIL_CONVERTER_JSON_STREAM_PARSER_H__
33
34#include <stack>
35#include <string>
36
37#include <google/protobuf/stubs/common.h>
38#include <google/protobuf/stubs/stringpiece.h>
39#include <google/protobuf/stubs/status.h>
40
41namespace google {
42namespace util {
43class Status;
44}  // namespace util
45
46namespace protobuf {
47namespace util {
48namespace converter {
49
50class ObjectWriter;
51
52// A JSON parser that can parse a stream of JSON chunks rather than needing the
53// entire JSON string up front. It is a modified version of the parser in
54// //net/proto/json/json-parser.h that has been changed in the following ways:
55// - Changed from recursion to an explicit stack to allow resumption
56// - Added support for int64 and uint64 numbers
57// - Removed support for octal and decimal escapes
58// - Removed support for numeric keys
59// - Removed support for functions (javascript)
60// - Removed some lax-comma support (but kept trailing comma support)
61// - Writes directly to an ObjectWriter rather than using subclassing
62//
63// Here is an example usage:
64// JsonStreamParser parser(ow_.get());
65// util::Status result = parser.Parse(chunk1);
66// result.Update(parser.Parse(chunk2));
67// result.Update(parser.FinishParse());
68// GOOGLE_DCHECK(result.ok()) << "Failed to parse JSON";
69//
70// This parser is thread-compatible as long as only one thread is calling a
71// Parse() method at a time.
72class LIBPROTOBUF_EXPORT JsonStreamParser {
73 public:
74  // Creates a JsonStreamParser that will write to the given ObjectWriter.
75  explicit JsonStreamParser(ObjectWriter* ow);
76  virtual ~JsonStreamParser();
77
78  // Parses a UTF-8 encoded JSON string from a StringPiece.
79  util::Status Parse(StringPiece json);
80
81
82  // Finish parsing the JSON string.
83  util::Status FinishParse();
84
85
86 private:
87  enum TokenType {
88    BEGIN_STRING,     // " or '
89    BEGIN_NUMBER,     // - or digit
90    BEGIN_TRUE,       // true
91    BEGIN_FALSE,      // false
92    BEGIN_NULL,       // null
93    BEGIN_OBJECT,     // {
94    END_OBJECT,       // }
95    BEGIN_ARRAY,      // [
96    END_ARRAY,        // ]
97    ENTRY_SEPARATOR,  // :
98    VALUE_SEPARATOR,  // ,
99    BEGIN_KEY,        // letter, _, $ or digit.  Must begin with non-digit
100    UNKNOWN           // Unknown token or we ran out of the stream.
101  };
102
103  enum ParseType {
104    VALUE,        // Expects a {, [, true, false, null, string or number
105    OBJ_MID,      // Expects a ',' or }
106    ENTRY,        // Expects a key or }
107    ENTRY_MID,    // Expects a :
108    ARRAY_VALUE,  // Expects a value or ]
109    ARRAY_MID     // Expects a ',' or ]
110  };
111
112  // Holds the result of parsing a number
113  struct NumberResult {
114    enum Type { DOUBLE, INT, UINT };
115    Type type;
116    union {
117      double double_val;
118      int64 int_val;
119      uint64 uint_val;
120    };
121  };
122
123  // Parses a single chunk of JSON, returning an error if the JSON was invalid.
124  util::Status ParseChunk(StringPiece json);
125
126  // Runs the parser based on stack_ and p_, until the stack is empty or p_ runs
127  // out of data. If we unexpectedly run out of p_ we push the latest back onto
128  // the stack and return.
129  util::Status RunParser();
130
131  // Parses a value from p_ and writes it to ow_.
132  // A value may be an object, array, true, false, null, string or number.
133  util::Status ParseValue(TokenType type);
134
135  // Parses a string and writes it out to the ow_.
136  util::Status ParseString();
137
138  // Parses a string, storing the result in parsed_.
139  util::Status ParseStringHelper();
140
141  // This function parses unicode escape sequences in strings. It returns an
142  // error when there's a parsing error, either the size is not the expected
143  // size or a character is not a hex digit.  When it returns str will contain
144  // what has been successfully parsed so far.
145  util::Status ParseUnicodeEscape();
146
147  // Expects p_ to point to a JSON number, writes the number to the writer using
148  // the appropriate Render method based on the type of number.
149  util::Status ParseNumber();
150
151  // Parse a number into a NumberResult, reporting an error if no number could
152  // be parsed. This method will try to parse into a uint64, int64, or double
153  // based on whether the number was positive or negative or had a decimal
154  // component.
155  util::Status ParseNumberHelper(NumberResult* result);
156
157  // Handles a { during parsing of a value.
158  util::Status HandleBeginObject();
159
160  // Parses from the ENTRY state.
161  util::Status ParseEntry(TokenType type);
162
163  // Parses from the ENTRY_MID state.
164  util::Status ParseEntryMid(TokenType type);
165
166  // Parses from the OBJ_MID state.
167  util::Status ParseObjectMid(TokenType type);
168
169  // Handles a [ during parsing of a value.
170  util::Status HandleBeginArray();
171
172  // Parses from the ARRAY_VALUE state.
173  util::Status ParseArrayValue(TokenType type);
174
175  // Parses from the ARRAY_MID state.
176  util::Status ParseArrayMid(TokenType type);
177
178  // Expects p_ to point to an unquoted literal
179  util::Status ParseTrue();
180  util::Status ParseFalse();
181  util::Status ParseNull();
182
183  // Report a failure as a util::Status.
184  util::Status ReportFailure(StringPiece message);
185
186  // Report a failure due to an UNKNOWN token type. We check if we hit the
187  // end of the stream and if we're finishing or not to detect what type of
188  // status to return in this case.
189  util::Status ReportUnknown(StringPiece message);
190
191  // Advance p_ past all whitespace or until the end of the string.
192  void SkipWhitespace();
193
194  // Advance p_ one UTF-8 character
195  void Advance();
196
197  // Expects p_ to point to the beginning of a key.
198  util::Status ParseKey();
199
200  // Return the type of the next token at p_.
201  TokenType GetNextTokenType();
202
203  // The object writer to write parse events to.
204  ObjectWriter* ow_;
205
206  // The stack of parsing we still need to do. When the stack runs empty we will
207  // have parsed a single value from the root (e.g. an object or list).
208  std::stack<ParseType> stack_;
209
210  // Contains any leftover text from a previous chunk that we weren't able to
211  // fully parse, for example the start of a key or number.
212  string leftover_;
213
214  // The current chunk of JSON being parsed. Primarily used for providing
215  // context during error reporting.
216  StringPiece json_;
217
218  // A pointer within the current JSON being parsed, used to track location.
219  StringPiece p_;
220
221  // Stores the last key read, as we separate parsing of keys and values.
222  StringPiece key_;
223
224  // Storage for key_ if we need to keep ownership, for example between chunks
225  // or if the key was unescaped from a JSON string.
226  string key_storage_;
227
228  // True during the FinishParse() call, so we know that any errors are fatal.
229  // For example an unterminated string will normally result in cancelling and
230  // trying during the next chunk, but during FinishParse() it is an error.
231  bool finishing_;
232
233  // String we parsed during a call to ParseStringHelper().
234  StringPiece parsed_;
235
236  // Storage for the string we parsed. This may be empty if the string was able
237  // to be parsed directly from the input.
238  string parsed_storage_;
239
240  // The character that opened the string, either ' or ".
241  // A value of 0 indicates that string parsing is not in process.
242  char string_open_;
243
244  // Storage for the chunk that are being parsed in ParseChunk().
245  string chunk_storage_;
246
247  // Whether to allow non UTF-8 encoded input and replace invalid code points.
248  bool coerce_to_utf8_;
249
250  GOOGLE_DISALLOW_IMPLICIT_CONSTRUCTORS(JsonStreamParser);
251};
252
253}  // namespace converter
254}  // namespace util
255}  // namespace protobuf
256
257}  // namespace google
258#endif  // GOOGLE_PROTOBUF_UTIL_CONVERTER_JSON_STREAM_PARSER_H__
259