1// Protocol Buffers - Google's data interchange format
2// Copyright 2008 Google Inc.  All rights reserved.
3// https://developers.google.com/protocol-buffers/
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are
7// met:
8//
9//     * Redistributions of source code must retain the above copyright
10// notice, this list of conditions and the following disclaimer.
11//     * Redistributions in binary form must reproduce the above
12// copyright notice, this list of conditions and the following disclaimer
13// in the documentation and/or other materials provided with the
14// distribution.
15//     * Neither the name of Google Inc. nor the names of its
16// contributors may be used to endorse or promote products derived from
17// this software without specific prior written permission.
18//
19// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31#include <google/protobuf/util/internal/json_stream_parser.h>
32
33#include <algorithm>
34#include <cctype>
35#include <cerrno>
36#include <cstdlib>
37#include <cstring>
38#include <memory>
39#ifndef _SHARED_PTR_H
40#include <google/protobuf/stubs/shared_ptr.h>
41#endif
42
43#include <google/protobuf/stubs/logging.h>
44#include <google/protobuf/stubs/common.h>
45#include <google/protobuf/util/internal/object_writer.h>
46#include <google/protobuf/util/internal/json_escaping.h>
47#include <google/protobuf/stubs/strutil.h>
48
49namespace google {
50namespace protobuf {
51namespace util {
52
53// Allow these symbols to be referenced as util::Status, util::error::* in
54// this file.
55using util::Status;
56namespace error {
57using util::error::INTERNAL;
58using util::error::INVALID_ARGUMENT;
59}  // namespace error
60
61namespace converter {
62
// Number of characters in an escaped UTF-16 code unit ('\\' 'u' X X X X).
static const int kUnicodeEscapedLength = 6;

// Lengths of the true, false, and null literals, not counting the terminating
// NUL. sizeof on a string literal is a constant expression, so these are
// initialized statically instead of through a runtime strlen() call.
static const int true_len = sizeof("true") - 1;
static const int false_len = sizeof("false") - 1;
static const int null_len = sizeof("null") - 1;
70
// Returns true if c may start a bare (unquoted) key: an ASCII letter, '_' or
// '$'.
inline bool IsLetter(char c) {
  if (c == '_' || c == '$') return true;
  const bool is_lower = c >= 'a' && c <= 'z';
  const bool is_upper = c >= 'A' && c <= 'Z';
  return is_lower || is_upper;
}
75
76inline bool IsAlphanumeric(char c) {
77  return IsLetter(c) || ('0' <= c && c <= '9');
78}
79
80static bool ConsumeKey(StringPiece* input, StringPiece* key) {
81  if (input->empty() || !IsLetter((*input)[0])) return false;
82  int len = 1;
83  for (; len < input->size(); ++len) {
84    if (!IsAlphanumeric((*input)[len])) {
85      break;
86    }
87  }
88  *key = StringPiece(input->data(), len);
89  *input = StringPiece(input->data() + len, input->size() - len);
90  return true;
91}
92
93static bool MatchKey(StringPiece input) {
94  return !input.empty() && IsLetter(input[0]);
95}
96
// Creates a parser that reports parse events to `ow`. All buffers start out
// empty, no string is in progress, and UTF-8 coercion is disabled.
JsonStreamParser::JsonStreamParser(ObjectWriter* ow)
    : ow_(ow),
      stack_(),
      leftover_(),
      json_(),
      p_(),
      key_(),
      key_storage_(),
      finishing_(false),
      parsed_(),
      parsed_storage_(),
      string_open_(0),  // 0 means no string is currently being parsed.
      chunk_storage_(),
      coerce_to_utf8_(false) {
  // Initialize the stack with a single value to be parsed.
  stack_.push(VALUE);
}
114
// The destructor body is empty; in particular ow_ is not deleted here.
JsonStreamParser::~JsonStreamParser() {}
116
117
118util::Status JsonStreamParser::Parse(StringPiece json) {
119  StringPiece chunk = json;
120  // If we have leftovers from a previous chunk, append the new chunk to it
121  // and create a new StringPiece pointing at the string's data. This could
122  // be large but we rely on the chunks to be small, assuming they are
123  // fragments of a Cord.
124  if (!leftover_.empty()) {
125    // Don't point chunk to leftover_ because leftover_ will be updated in
126    // ParseChunk(chunk).
127    chunk_storage_.swap(leftover_);
128    json.AppendToString(&chunk_storage_);
129    chunk = StringPiece(chunk_storage_);
130  }
131
132  // Find the structurally valid UTF8 prefix and parse only that.
133  int n = internal::UTF8SpnStructurallyValid(chunk);
134  if (n > 0) {
135    util::Status status = ParseChunk(chunk.substr(0, n));
136
137    // Any leftover characters are stashed in leftover_ for later parsing when
138    // there is more data available.
139    chunk.substr(n).AppendToString(&leftover_);
140    return status;
141  } else {
142    chunk.CopyToString(&leftover_);
143    return util::Status::OK;
144  }
145}
146
147util::Status JsonStreamParser::FinishParse() {
148  // If we do not expect anything and there is nothing left to parse we're all
149  // done.
150  if (stack_.empty() && leftover_.empty()) {
151    return util::Status::OK;
152  }
153
154  // Storage for UTF8-coerced string.
155  google::protobuf::scoped_array<char> utf8;
156  if (coerce_to_utf8_) {
157    utf8.reset(new char[leftover_.size()]);
158    char* coerced = internal::UTF8CoerceToStructurallyValid(leftover_, utf8.get(), ' ');
159    p_ = json_ = StringPiece(coerced, leftover_.size());
160  } else {
161    p_ = json_ = leftover_;
162    if (!internal::IsStructurallyValidUTF8(leftover_)) {
163      return ReportFailure("Encountered non UTF-8 code points.");
164    }
165  }
166
167  // Parse the remainder in finishing mode, which reports errors for things like
168  // unterminated strings or unknown tokens that would normally be retried.
169  finishing_ = true;
170  util::Status result = RunParser();
171  if (result.ok()) {
172    SkipWhitespace();
173    if (!p_.empty()) {
174      result = ReportFailure("Parsing terminated before end of input.");
175    }
176  }
177  return result;
178}
179
180util::Status JsonStreamParser::ParseChunk(StringPiece chunk) {
181  // Do not do any work if the chunk is empty.
182  if (chunk.empty()) return util::Status::OK;
183
184  p_ = json_ = chunk;
185
186  finishing_ = false;
187  util::Status result = RunParser();
188  if (!result.ok()) return result;
189
190  SkipWhitespace();
191  if (p_.empty()) {
192    // If we parsed everything we had, clear the leftover.
193    leftover_.clear();
194  } else {
195    // If we do not expect anything i.e. stack is empty, and we have non-empty
196    // string left to parse, we report an error.
197    if (stack_.empty()) {
198      return ReportFailure("Parsing terminated before end of input.");
199    }
200    // If we expect future data i.e. stack is non-empty, and we have some
201    // unparsed data left, we save it for later parse.
202    leftover_ = p_.ToString();
203  }
204  return util::Status::OK;
205}
206
207util::Status JsonStreamParser::RunParser() {
208  while (!stack_.empty()) {
209    ParseType type = stack_.top();
210    TokenType t = (string_open_ == 0) ? GetNextTokenType() : BEGIN_STRING;
211    stack_.pop();
212    util::Status result;
213    switch (type) {
214      case VALUE:
215        result = ParseValue(t);
216        break;
217
218      case OBJ_MID:
219        result = ParseObjectMid(t);
220        break;
221
222      case ENTRY:
223        result = ParseEntry(t);
224        break;
225
226      case ENTRY_MID:
227        result = ParseEntryMid(t);
228        break;
229
230      case ARRAY_VALUE:
231        result = ParseArrayValue(t);
232        break;
233
234      case ARRAY_MID:
235        result = ParseArrayMid(t);
236        break;
237
238      default:
239        result = util::Status(util::error::INTERNAL,
240                              StrCat("Unknown parse type: ", type));
241        break;
242    }
243    if (!result.ok()) {
244      // If we were cancelled, save our state and try again later.
245      if (!finishing_ && result == util::Status::CANCELLED) {
246        stack_.push(type);
247        // If we have a key we still need to render, make sure to save off the
248        // contents in our own storage.
249        if (!key_.empty() && key_storage_.empty()) {
250          key_.AppendToString(&key_storage_);
251          key_ = StringPiece(key_storage_);
252        }
253        result = util::Status::OK;
254      }
255      return result;
256    }
257  }
258  return util::Status::OK;
259}
260
261util::Status JsonStreamParser::ParseValue(TokenType type) {
262  switch (type) {
263    case BEGIN_OBJECT:
264      return HandleBeginObject();
265    case BEGIN_ARRAY:
266      return HandleBeginArray();
267    case BEGIN_STRING:
268      return ParseString();
269    case BEGIN_NUMBER:
270      return ParseNumber();
271    case BEGIN_TRUE:
272      return ParseTrue();
273    case BEGIN_FALSE:
274      return ParseFalse();
275    case BEGIN_NULL:
276      return ParseNull();
277    case UNKNOWN:
278      return ReportUnknown("Expected a value.");
279    default: {
280      // Special case for having been cut off while parsing, wait for more data.
281      // This handles things like 'fals' being at the end of the string, we
282      // don't know if the next char would be e, completing it, or something
283      // else, making it invalid.
284      if (!finishing_ && p_.length() < false_len) {
285        return util::Status::CANCELLED;
286      }
287      return ReportFailure("Unexpected token.");
288    }
289  }
290}
291
292util::Status JsonStreamParser::ParseString() {
293  util::Status result = ParseStringHelper();
294  if (result.ok()) {
295    ow_->RenderString(key_, parsed_);
296    key_.clear();
297    parsed_.clear();
298    parsed_storage_.clear();
299  }
300  return result;
301}
302
303util::Status JsonStreamParser::ParseStringHelper() {
304  // If we haven't seen the start quote, grab it and remember it for later.
305  if (string_open_ == 0) {
306    string_open_ = *p_.data();
307    GOOGLE_DCHECK(string_open_ == '\"' || string_open_ == '\'');
308    Advance();
309  }
310  // Track where we last copied data from so we can minimize copying.
311  const char* last = p_.data();
312  while (!p_.empty()) {
313    const char* data = p_.data();
314    if (*data == '\\') {
315      // We're about to handle an escape, copy all bytes from last to data.
316      if (last < data) {
317        parsed_storage_.append(last, data - last);
318        last = data;
319      }
320      // If we ran out of string after the \, cancel or report an error
321      // depending on if we expect more data later.
322      if (p_.length() == 1) {
323        if (!finishing_) {
324          return util::Status::CANCELLED;
325        }
326        return ReportFailure("Closing quote expected in string.");
327      }
328      // Parse a unicode escape if we found \u in the string.
329      if (data[1] == 'u') {
330        util::Status result = ParseUnicodeEscape();
331        if (!result.ok()) {
332          return result;
333        }
334        // Move last pointer past the unicode escape and continue.
335        last = p_.data();
336        continue;
337      }
338      // Handle the standard set of backslash-escaped characters.
339      switch (data[1]) {
340        case 'b':
341          parsed_storage_.push_back('\b');
342          break;
343        case 'f':
344          parsed_storage_.push_back('\f');
345          break;
346        case 'n':
347          parsed_storage_.push_back('\n');
348          break;
349        case 'r':
350          parsed_storage_.push_back('\r');
351          break;
352        case 't':
353          parsed_storage_.push_back('\t');
354          break;
355        case 'v':
356          parsed_storage_.push_back('\v');
357          break;
358        default:
359          parsed_storage_.push_back(data[1]);
360      }
361      // We handled two characters, so advance past them and continue.
362      p_.remove_prefix(2);
363      last = p_.data();
364      continue;
365    }
366    // If we found the closing quote note it, advance past it, and return.
367    if (*data == string_open_) {
368      // If we didn't copy anything, reuse the input buffer.
369      if (parsed_storage_.empty()) {
370        parsed_ = StringPiece(last, data - last);
371      } else {
372        if (last < data) {
373          parsed_storage_.append(last, data - last);
374          last = data;
375        }
376        parsed_ = StringPiece(parsed_storage_);
377      }
378      // Clear the quote char so next time we try to parse a string we'll
379      // start fresh.
380      string_open_ = 0;
381      Advance();
382      return util::Status::OK;
383    }
384    // Normal character, just advance past it.
385    Advance();
386  }
387  // If we ran out of characters, copy over what we have so far.
388  if (last < p_.data()) {
389    parsed_storage_.append(last, p_.data() - last);
390  }
391  // If we didn't find the closing quote but we expect more data, cancel for now
392  if (!finishing_) {
393    return util::Status::CANCELLED;
394  }
395  // End of string reached without a closing quote, report an error.
396  string_open_ = 0;
397  return ReportFailure("Closing quote expected in string.");
398}
399
400// Converts a unicode escaped character to a decimal value stored in a char32
401// for use in UTF8 encoding utility.  We assume that str begins with \uhhhh and
402// convert that from the hex number to a decimal value.
403//
404// There are some security exploits with UTF-8 that we should be careful of:
405//   - http://www.unicode.org/reports/tr36/#UTF-8_Exploit
406//   - http://sites/intl-eng/design-guide/core-application
407util::Status JsonStreamParser::ParseUnicodeEscape() {
408  if (p_.length() < kUnicodeEscapedLength) {
409    if (!finishing_) {
410      return util::Status::CANCELLED;
411    }
412    return ReportFailure("Illegal hex string.");
413  }
414  GOOGLE_DCHECK_EQ('\\', p_.data()[0]);
415  GOOGLE_DCHECK_EQ('u', p_.data()[1]);
416  uint32 code = 0;
417  for (int i = 2; i < kUnicodeEscapedLength; ++i) {
418    if (!isxdigit(p_.data()[i])) {
419      return ReportFailure("Invalid escape sequence.");
420    }
421    code = (code << 4) + hex_digit_to_int(p_.data()[i]);
422  }
423  if (code >= JsonEscaping::kMinHighSurrogate &&
424      code <= JsonEscaping::kMaxHighSurrogate) {
425    if (p_.length() < 2 * kUnicodeEscapedLength) {
426      if (!finishing_) {
427        return util::Status::CANCELLED;
428      }
429      if (!coerce_to_utf8_) {
430        return ReportFailure("Missing low surrogate.");
431      }
432    } else if (p_.data()[kUnicodeEscapedLength] == '\\' &&
433               p_.data()[kUnicodeEscapedLength + 1] == 'u') {
434      uint32 low_code = 0;
435      for (int i = kUnicodeEscapedLength + 2; i < 2 * kUnicodeEscapedLength;
436           ++i) {
437        if (!isxdigit(p_.data()[i])) {
438          return ReportFailure("Invalid escape sequence.");
439        }
440        low_code = (low_code << 4) + hex_digit_to_int(p_.data()[i]);
441      }
442      if (low_code >= JsonEscaping::kMinLowSurrogate &&
443          low_code <= JsonEscaping::kMaxLowSurrogate) {
444        // Convert UTF-16 surrogate pair to 21-bit Unicode codepoint.
445        code = (((code & 0x3FF) << 10) | (low_code & 0x3FF)) +
446               JsonEscaping::kMinSupplementaryCodePoint;
447        // Advance past the first code unit escape.
448        p_.remove_prefix(kUnicodeEscapedLength);
449      } else if (!coerce_to_utf8_) {
450        return ReportFailure("Invalid low surrogate.");
451      }
452    } else if (!coerce_to_utf8_) {
453      return ReportFailure("Missing low surrogate.");
454    }
455  }
456  if (!coerce_to_utf8_ && !IsValidCodePoint(code)) {
457    return ReportFailure("Invalid unicode code point.");
458  }
459  char buf[UTFmax];
460  int len = EncodeAsUTF8Char(code, buf);
461  // Advance past the [final] code unit escape.
462  p_.remove_prefix(kUnicodeEscapedLength);
463  parsed_storage_.append(buf, len);
464  return util::Status::OK;
465}
466
467util::Status JsonStreamParser::ParseNumber() {
468  NumberResult number;
469  util::Status result = ParseNumberHelper(&number);
470  if (result.ok()) {
471    switch (number.type) {
472      case NumberResult::DOUBLE:
473        ow_->RenderDouble(key_, number.double_val);
474        key_.clear();
475        break;
476
477      case NumberResult::INT:
478        ow_->RenderInt64(key_, number.int_val);
479        key_.clear();
480        break;
481
482      case NumberResult::UINT:
483        ow_->RenderUint64(key_, number.uint_val);
484        key_.clear();
485        break;
486
487      default:
488        return ReportFailure("Unable to parse number.");
489    }
490  }
491  return result;
492}
493
494util::Status JsonStreamParser::ParseNumberHelper(NumberResult* result) {
495  const char* data = p_.data();
496  int length = p_.length();
497
498  // Look for the first non-numeric character, or the end of the string.
499  int index = 0;
500  bool floating = false;
501  bool negative = data[index] == '-';
502  // Find the first character that cannot be part of the number. Along the way
503  // detect if the number needs to be parsed as a double.
504  // Note that this restricts numbers to the JSON specification, so for example
505  // we do not support hex or octal notations.
506  for (; index < length; ++index) {
507    char c = data[index];
508    if (isdigit(c)) continue;
509    if (c == '.' || c == 'e' || c == 'E') {
510      floating = true;
511      continue;
512    }
513    if (c == '+' || c == '-' || c == 'x') continue;
514    // Not a valid number character, break out.
515    break;
516  }
517
518  // If the entire input is a valid number, and we may have more content in the
519  // future, we abort for now and resume when we know more.
520  if (index == length && !finishing_) {
521    return util::Status::CANCELLED;
522  }
523
524  // Create a string containing just the number, so we can use safe_strtoX
525  string number = p_.substr(0, index).ToString();
526
527  // Floating point number, parse as a double.
528  if (floating) {
529    if (!safe_strtod(number, &result->double_val)) {
530      return ReportFailure("Unable to parse number.");
531    }
532    result->type = NumberResult::DOUBLE;
533    p_.remove_prefix(index);
534    return util::Status::OK;
535  }
536
537  // Positive non-floating point number, parse as a uint64.
538  if (!negative) {
539    // Octal/Hex numbers are not valid JSON values.
540    if (number.length() >= 2 && number[0] == '0') {
541      return ReportFailure("Octal/hex numbers are not valid JSON values.");
542    }
543    if (!safe_strtou64(number, &result->uint_val)) {
544      return ReportFailure("Unable to parse number.");
545    }
546    result->type = NumberResult::UINT;
547    p_.remove_prefix(index);
548    return util::Status::OK;
549  }
550
551  // Octal/Hex numbers are not valid JSON values.
552  if (number.length() >= 3 && number[1] == '0') {
553    return ReportFailure("Octal/hex numbers are not valid JSON values.");
554  }
555  // Negative non-floating point number, parse as an int64.
556  if (!safe_strto64(number, &result->int_val)) {
557    return ReportFailure("Unable to parse number.");
558  }
559  result->type = NumberResult::INT;
560  p_.remove_prefix(index);
561  return util::Status::OK;
562}
563
564util::Status JsonStreamParser::HandleBeginObject() {
565  GOOGLE_DCHECK_EQ('{', *p_.data());
566  Advance();
567  ow_->StartObject(key_);
568  key_.clear();
569  stack_.push(ENTRY);
570  return util::Status::OK;
571}
572
573util::Status JsonStreamParser::ParseObjectMid(TokenType type) {
574  if (type == UNKNOWN) {
575    return ReportUnknown("Expected , or } after key:value pair.");
576  }
577
578  // Object is complete, advance past the comma and render the EndObject.
579  if (type == END_OBJECT) {
580    Advance();
581    ow_->EndObject();
582    return util::Status::OK;
583  }
584  // Found a comma, advance past it and get ready for an entry.
585  if (type == VALUE_SEPARATOR) {
586    Advance();
587    stack_.push(ENTRY);
588    return util::Status::OK;
589  }
590  // Illegal token after key:value pair.
591  return ReportFailure("Expected , or } after key:value pair.");
592}
593
594util::Status JsonStreamParser::ParseEntry(TokenType type) {
595  if (type == UNKNOWN) {
596    return ReportUnknown("Expected an object key or }.");
597  }
598
599  // Close the object and return. This allows for trailing commas.
600  if (type == END_OBJECT) {
601    ow_->EndObject();
602    Advance();
603    return util::Status::OK;
604  }
605
606  util::Status result;
607  if (type == BEGIN_STRING) {
608    // Key is a string (standard JSON), parse it and store the string.
609    result = ParseStringHelper();
610    if (result.ok()) {
611      key_storage_.clear();
612      if (!parsed_storage_.empty()) {
613        parsed_storage_.swap(key_storage_);
614        key_ = StringPiece(key_storage_);
615      } else {
616        key_ = parsed_;
617      }
618      parsed_.clear();
619    }
620  } else if (type == BEGIN_KEY) {
621    // Key is a bare key (back compat), create a StringPiece pointing to it.
622    result = ParseKey();
623  } else {
624    // Unknown key type, report an error.
625    result = ReportFailure("Expected an object key or }.");
626  }
627  // On success we next expect an entry mid ':' then an object mid ',' or '}'
628  if (result.ok()) {
629    stack_.push(OBJ_MID);
630    stack_.push(ENTRY_MID);
631  }
632  return result;
633}
634
635util::Status JsonStreamParser::ParseEntryMid(TokenType type) {
636  if (type == UNKNOWN) {
637    return ReportUnknown("Expected : between key:value pair.");
638  }
639  if (type == ENTRY_SEPARATOR) {
640    Advance();
641    stack_.push(VALUE);
642    return util::Status::OK;
643  }
644  return ReportFailure("Expected : between key:value pair.");
645}
646
647util::Status JsonStreamParser::HandleBeginArray() {
648  GOOGLE_DCHECK_EQ('[', *p_.data());
649  Advance();
650  ow_->StartList(key_);
651  key_.clear();
652  stack_.push(ARRAY_VALUE);
653  return util::Status::OK;
654}
655
656util::Status JsonStreamParser::ParseArrayValue(TokenType type) {
657  if (type == UNKNOWN) {
658    return ReportUnknown("Expected a value or ] within an array.");
659  }
660
661  if (type == END_ARRAY) {
662    ow_->EndList();
663    Advance();
664    return util::Status::OK;
665  }
666
667  // The ParseValue call may push something onto the stack so we need to make
668  // sure an ARRAY_MID is after it, so we push it on now.
669  stack_.push(ARRAY_MID);
670  util::Status result = ParseValue(type);
671  if (result == util::Status::CANCELLED) {
672    // If we were cancelled, pop back off the ARRAY_MID so we don't try to
673    // push it on again when we try over.
674    stack_.pop();
675  }
676  return result;
677}
678
679util::Status JsonStreamParser::ParseArrayMid(TokenType type) {
680  if (type == UNKNOWN) {
681    return ReportUnknown("Expected , or ] after array value.");
682  }
683
684  if (type == END_ARRAY) {
685    ow_->EndList();
686    Advance();
687    return util::Status::OK;
688  }
689
690  // Found a comma, advance past it and expect an array value next.
691  if (type == VALUE_SEPARATOR) {
692    Advance();
693    stack_.push(ARRAY_VALUE);
694    return util::Status::OK;
695  }
696  // Illegal token after array value.
697  return ReportFailure("Expected , or ] after array value.");
698}
699
700util::Status JsonStreamParser::ParseTrue() {
701  ow_->RenderBool(key_, true);
702  key_.clear();
703  p_.remove_prefix(true_len);
704  return util::Status::OK;
705}
706
707util::Status JsonStreamParser::ParseFalse() {
708  ow_->RenderBool(key_, false);
709  key_.clear();
710  p_.remove_prefix(false_len);
711  return util::Status::OK;
712}
713
714util::Status JsonStreamParser::ParseNull() {
715  ow_->RenderNull(key_);
716  key_.clear();
717  p_.remove_prefix(null_len);
718  return util::Status::OK;
719}
720
721util::Status JsonStreamParser::ReportFailure(StringPiece message) {
722  static const int kContextLength = 20;
723  const char* p_start = p_.data();
724  const char* json_start = json_.data();
725  const char* begin = std::max(p_start - kContextLength, json_start);
726  const char* end =
727      std::min(p_start + kContextLength, json_start + json_.size());
728  StringPiece segment(begin, end - begin);
729  string location(p_start - begin, ' ');
730  location.push_back('^');
731  return util::Status(util::error::INVALID_ARGUMENT,
732                      StrCat(message, "\n", segment, "\n", location));
733}
734
735util::Status JsonStreamParser::ReportUnknown(StringPiece message) {
736  // If we aren't finishing the parse, cancel parsing and try later.
737  if (!finishing_) {
738    return util::Status::CANCELLED;
739  }
740  if (p_.empty()) {
741    return ReportFailure(StrCat("Unexpected end of string. ", message));
742  }
743  return ReportFailure(message);
744}
745
746void JsonStreamParser::SkipWhitespace() {
747  while (!p_.empty() && ascii_isspace(*p_.data())) {
748    Advance();
749  }
750}
751
752void JsonStreamParser::Advance() {
753  // Advance by moving one UTF8 character while making sure we don't go beyond
754  // the length of StringPiece.
755  p_.remove_prefix(std::min<int>(
756      p_.length(), UTF8FirstLetterNumBytes(p_.data(), p_.length())));
757}
758
759util::Status JsonStreamParser::ParseKey() {
760  StringPiece original = p_;
761  if (!ConsumeKey(&p_, &key_)) {
762    return ReportFailure("Invalid key or variable name.");
763  }
764  // If we consumed everything but expect more data, reset p_ and cancel since
765  // we can't know if the key was complete or not.
766  if (!finishing_ && p_.empty()) {
767    p_ = original;
768    return util::Status::CANCELLED;
769  }
770  // Since we aren't using the key storage, clear it out.
771  key_storage_.clear();
772  return util::Status::OK;
773}
774
775JsonStreamParser::TokenType JsonStreamParser::GetNextTokenType() {
776  SkipWhitespace();
777
778  int size = p_.size();
779  if (size == 0) {
780    // If we ran out of data, report unknown and we'll place the previous parse
781    // type onto the stack and try again when we have more data.
782    return UNKNOWN;
783  }
784  // TODO(sven): Split this method based on context since different contexts
785  // support different tokens. Would slightly speed up processing?
786  const char* data = p_.data();
787  if (*data == '\"' || *data == '\'') return BEGIN_STRING;
788  if (*data == '-' || ('0' <= *data && *data <= '9')) {
789    return BEGIN_NUMBER;
790  }
791  if (size >= true_len && !strncmp(data, "true", true_len)) {
792    return BEGIN_TRUE;
793  }
794  if (size >= false_len && !strncmp(data, "false", false_len)) {
795    return BEGIN_FALSE;
796  }
797  if (size >= null_len && !strncmp(data, "null", null_len)) {
798    return BEGIN_NULL;
799  }
800  if (*data == '{') return BEGIN_OBJECT;
801  if (*data == '}') return END_OBJECT;
802  if (*data == '[') return BEGIN_ARRAY;
803  if (*data == ']') return END_ARRAY;
804  if (*data == ':') return ENTRY_SEPARATOR;
805  if (*data == ',') return VALUE_SEPARATOR;
806  if (MatchKey(p_)) {
807    return BEGIN_KEY;
808  }
809
810  // We don't know that we necessarily have an invalid token here, just that we
811  // can't parse what we have so far. So we don't report an error and just
812  // return UNKNOWN so we can try again later when we have more data, or if we
813  // finish and we have leftovers.
814  return UNKNOWN;
815}
816
817}  // namespace converter
818}  // namespace util
819}  // namespace protobuf
820}  // namespace google
821