json_reader.cc revision 3345a6884c488ff3a535c2c9acdd33d74b37e311
1// Copyright (c) 2010 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "base/json/json_reader.h" 6 7#include "base/float_util.h" 8#include "base/logging.h" 9#include "base/scoped_ptr.h" 10#include "base/string_number_conversions.h" 11#include "base/string_util.h" 12#include "base/utf_string_conversions.h" 13#include "base/values.h" 14 15namespace base { 16 17static const JSONReader::Token kInvalidToken(JSONReader::Token::INVALID_TOKEN, 18 0, 0); 19static const int kStackLimit = 100; 20 21namespace { 22 23// A helper method for ParseNumberToken. It reads an int from the end of 24// token. The method returns false if there is no valid integer at the end of 25// the token. 26bool ReadInt(JSONReader::Token& token, bool can_have_leading_zeros) { 27 wchar_t first = token.NextChar(); 28 int len = 0; 29 30 // Read in more digits 31 wchar_t c = first; 32 while ('\0' != c && '0' <= c && c <= '9') { 33 ++token.length; 34 ++len; 35 c = token.NextChar(); 36 } 37 // We need at least 1 digit. 38 if (len == 0) 39 return false; 40 41 if (!can_have_leading_zeros && len > 1 && '0' == first) 42 return false; 43 44 return true; 45} 46 47// A helper method for ParseStringToken. It reads |digits| hex digits from the 48// token. If the sequence if digits is not valid (contains other characters), 49// the method returns false. 50bool ReadHexDigits(JSONReader::Token& token, int digits) { 51 for (int i = 1; i <= digits; ++i) { 52 wchar_t c = *(token.begin + token.length + i); 53 if ('\0' == c) 54 return false; 55 if (!(('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || 56 ('A' <= c && c <= 'F'))) { 57 return false; 58 } 59 } 60 61 token.length += digits; 62 return true; 63} 64 65} // anonymous namespace 66 67const char* JSONReader::kBadRootElementType = 68 "Root value must be an array or object."; 69const char* JSONReader::kInvalidEscape = 70 "Invalid escape sequence."; 71const char* JSONReader::kSyntaxError = 72 "Syntax error."; 73const char* JSONReader::kTrailingComma = 74 "Trailing comma not allowed."; 75const char* JSONReader::kTooMuchNesting = 76 "Too much nesting."; 77const char* JSONReader::kUnexpectedDataAfterRoot = 78 "Unexpected data after root element."; 79const char* JSONReader::kUnsupportedEncoding = 80 "Unsupported encoding. JSON must be UTF-8."; 81const char* JSONReader::kUnquotedDictionaryKey = 82 "Dictionary keys must be quoted."; 83 84/* static */ 85Value* JSONReader::Read(const std::string& json, 86 bool allow_trailing_comma) { 87 return ReadAndReturnError(json, allow_trailing_comma, NULL, NULL); 88} 89 90/* static */ 91Value* JSONReader::ReadAndReturnError(const std::string& json, 92 bool allow_trailing_comma, 93 int* error_code_out, 94 std::string* error_msg_out) { 95 JSONReader reader = JSONReader(); 96 Value* root = reader.JsonToValue(json, true, allow_trailing_comma); 97 if (root) 98 return root; 99 100 if (error_code_out) 101 *error_code_out = reader.error_code(); 102 if (error_msg_out) 103 *error_msg_out = reader.GetErrorMessage(); 104 105 return NULL; 106} 107 108/* static */ 109std::string JSONReader::FormatErrorMessage(int line, int column, 110 const std::string& description) { 111 if (line || column) { 112 return StringPrintf("Line: %i, column: %i, %s", 113 line, column, description.c_str()); 114 } 115 return description; 116} 117 118/* static */ 119std::string JSONReader::ErrorCodeToString(JsonParseError error_code) { 120 switch (error_code) { 121 case JSON_NO_ERROR: 122 return std::string(); 123 case JSON_BAD_ROOT_ELEMENT_TYPE: 124 return kBadRootElementType; 125 case JSON_INVALID_ESCAPE: 126 return kInvalidEscape; 127 case JSON_SYNTAX_ERROR: 128 return kSyntaxError; 129 case JSON_TRAILING_COMMA: 130 return kTrailingComma; 131 case JSON_TOO_MUCH_NESTING: 132 return kTooMuchNesting; 133 case JSON_UNEXPECTED_DATA_AFTER_ROOT: 134 return kUnexpectedDataAfterRoot; 135 case JSON_UNSUPPORTED_ENCODING: 136 return kUnsupportedEncoding; 137 case JSON_UNQUOTED_DICTIONARY_KEY: 138 return kUnquotedDictionaryKey; 139 default: 140 NOTREACHED(); 141 return std::string(); 142 } 143} 144 145std::string JSONReader::GetErrorMessage() const { 146 return FormatErrorMessage(error_line_, error_col_, 147 ErrorCodeToString(error_code_)); 148} 149 150JSONReader::JSONReader() 151 : start_pos_(NULL), json_pos_(NULL), stack_depth_(0), 152 allow_trailing_comma_(false), 153 error_code_(JSON_NO_ERROR), error_line_(0), error_col_(0) {} 154 155Value* JSONReader::JsonToValue(const std::string& json, bool check_root, 156 bool allow_trailing_comma) { 157 // The input must be in UTF-8. 158 if (!IsStringUTF8(json.c_str())) { 159 error_code_ = JSON_UNSUPPORTED_ENCODING; 160 return NULL; 161 } 162 163 // The conversion from UTF8 to wstring removes null bytes for us 164 // (a good thing). 165 std::wstring json_wide(UTF8ToWide(json)); 166 start_pos_ = json_wide.c_str(); 167 168 // When the input JSON string starts with a UTF-8 Byte-Order-Mark 169 // (0xEF, 0xBB, 0xBF), the UTF8ToWide() function converts it to a Unicode 170 // BOM (U+FEFF). To avoid the JSONReader::BuildValue() function from 171 // mis-treating a Unicode BOM as an invalid character and returning NULL, 172 // skip a converted Unicode BOM if it exists. 173 if (!json_wide.empty() && start_pos_[0] == 0xFEFF) { 174 ++start_pos_; 175 } 176 177 json_pos_ = start_pos_; 178 allow_trailing_comma_ = allow_trailing_comma; 179 stack_depth_ = 0; 180 error_code_ = JSON_NO_ERROR; 181 182 scoped_ptr<Value> root(BuildValue(check_root)); 183 if (root.get()) { 184 if (ParseToken().type == Token::END_OF_INPUT) { 185 return root.release(); 186 } else { 187 SetErrorCode(JSON_UNEXPECTED_DATA_AFTER_ROOT, json_pos_); 188 } 189 } 190 191 // Default to calling errors "syntax errors". 192 if (error_code_ == 0) 193 SetErrorCode(JSON_SYNTAX_ERROR, json_pos_); 194 195 return NULL; 196} 197 198Value* JSONReader::BuildValue(bool is_root) { 199 ++stack_depth_; 200 if (stack_depth_ > kStackLimit) { 201 SetErrorCode(JSON_TOO_MUCH_NESTING, json_pos_); 202 return NULL; 203 } 204 205 Token token = ParseToken(); 206 // The root token must be an array or an object. 207 if (is_root && token.type != Token::OBJECT_BEGIN && 208 token.type != Token::ARRAY_BEGIN) { 209 SetErrorCode(JSON_BAD_ROOT_ELEMENT_TYPE, json_pos_); 210 return NULL; 211 } 212 213 scoped_ptr<Value> node; 214 215 switch (token.type) { 216 case Token::END_OF_INPUT: 217 case Token::INVALID_TOKEN: 218 return NULL; 219 220 case Token::NULL_TOKEN: 221 node.reset(Value::CreateNullValue()); 222 break; 223 224 case Token::BOOL_TRUE: 225 node.reset(Value::CreateBooleanValue(true)); 226 break; 227 228 case Token::BOOL_FALSE: 229 node.reset(Value::CreateBooleanValue(false)); 230 break; 231 232 case Token::NUMBER: 233 node.reset(DecodeNumber(token)); 234 if (!node.get()) 235 return NULL; 236 break; 237 238 case Token::STRING: 239 node.reset(DecodeString(token)); 240 if (!node.get()) 241 return NULL; 242 break; 243 244 case Token::ARRAY_BEGIN: 245 { 246 json_pos_ += token.length; 247 token = ParseToken(); 248 249 node.reset(new ListValue()); 250 while (token.type != Token::ARRAY_END) { 251 Value* array_node = BuildValue(false); 252 if (!array_node) 253 return NULL; 254 static_cast<ListValue*>(node.get())->Append(array_node); 255 256 // After a list value, we expect a comma or the end of the list. 257 token = ParseToken(); 258 if (token.type == Token::LIST_SEPARATOR) { 259 json_pos_ += token.length; 260 token = ParseToken(); 261 // Trailing commas are invalid according to the JSON RFC, but some 262 // consumers need the parsing leniency, so handle accordingly. 263 if (token.type == Token::ARRAY_END) { 264 if (!allow_trailing_comma_) { 265 SetErrorCode(JSON_TRAILING_COMMA, json_pos_); 266 return NULL; 267 } 268 // Trailing comma OK, stop parsing the Array. 269 break; 270 } 271 } else if (token.type != Token::ARRAY_END) { 272 // Unexpected value after list value. Bail out. 273 return NULL; 274 } 275 } 276 if (token.type != Token::ARRAY_END) { 277 return NULL; 278 } 279 break; 280 } 281 282 case Token::OBJECT_BEGIN: 283 { 284 json_pos_ += token.length; 285 token = ParseToken(); 286 287 node.reset(new DictionaryValue); 288 while (token.type != Token::OBJECT_END) { 289 if (token.type != Token::STRING) { 290 SetErrorCode(JSON_UNQUOTED_DICTIONARY_KEY, json_pos_); 291 return NULL; 292 } 293 scoped_ptr<Value> dict_key_value(DecodeString(token)); 294 if (!dict_key_value.get()) 295 return NULL; 296 297 // Convert the key into a wstring. 298 std::string dict_key; 299 bool success = dict_key_value->GetAsString(&dict_key); 300 DCHECK(success); 301 302 json_pos_ += token.length; 303 token = ParseToken(); 304 if (token.type != Token::OBJECT_PAIR_SEPARATOR) 305 return NULL; 306 307 json_pos_ += token.length; 308 token = ParseToken(); 309 Value* dict_value = BuildValue(false); 310 if (!dict_value) 311 return NULL; 312 static_cast<DictionaryValue*>(node.get())->SetWithoutPathExpansion( 313 dict_key, dict_value); 314 315 // After a key/value pair, we expect a comma or the end of the 316 // object. 317 token = ParseToken(); 318 if (token.type == Token::LIST_SEPARATOR) { 319 json_pos_ += token.length; 320 token = ParseToken(); 321 // Trailing commas are invalid according to the JSON RFC, but some 322 // consumers need the parsing leniency, so handle accordingly. 323 if (token.type == Token::OBJECT_END) { 324 if (!allow_trailing_comma_) { 325 SetErrorCode(JSON_TRAILING_COMMA, json_pos_); 326 return NULL; 327 } 328 // Trailing comma OK, stop parsing the Object. 329 break; 330 } 331 } else if (token.type != Token::OBJECT_END) { 332 // Unexpected value after last object value. Bail out. 333 return NULL; 334 } 335 } 336 if (token.type != Token::OBJECT_END) 337 return NULL; 338 339 break; 340 } 341 342 default: 343 // We got a token that's not a value. 344 return NULL; 345 } 346 json_pos_ += token.length; 347 348 --stack_depth_; 349 return node.release(); 350} 351 352JSONReader::Token JSONReader::ParseNumberToken() { 353 // We just grab the number here. We validate the size in DecodeNumber. 354 // According to RFC4627, a valid number is: [minus] int [frac] [exp] 355 Token token(Token::NUMBER, json_pos_, 0); 356 wchar_t c = *json_pos_; 357 if ('-' == c) { 358 ++token.length; 359 c = token.NextChar(); 360 } 361 362 if (!ReadInt(token, false)) 363 return kInvalidToken; 364 365 // Optional fraction part 366 c = token.NextChar(); 367 if ('.' == c) { 368 ++token.length; 369 if (!ReadInt(token, true)) 370 return kInvalidToken; 371 c = token.NextChar(); 372 } 373 374 // Optional exponent part 375 if ('e' == c || 'E' == c) { 376 ++token.length; 377 c = token.NextChar(); 378 if ('-' == c || '+' == c) { 379 ++token.length; 380 c = token.NextChar(); 381 } 382 if (!ReadInt(token, true)) 383 return kInvalidToken; 384 } 385 386 return token; 387} 388 389Value* JSONReader::DecodeNumber(const Token& token) { 390 const std::wstring num_string(token.begin, token.length); 391 392 int num_int; 393 if (StringToInt(WideToUTF8(num_string), &num_int)) 394 return Value::CreateIntegerValue(num_int); 395 396 double num_double; 397 if (StringToDouble(WideToUTF8(num_string), &num_double) && 398 base::IsFinite(num_double)) 399 return Value::CreateRealValue(num_double); 400 401 return NULL; 402} 403 404JSONReader::Token JSONReader::ParseStringToken() { 405 Token token(Token::STRING, json_pos_, 1); 406 wchar_t c = token.NextChar(); 407 while ('\0' != c) { 408 if ('\\' == c) { 409 ++token.length; 410 c = token.NextChar(); 411 // Make sure the escaped char is valid. 412 switch (c) { 413 case 'x': 414 if (!ReadHexDigits(token, 2)) { 415 SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length); 416 return kInvalidToken; 417 } 418 break; 419 case 'u': 420 if (!ReadHexDigits(token, 4)) { 421 SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length); 422 return kInvalidToken; 423 } 424 break; 425 case '\\': 426 case '/': 427 case 'b': 428 case 'f': 429 case 'n': 430 case 'r': 431 case 't': 432 case 'v': 433 case '"': 434 break; 435 default: 436 SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length); 437 return kInvalidToken; 438 } 439 } else if ('"' == c) { 440 ++token.length; 441 return token; 442 } 443 ++token.length; 444 c = token.NextChar(); 445 } 446 return kInvalidToken; 447} 448 449Value* JSONReader::DecodeString(const Token& token) { 450 std::wstring decoded_str; 451 decoded_str.reserve(token.length - 2); 452 453 for (int i = 1; i < token.length - 1; ++i) { 454 wchar_t c = *(token.begin + i); 455 if ('\\' == c) { 456 ++i; 457 c = *(token.begin + i); 458 switch (c) { 459 case '"': 460 case '/': 461 case '\\': 462 decoded_str.push_back(c); 463 break; 464 case 'b': 465 decoded_str.push_back('\b'); 466 break; 467 case 'f': 468 decoded_str.push_back('\f'); 469 break; 470 case 'n': 471 decoded_str.push_back('\n'); 472 break; 473 case 'r': 474 decoded_str.push_back('\r'); 475 break; 476 case 't': 477 decoded_str.push_back('\t'); 478 break; 479 case 'v': 480 decoded_str.push_back('\v'); 481 break; 482 483 case 'x': 484 decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 4) + 485 HexDigitToInt(*(token.begin + i + 2))); 486 i += 2; 487 break; 488 case 'u': 489 decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 12 ) + 490 (HexDigitToInt(*(token.begin + i + 2)) << 8) + 491 (HexDigitToInt(*(token.begin + i + 3)) << 4) + 492 HexDigitToInt(*(token.begin + i + 4))); 493 i += 4; 494 break; 495 496 default: 497 // We should only have valid strings at this point. If not, 498 // ParseStringToken didn't do it's job. 499 NOTREACHED(); 500 return NULL; 501 } 502 } else { 503 // Not escaped 504 decoded_str.push_back(c); 505 } 506 } 507 return Value::CreateStringValue(WideToUTF16Hack(decoded_str)); 508} 509 510JSONReader::Token JSONReader::ParseToken() { 511 static const std::wstring kNullString(L"null"); 512 static const std::wstring kTrueString(L"true"); 513 static const std::wstring kFalseString(L"false"); 514 515 EatWhitespaceAndComments(); 516 517 Token token(Token::INVALID_TOKEN, 0, 0); 518 switch (*json_pos_) { 519 case '\0': 520 token.type = Token::END_OF_INPUT; 521 break; 522 523 case 'n': 524 if (NextStringMatch(kNullString)) 525 token = Token(Token::NULL_TOKEN, json_pos_, 4); 526 break; 527 528 case 't': 529 if (NextStringMatch(kTrueString)) 530 token = Token(Token::BOOL_TRUE, json_pos_, 4); 531 break; 532 533 case 'f': 534 if (NextStringMatch(kFalseString)) 535 token = Token(Token::BOOL_FALSE, json_pos_, 5); 536 break; 537 538 case '[': 539 token = Token(Token::ARRAY_BEGIN, json_pos_, 1); 540 break; 541 542 case ']': 543 token = Token(Token::ARRAY_END, json_pos_, 1); 544 break; 545 546 case ',': 547 token = Token(Token::LIST_SEPARATOR, json_pos_, 1); 548 break; 549 550 case '{': 551 token = Token(Token::OBJECT_BEGIN, json_pos_, 1); 552 break; 553 554 case '}': 555 token = Token(Token::OBJECT_END, json_pos_, 1); 556 break; 557 558 case ':': 559 token = Token(Token::OBJECT_PAIR_SEPARATOR, json_pos_, 1); 560 break; 561 562 case '0': 563 case '1': 564 case '2': 565 case '3': 566 case '4': 567 case '5': 568 case '6': 569 case '7': 570 case '8': 571 case '9': 572 case '-': 573 token = ParseNumberToken(); 574 break; 575 576 case '"': 577 token = ParseStringToken(); 578 break; 579 } 580 return token; 581} 582 583bool JSONReader::NextStringMatch(const std::wstring& str) { 584 for (size_t i = 0; i < str.length(); ++i) { 585 if ('\0' == *json_pos_) 586 return false; 587 if (*(json_pos_ + i) != str[i]) 588 return false; 589 } 590 return true; 591} 592 593void JSONReader::EatWhitespaceAndComments() { 594 while ('\0' != *json_pos_) { 595 switch (*json_pos_) { 596 case ' ': 597 case '\n': 598 case '\r': 599 case '\t': 600 ++json_pos_; 601 break; 602 case '/': 603 // TODO(tc): This isn't in the RFC so it should be a parser flag. 604 if (!EatComment()) 605 return; 606 break; 607 default: 608 // Not a whitespace char, just exit. 609 return; 610 } 611 } 612} 613 614bool JSONReader::EatComment() { 615 if ('/' != *json_pos_) 616 return false; 617 618 wchar_t next_char = *(json_pos_ + 1); 619 if ('/' == next_char) { 620 // Line comment, read until \n or \r 621 json_pos_ += 2; 622 while ('\0' != *json_pos_) { 623 switch (*json_pos_) { 624 case '\n': 625 case '\r': 626 ++json_pos_; 627 return true; 628 default: 629 ++json_pos_; 630 } 631 } 632 } else if ('*' == next_char) { 633 // Block comment, read until */ 634 json_pos_ += 2; 635 while ('\0' != *json_pos_) { 636 if ('*' == *json_pos_ && '/' == *(json_pos_ + 1)) { 637 json_pos_ += 2; 638 return true; 639 } 640 ++json_pos_; 641 } 642 } else { 643 return false; 644 } 645 return true; 646} 647 648void JSONReader::SetErrorCode(JsonParseError error, 649 const wchar_t* error_pos) { 650 int line_number = 1; 651 int column_number = 1; 652 653 // Figure out the line and column the error occured at. 654 for (const wchar_t* pos = start_pos_; pos != error_pos; ++pos) { 655 if (*pos == '\0') { 656 NOTREACHED(); 657 return; 658 } 659 660 if (*pos == '\n') { 661 ++line_number; 662 column_number = 1; 663 } else { 664 ++column_number; 665 } 666 } 667 668 error_line_ = line_number; 669 error_col_ = column_number; 670 error_code_ = error; 671} 672 673} // namespace base 674