json-parser.h revision 69a99ed0b2b2ef69d393c371b03db3a98aaf880e
1// Copyright 2011 the V8 project authors. All rights reserved.
2// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6//     * Redistributions of source code must retain the above copyright
7//       notice, this list of conditions and the following disclaimer.
8//     * Redistributions in binary form must reproduce the above
9//       copyright notice, this list of conditions and the following
10//       disclaimer in the documentation and/or other materials provided
11//       with the distribution.
12//     * Neither the name of Google Inc. nor the names of its
13//       contributors may be used to endorse or promote products derived
14//       from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#ifndef V8_JSON_PARSER_H_
29#define V8_JSON_PARSER_H_
30
31#include "v8.h"
32
33#include "char-predicates-inl.h"
34#include "v8conversions.h"
35#include "messages.h"
36#include "spaces-inl.h"
37#include "token.h"
38
39namespace v8 {
40namespace internal {
41
42// A simple json parser.
43template <bool seq_ascii>
44class JsonParser BASE_EMBEDDED {
45 public:
46  static Handle<Object> Parse(Handle<String> source) {
47    return JsonParser().ParseJson(source);
48  }
49
50  static const int kEndOfString = -1;
51
52 private:
53  // Parse a string containing a single JSON value.
54  Handle<Object> ParseJson(Handle<String> source);
55
56  inline void Advance() {
57    position_++;
58    if (position_ >= source_length_) {
59      c0_ = kEndOfString;
60    } else if (seq_ascii) {
61      c0_ = seq_source_->SeqAsciiStringGet(position_);
62    } else {
63      c0_ = source_->Get(position_);
64    }
65  }
66
67  // The JSON lexical grammar is specified in the ECMAScript 5 standard,
68  // section 15.12.1.1. The only allowed whitespace characters between tokens
69  // are tab, carriage-return, newline and space.
70
71  inline void AdvanceSkipWhitespace() {
72    do {
73      Advance();
74    } while (c0_ == '\t' || c0_ == '\r' || c0_ == '\n' || c0_ == ' ');
75  }
76
77  inline void SkipWhitespace() {
78    while (c0_ == '\t' || c0_ == '\r' || c0_ == '\n' || c0_ == ' ') {
79      Advance();
80    }
81  }
82
83  inline uc32 AdvanceGetChar() {
84    Advance();
85    return c0_;
86  }
87
88  // Checks that current charater is c.
89  // If so, then consume c and skip whitespace.
90  inline bool MatchSkipWhiteSpace(uc32 c) {
91    if (c0_ == c) {
92      AdvanceSkipWhitespace();
93      return true;
94    }
95    return false;
96  }
97
98  // A JSON string (production JSONString) is subset of valid JavaScript string
99  // literals. The string must only be double-quoted (not single-quoted), and
100  // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and
101  // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid.
102  Handle<String> ParseJsonString() {
103    return ScanJsonString<false>();
104  }
105  Handle<String> ParseJsonSymbol() {
106    return ScanJsonString<true>();
107  }
108  template <bool is_symbol>
109  Handle<String> ScanJsonString();
110  // Creates a new string and copies prefix[start..end] into the beginning
111  // of it. Then scans the rest of the string, adding characters after the
112  // prefix. Called by ScanJsonString when reaching a '\' or non-ASCII char.
113  template <typename StringType, typename SinkChar>
114  Handle<String> SlowScanJsonString(Handle<String> prefix, int start, int end);
115
116  // A JSON number (production JSONNumber) is a subset of the valid JavaScript
117  // decimal number literals.
118  // It includes an optional minus sign, must have at least one
119  // digit before and after a decimal point, may not have prefixed zeros (unless
120  // the integer part is zero), and may include an exponent part (e.g., "e-10").
121  // Hexadecimal and octal numbers are not allowed.
122  Handle<Object> ParseJsonNumber();
123
124  // Parse a single JSON value from input (grammar production JSONValue).
125  // A JSON value is either a (double-quoted) string literal, a number literal,
126  // one of "true", "false", or "null", or an object or array literal.
127  Handle<Object> ParseJsonValue();
128
129  // Parse a JSON object literal (grammar production JSONObject).
130  // An object literal is a squiggly-braced and comma separated sequence
131  // (possibly empty) of key/value pairs, where the key is a JSON string
132  // literal, the value is a JSON value, and the two are separated by a colon.
133  // A JSON array dosn't allow numbers and identifiers as keys, like a
134  // JavaScript array.
135  Handle<Object> ParseJsonObject();
136
137  // Parses a JSON array literal (grammar production JSONArray). An array
138  // literal is a square-bracketed and comma separated sequence (possibly empty)
139  // of JSON values.
140  // A JSON array doesn't allow leaving out values from the sequence, nor does
141  // it allow a terminal comma, like a JavaScript array does.
142  Handle<Object> ParseJsonArray();
143
144
145  // Mark that a parsing error has happened at the current token, and
146  // return a null handle. Primarily for readability.
147  inline Handle<Object> ReportUnexpectedCharacter() {
148    return Handle<Object>::null();
149  }
150
151  inline Isolate* isolate() { return isolate_; }
152
153  static const int kInitialSpecialStringLength = 1024;
154
155
156 private:
157  Handle<String> source_;
158  int source_length_;
159  Handle<SeqAsciiString> seq_source_;
160
161  Isolate* isolate_;
162  uc32 c0_;
163  int position_;
164};
165
166template <bool seq_ascii>
167Handle<Object> JsonParser<seq_ascii>::ParseJson(Handle<String> source) {
168  isolate_ = source->map()->isolate();
169  FlattenString(source);
170  source_ = source;
171  source_length_ = source_->length();
172
173  // Optimized fast case where we only have ASCII characters.
174  if (seq_ascii) {
175    seq_source_ = Handle<SeqAsciiString>::cast(source_);
176  }
177
178  // Set initial position right before the string.
179  position_ = -1;
180  // Advance to the first character (posibly EOS)
181  AdvanceSkipWhitespace();
182  Handle<Object> result = ParseJsonValue();
183  if (result.is_null() || c0_ != kEndOfString) {
184    // Parse failed. Current character is the unexpected token.
185
186    const char* message;
187    Factory* factory = isolate()->factory();
188    Handle<JSArray> array;
189
190    switch (c0_) {
191      case kEndOfString:
192        message = "unexpected_eos";
193        array = factory->NewJSArray(0);
194        break;
195      case '-':
196      case '0':
197      case '1':
198      case '2':
199      case '3':
200      case '4':
201      case '5':
202      case '6':
203      case '7':
204      case '8':
205      case '9':
206        message = "unexpected_token_number";
207        array = factory->NewJSArray(0);
208        break;
209      case '"':
210        message = "unexpected_token_string";
211        array = factory->NewJSArray(0);
212        break;
213      default:
214        message = "unexpected_token";
215        Handle<Object> name = LookupSingleCharacterStringFromCode(c0_);
216        Handle<FixedArray> element = factory->NewFixedArray(1);
217        element->set(0, *name);
218        array = factory->NewJSArrayWithElements(element);
219        break;
220    }
221
222    MessageLocation location(factory->NewScript(source),
223                             position_,
224                             position_ + 1);
225    Handle<Object> result = factory->NewSyntaxError(message, array);
226    isolate()->Throw(*result, &location);
227    return Handle<Object>::null();
228  }
229  return result;
230}
231
232
233// Parse any JSON value.
234template <bool seq_ascii>
235Handle<Object> JsonParser<seq_ascii>::ParseJsonValue() {
236  switch (c0_) {
237    case '"':
238      return ParseJsonString();
239    case '-':
240    case '0':
241    case '1':
242    case '2':
243    case '3':
244    case '4':
245    case '5':
246    case '6':
247    case '7':
248    case '8':
249    case '9':
250      return ParseJsonNumber();
251    case 'f':
252      if (AdvanceGetChar() == 'a' && AdvanceGetChar() == 'l' &&
253          AdvanceGetChar() == 's' && AdvanceGetChar() == 'e') {
254        AdvanceSkipWhitespace();
255        return isolate()->factory()->false_value();
256      } else {
257        return ReportUnexpectedCharacter();
258      }
259    case 't':
260      if (AdvanceGetChar() == 'r' && AdvanceGetChar() == 'u' &&
261          AdvanceGetChar() == 'e') {
262        AdvanceSkipWhitespace();
263        return isolate()->factory()->true_value();
264      } else {
265        return ReportUnexpectedCharacter();
266      }
267    case 'n':
268      if (AdvanceGetChar() == 'u' && AdvanceGetChar() == 'l' &&
269          AdvanceGetChar() == 'l') {
270        AdvanceSkipWhitespace();
271        return isolate()->factory()->null_value();
272      } else {
273        return ReportUnexpectedCharacter();
274      }
275    case '{':
276      return ParseJsonObject();
277    case '[':
278      return ParseJsonArray();
279    default:
280      return ReportUnexpectedCharacter();
281  }
282}
283
284
285// Parse a JSON object. Position must be right at '{'.
286template <bool seq_ascii>
287Handle<Object> JsonParser<seq_ascii>::ParseJsonObject() {
288  Handle<JSFunction> object_constructor(
289      isolate()->global_context()->object_function());
290  Handle<JSObject> json_object =
291      isolate()->factory()->NewJSObject(object_constructor);
292  ASSERT_EQ(c0_, '{');
293
294  AdvanceSkipWhitespace();
295  if (c0_ != '}') {
296    do {
297      if (c0_ != '"') return ReportUnexpectedCharacter();
298      Handle<String> key = ParseJsonSymbol();
299      if (key.is_null() || c0_ != ':') return ReportUnexpectedCharacter();
300      AdvanceSkipWhitespace();
301      Handle<Object> value = ParseJsonValue();
302      if (value.is_null()) return ReportUnexpectedCharacter();
303
304      uint32_t index;
305      if (key->AsArrayIndex(&index)) {
306        SetOwnElement(json_object, index, value, kNonStrictMode);
307      } else if (key->Equals(isolate()->heap()->Proto_symbol())) {
308        SetPrototype(json_object, value);
309      } else {
310        SetLocalPropertyIgnoreAttributes(json_object, key, value, NONE);
311      }
312    } while (MatchSkipWhiteSpace(','));
313    if (c0_ != '}') {
314      return ReportUnexpectedCharacter();
315    }
316  }
317  AdvanceSkipWhitespace();
318  return json_object;
319}
320
321// Parse a JSON array. Position must be right at '['.
322template <bool seq_ascii>
323Handle<Object> JsonParser<seq_ascii>::ParseJsonArray() {
324  ZoneScope zone_scope(isolate(), DELETE_ON_EXIT);
325  ZoneList<Handle<Object> > elements(4);
326  ASSERT_EQ(c0_, '[');
327
328  AdvanceSkipWhitespace();
329  if (c0_ != ']') {
330    do {
331      Handle<Object> element = ParseJsonValue();
332      if (element.is_null()) return ReportUnexpectedCharacter();
333      elements.Add(element);
334    } while (MatchSkipWhiteSpace(','));
335    if (c0_ != ']') {
336      return ReportUnexpectedCharacter();
337    }
338  }
339  AdvanceSkipWhitespace();
340  // Allocate a fixed array with all the elements.
341  Handle<FixedArray> fast_elements =
342      isolate()->factory()->NewFixedArray(elements.length());
343  for (int i = 0, n = elements.length(); i < n; i++) {
344    fast_elements->set(i, *elements[i]);
345  }
346  return isolate()->factory()->NewJSArrayWithElements(fast_elements);
347}
348
349
350template <bool seq_ascii>
351Handle<Object> JsonParser<seq_ascii>::ParseJsonNumber() {
352  bool negative = false;
353  int beg_pos = position_;
354  if (c0_ == '-') {
355    Advance();
356    negative = true;
357  }
358  if (c0_ == '0') {
359    Advance();
360    // Prefix zero is only allowed if it's the only digit before
361    // a decimal point or exponent.
362    if ('0' <= c0_ && c0_ <= '9') return ReportUnexpectedCharacter();
363  } else {
364    int i = 0;
365    int digits = 0;
366    if (c0_ < '1' || c0_ > '9') return ReportUnexpectedCharacter();
367    do {
368      i = i * 10 + c0_ - '0';
369      digits++;
370      Advance();
371    } while (c0_ >= '0' && c0_ <= '9');
372    if (c0_ != '.' && c0_ != 'e' && c0_ != 'E' && digits < 10) {
373      SkipWhitespace();
374      return Handle<Smi>(Smi::FromInt((negative ? -i : i)), isolate());
375    }
376  }
377  if (c0_ == '.') {
378    Advance();
379    if (c0_ < '0' || c0_ > '9') return ReportUnexpectedCharacter();
380    do {
381      Advance();
382    } while (c0_ >= '0' && c0_ <= '9');
383  }
384  if (AsciiAlphaToLower(c0_) == 'e') {
385    Advance();
386    if (c0_ == '-' || c0_ == '+') Advance();
387    if (c0_ < '0' || c0_ > '9') return ReportUnexpectedCharacter();
388    do {
389      Advance();
390    } while (c0_ >= '0' && c0_ <= '9');
391  }
392  int length = position_ - beg_pos;
393  double number;
394  if (seq_ascii) {
395    Vector<const char> chars(seq_source_->GetChars() +  beg_pos, length);
396    number = StringToDouble(isolate()->unicode_cache(),
397                             chars,
398                             NO_FLAGS,  // Hex, octal or trailing junk.
399                             OS::nan_value());
400  } else {
401    Vector<char> buffer = Vector<char>::New(length);
402    String::WriteToFlat(*source_, buffer.start(), beg_pos, position_);
403    Vector<const char> result =
404        Vector<const char>(reinterpret_cast<const char*>(buffer.start()),
405        length);
406    number = StringToDouble(isolate()->unicode_cache(),
407                             result,
408                             NO_FLAGS,  // Hex, octal or trailing junk.
409                             0.0);
410    buffer.Dispose();
411  }
412  SkipWhitespace();
413  return isolate()->factory()->NewNumber(number);
414}
415
416
417template <typename StringType>
418inline void SeqStringSet(Handle<StringType> seq_str, int i, uc32 c);
419
420template <>
421inline void SeqStringSet(Handle<SeqTwoByteString> seq_str, int i, uc32 c) {
422  seq_str->SeqTwoByteStringSet(i, c);
423}
424
425template <>
426inline void SeqStringSet(Handle<SeqAsciiString> seq_str, int i, uc32 c) {
427  seq_str->SeqAsciiStringSet(i, c);
428}
429
430template <typename StringType>
431inline Handle<StringType> NewRawString(Factory* factory, int length);
432
433template <>
434inline Handle<SeqTwoByteString> NewRawString(Factory* factory, int length) {
435  return factory->NewRawTwoByteString(length, NOT_TENURED);
436}
437
438template <>
439inline Handle<SeqAsciiString> NewRawString(Factory* factory, int length) {
440  return factory->NewRawAsciiString(length, NOT_TENURED);
441}
442
443
444// Scans the rest of a JSON string starting from position_ and writes
445// prefix[start..end] along with the scanned characters into a
446// sequential string of type StringType.
447template <bool seq_ascii>
448template <typename StringType, typename SinkChar>
449Handle<String> JsonParser<seq_ascii>::SlowScanJsonString(
450    Handle<String> prefix, int start, int end) {
451  int count = end - start;
452  int max_length = count + source_length_ - position_;
453  int length = Min(max_length, Max(kInitialSpecialStringLength, 2 * count));
454  Handle<StringType> seq_str = NewRawString<StringType>(isolate()->factory(),
455                                                        length);
456  // Copy prefix into seq_str.
457  SinkChar* dest = seq_str->GetChars();
458  String::WriteToFlat(*prefix, dest, start, end);
459
460  while (c0_ != '"') {
461    // Check for control character (0x00-0x1f) or unterminated string (<0).
462    if (c0_ < 0x20) return Handle<String>::null();
463    if (count >= length) {
464      // We need to create a longer sequential string for the result.
465      return SlowScanJsonString<StringType, SinkChar>(seq_str, 0, count);
466    }
467    if (c0_ != '\\') {
468      // If the sink can contain UC16 characters, or source_ contains only
469      // ASCII characters, there's no need to test whether we can store the
470      // character. Otherwise check whether the UC16 source character can fit
471      // in the ASCII sink.
472      if (sizeof(SinkChar) == kUC16Size ||
473          seq_ascii ||
474          c0_ <= kMaxAsciiCharCode) {
475        SeqStringSet(seq_str, count++, c0_);
476        Advance();
477      } else {
478        // StringType is SeqAsciiString and we just read a non-ASCII char.
479        return SlowScanJsonString<SeqTwoByteString, uc16>(seq_str, 0, count);
480      }
481    } else {
482      Advance();  // Advance past the \.
483      switch (c0_) {
484        case '"':
485        case '\\':
486        case '/':
487          SeqStringSet(seq_str, count++, c0_);
488          break;
489        case 'b':
490          SeqStringSet(seq_str, count++, '\x08');
491          break;
492        case 'f':
493          SeqStringSet(seq_str, count++, '\x0c');
494          break;
495        case 'n':
496          SeqStringSet(seq_str, count++, '\x0a');
497          break;
498        case 'r':
499          SeqStringSet(seq_str, count++, '\x0d');
500          break;
501        case 't':
502          SeqStringSet(seq_str, count++, '\x09');
503          break;
504        case 'u': {
505          uc32 value = 0;
506          for (int i = 0; i < 4; i++) {
507            Advance();
508            int digit = HexValue(c0_);
509            if (digit < 0) {
510              return Handle<String>::null();
511            }
512            value = value * 16 + digit;
513          }
514          if (sizeof(SinkChar) == kUC16Size || value <= kMaxAsciiCharCode) {
515            SeqStringSet(seq_str, count++, value);
516            break;
517          } else {
518            // StringType is SeqAsciiString and we just read a non-ASCII char.
519            position_ -= 6;  // Rewind position_ to \ in \uxxxx.
520            Advance();
521            return SlowScanJsonString<SeqTwoByteString, uc16>(seq_str,
522                                                              0,
523                                                              count);
524          }
525        }
526        default:
527          return Handle<String>::null();
528      }
529      Advance();
530    }
531  }
532  // Shrink seq_string length to count.
533  if (isolate()->heap()->InNewSpace(*seq_str)) {
534    isolate()->heap()->new_space()->
535        template ShrinkStringAtAllocationBoundary<StringType>(
536            *seq_str, count);
537  } else {
538    int string_size = StringType::SizeFor(count);
539    int allocated_string_size = StringType::SizeFor(length);
540    int delta = allocated_string_size - string_size;
541    Address start_filler_object = seq_str->address() + string_size;
542    seq_str->set_length(count);
543    isolate()->heap()->CreateFillerObjectAt(start_filler_object, delta);
544  }
545  ASSERT_EQ('"', c0_);
546  // Advance past the last '"'.
547  AdvanceSkipWhitespace();
548  return seq_str;
549}
550
551
552template <bool seq_ascii>
553template <bool is_symbol>
554Handle<String> JsonParser<seq_ascii>::ScanJsonString() {
555  ASSERT_EQ('"', c0_);
556  Advance();
557  if (c0_ == '"') {
558    AdvanceSkipWhitespace();
559    return Handle<String>(isolate()->heap()->empty_string());
560  }
561  int beg_pos = position_;
562  // Fast case for ASCII only without escape characters.
563  do {
564    // Check for control character (0x00-0x1f) or unterminated string (<0).
565    if (c0_ < 0x20) return Handle<String>::null();
566    if (c0_ != '\\') {
567      if (seq_ascii || c0_ <= kMaxAsciiCharCode) {
568        Advance();
569      } else {
570        return SlowScanJsonString<SeqTwoByteString, uc16>(source_,
571                                                          beg_pos,
572                                                          position_);
573      }
574    } else {
575      return SlowScanJsonString<SeqAsciiString, char>(source_,
576                                                      beg_pos,
577                                                      position_);
578    }
579  } while (c0_ != '"');
580  int length = position_ - beg_pos;
581  Handle<String> result;
582  if (seq_ascii && is_symbol) {
583    result = isolate()->factory()->LookupAsciiSymbol(seq_source_,
584                                                     beg_pos,
585                                                     length);
586  } else {
587    result = isolate()->factory()->NewRawAsciiString(length);
588    char* dest = SeqAsciiString::cast(*result)->GetChars();
589    String::WriteToFlat(*source_, dest, beg_pos, position_);
590  }
591  ASSERT_EQ('"', c0_);
592  // Advance past the last '"'.
593  AdvanceSkipWhitespace();
594  return result;
595}
596
597} }  // namespace v8::internal
598
599#endif  // V8_JSON_PARSER_H_
600