1// Copyright 2011 the V8 project authors. All rights reserved.
2// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6//     * Redistributions of source code must retain the above copyright
7//       notice, this list of conditions and the following disclaimer.
8//     * Redistributions in binary form must reproduce the above
9//       copyright notice, this list of conditions and the following
10//       disclaimer in the documentation and/or other materials provided
11//       with the distribution.
12//     * Neither the name of Google Inc. nor the names of its
13//       contributors may be used to endorse or promote products derived
14//       from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#ifndef V8_JSON_PARSER_H_
29#define V8_JSON_PARSER_H_
30
31#include "v8.h"
32
33#include "char-predicates-inl.h"
34#include "v8conversions.h"
35#include "messages.h"
36#include "spaces-inl.h"
37#include "token.h"
38
39namespace v8 {
40namespace internal {
41
42// A simple json parser.
43template <bool seq_ascii>
44class JsonParser BASE_EMBEDDED {
45 public:
46  static Handle<Object> Parse(Handle<String> source) {
47    return JsonParser().ParseJson(source);
48  }
49
50  static const int kEndOfString = -1;
51
52 private:
53  // Parse a string containing a single JSON value.
54  Handle<Object> ParseJson(Handle<String> source);
55
56  inline void Advance() {
57    position_++;
58    if (position_ >= source_length_) {
59      c0_ = kEndOfString;
60    } else if (seq_ascii) {
61      c0_ = seq_source_->SeqAsciiStringGet(position_);
62    } else {
63      c0_ = source_->Get(position_);
64    }
65  }
66
67  // The JSON lexical grammar is specified in the ECMAScript 5 standard,
68  // section 15.12.1.1. The only allowed whitespace characters between tokens
69  // are tab, carriage-return, newline and space.
70
71  inline void AdvanceSkipWhitespace() {
72    do {
73      Advance();
74    } while (c0_ == '\t' || c0_ == '\r' || c0_ == '\n' || c0_ == ' ');
75  }
76
77  inline void SkipWhitespace() {
78    while (c0_ == '\t' || c0_ == '\r' || c0_ == '\n' || c0_ == ' ') {
79      Advance();
80    }
81  }
82
83  inline uc32 AdvanceGetChar() {
84    Advance();
85    return c0_;
86  }
87
88  // Checks that current charater is c.
89  // If so, then consume c and skip whitespace.
90  inline bool MatchSkipWhiteSpace(uc32 c) {
91    if (c0_ == c) {
92      AdvanceSkipWhitespace();
93      return true;
94    }
95    return false;
96  }
97
98  // A JSON string (production JSONString) is subset of valid JavaScript string
99  // literals. The string must only be double-quoted (not single-quoted), and
100  // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and
101  // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid.
102  Handle<String> ParseJsonString() {
103    return ScanJsonString<false>();
104  }
105  Handle<String> ParseJsonSymbol() {
106    return ScanJsonString<true>();
107  }
108  template <bool is_symbol>
109  Handle<String> ScanJsonString();
110  // Creates a new string and copies prefix[start..end] into the beginning
111  // of it. Then scans the rest of the string, adding characters after the
112  // prefix. Called by ScanJsonString when reaching a '\' or non-ASCII char.
113  template <typename StringType, typename SinkChar>
114  Handle<String> SlowScanJsonString(Handle<String> prefix, int start, int end);
115
116  // A JSON number (production JSONNumber) is a subset of the valid JavaScript
117  // decimal number literals.
118  // It includes an optional minus sign, must have at least one
119  // digit before and after a decimal point, may not have prefixed zeros (unless
120  // the integer part is zero), and may include an exponent part (e.g., "e-10").
121  // Hexadecimal and octal numbers are not allowed.
122  Handle<Object> ParseJsonNumber();
123
124  // Parse a single JSON value from input (grammar production JSONValue).
125  // A JSON value is either a (double-quoted) string literal, a number literal,
126  // one of "true", "false", or "null", or an object or array literal.
127  Handle<Object> ParseJsonValue();
128
129  // Parse a JSON object literal (grammar production JSONObject).
130  // An object literal is a squiggly-braced and comma separated sequence
131  // (possibly empty) of key/value pairs, where the key is a JSON string
132  // literal, the value is a JSON value, and the two are separated by a colon.
133  // A JSON array doesn't allow numbers and identifiers as keys, like a
134  // JavaScript array.
135  Handle<Object> ParseJsonObject();
136
137  // Parses a JSON array literal (grammar production JSONArray). An array
138  // literal is a square-bracketed and comma separated sequence (possibly empty)
139  // of JSON values.
140  // A JSON array doesn't allow leaving out values from the sequence, nor does
141  // it allow a terminal comma, like a JavaScript array does.
142  Handle<Object> ParseJsonArray();
143
144
145  // Mark that a parsing error has happened at the current token, and
146  // return a null handle. Primarily for readability.
147  inline Handle<Object> ReportUnexpectedCharacter() {
148    return Handle<Object>::null();
149  }
150
151  inline Isolate* isolate() { return isolate_; }
152
153  static const int kInitialSpecialStringLength = 1024;
154
155
156 private:
157  Handle<String> source_;
158  int source_length_;
159  Handle<SeqAsciiString> seq_source_;
160
161  Isolate* isolate_;
162  uc32 c0_;
163  int position_;
164};
165
166template <bool seq_ascii>
167Handle<Object> JsonParser<seq_ascii>::ParseJson(Handle<String> source) {
168  isolate_ = source->map()->GetHeap()->isolate();
169  FlattenString(source);
170  source_ = source;
171  source_length_ = source_->length();
172
173  // Optimized fast case where we only have ASCII characters.
174  if (seq_ascii) {
175    seq_source_ = Handle<SeqAsciiString>::cast(source_);
176  }
177
178  // Set initial position right before the string.
179  position_ = -1;
180  // Advance to the first character (possibly EOS)
181  AdvanceSkipWhitespace();
182  Handle<Object> result = ParseJsonValue();
183  if (result.is_null() || c0_ != kEndOfString) {
184    // Parse failed. Current character is the unexpected token.
185
186    const char* message;
187    Factory* factory = isolate()->factory();
188    Handle<JSArray> array;
189
190    switch (c0_) {
191      case kEndOfString:
192        message = "unexpected_eos";
193        array = factory->NewJSArray(0);
194        break;
195      case '-':
196      case '0':
197      case '1':
198      case '2':
199      case '3':
200      case '4':
201      case '5':
202      case '6':
203      case '7':
204      case '8':
205      case '9':
206        message = "unexpected_token_number";
207        array = factory->NewJSArray(0);
208        break;
209      case '"':
210        message = "unexpected_token_string";
211        array = factory->NewJSArray(0);
212        break;
213      default:
214        message = "unexpected_token";
215        Handle<Object> name = LookupSingleCharacterStringFromCode(c0_);
216        Handle<FixedArray> element = factory->NewFixedArray(1);
217        element->set(0, *name);
218        array = factory->NewJSArrayWithElements(element);
219        break;
220    }
221
222    MessageLocation location(factory->NewScript(source),
223                             position_,
224                             position_ + 1);
225    Handle<Object> result = factory->NewSyntaxError(message, array);
226    isolate()->Throw(*result, &location);
227    return Handle<Object>::null();
228  }
229  return result;
230}
231
232
233// Parse any JSON value.
234template <bool seq_ascii>
235Handle<Object> JsonParser<seq_ascii>::ParseJsonValue() {
236  switch (c0_) {
237    case '"':
238      return ParseJsonString();
239    case '-':
240    case '0':
241    case '1':
242    case '2':
243    case '3':
244    case '4':
245    case '5':
246    case '6':
247    case '7':
248    case '8':
249    case '9':
250      return ParseJsonNumber();
251    case 'f':
252      if (AdvanceGetChar() == 'a' && AdvanceGetChar() == 'l' &&
253          AdvanceGetChar() == 's' && AdvanceGetChar() == 'e') {
254        AdvanceSkipWhitespace();
255        return isolate()->factory()->false_value();
256      } else {
257        return ReportUnexpectedCharacter();
258      }
259    case 't':
260      if (AdvanceGetChar() == 'r' && AdvanceGetChar() == 'u' &&
261          AdvanceGetChar() == 'e') {
262        AdvanceSkipWhitespace();
263        return isolate()->factory()->true_value();
264      } else {
265        return ReportUnexpectedCharacter();
266      }
267    case 'n':
268      if (AdvanceGetChar() == 'u' && AdvanceGetChar() == 'l' &&
269          AdvanceGetChar() == 'l') {
270        AdvanceSkipWhitespace();
271        return isolate()->factory()->null_value();
272      } else {
273        return ReportUnexpectedCharacter();
274      }
275    case '{':
276      return ParseJsonObject();
277    case '[':
278      return ParseJsonArray();
279    default:
280      return ReportUnexpectedCharacter();
281  }
282}
283
284
285// Parse a JSON object. Position must be right at '{'.
286template <bool seq_ascii>
287Handle<Object> JsonParser<seq_ascii>::ParseJsonObject() {
288  Handle<JSFunction> object_constructor(
289      isolate()->global_context()->object_function());
290  Handle<JSObject> json_object =
291      isolate()->factory()->NewJSObject(object_constructor);
292  ASSERT_EQ(c0_, '{');
293
294  AdvanceSkipWhitespace();
295  if (c0_ != '}') {
296    do {
297      if (c0_ != '"') return ReportUnexpectedCharacter();
298      Handle<String> key = ParseJsonSymbol();
299      if (key.is_null() || c0_ != ':') return ReportUnexpectedCharacter();
300      AdvanceSkipWhitespace();
301      Handle<Object> value = ParseJsonValue();
302      if (value.is_null()) return ReportUnexpectedCharacter();
303
304      uint32_t index;
305      if (key->AsArrayIndex(&index)) {
306        JSObject::SetOwnElement(json_object, index, value, kNonStrictMode);
307      } else if (key->Equals(isolate()->heap()->Proto_symbol())) {
308        SetPrototype(json_object, value);
309      } else {
310        JSObject::SetLocalPropertyIgnoreAttributes(
311            json_object, key, value, NONE);
312      }
313    } while (MatchSkipWhiteSpace(','));
314    if (c0_ != '}') {
315      return ReportUnexpectedCharacter();
316    }
317  }
318  AdvanceSkipWhitespace();
319  return json_object;
320}
321
322// Parse a JSON array. Position must be right at '['.
323template <bool seq_ascii>
324Handle<Object> JsonParser<seq_ascii>::ParseJsonArray() {
325  ZoneScope zone_scope(isolate(), DELETE_ON_EXIT);
326  ZoneList<Handle<Object> > elements(4);
327  ASSERT_EQ(c0_, '[');
328
329  AdvanceSkipWhitespace();
330  if (c0_ != ']') {
331    do {
332      Handle<Object> element = ParseJsonValue();
333      if (element.is_null()) return ReportUnexpectedCharacter();
334      elements.Add(element);
335    } while (MatchSkipWhiteSpace(','));
336    if (c0_ != ']') {
337      return ReportUnexpectedCharacter();
338    }
339  }
340  AdvanceSkipWhitespace();
341  // Allocate a fixed array with all the elements.
342  Handle<FixedArray> fast_elements =
343      isolate()->factory()->NewFixedArray(elements.length());
344  for (int i = 0, n = elements.length(); i < n; i++) {
345    fast_elements->set(i, *elements[i]);
346  }
347  return isolate()->factory()->NewJSArrayWithElements(fast_elements);
348}
349
350
351template <bool seq_ascii>
352Handle<Object> JsonParser<seq_ascii>::ParseJsonNumber() {
353  bool negative = false;
354  int beg_pos = position_;
355  if (c0_ == '-') {
356    Advance();
357    negative = true;
358  }
359  if (c0_ == '0') {
360    Advance();
361    // Prefix zero is only allowed if it's the only digit before
362    // a decimal point or exponent.
363    if ('0' <= c0_ && c0_ <= '9') return ReportUnexpectedCharacter();
364  } else {
365    int i = 0;
366    int digits = 0;
367    if (c0_ < '1' || c0_ > '9') return ReportUnexpectedCharacter();
368    do {
369      i = i * 10 + c0_ - '0';
370      digits++;
371      Advance();
372    } while (c0_ >= '0' && c0_ <= '9');
373    if (c0_ != '.' && c0_ != 'e' && c0_ != 'E' && digits < 10) {
374      SkipWhitespace();
375      return Handle<Smi>(Smi::FromInt((negative ? -i : i)), isolate());
376    }
377  }
378  if (c0_ == '.') {
379    Advance();
380    if (c0_ < '0' || c0_ > '9') return ReportUnexpectedCharacter();
381    do {
382      Advance();
383    } while (c0_ >= '0' && c0_ <= '9');
384  }
385  if (AsciiAlphaToLower(c0_) == 'e') {
386    Advance();
387    if (c0_ == '-' || c0_ == '+') Advance();
388    if (c0_ < '0' || c0_ > '9') return ReportUnexpectedCharacter();
389    do {
390      Advance();
391    } while (c0_ >= '0' && c0_ <= '9');
392  }
393  int length = position_ - beg_pos;
394  double number;
395  if (seq_ascii) {
396    Vector<const char> chars(seq_source_->GetChars() +  beg_pos, length);
397    number = StringToDouble(isolate()->unicode_cache(),
398                             chars,
399                             NO_FLAGS,  // Hex, octal or trailing junk.
400                             OS::nan_value());
401  } else {
402    Vector<char> buffer = Vector<char>::New(length);
403    String::WriteToFlat(*source_, buffer.start(), beg_pos, position_);
404    Vector<const char> result =
405        Vector<const char>(reinterpret_cast<const char*>(buffer.start()),
406        length);
407    number = StringToDouble(isolate()->unicode_cache(),
408                             result,
409                             NO_FLAGS,  // Hex, octal or trailing junk.
410                             0.0);
411    buffer.Dispose();
412  }
413  SkipWhitespace();
414  return isolate()->factory()->NewNumber(number);
415}
416
417
418template <typename StringType>
419inline void SeqStringSet(Handle<StringType> seq_str, int i, uc32 c);
420
421template <>
422inline void SeqStringSet(Handle<SeqTwoByteString> seq_str, int i, uc32 c) {
423  seq_str->SeqTwoByteStringSet(i, c);
424}
425
426template <>
427inline void SeqStringSet(Handle<SeqAsciiString> seq_str, int i, uc32 c) {
428  seq_str->SeqAsciiStringSet(i, c);
429}
430
431template <typename StringType>
432inline Handle<StringType> NewRawString(Factory* factory, int length);
433
434template <>
435inline Handle<SeqTwoByteString> NewRawString(Factory* factory, int length) {
436  return factory->NewRawTwoByteString(length, NOT_TENURED);
437}
438
439template <>
440inline Handle<SeqAsciiString> NewRawString(Factory* factory, int length) {
441  return factory->NewRawAsciiString(length, NOT_TENURED);
442}
443
444
445// Scans the rest of a JSON string starting from position_ and writes
446// prefix[start..end] along with the scanned characters into a
447// sequential string of type StringType.
448template <bool seq_ascii>
449template <typename StringType, typename SinkChar>
450Handle<String> JsonParser<seq_ascii>::SlowScanJsonString(
451    Handle<String> prefix, int start, int end) {
452  int count = end - start;
453  int max_length = count + source_length_ - position_;
454  int length = Min(max_length, Max(kInitialSpecialStringLength, 2 * count));
455  Handle<StringType> seq_str = NewRawString<StringType>(isolate()->factory(),
456                                                        length);
457  // Copy prefix into seq_str.
458  SinkChar* dest = seq_str->GetChars();
459  String::WriteToFlat(*prefix, dest, start, end);
460
461  while (c0_ != '"') {
462    // Check for control character (0x00-0x1f) or unterminated string (<0).
463    if (c0_ < 0x20) return Handle<String>::null();
464    if (count >= length) {
465      // We need to create a longer sequential string for the result.
466      return SlowScanJsonString<StringType, SinkChar>(seq_str, 0, count);
467    }
468    if (c0_ != '\\') {
469      // If the sink can contain UC16 characters, or source_ contains only
470      // ASCII characters, there's no need to test whether we can store the
471      // character. Otherwise check whether the UC16 source character can fit
472      // in the ASCII sink.
473      if (sizeof(SinkChar) == kUC16Size ||
474          seq_ascii ||
475          c0_ <= kMaxAsciiCharCode) {
476        SeqStringSet(seq_str, count++, c0_);
477        Advance();
478      } else {
479        // StringType is SeqAsciiString and we just read a non-ASCII char.
480        return SlowScanJsonString<SeqTwoByteString, uc16>(seq_str, 0, count);
481      }
482    } else {
483      Advance();  // Advance past the \.
484      switch (c0_) {
485        case '"':
486        case '\\':
487        case '/':
488          SeqStringSet(seq_str, count++, c0_);
489          break;
490        case 'b':
491          SeqStringSet(seq_str, count++, '\x08');
492          break;
493        case 'f':
494          SeqStringSet(seq_str, count++, '\x0c');
495          break;
496        case 'n':
497          SeqStringSet(seq_str, count++, '\x0a');
498          break;
499        case 'r':
500          SeqStringSet(seq_str, count++, '\x0d');
501          break;
502        case 't':
503          SeqStringSet(seq_str, count++, '\x09');
504          break;
505        case 'u': {
506          uc32 value = 0;
507          for (int i = 0; i < 4; i++) {
508            Advance();
509            int digit = HexValue(c0_);
510            if (digit < 0) {
511              return Handle<String>::null();
512            }
513            value = value * 16 + digit;
514          }
515          if (sizeof(SinkChar) == kUC16Size || value <= kMaxAsciiCharCode) {
516            SeqStringSet(seq_str, count++, value);
517            break;
518          } else {
519            // StringType is SeqAsciiString and we just read a non-ASCII char.
520            position_ -= 6;  // Rewind position_ to \ in \uxxxx.
521            Advance();
522            return SlowScanJsonString<SeqTwoByteString, uc16>(seq_str,
523                                                              0,
524                                                              count);
525          }
526        }
527        default:
528          return Handle<String>::null();
529      }
530      Advance();
531    }
532  }
533  // Shrink seq_string length to count.
534  if (isolate()->heap()->InNewSpace(*seq_str)) {
535    isolate()->heap()->new_space()->
536        template ShrinkStringAtAllocationBoundary<StringType>(
537            *seq_str, count);
538  } else {
539    int string_size = StringType::SizeFor(count);
540    int allocated_string_size = StringType::SizeFor(length);
541    int delta = allocated_string_size - string_size;
542    Address start_filler_object = seq_str->address() + string_size;
543    seq_str->set_length(count);
544    isolate()->heap()->CreateFillerObjectAt(start_filler_object, delta);
545  }
546  ASSERT_EQ('"', c0_);
547  // Advance past the last '"'.
548  AdvanceSkipWhitespace();
549  return seq_str;
550}
551
552
553template <bool seq_ascii>
554template <bool is_symbol>
555Handle<String> JsonParser<seq_ascii>::ScanJsonString() {
556  ASSERT_EQ('"', c0_);
557  Advance();
558  if (c0_ == '"') {
559    AdvanceSkipWhitespace();
560    return Handle<String>(isolate()->heap()->empty_string());
561  }
562  int beg_pos = position_;
563  // Fast case for ASCII only without escape characters.
564  do {
565    // Check for control character (0x00-0x1f) or unterminated string (<0).
566    if (c0_ < 0x20) return Handle<String>::null();
567    if (c0_ != '\\') {
568      if (seq_ascii || c0_ <= kMaxAsciiCharCode) {
569        Advance();
570      } else {
571        return SlowScanJsonString<SeqTwoByteString, uc16>(source_,
572                                                          beg_pos,
573                                                          position_);
574      }
575    } else {
576      return SlowScanJsonString<SeqAsciiString, char>(source_,
577                                                      beg_pos,
578                                                      position_);
579    }
580  } while (c0_ != '"');
581  int length = position_ - beg_pos;
582  Handle<String> result;
583  if (seq_ascii && is_symbol) {
584    result = isolate()->factory()->LookupAsciiSymbol(seq_source_,
585                                                     beg_pos,
586                                                     length);
587  } else {
588    result = isolate()->factory()->NewRawAsciiString(length);
589    char* dest = SeqAsciiString::cast(*result)->GetChars();
590    String::WriteToFlat(*source_, dest, beg_pos, position_);
591  }
592  ASSERT_EQ('"', c0_);
593  // Advance past the last '"'.
594  AdvanceSkipWhitespace();
595  return result;
596}
597
598} }  // namespace v8::internal
599
600#endif  // V8_JSON_PARSER_H_
601