scanner.cc revision 9ac36c9faca11611ada13b4054edbaa0738661d0
1// Copyright 2006-2008 the V8 project authors. All rights reserved.
2// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6//     * Redistributions of source code must retain the above copyright
7//       notice, this list of conditions and the following disclaimer.
8//     * Redistributions in binary form must reproduce the above
9//       copyright notice, this list of conditions and the following
10//       disclaimer in the documentation and/or other materials provided
11//       with the distribution.
12//     * Neither the name of Google Inc. nor the names of its
13//       contributors may be used to endorse or promote products derived
14//       from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#include "v8.h"
29
30#include "ast.h"
31#include "handles.h"
32#include "scanner.h"
33
34namespace v8 {
35namespace internal {
36
37// ----------------------------------------------------------------------------
38// Character predicates
39
40
// Definitions of the scanner's static character-classification predicates.
// Each is a unibrow::Predicate instantiated with a size argument of 128.
unibrow::Predicate<IdentifierStart, 128> Scanner::kIsIdentifierStart;
unibrow::Predicate<IdentifierPart, 128> Scanner::kIsIdentifierPart;
unibrow::Predicate<unibrow::LineTerminator, 128> Scanner::kIsLineTerminator;
unibrow::Predicate<unibrow::WhiteSpace, 128> Scanner::kIsWhiteSpace;


// Definition of the static resource wrapping the scanner's UTF-8 decoder.
StaticResource<Scanner::Utf8Decoder> Scanner::utf8_decoder_;
48
49
50// ----------------------------------------------------------------------------
51// UTF8Buffer
52
53UTF8Buffer::UTF8Buffer() : buffer_(kInitialCapacity) { }
54
55
56UTF8Buffer::~UTF8Buffer() {}
57
58
59void UTF8Buffer::AddCharSlow(uc32 c) {
60  ASSERT(static_cast<unsigned>(c) > unibrow::Utf8::kMaxOneByteChar);
61  int length = unibrow::Utf8::Length(c);
62  Vector<char> block = buffer_.AddBlock(length, '\0');
63#ifdef DEBUG
64  int written_length = unibrow::Utf8::Encode(block.start(), c);
65  CHECK_EQ(length, written_length);
66#else
67  unibrow::Utf8::Encode(block.start(), c);
68#endif
69}
70
71
72// ----------------------------------------------------------------------------
73// UTF16Buffer
74
75
76UTF16Buffer::UTF16Buffer()
77    : pos_(0), end_(Scanner::kNoEndPosition) { }
78
79
80// CharacterStreamUTF16Buffer
81CharacterStreamUTF16Buffer::CharacterStreamUTF16Buffer()
82    : pushback_buffer_(0), last_(0), stream_(NULL) { }
83
84
85void CharacterStreamUTF16Buffer::Initialize(Handle<String> data,
86                                            unibrow::CharacterStream* input,
87                                            int start_position,
88                                            int end_position) {
89  stream_ = input;
90  if (start_position > 0) {
91    SeekForward(start_position);
92  }
93  end_ = end_position != Scanner::kNoEndPosition ? end_position : kMaxInt;
94}
95
96
97void CharacterStreamUTF16Buffer::PushBack(uc32 ch) {
98  pushback_buffer()->Add(last_);
99  last_ = ch;
100  pos_--;
101}
102
103
104uc32 CharacterStreamUTF16Buffer::Advance() {
105  ASSERT(end_ != Scanner::kNoEndPosition);
106  ASSERT(end_ >= 0);
107  // NOTE: It is of importance to Persian / Farsi resources that we do
108  // *not* strip format control characters in the scanner; see
109  //
110  //    https://bugzilla.mozilla.org/show_bug.cgi?id=274152
111  //
112  // So, even though ECMA-262, section 7.1, page 11, dictates that we
113  // must remove Unicode format-control characters, we do not. This is
114  // in line with how IE and SpiderMonkey handles it.
115  if (!pushback_buffer()->is_empty()) {
116    pos_++;
117    return last_ = pushback_buffer()->RemoveLast();
118  } else if (stream_->has_more() && pos_ < end_) {
119    pos_++;
120    uc32 next = stream_->GetNext();
121    return last_ = next;
122  } else {
123    // Note: currently the following increment is necessary to avoid a
124    // test-parser problem!
125    pos_++;
126    return last_ = static_cast<uc32>(-1);
127  }
128}
129
130
131void CharacterStreamUTF16Buffer::SeekForward(int pos) {
132  pos_ = pos;
133  ASSERT(pushback_buffer()->is_empty());
134  stream_->Seek(pos);
135}
136
137
138// ExternalStringUTF16Buffer
139template <typename StringType, typename CharType>
140ExternalStringUTF16Buffer<StringType, CharType>::ExternalStringUTF16Buffer()
141    : raw_data_(NULL) { }
142
143
144template <typename StringType, typename CharType>
145void ExternalStringUTF16Buffer<StringType, CharType>::Initialize(
146     Handle<StringType> data,
147     int start_position,
148     int end_position) {
149  ASSERT(!data.is_null());
150  raw_data_ = data->resource()->data();
151
152  ASSERT(end_position <= data->length());
153  if (start_position > 0) {
154    SeekForward(start_position);
155  }
156  end_ =
157      end_position != Scanner::kNoEndPosition ? end_position : data->length();
158}
159
160
161template <typename StringType, typename CharType>
162uc32 ExternalStringUTF16Buffer<StringType, CharType>::Advance() {
163  if (pos_ < end_) {
164    return raw_data_[pos_++];
165  } else {
166    // note: currently the following increment is necessary to avoid a
167    // test-parser problem!
168    pos_++;
169    return static_cast<uc32>(-1);
170  }
171}
172
173
174template <typename StringType, typename CharType>
175void ExternalStringUTF16Buffer<StringType, CharType>::PushBack(uc32 ch) {
176  pos_--;
177  ASSERT(pos_ >= Scanner::kCharacterLookaheadBufferSize);
178  ASSERT(raw_data_[pos_ - Scanner::kCharacterLookaheadBufferSize] == ch);
179}
180
181
182template <typename StringType, typename CharType>
183void ExternalStringUTF16Buffer<StringType, CharType>::SeekForward(int pos) {
184  pos_ = pos;
185}
186
187
188// ----------------------------------------------------------------------------
189// Keyword Matcher
190
// Transition table for the first character of a potential keyword.
// KeywordMatcher::Step indexes it with (input - kFirstCharRangeMin) and
// documents the covered range as the lower-case letters 'b'..'w', so the
// rows below correspond to those letters in order. Each row holds:
//   - the keyword to match when the letter starts exactly one keyword
//     (state KEYWORD_PREFIX), otherwise NULL,
//   - the state to enter after seeing the letter,
//   - the token to report once the keyword has been matched completely.
KeywordMatcher::FirstState KeywordMatcher::first_states_[] = {
  { "break",  KEYWORD_PREFIX, Token::BREAK },     // 'b'
  { NULL,     C,              Token::ILLEGAL },   // 'c'
  { NULL,     D,              Token::ILLEGAL },   // 'd'
  { "else",   KEYWORD_PREFIX, Token::ELSE },      // 'e'
  { NULL,     F,              Token::ILLEGAL },   // 'f'
  { NULL,     UNMATCHABLE,    Token::ILLEGAL },   // 'g'
  { NULL,     UNMATCHABLE,    Token::ILLEGAL },   // 'h'
  { NULL,     I,              Token::ILLEGAL },   // 'i'
  { NULL,     UNMATCHABLE,    Token::ILLEGAL },   // 'j'
  { NULL,     UNMATCHABLE,    Token::ILLEGAL },   // 'k'
  { NULL,     UNMATCHABLE,    Token::ILLEGAL },   // 'l'
  { NULL,     UNMATCHABLE,    Token::ILLEGAL },   // 'm'
  { NULL,     N,              Token::ILLEGAL },   // 'n'
  { NULL,     UNMATCHABLE,    Token::ILLEGAL },   // 'o'
  { NULL,     UNMATCHABLE,    Token::ILLEGAL },   // 'p'
  { NULL,     UNMATCHABLE,    Token::ILLEGAL },   // 'q'
  { "return", KEYWORD_PREFIX, Token::RETURN },    // 'r'
  { "switch", KEYWORD_PREFIX, Token::SWITCH },    // 's'
  { NULL,     T,              Token::ILLEGAL },   // 't'
  { NULL,     UNMATCHABLE,    Token::ILLEGAL },   // 'u'
  { NULL,     V,              Token::ILLEGAL },   // 'v'
  { NULL,     W,              Token::ILLEGAL }    // 'w'
};
215
216
// Advances the keyword-recognition state machine by one input character.
//
// The machine starts in INITIAL, where the first character selects a row
// of first_states_. Letters that begin several keywords lead to dedicated
// intermediate states (C, CA, CO, ...); once the remaining keyword is
// unambiguous the machine switches to KEYWORD_PREFIX and compares the rest
// of the input against keyword_ character by character. Any input that
// does not fit the current state falls out of the switch and makes the
// machine UNMATCHABLE.
void KeywordMatcher::Step(uc32 input) {
  switch (state_) {
    case INITIAL: {
      // matching the first character is the only state with significant fanout.
      // Match only lower-case letters in range 'b'..'w'.
      // The unsigned subtraction turns characters below the range into
      // large values, so one comparison checks both bounds.
      unsigned int offset = input - kFirstCharRangeMin;
      if (offset < kFirstCharRangeLength) {
        state_ = first_states_[offset].state;
        if (state_ == KEYWORD_PREFIX) {
          // Only one keyword starts with this letter: remember it and
          // continue matching from its second character.
          keyword_ = first_states_[offset].keyword;
          counter_ = 1;
          keyword_token_ = first_states_[offset].token;
        }
        return;
      }
      break;
    }
    case KEYWORD_PREFIX:
      // Compare the input against the next expected keyword character.
      if (keyword_[counter_] == input) {
        ASSERT_NE(input, '\0');
        counter_++;
        if (keyword_[counter_] == '\0') {
          // The whole keyword has been consumed.
          state_ = KEYWORD_MATCHED;
          token_ = keyword_token_;
        }
        return;
      }
      break;
    case KEYWORD_MATCHED:
      // Any character following a complete keyword turns the word back
      // into a plain identifier.
      token_ = Token::IDENTIFIER;
      break;
    case C:
      if (MatchState(input, 'a', CA)) return;
      if (MatchState(input, 'o', CO)) return;
      break;
    case CA:
      if (MatchKeywordStart(input, "case", 2, Token::CASE)) return;
      if (MatchKeywordStart(input, "catch", 2, Token::CATCH)) return;
      break;
    case CO:
      if (MatchState(input, 'n', CON)) return;
      break;
    case CON:
      if (MatchKeywordStart(input, "const", 3, Token::CONST)) return;
      if (MatchKeywordStart(input, "continue", 3, Token::CONTINUE)) return;
      break;
    case D:
      if (MatchState(input, 'e', DE)) return;
      if (MatchKeyword(input, 'o', KEYWORD_MATCHED, Token::DO)) return;
      break;
    case DE:
      if (MatchKeywordStart(input, "debugger", 2, Token::DEBUGGER)) return;
      if (MatchKeywordStart(input, "default", 2, Token::DEFAULT)) return;
      if (MatchKeywordStart(input, "delete", 2, Token::DELETE)) return;
      break;
    case F:
      if (MatchKeywordStart(input, "false", 1, Token::FALSE_LITERAL)) return;
      if (MatchKeywordStart(input, "finally", 1, Token::FINALLY)) return;
      if (MatchKeywordStart(input, "for", 1, Token::FOR)) return;
      if (MatchKeywordStart(input, "function", 1, Token::FUNCTION)) return;
      break;
    case I:
      if (MatchKeyword(input, 'f', KEYWORD_MATCHED, Token::IF)) return;
      if (MatchKeyword(input, 'n', IN, Token::IN)) return;
      break;
    case IN:
      // "in" is itself a keyword, so reaching this state with more input
      // means the word is no longer "in"; reset to identifier unless the
      // input continues towards "instanceof".
      token_ = Token::IDENTIFIER;
      if (MatchKeywordStart(input, "instanceof", 2, Token::INSTANCEOF)) {
        return;
      }
      break;
    case N:
      if (MatchKeywordStart(input, "native", 1, Token::NATIVE)) return;
      if (MatchKeywordStart(input, "new", 1, Token::NEW)) return;
      if (MatchKeywordStart(input, "null", 1, Token::NULL_LITERAL)) return;
      break;
    case T:
      if (MatchState(input, 'h', TH)) return;
      if (MatchState(input, 'r', TR)) return;
      if (MatchKeywordStart(input, "typeof", 1, Token::TYPEOF)) return;
      break;
    case TH:
      if (MatchKeywordStart(input, "this", 2, Token::THIS)) return;
      if (MatchKeywordStart(input, "throw", 2, Token::THROW)) return;
      break;
    case TR:
      if (MatchKeywordStart(input, "true", 2, Token::TRUE_LITERAL)) return;
      if (MatchKeyword(input, 'y', KEYWORD_MATCHED, Token::TRY)) return;
      break;
    case V:
      if (MatchKeywordStart(input, "var", 1, Token::VAR)) return;
      if (MatchKeywordStart(input, "void", 1, Token::VOID)) return;
      break;
    case W:
      if (MatchKeywordStart(input, "while", 1, Token::WHILE)) return;
      if (MatchKeywordStart(input, "with", 1, Token::WITH)) return;
      break;
    default:
      UNREACHABLE();
  }
  // On fallthrough, it's a failure.
  state_ = UNMATCHABLE;
}
320
321
322
323// ----------------------------------------------------------------------------
324// Scanner::LiteralScope
325
326Scanner::LiteralScope::LiteralScope(Scanner* self)
327    : scanner_(self), complete_(false) {
328  self->StartLiteral();
329}
330
331
332Scanner::LiteralScope::~LiteralScope() {
333  if (!complete_) scanner_->DropLiteral();
334}
335
336
337void Scanner::LiteralScope::Complete() {
338  scanner_->TerminateLiteral();
339  complete_ = true;
340}
341
342// ----------------------------------------------------------------------------
343// Scanner
344
345Scanner::Scanner(ParserMode pre)
346    : is_pre_parsing_(pre == PREPARSE), stack_overflow_(false) { }
347
348
349void Scanner::Initialize(Handle<String> source,
350                         ParserLanguage language) {
351  Init(source, NULL, 0, source->length(), language);
352}
353
354
355void Scanner::Initialize(Handle<String> source,
356                         unibrow::CharacterStream* stream,
357                         ParserLanguage language) {
358  Init(source, stream, 0, kNoEndPosition, language);
359}
360
361
362void Scanner::Initialize(Handle<String> source,
363                         int start_position,
364                         int end_position,
365                         ParserLanguage language) {
366  Init(source, NULL, start_position, end_position, language);
367}
368
369
370void Scanner::Init(Handle<String> source,
371                   unibrow::CharacterStream* stream,
372                   int start_position,
373                   int end_position,
374                   ParserLanguage language) {
375  // Either initialize the scanner from a character stream or from a
376  // string.
377  ASSERT(source.is_null() || stream == NULL);
378
379  // Initialize the source buffer.
380  if (!source.is_null() && StringShape(*source).IsExternalTwoByte()) {
381    two_byte_string_buffer_.Initialize(
382        Handle<ExternalTwoByteString>::cast(source),
383        start_position,
384        end_position);
385    source_ = &two_byte_string_buffer_;
386  } else if (!source.is_null() && StringShape(*source).IsExternalAscii()) {
387    ascii_string_buffer_.Initialize(
388        Handle<ExternalAsciiString>::cast(source),
389        start_position,
390        end_position);
391    source_ = &ascii_string_buffer_;
392  } else {
393    if (!source.is_null()) {
394      safe_string_input_buffer_.Reset(source.location());
395      stream = &safe_string_input_buffer_;
396    }
397    char_stream_buffer_.Initialize(source,
398                                   stream,
399                                   start_position,
400                                   end_position);
401    source_ = &char_stream_buffer_;
402  }
403
404  is_parsing_json_ = (language == JSON);
405
406  // Set c0_ (one character ahead)
407  ASSERT(kCharacterLookaheadBufferSize == 1);
408  Advance();
409  // Initialize current_ to not refer to a literal.
410  current_.literal_chars = Vector<const char>();
411  // Reset literal buffer.
412  literal_buffer_.Reset();
413
414  // Skip initial whitespace allowing HTML comment ends just like
415  // after a newline and scan first token.
416  has_line_terminator_before_next_ = true;
417  SkipWhiteSpace();
418  Scan();
419}
420
421
422Token::Value Scanner::Next() {
423  // BUG 1215673: Find a thread safe way to set a stack limit in
424  // pre-parse mode. Otherwise, we cannot safely pre-parse from other
425  // threads.
426  current_ = next_;
427  // Check for stack-overflow before returning any tokens.
428  StackLimitCheck check;
429  if (check.HasOverflowed()) {
430    stack_overflow_ = true;
431    next_.token = Token::ILLEGAL;
432  } else {
433    has_line_terminator_before_next_ = false;
434    Scan();
435  }
436  return current_.token;
437}
438
439
440void Scanner::StartLiteral() {
441  literal_buffer_.StartLiteral();
442}
443
444
445void Scanner::AddChar(uc32 c) {
446  literal_buffer_.AddChar(c);
447}
448
449
450void Scanner::TerminateLiteral() {
451  next_.literal_chars = literal_buffer_.EndLiteral();
452}
453
454
455void Scanner::DropLiteral() {
456  literal_buffer_.DropLiteral();
457}
458
459
460void Scanner::AddCharAdvance() {
461  AddChar(c0_);
462  Advance();
463}
464
465
466static inline bool IsByteOrderMark(uc32 c) {
467  // The Unicode value U+FFFE is guaranteed never to be assigned as a
468  // Unicode character; this implies that in a Unicode context the
469  // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
470  // character expressed in little-endian byte order (since it could
471  // not be a U+FFFE character expressed in big-endian byte
472  // order). Nevertheless, we check for it to be compatible with
473  // Spidermonkey.
474  return c == 0xFEFF || c == 0xFFFE;
475}
476
477
478bool Scanner::SkipJsonWhiteSpace() {
479  int start_position = source_pos();
480  // JSON WhiteSpace is tab, carrige-return, newline and space.
481  while (c0_ == ' ' || c0_ == '\n' || c0_ == '\r' || c0_ == '\t') {
482    Advance();
483  }
484  return source_pos() != start_position;
485}
486
487
488bool Scanner::SkipJavaScriptWhiteSpace() {
489  int start_position = source_pos();
490
491  while (true) {
492    // We treat byte-order marks (BOMs) as whitespace for better
493    // compatibility with Spidermonkey and other JavaScript engines.
494    while (kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) {
495      // IsWhiteSpace() includes line terminators!
496      if (kIsLineTerminator.get(c0_)) {
497        // Ignore line terminators, but remember them. This is necessary
498        // for automatic semicolon insertion.
499        has_line_terminator_before_next_ = true;
500      }
501      Advance();
502    }
503
504    // If there is an HTML comment end '-->' at the beginning of a
505    // line (with only whitespace in front of it), we treat the rest
506    // of the line as a comment. This is in line with the way
507    // SpiderMonkey handles it.
508    if (c0_ == '-' && has_line_terminator_before_next_) {
509      Advance();
510      if (c0_ == '-') {
511        Advance();
512        if (c0_ == '>') {
513          // Treat the rest of the line as a comment.
514          SkipSingleLineComment();
515          // Continue skipping white space after the comment.
516          continue;
517        }
518        PushBack('-');  // undo Advance()
519      }
520      PushBack('-');  // undo Advance()
521    }
522    // Return whether or not we skipped any characters.
523    return source_pos() != start_position;
524  }
525}
526
527
528Token::Value Scanner::SkipSingleLineComment() {
529  Advance();
530
531  // The line terminator at the end of the line is not considered
532  // to be part of the single-line comment; it is recognized
533  // separately by the lexical grammar and becomes part of the
534  // stream of input elements for the syntactic grammar (see
535  // ECMA-262, section 7.4, page 12).
536  while (c0_ >= 0 && !kIsLineTerminator.get(c0_)) {
537    Advance();
538  }
539
540  return Token::WHITESPACE;
541}
542
543
544Token::Value Scanner::SkipMultiLineComment() {
545  ASSERT(c0_ == '*');
546  Advance();
547
548  while (c0_ >= 0) {
549    char ch = c0_;
550    Advance();
551    // If we have reached the end of the multi-line comment, we
552    // consume the '/' and insert a whitespace. This way all
553    // multi-line comments are treated as whitespace - even the ones
554    // containing line terminators. This contradicts ECMA-262, section
555    // 7.4, page 12, that says that multi-line comments containing
556    // line terminators should be treated as a line terminator, but it
557    // matches the behaviour of SpiderMonkey and KJS.
558    if (ch == '*' && c0_ == '/') {
559      c0_ = ' ';
560      return Token::WHITESPACE;
561    }
562  }
563
564  // Unterminated multi-line comment.
565  return Token::ILLEGAL;
566}
567
568
569Token::Value Scanner::ScanHtmlComment() {
570  // Check for <!-- comments.
571  ASSERT(c0_ == '!');
572  Advance();
573  if (c0_ == '-') {
574    Advance();
575    if (c0_ == '-') return SkipSingleLineComment();
576    PushBack('-');  // undo Advance()
577  }
578  PushBack('!');  // undo Advance()
579  ASSERT(c0_ == '!');
580  return Token::LT;
581}
582
583
584
// Scans the next JSON token into next_. Whitespace is skipped; the
// token's begin and end positions and its kind are recorded in next_.
// Strings, numbers and the literals true/false/null are delegated to the
// dedicated ScanJson* helpers.
void Scanner::ScanJson() {
  // The next token starts out without literal characters.
  next_.literal_chars = Vector<const char>();
  Token::Value token;
  has_line_terminator_before_next_ = false;
  do {
    // Remember the position of the next token
    next_.location.beg_pos = source_pos();
    switch (c0_) {
      case '\t':
      case '\r':
      case '\n':
      case ' ':
        Advance();
        token = Token::WHITESPACE;
        break;
      case '{':
        Advance();
        token = Token::LBRACE;
        break;
      case '}':
        Advance();
        token = Token::RBRACE;
        break;
      case '[':
        Advance();
        token = Token::LBRACK;
        break;
      case ']':
        Advance();
        token = Token::RBRACK;
        break;
      case ':':
        Advance();
        token = Token::COLON;
        break;
      case ',':
        Advance();
        token = Token::COMMA;
        break;
      case '"':
        token = ScanJsonString();
        break;
      case '-':
      case '0':
      case '1':
      case '2':
      case '3':
      case '4':
      case '5':
      case '6':
      case '7':
      case '8':
      case '9':
        token = ScanJsonNumber();
        break;
      case 't':
        token = ScanJsonIdentifier("true", Token::TRUE_LITERAL);
        break;
      case 'f':
        token = ScanJsonIdentifier("false", Token::FALSE_LITERAL);
        break;
      case 'n':
        token = ScanJsonIdentifier("null", Token::NULL_LITERAL);
        break;
      default:
        if (c0_ < 0) {
          // A negative lookahead character signals the end of the input.
          Advance();
          token = Token::EOS;
        } else {
          // NOTE(review): Select() is defined outside this file section; if
          // it also advances past the current character, this branch
          // consumes two characters for one illegal token - confirm.
          Advance();
          token = Select(Token::ILLEGAL);
        }
    }
  } while (token == Token::WHITESPACE);

  next_.location.end_pos = source_pos();
  next_.token = token;
}
663
664
665Token::Value Scanner::ScanJsonString() {
666  ASSERT_EQ('"', c0_);
667  Advance();
668  LiteralScope literal(this);
669  while (c0_ != '"' && c0_ > 0) {
670    // Check for control character (0x00-0x1f) or unterminated string (<0).
671    if (c0_ < 0x20) return Token::ILLEGAL;
672    if (c0_ != '\\') {
673      AddCharAdvance();
674    } else {
675      Advance();
676      switch (c0_) {
677        case '"':
678        case '\\':
679        case '/':
680          AddChar(c0_);
681          break;
682        case 'b':
683          AddChar('\x08');
684          break;
685        case 'f':
686          AddChar('\x0c');
687          break;
688        case 'n':
689          AddChar('\x0a');
690          break;
691        case 'r':
692          AddChar('\x0d');
693          break;
694        case 't':
695          AddChar('\x09');
696          break;
697        case 'u': {
698          uc32 value = 0;
699          for (int i = 0; i < 4; i++) {
700            Advance();
701            int digit = HexValue(c0_);
702            if (digit < 0) {
703              return Token::ILLEGAL;
704            }
705            value = value * 16 + digit;
706          }
707          AddChar(value);
708          break;
709        }
710        default:
711          return Token::ILLEGAL;
712      }
713      Advance();
714    }
715  }
716  if (c0_ != '"') {
717    return Token::ILLEGAL;
718  }
719  literal.Complete();
720  Advance();
721  return Token::STRING;
722}
723
724
725Token::Value Scanner::ScanJsonNumber() {
726  LiteralScope literal(this);
727  if (c0_ == '-') AddCharAdvance();
728  if (c0_ == '0') {
729    AddCharAdvance();
730    // Prefix zero is only allowed if it's the only digit before
731    // a decimal point or exponent.
732    if ('0' <= c0_ && c0_ <= '9') return Token::ILLEGAL;
733  } else {
734    if (c0_ < '1' || c0_ > '9') return Token::ILLEGAL;
735    do {
736      AddCharAdvance();
737    } while (c0_ >= '0' && c0_ <= '9');
738  }
739  if (c0_ == '.') {
740    AddCharAdvance();
741    if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL;
742    do {
743      AddCharAdvance();
744    } while (c0_ >= '0' && c0_ <= '9');
745  }
746  if (AsciiAlphaToLower(c0_) == 'e') {
747    AddCharAdvance();
748    if (c0_ == '-' || c0_ == '+') AddCharAdvance();
749    if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL;
750    do {
751      AddCharAdvance();
752    } while (c0_ >= '0' && c0_ <= '9');
753  }
754  literal.Complete();
755  return Token::NUMBER;
756}
757
758
759Token::Value Scanner::ScanJsonIdentifier(const char* text,
760                                         Token::Value token) {
761  LiteralScope literal(this);
762  while (*text != '\0') {
763    if (c0_ != *text) return Token::ILLEGAL;
764    Advance();
765    text++;
766  }
767  if (kIsIdentifierPart.get(c0_)) return Token::ILLEGAL;
768  literal.Complete();
769  return token;
770}
771
772
// Scans the next JavaScript token into next_. Whitespace and comments are
// skipped (they yield Token::WHITESPACE internally, which restarts the
// loop); the token's begin and end positions and its kind are recorded in
// next_. Multi-character operators are resolved by looking at c0_ after
// each Advance().
void Scanner::ScanJavaScript() {
  // The next token starts out without literal characters.
  next_.literal_chars = Vector<const char>();
  Token::Value token;
  do {
    // Remember the position of the next token
    next_.location.beg_pos = source_pos();

    switch (c0_) {
      case ' ':
      case '\t':
        Advance();
        token = Token::WHITESPACE;
        break;

      case '\n':
        Advance();
        // Remembered for automatic semicolon insertion and for the
        // "-->" handling below.
        has_line_terminator_before_next_ = true;
        token = Token::WHITESPACE;
        break;

      case '"': case '\'':
        token = ScanString();
        break;

      case '<':
        // < <= << <<= <!--
        Advance();
        if (c0_ == '=') {
          token = Select(Token::LTE);
        } else if (c0_ == '<') {
          token = Select('=', Token::ASSIGN_SHL, Token::SHL);
        } else if (c0_ == '!') {
          token = ScanHtmlComment();
        } else {
          token = Token::LT;
        }
        break;

      case '>':
        // > >= >> >>= >>> >>>=
        Advance();
        if (c0_ == '=') {
          token = Select(Token::GTE);
        } else if (c0_ == '>') {
          // >> >>= >>> >>>=
          Advance();
          if (c0_ == '=') {
            token = Select(Token::ASSIGN_SAR);
          } else if (c0_ == '>') {
            token = Select('=', Token::ASSIGN_SHR, Token::SHR);
          } else {
            token = Token::SAR;
          }
        } else {
          token = Token::GT;
        }
        break;

      case '=':
        // = == ===
        Advance();
        if (c0_ == '=') {
          token = Select('=', Token::EQ_STRICT, Token::EQ);
        } else {
          token = Token::ASSIGN;
        }
        break;

      case '!':
        // ! != !==
        Advance();
        if (c0_ == '=') {
          token = Select('=', Token::NE_STRICT, Token::NE);
        } else {
          token = Token::NOT;
        }
        break;

      case '+':
        // + ++ +=
        Advance();
        if (c0_ == '+') {
          token = Select(Token::INC);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_ADD);
        } else {
          token = Token::ADD;
        }
        break;

      case '-':
        // - -- --> -=
        Advance();
        if (c0_ == '-') {
          Advance();
          if (c0_ == '>' && has_line_terminator_before_next_) {
            // For compatibility with SpiderMonkey, we skip lines that
            // start with an HTML comment end '-->'.
            token = SkipSingleLineComment();
          } else {
            token = Token::DEC;
          }
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_SUB);
        } else {
          token = Token::SUB;
        }
        break;

      case '*':
        // * *=
        token = Select('=', Token::ASSIGN_MUL, Token::MUL);
        break;

      case '%':
        // % %=
        token = Select('=', Token::ASSIGN_MOD, Token::MOD);
        break;

      case '/':
        // /  // /* /=
        Advance();
        if (c0_ == '/') {
          token = SkipSingleLineComment();
        } else if (c0_ == '*') {
          token = SkipMultiLineComment();
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_DIV);
        } else {
          token = Token::DIV;
        }
        break;

      case '&':
        // & && &=
        Advance();
        if (c0_ == '&') {
          token = Select(Token::AND);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_BIT_AND);
        } else {
          token = Token::BIT_AND;
        }
        break;

      case '|':
        // | || |=
        Advance();
        if (c0_ == '|') {
          token = Select(Token::OR);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_BIT_OR);
        } else {
          token = Token::BIT_OR;
        }
        break;

      case '^':
        // ^ ^=
        token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
        break;

      case '.':
        // . Number
        Advance();
        if (IsDecimalDigit(c0_)) {
          // A number that starts with a decimal point, e.g. ".5".
          token = ScanNumber(true);
        } else {
          token = Token::PERIOD;
        }
        break;

      case ':':
        token = Select(Token::COLON);
        break;

      case ';':
        token = Select(Token::SEMICOLON);
        break;

      case ',':
        token = Select(Token::COMMA);
        break;

      case '(':
        token = Select(Token::LPAREN);
        break;

      case ')':
        token = Select(Token::RPAREN);
        break;

      case '[':
        token = Select(Token::LBRACK);
        break;

      case ']':
        token = Select(Token::RBRACK);
        break;

      case '{':
        token = Select(Token::LBRACE);
        break;

      case '}':
        token = Select(Token::RBRACE);
        break;

      case '?':
        token = Select(Token::CONDITIONAL);
        break;

      case '~':
        token = Select(Token::BIT_NOT);
        break;

      default:
        // Everything without a dedicated case: identifiers and keywords,
        // numbers, remaining whitespace, end of input (negative c0_), and
        // finally characters that cannot start any token.
        if (kIsIdentifierStart.get(c0_)) {
          token = ScanIdentifier();
        } else if (IsDecimalDigit(c0_)) {
          token = ScanNumber(false);
        } else if (SkipWhiteSpace()) {
          token = Token::WHITESPACE;
        } else if (c0_ < 0) {
          token = Token::EOS;
        } else {
          token = Select(Token::ILLEGAL);
        }
        break;
    }

    // Continue scanning for tokens as long as we're just skipping
    // whitespace.
  } while (token == Token::WHITESPACE);

  next_.location.end_pos = source_pos();
  next_.token = token;
}
1011
1012
1013void Scanner::SeekForward(int pos) {
1014  source_->SeekForward(pos - 1);
1015  Advance();
1016  // This function is only called to seek to the location
1017  // of the end of a function (at the "}" token). It doesn't matter
1018  // whether there was a line terminator in the part we skip.
1019  has_line_terminator_before_next_ = false;
1020  Scan();
1021}
1022
1023
1024uc32 Scanner::ScanHexEscape(uc32 c, int length) {
1025  ASSERT(length <= 4);  // prevent overflow
1026
1027  uc32 digits[4];
1028  uc32 x = 0;
1029  for (int i = 0; i < length; i++) {
1030    digits[i] = c0_;
1031    int d = HexValue(c0_);
1032    if (d < 0) {
1033      // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
1034      // should be illegal, but other JS VMs just return the
1035      // non-escaped version of the original character.
1036
1037      // Push back digits read, except the last one (in c0_).
1038      for (int j = i-1; j >= 0; j--) {
1039        PushBack(digits[j]);
1040      }
1041      // Notice: No handling of error - treat it as "\u"->"u".
1042      return c;
1043    }
1044    x = x * 16 + d;
1045    Advance();
1046  }
1047
1048  return x;
1049}
1050
1051
1052// Octal escapes of the forms '\0xx' and '\xxx' are not a part of
1053// ECMA-262. Other JS VMs support them.
1054uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
1055  uc32 x = c - '0';
1056  for (int i = 0; i < length; i++) {
1057    int d = c0_ - '0';
1058    if (d < 0 || d > 7) break;
1059    int nx = x * 8 + d;
1060    if (nx >= 256) break;
1061    x = nx;
1062    Advance();
1063  }
1064  return x;
1065}
1066
1067
1068void Scanner::ScanEscape() {
1069  uc32 c = c0_;
1070  Advance();
1071
1072  // Skip escaped newlines.
1073  if (kIsLineTerminator.get(c)) {
1074    // Allow CR+LF newlines in multiline string literals.
1075    if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
1076    // Allow LF+CR newlines in multiline string literals.
1077    if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
1078    return;
1079  }
1080
1081  switch (c) {
1082    case '\'':  // fall through
1083    case '"' :  // fall through
1084    case '\\': break;
1085    case 'b' : c = '\b'; break;
1086    case 'f' : c = '\f'; break;
1087    case 'n' : c = '\n'; break;
1088    case 'r' : c = '\r'; break;
1089    case 't' : c = '\t'; break;
1090    case 'u' : c = ScanHexEscape(c, 4); break;
1091    case 'v' : c = '\v'; break;
1092    case 'x' : c = ScanHexEscape(c, 2); break;
1093    case '0' :  // fall through
1094    case '1' :  // fall through
1095    case '2' :  // fall through
1096    case '3' :  // fall through
1097    case '4' :  // fall through
1098    case '5' :  // fall through
1099    case '6' :  // fall through
1100    case '7' : c = ScanOctalEscape(c, 2); break;
1101  }
1102
1103  // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
1104  // should be illegal, but they are commonly handled
1105  // as non-escaped characters by JS VMs.
1106  AddChar(c);
1107}
1108
1109
1110Token::Value Scanner::ScanString() {
1111  uc32 quote = c0_;
1112  Advance();  // consume quote
1113
1114  LiteralScope literal(this);
1115  while (c0_ != quote && c0_ >= 0 && !kIsLineTerminator.get(c0_)) {
1116    uc32 c = c0_;
1117    Advance();
1118    if (c == '\\') {
1119      if (c0_ < 0) return Token::ILLEGAL;
1120      ScanEscape();
1121    } else {
1122      AddChar(c);
1123    }
1124  }
1125  if (c0_ != quote) return Token::ILLEGAL;
1126  literal.Complete();
1127
1128  Advance();  // consume quote
1129  return Token::STRING;
1130}
1131
1132
1133Token::Value Scanner::Select(Token::Value tok) {
1134  Advance();
1135  return tok;
1136}
1137
1138
1139Token::Value Scanner::Select(uc32 next, Token::Value then, Token::Value else_) {
1140  Advance();
1141  if (c0_ == next) {
1142    Advance();
1143    return then;
1144  } else {
1145    return else_;
1146  }
1147}
1148
1149
1150// Returns true if any decimal digits were scanned, returns false otherwise.
1151void Scanner::ScanDecimalDigits() {
1152  while (IsDecimalDigit(c0_))
1153    AddCharAdvance();
1154}
1155
1156
1157Token::Value Scanner::ScanNumber(bool seen_period) {
1158  ASSERT(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
1159
1160  enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
1161
1162  LiteralScope literal(this);
1163  if (seen_period) {
1164    // we have already seen a decimal point of the float
1165    AddChar('.');
1166    ScanDecimalDigits();  // we know we have at least one digit
1167
1168  } else {
1169    // if the first character is '0' we must check for octals and hex
1170    if (c0_ == '0') {
1171      AddCharAdvance();
1172
1173      // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
1174      if (c0_ == 'x' || c0_ == 'X') {
1175        // hex number
1176        kind = HEX;
1177        AddCharAdvance();
1178        if (!IsHexDigit(c0_)) {
1179          // we must have at least one hex digit after 'x'/'X'
1180          return Token::ILLEGAL;
1181        }
1182        while (IsHexDigit(c0_)) {
1183          AddCharAdvance();
1184        }
1185      } else if ('0' <= c0_ && c0_ <= '7') {
1186        // (possible) octal number
1187        kind = OCTAL;
1188        while (true) {
1189          if (c0_ == '8' || c0_ == '9') {
1190            kind = DECIMAL;
1191            break;
1192          }
1193          if (c0_  < '0' || '7'  < c0_) break;
1194          AddCharAdvance();
1195        }
1196      }
1197    }
1198
1199    // Parse decimal digits and allow trailing fractional part.
1200    if (kind == DECIMAL) {
1201      ScanDecimalDigits();  // optional
1202      if (c0_ == '.') {
1203        AddCharAdvance();
1204        ScanDecimalDigits();  // optional
1205      }
1206    }
1207  }
1208
1209  // scan exponent, if any
1210  if (c0_ == 'e' || c0_ == 'E') {
1211    ASSERT(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
1212    if (kind == OCTAL) return Token::ILLEGAL;  // no exponent for octals allowed
1213    // scan exponent
1214    AddCharAdvance();
1215    if (c0_ == '+' || c0_ == '-')
1216      AddCharAdvance();
1217    if (!IsDecimalDigit(c0_)) {
1218      // we must have at least one decimal digit after 'e'/'E'
1219      return Token::ILLEGAL;
1220    }
1221    ScanDecimalDigits();
1222  }
1223
1224  // The source character immediately following a numeric literal must
1225  // not be an identifier start or a decimal digit; see ECMA-262
1226  // section 7.8.3, page 17 (note that we read only one decimal digit
1227  // if the value is 0).
1228  if (IsDecimalDigit(c0_) || kIsIdentifierStart.get(c0_))
1229    return Token::ILLEGAL;
1230
1231  literal.Complete();
1232
1233  return Token::NUMBER;
1234}
1235
1236
1237uc32 Scanner::ScanIdentifierUnicodeEscape() {
1238  Advance();
1239  if (c0_ != 'u') return unibrow::Utf8::kBadChar;
1240  Advance();
1241  uc32 c = ScanHexEscape('u', 4);
1242  // We do not allow a unicode escape sequence to start another
1243  // unicode escape sequence.
1244  if (c == '\\') return unibrow::Utf8::kBadChar;
1245  return c;
1246}
1247
1248
1249Token::Value Scanner::ScanIdentifier() {
1250  ASSERT(kIsIdentifierStart.get(c0_));
1251
1252  LiteralScope literal(this);
1253  KeywordMatcher keyword_match;
1254
1255  // Scan identifier start character.
1256  if (c0_ == '\\') {
1257    uc32 c = ScanIdentifierUnicodeEscape();
1258    // Only allow legal identifier start characters.
1259    if (!kIsIdentifierStart.get(c)) return Token::ILLEGAL;
1260    AddChar(c);
1261    keyword_match.Fail();
1262  } else {
1263    AddChar(c0_);
1264    keyword_match.AddChar(c0_);
1265    Advance();
1266  }
1267
1268  // Scan the rest of the identifier characters.
1269  while (kIsIdentifierPart.get(c0_)) {
1270    if (c0_ == '\\') {
1271      uc32 c = ScanIdentifierUnicodeEscape();
1272      // Only allow legal identifier part characters.
1273      if (!kIsIdentifierPart.get(c)) return Token::ILLEGAL;
1274      AddChar(c);
1275      keyword_match.Fail();
1276    } else {
1277      AddChar(c0_);
1278      keyword_match.AddChar(c0_);
1279      Advance();
1280    }
1281  }
1282  literal.Complete();
1283
1284  return keyword_match.token();
1285}
1286
1287
1288
1289bool Scanner::IsIdentifier(unibrow::CharacterStream* buffer) {
1290  // Checks whether the buffer contains an identifier (no escape).
1291  if (!buffer->has_more()) return false;
1292  if (!kIsIdentifierStart.get(buffer->GetNext())) return false;
1293  while (buffer->has_more()) {
1294    if (!kIsIdentifierPart.get(buffer->GetNext())) return false;
1295  }
1296  return true;
1297}
1298
1299
1300bool Scanner::ScanRegExpPattern(bool seen_equal) {
1301  // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
1302  bool in_character_class = false;
1303
1304  // Previous token is either '/' or '/=', in the second case, the
1305  // pattern starts at =.
1306  next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
1307  next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1308
1309  // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1310  // the scanner should pass uninterpreted bodies to the RegExp
1311  // constructor.
1312  LiteralScope literal(this);
1313  if (seen_equal)
1314    AddChar('=');
1315
1316  while (c0_ != '/' || in_character_class) {
1317    if (kIsLineTerminator.get(c0_) || c0_ < 0) return false;
1318    if (c0_ == '\\') {  // escaped character
1319      AddCharAdvance();
1320      if (kIsLineTerminator.get(c0_) || c0_ < 0) return false;
1321      AddCharAdvance();
1322    } else {  // unescaped character
1323      if (c0_ == '[') in_character_class = true;
1324      if (c0_ == ']') in_character_class = false;
1325      AddCharAdvance();
1326    }
1327  }
1328  Advance();  // consume '/'
1329
1330  literal.Complete();
1331
1332  return true;
1333}
1334
1335bool Scanner::ScanRegExpFlags() {
1336  // Scan regular expression flags.
1337  LiteralScope literal(this);
1338  while (kIsIdentifierPart.get(c0_)) {
1339    if (c0_ == '\\') {
1340      uc32 c = ScanIdentifierUnicodeEscape();
1341      if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {
1342        // We allow any escaped character, unlike the restriction on
1343        // IdentifierPart when it is used to build an IdentifierName.
1344        AddChar(c);
1345        continue;
1346      }
1347    }
1348    AddCharAdvance();
1349  }
1350  literal.Complete();
1351
1352  next_.location.end_pos = source_pos() - 1;
1353  return true;
1354}
1355
1356} }  // namespace v8::internal
1357