scanner.cc revision 85b71799222b55eb5dd74ea26efe0c64ab655c8c
1// Copyright 2011 the V8 project authors. All rights reserved.
2// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6//     * Redistributions of source code must retain the above copyright
7//       notice, this list of conditions and the following disclaimer.
8//     * Redistributions in binary form must reproduce the above
9//       copyright notice, this list of conditions and the following
10//       disclaimer in the documentation and/or other materials provided
11//       with the distribution.
12//     * Neither the name of Google Inc. nor the names of its
13//       contributors may be used to endorse or promote products derived
14//       from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28// Features shared by parsing and pre-parsing scanners.
29
30#include "scanner.h"
31
32#include "../include/v8stdint.h"
33#include "char-predicates-inl.h"
34
35namespace v8 {
36namespace internal {
37
38// ----------------------------------------------------------------------------
39// Scanner::LiteralScope
40
41Scanner::LiteralScope::LiteralScope(Scanner* self)
42    : scanner_(self), complete_(false) {
43  self->StartLiteral();
44}
45
46
47Scanner::LiteralScope::~LiteralScope() {
48  if (!complete_) scanner_->DropLiteral();
49}
50
51
// Terminates the literal on the owning scanner and marks this scope as
// complete, so the destructor will not drop the literal.
void Scanner::LiteralScope::Complete() {
  scanner_->TerminateLiteral();
  complete_ = true;
}
56
57// ----------------------------------------------------------------------------
58// Scanner
59
// Stores the UnicodeCache pointer used for the character classification
// queries (whitespace, line terminators, identifier characters) made by
// the scanning routines in this file.
Scanner::Scanner(UnicodeCache* unicode_cache)
    : unicode_cache_(unicode_cache) { }
62
63
64uc32 Scanner::ScanHexNumber(int expected_length) {
65  ASSERT(expected_length <= 4);  // prevent overflow
66
67  uc32 digits[4] = { 0, 0, 0, 0 };
68  uc32 x = 0;
69  for (int i = 0; i < expected_length; i++) {
70    digits[i] = c0_;
71    int d = HexValue(c0_);
72    if (d < 0) {
73      // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
74      // should be illegal, but other JS VMs just return the
75      // non-escaped version of the original character.
76
77      // Push back digits that we have advanced past.
78      for (int j = i-1; j >= 0; j--) {
79        PushBack(digits[j]);
80      }
81      return -1;
82    }
83    x = x * 16 + d;
84    Advance();
85  }
86
87  return x;
88}
89
90
91
92// ----------------------------------------------------------------------------
93// JavaScriptScanner
94
// Constructs a JavaScript scanner. |scanner_contants| is forwarded
// unchanged to the Scanner base constructor; the recorded octal position
// starts out as Location::invalid() and harmony block scoping starts off.
JavaScriptScanner::JavaScriptScanner(UnicodeCache* scanner_contants)
    : Scanner(scanner_contants),
      octal_pos_(Location::invalid()),
      harmony_block_scoping_(false) { }
99
100
// Attaches the scanner to |source| and primes it: after this call the
// first token of the stream has been scanned into the lookahead slot
// (Scan() fills next_).
void JavaScriptScanner::Initialize(UC16CharacterStream* source) {
  source_ = source;
  // Need to capture identifiers in order to recognize "get" and "set"
  // in object literals.
  Init();
  // Skip initial whitespace allowing HTML comment ends just like
  // after a newline and scan first token.
  // (SkipWhiteSpace() only treats '-->' as a comment when this flag is set.)
  has_line_terminator_before_next_ = true;
  SkipWhiteSpace();
  Scan();
}
112
113
114// Ensure that tokens can be stored in a byte.
115STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
116
117// Table of one-character tokens, by character (0x00..0x7f only).
118static const byte one_char_tokens[] = {
119  Token::ILLEGAL,
120  Token::ILLEGAL,
121  Token::ILLEGAL,
122  Token::ILLEGAL,
123  Token::ILLEGAL,
124  Token::ILLEGAL,
125  Token::ILLEGAL,
126  Token::ILLEGAL,
127  Token::ILLEGAL,
128  Token::ILLEGAL,
129  Token::ILLEGAL,
130  Token::ILLEGAL,
131  Token::ILLEGAL,
132  Token::ILLEGAL,
133  Token::ILLEGAL,
134  Token::ILLEGAL,
135  Token::ILLEGAL,
136  Token::ILLEGAL,
137  Token::ILLEGAL,
138  Token::ILLEGAL,
139  Token::ILLEGAL,
140  Token::ILLEGAL,
141  Token::ILLEGAL,
142  Token::ILLEGAL,
143  Token::ILLEGAL,
144  Token::ILLEGAL,
145  Token::ILLEGAL,
146  Token::ILLEGAL,
147  Token::ILLEGAL,
148  Token::ILLEGAL,
149  Token::ILLEGAL,
150  Token::ILLEGAL,
151  Token::ILLEGAL,
152  Token::ILLEGAL,
153  Token::ILLEGAL,
154  Token::ILLEGAL,
155  Token::ILLEGAL,
156  Token::ILLEGAL,
157  Token::ILLEGAL,
158  Token::ILLEGAL,
159  Token::LPAREN,       // 0x28
160  Token::RPAREN,       // 0x29
161  Token::ILLEGAL,
162  Token::ILLEGAL,
163  Token::COMMA,        // 0x2c
164  Token::ILLEGAL,
165  Token::ILLEGAL,
166  Token::ILLEGAL,
167  Token::ILLEGAL,
168  Token::ILLEGAL,
169  Token::ILLEGAL,
170  Token::ILLEGAL,
171  Token::ILLEGAL,
172  Token::ILLEGAL,
173  Token::ILLEGAL,
174  Token::ILLEGAL,
175  Token::ILLEGAL,
176  Token::ILLEGAL,
177  Token::COLON,        // 0x3a
178  Token::SEMICOLON,    // 0x3b
179  Token::ILLEGAL,
180  Token::ILLEGAL,
181  Token::ILLEGAL,
182  Token::CONDITIONAL,  // 0x3f
183  Token::ILLEGAL,
184  Token::ILLEGAL,
185  Token::ILLEGAL,
186  Token::ILLEGAL,
187  Token::ILLEGAL,
188  Token::ILLEGAL,
189  Token::ILLEGAL,
190  Token::ILLEGAL,
191  Token::ILLEGAL,
192  Token::ILLEGAL,
193  Token::ILLEGAL,
194  Token::ILLEGAL,
195  Token::ILLEGAL,
196  Token::ILLEGAL,
197  Token::ILLEGAL,
198  Token::ILLEGAL,
199  Token::ILLEGAL,
200  Token::ILLEGAL,
201  Token::ILLEGAL,
202  Token::ILLEGAL,
203  Token::ILLEGAL,
204  Token::ILLEGAL,
205  Token::ILLEGAL,
206  Token::ILLEGAL,
207  Token::ILLEGAL,
208  Token::ILLEGAL,
209  Token::ILLEGAL,
210  Token::LBRACK,     // 0x5b
211  Token::ILLEGAL,
212  Token::RBRACK,     // 0x5d
213  Token::ILLEGAL,
214  Token::ILLEGAL,
215  Token::ILLEGAL,
216  Token::ILLEGAL,
217  Token::ILLEGAL,
218  Token::ILLEGAL,
219  Token::ILLEGAL,
220  Token::ILLEGAL,
221  Token::ILLEGAL,
222  Token::ILLEGAL,
223  Token::ILLEGAL,
224  Token::ILLEGAL,
225  Token::ILLEGAL,
226  Token::ILLEGAL,
227  Token::ILLEGAL,
228  Token::ILLEGAL,
229  Token::ILLEGAL,
230  Token::ILLEGAL,
231  Token::ILLEGAL,
232  Token::ILLEGAL,
233  Token::ILLEGAL,
234  Token::ILLEGAL,
235  Token::ILLEGAL,
236  Token::ILLEGAL,
237  Token::ILLEGAL,
238  Token::ILLEGAL,
239  Token::ILLEGAL,
240  Token::ILLEGAL,
241  Token::ILLEGAL,
242  Token::LBRACE,       // 0x7b
243  Token::ILLEGAL,
244  Token::RBRACE,       // 0x7d
245  Token::BIT_NOT,      // 0x7e
246  Token::ILLEGAL
247};
248
249
250Token::Value JavaScriptScanner::Next() {
251  current_ = next_;
252  has_line_terminator_before_next_ = false;
253  has_multiline_comment_before_next_ = false;
254  if (static_cast<unsigned>(c0_) <= 0x7f) {
255    Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
256    if (token != Token::ILLEGAL) {
257      int pos = source_pos();
258      next_.token = token;
259      next_.location.beg_pos = pos;
260      next_.location.end_pos = pos + 1;
261      Advance();
262      return current_.token;
263    }
264  }
265  Scan();
266  return current_.token;
267}
268
269
270static inline bool IsByteOrderMark(uc32 c) {
271  // The Unicode value U+FFFE is guaranteed never to be assigned as a
272  // Unicode character; this implies that in a Unicode context the
273  // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
274  // character expressed in little-endian byte order (since it could
275  // not be a U+FFFE character expressed in big-endian byte
276  // order). Nevertheless, we check for it to be compatible with
277  // Spidermonkey.
278  return c == 0xFEFF || c == 0xFFFE;
279}
280
281
282bool JavaScriptScanner::SkipWhiteSpace() {
283  int start_position = source_pos();
284
285  while (true) {
286    // We treat byte-order marks (BOMs) as whitespace for better
287    // compatibility with Spidermonkey and other JavaScript engines.
288    while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) {
289      // IsWhiteSpace() includes line terminators!
290      if (unicode_cache_->IsLineTerminator(c0_)) {
291        // Ignore line terminators, but remember them. This is necessary
292        // for automatic semicolon insertion.
293        has_line_terminator_before_next_ = true;
294      }
295      Advance();
296    }
297
298    // If there is an HTML comment end '-->' at the beginning of a
299    // line (with only whitespace in front of it), we treat the rest
300    // of the line as a comment. This is in line with the way
301    // SpiderMonkey handles it.
302    if (c0_ == '-' && has_line_terminator_before_next_) {
303      Advance();
304      if (c0_ == '-') {
305        Advance();
306        if (c0_ == '>') {
307          // Treat the rest of the line as a comment.
308          SkipSingleLineComment();
309          // Continue skipping white space after the comment.
310          continue;
311        }
312        PushBack('-');  // undo Advance()
313      }
314      PushBack('-');  // undo Advance()
315    }
316    // Return whether or not we skipped any characters.
317    return source_pos() != start_position;
318  }
319}
320
321
322Token::Value JavaScriptScanner::SkipSingleLineComment() {
323  Advance();
324
325  // The line terminator at the end of the line is not considered
326  // to be part of the single-line comment; it is recognized
327  // separately by the lexical grammar and becomes part of the
328  // stream of input elements for the syntactic grammar (see
329  // ECMA-262, section 7.4).
330  while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
331    Advance();
332  }
333
334  return Token::WHITESPACE;
335}
336
337
338Token::Value JavaScriptScanner::SkipMultiLineComment() {
339  ASSERT(c0_ == '*');
340  Advance();
341
342  while (c0_ >= 0) {
343    uc32 ch = c0_;
344    Advance();
345    if (unicode_cache_->IsLineTerminator(ch)) {
346      // Following ECMA-262, section 7.4, a comment containing
347      // a newline will make the comment count as a line-terminator.
348      has_multiline_comment_before_next_ = true;
349    }
350    // If we have reached the end of the multi-line comment, we
351    // consume the '/' and insert a whitespace. This way all
352    // multi-line comments are treated as whitespace.
353    if (ch == '*' && c0_ == '/') {
354      c0_ = ' ';
355      return Token::WHITESPACE;
356    }
357  }
358
359  // Unterminated multi-line comment.
360  return Token::ILLEGAL;
361}
362
363
364Token::Value JavaScriptScanner::ScanHtmlComment() {
365  // Check for <!-- comments.
366  ASSERT(c0_ == '!');
367  Advance();
368  if (c0_ == '-') {
369    Advance();
370    if (c0_ == '-') return SkipSingleLineComment();
371    PushBack('-');  // undo Advance()
372  }
373  PushBack('!');  // undo Advance()
374  ASSERT(c0_ == '!');
375  return Token::LT;
376}
377
378
// Scans the next token from the input into next_.
//
// Whitespace and comments produce Token::WHITESPACE internally and are
// skipped by looping; for the token finally returned, next_.token and
// next_.location (beg_pos/end_pos) are filled in. next_.literal_chars is
// reset to NULL up front.
//
// NOTE(review): the one- and three-argument Select() helpers are defined
// outside this file section; they are presumably "consume the current
// character and return the token" and "consume, then choose between two
// tokens depending on the following character" - confirm in scanner.h.
void JavaScriptScanner::Scan() {
  next_.literal_chars = NULL;
  Token::Value token;
  do {
    // Remember the position of the next token
    next_.location.beg_pos = source_pos();

    switch (c0_) {
      case ' ':
      case '\t':
        Advance();
        token = Token::WHITESPACE;
        break;

      case '\n':
        // A newline is whitespace too, but it must be remembered.
        Advance();
        has_line_terminator_before_next_ = true;
        token = Token::WHITESPACE;
        break;

      case '"': case '\'':
        token = ScanString();
        break;

      case '<':
        // < <= << <<= <!--
        Advance();
        if (c0_ == '=') {
          token = Select(Token::LTE);
        } else if (c0_ == '<') {
          token = Select('=', Token::ASSIGN_SHL, Token::SHL);
        } else if (c0_ == '!') {
          token = ScanHtmlComment();
        } else {
          token = Token::LT;
        }
        break;

      case '>':
        // > >= >> >>= >>> >>>=
        Advance();
        if (c0_ == '=') {
          token = Select(Token::GTE);
        } else if (c0_ == '>') {
          // >> >>= >>> >>>=
          Advance();
          if (c0_ == '=') {
            token = Select(Token::ASSIGN_SAR);
          } else if (c0_ == '>') {
            token = Select('=', Token::ASSIGN_SHR, Token::SHR);
          } else {
            token = Token::SAR;
          }
        } else {
          token = Token::GT;
        }
        break;

      case '=':
        // = == ===
        Advance();
        if (c0_ == '=') {
          token = Select('=', Token::EQ_STRICT, Token::EQ);
        } else {
          token = Token::ASSIGN;
        }
        break;

      case '!':
        // ! != !==
        Advance();
        if (c0_ == '=') {
          token = Select('=', Token::NE_STRICT, Token::NE);
        } else {
          token = Token::NOT;
        }
        break;

      case '+':
        // + ++ +=
        Advance();
        if (c0_ == '+') {
          token = Select(Token::INC);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_ADD);
        } else {
          token = Token::ADD;
        }
        break;

      case '-':
        // - -- --> -=
        Advance();
        if (c0_ == '-') {
          Advance();
          if (c0_ == '>' && has_line_terminator_before_next_) {
            // For compatibility with SpiderMonkey, we skip lines that
            // start with an HTML comment end '-->'.
            token = SkipSingleLineComment();
          } else {
            token = Token::DEC;
          }
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_SUB);
        } else {
          token = Token::SUB;
        }
        break;

      case '*':
        // * *=
        token = Select('=', Token::ASSIGN_MUL, Token::MUL);
        break;

      case '%':
        // % %=
        token = Select('=', Token::ASSIGN_MOD, Token::MOD);
        break;

      case '/':
        // /  // /* /=
        // Both comment skippers return Token::WHITESPACE on success, so
        // the enclosing loop simply continues after a comment.
        Advance();
        if (c0_ == '/') {
          token = SkipSingleLineComment();
        } else if (c0_ == '*') {
          token = SkipMultiLineComment();
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_DIV);
        } else {
          token = Token::DIV;
        }
        break;

      case '&':
        // & && &=
        Advance();
        if (c0_ == '&') {
          token = Select(Token::AND);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_BIT_AND);
        } else {
          token = Token::BIT_AND;
        }
        break;

      case '|':
        // | || |=
        Advance();
        if (c0_ == '|') {
          token = Select(Token::OR);
        } else if (c0_ == '=') {
          token = Select(Token::ASSIGN_BIT_OR);
        } else {
          token = Token::BIT_OR;
        }
        break;

      case '^':
        // ^ ^=
        token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
        break;

      case '.':
        // . Number
        // A '.' directly followed by a digit starts a fraction such as
        // ".5"; ScanNumber(true) is told the period was already consumed.
        Advance();
        if (IsDecimalDigit(c0_)) {
          token = ScanNumber(true);
        } else {
          token = Token::PERIOD;
        }
        break;

      case ':':
        token = Select(Token::COLON);
        break;

      case ';':
        token = Select(Token::SEMICOLON);
        break;

      case ',':
        token = Select(Token::COMMA);
        break;

      case '(':
        token = Select(Token::LPAREN);
        break;

      case ')':
        token = Select(Token::RPAREN);
        break;

      case '[':
        token = Select(Token::LBRACK);
        break;

      case ']':
        token = Select(Token::RBRACK);
        break;

      case '{':
        token = Select(Token::LBRACE);
        break;

      case '}':
        token = Select(Token::RBRACE);
        break;

      case '?':
        token = Select(Token::CONDITIONAL);
        break;

      case '~':
        token = Select(Token::BIT_NOT);
        break;

      default:
        // Everything without a dedicated case: identifiers and keywords,
        // numbers, remaining whitespace, end of input (c0_ < 0), and
        // finally characters that cannot start any token.
        if (unicode_cache_->IsIdentifierStart(c0_)) {
          token = ScanIdentifierOrKeyword();
        } else if (IsDecimalDigit(c0_)) {
          token = ScanNumber(false);
        } else if (SkipWhiteSpace()) {
          token = Token::WHITESPACE;
        } else if (c0_ < 0) {
          token = Token::EOS;
        } else {
          token = Select(Token::ILLEGAL);
        }
        break;
    }

    // Continue scanning for tokens as long as we're just skipping
    // whitespace.
  } while (token == Token::WHITESPACE);

  next_.location.end_pos = source_pos();
  next_.token = token;
}
617
618
619void JavaScriptScanner::SeekForward(int pos) {
620  // After this call, we will have the token at the given position as
621  // the "next" token. The "current" token will be invalid.
622  if (pos == next_.location.beg_pos) return;
623  int current_pos = source_pos();
624  ASSERT_EQ(next_.location.end_pos, current_pos);
625  // Positions inside the lookahead token aren't supported.
626  ASSERT(pos >= current_pos);
627  if (pos != current_pos) {
628    source_->SeekForward(pos - source_->pos());
629    Advance();
630    // This function is only called to seek to the location
631    // of the end of a function (at the "}" token). It doesn't matter
632    // whether there was a line terminator in the part we skip.
633    has_line_terminator_before_next_ = false;
634    has_multiline_comment_before_next_ = false;
635  }
636  Scan();
637}
638
639
640void JavaScriptScanner::ScanEscape() {
641  uc32 c = c0_;
642  Advance();
643
644  // Skip escaped newlines.
645  if (unicode_cache_->IsLineTerminator(c)) {
646    // Allow CR+LF newlines in multiline string literals.
647    if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
648    // Allow LF+CR newlines in multiline string literals.
649    if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
650    return;
651  }
652
653  switch (c) {
654    case '\'':  // fall through
655    case '"' :  // fall through
656    case '\\': break;
657    case 'b' : c = '\b'; break;
658    case 'f' : c = '\f'; break;
659    case 'n' : c = '\n'; break;
660    case 'r' : c = '\r'; break;
661    case 't' : c = '\t'; break;
662    case 'u' : {
663      c = ScanHexNumber(4);
664      if (c < 0) c = 'u';
665      break;
666    }
667    case 'v' : c = '\v'; break;
668    case 'x' : {
669      c = ScanHexNumber(2);
670      if (c < 0) c = 'x';
671      break;
672    }
673    case '0' :  // fall through
674    case '1' :  // fall through
675    case '2' :  // fall through
676    case '3' :  // fall through
677    case '4' :  // fall through
678    case '5' :  // fall through
679    case '6' :  // fall through
680    case '7' : c = ScanOctalEscape(c, 2); break;
681  }
682
683  // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
684  // should be illegal, but they are commonly handled
685  // as non-escaped characters by JS VMs.
686  AddLiteralChar(c);
687}
688
689
690// Octal escapes of the forms '\0xx' and '\xxx' are not a part of
691// ECMA-262. Other JS VMs support them.
692uc32 JavaScriptScanner::ScanOctalEscape(uc32 c, int length) {
693  uc32 x = c - '0';
694  int i = 0;
695  for (; i < length; i++) {
696    int d = c0_ - '0';
697    if (d < 0 || d > 7) break;
698    int nx = x * 8 + d;
699    if (nx >= 256) break;
700    x = nx;
701    Advance();
702  }
703  // Anything except '\0' is an octal escape sequence, illegal in strict mode.
704  // Remember the position of octal escape sequences so that an error
705  // can be reported later (in strict mode).
706  // We don't report the error immediately, because the octal escape can
707  // occur before the "use strict" directive.
708  if (c != '0' || i > 0) {
709    octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
710  }
711  return x;
712}
713
714
715Token::Value JavaScriptScanner::ScanString() {
716  uc32 quote = c0_;
717  Advance();  // consume quote
718
719  LiteralScope literal(this);
720  while (c0_ != quote && c0_ >= 0
721         && !unicode_cache_->IsLineTerminator(c0_)) {
722    uc32 c = c0_;
723    Advance();
724    if (c == '\\') {
725      if (c0_ < 0) return Token::ILLEGAL;
726      ScanEscape();
727    } else {
728      AddLiteralChar(c);
729    }
730  }
731  if (c0_ != quote) return Token::ILLEGAL;
732  literal.Complete();
733
734  Advance();  // consume quote
735  return Token::STRING;
736}
737
738
739void JavaScriptScanner::ScanDecimalDigits() {
740  while (IsDecimalDigit(c0_))
741    AddLiteralCharAdvance();
742}
743
744
745Token::Value JavaScriptScanner::ScanNumber(bool seen_period) {
746  ASSERT(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
747
748  enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
749
750  LiteralScope literal(this);
751  if (seen_period) {
752    // we have already seen a decimal point of the float
753    AddLiteralChar('.');
754    ScanDecimalDigits();  // we know we have at least one digit
755
756  } else {
757    // if the first character is '0' we must check for octals and hex
758    if (c0_ == '0') {
759      int start_pos = source_pos();  // For reporting octal positions.
760      AddLiteralCharAdvance();
761
762      // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
763      if (c0_ == 'x' || c0_ == 'X') {
764        // hex number
765        kind = HEX;
766        AddLiteralCharAdvance();
767        if (!IsHexDigit(c0_)) {
768          // we must have at least one hex digit after 'x'/'X'
769          return Token::ILLEGAL;
770        }
771        while (IsHexDigit(c0_)) {
772          AddLiteralCharAdvance();
773        }
774      } else if ('0' <= c0_ && c0_ <= '7') {
775        // (possible) octal number
776        kind = OCTAL;
777        while (true) {
778          if (c0_ == '8' || c0_ == '9') {
779            kind = DECIMAL;
780            break;
781          }
782          if (c0_  < '0' || '7'  < c0_) {
783            // Octal literal finished.
784            octal_pos_ = Location(start_pos, source_pos());
785            break;
786          }
787          AddLiteralCharAdvance();
788        }
789      }
790    }
791
792    // Parse decimal digits and allow trailing fractional part.
793    if (kind == DECIMAL) {
794      ScanDecimalDigits();  // optional
795      if (c0_ == '.') {
796        AddLiteralCharAdvance();
797        ScanDecimalDigits();  // optional
798      }
799    }
800  }
801
802  // scan exponent, if any
803  if (c0_ == 'e' || c0_ == 'E') {
804    ASSERT(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
805    if (kind == OCTAL) return Token::ILLEGAL;  // no exponent for octals allowed
806    // scan exponent
807    AddLiteralCharAdvance();
808    if (c0_ == '+' || c0_ == '-')
809      AddLiteralCharAdvance();
810    if (!IsDecimalDigit(c0_)) {
811      // we must have at least one decimal digit after 'e'/'E'
812      return Token::ILLEGAL;
813    }
814    ScanDecimalDigits();
815  }
816
817  // The source character immediately following a numeric literal must
818  // not be an identifier start or a decimal digit; see ECMA-262
819  // section 7.8.3, page 17 (note that we read only one decimal digit
820  // if the value is 0).
821  if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))
822    return Token::ILLEGAL;
823
824  literal.Complete();
825
826  return Token::NUMBER;
827}
828
829
830uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() {
831  Advance();
832  if (c0_ != 'u') return -1;
833  Advance();
834  uc32 result = ScanHexNumber(4);
835  if (result < 0) PushBack('u');
836  return result;
837}
838
839
840// ----------------------------------------------------------------------------
841// Keyword Matcher
842
// X-macro listing every keyword the scanner recognizes.
//
// KEYWORD_GROUP(ch) opens the group of keywords whose first character is
// |ch|; each KEYWORD(text, token) entry after it maps the keyword |text|
// to |token|. The table is expanded inside KeywordOrIdentifierToken().
//
// The "let" entry refers to a variable named harmony_block_scoping, which
// must therefore be in scope wherever the KEYWORD entries are expanded.
#define KEYWORDS(KEYWORD_GROUP, KEYWORD)                            \
  KEYWORD_GROUP('b')                                                \
  KEYWORD("break", Token::BREAK)                                    \
  KEYWORD_GROUP('c')                                                \
  KEYWORD("case", Token::CASE)                                      \
  KEYWORD("catch", Token::CATCH)                                    \
  KEYWORD("class", Token::FUTURE_RESERVED_WORD)                     \
  KEYWORD("const", Token::CONST)                                    \
  KEYWORD("continue", Token::CONTINUE)                              \
  KEYWORD_GROUP('d')                                                \
  KEYWORD("debugger", Token::DEBUGGER)                              \
  KEYWORD("default", Token::DEFAULT)                                \
  KEYWORD("delete", Token::DELETE)                                  \
  KEYWORD("do", Token::DO)                                          \
  KEYWORD_GROUP('e')                                                \
  KEYWORD("else", Token::ELSE)                                      \
  KEYWORD("enum", Token::FUTURE_RESERVED_WORD)                      \
  KEYWORD("export", Token::FUTURE_RESERVED_WORD)                    \
  KEYWORD("extends", Token::FUTURE_RESERVED_WORD)                   \
  KEYWORD_GROUP('f')                                                \
  KEYWORD("false", Token::FALSE_LITERAL)                            \
  KEYWORD("finally", Token::FINALLY)                                \
  KEYWORD("for", Token::FOR)                                        \
  KEYWORD("function", Token::FUNCTION)                              \
  KEYWORD_GROUP('i')                                                \
  KEYWORD("if", Token::IF)                                          \
  KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD)         \
  KEYWORD("import", Token::FUTURE_RESERVED_WORD)                    \
  KEYWORD("in", Token::IN)                                          \
  KEYWORD("instanceof", Token::INSTANCEOF)                          \
  KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)          \
  KEYWORD_GROUP('l')                                                \
  KEYWORD("let", harmony_block_scoping                              \
                 ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \
  KEYWORD_GROUP('n')                                                \
  KEYWORD("new", Token::NEW)                                        \
  KEYWORD("null", Token::NULL_LITERAL)                              \
  KEYWORD_GROUP('p')                                                \
  KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)            \
  KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)            \
  KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)          \
  KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)             \
  KEYWORD_GROUP('r')                                                \
  KEYWORD("return", Token::RETURN)                                  \
  KEYWORD_GROUP('s')                                                \
  KEYWORD("static", Token::FUTURE_STRICT_RESERVED_WORD)             \
  KEYWORD("super", Token::FUTURE_RESERVED_WORD)                     \
  KEYWORD("switch", Token::SWITCH)                                  \
  KEYWORD_GROUP('t')                                                \
  KEYWORD("this", Token::THIS)                                      \
  KEYWORD("throw", Token::THROW)                                    \
  KEYWORD("true", Token::TRUE_LITERAL)                              \
  KEYWORD("try", Token::TRY)                                        \
  KEYWORD("typeof", Token::TYPEOF)                                  \
  KEYWORD_GROUP('v')                                                \
  KEYWORD("var", Token::VAR)                                        \
  KEYWORD("void", Token::VOID)                                      \
  KEYWORD_GROUP('w')                                                \
  KEYWORD("while", Token::WHILE)                                    \
  KEYWORD("with", Token::WITH)                                      \
  KEYWORD_GROUP('y')                                                \
  KEYWORD("yield", Token::FUTURE_STRICT_RESERVED_WORD)
905
906
// Classifies the |input_length| characters at |input| as a keyword or as
// a plain identifier.
//
// Returns the token of the matching KEYWORDS entry, or Token::IDENTIFIER
// if there is none. |harmony_block_scoping| decides which token "let"
// maps to (it is referenced by name from the KEYWORDS table).
//
// The switch below is generated from KEYWORDS: every KEYWORD_GROUP
// becomes "break; case <first char>:" and every KEYWORD becomes a block
// that compares the remaining characters and returns on a match. Within
// a group a failed comparison falls through to the next keyword; the
// "break" that opens the following group ends the search.
static Token::Value KeywordOrIdentifierToken(const char* input,
                                             int input_length,
                                             bool harmony_block_scoping) {
  ASSERT(input_length >= 1);
  // Lengths of the shortest and longest keyword; the STATIC_ASSERTs in
  // KEYWORD check every table entry against them.
  const int kMinLength = 2;
  const int kMaxLength = 10;
  if (input_length < kMinLength || input_length > kMaxLength) {
    return Token::IDENTIFIER;
  }
  switch (input[0]) {
    // "default:" is completed by the "break;" of the first group, so a
    // first character that starts no keyword leaves the switch at once.
    default:
#define KEYWORD_GROUP_CASE(ch)                                \
      break;                                                  \
    case ch:
#define KEYWORD(keyword, token)                               \
    {                                                         \
      /* 'keyword' is a char array, so sizeof(keyword) is */  \
      /* strlen(keyword) plus 1 for the NUL char. */          \
      const int keyword_length = sizeof(keyword) - 1;         \
      STATIC_ASSERT(keyword_length >= kMinLength);            \
      STATIC_ASSERT(keyword_length <= kMaxLength);            \
      if (input_length == keyword_length &&                   \
          input[1] == keyword[1] &&                           \
          (keyword_length <= 2 || input[2] == keyword[2]) &&  \
          (keyword_length <= 3 || input[3] == keyword[3]) &&  \
          (keyword_length <= 4 || input[4] == keyword[4]) &&  \
          (keyword_length <= 5 || input[5] == keyword[5]) &&  \
          (keyword_length <= 6 || input[6] == keyword[6]) &&  \
          (keyword_length <= 7 || input[7] == keyword[7]) &&  \
          (keyword_length <= 8 || input[8] == keyword[8]) &&  \
          (keyword_length <= 9 || input[9] == keyword[9])) {  \
        return token;                                         \
      }                                                       \
    }
    // input[0] is matched by the case label, so the comparisons in
    // KEYWORD start at index 1; the "keyword_length <= k ||" guards stop
    // them at the end of the keyword.
    KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
  }
  return Token::IDENTIFIER;
}
945
946
947Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() {
948  ASSERT(unicode_cache_->IsIdentifierStart(c0_));
949  LiteralScope literal(this);
950  // Scan identifier start character.
951  if (c0_ == '\\') {
952    uc32 c = ScanIdentifierUnicodeEscape();
953    // Only allow legal identifier start characters.
954    if (c < 0 ||
955        c == '\\' ||  // No recursive escapes.
956        !unicode_cache_->IsIdentifierStart(c)) {
957      return Token::ILLEGAL;
958    }
959    AddLiteralChar(c);
960    return ScanIdentifierSuffix(&literal);
961  }
962
963  uc32 first_char = c0_;
964  Advance();
965  AddLiteralChar(first_char);
966
967  // Scan the rest of the identifier characters.
968  while (unicode_cache_->IsIdentifierPart(c0_)) {
969    if (c0_ != '\\') {
970      uc32 next_char = c0_;
971      Advance();
972      AddLiteralChar(next_char);
973      continue;
974    }
975    // Fallthrough if no longer able to complete keyword.
976    return ScanIdentifierSuffix(&literal);
977  }
978
979  literal.Complete();
980
981  if (next_.literal_chars->is_ascii()) {
982    Vector<const char> chars = next_.literal_chars->ascii_literal();
983    return KeywordOrIdentifierToken(chars.start(),
984                                    chars.length(),
985                                    harmony_block_scoping_);
986  }
987
988  return Token::IDENTIFIER;
989}
990
991
992Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) {
993  // Scan the rest of the identifier characters.
994  while (unicode_cache_->IsIdentifierPart(c0_)) {
995    if (c0_ == '\\') {
996      uc32 c = ScanIdentifierUnicodeEscape();
997      // Only allow legal identifier part characters.
998      if (c < 0 ||
999          c == '\\' ||
1000          !unicode_cache_->IsIdentifierPart(c)) {
1001        return Token::ILLEGAL;
1002      }
1003      AddLiteralChar(c);
1004    } else {
1005      AddLiteralChar(c0_);
1006      Advance();
1007    }
1008  }
1009  literal->Complete();
1010
1011  return Token::IDENTIFIER;
1012}
1013
1014
1015bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) {
1016  // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
1017  bool in_character_class = false;
1018
1019  // Previous token is either '/' or '/=', in the second case, the
1020  // pattern starts at =.
1021  next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
1022  next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1023
1024  // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1025  // the scanner should pass uninterpreted bodies to the RegExp
1026  // constructor.
1027  LiteralScope literal(this);
1028  if (seen_equal) {
1029    AddLiteralChar('=');
1030  }
1031
1032  while (c0_ != '/' || in_character_class) {
1033    if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
1034    if (c0_ == '\\') {  // Escape sequence.
1035      AddLiteralCharAdvance();
1036      if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
1037      AddLiteralCharAdvance();
1038      // If the escape allows more characters, i.e., \x??, \u????, or \c?,
1039      // only "safe" characters are allowed (letters, digits, underscore),
1040      // otherwise the escape isn't valid and the invalid character has
1041      // its normal meaning. I.e., we can just continue scanning without
1042      // worrying whether the following characters are part of the escape
1043      // or not, since any '/', '\\' or '[' is guaranteed to not be part
1044      // of the escape sequence.
1045
1046      // TODO(896): At some point, parse RegExps more throughly to capture
1047      // octal esacpes in strict mode.
1048    } else {  // Unescaped character.
1049      if (c0_ == '[') in_character_class = true;
1050      if (c0_ == ']') in_character_class = false;
1051      AddLiteralCharAdvance();
1052    }
1053  }
1054  Advance();  // consume '/'
1055
1056  literal.Complete();
1057
1058  return true;
1059}
1060
1061
1062bool JavaScriptScanner::ScanLiteralUnicodeEscape() {
1063  ASSERT(c0_ == '\\');
1064  uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0};
1065  Advance();
1066  int i = 1;
1067  if (c0_ == 'u') {
1068    i++;
1069    while (i < 6) {
1070      Advance();
1071      if (!IsHexDigit(c0_)) break;
1072      chars_read[i] = c0_;
1073      i++;
1074    }
1075  }
1076  if (i < 6) {
1077    // Incomplete escape. Undo all advances and return false.
1078    while (i > 0) {
1079      i--;
1080      PushBack(chars_read[i]);
1081    }
1082    return false;
1083  }
1084  // Complete escape. Add all chars to current literal buffer.
1085  for (int i = 0; i < 6; i++) {
1086    AddLiteralChar(chars_read[i]);
1087  }
1088  return true;
1089}
1090
1091
1092bool JavaScriptScanner::ScanRegExpFlags() {
1093  // Scan regular expression flags.
1094  LiteralScope literal(this);
1095  while (unicode_cache_->IsIdentifierPart(c0_)) {
1096    if (c0_ != '\\') {
1097      AddLiteralCharAdvance();
1098    } else {
1099      if (!ScanLiteralUnicodeEscape()) {
1100        break;
1101      }
1102    }
1103  }
1104  literal.Complete();
1105
1106  next_.location.end_pos = source_pos() - 1;
1107  return true;
1108}
1109
1110} }  // namespace v8::internal
1111