1// Copyright 2011 the V8 project authors. All rights reserved.
2// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6//     * Redistributions of source code must retain the above copyright
7//       notice, this list of conditions and the following disclaimer.
8//     * Redistributions in binary form must reproduce the above
9//       copyright notice, this list of conditions and the following
10//       disclaimer in the documentation and/or other materials provided
11//       with the distribution.
12//     * Neither the name of Google Inc. nor the names of its
13//       contributors may be used to endorse or promote products derived
14//       from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28// Features shared by parsing and pre-parsing scanners.
29
30#include "scanner.h"
31
32#include "../include/v8stdint.h"
33#include "char-predicates-inl.h"
34
35namespace v8 {
36namespace internal {
37
38// ----------------------------------------------------------------------------
39// Scanner
40
41Scanner::Scanner(UnicodeCache* unicode_cache)
42    : unicode_cache_(unicode_cache),
43      octal_pos_(Location::invalid()),
44      harmony_scoping_(false),
45      harmony_modules_(false) { }
46
47
48void Scanner::Initialize(Utf16CharacterStream* source) {
49  source_ = source;
50  // Need to capture identifiers in order to recognize "get" and "set"
51  // in object literals.
52  Init();
53  // Skip initial whitespace allowing HTML comment ends just like
54  // after a newline and scan first token.
55  has_line_terminator_before_next_ = true;
56  SkipWhiteSpace();
57  Scan();
58}
59
60
61uc32 Scanner::ScanHexNumber(int expected_length) {
62  ASSERT(expected_length <= 4);  // prevent overflow
63
64  uc32 digits[4] = { 0, 0, 0, 0 };
65  uc32 x = 0;
66  for (int i = 0; i < expected_length; i++) {
67    digits[i] = c0_;
68    int d = HexValue(c0_);
69    if (d < 0) {
70      // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
71      // should be illegal, but other JS VMs just return the
72      // non-escaped version of the original character.
73
74      // Push back digits that we have advanced past.
75      for (int j = i-1; j >= 0; j--) {
76        PushBack(digits[j]);
77      }
78      return -1;
79    }
80    x = x * 16 + d;
81    Advance();
82  }
83
84  return x;
85}
86
87
88// Ensure that tokens can be stored in a byte.
89STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
90
91// Table of one-character tokens, by character (0x00..0x7f only).
92static const byte one_char_tokens[] = {
93  Token::ILLEGAL,
94  Token::ILLEGAL,
95  Token::ILLEGAL,
96  Token::ILLEGAL,
97  Token::ILLEGAL,
98  Token::ILLEGAL,
99  Token::ILLEGAL,
100  Token::ILLEGAL,
101  Token::ILLEGAL,
102  Token::ILLEGAL,
103  Token::ILLEGAL,
104  Token::ILLEGAL,
105  Token::ILLEGAL,
106  Token::ILLEGAL,
107  Token::ILLEGAL,
108  Token::ILLEGAL,
109  Token::ILLEGAL,
110  Token::ILLEGAL,
111  Token::ILLEGAL,
112  Token::ILLEGAL,
113  Token::ILLEGAL,
114  Token::ILLEGAL,
115  Token::ILLEGAL,
116  Token::ILLEGAL,
117  Token::ILLEGAL,
118  Token::ILLEGAL,
119  Token::ILLEGAL,
120  Token::ILLEGAL,
121  Token::ILLEGAL,
122  Token::ILLEGAL,
123  Token::ILLEGAL,
124  Token::ILLEGAL,
125  Token::ILLEGAL,
126  Token::ILLEGAL,
127  Token::ILLEGAL,
128  Token::ILLEGAL,
129  Token::ILLEGAL,
130  Token::ILLEGAL,
131  Token::ILLEGAL,
132  Token::ILLEGAL,
133  Token::LPAREN,       // 0x28
134  Token::RPAREN,       // 0x29
135  Token::ILLEGAL,
136  Token::ILLEGAL,
137  Token::COMMA,        // 0x2c
138  Token::ILLEGAL,
139  Token::ILLEGAL,
140  Token::ILLEGAL,
141  Token::ILLEGAL,
142  Token::ILLEGAL,
143  Token::ILLEGAL,
144  Token::ILLEGAL,
145  Token::ILLEGAL,
146  Token::ILLEGAL,
147  Token::ILLEGAL,
148  Token::ILLEGAL,
149  Token::ILLEGAL,
150  Token::ILLEGAL,
151  Token::COLON,        // 0x3a
152  Token::SEMICOLON,    // 0x3b
153  Token::ILLEGAL,
154  Token::ILLEGAL,
155  Token::ILLEGAL,
156  Token::CONDITIONAL,  // 0x3f
157  Token::ILLEGAL,
158  Token::ILLEGAL,
159  Token::ILLEGAL,
160  Token::ILLEGAL,
161  Token::ILLEGAL,
162  Token::ILLEGAL,
163  Token::ILLEGAL,
164  Token::ILLEGAL,
165  Token::ILLEGAL,
166  Token::ILLEGAL,
167  Token::ILLEGAL,
168  Token::ILLEGAL,
169  Token::ILLEGAL,
170  Token::ILLEGAL,
171  Token::ILLEGAL,
172  Token::ILLEGAL,
173  Token::ILLEGAL,
174  Token::ILLEGAL,
175  Token::ILLEGAL,
176  Token::ILLEGAL,
177  Token::ILLEGAL,
178  Token::ILLEGAL,
179  Token::ILLEGAL,
180  Token::ILLEGAL,
181  Token::ILLEGAL,
182  Token::ILLEGAL,
183  Token::ILLEGAL,
184  Token::LBRACK,     // 0x5b
185  Token::ILLEGAL,
186  Token::RBRACK,     // 0x5d
187  Token::ILLEGAL,
188  Token::ILLEGAL,
189  Token::ILLEGAL,
190  Token::ILLEGAL,
191  Token::ILLEGAL,
192  Token::ILLEGAL,
193  Token::ILLEGAL,
194  Token::ILLEGAL,
195  Token::ILLEGAL,
196  Token::ILLEGAL,
197  Token::ILLEGAL,
198  Token::ILLEGAL,
199  Token::ILLEGAL,
200  Token::ILLEGAL,
201  Token::ILLEGAL,
202  Token::ILLEGAL,
203  Token::ILLEGAL,
204  Token::ILLEGAL,
205  Token::ILLEGAL,
206  Token::ILLEGAL,
207  Token::ILLEGAL,
208  Token::ILLEGAL,
209  Token::ILLEGAL,
210  Token::ILLEGAL,
211  Token::ILLEGAL,
212  Token::ILLEGAL,
213  Token::ILLEGAL,
214  Token::ILLEGAL,
215  Token::ILLEGAL,
216  Token::LBRACE,       // 0x7b
217  Token::ILLEGAL,
218  Token::RBRACE,       // 0x7d
219  Token::BIT_NOT,      // 0x7e
220  Token::ILLEGAL
221};
222
223
224Token::Value Scanner::Next() {
225  current_ = next_;
226  has_line_terminator_before_next_ = false;
227  has_multiline_comment_before_next_ = false;
228  if (static_cast<unsigned>(c0_) <= 0x7f) {
229    Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
230    if (token != Token::ILLEGAL) {
231      int pos = source_pos();
232      next_.token = token;
233      next_.location.beg_pos = pos;
234      next_.location.end_pos = pos + 1;
235      Advance();
236      return current_.token;
237    }
238  }
239  Scan();
240  return current_.token;
241}
242
243
244static inline bool IsByteOrderMark(uc32 c) {
245  // The Unicode value U+FFFE is guaranteed never to be assigned as a
246  // Unicode character; this implies that in a Unicode context the
247  // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
248  // character expressed in little-endian byte order (since it could
249  // not be a U+FFFE character expressed in big-endian byte
250  // order). Nevertheless, we check for it to be compatible with
251  // Spidermonkey.
252  return c == 0xFEFF || c == 0xFFFE;
253}
254
255
256bool Scanner::SkipWhiteSpace() {
257  int start_position = source_pos();
258
259  while (true) {
260    // We treat byte-order marks (BOMs) as whitespace for better
261    // compatibility with Spidermonkey and other JavaScript engines.
262    while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) {
263      // IsWhiteSpace() includes line terminators!
264      if (unicode_cache_->IsLineTerminator(c0_)) {
265        // Ignore line terminators, but remember them. This is necessary
266        // for automatic semicolon insertion.
267        has_line_terminator_before_next_ = true;
268      }
269      Advance();
270    }
271
272    // If there is an HTML comment end '-->' at the beginning of a
273    // line (with only whitespace in front of it), we treat the rest
274    // of the line as a comment. This is in line with the way
275    // SpiderMonkey handles it.
276    if (c0_ == '-' && has_line_terminator_before_next_) {
277      Advance();
278      if (c0_ == '-') {
279        Advance();
280        if (c0_ == '>') {
281          // Treat the rest of the line as a comment.
282          SkipSingleLineComment();
283          // Continue skipping white space after the comment.
284          continue;
285        }
286        PushBack('-');  // undo Advance()
287      }
288      PushBack('-');  // undo Advance()
289    }
290    // Return whether or not we skipped any characters.
291    return source_pos() != start_position;
292  }
293}
294
295
296Token::Value Scanner::SkipSingleLineComment() {
297  Advance();
298
299  // The line terminator at the end of the line is not considered
300  // to be part of the single-line comment; it is recognized
301  // separately by the lexical grammar and becomes part of the
302  // stream of input elements for the syntactic grammar (see
303  // ECMA-262, section 7.4).
304  while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
305    Advance();
306  }
307
308  return Token::WHITESPACE;
309}
310
311
312Token::Value Scanner::SkipMultiLineComment() {
313  ASSERT(c0_ == '*');
314  Advance();
315
316  while (c0_ >= 0) {
317    uc32 ch = c0_;
318    Advance();
319    if (unicode_cache_->IsLineTerminator(ch)) {
320      // Following ECMA-262, section 7.4, a comment containing
321      // a newline will make the comment count as a line-terminator.
322      has_multiline_comment_before_next_ = true;
323    }
324    // If we have reached the end of the multi-line comment, we
325    // consume the '/' and insert a whitespace. This way all
326    // multi-line comments are treated as whitespace.
327    if (ch == '*' && c0_ == '/') {
328      c0_ = ' ';
329      return Token::WHITESPACE;
330    }
331  }
332
333  // Unterminated multi-line comment.
334  return Token::ILLEGAL;
335}
336
337
338Token::Value Scanner::ScanHtmlComment() {
339  // Check for <!-- comments.
340  ASSERT(c0_ == '!');
341  Advance();
342  if (c0_ == '-') {
343    Advance();
344    if (c0_ == '-') return SkipSingleLineComment();
345    PushBack('-');  // undo Advance()
346  }
347  PushBack('!');  // undo Advance()
348  ASSERT(c0_ == '!');
349  return Token::LT;
350}
351
352
353void Scanner::Scan() {
354  next_.literal_chars = NULL;
355  Token::Value token;
356  do {
357    // Remember the position of the next token
358    next_.location.beg_pos = source_pos();
359
360    switch (c0_) {
361      case ' ':
362      case '\t':
363        Advance();
364        token = Token::WHITESPACE;
365        break;
366
367      case '\n':
368        Advance();
369        has_line_terminator_before_next_ = true;
370        token = Token::WHITESPACE;
371        break;
372
373      case '"': case '\'':
374        token = ScanString();
375        break;
376
377      case '<':
378        // < <= << <<= <!--
379        Advance();
380        if (c0_ == '=') {
381          token = Select(Token::LTE);
382        } else if (c0_ == '<') {
383          token = Select('=', Token::ASSIGN_SHL, Token::SHL);
384        } else if (c0_ == '!') {
385          token = ScanHtmlComment();
386        } else {
387          token = Token::LT;
388        }
389        break;
390
391      case '>':
392        // > >= >> >>= >>> >>>=
393        Advance();
394        if (c0_ == '=') {
395          token = Select(Token::GTE);
396        } else if (c0_ == '>') {
397          // >> >>= >>> >>>=
398          Advance();
399          if (c0_ == '=') {
400            token = Select(Token::ASSIGN_SAR);
401          } else if (c0_ == '>') {
402            token = Select('=', Token::ASSIGN_SHR, Token::SHR);
403          } else {
404            token = Token::SAR;
405          }
406        } else {
407          token = Token::GT;
408        }
409        break;
410
411      case '=':
412        // = == ===
413        Advance();
414        if (c0_ == '=') {
415          token = Select('=', Token::EQ_STRICT, Token::EQ);
416        } else {
417          token = Token::ASSIGN;
418        }
419        break;
420
421      case '!':
422        // ! != !==
423        Advance();
424        if (c0_ == '=') {
425          token = Select('=', Token::NE_STRICT, Token::NE);
426        } else {
427          token = Token::NOT;
428        }
429        break;
430
431      case '+':
432        // + ++ +=
433        Advance();
434        if (c0_ == '+') {
435          token = Select(Token::INC);
436        } else if (c0_ == '=') {
437          token = Select(Token::ASSIGN_ADD);
438        } else {
439          token = Token::ADD;
440        }
441        break;
442
443      case '-':
444        // - -- --> -=
445        Advance();
446        if (c0_ == '-') {
447          Advance();
448          if (c0_ == '>' && has_line_terminator_before_next_) {
449            // For compatibility with SpiderMonkey, we skip lines that
450            // start with an HTML comment end '-->'.
451            token = SkipSingleLineComment();
452          } else {
453            token = Token::DEC;
454          }
455        } else if (c0_ == '=') {
456          token = Select(Token::ASSIGN_SUB);
457        } else {
458          token = Token::SUB;
459        }
460        break;
461
462      case '*':
463        // * *=
464        token = Select('=', Token::ASSIGN_MUL, Token::MUL);
465        break;
466
467      case '%':
468        // % %=
469        token = Select('=', Token::ASSIGN_MOD, Token::MOD);
470        break;
471
472      case '/':
473        // /  // /* /=
474        Advance();
475        if (c0_ == '/') {
476          token = SkipSingleLineComment();
477        } else if (c0_ == '*') {
478          token = SkipMultiLineComment();
479        } else if (c0_ == '=') {
480          token = Select(Token::ASSIGN_DIV);
481        } else {
482          token = Token::DIV;
483        }
484        break;
485
486      case '&':
487        // & && &=
488        Advance();
489        if (c0_ == '&') {
490          token = Select(Token::AND);
491        } else if (c0_ == '=') {
492          token = Select(Token::ASSIGN_BIT_AND);
493        } else {
494          token = Token::BIT_AND;
495        }
496        break;
497
498      case '|':
499        // | || |=
500        Advance();
501        if (c0_ == '|') {
502          token = Select(Token::OR);
503        } else if (c0_ == '=') {
504          token = Select(Token::ASSIGN_BIT_OR);
505        } else {
506          token = Token::BIT_OR;
507        }
508        break;
509
510      case '^':
511        // ^ ^=
512        token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
513        break;
514
515      case '.':
516        // . Number
517        Advance();
518        if (IsDecimalDigit(c0_)) {
519          token = ScanNumber(true);
520        } else {
521          token = Token::PERIOD;
522        }
523        break;
524
525      case ':':
526        token = Select(Token::COLON);
527        break;
528
529      case ';':
530        token = Select(Token::SEMICOLON);
531        break;
532
533      case ',':
534        token = Select(Token::COMMA);
535        break;
536
537      case '(':
538        token = Select(Token::LPAREN);
539        break;
540
541      case ')':
542        token = Select(Token::RPAREN);
543        break;
544
545      case '[':
546        token = Select(Token::LBRACK);
547        break;
548
549      case ']':
550        token = Select(Token::RBRACK);
551        break;
552
553      case '{':
554        token = Select(Token::LBRACE);
555        break;
556
557      case '}':
558        token = Select(Token::RBRACE);
559        break;
560
561      case '?':
562        token = Select(Token::CONDITIONAL);
563        break;
564
565      case '~':
566        token = Select(Token::BIT_NOT);
567        break;
568
569      default:
570        if (unicode_cache_->IsIdentifierStart(c0_)) {
571          token = ScanIdentifierOrKeyword();
572        } else if (IsDecimalDigit(c0_)) {
573          token = ScanNumber(false);
574        } else if (SkipWhiteSpace()) {
575          token = Token::WHITESPACE;
576        } else if (c0_ < 0) {
577          token = Token::EOS;
578        } else {
579          token = Select(Token::ILLEGAL);
580        }
581        break;
582    }
583
584    // Continue scanning for tokens as long as we're just skipping
585    // whitespace.
586  } while (token == Token::WHITESPACE);
587
588  next_.location.end_pos = source_pos();
589  next_.token = token;
590}
591
592
593void Scanner::SeekForward(int pos) {
594  // After this call, we will have the token at the given position as
595  // the "next" token. The "current" token will be invalid.
596  if (pos == next_.location.beg_pos) return;
597  int current_pos = source_pos();
598  ASSERT_EQ(next_.location.end_pos, current_pos);
599  // Positions inside the lookahead token aren't supported.
600  ASSERT(pos >= current_pos);
601  if (pos != current_pos) {
602    source_->SeekForward(pos - source_->pos());
603    Advance();
604    // This function is only called to seek to the location
605    // of the end of a function (at the "}" token). It doesn't matter
606    // whether there was a line terminator in the part we skip.
607    has_line_terminator_before_next_ = false;
608    has_multiline_comment_before_next_ = false;
609  }
610  Scan();
611}
612
613
614void Scanner::ScanEscape() {
615  uc32 c = c0_;
616  Advance();
617
618  // Skip escaped newlines.
619  if (unicode_cache_->IsLineTerminator(c)) {
620    // Allow CR+LF newlines in multiline string literals.
621    if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
622    // Allow LF+CR newlines in multiline string literals.
623    if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
624    return;
625  }
626
627  switch (c) {
628    case '\'':  // fall through
629    case '"' :  // fall through
630    case '\\': break;
631    case 'b' : c = '\b'; break;
632    case 'f' : c = '\f'; break;
633    case 'n' : c = '\n'; break;
634    case 'r' : c = '\r'; break;
635    case 't' : c = '\t'; break;
636    case 'u' : {
637      c = ScanHexNumber(4);
638      if (c < 0) c = 'u';
639      break;
640    }
641    case 'v' : c = '\v'; break;
642    case 'x' : {
643      c = ScanHexNumber(2);
644      if (c < 0) c = 'x';
645      break;
646    }
647    case '0' :  // fall through
648    case '1' :  // fall through
649    case '2' :  // fall through
650    case '3' :  // fall through
651    case '4' :  // fall through
652    case '5' :  // fall through
653    case '6' :  // fall through
654    case '7' : c = ScanOctalEscape(c, 2); break;
655  }
656
657  // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
658  // should be illegal, but they are commonly handled
659  // as non-escaped characters by JS VMs.
660  AddLiteralChar(c);
661}
662
663
664// Octal escapes of the forms '\0xx' and '\xxx' are not a part of
665// ECMA-262. Other JS VMs support them.
666uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
667  uc32 x = c - '0';
668  int i = 0;
669  for (; i < length; i++) {
670    int d = c0_ - '0';
671    if (d < 0 || d > 7) break;
672    int nx = x * 8 + d;
673    if (nx >= 256) break;
674    x = nx;
675    Advance();
676  }
677  // Anything except '\0' is an octal escape sequence, illegal in strict mode.
678  // Remember the position of octal escape sequences so that an error
679  // can be reported later (in strict mode).
680  // We don't report the error immediately, because the octal escape can
681  // occur before the "use strict" directive.
682  if (c != '0' || i > 0) {
683    octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
684  }
685  return x;
686}
687
688
689Token::Value Scanner::ScanString() {
690  uc32 quote = c0_;
691  Advance();  // consume quote
692
693  LiteralScope literal(this);
694  while (c0_ != quote && c0_ >= 0
695         && !unicode_cache_->IsLineTerminator(c0_)) {
696    uc32 c = c0_;
697    Advance();
698    if (c == '\\') {
699      if (c0_ < 0) return Token::ILLEGAL;
700      ScanEscape();
701    } else {
702      AddLiteralChar(c);
703    }
704  }
705  if (c0_ != quote) return Token::ILLEGAL;
706  literal.Complete();
707
708  Advance();  // consume quote
709  return Token::STRING;
710}
711
712
713void Scanner::ScanDecimalDigits() {
714  while (IsDecimalDigit(c0_))
715    AddLiteralCharAdvance();
716}
717
718
719Token::Value Scanner::ScanNumber(bool seen_period) {
720  ASSERT(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
721
722  enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
723
724  LiteralScope literal(this);
725  if (seen_period) {
726    // we have already seen a decimal point of the float
727    AddLiteralChar('.');
728    ScanDecimalDigits();  // we know we have at least one digit
729
730  } else {
731    // if the first character is '0' we must check for octals and hex
732    if (c0_ == '0') {
733      int start_pos = source_pos();  // For reporting octal positions.
734      AddLiteralCharAdvance();
735
736      // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
737      if (c0_ == 'x' || c0_ == 'X') {
738        // hex number
739        kind = HEX;
740        AddLiteralCharAdvance();
741        if (!IsHexDigit(c0_)) {
742          // we must have at least one hex digit after 'x'/'X'
743          return Token::ILLEGAL;
744        }
745        while (IsHexDigit(c0_)) {
746          AddLiteralCharAdvance();
747        }
748      } else if ('0' <= c0_ && c0_ <= '7') {
749        // (possible) octal number
750        kind = OCTAL;
751        while (true) {
752          if (c0_ == '8' || c0_ == '9') {
753            kind = DECIMAL;
754            break;
755          }
756          if (c0_  < '0' || '7'  < c0_) {
757            // Octal literal finished.
758            octal_pos_ = Location(start_pos, source_pos());
759            break;
760          }
761          AddLiteralCharAdvance();
762        }
763      }
764    }
765
766    // Parse decimal digits and allow trailing fractional part.
767    if (kind == DECIMAL) {
768      ScanDecimalDigits();  // optional
769      if (c0_ == '.') {
770        AddLiteralCharAdvance();
771        ScanDecimalDigits();  // optional
772      }
773    }
774  }
775
776  // scan exponent, if any
777  if (c0_ == 'e' || c0_ == 'E') {
778    ASSERT(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
779    if (kind == OCTAL) return Token::ILLEGAL;  // no exponent for octals allowed
780    // scan exponent
781    AddLiteralCharAdvance();
782    if (c0_ == '+' || c0_ == '-')
783      AddLiteralCharAdvance();
784    if (!IsDecimalDigit(c0_)) {
785      // we must have at least one decimal digit after 'e'/'E'
786      return Token::ILLEGAL;
787    }
788    ScanDecimalDigits();
789  }
790
791  // The source character immediately following a numeric literal must
792  // not be an identifier start or a decimal digit; see ECMA-262
793  // section 7.8.3, page 17 (note that we read only one decimal digit
794  // if the value is 0).
795  if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))
796    return Token::ILLEGAL;
797
798  literal.Complete();
799
800  return Token::NUMBER;
801}
802
803
804uc32 Scanner::ScanIdentifierUnicodeEscape() {
805  Advance();
806  if (c0_ != 'u') return -1;
807  Advance();
808  uc32 result = ScanHexNumber(4);
809  if (result < 0) PushBack('u');
810  return result;
811}
812
813
814// ----------------------------------------------------------------------------
815// Keyword Matcher
816
// The table of reserved words, grouped by their first character.
// KEYWORD_GROUP(ch) introduces the group of words that start with |ch| and
// KEYWORD(text, token) names one word and the token it scans as.
// The token expressions of "export", "import" and "let" refer to variables
// named harmony_modules and harmony_scoping, which must be in scope
// wherever the table is expanded.
#define KEYWORDS(KEYWORD_GROUP, KEYWORD)                            \
  KEYWORD_GROUP('b')                                                \
  KEYWORD("break", Token::BREAK)                                    \
  KEYWORD_GROUP('c')                                                \
  KEYWORD("case", Token::CASE)                                      \
  KEYWORD("catch", Token::CATCH)                                    \
  KEYWORD("class", Token::FUTURE_RESERVED_WORD)                     \
  KEYWORD("const", Token::CONST)                                    \
  KEYWORD("continue", Token::CONTINUE)                              \
  KEYWORD_GROUP('d')                                                \
  KEYWORD("debugger", Token::DEBUGGER)                              \
  KEYWORD("default", Token::DEFAULT)                                \
  KEYWORD("delete", Token::DELETE)                                  \
  KEYWORD("do", Token::DO)                                          \
  KEYWORD_GROUP('e')                                                \
  KEYWORD("else", Token::ELSE)                                      \
  KEYWORD("enum", Token::FUTURE_RESERVED_WORD)                      \
  KEYWORD("export",                                                 \
          harmony_modules ? Token::EXPORT                           \
                          : Token::FUTURE_RESERVED_WORD)            \
  KEYWORD("extends", Token::FUTURE_RESERVED_WORD)                   \
  KEYWORD_GROUP('f')                                                \
  KEYWORD("false", Token::FALSE_LITERAL)                            \
  KEYWORD("finally", Token::FINALLY)                                \
  KEYWORD("for", Token::FOR)                                        \
  KEYWORD("function", Token::FUNCTION)                              \
  KEYWORD_GROUP('i')                                                \
  KEYWORD("if", Token::IF)                                          \
  KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD)         \
  KEYWORD("import",                                                 \
          harmony_modules ? Token::IMPORT                           \
                          : Token::FUTURE_RESERVED_WORD)            \
  KEYWORD("in", Token::IN)                                          \
  KEYWORD("instanceof", Token::INSTANCEOF)                          \
  KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)          \
  KEYWORD_GROUP('l')                                                \
  KEYWORD("let",                                                    \
          harmony_scoping ? Token::LET                              \
                          : Token::FUTURE_STRICT_RESERVED_WORD)     \
  KEYWORD_GROUP('n')                                                \
  KEYWORD("new", Token::NEW)                                        \
  KEYWORD("null", Token::NULL_LITERAL)                              \
  KEYWORD_GROUP('p')                                                \
  KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD)            \
  KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD)            \
  KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)          \
  KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD)             \
  KEYWORD_GROUP('r')                                                \
  KEYWORD("return", Token::RETURN)                                  \
  KEYWORD_GROUP('s')                                                \
  KEYWORD("static", Token::FUTURE_STRICT_RESERVED_WORD)             \
  KEYWORD("super", Token::FUTURE_RESERVED_WORD)                     \
  KEYWORD("switch", Token::SWITCH)                                  \
  KEYWORD_GROUP('t')                                                \
  KEYWORD("this", Token::THIS)                                      \
  KEYWORD("throw", Token::THROW)                                    \
  KEYWORD("true", Token::TRUE_LITERAL)                              \
  KEYWORD("try", Token::TRY)                                        \
  KEYWORD("typeof", Token::TYPEOF)                                  \
  KEYWORD_GROUP('v')                                                \
  KEYWORD("var", Token::VAR)                                        \
  KEYWORD("void", Token::VOID)                                      \
  KEYWORD_GROUP('w')                                                \
  KEYWORD("while", Token::WHILE)                                    \
  KEYWORD("with", Token::WITH)                                      \
  KEYWORD_GROUP('y')                                                \
  KEYWORD("yield", Token::FUTURE_STRICT_RESERVED_WORD)
881
882
883static Token::Value KeywordOrIdentifierToken(const char* input,
884                                             int input_length,
885                                             bool harmony_scoping,
886                                             bool harmony_modules) {
887  ASSERT(input_length >= 1);
888  const int kMinLength = 2;
889  const int kMaxLength = 10;
890  if (input_length < kMinLength || input_length > kMaxLength) {
891    return Token::IDENTIFIER;
892  }
893  switch (input[0]) {
894    default:
895#define KEYWORD_GROUP_CASE(ch)                                \
896      break;                                                  \
897    case ch:
898#define KEYWORD(keyword, token)                               \
899    {                                                         \
900      /* 'keyword' is a char array, so sizeof(keyword) is */  \
901      /* strlen(keyword) plus 1 for the NUL char. */          \
902      const int keyword_length = sizeof(keyword) - 1;         \
903      STATIC_ASSERT(keyword_length >= kMinLength);            \
904      STATIC_ASSERT(keyword_length <= kMaxLength);            \
905      if (input_length == keyword_length &&                   \
906          input[1] == keyword[1] &&                           \
907          (keyword_length <= 2 || input[2] == keyword[2]) &&  \
908          (keyword_length <= 3 || input[3] == keyword[3]) &&  \
909          (keyword_length <= 4 || input[4] == keyword[4]) &&  \
910          (keyword_length <= 5 || input[5] == keyword[5]) &&  \
911          (keyword_length <= 6 || input[6] == keyword[6]) &&  \
912          (keyword_length <= 7 || input[7] == keyword[7]) &&  \
913          (keyword_length <= 8 || input[8] == keyword[8]) &&  \
914          (keyword_length <= 9 || input[9] == keyword[9])) {  \
915        return token;                                         \
916      }                                                       \
917    }
918    KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
919  }
920  return Token::IDENTIFIER;
921}
922
923
924Token::Value Scanner::ScanIdentifierOrKeyword() {
925  ASSERT(unicode_cache_->IsIdentifierStart(c0_));
926  LiteralScope literal(this);
927  // Scan identifier start character.
928  if (c0_ == '\\') {
929    uc32 c = ScanIdentifierUnicodeEscape();
930    // Only allow legal identifier start characters.
931    if (c < 0 ||
932        c == '\\' ||  // No recursive escapes.
933        !unicode_cache_->IsIdentifierStart(c)) {
934      return Token::ILLEGAL;
935    }
936    AddLiteralChar(c);
937    return ScanIdentifierSuffix(&literal);
938  }
939
940  uc32 first_char = c0_;
941  Advance();
942  AddLiteralChar(first_char);
943
944  // Scan the rest of the identifier characters.
945  while (unicode_cache_->IsIdentifierPart(c0_)) {
946    if (c0_ != '\\') {
947      uc32 next_char = c0_;
948      Advance();
949      AddLiteralChar(next_char);
950      continue;
951    }
952    // Fallthrough if no longer able to complete keyword.
953    return ScanIdentifierSuffix(&literal);
954  }
955
956  literal.Complete();
957
958  if (next_.literal_chars->is_ascii()) {
959    Vector<const char> chars = next_.literal_chars->ascii_literal();
960    return KeywordOrIdentifierToken(chars.start(),
961                                    chars.length(),
962                                    harmony_scoping_,
963                                    harmony_modules_);
964  }
965
966  return Token::IDENTIFIER;
967}
968
969
970Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal) {
971  // Scan the rest of the identifier characters.
972  while (unicode_cache_->IsIdentifierPart(c0_)) {
973    if (c0_ == '\\') {
974      uc32 c = ScanIdentifierUnicodeEscape();
975      // Only allow legal identifier part characters.
976      if (c < 0 ||
977          c == '\\' ||
978          !unicode_cache_->IsIdentifierPart(c)) {
979        return Token::ILLEGAL;
980      }
981      AddLiteralChar(c);
982    } else {
983      AddLiteralChar(c0_);
984      Advance();
985    }
986  }
987  literal->Complete();
988
989  return Token::IDENTIFIER;
990}
991
992
993bool Scanner::ScanRegExpPattern(bool seen_equal) {
994  // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
995  bool in_character_class = false;
996
997  // Previous token is either '/' or '/=', in the second case, the
998  // pattern starts at =.
999  next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
1000  next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1001
1002  // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1003  // the scanner should pass uninterpreted bodies to the RegExp
1004  // constructor.
1005  LiteralScope literal(this);
1006  if (seen_equal) {
1007    AddLiteralChar('=');
1008  }
1009
1010  while (c0_ != '/' || in_character_class) {
1011    if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
1012    if (c0_ == '\\') {  // Escape sequence.
1013      AddLiteralCharAdvance();
1014      if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
1015      AddLiteralCharAdvance();
1016      // If the escape allows more characters, i.e., \x??, \u????, or \c?,
1017      // only "safe" characters are allowed (letters, digits, underscore),
1018      // otherwise the escape isn't valid and the invalid character has
1019      // its normal meaning. I.e., we can just continue scanning without
1020      // worrying whether the following characters are part of the escape
1021      // or not, since any '/', '\\' or '[' is guaranteed to not be part
1022      // of the escape sequence.
1023
1024      // TODO(896): At some point, parse RegExps more throughly to capture
1025      // octal esacpes in strict mode.
1026    } else {  // Unescaped character.
1027      if (c0_ == '[') in_character_class = true;
1028      if (c0_ == ']') in_character_class = false;
1029      AddLiteralCharAdvance();
1030    }
1031  }
1032  Advance();  // consume '/'
1033
1034  literal.Complete();
1035
1036  return true;
1037}
1038
1039
1040bool Scanner::ScanLiteralUnicodeEscape() {
1041  ASSERT(c0_ == '\\');
1042  uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0};
1043  Advance();
1044  int i = 1;
1045  if (c0_ == 'u') {
1046    i++;
1047    while (i < 6) {
1048      Advance();
1049      if (!IsHexDigit(c0_)) break;
1050      chars_read[i] = c0_;
1051      i++;
1052    }
1053  }
1054  if (i < 6) {
1055    // Incomplete escape. Undo all advances and return false.
1056    while (i > 0) {
1057      i--;
1058      PushBack(chars_read[i]);
1059    }
1060    return false;
1061  }
1062  // Complete escape. Add all chars to current literal buffer.
1063  for (int i = 0; i < 6; i++) {
1064    AddLiteralChar(chars_read[i]);
1065  }
1066  return true;
1067}
1068
1069
1070bool Scanner::ScanRegExpFlags() {
1071  // Scan regular expression flags.
1072  LiteralScope literal(this);
1073  while (unicode_cache_->IsIdentifierPart(c0_)) {
1074    if (c0_ != '\\') {
1075      AddLiteralCharAdvance();
1076    } else {
1077      if (!ScanLiteralUnicodeEscape()) {
1078        break;
1079      }
1080    }
1081  }
1082  literal.Complete();
1083
1084  next_.location.end_pos = source_pos() - 1;
1085  return true;
1086}
1087
1088} }  // namespace v8::internal
1089