scanner.cc revision 592a9fc1d8ea420377a2e7efd0600e20b058be2b
1// Copyright 2011 the V8 project authors. All rights reserved.
2// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6//     * Redistributions of source code must retain the above copyright
7//       notice, this list of conditions and the following disclaimer.
8//     * Redistributions in binary form must reproduce the above
9//       copyright notice, this list of conditions and the following
10//       disclaimer in the documentation and/or other materials provided
11//       with the distribution.
12//     * Neither the name of Google Inc. nor the names of its
13//       contributors may be used to endorse or promote products derived
14//       from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28// Features shared by parsing and pre-parsing scanners.
29
30#include "scanner.h"
31
32#include "../include/v8stdint.h"
33#include "char-predicates-inl.h"
34
35namespace v8 {
36namespace internal {
37
38// ----------------------------------------------------------------------------
39// Scanner
40
41Scanner::Scanner(UnicodeCache* unicode_cache)
42    : unicode_cache_(unicode_cache),
43      octal_pos_(Location::invalid()),
44      harmony_scoping_(false) { }
45
46
47void Scanner::Initialize(UC16CharacterStream* source) {
48  source_ = source;
49  // Need to capture identifiers in order to recognize "get" and "set"
50  // in object literals.
51  Init();
52  // Skip initial whitespace allowing HTML comment ends just like
53  // after a newline and scan first token.
54  has_line_terminator_before_next_ = true;
55  SkipWhiteSpace();
56  Scan();
57}
58
59
60uc32 Scanner::ScanHexNumber(int expected_length) {
61  ASSERT(expected_length <= 4);  // prevent overflow
62
63  uc32 digits[4] = { 0, 0, 0, 0 };
64  uc32 x = 0;
65  for (int i = 0; i < expected_length; i++) {
66    digits[i] = c0_;
67    int d = HexValue(c0_);
68    if (d < 0) {
69      // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
70      // should be illegal, but other JS VMs just return the
71      // non-escaped version of the original character.
72
73      // Push back digits that we have advanced past.
74      for (int j = i-1; j >= 0; j--) {
75        PushBack(digits[j]);
76      }
77      return -1;
78    }
79    x = x * 16 + d;
80    Advance();
81  }
82
83  return x;
84}
85
86
87// Ensure that tokens can be stored in a byte.
88STATIC_ASSERT(Token::NUM_TOKENS <= 0x100);
89
90// Table of one-character tokens, by character (0x00..0x7f only).
91static const byte one_char_tokens[] = {
92  Token::ILLEGAL,
93  Token::ILLEGAL,
94  Token::ILLEGAL,
95  Token::ILLEGAL,
96  Token::ILLEGAL,
97  Token::ILLEGAL,
98  Token::ILLEGAL,
99  Token::ILLEGAL,
100  Token::ILLEGAL,
101  Token::ILLEGAL,
102  Token::ILLEGAL,
103  Token::ILLEGAL,
104  Token::ILLEGAL,
105  Token::ILLEGAL,
106  Token::ILLEGAL,
107  Token::ILLEGAL,
108  Token::ILLEGAL,
109  Token::ILLEGAL,
110  Token::ILLEGAL,
111  Token::ILLEGAL,
112  Token::ILLEGAL,
113  Token::ILLEGAL,
114  Token::ILLEGAL,
115  Token::ILLEGAL,
116  Token::ILLEGAL,
117  Token::ILLEGAL,
118  Token::ILLEGAL,
119  Token::ILLEGAL,
120  Token::ILLEGAL,
121  Token::ILLEGAL,
122  Token::ILLEGAL,
123  Token::ILLEGAL,
124  Token::ILLEGAL,
125  Token::ILLEGAL,
126  Token::ILLEGAL,
127  Token::ILLEGAL,
128  Token::ILLEGAL,
129  Token::ILLEGAL,
130  Token::ILLEGAL,
131  Token::ILLEGAL,
132  Token::LPAREN,       // 0x28
133  Token::RPAREN,       // 0x29
134  Token::ILLEGAL,
135  Token::ILLEGAL,
136  Token::COMMA,        // 0x2c
137  Token::ILLEGAL,
138  Token::ILLEGAL,
139  Token::ILLEGAL,
140  Token::ILLEGAL,
141  Token::ILLEGAL,
142  Token::ILLEGAL,
143  Token::ILLEGAL,
144  Token::ILLEGAL,
145  Token::ILLEGAL,
146  Token::ILLEGAL,
147  Token::ILLEGAL,
148  Token::ILLEGAL,
149  Token::ILLEGAL,
150  Token::COLON,        // 0x3a
151  Token::SEMICOLON,    // 0x3b
152  Token::ILLEGAL,
153  Token::ILLEGAL,
154  Token::ILLEGAL,
155  Token::CONDITIONAL,  // 0x3f
156  Token::ILLEGAL,
157  Token::ILLEGAL,
158  Token::ILLEGAL,
159  Token::ILLEGAL,
160  Token::ILLEGAL,
161  Token::ILLEGAL,
162  Token::ILLEGAL,
163  Token::ILLEGAL,
164  Token::ILLEGAL,
165  Token::ILLEGAL,
166  Token::ILLEGAL,
167  Token::ILLEGAL,
168  Token::ILLEGAL,
169  Token::ILLEGAL,
170  Token::ILLEGAL,
171  Token::ILLEGAL,
172  Token::ILLEGAL,
173  Token::ILLEGAL,
174  Token::ILLEGAL,
175  Token::ILLEGAL,
176  Token::ILLEGAL,
177  Token::ILLEGAL,
178  Token::ILLEGAL,
179  Token::ILLEGAL,
180  Token::ILLEGAL,
181  Token::ILLEGAL,
182  Token::ILLEGAL,
183  Token::LBRACK,     // 0x5b
184  Token::ILLEGAL,
185  Token::RBRACK,     // 0x5d
186  Token::ILLEGAL,
187  Token::ILLEGAL,
188  Token::ILLEGAL,
189  Token::ILLEGAL,
190  Token::ILLEGAL,
191  Token::ILLEGAL,
192  Token::ILLEGAL,
193  Token::ILLEGAL,
194  Token::ILLEGAL,
195  Token::ILLEGAL,
196  Token::ILLEGAL,
197  Token::ILLEGAL,
198  Token::ILLEGAL,
199  Token::ILLEGAL,
200  Token::ILLEGAL,
201  Token::ILLEGAL,
202  Token::ILLEGAL,
203  Token::ILLEGAL,
204  Token::ILLEGAL,
205  Token::ILLEGAL,
206  Token::ILLEGAL,
207  Token::ILLEGAL,
208  Token::ILLEGAL,
209  Token::ILLEGAL,
210  Token::ILLEGAL,
211  Token::ILLEGAL,
212  Token::ILLEGAL,
213  Token::ILLEGAL,
214  Token::ILLEGAL,
215  Token::LBRACE,       // 0x7b
216  Token::ILLEGAL,
217  Token::RBRACE,       // 0x7d
218  Token::BIT_NOT,      // 0x7e
219  Token::ILLEGAL
220};
221
222
223Token::Value Scanner::Next() {
224  current_ = next_;
225  has_line_terminator_before_next_ = false;
226  has_multiline_comment_before_next_ = false;
227  if (static_cast<unsigned>(c0_) <= 0x7f) {
228    Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]);
229    if (token != Token::ILLEGAL) {
230      int pos = source_pos();
231      next_.token = token;
232      next_.location.beg_pos = pos;
233      next_.location.end_pos = pos + 1;
234      Advance();
235      return current_.token;
236    }
237  }
238  Scan();
239  return current_.token;
240}
241
242
243static inline bool IsByteOrderMark(uc32 c) {
244  // The Unicode value U+FFFE is guaranteed never to be assigned as a
245  // Unicode character; this implies that in a Unicode context the
246  // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
247  // character expressed in little-endian byte order (since it could
248  // not be a U+FFFE character expressed in big-endian byte
249  // order). Nevertheless, we check for it to be compatible with
250  // Spidermonkey.
251  return c == 0xFEFF || c == 0xFFFE;
252}
253
254
255bool Scanner::SkipWhiteSpace() {
256  int start_position = source_pos();
257
258  while (true) {
259    // We treat byte-order marks (BOMs) as whitespace for better
260    // compatibility with Spidermonkey and other JavaScript engines.
261    while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) {
262      // IsWhiteSpace() includes line terminators!
263      if (unicode_cache_->IsLineTerminator(c0_)) {
264        // Ignore line terminators, but remember them. This is necessary
265        // for automatic semicolon insertion.
266        has_line_terminator_before_next_ = true;
267      }
268      Advance();
269    }
270
271    // If there is an HTML comment end '-->' at the beginning of a
272    // line (with only whitespace in front of it), we treat the rest
273    // of the line as a comment. This is in line with the way
274    // SpiderMonkey handles it.
275    if (c0_ == '-' && has_line_terminator_before_next_) {
276      Advance();
277      if (c0_ == '-') {
278        Advance();
279        if (c0_ == '>') {
280          // Treat the rest of the line as a comment.
281          SkipSingleLineComment();
282          // Continue skipping white space after the comment.
283          continue;
284        }
285        PushBack('-');  // undo Advance()
286      }
287      PushBack('-');  // undo Advance()
288    }
289    // Return whether or not we skipped any characters.
290    return source_pos() != start_position;
291  }
292}
293
294
295Token::Value Scanner::SkipSingleLineComment() {
296  Advance();
297
298  // The line terminator at the end of the line is not considered
299  // to be part of the single-line comment; it is recognized
300  // separately by the lexical grammar and becomes part of the
301  // stream of input elements for the syntactic grammar (see
302  // ECMA-262, section 7.4).
303  while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
304    Advance();
305  }
306
307  return Token::WHITESPACE;
308}
309
310
311Token::Value Scanner::SkipMultiLineComment() {
312  ASSERT(c0_ == '*');
313  Advance();
314
315  while (c0_ >= 0) {
316    uc32 ch = c0_;
317    Advance();
318    if (unicode_cache_->IsLineTerminator(ch)) {
319      // Following ECMA-262, section 7.4, a comment containing
320      // a newline will make the comment count as a line-terminator.
321      has_multiline_comment_before_next_ = true;
322    }
323    // If we have reached the end of the multi-line comment, we
324    // consume the '/' and insert a whitespace. This way all
325    // multi-line comments are treated as whitespace.
326    if (ch == '*' && c0_ == '/') {
327      c0_ = ' ';
328      return Token::WHITESPACE;
329    }
330  }
331
332  // Unterminated multi-line comment.
333  return Token::ILLEGAL;
334}
335
336
337Token::Value Scanner::ScanHtmlComment() {
338  // Check for <!-- comments.
339  ASSERT(c0_ == '!');
340  Advance();
341  if (c0_ == '-') {
342    Advance();
343    if (c0_ == '-') return SkipSingleLineComment();
344    PushBack('-');  // undo Advance()
345  }
346  PushBack('!');  // undo Advance()
347  ASSERT(c0_ == '!');
348  return Token::LT;
349}
350
351
352void Scanner::Scan() {
353  next_.literal_chars = NULL;
354  Token::Value token;
355  do {
356    // Remember the position of the next token
357    next_.location.beg_pos = source_pos();
358
359    switch (c0_) {
360      case ' ':
361      case '\t':
362        Advance();
363        token = Token::WHITESPACE;
364        break;
365
366      case '\n':
367        Advance();
368        has_line_terminator_before_next_ = true;
369        token = Token::WHITESPACE;
370        break;
371
372      case '"': case '\'':
373        token = ScanString();
374        break;
375
376      case '<':
377        // < <= << <<= <!--
378        Advance();
379        if (c0_ == '=') {
380          token = Select(Token::LTE);
381        } else if (c0_ == '<') {
382          token = Select('=', Token::ASSIGN_SHL, Token::SHL);
383        } else if (c0_ == '!') {
384          token = ScanHtmlComment();
385        } else {
386          token = Token::LT;
387        }
388        break;
389
390      case '>':
391        // > >= >> >>= >>> >>>=
392        Advance();
393        if (c0_ == '=') {
394          token = Select(Token::GTE);
395        } else if (c0_ == '>') {
396          // >> >>= >>> >>>=
397          Advance();
398          if (c0_ == '=') {
399            token = Select(Token::ASSIGN_SAR);
400          } else if (c0_ == '>') {
401            token = Select('=', Token::ASSIGN_SHR, Token::SHR);
402          } else {
403            token = Token::SAR;
404          }
405        } else {
406          token = Token::GT;
407        }
408        break;
409
410      case '=':
411        // = == ===
412        Advance();
413        if (c0_ == '=') {
414          token = Select('=', Token::EQ_STRICT, Token::EQ);
415        } else {
416          token = Token::ASSIGN;
417        }
418        break;
419
420      case '!':
421        // ! != !==
422        Advance();
423        if (c0_ == '=') {
424          token = Select('=', Token::NE_STRICT, Token::NE);
425        } else {
426          token = Token::NOT;
427        }
428        break;
429
430      case '+':
431        // + ++ +=
432        Advance();
433        if (c0_ == '+') {
434          token = Select(Token::INC);
435        } else if (c0_ == '=') {
436          token = Select(Token::ASSIGN_ADD);
437        } else {
438          token = Token::ADD;
439        }
440        break;
441
442      case '-':
443        // - -- --> -=
444        Advance();
445        if (c0_ == '-') {
446          Advance();
447          if (c0_ == '>' && has_line_terminator_before_next_) {
448            // For compatibility with SpiderMonkey, we skip lines that
449            // start with an HTML comment end '-->'.
450            token = SkipSingleLineComment();
451          } else {
452            token = Token::DEC;
453          }
454        } else if (c0_ == '=') {
455          token = Select(Token::ASSIGN_SUB);
456        } else {
457          token = Token::SUB;
458        }
459        break;
460
461      case '*':
462        // * *=
463        token = Select('=', Token::ASSIGN_MUL, Token::MUL);
464        break;
465
466      case '%':
467        // % %=
468        token = Select('=', Token::ASSIGN_MOD, Token::MOD);
469        break;
470
471      case '/':
472        // /  // /* /=
473        Advance();
474        if (c0_ == '/') {
475          token = SkipSingleLineComment();
476        } else if (c0_ == '*') {
477          token = SkipMultiLineComment();
478        } else if (c0_ == '=') {
479          token = Select(Token::ASSIGN_DIV);
480        } else {
481          token = Token::DIV;
482        }
483        break;
484
485      case '&':
486        // & && &=
487        Advance();
488        if (c0_ == '&') {
489          token = Select(Token::AND);
490        } else if (c0_ == '=') {
491          token = Select(Token::ASSIGN_BIT_AND);
492        } else {
493          token = Token::BIT_AND;
494        }
495        break;
496
497      case '|':
498        // | || |=
499        Advance();
500        if (c0_ == '|') {
501          token = Select(Token::OR);
502        } else if (c0_ == '=') {
503          token = Select(Token::ASSIGN_BIT_OR);
504        } else {
505          token = Token::BIT_OR;
506        }
507        break;
508
509      case '^':
510        // ^ ^=
511        token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
512        break;
513
514      case '.':
515        // . Number
516        Advance();
517        if (IsDecimalDigit(c0_)) {
518          token = ScanNumber(true);
519        } else {
520          token = Token::PERIOD;
521        }
522        break;
523
524      case ':':
525        token = Select(Token::COLON);
526        break;
527
528      case ';':
529        token = Select(Token::SEMICOLON);
530        break;
531
532      case ',':
533        token = Select(Token::COMMA);
534        break;
535
536      case '(':
537        token = Select(Token::LPAREN);
538        break;
539
540      case ')':
541        token = Select(Token::RPAREN);
542        break;
543
544      case '[':
545        token = Select(Token::LBRACK);
546        break;
547
548      case ']':
549        token = Select(Token::RBRACK);
550        break;
551
552      case '{':
553        token = Select(Token::LBRACE);
554        break;
555
556      case '}':
557        token = Select(Token::RBRACE);
558        break;
559
560      case '?':
561        token = Select(Token::CONDITIONAL);
562        break;
563
564      case '~':
565        token = Select(Token::BIT_NOT);
566        break;
567
568      default:
569        if (unicode_cache_->IsIdentifierStart(c0_)) {
570          token = ScanIdentifierOrKeyword();
571        } else if (IsDecimalDigit(c0_)) {
572          token = ScanNumber(false);
573        } else if (SkipWhiteSpace()) {
574          token = Token::WHITESPACE;
575        } else if (c0_ < 0) {
576          token = Token::EOS;
577        } else {
578          token = Select(Token::ILLEGAL);
579        }
580        break;
581    }
582
583    // Continue scanning for tokens as long as we're just skipping
584    // whitespace.
585  } while (token == Token::WHITESPACE);
586
587  next_.location.end_pos = source_pos();
588  next_.token = token;
589}
590
591
592void Scanner::SeekForward(int pos) {
593  // After this call, we will have the token at the given position as
594  // the "next" token. The "current" token will be invalid.
595  if (pos == next_.location.beg_pos) return;
596  int current_pos = source_pos();
597  ASSERT_EQ(next_.location.end_pos, current_pos);
598  // Positions inside the lookahead token aren't supported.
599  ASSERT(pos >= current_pos);
600  if (pos != current_pos) {
601    source_->SeekForward(pos - source_->pos());
602    Advance();
603    // This function is only called to seek to the location
604    // of the end of a function (at the "}" token). It doesn't matter
605    // whether there was a line terminator in the part we skip.
606    has_line_terminator_before_next_ = false;
607    has_multiline_comment_before_next_ = false;
608  }
609  Scan();
610}
611
612
613void Scanner::ScanEscape() {
614  uc32 c = c0_;
615  Advance();
616
617  // Skip escaped newlines.
618  if (unicode_cache_->IsLineTerminator(c)) {
619    // Allow CR+LF newlines in multiline string literals.
620    if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
621    // Allow LF+CR newlines in multiline string literals.
622    if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
623    return;
624  }
625
626  switch (c) {
627    case '\'':  // fall through
628    case '"' :  // fall through
629    case '\\': break;
630    case 'b' : c = '\b'; break;
631    case 'f' : c = '\f'; break;
632    case 'n' : c = '\n'; break;
633    case 'r' : c = '\r'; break;
634    case 't' : c = '\t'; break;
635    case 'u' : {
636      c = ScanHexNumber(4);
637      if (c < 0) c = 'u';
638      break;
639    }
640    case 'v' : c = '\v'; break;
641    case 'x' : {
642      c = ScanHexNumber(2);
643      if (c < 0) c = 'x';
644      break;
645    }
646    case '0' :  // fall through
647    case '1' :  // fall through
648    case '2' :  // fall through
649    case '3' :  // fall through
650    case '4' :  // fall through
651    case '5' :  // fall through
652    case '6' :  // fall through
653    case '7' : c = ScanOctalEscape(c, 2); break;
654  }
655
656  // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
657  // should be illegal, but they are commonly handled
658  // as non-escaped characters by JS VMs.
659  AddLiteralChar(c);
660}
661
662
663// Octal escapes of the forms '\0xx' and '\xxx' are not a part of
664// ECMA-262. Other JS VMs support them.
665uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
666  uc32 x = c - '0';
667  int i = 0;
668  for (; i < length; i++) {
669    int d = c0_ - '0';
670    if (d < 0 || d > 7) break;
671    int nx = x * 8 + d;
672    if (nx >= 256) break;
673    x = nx;
674    Advance();
675  }
676  // Anything except '\0' is an octal escape sequence, illegal in strict mode.
677  // Remember the position of octal escape sequences so that an error
678  // can be reported later (in strict mode).
679  // We don't report the error immediately, because the octal escape can
680  // occur before the "use strict" directive.
681  if (c != '0' || i > 0) {
682    octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1);
683  }
684  return x;
685}
686
687
688Token::Value Scanner::ScanString() {
689  uc32 quote = c0_;
690  Advance();  // consume quote
691
692  LiteralScope literal(this);
693  while (c0_ != quote && c0_ >= 0
694         && !unicode_cache_->IsLineTerminator(c0_)) {
695    uc32 c = c0_;
696    Advance();
697    if (c == '\\') {
698      if (c0_ < 0) return Token::ILLEGAL;
699      ScanEscape();
700    } else {
701      AddLiteralChar(c);
702    }
703  }
704  if (c0_ != quote) return Token::ILLEGAL;
705  literal.Complete();
706
707  Advance();  // consume quote
708  return Token::STRING;
709}
710
711
712void Scanner::ScanDecimalDigits() {
713  while (IsDecimalDigit(c0_))
714    AddLiteralCharAdvance();
715}
716
717
718Token::Value Scanner::ScanNumber(bool seen_period) {
719  ASSERT(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
720
721  enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
722
723  LiteralScope literal(this);
724  if (seen_period) {
725    // we have already seen a decimal point of the float
726    AddLiteralChar('.');
727    ScanDecimalDigits();  // we know we have at least one digit
728
729  } else {
730    // if the first character is '0' we must check for octals and hex
731    if (c0_ == '0') {
732      int start_pos = source_pos();  // For reporting octal positions.
733      AddLiteralCharAdvance();
734
735      // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
736      if (c0_ == 'x' || c0_ == 'X') {
737        // hex number
738        kind = HEX;
739        AddLiteralCharAdvance();
740        if (!IsHexDigit(c0_)) {
741          // we must have at least one hex digit after 'x'/'X'
742          return Token::ILLEGAL;
743        }
744        while (IsHexDigit(c0_)) {
745          AddLiteralCharAdvance();
746        }
747      } else if ('0' <= c0_ && c0_ <= '7') {
748        // (possible) octal number
749        kind = OCTAL;
750        while (true) {
751          if (c0_ == '8' || c0_ == '9') {
752            kind = DECIMAL;
753            break;
754          }
755          if (c0_  < '0' || '7'  < c0_) {
756            // Octal literal finished.
757            octal_pos_ = Location(start_pos, source_pos());
758            break;
759          }
760          AddLiteralCharAdvance();
761        }
762      }
763    }
764
765    // Parse decimal digits and allow trailing fractional part.
766    if (kind == DECIMAL) {
767      ScanDecimalDigits();  // optional
768      if (c0_ == '.') {
769        AddLiteralCharAdvance();
770        ScanDecimalDigits();  // optional
771      }
772    }
773  }
774
775  // scan exponent, if any
776  if (c0_ == 'e' || c0_ == 'E') {
777    ASSERT(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
778    if (kind == OCTAL) return Token::ILLEGAL;  // no exponent for octals allowed
779    // scan exponent
780    AddLiteralCharAdvance();
781    if (c0_ == '+' || c0_ == '-')
782      AddLiteralCharAdvance();
783    if (!IsDecimalDigit(c0_)) {
784      // we must have at least one decimal digit after 'e'/'E'
785      return Token::ILLEGAL;
786    }
787    ScanDecimalDigits();
788  }
789
790  // The source character immediately following a numeric literal must
791  // not be an identifier start or a decimal digit; see ECMA-262
792  // section 7.8.3, page 17 (note that we read only one decimal digit
793  // if the value is 0).
794  if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))
795    return Token::ILLEGAL;
796
797  literal.Complete();
798
799  return Token::NUMBER;
800}
801
802
803uc32 Scanner::ScanIdentifierUnicodeEscape() {
804  Advance();
805  if (c0_ != 'u') return -1;
806  Advance();
807  uc32 result = ScanHexNumber(4);
808  if (result < 0) PushBack('u');
809  return result;
810}
811
812
813// ----------------------------------------------------------------------------
814// Keyword Matcher
815
// X-macro listing every keyword together with the token it scans to.
// GROUP(ch) opens the group of keywords whose first character is ch;
// WORD(text, token) describes one keyword of the current group. The
// entry for "let" refers to a variable named harmony_scoping that must
// be in scope wherever the WORD entries are expanded as expressions.
#define KEYWORDS(GROUP, WORD)                                    \
  GROUP('b')                                                     \
  WORD("break", Token::BREAK)                                    \
  GROUP('c')                                                     \
  WORD("case", Token::CASE)                                      \
  WORD("catch", Token::CATCH)                                    \
  WORD("class", Token::FUTURE_RESERVED_WORD)                     \
  WORD("const", Token::CONST)                                    \
  WORD("continue", Token::CONTINUE)                              \
  GROUP('d')                                                     \
  WORD("debugger", Token::DEBUGGER)                              \
  WORD("default", Token::DEFAULT)                                \
  WORD("delete", Token::DELETE)                                  \
  WORD("do", Token::DO)                                          \
  GROUP('e')                                                     \
  WORD("else", Token::ELSE)                                      \
  WORD("enum", Token::FUTURE_RESERVED_WORD)                      \
  WORD("export", Token::FUTURE_RESERVED_WORD)                    \
  WORD("extends", Token::FUTURE_RESERVED_WORD)                   \
  GROUP('f')                                                     \
  WORD("false", Token::FALSE_LITERAL)                            \
  WORD("finally", Token::FINALLY)                                \
  WORD("for", Token::FOR)                                        \
  WORD("function", Token::FUNCTION)                              \
  GROUP('i')                                                     \
  WORD("if", Token::IF)                                          \
  WORD("implements", Token::FUTURE_STRICT_RESERVED_WORD)         \
  WORD("import", Token::FUTURE_RESERVED_WORD)                    \
  WORD("in", Token::IN)                                          \
  WORD("instanceof", Token::INSTANCEOF)                          \
  WORD("interface", Token::FUTURE_STRICT_RESERVED_WORD)          \
  GROUP('l')                                                     \
  WORD("let", harmony_scoping                                    \
              ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \
  GROUP('n')                                                     \
  WORD("new", Token::NEW)                                        \
  WORD("null", Token::NULL_LITERAL)                              \
  GROUP('p')                                                     \
  WORD("package", Token::FUTURE_STRICT_RESERVED_WORD)            \
  WORD("private", Token::FUTURE_STRICT_RESERVED_WORD)            \
  WORD("protected", Token::FUTURE_STRICT_RESERVED_WORD)          \
  WORD("public", Token::FUTURE_STRICT_RESERVED_WORD)             \
  GROUP('r')                                                     \
  WORD("return", Token::RETURN)                                  \
  GROUP('s')                                                     \
  WORD("static", Token::FUTURE_STRICT_RESERVED_WORD)             \
  WORD("super", Token::FUTURE_RESERVED_WORD)                     \
  WORD("switch", Token::SWITCH)                                  \
  GROUP('t')                                                     \
  WORD("this", Token::THIS)                                      \
  WORD("throw", Token::THROW)                                    \
  WORD("true", Token::TRUE_LITERAL)                              \
  WORD("try", Token::TRY)                                        \
  WORD("typeof", Token::TYPEOF)                                  \
  GROUP('v')                                                     \
  WORD("var", Token::VAR)                                        \
  WORD("void", Token::VOID)                                      \
  GROUP('w')                                                     \
  WORD("while", Token::WHILE)                                    \
  WORD("with", Token::WITH)                                      \
  GROUP('y')                                                     \
  WORD("yield", Token::FUTURE_STRICT_RESERVED_WORD)
878
879
880static Token::Value KeywordOrIdentifierToken(const char* input,
881                                             int input_length,
882                                             bool harmony_scoping) {
883  ASSERT(input_length >= 1);
884  const int kMinLength = 2;
885  const int kMaxLength = 10;
886  if (input_length < kMinLength || input_length > kMaxLength) {
887    return Token::IDENTIFIER;
888  }
889  switch (input[0]) {
890    default:
891#define KEYWORD_GROUP_CASE(ch)                                \
892      break;                                                  \
893    case ch:
894#define KEYWORD(keyword, token)                               \
895    {                                                         \
896      /* 'keyword' is a char array, so sizeof(keyword) is */  \
897      /* strlen(keyword) plus 1 for the NUL char. */          \
898      const int keyword_length = sizeof(keyword) - 1;         \
899      STATIC_ASSERT(keyword_length >= kMinLength);            \
900      STATIC_ASSERT(keyword_length <= kMaxLength);            \
901      if (input_length == keyword_length &&                   \
902          input[1] == keyword[1] &&                           \
903          (keyword_length <= 2 || input[2] == keyword[2]) &&  \
904          (keyword_length <= 3 || input[3] == keyword[3]) &&  \
905          (keyword_length <= 4 || input[4] == keyword[4]) &&  \
906          (keyword_length <= 5 || input[5] == keyword[5]) &&  \
907          (keyword_length <= 6 || input[6] == keyword[6]) &&  \
908          (keyword_length <= 7 || input[7] == keyword[7]) &&  \
909          (keyword_length <= 8 || input[8] == keyword[8]) &&  \
910          (keyword_length <= 9 || input[9] == keyword[9])) {  \
911        return token;                                         \
912      }                                                       \
913    }
914    KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD)
915  }
916  return Token::IDENTIFIER;
917}
918
919
920Token::Value Scanner::ScanIdentifierOrKeyword() {
921  ASSERT(unicode_cache_->IsIdentifierStart(c0_));
922  LiteralScope literal(this);
923  // Scan identifier start character.
924  if (c0_ == '\\') {
925    uc32 c = ScanIdentifierUnicodeEscape();
926    // Only allow legal identifier start characters.
927    if (c < 0 ||
928        c == '\\' ||  // No recursive escapes.
929        !unicode_cache_->IsIdentifierStart(c)) {
930      return Token::ILLEGAL;
931    }
932    AddLiteralChar(c);
933    return ScanIdentifierSuffix(&literal);
934  }
935
936  uc32 first_char = c0_;
937  Advance();
938  AddLiteralChar(first_char);
939
940  // Scan the rest of the identifier characters.
941  while (unicode_cache_->IsIdentifierPart(c0_)) {
942    if (c0_ != '\\') {
943      uc32 next_char = c0_;
944      Advance();
945      AddLiteralChar(next_char);
946      continue;
947    }
948    // Fallthrough if no longer able to complete keyword.
949    return ScanIdentifierSuffix(&literal);
950  }
951
952  literal.Complete();
953
954  if (next_.literal_chars->is_ascii()) {
955    Vector<const char> chars = next_.literal_chars->ascii_literal();
956    return KeywordOrIdentifierToken(chars.start(),
957                                    chars.length(),
958                                    harmony_scoping_);
959  }
960
961  return Token::IDENTIFIER;
962}
963
964
965Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal) {
966  // Scan the rest of the identifier characters.
967  while (unicode_cache_->IsIdentifierPart(c0_)) {
968    if (c0_ == '\\') {
969      uc32 c = ScanIdentifierUnicodeEscape();
970      // Only allow legal identifier part characters.
971      if (c < 0 ||
972          c == '\\' ||
973          !unicode_cache_->IsIdentifierPart(c)) {
974        return Token::ILLEGAL;
975      }
976      AddLiteralChar(c);
977    } else {
978      AddLiteralChar(c0_);
979      Advance();
980    }
981  }
982  literal->Complete();
983
984  return Token::IDENTIFIER;
985}
986
987
988bool Scanner::ScanRegExpPattern(bool seen_equal) {
989  // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
990  bool in_character_class = false;
991
992  // Previous token is either '/' or '/=', in the second case, the
993  // pattern starts at =.
994  next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
995  next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
996
997  // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
998  // the scanner should pass uninterpreted bodies to the RegExp
999  // constructor.
1000  LiteralScope literal(this);
1001  if (seen_equal) {
1002    AddLiteralChar('=');
1003  }
1004
1005  while (c0_ != '/' || in_character_class) {
1006    if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
1007    if (c0_ == '\\') {  // Escape sequence.
1008      AddLiteralCharAdvance();
1009      if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
1010      AddLiteralCharAdvance();
1011      // If the escape allows more characters, i.e., \x??, \u????, or \c?,
1012      // only "safe" characters are allowed (letters, digits, underscore),
1013      // otherwise the escape isn't valid and the invalid character has
1014      // its normal meaning. I.e., we can just continue scanning without
1015      // worrying whether the following characters are part of the escape
1016      // or not, since any '/', '\\' or '[' is guaranteed to not be part
1017      // of the escape sequence.
1018
1019      // TODO(896): At some point, parse RegExps more throughly to capture
1020      // octal esacpes in strict mode.
1021    } else {  // Unescaped character.
1022      if (c0_ == '[') in_character_class = true;
1023      if (c0_ == ']') in_character_class = false;
1024      AddLiteralCharAdvance();
1025    }
1026  }
1027  Advance();  // consume '/'
1028
1029  literal.Complete();
1030
1031  return true;
1032}
1033
1034
1035bool Scanner::ScanLiteralUnicodeEscape() {
1036  ASSERT(c0_ == '\\');
1037  uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0};
1038  Advance();
1039  int i = 1;
1040  if (c0_ == 'u') {
1041    i++;
1042    while (i < 6) {
1043      Advance();
1044      if (!IsHexDigit(c0_)) break;
1045      chars_read[i] = c0_;
1046      i++;
1047    }
1048  }
1049  if (i < 6) {
1050    // Incomplete escape. Undo all advances and return false.
1051    while (i > 0) {
1052      i--;
1053      PushBack(chars_read[i]);
1054    }
1055    return false;
1056  }
1057  // Complete escape. Add all chars to current literal buffer.
1058  for (int i = 0; i < 6; i++) {
1059    AddLiteralChar(chars_read[i]);
1060  }
1061  return true;
1062}
1063
1064
1065bool Scanner::ScanRegExpFlags() {
1066  // Scan regular expression flags.
1067  LiteralScope literal(this);
1068  while (unicode_cache_->IsIdentifierPart(c0_)) {
1069    if (c0_ != '\\') {
1070      AddLiteralCharAdvance();
1071    } else {
1072      if (!ScanLiteralUnicodeEscape()) {
1073        break;
1074      }
1075    }
1076  }
1077  literal.Complete();
1078
1079  next_.location.end_pos = source_pos() - 1;
1080  return true;
1081}
1082
1083} }  // namespace v8::internal
1084