1// Copyright 2011 the V8 project authors. All rights reserved.
2// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6//     * Redistributions of source code must retain the above copyright
7//       notice, this list of conditions and the following disclaimer.
8//     * Redistributions in binary form must reproduce the above
9//       copyright notice, this list of conditions and the following
10//       disclaimer in the documentation and/or other materials provided
11//       with the distribution.
12//     * Neither the name of Google Inc. nor the names of its
13//       contributors may be used to endorse or promote products derived
14//       from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28// Features shared by parsing and pre-parsing scanners.
29
30#include "../include/v8stdint.h"
31#include "scanner-base.h"
32#include "char-predicates-inl.h"
33
34namespace v8 {
35namespace internal {
36
37// ----------------------------------------------------------------------------
38// Scanner
39
40Scanner::Scanner(UnicodeCache* unicode_cache)
41    : unicode_cache_(unicode_cache),
42      octal_pos_(kNoOctalLocation) { }
43
44
45uc32 Scanner::ScanHexEscape(uc32 c, int length) {
46  ASSERT(length <= 4);  // prevent overflow
47
48  uc32 digits[4];
49  uc32 x = 0;
50  for (int i = 0; i < length; i++) {
51    digits[i] = c0_;
52    int d = HexValue(c0_);
53    if (d < 0) {
54      // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
55      // should be illegal, but other JS VMs just return the
56      // non-escaped version of the original character.
57
58      // Push back digits read, except the last one (in c0_).
59      for (int j = i-1; j >= 0; j--) {
60        PushBack(digits[j]);
61      }
62      // Notice: No handling of error - treat it as "\u"->"u".
63      return c;
64    }
65    x = x * 16 + d;
66    Advance();
67  }
68
69  return x;
70}
71
72
73// Octal escapes of the forms '\0xx' and '\xxx' are not a part of
74// ECMA-262. Other JS VMs support them.
75uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
76  uc32 x = c - '0';
77  int i = 0;
78  for (; i < length; i++) {
79    int d = c0_ - '0';
80    if (d < 0 || d > 7) break;
81    int nx = x * 8 + d;
82    if (nx >= 256) break;
83    x = nx;
84    Advance();
85  }
86  // Anything excelt '\0' is an octal escape sequence, illegal in strict mode.
87  // Remember the position of octal escape sequences so that better error
88  // can be reported later (in strict mode).
89  if (c != '0' || i > 0) {
90    octal_pos_ = source_pos() - i - 1;     // Already advanced
91  }
92  return x;
93}
94
95
96// ----------------------------------------------------------------------------
97// JavaScriptScanner
98
99JavaScriptScanner::JavaScriptScanner(UnicodeCache* scanner_contants)
100    : Scanner(scanner_contants) { }
101
102
103Token::Value JavaScriptScanner::Next() {
104  current_ = next_;
105  has_line_terminator_before_next_ = false;
106  Scan();
107  return current_.token;
108}
109
110
111static inline bool IsByteOrderMark(uc32 c) {
112  // The Unicode value U+FFFE is guaranteed never to be assigned as a
113  // Unicode character; this implies that in a Unicode context the
114  // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
115  // character expressed in little-endian byte order (since it could
116  // not be a U+FFFE character expressed in big-endian byte
117  // order). Nevertheless, we check for it to be compatible with
118  // Spidermonkey.
119  return c == 0xFEFF || c == 0xFFFE;
120}
121
122
123bool JavaScriptScanner::SkipWhiteSpace() {
124  int start_position = source_pos();
125
126  while (true) {
127    // We treat byte-order marks (BOMs) as whitespace for better
128    // compatibility with Spidermonkey and other JavaScript engines.
129    while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) {
130      // IsWhiteSpace() includes line terminators!
131      if (unicode_cache_->IsLineTerminator(c0_)) {
132        // Ignore line terminators, but remember them. This is necessary
133        // for automatic semicolon insertion.
134        has_line_terminator_before_next_ = true;
135      }
136      Advance();
137    }
138
139    // If there is an HTML comment end '-->' at the beginning of a
140    // line (with only whitespace in front of it), we treat the rest
141    // of the line as a comment. This is in line with the way
142    // SpiderMonkey handles it.
143    if (c0_ == '-' && has_line_terminator_before_next_) {
144      Advance();
145      if (c0_ == '-') {
146        Advance();
147        if (c0_ == '>') {
148          // Treat the rest of the line as a comment.
149          SkipSingleLineComment();
150          // Continue skipping white space after the comment.
151          continue;
152        }
153        PushBack('-');  // undo Advance()
154      }
155      PushBack('-');  // undo Advance()
156    }
157    // Return whether or not we skipped any characters.
158    return source_pos() != start_position;
159  }
160}
161
162
163Token::Value JavaScriptScanner::SkipSingleLineComment() {
164  Advance();
165
166  // The line terminator at the end of the line is not considered
167  // to be part of the single-line comment; it is recognized
168  // separately by the lexical grammar and becomes part of the
169  // stream of input elements for the syntactic grammar (see
170  // ECMA-262, section 7.4, page 12).
171  while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) {
172    Advance();
173  }
174
175  return Token::WHITESPACE;
176}
177
178
179Token::Value JavaScriptScanner::SkipMultiLineComment() {
180  ASSERT(c0_ == '*');
181  Advance();
182
183  while (c0_ >= 0) {
184    char ch = c0_;
185    Advance();
186    // If we have reached the end of the multi-line comment, we
187    // consume the '/' and insert a whitespace. This way all
188    // multi-line comments are treated as whitespace - even the ones
189    // containing line terminators. This contradicts ECMA-262, section
190    // 7.4, page 12, that says that multi-line comments containing
191    // line terminators should be treated as a line terminator, but it
192    // matches the behaviour of SpiderMonkey and KJS.
193    if (ch == '*' && c0_ == '/') {
194      c0_ = ' ';
195      return Token::WHITESPACE;
196    }
197  }
198
199  // Unterminated multi-line comment.
200  return Token::ILLEGAL;
201}
202
203
204Token::Value JavaScriptScanner::ScanHtmlComment() {
205  // Check for <!-- comments.
206  ASSERT(c0_ == '!');
207  Advance();
208  if (c0_ == '-') {
209    Advance();
210    if (c0_ == '-') return SkipSingleLineComment();
211    PushBack('-');  // undo Advance()
212  }
213  PushBack('!');  // undo Advance()
214  ASSERT(c0_ == '!');
215  return Token::LT;
216}
217
218
219void JavaScriptScanner::Scan() {
220  next_.literal_chars = NULL;
221  Token::Value token;
222  do {
223    // Remember the position of the next token
224    next_.location.beg_pos = source_pos();
225
226    switch (c0_) {
227      case ' ':
228      case '\t':
229        Advance();
230        token = Token::WHITESPACE;
231        break;
232
233      case '\n':
234        Advance();
235        has_line_terminator_before_next_ = true;
236        token = Token::WHITESPACE;
237        break;
238
239      case '"': case '\'':
240        token = ScanString();
241        break;
242
243      case '<':
244        // < <= << <<= <!--
245        Advance();
246        if (c0_ == '=') {
247          token = Select(Token::LTE);
248        } else if (c0_ == '<') {
249          token = Select('=', Token::ASSIGN_SHL, Token::SHL);
250        } else if (c0_ == '!') {
251          token = ScanHtmlComment();
252        } else {
253          token = Token::LT;
254        }
255        break;
256
257      case '>':
258        // > >= >> >>= >>> >>>=
259        Advance();
260        if (c0_ == '=') {
261          token = Select(Token::GTE);
262        } else if (c0_ == '>') {
263          // >> >>= >>> >>>=
264          Advance();
265          if (c0_ == '=') {
266            token = Select(Token::ASSIGN_SAR);
267          } else if (c0_ == '>') {
268            token = Select('=', Token::ASSIGN_SHR, Token::SHR);
269          } else {
270            token = Token::SAR;
271          }
272        } else {
273          token = Token::GT;
274        }
275        break;
276
277      case '=':
278        // = == ===
279        Advance();
280        if (c0_ == '=') {
281          token = Select('=', Token::EQ_STRICT, Token::EQ);
282        } else {
283          token = Token::ASSIGN;
284        }
285        break;
286
287      case '!':
288        // ! != !==
289        Advance();
290        if (c0_ == '=') {
291          token = Select('=', Token::NE_STRICT, Token::NE);
292        } else {
293          token = Token::NOT;
294        }
295        break;
296
297      case '+':
298        // + ++ +=
299        Advance();
300        if (c0_ == '+') {
301          token = Select(Token::INC);
302        } else if (c0_ == '=') {
303          token = Select(Token::ASSIGN_ADD);
304        } else {
305          token = Token::ADD;
306        }
307        break;
308
309      case '-':
310        // - -- --> -=
311        Advance();
312        if (c0_ == '-') {
313          Advance();
314          if (c0_ == '>' && has_line_terminator_before_next_) {
315            // For compatibility with SpiderMonkey, we skip lines that
316            // start with an HTML comment end '-->'.
317            token = SkipSingleLineComment();
318          } else {
319            token = Token::DEC;
320          }
321        } else if (c0_ == '=') {
322          token = Select(Token::ASSIGN_SUB);
323        } else {
324          token = Token::SUB;
325        }
326        break;
327
328      case '*':
329        // * *=
330        token = Select('=', Token::ASSIGN_MUL, Token::MUL);
331        break;
332
333      case '%':
334        // % %=
335        token = Select('=', Token::ASSIGN_MOD, Token::MOD);
336        break;
337
338      case '/':
339        // /  // /* /=
340        Advance();
341        if (c0_ == '/') {
342          token = SkipSingleLineComment();
343        } else if (c0_ == '*') {
344          token = SkipMultiLineComment();
345        } else if (c0_ == '=') {
346          token = Select(Token::ASSIGN_DIV);
347        } else {
348          token = Token::DIV;
349        }
350        break;
351
352      case '&':
353        // & && &=
354        Advance();
355        if (c0_ == '&') {
356          token = Select(Token::AND);
357        } else if (c0_ == '=') {
358          token = Select(Token::ASSIGN_BIT_AND);
359        } else {
360          token = Token::BIT_AND;
361        }
362        break;
363
364      case '|':
365        // | || |=
366        Advance();
367        if (c0_ == '|') {
368          token = Select(Token::OR);
369        } else if (c0_ == '=') {
370          token = Select(Token::ASSIGN_BIT_OR);
371        } else {
372          token = Token::BIT_OR;
373        }
374        break;
375
376      case '^':
377        // ^ ^=
378        token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
379        break;
380
381      case '.':
382        // . Number
383        Advance();
384        if (IsDecimalDigit(c0_)) {
385          token = ScanNumber(true);
386        } else {
387          token = Token::PERIOD;
388        }
389        break;
390
391      case ':':
392        token = Select(Token::COLON);
393        break;
394
395      case ';':
396        token = Select(Token::SEMICOLON);
397        break;
398
399      case ',':
400        token = Select(Token::COMMA);
401        break;
402
403      case '(':
404        token = Select(Token::LPAREN);
405        break;
406
407      case ')':
408        token = Select(Token::RPAREN);
409        break;
410
411      case '[':
412        token = Select(Token::LBRACK);
413        break;
414
415      case ']':
416        token = Select(Token::RBRACK);
417        break;
418
419      case '{':
420        token = Select(Token::LBRACE);
421        break;
422
423      case '}':
424        token = Select(Token::RBRACE);
425        break;
426
427      case '?':
428        token = Select(Token::CONDITIONAL);
429        break;
430
431      case '~':
432        token = Select(Token::BIT_NOT);
433        break;
434
435      default:
436        if (unicode_cache_->IsIdentifierStart(c0_)) {
437          token = ScanIdentifierOrKeyword();
438        } else if (IsDecimalDigit(c0_)) {
439          token = ScanNumber(false);
440        } else if (SkipWhiteSpace()) {
441          token = Token::WHITESPACE;
442        } else if (c0_ < 0) {
443          token = Token::EOS;
444        } else {
445          token = Select(Token::ILLEGAL);
446        }
447        break;
448    }
449
450    // Continue scanning for tokens as long as we're just skipping
451    // whitespace.
452  } while (token == Token::WHITESPACE);
453
454  next_.location.end_pos = source_pos();
455  next_.token = token;
456}
457
458
459void JavaScriptScanner::SeekForward(int pos) {
460  // After this call, we will have the token at the given position as
461  // the "next" token. The "current" token will be invalid.
462  if (pos == next_.location.beg_pos) return;
463  int current_pos = source_pos();
464  ASSERT_EQ(next_.location.end_pos, current_pos);
465  // Positions inside the lookahead token aren't supported.
466  ASSERT(pos >= current_pos);
467  if (pos != current_pos) {
468    source_->SeekForward(pos - source_->pos());
469    Advance();
470    // This function is only called to seek to the location
471    // of the end of a function (at the "}" token). It doesn't matter
472    // whether there was a line terminator in the part we skip.
473    has_line_terminator_before_next_ = false;
474  }
475  Scan();
476}
477
478
479void JavaScriptScanner::ScanEscape() {
480  uc32 c = c0_;
481  Advance();
482
483  // Skip escaped newlines.
484  if (unicode_cache_->IsLineTerminator(c)) {
485    // Allow CR+LF newlines in multiline string literals.
486    if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
487    // Allow LF+CR newlines in multiline string literals.
488    if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
489    return;
490  }
491
492  switch (c) {
493    case '\'':  // fall through
494    case '"' :  // fall through
495    case '\\': break;
496    case 'b' : c = '\b'; break;
497    case 'f' : c = '\f'; break;
498    case 'n' : c = '\n'; break;
499    case 'r' : c = '\r'; break;
500    case 't' : c = '\t'; break;
501    case 'u' : c = ScanHexEscape(c, 4); break;
502    case 'v' : c = '\v'; break;
503    case 'x' : c = ScanHexEscape(c, 2); break;
504    case '0' :  // fall through
505    case '1' :  // fall through
506    case '2' :  // fall through
507    case '3' :  // fall through
508    case '4' :  // fall through
509    case '5' :  // fall through
510    case '6' :  // fall through
511    case '7' : c = ScanOctalEscape(c, 2); break;
512  }
513
514  // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
515  // should be illegal, but they are commonly handled
516  // as non-escaped characters by JS VMs.
517  AddLiteralChar(c);
518}
519
520
521Token::Value JavaScriptScanner::ScanString() {
522  uc32 quote = c0_;
523  Advance();  // consume quote
524
525  LiteralScope literal(this);
526  while (c0_ != quote && c0_ >= 0
527         && !unicode_cache_->IsLineTerminator(c0_)) {
528    uc32 c = c0_;
529    Advance();
530    if (c == '\\') {
531      if (c0_ < 0) return Token::ILLEGAL;
532      ScanEscape();
533    } else {
534      AddLiteralChar(c);
535    }
536  }
537  if (c0_ != quote) return Token::ILLEGAL;
538  literal.Complete();
539
540  Advance();  // consume quote
541  return Token::STRING;
542}
543
544
545void JavaScriptScanner::ScanDecimalDigits() {
546  while (IsDecimalDigit(c0_))
547    AddLiteralCharAdvance();
548}
549
550
551Token::Value JavaScriptScanner::ScanNumber(bool seen_period) {
552  ASSERT(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
553
554  enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
555
556  LiteralScope literal(this);
557  if (seen_period) {
558    // we have already seen a decimal point of the float
559    AddLiteralChar('.');
560    ScanDecimalDigits();  // we know we have at least one digit
561
562  } else {
563    // if the first character is '0' we must check for octals and hex
564    if (c0_ == '0') {
565      AddLiteralCharAdvance();
566
567      // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
568      if (c0_ == 'x' || c0_ == 'X') {
569        // hex number
570        kind = HEX;
571        AddLiteralCharAdvance();
572        if (!IsHexDigit(c0_)) {
573          // we must have at least one hex digit after 'x'/'X'
574          return Token::ILLEGAL;
575        }
576        while (IsHexDigit(c0_)) {
577          AddLiteralCharAdvance();
578        }
579      } else if ('0' <= c0_ && c0_ <= '7') {
580        // (possible) octal number
581        kind = OCTAL;
582        while (true) {
583          if (c0_ == '8' || c0_ == '9') {
584            kind = DECIMAL;
585            break;
586          }
587          if (c0_  < '0' || '7'  < c0_) {
588            // Octal literal finished.
589            octal_pos_ = next_.location.beg_pos;
590            break;
591          }
592          AddLiteralCharAdvance();
593        }
594      }
595    }
596
597    // Parse decimal digits and allow trailing fractional part.
598    if (kind == DECIMAL) {
599      ScanDecimalDigits();  // optional
600      if (c0_ == '.') {
601        AddLiteralCharAdvance();
602        ScanDecimalDigits();  // optional
603      }
604    }
605  }
606
607  // scan exponent, if any
608  if (c0_ == 'e' || c0_ == 'E') {
609    ASSERT(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
610    if (kind == OCTAL) return Token::ILLEGAL;  // no exponent for octals allowed
611    // scan exponent
612    AddLiteralCharAdvance();
613    if (c0_ == '+' || c0_ == '-')
614      AddLiteralCharAdvance();
615    if (!IsDecimalDigit(c0_)) {
616      // we must have at least one decimal digit after 'e'/'E'
617      return Token::ILLEGAL;
618    }
619    ScanDecimalDigits();
620  }
621
622  // The source character immediately following a numeric literal must
623  // not be an identifier start or a decimal digit; see ECMA-262
624  // section 7.8.3, page 17 (note that we read only one decimal digit
625  // if the value is 0).
626  if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_))
627    return Token::ILLEGAL;
628
629  literal.Complete();
630
631  return Token::NUMBER;
632}
633
634
635uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() {
636  Advance();
637  if (c0_ != 'u') return unibrow::Utf8::kBadChar;
638  Advance();
639  uc32 c = ScanHexEscape('u', 4);
640  // We do not allow a unicode escape sequence to start another
641  // unicode escape sequence.
642  if (c == '\\') return unibrow::Utf8::kBadChar;
643  return c;
644}
645
646
647Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() {
648  ASSERT(unicode_cache_->IsIdentifierStart(c0_));
649  LiteralScope literal(this);
650  KeywordMatcher keyword_match;
651  // Scan identifier start character.
652  if (c0_ == '\\') {
653    uc32 c = ScanIdentifierUnicodeEscape();
654    // Only allow legal identifier start characters.
655    if (!unicode_cache_->IsIdentifierStart(c)) return Token::ILLEGAL;
656    AddLiteralChar(c);
657    return ScanIdentifierSuffix(&literal);
658  }
659
660  uc32 first_char = c0_;
661  Advance();
662  AddLiteralChar(first_char);
663  if (!keyword_match.AddChar(first_char)) {
664    return ScanIdentifierSuffix(&literal);
665  }
666
667  // Scan the rest of the identifier characters.
668  while (unicode_cache_->IsIdentifierPart(c0_)) {
669    if (c0_ != '\\') {
670      uc32 next_char = c0_;
671      Advance();
672      AddLiteralChar(next_char);
673      if (keyword_match.AddChar(next_char)) continue;
674    }
675    // Fallthrough if no loner able to complete keyword.
676    return ScanIdentifierSuffix(&literal);
677  }
678  literal.Complete();
679
680  return keyword_match.token();
681}
682
683
684Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) {
685  // Scan the rest of the identifier characters.
686  while (unicode_cache_->IsIdentifierPart(c0_)) {
687    if (c0_ == '\\') {
688      uc32 c = ScanIdentifierUnicodeEscape();
689      // Only allow legal identifier part characters.
690      if (!unicode_cache_->IsIdentifierPart(c)) return Token::ILLEGAL;
691      AddLiteralChar(c);
692    } else {
693      AddLiteralChar(c0_);
694      Advance();
695    }
696  }
697  literal->Complete();
698
699  return Token::IDENTIFIER;
700}
701
702
703bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) {
704  // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
705  bool in_character_class = false;
706
707  // Previous token is either '/' or '/=', in the second case, the
708  // pattern starts at =.
709  next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
710  next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
711
712  // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
713  // the scanner should pass uninterpreted bodies to the RegExp
714  // constructor.
715  LiteralScope literal(this);
716  if (seen_equal)
717    AddLiteralChar('=');
718
719  while (c0_ != '/' || in_character_class) {
720    if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
721    if (c0_ == '\\') {  // Escape sequence.
722      AddLiteralCharAdvance();
723      if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false;
724      AddLiteralCharAdvance();
725      // If the escape allows more characters, i.e., \x??, \u????, or \c?,
726      // only "safe" characters are allowed (letters, digits, underscore),
727      // otherwise the escape isn't valid and the invalid character has
728      // its normal meaning. I.e., we can just continue scanning without
729      // worrying whether the following characters are part of the escape
730      // or not, since any '/', '\\' or '[' is guaranteed to not be part
731      // of the escape sequence.
732    } else {  // Unescaped character.
733      if (c0_ == '[') in_character_class = true;
734      if (c0_ == ']') in_character_class = false;
735      AddLiteralCharAdvance();
736    }
737  }
738  Advance();  // consume '/'
739
740  literal.Complete();
741
742  return true;
743}
744
745
746bool JavaScriptScanner::ScanRegExpFlags() {
747  // Scan regular expression flags.
748  LiteralScope literal(this);
749  while (unicode_cache_->IsIdentifierPart(c0_)) {
750    if (c0_ == '\\') {
751      uc32 c = ScanIdentifierUnicodeEscape();
752      if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {
753        // We allow any escaped character, unlike the restriction on
754        // IdentifierPart when it is used to build an IdentifierName.
755        AddLiteralChar(c);
756        continue;
757      }
758    }
759    AddLiteralCharAdvance();
760  }
761  literal.Complete();
762
763  next_.location.end_pos = source_pos() - 1;
764  return true;
765}
766
767// ----------------------------------------------------------------------------
768// Keyword Matcher
769
770KeywordMatcher::FirstState KeywordMatcher::first_states_[] = {
771  { "break",  KEYWORD_PREFIX, Token::BREAK },
772  { NULL,     C,              Token::ILLEGAL },
773  { NULL,     D,              Token::ILLEGAL },
774  { NULL,     E,              Token::ILLEGAL },
775  { NULL,     F,              Token::ILLEGAL },
776  { NULL,     UNMATCHABLE,    Token::ILLEGAL },
777  { NULL,     UNMATCHABLE,    Token::ILLEGAL },
778  { NULL,     I,              Token::ILLEGAL },
779  { NULL,     UNMATCHABLE,    Token::ILLEGAL },
780  { NULL,     UNMATCHABLE,    Token::ILLEGAL },
781  { "let",    KEYWORD_PREFIX, Token::FUTURE_RESERVED_WORD },
782  { NULL,     UNMATCHABLE,    Token::ILLEGAL },
783  { NULL,     N,              Token::ILLEGAL },
784  { NULL,     UNMATCHABLE,    Token::ILLEGAL },
785  { NULL,     P,              Token::ILLEGAL },
786  { NULL,     UNMATCHABLE,    Token::ILLEGAL },
787  { "return", KEYWORD_PREFIX, Token::RETURN },
788  { NULL,     S,              Token::ILLEGAL },
789  { NULL,     T,              Token::ILLEGAL },
790  { NULL,     UNMATCHABLE,    Token::ILLEGAL },
791  { NULL,     V,              Token::ILLEGAL },
792  { NULL,     W,              Token::ILLEGAL },
793  { NULL,     UNMATCHABLE,    Token::ILLEGAL },
794  { "yield",  KEYWORD_PREFIX, Token::FUTURE_RESERVED_WORD }
795};
796
797
798void KeywordMatcher::Step(unibrow::uchar input) {
799  switch (state_) {
800    case INITIAL: {
801      // matching the first character is the only state with significant fanout.
802      // Match only lower-case letters in range 'b'..'y'.
803      unsigned int offset = input - kFirstCharRangeMin;
804      if (offset < kFirstCharRangeLength) {
805        state_ = first_states_[offset].state;
806        if (state_ == KEYWORD_PREFIX) {
807          keyword_ = first_states_[offset].keyword;
808          counter_ = 1;
809          keyword_token_ = first_states_[offset].token;
810        }
811        return;
812      }
813      break;
814    }
815    case KEYWORD_PREFIX:
816      if (static_cast<unibrow::uchar>(keyword_[counter_]) == input) {
817        counter_++;
818        if (keyword_[counter_] == '\0') {
819          state_ = KEYWORD_MATCHED;
820          token_ = keyword_token_;
821        }
822        return;
823      }
824      break;
825    case KEYWORD_MATCHED:
826      token_ = Token::IDENTIFIER;
827      break;
828    case C:
829      if (MatchState(input, 'a', CA)) return;
830      if (MatchKeywordStart(input, "class", 1,
831          Token::FUTURE_RESERVED_WORD)) return;
832      if (MatchState(input, 'o', CO)) return;
833      break;
834    case CA:
835      if (MatchKeywordStart(input, "case", 2, Token::CASE)) return;
836      if (MatchKeywordStart(input, "catch", 2, Token::CATCH)) return;
837      break;
838    case CO:
839      if (MatchState(input, 'n', CON)) return;
840      break;
841    case CON:
842      if (MatchKeywordStart(input, "const", 3, Token::CONST)) return;
843      if (MatchKeywordStart(input, "continue", 3, Token::CONTINUE)) return;
844      break;
845    case D:
846      if (MatchState(input, 'e', DE)) return;
847      if (MatchKeyword(input, 'o', KEYWORD_MATCHED, Token::DO)) return;
848      break;
849    case DE:
850      if (MatchKeywordStart(input, "debugger", 2, Token::DEBUGGER)) return;
851      if (MatchKeywordStart(input, "default", 2, Token::DEFAULT)) return;
852      if (MatchKeywordStart(input, "delete", 2, Token::DELETE)) return;
853      break;
854    case E:
855      if (MatchKeywordStart(input, "else", 1, Token::ELSE)) return;
856      if (MatchKeywordStart(input, "enum", 1,
857          Token::FUTURE_RESERVED_WORD)) return;
858      if (MatchState(input, 'x', EX)) return;
859      break;
860    case EX:
861      if (MatchKeywordStart(input, "export", 2,
862          Token::FUTURE_RESERVED_WORD)) return;
863      if (MatchKeywordStart(input, "extends", 2,
864          Token::FUTURE_RESERVED_WORD)) return;
865      break;
866    case F:
867      if (MatchKeywordStart(input, "false", 1, Token::FALSE_LITERAL)) return;
868      if (MatchKeywordStart(input, "finally", 1, Token::FINALLY)) return;
869      if (MatchKeywordStart(input, "for", 1, Token::FOR)) return;
870      if (MatchKeywordStart(input, "function", 1, Token::FUNCTION)) return;
871      break;
872    case I:
873      if (MatchKeyword(input, 'f', KEYWORD_MATCHED, Token::IF)) return;
874      if (MatchState(input, 'm', IM)) return;
875      if (MatchKeyword(input, 'n', IN, Token::IN)) return;
876      break;
877    case IM:
878      if (MatchState(input, 'p', IMP)) return;
879      break;
880    case IMP:
881      if (MatchKeywordStart(input, "implements", 3,
882         Token::FUTURE_RESERVED_WORD )) return;
883      if (MatchKeywordStart(input, "import", 3,
884         Token::FUTURE_RESERVED_WORD)) return;
885      break;
886    case IN:
887      token_ = Token::IDENTIFIER;
888      if (MatchKeywordStart(input, "interface", 2,
889         Token::FUTURE_RESERVED_WORD)) return;
890      if (MatchKeywordStart(input, "instanceof", 2, Token::INSTANCEOF)) return;
891      break;
892    case N:
893      if (MatchKeywordStart(input, "native", 1, Token::NATIVE)) return;
894      if (MatchKeywordStart(input, "new", 1, Token::NEW)) return;
895      if (MatchKeywordStart(input, "null", 1, Token::NULL_LITERAL)) return;
896      break;
897    case P:
898      if (MatchKeywordStart(input, "package", 1,
899          Token::FUTURE_RESERVED_WORD)) return;
900      if (MatchState(input, 'r', PR)) return;
901      if (MatchKeywordStart(input, "public", 1,
902          Token::FUTURE_RESERVED_WORD)) return;
903      break;
904    case PR:
905      if (MatchKeywordStart(input, "private", 2,
906          Token::FUTURE_RESERVED_WORD)) return;
907      if (MatchKeywordStart(input, "protected", 2,
908          Token::FUTURE_RESERVED_WORD)) return;
909      break;
910    case S:
911      if (MatchKeywordStart(input, "static", 1,
912          Token::FUTURE_RESERVED_WORD)) return;
913      if (MatchKeywordStart(input, "super", 1,
914          Token::FUTURE_RESERVED_WORD)) return;
915      if (MatchKeywordStart(input, "switch", 1,
916          Token::SWITCH)) return;
917      break;
918    case T:
919      if (MatchState(input, 'h', TH)) return;
920      if (MatchState(input, 'r', TR)) return;
921      if (MatchKeywordStart(input, "typeof", 1, Token::TYPEOF)) return;
922      break;
923    case TH:
924      if (MatchKeywordStart(input, "this", 2, Token::THIS)) return;
925      if (MatchKeywordStart(input, "throw", 2, Token::THROW)) return;
926      break;
927    case TR:
928      if (MatchKeywordStart(input, "true", 2, Token::TRUE_LITERAL)) return;
929      if (MatchKeyword(input, 'y', KEYWORD_MATCHED, Token::TRY)) return;
930      break;
931    case V:
932      if (MatchKeywordStart(input, "var", 1, Token::VAR)) return;
933      if (MatchKeywordStart(input, "void", 1, Token::VOID)) return;
934      break;
935    case W:
936      if (MatchKeywordStart(input, "while", 1, Token::WHILE)) return;
937      if (MatchKeywordStart(input, "with", 1, Token::WITH)) return;
938      break;
939    case UNMATCHABLE:
940      break;
941  }
942  // On fallthrough, it's a failure.
943  state_ = UNMATCHABLE;
944}
945
946} }  // namespace v8::internal
947