scanner.cc revision 592a9fc1d8ea420377a2e7efd0600e20b058be2b
1// Copyright 2011 the V8 project authors. All rights reserved. 2// Redistribution and use in source and binary forms, with or without 3// modification, are permitted provided that the following conditions are 4// met: 5// 6// * Redistributions of source code must retain the above copyright 7// notice, this list of conditions and the following disclaimer. 8// * Redistributions in binary form must reproduce the above 9// copyright notice, this list of conditions and the following 10// disclaimer in the documentation and/or other materials provided 11// with the distribution. 12// * Neither the name of Google Inc. nor the names of its 13// contributors may be used to endorse or promote products derived 14// from this software without specific prior written permission. 15// 16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 28// Features shared by parsing and pre-parsing scanners. 29 30#include "scanner.h" 31 32#include "../include/v8stdint.h" 33#include "char-predicates-inl.h" 34 35namespace v8 { 36namespace internal { 37 38// ---------------------------------------------------------------------------- 39// Scanner 40 41Scanner::Scanner(UnicodeCache* unicode_cache) 42 : unicode_cache_(unicode_cache), 43 octal_pos_(Location::invalid()), 44 harmony_scoping_(false) { } 45 46 47void Scanner::Initialize(UC16CharacterStream* source) { 48 source_ = source; 49 // Need to capture identifiers in order to recognize "get" and "set" 50 // in object literals. 51 Init(); 52 // Skip initial whitespace allowing HTML comment ends just like 53 // after a newline and scan first token. 54 has_line_terminator_before_next_ = true; 55 SkipWhiteSpace(); 56 Scan(); 57} 58 59 60uc32 Scanner::ScanHexNumber(int expected_length) { 61 ASSERT(expected_length <= 4); // prevent overflow 62 63 uc32 digits[4] = { 0, 0, 0, 0 }; 64 uc32 x = 0; 65 for (int i = 0; i < expected_length; i++) { 66 digits[i] = c0_; 67 int d = HexValue(c0_); 68 if (d < 0) { 69 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes 70 // should be illegal, but other JS VMs just return the 71 // non-escaped version of the original character. 72 73 // Push back digits that we have advanced past. 74 for (int j = i-1; j >= 0; j--) { 75 PushBack(digits[j]); 76 } 77 return -1; 78 } 79 x = x * 16 + d; 80 Advance(); 81 } 82 83 return x; 84} 85 86 87// Ensure that tokens can be stored in a byte. 88STATIC_ASSERT(Token::NUM_TOKENS <= 0x100); 89 90// Table of one-character tokens, by character (0x00..0x7f only). 91static const byte one_char_tokens[] = { 92 Token::ILLEGAL, 93 Token::ILLEGAL, 94 Token::ILLEGAL, 95 Token::ILLEGAL, 96 Token::ILLEGAL, 97 Token::ILLEGAL, 98 Token::ILLEGAL, 99 Token::ILLEGAL, 100 Token::ILLEGAL, 101 Token::ILLEGAL, 102 Token::ILLEGAL, 103 Token::ILLEGAL, 104 Token::ILLEGAL, 105 Token::ILLEGAL, 106 Token::ILLEGAL, 107 Token::ILLEGAL, 108 Token::ILLEGAL, 109 Token::ILLEGAL, 110 Token::ILLEGAL, 111 Token::ILLEGAL, 112 Token::ILLEGAL, 113 Token::ILLEGAL, 114 Token::ILLEGAL, 115 Token::ILLEGAL, 116 Token::ILLEGAL, 117 Token::ILLEGAL, 118 Token::ILLEGAL, 119 Token::ILLEGAL, 120 Token::ILLEGAL, 121 Token::ILLEGAL, 122 Token::ILLEGAL, 123 Token::ILLEGAL, 124 Token::ILLEGAL, 125 Token::ILLEGAL, 126 Token::ILLEGAL, 127 Token::ILLEGAL, 128 Token::ILLEGAL, 129 Token::ILLEGAL, 130 Token::ILLEGAL, 131 Token::ILLEGAL, 132 Token::LPAREN, // 0x28 133 Token::RPAREN, // 0x29 134 Token::ILLEGAL, 135 Token::ILLEGAL, 136 Token::COMMA, // 0x2c 137 Token::ILLEGAL, 138 Token::ILLEGAL, 139 Token::ILLEGAL, 140 Token::ILLEGAL, 141 Token::ILLEGAL, 142 Token::ILLEGAL, 143 Token::ILLEGAL, 144 Token::ILLEGAL, 145 Token::ILLEGAL, 146 Token::ILLEGAL, 147 Token::ILLEGAL, 148 Token::ILLEGAL, 149 Token::ILLEGAL, 150 Token::COLON, // 0x3a 151 Token::SEMICOLON, // 0x3b 152 Token::ILLEGAL, 153 Token::ILLEGAL, 154 Token::ILLEGAL, 155 Token::CONDITIONAL, // 0x3f 156 Token::ILLEGAL, 157 Token::ILLEGAL, 158 Token::ILLEGAL, 159 Token::ILLEGAL, 160 Token::ILLEGAL, 161 Token::ILLEGAL, 162 Token::ILLEGAL, 163 Token::ILLEGAL, 164 Token::ILLEGAL, 165 Token::ILLEGAL, 166 Token::ILLEGAL, 167 Token::ILLEGAL, 168 Token::ILLEGAL, 169 Token::ILLEGAL, 170 Token::ILLEGAL, 171 Token::ILLEGAL, 172 Token::ILLEGAL, 173 Token::ILLEGAL, 174 Token::ILLEGAL, 175 Token::ILLEGAL, 176 Token::ILLEGAL, 177 Token::ILLEGAL, 178 Token::ILLEGAL, 179 Token::ILLEGAL, 180 Token::ILLEGAL, 181 Token::ILLEGAL, 182 Token::ILLEGAL, 183 Token::LBRACK, // 0x5b 184 Token::ILLEGAL, 185 Token::RBRACK, // 0x5d 186 Token::ILLEGAL, 187 Token::ILLEGAL, 188 Token::ILLEGAL, 189 Token::ILLEGAL, 190 Token::ILLEGAL, 191 Token::ILLEGAL, 192 Token::ILLEGAL, 193 Token::ILLEGAL, 194 Token::ILLEGAL, 195 Token::ILLEGAL, 196 Token::ILLEGAL, 197 Token::ILLEGAL, 198 Token::ILLEGAL, 199 Token::ILLEGAL, 200 Token::ILLEGAL, 201 Token::ILLEGAL, 202 Token::ILLEGAL, 203 Token::ILLEGAL, 204 Token::ILLEGAL, 205 Token::ILLEGAL, 206 Token::ILLEGAL, 207 Token::ILLEGAL, 208 Token::ILLEGAL, 209 Token::ILLEGAL, 210 Token::ILLEGAL, 211 Token::ILLEGAL, 212 Token::ILLEGAL, 213 Token::ILLEGAL, 214 Token::ILLEGAL, 215 Token::LBRACE, // 0x7b 216 Token::ILLEGAL, 217 Token::RBRACE, // 0x7d 218 Token::BIT_NOT, // 0x7e 219 Token::ILLEGAL 220}; 221 222 223Token::Value Scanner::Next() { 224 current_ = next_; 225 has_line_terminator_before_next_ = false; 226 has_multiline_comment_before_next_ = false; 227 if (static_cast<unsigned>(c0_) <= 0x7f) { 228 Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]); 229 if (token != Token::ILLEGAL) { 230 int pos = source_pos(); 231 next_.token = token; 232 next_.location.beg_pos = pos; 233 next_.location.end_pos = pos + 1; 234 Advance(); 235 return current_.token; 236 } 237 } 238 Scan(); 239 return current_.token; 240} 241 242 243static inline bool IsByteOrderMark(uc32 c) { 244 // The Unicode value U+FFFE is guaranteed never to be assigned as a 245 // Unicode character; this implies that in a Unicode context the 246 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF 247 // character expressed in little-endian byte order (since it could 248 // not be a U+FFFE character expressed in big-endian byte 249 // order). Nevertheless, we check for it to be compatible with 250 // Spidermonkey. 251 return c == 0xFEFF || c == 0xFFFE; 252} 253 254 255bool Scanner::SkipWhiteSpace() { 256 int start_position = source_pos(); 257 258 while (true) { 259 // We treat byte-order marks (BOMs) as whitespace for better 260 // compatibility with Spidermonkey and other JavaScript engines. 261 while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) { 262 // IsWhiteSpace() includes line terminators! 263 if (unicode_cache_->IsLineTerminator(c0_)) { 264 // Ignore line terminators, but remember them. This is necessary 265 // for automatic semicolon insertion. 266 has_line_terminator_before_next_ = true; 267 } 268 Advance(); 269 } 270 271 // If there is an HTML comment end '-->' at the beginning of a 272 // line (with only whitespace in front of it), we treat the rest 273 // of the line as a comment. This is in line with the way 274 // SpiderMonkey handles it. 275 if (c0_ == '-' && has_line_terminator_before_next_) { 276 Advance(); 277 if (c0_ == '-') { 278 Advance(); 279 if (c0_ == '>') { 280 // Treat the rest of the line as a comment. 281 SkipSingleLineComment(); 282 // Continue skipping white space after the comment. 283 continue; 284 } 285 PushBack('-'); // undo Advance() 286 } 287 PushBack('-'); // undo Advance() 288 } 289 // Return whether or not we skipped any characters. 290 return source_pos() != start_position; 291 } 292} 293 294 295Token::Value Scanner::SkipSingleLineComment() { 296 Advance(); 297 298 // The line terminator at the end of the line is not considered 299 // to be part of the single-line comment; it is recognized 300 // separately by the lexical grammar and becomes part of the 301 // stream of input elements for the syntactic grammar (see 302 // ECMA-262, section 7.4). 303 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) { 304 Advance(); 305 } 306 307 return Token::WHITESPACE; 308} 309 310 311Token::Value Scanner::SkipMultiLineComment() { 312 ASSERT(c0_ == '*'); 313 Advance(); 314 315 while (c0_ >= 0) { 316 uc32 ch = c0_; 317 Advance(); 318 if (unicode_cache_->IsLineTerminator(ch)) { 319 // Following ECMA-262, section 7.4, a comment containing 320 // a newline will make the comment count as a line-terminator. 321 has_multiline_comment_before_next_ = true; 322 } 323 // If we have reached the end of the multi-line comment, we 324 // consume the '/' and insert a whitespace. This way all 325 // multi-line comments are treated as whitespace. 326 if (ch == '*' && c0_ == '/') { 327 c0_ = ' '; 328 return Token::WHITESPACE; 329 } 330 } 331 332 // Unterminated multi-line comment. 333 return Token::ILLEGAL; 334} 335 336 337Token::Value Scanner::ScanHtmlComment() { 338 // Check for <!-- comments. 339 ASSERT(c0_ == '!'); 340 Advance(); 341 if (c0_ == '-') { 342 Advance(); 343 if (c0_ == '-') return SkipSingleLineComment(); 344 PushBack('-'); // undo Advance() 345 } 346 PushBack('!'); // undo Advance() 347 ASSERT(c0_ == '!'); 348 return Token::LT; 349} 350 351 352void Scanner::Scan() { 353 next_.literal_chars = NULL; 354 Token::Value token; 355 do { 356 // Remember the position of the next token 357 next_.location.beg_pos = source_pos(); 358 359 switch (c0_) { 360 case ' ': 361 case '\t': 362 Advance(); 363 token = Token::WHITESPACE; 364 break; 365 366 case '\n': 367 Advance(); 368 has_line_terminator_before_next_ = true; 369 token = Token::WHITESPACE; 370 break; 371 372 case '"': case '\'': 373 token = ScanString(); 374 break; 375 376 case '<': 377 // < <= << <<= <!-- 378 Advance(); 379 if (c0_ == '=') { 380 token = Select(Token::LTE); 381 } else if (c0_ == '<') { 382 token = Select('=', Token::ASSIGN_SHL, Token::SHL); 383 } else if (c0_ == '!') { 384 token = ScanHtmlComment(); 385 } else { 386 token = Token::LT; 387 } 388 break; 389 390 case '>': 391 // > >= >> >>= >>> >>>= 392 Advance(); 393 if (c0_ == '=') { 394 token = Select(Token::GTE); 395 } else if (c0_ == '>') { 396 // >> >>= >>> >>>= 397 Advance(); 398 if (c0_ == '=') { 399 token = Select(Token::ASSIGN_SAR); 400 } else if (c0_ == '>') { 401 token = Select('=', Token::ASSIGN_SHR, Token::SHR); 402 } else { 403 token = Token::SAR; 404 } 405 } else { 406 token = Token::GT; 407 } 408 break; 409 410 case '=': 411 // = == === 412 Advance(); 413 if (c0_ == '=') { 414 token = Select('=', Token::EQ_STRICT, Token::EQ); 415 } else { 416 token = Token::ASSIGN; 417 } 418 break; 419 420 case '!': 421 // ! != !== 422 Advance(); 423 if (c0_ == '=') { 424 token = Select('=', Token::NE_STRICT, Token::NE); 425 } else { 426 token = Token::NOT; 427 } 428 break; 429 430 case '+': 431 // + ++ += 432 Advance(); 433 if (c0_ == '+') { 434 token = Select(Token::INC); 435 } else if (c0_ == '=') { 436 token = Select(Token::ASSIGN_ADD); 437 } else { 438 token = Token::ADD; 439 } 440 break; 441 442 case '-': 443 // - -- --> -= 444 Advance(); 445 if (c0_ == '-') { 446 Advance(); 447 if (c0_ == '>' && has_line_terminator_before_next_) { 448 // For compatibility with SpiderMonkey, we skip lines that 449 // start with an HTML comment end '-->'. 450 token = SkipSingleLineComment(); 451 } else { 452 token = Token::DEC; 453 } 454 } else if (c0_ == '=') { 455 token = Select(Token::ASSIGN_SUB); 456 } else { 457 token = Token::SUB; 458 } 459 break; 460 461 case '*': 462 // * *= 463 token = Select('=', Token::ASSIGN_MUL, Token::MUL); 464 break; 465 466 case '%': 467 // % %= 468 token = Select('=', Token::ASSIGN_MOD, Token::MOD); 469 break; 470 471 case '/': 472 // / // /* /= 473 Advance(); 474 if (c0_ == '/') { 475 token = SkipSingleLineComment(); 476 } else if (c0_ == '*') { 477 token = SkipMultiLineComment(); 478 } else if (c0_ == '=') { 479 token = Select(Token::ASSIGN_DIV); 480 } else { 481 token = Token::DIV; 482 } 483 break; 484 485 case '&': 486 // & && &= 487 Advance(); 488 if (c0_ == '&') { 489 token = Select(Token::AND); 490 } else if (c0_ == '=') { 491 token = Select(Token::ASSIGN_BIT_AND); 492 } else { 493 token = Token::BIT_AND; 494 } 495 break; 496 497 case '|': 498 // | || |= 499 Advance(); 500 if (c0_ == '|') { 501 token = Select(Token::OR); 502 } else if (c0_ == '=') { 503 token = Select(Token::ASSIGN_BIT_OR); 504 } else { 505 token = Token::BIT_OR; 506 } 507 break; 508 509 case '^': 510 // ^ ^= 511 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR); 512 break; 513 514 case '.': 515 // . Number 516 Advance(); 517 if (IsDecimalDigit(c0_)) { 518 token = ScanNumber(true); 519 } else { 520 token = Token::PERIOD; 521 } 522 break; 523 524 case ':': 525 token = Select(Token::COLON); 526 break; 527 528 case ';': 529 token = Select(Token::SEMICOLON); 530 break; 531 532 case ',': 533 token = Select(Token::COMMA); 534 break; 535 536 case '(': 537 token = Select(Token::LPAREN); 538 break; 539 540 case ')': 541 token = Select(Token::RPAREN); 542 break; 543 544 case '[': 545 token = Select(Token::LBRACK); 546 break; 547 548 case ']': 549 token = Select(Token::RBRACK); 550 break; 551 552 case '{': 553 token = Select(Token::LBRACE); 554 break; 555 556 case '}': 557 token = Select(Token::RBRACE); 558 break; 559 560 case '?': 561 token = Select(Token::CONDITIONAL); 562 break; 563 564 case '~': 565 token = Select(Token::BIT_NOT); 566 break; 567 568 default: 569 if (unicode_cache_->IsIdentifierStart(c0_)) { 570 token = ScanIdentifierOrKeyword(); 571 } else if (IsDecimalDigit(c0_)) { 572 token = ScanNumber(false); 573 } else if (SkipWhiteSpace()) { 574 token = Token::WHITESPACE; 575 } else if (c0_ < 0) { 576 token = Token::EOS; 577 } else { 578 token = Select(Token::ILLEGAL); 579 } 580 break; 581 } 582 583 // Continue scanning for tokens as long as we're just skipping 584 // whitespace. 585 } while (token == Token::WHITESPACE); 586 587 next_.location.end_pos = source_pos(); 588 next_.token = token; 589} 590 591 592void Scanner::SeekForward(int pos) { 593 // After this call, we will have the token at the given position as 594 // the "next" token. The "current" token will be invalid. 595 if (pos == next_.location.beg_pos) return; 596 int current_pos = source_pos(); 597 ASSERT_EQ(next_.location.end_pos, current_pos); 598 // Positions inside the lookahead token aren't supported. 599 ASSERT(pos >= current_pos); 600 if (pos != current_pos) { 601 source_->SeekForward(pos - source_->pos()); 602 Advance(); 603 // This function is only called to seek to the location 604 // of the end of a function (at the "}" token). It doesn't matter 605 // whether there was a line terminator in the part we skip. 606 has_line_terminator_before_next_ = false; 607 has_multiline_comment_before_next_ = false; 608 } 609 Scan(); 610} 611 612 613void Scanner::ScanEscape() { 614 uc32 c = c0_; 615 Advance(); 616 617 // Skip escaped newlines. 618 if (unicode_cache_->IsLineTerminator(c)) { 619 // Allow CR+LF newlines in multiline string literals. 620 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance(); 621 // Allow LF+CR newlines in multiline string literals. 622 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance(); 623 return; 624 } 625 626 switch (c) { 627 case '\'': // fall through 628 case '"' : // fall through 629 case '\\': break; 630 case 'b' : c = '\b'; break; 631 case 'f' : c = '\f'; break; 632 case 'n' : c = '\n'; break; 633 case 'r' : c = '\r'; break; 634 case 't' : c = '\t'; break; 635 case 'u' : { 636 c = ScanHexNumber(4); 637 if (c < 0) c = 'u'; 638 break; 639 } 640 case 'v' : c = '\v'; break; 641 case 'x' : { 642 c = ScanHexNumber(2); 643 if (c < 0) c = 'x'; 644 break; 645 } 646 case '0' : // fall through 647 case '1' : // fall through 648 case '2' : // fall through 649 case '3' : // fall through 650 case '4' : // fall through 651 case '5' : // fall through 652 case '6' : // fall through 653 case '7' : c = ScanOctalEscape(c, 2); break; 654 } 655 656 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these 657 // should be illegal, but they are commonly handled 658 // as non-escaped characters by JS VMs. 659 AddLiteralChar(c); 660} 661 662 663// Octal escapes of the forms '\0xx' and '\xxx' are not a part of 664// ECMA-262. Other JS VMs support them. 665uc32 Scanner::ScanOctalEscape(uc32 c, int length) { 666 uc32 x = c - '0'; 667 int i = 0; 668 for (; i < length; i++) { 669 int d = c0_ - '0'; 670 if (d < 0 || d > 7) break; 671 int nx = x * 8 + d; 672 if (nx >= 256) break; 673 x = nx; 674 Advance(); 675 } 676 // Anything except '\0' is an octal escape sequence, illegal in strict mode. 677 // Remember the position of octal escape sequences so that an error 678 // can be reported later (in strict mode). 679 // We don't report the error immediately, because the octal escape can 680 // occur before the "use strict" directive. 681 if (c != '0' || i > 0) { 682 octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1); 683 } 684 return x; 685} 686 687 688Token::Value Scanner::ScanString() { 689 uc32 quote = c0_; 690 Advance(); // consume quote 691 692 LiteralScope literal(this); 693 while (c0_ != quote && c0_ >= 0 694 && !unicode_cache_->IsLineTerminator(c0_)) { 695 uc32 c = c0_; 696 Advance(); 697 if (c == '\\') { 698 if (c0_ < 0) return Token::ILLEGAL; 699 ScanEscape(); 700 } else { 701 AddLiteralChar(c); 702 } 703 } 704 if (c0_ != quote) return Token::ILLEGAL; 705 literal.Complete(); 706 707 Advance(); // consume quote 708 return Token::STRING; 709} 710 711 712void Scanner::ScanDecimalDigits() { 713 while (IsDecimalDigit(c0_)) 714 AddLiteralCharAdvance(); 715} 716 717 718Token::Value Scanner::ScanNumber(bool seen_period) { 719 ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction 720 721 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL; 722 723 LiteralScope literal(this); 724 if (seen_period) { 725 // we have already seen a decimal point of the float 726 AddLiteralChar('.'); 727 ScanDecimalDigits(); // we know we have at least one digit 728 729 } else { 730 // if the first character is '0' we must check for octals and hex 731 if (c0_ == '0') { 732 int start_pos = source_pos(); // For reporting octal positions. 733 AddLiteralCharAdvance(); 734 735 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number 736 if (c0_ == 'x' || c0_ == 'X') { 737 // hex number 738 kind = HEX; 739 AddLiteralCharAdvance(); 740 if (!IsHexDigit(c0_)) { 741 // we must have at least one hex digit after 'x'/'X' 742 return Token::ILLEGAL; 743 } 744 while (IsHexDigit(c0_)) { 745 AddLiteralCharAdvance(); 746 } 747 } else if ('0' <= c0_ && c0_ <= '7') { 748 // (possible) octal number 749 kind = OCTAL; 750 while (true) { 751 if (c0_ == '8' || c0_ == '9') { 752 kind = DECIMAL; 753 break; 754 } 755 if (c0_ < '0' || '7' < c0_) { 756 // Octal literal finished. 757 octal_pos_ = Location(start_pos, source_pos()); 758 break; 759 } 760 AddLiteralCharAdvance(); 761 } 762 } 763 } 764 765 // Parse decimal digits and allow trailing fractional part. 766 if (kind == DECIMAL) { 767 ScanDecimalDigits(); // optional 768 if (c0_ == '.') { 769 AddLiteralCharAdvance(); 770 ScanDecimalDigits(); // optional 771 } 772 } 773 } 774 775 // scan exponent, if any 776 if (c0_ == 'e' || c0_ == 'E') { 777 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number 778 if (kind == OCTAL) return Token::ILLEGAL; // no exponent for octals allowed 779 // scan exponent 780 AddLiteralCharAdvance(); 781 if (c0_ == '+' || c0_ == '-') 782 AddLiteralCharAdvance(); 783 if (!IsDecimalDigit(c0_)) { 784 // we must have at least one decimal digit after 'e'/'E' 785 return Token::ILLEGAL; 786 } 787 ScanDecimalDigits(); 788 } 789 790 // The source character immediately following a numeric literal must 791 // not be an identifier start or a decimal digit; see ECMA-262 792 // section 7.8.3, page 17 (note that we read only one decimal digit 793 // if the value is 0). 794 if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_)) 795 return Token::ILLEGAL; 796 797 literal.Complete(); 798 799 return Token::NUMBER; 800} 801 802 803uc32 Scanner::ScanIdentifierUnicodeEscape() { 804 Advance(); 805 if (c0_ != 'u') return -1; 806 Advance(); 807 uc32 result = ScanHexNumber(4); 808 if (result < 0) PushBack('u'); 809 return result; 810} 811 812 813// ---------------------------------------------------------------------------- 814// Keyword Matcher 815 816#define KEYWORDS(KEYWORD_GROUP, KEYWORD) \ 817 KEYWORD_GROUP('b') \ 818 KEYWORD("break", Token::BREAK) \ 819 KEYWORD_GROUP('c') \ 820 KEYWORD("case", Token::CASE) \ 821 KEYWORD("catch", Token::CATCH) \ 822 KEYWORD("class", Token::FUTURE_RESERVED_WORD) \ 823 KEYWORD("const", Token::CONST) \ 824 KEYWORD("continue", Token::CONTINUE) \ 825 KEYWORD_GROUP('d') \ 826 KEYWORD("debugger", Token::DEBUGGER) \ 827 KEYWORD("default", Token::DEFAULT) \ 828 KEYWORD("delete", Token::DELETE) \ 829 KEYWORD("do", Token::DO) \ 830 KEYWORD_GROUP('e') \ 831 KEYWORD("else", Token::ELSE) \ 832 KEYWORD("enum", Token::FUTURE_RESERVED_WORD) \ 833 KEYWORD("export", Token::FUTURE_RESERVED_WORD) \ 834 KEYWORD("extends", Token::FUTURE_RESERVED_WORD) \ 835 KEYWORD_GROUP('f') \ 836 KEYWORD("false", Token::FALSE_LITERAL) \ 837 KEYWORD("finally", Token::FINALLY) \ 838 KEYWORD("for", Token::FOR) \ 839 KEYWORD("function", Token::FUNCTION) \ 840 KEYWORD_GROUP('i') \ 841 KEYWORD("if", Token::IF) \ 842 KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \ 843 KEYWORD("import", Token::FUTURE_RESERVED_WORD) \ 844 KEYWORD("in", Token::IN) \ 845 KEYWORD("instanceof", Token::INSTANCEOF) \ 846 KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD) \ 847 KEYWORD_GROUP('l') \ 848 KEYWORD("let", harmony_scoping \ 849 ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \ 850 KEYWORD_GROUP('n') \ 851 KEYWORD("new", Token::NEW) \ 852 KEYWORD("null", Token::NULL_LITERAL) \ 853 KEYWORD_GROUP('p') \ 854 KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD) \ 855 KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD) \ 856 KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD) \ 857 KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD) \ 858 KEYWORD_GROUP('r') \ 859 KEYWORD("return", Token::RETURN) \ 860 KEYWORD_GROUP('s') \ 861 KEYWORD("static", Token::FUTURE_STRICT_RESERVED_WORD) \ 862 KEYWORD("super", Token::FUTURE_RESERVED_WORD) \ 863 KEYWORD("switch", Token::SWITCH) \ 864 KEYWORD_GROUP('t') \ 865 KEYWORD("this", Token::THIS) \ 866 KEYWORD("throw", Token::THROW) \ 867 KEYWORD("true", Token::TRUE_LITERAL) \ 868 KEYWORD("try", Token::TRY) \ 869 KEYWORD("typeof", Token::TYPEOF) \ 870 KEYWORD_GROUP('v') \ 871 KEYWORD("var", Token::VAR) \ 872 KEYWORD("void", Token::VOID) \ 873 KEYWORD_GROUP('w') \ 874 KEYWORD("while", Token::WHILE) \ 875 KEYWORD("with", Token::WITH) \ 876 KEYWORD_GROUP('y') \ 877 KEYWORD("yield", Token::FUTURE_STRICT_RESERVED_WORD) 878 879 880static Token::Value KeywordOrIdentifierToken(const char* input, 881 int input_length, 882 bool harmony_scoping) { 883 ASSERT(input_length >= 1); 884 const int kMinLength = 2; 885 const int kMaxLength = 10; 886 if (input_length < kMinLength || input_length > kMaxLength) { 887 return Token::IDENTIFIER; 888 } 889 switch (input[0]) { 890 default: 891#define KEYWORD_GROUP_CASE(ch) \ 892 break; \ 893 case ch: 894#define KEYWORD(keyword, token) \ 895 { \ 896 /* 'keyword' is a char array, so sizeof(keyword) is */ \ 897 /* strlen(keyword) plus 1 for the NUL char. */ \ 898 const int keyword_length = sizeof(keyword) - 1; \ 899 STATIC_ASSERT(keyword_length >= kMinLength); \ 900 STATIC_ASSERT(keyword_length <= kMaxLength); \ 901 if (input_length == keyword_length && \ 902 input[1] == keyword[1] && \ 903 (keyword_length <= 2 || input[2] == keyword[2]) && \ 904 (keyword_length <= 3 || input[3] == keyword[3]) && \ 905 (keyword_length <= 4 || input[4] == keyword[4]) && \ 906 (keyword_length <= 5 || input[5] == keyword[5]) && \ 907 (keyword_length <= 6 || input[6] == keyword[6]) && \ 908 (keyword_length <= 7 || input[7] == keyword[7]) && \ 909 (keyword_length <= 8 || input[8] == keyword[8]) && \ 910 (keyword_length <= 9 || input[9] == keyword[9])) { \ 911 return token; \ 912 } \ 913 } 914 KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD) 915 } 916 return Token::IDENTIFIER; 917} 918 919 920Token::Value Scanner::ScanIdentifierOrKeyword() { 921 ASSERT(unicode_cache_->IsIdentifierStart(c0_)); 922 LiteralScope literal(this); 923 // Scan identifier start character. 924 if (c0_ == '\\') { 925 uc32 c = ScanIdentifierUnicodeEscape(); 926 // Only allow legal identifier start characters. 927 if (c < 0 || 928 c == '\\' || // No recursive escapes. 929 !unicode_cache_->IsIdentifierStart(c)) { 930 return Token::ILLEGAL; 931 } 932 AddLiteralChar(c); 933 return ScanIdentifierSuffix(&literal); 934 } 935 936 uc32 first_char = c0_; 937 Advance(); 938 AddLiteralChar(first_char); 939 940 // Scan the rest of the identifier characters. 941 while (unicode_cache_->IsIdentifierPart(c0_)) { 942 if (c0_ != '\\') { 943 uc32 next_char = c0_; 944 Advance(); 945 AddLiteralChar(next_char); 946 continue; 947 } 948 // Fallthrough if no longer able to complete keyword. 949 return ScanIdentifierSuffix(&literal); 950 } 951 952 literal.Complete(); 953 954 if (next_.literal_chars->is_ascii()) { 955 Vector<const char> chars = next_.literal_chars->ascii_literal(); 956 return KeywordOrIdentifierToken(chars.start(), 957 chars.length(), 958 harmony_scoping_); 959 } 960 961 return Token::IDENTIFIER; 962} 963 964 965Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal) { 966 // Scan the rest of the identifier characters. 967 while (unicode_cache_->IsIdentifierPart(c0_)) { 968 if (c0_ == '\\') { 969 uc32 c = ScanIdentifierUnicodeEscape(); 970 // Only allow legal identifier part characters. 971 if (c < 0 || 972 c == '\\' || 973 !unicode_cache_->IsIdentifierPart(c)) { 974 return Token::ILLEGAL; 975 } 976 AddLiteralChar(c); 977 } else { 978 AddLiteralChar(c0_); 979 Advance(); 980 } 981 } 982 literal->Complete(); 983 984 return Token::IDENTIFIER; 985} 986 987 988bool Scanner::ScanRegExpPattern(bool seen_equal) { 989 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags 990 bool in_character_class = false; 991 992 // Previous token is either '/' or '/=', in the second case, the 993 // pattern starts at =. 994 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1); 995 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0); 996 997 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5, 998 // the scanner should pass uninterpreted bodies to the RegExp 999 // constructor. 1000 LiteralScope literal(this); 1001 if (seen_equal) { 1002 AddLiteralChar('='); 1003 } 1004 1005 while (c0_ != '/' || in_character_class) { 1006 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false; 1007 if (c0_ == '\\') { // Escape sequence. 1008 AddLiteralCharAdvance(); 1009 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false; 1010 AddLiteralCharAdvance(); 1011 // If the escape allows more characters, i.e., \x??, \u????, or \c?, 1012 // only "safe" characters are allowed (letters, digits, underscore), 1013 // otherwise the escape isn't valid and the invalid character has 1014 // its normal meaning. I.e., we can just continue scanning without 1015 // worrying whether the following characters are part of the escape 1016 // or not, since any '/', '\\' or '[' is guaranteed to not be part 1017 // of the escape sequence. 1018 1019 // TODO(896): At some point, parse RegExps more throughly to capture 1020 // octal esacpes in strict mode. 1021 } else { // Unescaped character. 1022 if (c0_ == '[') in_character_class = true; 1023 if (c0_ == ']') in_character_class = false; 1024 AddLiteralCharAdvance(); 1025 } 1026 } 1027 Advance(); // consume '/' 1028 1029 literal.Complete(); 1030 1031 return true; 1032} 1033 1034 1035bool Scanner::ScanLiteralUnicodeEscape() { 1036 ASSERT(c0_ == '\\'); 1037 uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0}; 1038 Advance(); 1039 int i = 1; 1040 if (c0_ == 'u') { 1041 i++; 1042 while (i < 6) { 1043 Advance(); 1044 if (!IsHexDigit(c0_)) break; 1045 chars_read[i] = c0_; 1046 i++; 1047 } 1048 } 1049 if (i < 6) { 1050 // Incomplete escape. Undo all advances and return false. 1051 while (i > 0) { 1052 i--; 1053 PushBack(chars_read[i]); 1054 } 1055 return false; 1056 } 1057 // Complete escape. Add all chars to current literal buffer. 1058 for (int i = 0; i < 6; i++) { 1059 AddLiteralChar(chars_read[i]); 1060 } 1061 return true; 1062} 1063 1064 1065bool Scanner::ScanRegExpFlags() { 1066 // Scan regular expression flags. 1067 LiteralScope literal(this); 1068 while (unicode_cache_->IsIdentifierPart(c0_)) { 1069 if (c0_ != '\\') { 1070 AddLiteralCharAdvance(); 1071 } else { 1072 if (!ScanLiteralUnicodeEscape()) { 1073 break; 1074 } 1075 } 1076 } 1077 literal.Complete(); 1078 1079 next_.location.end_pos = source_pos() - 1; 1080 return true; 1081} 1082 1083} } // namespace v8::internal 1084