// scanner.cc revision 85b71799222b55eb5dd74ea26efe0c64ab655c8c
// Copyright 2011 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Features shared by parsing and pre-parsing scanners.
29 30#include "scanner.h" 31 32#include "../include/v8stdint.h" 33#include "char-predicates-inl.h" 34 35namespace v8 { 36namespace internal { 37 38// ---------------------------------------------------------------------------- 39// Scanner::LiteralScope 40 41Scanner::LiteralScope::LiteralScope(Scanner* self) 42 : scanner_(self), complete_(false) { 43 self->StartLiteral(); 44} 45 46 47Scanner::LiteralScope::~LiteralScope() { 48 if (!complete_) scanner_->DropLiteral(); 49} 50 51 52void Scanner::LiteralScope::Complete() { 53 scanner_->TerminateLiteral(); 54 complete_ = true; 55} 56 57// ---------------------------------------------------------------------------- 58// Scanner 59 60Scanner::Scanner(UnicodeCache* unicode_cache) 61 : unicode_cache_(unicode_cache) { } 62 63 64uc32 Scanner::ScanHexNumber(int expected_length) { 65 ASSERT(expected_length <= 4); // prevent overflow 66 67 uc32 digits[4] = { 0, 0, 0, 0 }; 68 uc32 x = 0; 69 for (int i = 0; i < expected_length; i++) { 70 digits[i] = c0_; 71 int d = HexValue(c0_); 72 if (d < 0) { 73 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes 74 // should be illegal, but other JS VMs just return the 75 // non-escaped version of the original character. 76 77 // Push back digits that we have advanced past. 78 for (int j = i-1; j >= 0; j--) { 79 PushBack(digits[j]); 80 } 81 return -1; 82 } 83 x = x * 16 + d; 84 Advance(); 85 } 86 87 return x; 88} 89 90 91 92// ---------------------------------------------------------------------------- 93// JavaScriptScanner 94 95JavaScriptScanner::JavaScriptScanner(UnicodeCache* scanner_contants) 96 : Scanner(scanner_contants), 97 octal_pos_(Location::invalid()), 98 harmony_block_scoping_(false) { } 99 100 101void JavaScriptScanner::Initialize(UC16CharacterStream* source) { 102 source_ = source; 103 // Need to capture identifiers in order to recognize "get" and "set" 104 // in object literals. 
105 Init(); 106 // Skip initial whitespace allowing HTML comment ends just like 107 // after a newline and scan first token. 108 has_line_terminator_before_next_ = true; 109 SkipWhiteSpace(); 110 Scan(); 111} 112 113 114// Ensure that tokens can be stored in a byte. 115STATIC_ASSERT(Token::NUM_TOKENS <= 0x100); 116 117// Table of one-character tokens, by character (0x00..0x7f only). 118static const byte one_char_tokens[] = { 119 Token::ILLEGAL, 120 Token::ILLEGAL, 121 Token::ILLEGAL, 122 Token::ILLEGAL, 123 Token::ILLEGAL, 124 Token::ILLEGAL, 125 Token::ILLEGAL, 126 Token::ILLEGAL, 127 Token::ILLEGAL, 128 Token::ILLEGAL, 129 Token::ILLEGAL, 130 Token::ILLEGAL, 131 Token::ILLEGAL, 132 Token::ILLEGAL, 133 Token::ILLEGAL, 134 Token::ILLEGAL, 135 Token::ILLEGAL, 136 Token::ILLEGAL, 137 Token::ILLEGAL, 138 Token::ILLEGAL, 139 Token::ILLEGAL, 140 Token::ILLEGAL, 141 Token::ILLEGAL, 142 Token::ILLEGAL, 143 Token::ILLEGAL, 144 Token::ILLEGAL, 145 Token::ILLEGAL, 146 Token::ILLEGAL, 147 Token::ILLEGAL, 148 Token::ILLEGAL, 149 Token::ILLEGAL, 150 Token::ILLEGAL, 151 Token::ILLEGAL, 152 Token::ILLEGAL, 153 Token::ILLEGAL, 154 Token::ILLEGAL, 155 Token::ILLEGAL, 156 Token::ILLEGAL, 157 Token::ILLEGAL, 158 Token::ILLEGAL, 159 Token::LPAREN, // 0x28 160 Token::RPAREN, // 0x29 161 Token::ILLEGAL, 162 Token::ILLEGAL, 163 Token::COMMA, // 0x2c 164 Token::ILLEGAL, 165 Token::ILLEGAL, 166 Token::ILLEGAL, 167 Token::ILLEGAL, 168 Token::ILLEGAL, 169 Token::ILLEGAL, 170 Token::ILLEGAL, 171 Token::ILLEGAL, 172 Token::ILLEGAL, 173 Token::ILLEGAL, 174 Token::ILLEGAL, 175 Token::ILLEGAL, 176 Token::ILLEGAL, 177 Token::COLON, // 0x3a 178 Token::SEMICOLON, // 0x3b 179 Token::ILLEGAL, 180 Token::ILLEGAL, 181 Token::ILLEGAL, 182 Token::CONDITIONAL, // 0x3f 183 Token::ILLEGAL, 184 Token::ILLEGAL, 185 Token::ILLEGAL, 186 Token::ILLEGAL, 187 Token::ILLEGAL, 188 Token::ILLEGAL, 189 Token::ILLEGAL, 190 Token::ILLEGAL, 191 Token::ILLEGAL, 192 Token::ILLEGAL, 193 Token::ILLEGAL, 194 Token::ILLEGAL, 
195 Token::ILLEGAL, 196 Token::ILLEGAL, 197 Token::ILLEGAL, 198 Token::ILLEGAL, 199 Token::ILLEGAL, 200 Token::ILLEGAL, 201 Token::ILLEGAL, 202 Token::ILLEGAL, 203 Token::ILLEGAL, 204 Token::ILLEGAL, 205 Token::ILLEGAL, 206 Token::ILLEGAL, 207 Token::ILLEGAL, 208 Token::ILLEGAL, 209 Token::ILLEGAL, 210 Token::LBRACK, // 0x5b 211 Token::ILLEGAL, 212 Token::RBRACK, // 0x5d 213 Token::ILLEGAL, 214 Token::ILLEGAL, 215 Token::ILLEGAL, 216 Token::ILLEGAL, 217 Token::ILLEGAL, 218 Token::ILLEGAL, 219 Token::ILLEGAL, 220 Token::ILLEGAL, 221 Token::ILLEGAL, 222 Token::ILLEGAL, 223 Token::ILLEGAL, 224 Token::ILLEGAL, 225 Token::ILLEGAL, 226 Token::ILLEGAL, 227 Token::ILLEGAL, 228 Token::ILLEGAL, 229 Token::ILLEGAL, 230 Token::ILLEGAL, 231 Token::ILLEGAL, 232 Token::ILLEGAL, 233 Token::ILLEGAL, 234 Token::ILLEGAL, 235 Token::ILLEGAL, 236 Token::ILLEGAL, 237 Token::ILLEGAL, 238 Token::ILLEGAL, 239 Token::ILLEGAL, 240 Token::ILLEGAL, 241 Token::ILLEGAL, 242 Token::LBRACE, // 0x7b 243 Token::ILLEGAL, 244 Token::RBRACE, // 0x7d 245 Token::BIT_NOT, // 0x7e 246 Token::ILLEGAL 247}; 248 249 250Token::Value JavaScriptScanner::Next() { 251 current_ = next_; 252 has_line_terminator_before_next_ = false; 253 has_multiline_comment_before_next_ = false; 254 if (static_cast<unsigned>(c0_) <= 0x7f) { 255 Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]); 256 if (token != Token::ILLEGAL) { 257 int pos = source_pos(); 258 next_.token = token; 259 next_.location.beg_pos = pos; 260 next_.location.end_pos = pos + 1; 261 Advance(); 262 return current_.token; 263 } 264 } 265 Scan(); 266 return current_.token; 267} 268 269 270static inline bool IsByteOrderMark(uc32 c) { 271 // The Unicode value U+FFFE is guaranteed never to be assigned as a 272 // Unicode character; this implies that in a Unicode context the 273 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF 274 // character expressed in little-endian byte order (since it could 275 // not be a U+FFFE character 
expressed in big-endian byte 276 // order). Nevertheless, we check for it to be compatible with 277 // Spidermonkey. 278 return c == 0xFEFF || c == 0xFFFE; 279} 280 281 282bool JavaScriptScanner::SkipWhiteSpace() { 283 int start_position = source_pos(); 284 285 while (true) { 286 // We treat byte-order marks (BOMs) as whitespace for better 287 // compatibility with Spidermonkey and other JavaScript engines. 288 while (unicode_cache_->IsWhiteSpace(c0_) || IsByteOrderMark(c0_)) { 289 // IsWhiteSpace() includes line terminators! 290 if (unicode_cache_->IsLineTerminator(c0_)) { 291 // Ignore line terminators, but remember them. This is necessary 292 // for automatic semicolon insertion. 293 has_line_terminator_before_next_ = true; 294 } 295 Advance(); 296 } 297 298 // If there is an HTML comment end '-->' at the beginning of a 299 // line (with only whitespace in front of it), we treat the rest 300 // of the line as a comment. This is in line with the way 301 // SpiderMonkey handles it. 302 if (c0_ == '-' && has_line_terminator_before_next_) { 303 Advance(); 304 if (c0_ == '-') { 305 Advance(); 306 if (c0_ == '>') { 307 // Treat the rest of the line as a comment. 308 SkipSingleLineComment(); 309 // Continue skipping white space after the comment. 310 continue; 311 } 312 PushBack('-'); // undo Advance() 313 } 314 PushBack('-'); // undo Advance() 315 } 316 // Return whether or not we skipped any characters. 317 return source_pos() != start_position; 318 } 319} 320 321 322Token::Value JavaScriptScanner::SkipSingleLineComment() { 323 Advance(); 324 325 // The line terminator at the end of the line is not considered 326 // to be part of the single-line comment; it is recognized 327 // separately by the lexical grammar and becomes part of the 328 // stream of input elements for the syntactic grammar (see 329 // ECMA-262, section 7.4). 
330 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) { 331 Advance(); 332 } 333 334 return Token::WHITESPACE; 335} 336 337 338Token::Value JavaScriptScanner::SkipMultiLineComment() { 339 ASSERT(c0_ == '*'); 340 Advance(); 341 342 while (c0_ >= 0) { 343 uc32 ch = c0_; 344 Advance(); 345 if (unicode_cache_->IsLineTerminator(ch)) { 346 // Following ECMA-262, section 7.4, a comment containing 347 // a newline will make the comment count as a line-terminator. 348 has_multiline_comment_before_next_ = true; 349 } 350 // If we have reached the end of the multi-line comment, we 351 // consume the '/' and insert a whitespace. This way all 352 // multi-line comments are treated as whitespace. 353 if (ch == '*' && c0_ == '/') { 354 c0_ = ' '; 355 return Token::WHITESPACE; 356 } 357 } 358 359 // Unterminated multi-line comment. 360 return Token::ILLEGAL; 361} 362 363 364Token::Value JavaScriptScanner::ScanHtmlComment() { 365 // Check for <!-- comments. 366 ASSERT(c0_ == '!'); 367 Advance(); 368 if (c0_ == '-') { 369 Advance(); 370 if (c0_ == '-') return SkipSingleLineComment(); 371 PushBack('-'); // undo Advance() 372 } 373 PushBack('!'); // undo Advance() 374 ASSERT(c0_ == '!'); 375 return Token::LT; 376} 377 378 379void JavaScriptScanner::Scan() { 380 next_.literal_chars = NULL; 381 Token::Value token; 382 do { 383 // Remember the position of the next token 384 next_.location.beg_pos = source_pos(); 385 386 switch (c0_) { 387 case ' ': 388 case '\t': 389 Advance(); 390 token = Token::WHITESPACE; 391 break; 392 393 case '\n': 394 Advance(); 395 has_line_terminator_before_next_ = true; 396 token = Token::WHITESPACE; 397 break; 398 399 case '"': case '\'': 400 token = ScanString(); 401 break; 402 403 case '<': 404 // < <= << <<= <!-- 405 Advance(); 406 if (c0_ == '=') { 407 token = Select(Token::LTE); 408 } else if (c0_ == '<') { 409 token = Select('=', Token::ASSIGN_SHL, Token::SHL); 410 } else if (c0_ == '!') { 411 token = ScanHtmlComment(); 412 } else { 413 token = 
Token::LT; 414 } 415 break; 416 417 case '>': 418 // > >= >> >>= >>> >>>= 419 Advance(); 420 if (c0_ == '=') { 421 token = Select(Token::GTE); 422 } else if (c0_ == '>') { 423 // >> >>= >>> >>>= 424 Advance(); 425 if (c0_ == '=') { 426 token = Select(Token::ASSIGN_SAR); 427 } else if (c0_ == '>') { 428 token = Select('=', Token::ASSIGN_SHR, Token::SHR); 429 } else { 430 token = Token::SAR; 431 } 432 } else { 433 token = Token::GT; 434 } 435 break; 436 437 case '=': 438 // = == === 439 Advance(); 440 if (c0_ == '=') { 441 token = Select('=', Token::EQ_STRICT, Token::EQ); 442 } else { 443 token = Token::ASSIGN; 444 } 445 break; 446 447 case '!': 448 // ! != !== 449 Advance(); 450 if (c0_ == '=') { 451 token = Select('=', Token::NE_STRICT, Token::NE); 452 } else { 453 token = Token::NOT; 454 } 455 break; 456 457 case '+': 458 // + ++ += 459 Advance(); 460 if (c0_ == '+') { 461 token = Select(Token::INC); 462 } else if (c0_ == '=') { 463 token = Select(Token::ASSIGN_ADD); 464 } else { 465 token = Token::ADD; 466 } 467 break; 468 469 case '-': 470 // - -- --> -= 471 Advance(); 472 if (c0_ == '-') { 473 Advance(); 474 if (c0_ == '>' && has_line_terminator_before_next_) { 475 // For compatibility with SpiderMonkey, we skip lines that 476 // start with an HTML comment end '-->'. 
477 token = SkipSingleLineComment(); 478 } else { 479 token = Token::DEC; 480 } 481 } else if (c0_ == '=') { 482 token = Select(Token::ASSIGN_SUB); 483 } else { 484 token = Token::SUB; 485 } 486 break; 487 488 case '*': 489 // * *= 490 token = Select('=', Token::ASSIGN_MUL, Token::MUL); 491 break; 492 493 case '%': 494 // % %= 495 token = Select('=', Token::ASSIGN_MOD, Token::MOD); 496 break; 497 498 case '/': 499 // / // /* /= 500 Advance(); 501 if (c0_ == '/') { 502 token = SkipSingleLineComment(); 503 } else if (c0_ == '*') { 504 token = SkipMultiLineComment(); 505 } else if (c0_ == '=') { 506 token = Select(Token::ASSIGN_DIV); 507 } else { 508 token = Token::DIV; 509 } 510 break; 511 512 case '&': 513 // & && &= 514 Advance(); 515 if (c0_ == '&') { 516 token = Select(Token::AND); 517 } else if (c0_ == '=') { 518 token = Select(Token::ASSIGN_BIT_AND); 519 } else { 520 token = Token::BIT_AND; 521 } 522 break; 523 524 case '|': 525 // | || |= 526 Advance(); 527 if (c0_ == '|') { 528 token = Select(Token::OR); 529 } else if (c0_ == '=') { 530 token = Select(Token::ASSIGN_BIT_OR); 531 } else { 532 token = Token::BIT_OR; 533 } 534 break; 535 536 case '^': 537 // ^ ^= 538 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR); 539 break; 540 541 case '.': 542 // . 
Number 543 Advance(); 544 if (IsDecimalDigit(c0_)) { 545 token = ScanNumber(true); 546 } else { 547 token = Token::PERIOD; 548 } 549 break; 550 551 case ':': 552 token = Select(Token::COLON); 553 break; 554 555 case ';': 556 token = Select(Token::SEMICOLON); 557 break; 558 559 case ',': 560 token = Select(Token::COMMA); 561 break; 562 563 case '(': 564 token = Select(Token::LPAREN); 565 break; 566 567 case ')': 568 token = Select(Token::RPAREN); 569 break; 570 571 case '[': 572 token = Select(Token::LBRACK); 573 break; 574 575 case ']': 576 token = Select(Token::RBRACK); 577 break; 578 579 case '{': 580 token = Select(Token::LBRACE); 581 break; 582 583 case '}': 584 token = Select(Token::RBRACE); 585 break; 586 587 case '?': 588 token = Select(Token::CONDITIONAL); 589 break; 590 591 case '~': 592 token = Select(Token::BIT_NOT); 593 break; 594 595 default: 596 if (unicode_cache_->IsIdentifierStart(c0_)) { 597 token = ScanIdentifierOrKeyword(); 598 } else if (IsDecimalDigit(c0_)) { 599 token = ScanNumber(false); 600 } else if (SkipWhiteSpace()) { 601 token = Token::WHITESPACE; 602 } else if (c0_ < 0) { 603 token = Token::EOS; 604 } else { 605 token = Select(Token::ILLEGAL); 606 } 607 break; 608 } 609 610 // Continue scanning for tokens as long as we're just skipping 611 // whitespace. 612 } while (token == Token::WHITESPACE); 613 614 next_.location.end_pos = source_pos(); 615 next_.token = token; 616} 617 618 619void JavaScriptScanner::SeekForward(int pos) { 620 // After this call, we will have the token at the given position as 621 // the "next" token. The "current" token will be invalid. 622 if (pos == next_.location.beg_pos) return; 623 int current_pos = source_pos(); 624 ASSERT_EQ(next_.location.end_pos, current_pos); 625 // Positions inside the lookahead token aren't supported. 
626 ASSERT(pos >= current_pos); 627 if (pos != current_pos) { 628 source_->SeekForward(pos - source_->pos()); 629 Advance(); 630 // This function is only called to seek to the location 631 // of the end of a function (at the "}" token). It doesn't matter 632 // whether there was a line terminator in the part we skip. 633 has_line_terminator_before_next_ = false; 634 has_multiline_comment_before_next_ = false; 635 } 636 Scan(); 637} 638 639 640void JavaScriptScanner::ScanEscape() { 641 uc32 c = c0_; 642 Advance(); 643 644 // Skip escaped newlines. 645 if (unicode_cache_->IsLineTerminator(c)) { 646 // Allow CR+LF newlines in multiline string literals. 647 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance(); 648 // Allow LF+CR newlines in multiline string literals. 649 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance(); 650 return; 651 } 652 653 switch (c) { 654 case '\'': // fall through 655 case '"' : // fall through 656 case '\\': break; 657 case 'b' : c = '\b'; break; 658 case 'f' : c = '\f'; break; 659 case 'n' : c = '\n'; break; 660 case 'r' : c = '\r'; break; 661 case 't' : c = '\t'; break; 662 case 'u' : { 663 c = ScanHexNumber(4); 664 if (c < 0) c = 'u'; 665 break; 666 } 667 case 'v' : c = '\v'; break; 668 case 'x' : { 669 c = ScanHexNumber(2); 670 if (c < 0) c = 'x'; 671 break; 672 } 673 case '0' : // fall through 674 case '1' : // fall through 675 case '2' : // fall through 676 case '3' : // fall through 677 case '4' : // fall through 678 case '5' : // fall through 679 case '6' : // fall through 680 case '7' : c = ScanOctalEscape(c, 2); break; 681 } 682 683 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these 684 // should be illegal, but they are commonly handled 685 // as non-escaped characters by JS VMs. 686 AddLiteralChar(c); 687} 688 689 690// Octal escapes of the forms '\0xx' and '\xxx' are not a part of 691// ECMA-262. Other JS VMs support them. 
692uc32 JavaScriptScanner::ScanOctalEscape(uc32 c, int length) { 693 uc32 x = c - '0'; 694 int i = 0; 695 for (; i < length; i++) { 696 int d = c0_ - '0'; 697 if (d < 0 || d > 7) break; 698 int nx = x * 8 + d; 699 if (nx >= 256) break; 700 x = nx; 701 Advance(); 702 } 703 // Anything except '\0' is an octal escape sequence, illegal in strict mode. 704 // Remember the position of octal escape sequences so that an error 705 // can be reported later (in strict mode). 706 // We don't report the error immediately, because the octal escape can 707 // occur before the "use strict" directive. 708 if (c != '0' || i > 0) { 709 octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1); 710 } 711 return x; 712} 713 714 715Token::Value JavaScriptScanner::ScanString() { 716 uc32 quote = c0_; 717 Advance(); // consume quote 718 719 LiteralScope literal(this); 720 while (c0_ != quote && c0_ >= 0 721 && !unicode_cache_->IsLineTerminator(c0_)) { 722 uc32 c = c0_; 723 Advance(); 724 if (c == '\\') { 725 if (c0_ < 0) return Token::ILLEGAL; 726 ScanEscape(); 727 } else { 728 AddLiteralChar(c); 729 } 730 } 731 if (c0_ != quote) return Token::ILLEGAL; 732 literal.Complete(); 733 734 Advance(); // consume quote 735 return Token::STRING; 736} 737 738 739void JavaScriptScanner::ScanDecimalDigits() { 740 while (IsDecimalDigit(c0_)) 741 AddLiteralCharAdvance(); 742} 743 744 745Token::Value JavaScriptScanner::ScanNumber(bool seen_period) { 746 ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction 747 748 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL; 749 750 LiteralScope literal(this); 751 if (seen_period) { 752 // we have already seen a decimal point of the float 753 AddLiteralChar('.'); 754 ScanDecimalDigits(); // we know we have at least one digit 755 756 } else { 757 // if the first character is '0' we must check for octals and hex 758 if (c0_ == '0') { 759 int start_pos = source_pos(); // For reporting octal positions. 
760 AddLiteralCharAdvance(); 761 762 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number 763 if (c0_ == 'x' || c0_ == 'X') { 764 // hex number 765 kind = HEX; 766 AddLiteralCharAdvance(); 767 if (!IsHexDigit(c0_)) { 768 // we must have at least one hex digit after 'x'/'X' 769 return Token::ILLEGAL; 770 } 771 while (IsHexDigit(c0_)) { 772 AddLiteralCharAdvance(); 773 } 774 } else if ('0' <= c0_ && c0_ <= '7') { 775 // (possible) octal number 776 kind = OCTAL; 777 while (true) { 778 if (c0_ == '8' || c0_ == '9') { 779 kind = DECIMAL; 780 break; 781 } 782 if (c0_ < '0' || '7' < c0_) { 783 // Octal literal finished. 784 octal_pos_ = Location(start_pos, source_pos()); 785 break; 786 } 787 AddLiteralCharAdvance(); 788 } 789 } 790 } 791 792 // Parse decimal digits and allow trailing fractional part. 793 if (kind == DECIMAL) { 794 ScanDecimalDigits(); // optional 795 if (c0_ == '.') { 796 AddLiteralCharAdvance(); 797 ScanDecimalDigits(); // optional 798 } 799 } 800 } 801 802 // scan exponent, if any 803 if (c0_ == 'e' || c0_ == 'E') { 804 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number 805 if (kind == OCTAL) return Token::ILLEGAL; // no exponent for octals allowed 806 // scan exponent 807 AddLiteralCharAdvance(); 808 if (c0_ == '+' || c0_ == '-') 809 AddLiteralCharAdvance(); 810 if (!IsDecimalDigit(c0_)) { 811 // we must have at least one decimal digit after 'e'/'E' 812 return Token::ILLEGAL; 813 } 814 ScanDecimalDigits(); 815 } 816 817 // The source character immediately following a numeric literal must 818 // not be an identifier start or a decimal digit; see ECMA-262 819 // section 7.8.3, page 17 (note that we read only one decimal digit 820 // if the value is 0). 
821 if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_)) 822 return Token::ILLEGAL; 823 824 literal.Complete(); 825 826 return Token::NUMBER; 827} 828 829 830uc32 JavaScriptScanner::ScanIdentifierUnicodeEscape() { 831 Advance(); 832 if (c0_ != 'u') return -1; 833 Advance(); 834 uc32 result = ScanHexNumber(4); 835 if (result < 0) PushBack('u'); 836 return result; 837} 838 839 840// ---------------------------------------------------------------------------- 841// Keyword Matcher 842 843#define KEYWORDS(KEYWORD_GROUP, KEYWORD) \ 844 KEYWORD_GROUP('b') \ 845 KEYWORD("break", Token::BREAK) \ 846 KEYWORD_GROUP('c') \ 847 KEYWORD("case", Token::CASE) \ 848 KEYWORD("catch", Token::CATCH) \ 849 KEYWORD("class", Token::FUTURE_RESERVED_WORD) \ 850 KEYWORD("const", Token::CONST) \ 851 KEYWORD("continue", Token::CONTINUE) \ 852 KEYWORD_GROUP('d') \ 853 KEYWORD("debugger", Token::DEBUGGER) \ 854 KEYWORD("default", Token::DEFAULT) \ 855 KEYWORD("delete", Token::DELETE) \ 856 KEYWORD("do", Token::DO) \ 857 KEYWORD_GROUP('e') \ 858 KEYWORD("else", Token::ELSE) \ 859 KEYWORD("enum", Token::FUTURE_RESERVED_WORD) \ 860 KEYWORD("export", Token::FUTURE_RESERVED_WORD) \ 861 KEYWORD("extends", Token::FUTURE_RESERVED_WORD) \ 862 KEYWORD_GROUP('f') \ 863 KEYWORD("false", Token::FALSE_LITERAL) \ 864 KEYWORD("finally", Token::FINALLY) \ 865 KEYWORD("for", Token::FOR) \ 866 KEYWORD("function", Token::FUNCTION) \ 867 KEYWORD_GROUP('i') \ 868 KEYWORD("if", Token::IF) \ 869 KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \ 870 KEYWORD("import", Token::FUTURE_RESERVED_WORD) \ 871 KEYWORD("in", Token::IN) \ 872 KEYWORD("instanceof", Token::INSTANCEOF) \ 873 KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD) \ 874 KEYWORD_GROUP('l') \ 875 KEYWORD("let", harmony_block_scoping \ 876 ? 
Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \ 877 KEYWORD_GROUP('n') \ 878 KEYWORD("new", Token::NEW) \ 879 KEYWORD("null", Token::NULL_LITERAL) \ 880 KEYWORD_GROUP('p') \ 881 KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD) \ 882 KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD) \ 883 KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD) \ 884 KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD) \ 885 KEYWORD_GROUP('r') \ 886 KEYWORD("return", Token::RETURN) \ 887 KEYWORD_GROUP('s') \ 888 KEYWORD("static", Token::FUTURE_STRICT_RESERVED_WORD) \ 889 KEYWORD("super", Token::FUTURE_RESERVED_WORD) \ 890 KEYWORD("switch", Token::SWITCH) \ 891 KEYWORD_GROUP('t') \ 892 KEYWORD("this", Token::THIS) \ 893 KEYWORD("throw", Token::THROW) \ 894 KEYWORD("true", Token::TRUE_LITERAL) \ 895 KEYWORD("try", Token::TRY) \ 896 KEYWORD("typeof", Token::TYPEOF) \ 897 KEYWORD_GROUP('v') \ 898 KEYWORD("var", Token::VAR) \ 899 KEYWORD("void", Token::VOID) \ 900 KEYWORD_GROUP('w') \ 901 KEYWORD("while", Token::WHILE) \ 902 KEYWORD("with", Token::WITH) \ 903 KEYWORD_GROUP('y') \ 904 KEYWORD("yield", Token::FUTURE_STRICT_RESERVED_WORD) 905 906 907static Token::Value KeywordOrIdentifierToken(const char* input, 908 int input_length, 909 bool harmony_block_scoping) { 910 ASSERT(input_length >= 1); 911 const int kMinLength = 2; 912 const int kMaxLength = 10; 913 if (input_length < kMinLength || input_length > kMaxLength) { 914 return Token::IDENTIFIER; 915 } 916 switch (input[0]) { 917 default: 918#define KEYWORD_GROUP_CASE(ch) \ 919 break; \ 920 case ch: 921#define KEYWORD(keyword, token) \ 922 { \ 923 /* 'keyword' is a char array, so sizeof(keyword) is */ \ 924 /* strlen(keyword) plus 1 for the NUL char. 
*/ \ 925 const int keyword_length = sizeof(keyword) - 1; \ 926 STATIC_ASSERT(keyword_length >= kMinLength); \ 927 STATIC_ASSERT(keyword_length <= kMaxLength); \ 928 if (input_length == keyword_length && \ 929 input[1] == keyword[1] && \ 930 (keyword_length <= 2 || input[2] == keyword[2]) && \ 931 (keyword_length <= 3 || input[3] == keyword[3]) && \ 932 (keyword_length <= 4 || input[4] == keyword[4]) && \ 933 (keyword_length <= 5 || input[5] == keyword[5]) && \ 934 (keyword_length <= 6 || input[6] == keyword[6]) && \ 935 (keyword_length <= 7 || input[7] == keyword[7]) && \ 936 (keyword_length <= 8 || input[8] == keyword[8]) && \ 937 (keyword_length <= 9 || input[9] == keyword[9])) { \ 938 return token; \ 939 } \ 940 } 941 KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD) 942 } 943 return Token::IDENTIFIER; 944} 945 946 947Token::Value JavaScriptScanner::ScanIdentifierOrKeyword() { 948 ASSERT(unicode_cache_->IsIdentifierStart(c0_)); 949 LiteralScope literal(this); 950 // Scan identifier start character. 951 if (c0_ == '\\') { 952 uc32 c = ScanIdentifierUnicodeEscape(); 953 // Only allow legal identifier start characters. 954 if (c < 0 || 955 c == '\\' || // No recursive escapes. 956 !unicode_cache_->IsIdentifierStart(c)) { 957 return Token::ILLEGAL; 958 } 959 AddLiteralChar(c); 960 return ScanIdentifierSuffix(&literal); 961 } 962 963 uc32 first_char = c0_; 964 Advance(); 965 AddLiteralChar(first_char); 966 967 // Scan the rest of the identifier characters. 968 while (unicode_cache_->IsIdentifierPart(c0_)) { 969 if (c0_ != '\\') { 970 uc32 next_char = c0_; 971 Advance(); 972 AddLiteralChar(next_char); 973 continue; 974 } 975 // Fallthrough if no longer able to complete keyword. 
976 return ScanIdentifierSuffix(&literal); 977 } 978 979 literal.Complete(); 980 981 if (next_.literal_chars->is_ascii()) { 982 Vector<const char> chars = next_.literal_chars->ascii_literal(); 983 return KeywordOrIdentifierToken(chars.start(), 984 chars.length(), 985 harmony_block_scoping_); 986 } 987 988 return Token::IDENTIFIER; 989} 990 991 992Token::Value JavaScriptScanner::ScanIdentifierSuffix(LiteralScope* literal) { 993 // Scan the rest of the identifier characters. 994 while (unicode_cache_->IsIdentifierPart(c0_)) { 995 if (c0_ == '\\') { 996 uc32 c = ScanIdentifierUnicodeEscape(); 997 // Only allow legal identifier part characters. 998 if (c < 0 || 999 c == '\\' || 1000 !unicode_cache_->IsIdentifierPart(c)) { 1001 return Token::ILLEGAL; 1002 } 1003 AddLiteralChar(c); 1004 } else { 1005 AddLiteralChar(c0_); 1006 Advance(); 1007 } 1008 } 1009 literal->Complete(); 1010 1011 return Token::IDENTIFIER; 1012} 1013 1014 1015bool JavaScriptScanner::ScanRegExpPattern(bool seen_equal) { 1016 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags 1017 bool in_character_class = false; 1018 1019 // Previous token is either '/' or '/=', in the second case, the 1020 // pattern starts at =. 1021 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1); 1022 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0); 1023 1024 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5, 1025 // the scanner should pass uninterpreted bodies to the RegExp 1026 // constructor. 1027 LiteralScope literal(this); 1028 if (seen_equal) { 1029 AddLiteralChar('='); 1030 } 1031 1032 while (c0_ != '/' || in_character_class) { 1033 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false; 1034 if (c0_ == '\\') { // Escape sequence. 
1035 AddLiteralCharAdvance(); 1036 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false; 1037 AddLiteralCharAdvance(); 1038 // If the escape allows more characters, i.e., \x??, \u????, or \c?, 1039 // only "safe" characters are allowed (letters, digits, underscore), 1040 // otherwise the escape isn't valid and the invalid character has 1041 // its normal meaning. I.e., we can just continue scanning without 1042 // worrying whether the following characters are part of the escape 1043 // or not, since any '/', '\\' or '[' is guaranteed to not be part 1044 // of the escape sequence. 1045 1046 // TODO(896): At some point, parse RegExps more throughly to capture 1047 // octal esacpes in strict mode. 1048 } else { // Unescaped character. 1049 if (c0_ == '[') in_character_class = true; 1050 if (c0_ == ']') in_character_class = false; 1051 AddLiteralCharAdvance(); 1052 } 1053 } 1054 Advance(); // consume '/' 1055 1056 literal.Complete(); 1057 1058 return true; 1059} 1060 1061 1062bool JavaScriptScanner::ScanLiteralUnicodeEscape() { 1063 ASSERT(c0_ == '\\'); 1064 uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0}; 1065 Advance(); 1066 int i = 1; 1067 if (c0_ == 'u') { 1068 i++; 1069 while (i < 6) { 1070 Advance(); 1071 if (!IsHexDigit(c0_)) break; 1072 chars_read[i] = c0_; 1073 i++; 1074 } 1075 } 1076 if (i < 6) { 1077 // Incomplete escape. Undo all advances and return false. 1078 while (i > 0) { 1079 i--; 1080 PushBack(chars_read[i]); 1081 } 1082 return false; 1083 } 1084 // Complete escape. Add all chars to current literal buffer. 1085 for (int i = 0; i < 6; i++) { 1086 AddLiteralChar(chars_read[i]); 1087 } 1088 return true; 1089} 1090 1091 1092bool JavaScriptScanner::ScanRegExpFlags() { 1093 // Scan regular expression flags. 
1094 LiteralScope literal(this); 1095 while (unicode_cache_->IsIdentifierPart(c0_)) { 1096 if (c0_ != '\\') { 1097 AddLiteralCharAdvance(); 1098 } else { 1099 if (!ScanLiteralUnicodeEscape()) { 1100 break; 1101 } 1102 } 1103 } 1104 literal.Complete(); 1105 1106 next_.location.end_pos = source_pos() - 1; 1107 return true; 1108} 1109 1110} } // namespace v8::internal 1111