1// Copyright 2011 the V8 project authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5// Features shared by parsing and pre-parsing scanners. 6 7#include <cmath> 8 9#include "src/v8.h" 10 11#include "include/v8stdint.h" 12#include "src/ast-value-factory.h" 13#include "src/char-predicates-inl.h" 14#include "src/conversions-inl.h" 15#include "src/list-inl.h" 16#include "src/parser.h" 17#include "src/scanner.h" 18 19namespace v8 { 20namespace internal { 21 22 23Handle<String> LiteralBuffer::Internalize(Isolate* isolate) const { 24 if (is_one_byte()) { 25 return isolate->factory()->InternalizeOneByteString(one_byte_literal()); 26 } 27 return isolate->factory()->InternalizeTwoByteString(two_byte_literal()); 28} 29 30 31// ---------------------------------------------------------------------------- 32// Scanner 33 34Scanner::Scanner(UnicodeCache* unicode_cache) 35 : unicode_cache_(unicode_cache), 36 octal_pos_(Location::invalid()), 37 harmony_scoping_(false), 38 harmony_modules_(false), 39 harmony_numeric_literals_(false), 40 harmony_classes_(false) { } 41 42 43void Scanner::Initialize(Utf16CharacterStream* source) { 44 source_ = source; 45 // Need to capture identifiers in order to recognize "get" and "set" 46 // in object literals. 47 Init(); 48 // Skip initial whitespace allowing HTML comment ends just like 49 // after a newline and scan first token. 50 has_line_terminator_before_next_ = true; 51 SkipWhiteSpace(); 52 Scan(); 53} 54 55 56uc32 Scanner::ScanHexNumber(int expected_length) { 57 DCHECK(expected_length <= 4); // prevent overflow 58 59 uc32 digits[4] = { 0, 0, 0, 0 }; 60 uc32 x = 0; 61 for (int i = 0; i < expected_length; i++) { 62 digits[i] = c0_; 63 int d = HexValue(c0_); 64 if (d < 0) { 65 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes 66 // should be illegal, but other JS VMs just return the 67 // non-escaped version of the original character. 68 69 // Push back digits that we have advanced past. 70 for (int j = i-1; j >= 0; j--) { 71 PushBack(digits[j]); 72 } 73 return -1; 74 } 75 x = x * 16 + d; 76 Advance(); 77 } 78 79 return x; 80} 81 82 83// Ensure that tokens can be stored in a byte. 84STATIC_ASSERT(Token::NUM_TOKENS <= 0x100); 85 86// Table of one-character tokens, by character (0x00..0x7f only). 87static const byte one_char_tokens[] = { 88 Token::ILLEGAL, 89 Token::ILLEGAL, 90 Token::ILLEGAL, 91 Token::ILLEGAL, 92 Token::ILLEGAL, 93 Token::ILLEGAL, 94 Token::ILLEGAL, 95 Token::ILLEGAL, 96 Token::ILLEGAL, 97 Token::ILLEGAL, 98 Token::ILLEGAL, 99 Token::ILLEGAL, 100 Token::ILLEGAL, 101 Token::ILLEGAL, 102 Token::ILLEGAL, 103 Token::ILLEGAL, 104 Token::ILLEGAL, 105 Token::ILLEGAL, 106 Token::ILLEGAL, 107 Token::ILLEGAL, 108 Token::ILLEGAL, 109 Token::ILLEGAL, 110 Token::ILLEGAL, 111 Token::ILLEGAL, 112 Token::ILLEGAL, 113 Token::ILLEGAL, 114 Token::ILLEGAL, 115 Token::ILLEGAL, 116 Token::ILLEGAL, 117 Token::ILLEGAL, 118 Token::ILLEGAL, 119 Token::ILLEGAL, 120 Token::ILLEGAL, 121 Token::ILLEGAL, 122 Token::ILLEGAL, 123 Token::ILLEGAL, 124 Token::ILLEGAL, 125 Token::ILLEGAL, 126 Token::ILLEGAL, 127 Token::ILLEGAL, 128 Token::LPAREN, // 0x28 129 Token::RPAREN, // 0x29 130 Token::ILLEGAL, 131 Token::ILLEGAL, 132 Token::COMMA, // 0x2c 133 Token::ILLEGAL, 134 Token::ILLEGAL, 135 Token::ILLEGAL, 136 Token::ILLEGAL, 137 Token::ILLEGAL, 138 Token::ILLEGAL, 139 Token::ILLEGAL, 140 Token::ILLEGAL, 141 Token::ILLEGAL, 142 Token::ILLEGAL, 143 Token::ILLEGAL, 144 Token::ILLEGAL, 145 Token::ILLEGAL, 146 Token::COLON, // 0x3a 147 Token::SEMICOLON, // 0x3b 148 Token::ILLEGAL, 149 Token::ILLEGAL, 150 Token::ILLEGAL, 151 Token::CONDITIONAL, // 0x3f 152 Token::ILLEGAL, 153 Token::ILLEGAL, 154 Token::ILLEGAL, 155 Token::ILLEGAL, 156 Token::ILLEGAL, 157 Token::ILLEGAL, 158 Token::ILLEGAL, 159 Token::ILLEGAL, 160 Token::ILLEGAL, 161 Token::ILLEGAL, 162 Token::ILLEGAL, 163 Token::ILLEGAL, 164 Token::ILLEGAL, 165 Token::ILLEGAL, 166 Token::ILLEGAL, 167 Token::ILLEGAL, 168 Token::ILLEGAL, 169 Token::ILLEGAL, 170 Token::ILLEGAL, 171 Token::ILLEGAL, 172 Token::ILLEGAL, 173 Token::ILLEGAL, 174 Token::ILLEGAL, 175 Token::ILLEGAL, 176 Token::ILLEGAL, 177 Token::ILLEGAL, 178 Token::ILLEGAL, 179 Token::LBRACK, // 0x5b 180 Token::ILLEGAL, 181 Token::RBRACK, // 0x5d 182 Token::ILLEGAL, 183 Token::ILLEGAL, 184 Token::ILLEGAL, 185 Token::ILLEGAL, 186 Token::ILLEGAL, 187 Token::ILLEGAL, 188 Token::ILLEGAL, 189 Token::ILLEGAL, 190 Token::ILLEGAL, 191 Token::ILLEGAL, 192 Token::ILLEGAL, 193 Token::ILLEGAL, 194 Token::ILLEGAL, 195 Token::ILLEGAL, 196 Token::ILLEGAL, 197 Token::ILLEGAL, 198 Token::ILLEGAL, 199 Token::ILLEGAL, 200 Token::ILLEGAL, 201 Token::ILLEGAL, 202 Token::ILLEGAL, 203 Token::ILLEGAL, 204 Token::ILLEGAL, 205 Token::ILLEGAL, 206 Token::ILLEGAL, 207 Token::ILLEGAL, 208 Token::ILLEGAL, 209 Token::ILLEGAL, 210 Token::ILLEGAL, 211 Token::LBRACE, // 0x7b 212 Token::ILLEGAL, 213 Token::RBRACE, // 0x7d 214 Token::BIT_NOT, // 0x7e 215 Token::ILLEGAL 216}; 217 218 219Token::Value Scanner::Next() { 220 current_ = next_; 221 has_line_terminator_before_next_ = false; 222 has_multiline_comment_before_next_ = false; 223 if (static_cast<unsigned>(c0_) <= 0x7f) { 224 Token::Value token = static_cast<Token::Value>(one_char_tokens[c0_]); 225 if (token != Token::ILLEGAL) { 226 int pos = source_pos(); 227 next_.token = token; 228 next_.location.beg_pos = pos; 229 next_.location.end_pos = pos + 1; 230 Advance(); 231 return current_.token; 232 } 233 } 234 Scan(); 235 return current_.token; 236} 237 238 239// TODO(yangguo): check whether this is actually necessary. 240static inline bool IsLittleEndianByteOrderMark(uc32 c) { 241 // The Unicode value U+FFFE is guaranteed never to be assigned as a 242 // Unicode character; this implies that in a Unicode context the 243 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF 244 // character expressed in little-endian byte order (since it could 245 // not be a U+FFFE character expressed in big-endian byte 246 // order). Nevertheless, we check for it to be compatible with 247 // Spidermonkey. 248 return c == 0xFFFE; 249} 250 251 252bool Scanner::SkipWhiteSpace() { 253 int start_position = source_pos(); 254 255 while (true) { 256 while (true) { 257 // Advance as long as character is a WhiteSpace or LineTerminator. 258 // Remember if the latter is the case. 259 if (unicode_cache_->IsLineTerminator(c0_)) { 260 has_line_terminator_before_next_ = true; 261 } else if (!unicode_cache_->IsWhiteSpace(c0_) && 262 !IsLittleEndianByteOrderMark(c0_)) { 263 break; 264 } 265 Advance(); 266 } 267 268 // If there is an HTML comment end '-->' at the beginning of a 269 // line (with only whitespace in front of it), we treat the rest 270 // of the line as a comment. This is in line with the way 271 // SpiderMonkey handles it. 272 if (c0_ == '-' && has_line_terminator_before_next_) { 273 Advance(); 274 if (c0_ == '-') { 275 Advance(); 276 if (c0_ == '>') { 277 // Treat the rest of the line as a comment. 278 SkipSingleLineComment(); 279 // Continue skipping white space after the comment. 280 continue; 281 } 282 PushBack('-'); // undo Advance() 283 } 284 PushBack('-'); // undo Advance() 285 } 286 // Return whether or not we skipped any characters. 287 return source_pos() != start_position; 288 } 289} 290 291 292Token::Value Scanner::SkipSingleLineComment() { 293 Advance(); 294 295 // The line terminator at the end of the line is not considered 296 // to be part of the single-line comment; it is recognized 297 // separately by the lexical grammar and becomes part of the 298 // stream of input elements for the syntactic grammar (see 299 // ECMA-262, section 7.4). 300 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) { 301 Advance(); 302 } 303 304 return Token::WHITESPACE; 305} 306 307 308Token::Value Scanner::SkipSourceURLComment() { 309 TryToParseSourceURLComment(); 310 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) { 311 Advance(); 312 } 313 314 return Token::WHITESPACE; 315} 316 317 318void Scanner::TryToParseSourceURLComment() { 319 // Magic comments are of the form: //[#@]\s<name>=\s*<value>\s*.* and this 320 // function will just return if it cannot parse a magic comment. 321 if (!unicode_cache_->IsWhiteSpace(c0_)) 322 return; 323 Advance(); 324 LiteralBuffer name; 325 while (c0_ >= 0 && !unicode_cache_->IsWhiteSpaceOrLineTerminator(c0_) && 326 c0_ != '=') { 327 name.AddChar(c0_); 328 Advance(); 329 } 330 if (!name.is_one_byte()) return; 331 Vector<const uint8_t> name_literal = name.one_byte_literal(); 332 LiteralBuffer* value; 333 if (name_literal == STATIC_CHAR_VECTOR("sourceURL")) { 334 value = &source_url_; 335 } else if (name_literal == STATIC_CHAR_VECTOR("sourceMappingURL")) { 336 value = &source_mapping_url_; 337 } else { 338 return; 339 } 340 if (c0_ != '=') 341 return; 342 Advance(); 343 value->Reset(); 344 while (c0_ >= 0 && unicode_cache_->IsWhiteSpace(c0_)) { 345 Advance(); 346 } 347 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) { 348 // Disallowed characters. 349 if (c0_ == '"' || c0_ == '\'') { 350 value->Reset(); 351 return; 352 } 353 if (unicode_cache_->IsWhiteSpace(c0_)) { 354 break; 355 } 356 value->AddChar(c0_); 357 Advance(); 358 } 359 // Allow whitespace at the end. 360 while (c0_ >= 0 && !unicode_cache_->IsLineTerminator(c0_)) { 361 if (!unicode_cache_->IsWhiteSpace(c0_)) { 362 value->Reset(); 363 break; 364 } 365 Advance(); 366 } 367} 368 369 370Token::Value Scanner::SkipMultiLineComment() { 371 DCHECK(c0_ == '*'); 372 Advance(); 373 374 while (c0_ >= 0) { 375 uc32 ch = c0_; 376 Advance(); 377 if (unicode_cache_->IsLineTerminator(ch)) { 378 // Following ECMA-262, section 7.4, a comment containing 379 // a newline will make the comment count as a line-terminator. 380 has_multiline_comment_before_next_ = true; 381 } 382 // If we have reached the end of the multi-line comment, we 383 // consume the '/' and insert a whitespace. This way all 384 // multi-line comments are treated as whitespace. 385 if (ch == '*' && c0_ == '/') { 386 c0_ = ' '; 387 return Token::WHITESPACE; 388 } 389 } 390 391 // Unterminated multi-line comment. 392 return Token::ILLEGAL; 393} 394 395 396Token::Value Scanner::ScanHtmlComment() { 397 // Check for <!-- comments. 398 DCHECK(c0_ == '!'); 399 Advance(); 400 if (c0_ == '-') { 401 Advance(); 402 if (c0_ == '-') return SkipSingleLineComment(); 403 PushBack('-'); // undo Advance() 404 } 405 PushBack('!'); // undo Advance() 406 DCHECK(c0_ == '!'); 407 return Token::LT; 408} 409 410 411void Scanner::Scan() { 412 next_.literal_chars = NULL; 413 Token::Value token; 414 do { 415 // Remember the position of the next token 416 next_.location.beg_pos = source_pos(); 417 418 switch (c0_) { 419 case ' ': 420 case '\t': 421 Advance(); 422 token = Token::WHITESPACE; 423 break; 424 425 case '\n': 426 Advance(); 427 has_line_terminator_before_next_ = true; 428 token = Token::WHITESPACE; 429 break; 430 431 case '"': case '\'': 432 token = ScanString(); 433 break; 434 435 case '<': 436 // < <= << <<= <!-- 437 Advance(); 438 if (c0_ == '=') { 439 token = Select(Token::LTE); 440 } else if (c0_ == '<') { 441 token = Select('=', Token::ASSIGN_SHL, Token::SHL); 442 } else if (c0_ == '!') { 443 token = ScanHtmlComment(); 444 } else { 445 token = Token::LT; 446 } 447 break; 448 449 case '>': 450 // > >= >> >>= >>> >>>= 451 Advance(); 452 if (c0_ == '=') { 453 token = Select(Token::GTE); 454 } else if (c0_ == '>') { 455 // >> >>= >>> >>>= 456 Advance(); 457 if (c0_ == '=') { 458 token = Select(Token::ASSIGN_SAR); 459 } else if (c0_ == '>') { 460 token = Select('=', Token::ASSIGN_SHR, Token::SHR); 461 } else { 462 token = Token::SAR; 463 } 464 } else { 465 token = Token::GT; 466 } 467 break; 468 469 case '=': 470 // = == === => 471 Advance(); 472 if (c0_ == '=') { 473 token = Select('=', Token::EQ_STRICT, Token::EQ); 474 } else if (c0_ == '>') { 475 token = Select(Token::ARROW); 476 } else { 477 token = Token::ASSIGN; 478 } 479 break; 480 481 case '!': 482 // ! != !== 483 Advance(); 484 if (c0_ == '=') { 485 token = Select('=', Token::NE_STRICT, Token::NE); 486 } else { 487 token = Token::NOT; 488 } 489 break; 490 491 case '+': 492 // + ++ += 493 Advance(); 494 if (c0_ == '+') { 495 token = Select(Token::INC); 496 } else if (c0_ == '=') { 497 token = Select(Token::ASSIGN_ADD); 498 } else { 499 token = Token::ADD; 500 } 501 break; 502 503 case '-': 504 // - -- --> -= 505 Advance(); 506 if (c0_ == '-') { 507 Advance(); 508 if (c0_ == '>' && has_line_terminator_before_next_) { 509 // For compatibility with SpiderMonkey, we skip lines that 510 // start with an HTML comment end '-->'. 511 token = SkipSingleLineComment(); 512 } else { 513 token = Token::DEC; 514 } 515 } else if (c0_ == '=') { 516 token = Select(Token::ASSIGN_SUB); 517 } else { 518 token = Token::SUB; 519 } 520 break; 521 522 case '*': 523 // * *= 524 token = Select('=', Token::ASSIGN_MUL, Token::MUL); 525 break; 526 527 case '%': 528 // % %= 529 token = Select('=', Token::ASSIGN_MOD, Token::MOD); 530 break; 531 532 case '/': 533 // / // /* /= 534 Advance(); 535 if (c0_ == '/') { 536 Advance(); 537 if (c0_ == '@' || c0_ == '#') { 538 Advance(); 539 token = SkipSourceURLComment(); 540 } else { 541 PushBack(c0_); 542 token = SkipSingleLineComment(); 543 } 544 } else if (c0_ == '*') { 545 token = SkipMultiLineComment(); 546 } else if (c0_ == '=') { 547 token = Select(Token::ASSIGN_DIV); 548 } else { 549 token = Token::DIV; 550 } 551 break; 552 553 case '&': 554 // & && &= 555 Advance(); 556 if (c0_ == '&') { 557 token = Select(Token::AND); 558 } else if (c0_ == '=') { 559 token = Select(Token::ASSIGN_BIT_AND); 560 } else { 561 token = Token::BIT_AND; 562 } 563 break; 564 565 case '|': 566 // | || |= 567 Advance(); 568 if (c0_ == '|') { 569 token = Select(Token::OR); 570 } else if (c0_ == '=') { 571 token = Select(Token::ASSIGN_BIT_OR); 572 } else { 573 token = Token::BIT_OR; 574 } 575 break; 576 577 case '^': 578 // ^ ^= 579 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR); 580 break; 581 582 case '.': 583 // . Number 584 Advance(); 585 if (IsDecimalDigit(c0_)) { 586 token = ScanNumber(true); 587 } else { 588 token = Token::PERIOD; 589 } 590 break; 591 592 case ':': 593 token = Select(Token::COLON); 594 break; 595 596 case ';': 597 token = Select(Token::SEMICOLON); 598 break; 599 600 case ',': 601 token = Select(Token::COMMA); 602 break; 603 604 case '(': 605 token = Select(Token::LPAREN); 606 break; 607 608 case ')': 609 token = Select(Token::RPAREN); 610 break; 611 612 case '[': 613 token = Select(Token::LBRACK); 614 break; 615 616 case ']': 617 token = Select(Token::RBRACK); 618 break; 619 620 case '{': 621 token = Select(Token::LBRACE); 622 break; 623 624 case '}': 625 token = Select(Token::RBRACE); 626 break; 627 628 case '?': 629 token = Select(Token::CONDITIONAL); 630 break; 631 632 case '~': 633 token = Select(Token::BIT_NOT); 634 break; 635 636 default: 637 if (unicode_cache_->IsIdentifierStart(c0_)) { 638 token = ScanIdentifierOrKeyword(); 639 } else if (IsDecimalDigit(c0_)) { 640 token = ScanNumber(false); 641 } else if (SkipWhiteSpace()) { 642 token = Token::WHITESPACE; 643 } else if (c0_ < 0) { 644 token = Token::EOS; 645 } else { 646 token = Select(Token::ILLEGAL); 647 } 648 break; 649 } 650 651 // Continue scanning for tokens as long as we're just skipping 652 // whitespace. 653 } while (token == Token::WHITESPACE); 654 655 next_.location.end_pos = source_pos(); 656 next_.token = token; 657} 658 659 660void Scanner::SeekForward(int pos) { 661 // After this call, we will have the token at the given position as 662 // the "next" token. The "current" token will be invalid. 663 if (pos == next_.location.beg_pos) return; 664 int current_pos = source_pos(); 665 DCHECK_EQ(next_.location.end_pos, current_pos); 666 // Positions inside the lookahead token aren't supported. 667 DCHECK(pos >= current_pos); 668 if (pos != current_pos) { 669 source_->SeekForward(pos - source_->pos()); 670 Advance(); 671 // This function is only called to seek to the location 672 // of the end of a function (at the "}" token). It doesn't matter 673 // whether there was a line terminator in the part we skip. 674 has_line_terminator_before_next_ = false; 675 has_multiline_comment_before_next_ = false; 676 } 677 Scan(); 678} 679 680 681bool Scanner::ScanEscape() { 682 uc32 c = c0_; 683 Advance(); 684 685 // Skip escaped newlines. 686 if (unicode_cache_->IsLineTerminator(c)) { 687 // Allow CR+LF newlines in multiline string literals. 688 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance(); 689 // Allow LF+CR newlines in multiline string literals. 690 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance(); 691 return true; 692 } 693 694 switch (c) { 695 case '\'': // fall through 696 case '"' : // fall through 697 case '\\': break; 698 case 'b' : c = '\b'; break; 699 case 'f' : c = '\f'; break; 700 case 'n' : c = '\n'; break; 701 case 'r' : c = '\r'; break; 702 case 't' : c = '\t'; break; 703 case 'u' : { 704 c = ScanHexNumber(4); 705 if (c < 0) return false; 706 break; 707 } 708 case 'v' : c = '\v'; break; 709 case 'x' : { 710 c = ScanHexNumber(2); 711 if (c < 0) return false; 712 break; 713 } 714 case '0' : // fall through 715 case '1' : // fall through 716 case '2' : // fall through 717 case '3' : // fall through 718 case '4' : // fall through 719 case '5' : // fall through 720 case '6' : // fall through 721 case '7' : c = ScanOctalEscape(c, 2); break; 722 } 723 724 // According to ECMA-262, section 7.8.4, characters not covered by the 725 // above cases should be illegal, but they are commonly handled as 726 // non-escaped characters by JS VMs. 727 AddLiteralChar(c); 728 return true; 729} 730 731 732// Octal escapes of the forms '\0xx' and '\xxx' are not a part of 733// ECMA-262. Other JS VMs support them. 734uc32 Scanner::ScanOctalEscape(uc32 c, int length) { 735 uc32 x = c - '0'; 736 int i = 0; 737 for (; i < length; i++) { 738 int d = c0_ - '0'; 739 if (d < 0 || d > 7) break; 740 int nx = x * 8 + d; 741 if (nx >= 256) break; 742 x = nx; 743 Advance(); 744 } 745 // Anything except '\0' is an octal escape sequence, illegal in strict mode. 746 // Remember the position of octal escape sequences so that an error 747 // can be reported later (in strict mode). 748 // We don't report the error immediately, because the octal escape can 749 // occur before the "use strict" directive. 750 if (c != '0' || i > 0) { 751 octal_pos_ = Location(source_pos() - i - 1, source_pos() - 1); 752 } 753 return x; 754} 755 756 757Token::Value Scanner::ScanString() { 758 uc32 quote = c0_; 759 Advance(); // consume quote 760 761 LiteralScope literal(this); 762 while (c0_ != quote && c0_ >= 0 763 && !unicode_cache_->IsLineTerminator(c0_)) { 764 uc32 c = c0_; 765 Advance(); 766 if (c == '\\') { 767 if (c0_ < 0 || !ScanEscape()) return Token::ILLEGAL; 768 } else { 769 AddLiteralChar(c); 770 } 771 } 772 if (c0_ != quote) return Token::ILLEGAL; 773 literal.Complete(); 774 775 Advance(); // consume quote 776 return Token::STRING; 777} 778 779 780void Scanner::ScanDecimalDigits() { 781 while (IsDecimalDigit(c0_)) 782 AddLiteralCharAdvance(); 783} 784 785 786Token::Value Scanner::ScanNumber(bool seen_period) { 787 DCHECK(IsDecimalDigit(c0_)); // the first digit of the number or the fraction 788 789 enum { DECIMAL, HEX, OCTAL, IMPLICIT_OCTAL, BINARY } kind = DECIMAL; 790 791 LiteralScope literal(this); 792 if (seen_period) { 793 // we have already seen a decimal point of the float 794 AddLiteralChar('.'); 795 ScanDecimalDigits(); // we know we have at least one digit 796 797 } else { 798 // if the first character is '0' we must check for octals and hex 799 if (c0_ == '0') { 800 int start_pos = source_pos(); // For reporting octal positions. 801 AddLiteralCharAdvance(); 802 803 // either 0, 0exxx, 0Exxx, 0.xxx, a hex number, a binary number or 804 // an octal number. 805 if (c0_ == 'x' || c0_ == 'X') { 806 // hex number 807 kind = HEX; 808 AddLiteralCharAdvance(); 809 if (!IsHexDigit(c0_)) { 810 // we must have at least one hex digit after 'x'/'X' 811 return Token::ILLEGAL; 812 } 813 while (IsHexDigit(c0_)) { 814 AddLiteralCharAdvance(); 815 } 816 } else if (harmony_numeric_literals_ && (c0_ == 'o' || c0_ == 'O')) { 817 kind = OCTAL; 818 AddLiteralCharAdvance(); 819 if (!IsOctalDigit(c0_)) { 820 // we must have at least one octal digit after 'o'/'O' 821 return Token::ILLEGAL; 822 } 823 while (IsOctalDigit(c0_)) { 824 AddLiteralCharAdvance(); 825 } 826 } else if (harmony_numeric_literals_ && (c0_ == 'b' || c0_ == 'B')) { 827 kind = BINARY; 828 AddLiteralCharAdvance(); 829 if (!IsBinaryDigit(c0_)) { 830 // we must have at least one binary digit after 'b'/'B' 831 return Token::ILLEGAL; 832 } 833 while (IsBinaryDigit(c0_)) { 834 AddLiteralCharAdvance(); 835 } 836 } else if ('0' <= c0_ && c0_ <= '7') { 837 // (possible) octal number 838 kind = IMPLICIT_OCTAL; 839 while (true) { 840 if (c0_ == '8' || c0_ == '9') { 841 kind = DECIMAL; 842 break; 843 } 844 if (c0_ < '0' || '7' < c0_) { 845 // Octal literal finished. 846 octal_pos_ = Location(start_pos, source_pos()); 847 break; 848 } 849 AddLiteralCharAdvance(); 850 } 851 } 852 } 853 854 // Parse decimal digits and allow trailing fractional part. 855 if (kind == DECIMAL) { 856 ScanDecimalDigits(); // optional 857 if (c0_ == '.') { 858 AddLiteralCharAdvance(); 859 ScanDecimalDigits(); // optional 860 } 861 } 862 } 863 864 // scan exponent, if any 865 if (c0_ == 'e' || c0_ == 'E') { 866 DCHECK(kind != HEX); // 'e'/'E' must be scanned as part of the hex number 867 if (kind != DECIMAL) return Token::ILLEGAL; 868 // scan exponent 869 AddLiteralCharAdvance(); 870 if (c0_ == '+' || c0_ == '-') 871 AddLiteralCharAdvance(); 872 if (!IsDecimalDigit(c0_)) { 873 // we must have at least one decimal digit after 'e'/'E' 874 return Token::ILLEGAL; 875 } 876 ScanDecimalDigits(); 877 } 878 879 // The source character immediately following a numeric literal must 880 // not be an identifier start or a decimal digit; see ECMA-262 881 // section 7.8.3, page 17 (note that we read only one decimal digit 882 // if the value is 0). 883 if (IsDecimalDigit(c0_) || unicode_cache_->IsIdentifierStart(c0_)) 884 return Token::ILLEGAL; 885 886 literal.Complete(); 887 888 return Token::NUMBER; 889} 890 891 892uc32 Scanner::ScanIdentifierUnicodeEscape() { 893 Advance(); 894 if (c0_ != 'u') return -1; 895 Advance(); 896 uc32 result = ScanHexNumber(4); 897 if (result < 0) PushBack('u'); 898 return result; 899} 900 901 902// ---------------------------------------------------------------------------- 903// Keyword Matcher 904 905#define KEYWORDS(KEYWORD_GROUP, KEYWORD) \ 906 KEYWORD_GROUP('b') \ 907 KEYWORD("break", Token::BREAK) \ 908 KEYWORD_GROUP('c') \ 909 KEYWORD("case", Token::CASE) \ 910 KEYWORD("catch", Token::CATCH) \ 911 KEYWORD("class", \ 912 harmony_classes ? Token::CLASS : Token::FUTURE_RESERVED_WORD) \ 913 KEYWORD("const", Token::CONST) \ 914 KEYWORD("continue", Token::CONTINUE) \ 915 KEYWORD_GROUP('d') \ 916 KEYWORD("debugger", Token::DEBUGGER) \ 917 KEYWORD("default", Token::DEFAULT) \ 918 KEYWORD("delete", Token::DELETE) \ 919 KEYWORD("do", Token::DO) \ 920 KEYWORD_GROUP('e') \ 921 KEYWORD("else", Token::ELSE) \ 922 KEYWORD("enum", Token::FUTURE_RESERVED_WORD) \ 923 KEYWORD("export", \ 924 harmony_modules ? Token::EXPORT : Token::FUTURE_RESERVED_WORD) \ 925 KEYWORD("extends", \ 926 harmony_classes ? Token::EXTENDS : Token::FUTURE_RESERVED_WORD) \ 927 KEYWORD_GROUP('f') \ 928 KEYWORD("false", Token::FALSE_LITERAL) \ 929 KEYWORD("finally", Token::FINALLY) \ 930 KEYWORD("for", Token::FOR) \ 931 KEYWORD("function", Token::FUNCTION) \ 932 KEYWORD_GROUP('i') \ 933 KEYWORD("if", Token::IF) \ 934 KEYWORD("implements", Token::FUTURE_STRICT_RESERVED_WORD) \ 935 KEYWORD("import", \ 936 harmony_modules ? Token::IMPORT : Token::FUTURE_RESERVED_WORD) \ 937 KEYWORD("in", Token::IN) \ 938 KEYWORD("instanceof", Token::INSTANCEOF) \ 939 KEYWORD("interface", Token::FUTURE_STRICT_RESERVED_WORD) \ 940 KEYWORD_GROUP('l') \ 941 KEYWORD("let", \ 942 harmony_scoping ? Token::LET : Token::FUTURE_STRICT_RESERVED_WORD) \ 943 KEYWORD_GROUP('n') \ 944 KEYWORD("new", Token::NEW) \ 945 KEYWORD("null", Token::NULL_LITERAL) \ 946 KEYWORD_GROUP('p') \ 947 KEYWORD("package", Token::FUTURE_STRICT_RESERVED_WORD) \ 948 KEYWORD("private", Token::FUTURE_STRICT_RESERVED_WORD) \ 949 KEYWORD("protected", Token::FUTURE_STRICT_RESERVED_WORD) \ 950 KEYWORD("public", Token::FUTURE_STRICT_RESERVED_WORD) \ 951 KEYWORD_GROUP('r') \ 952 KEYWORD("return", Token::RETURN) \ 953 KEYWORD_GROUP('s') \ 954 KEYWORD("static", harmony_classes ? Token::STATIC \ 955 : Token::FUTURE_STRICT_RESERVED_WORD) \ 956 KEYWORD("super", \ 957 harmony_classes ? Token::SUPER : Token::FUTURE_RESERVED_WORD) \ 958 KEYWORD("switch", Token::SWITCH) \ 959 KEYWORD_GROUP('t') \ 960 KEYWORD("this", Token::THIS) \ 961 KEYWORD("throw", Token::THROW) \ 962 KEYWORD("true", Token::TRUE_LITERAL) \ 963 KEYWORD("try", Token::TRY) \ 964 KEYWORD("typeof", Token::TYPEOF) \ 965 KEYWORD_GROUP('v') \ 966 KEYWORD("var", Token::VAR) \ 967 KEYWORD("void", Token::VOID) \ 968 KEYWORD_GROUP('w') \ 969 KEYWORD("while", Token::WHILE) \ 970 KEYWORD("with", Token::WITH) \ 971 KEYWORD_GROUP('y') \ 972 KEYWORD("yield", Token::YIELD) 973 974 975static Token::Value KeywordOrIdentifierToken(const uint8_t* input, 976 int input_length, 977 bool harmony_scoping, 978 bool harmony_modules, 979 bool harmony_classes) { 980 DCHECK(input_length >= 1); 981 const int kMinLength = 2; 982 const int kMaxLength = 10; 983 if (input_length < kMinLength || input_length > kMaxLength) { 984 return Token::IDENTIFIER; 985 } 986 switch (input[0]) { 987 default: 988#define KEYWORD_GROUP_CASE(ch) \ 989 break; \ 990 case ch: 991#define KEYWORD(keyword, token) \ 992 { \ 993 /* 'keyword' is a char array, so sizeof(keyword) is */ \ 994 /* strlen(keyword) plus 1 for the NUL char. */ \ 995 const int keyword_length = sizeof(keyword) - 1; \ 996 STATIC_ASSERT(keyword_length >= kMinLength); \ 997 STATIC_ASSERT(keyword_length <= kMaxLength); \ 998 if (input_length == keyword_length && \ 999 input[1] == keyword[1] && \ 1000 (keyword_length <= 2 || input[2] == keyword[2]) && \ 1001 (keyword_length <= 3 || input[3] == keyword[3]) && \ 1002 (keyword_length <= 4 || input[4] == keyword[4]) && \ 1003 (keyword_length <= 5 || input[5] == keyword[5]) && \ 1004 (keyword_length <= 6 || input[6] == keyword[6]) && \ 1005 (keyword_length <= 7 || input[7] == keyword[7]) && \ 1006 (keyword_length <= 8 || input[8] == keyword[8]) && \ 1007 (keyword_length <= 9 || input[9] == keyword[9])) { \ 1008 return token; \ 1009 } \ 1010 } 1011 KEYWORDS(KEYWORD_GROUP_CASE, KEYWORD) 1012 } 1013 return Token::IDENTIFIER; 1014} 1015 1016 1017bool Scanner::IdentifierIsFutureStrictReserved( 1018 const AstRawString* string) const { 1019 // Keywords are always 1-byte strings. 1020 return string->is_one_byte() && 1021 Token::FUTURE_STRICT_RESERVED_WORD == 1022 KeywordOrIdentifierToken(string->raw_data(), string->length(), 1023 harmony_scoping_, harmony_modules_, 1024 harmony_classes_); 1025} 1026 1027 1028Token::Value Scanner::ScanIdentifierOrKeyword() { 1029 DCHECK(unicode_cache_->IsIdentifierStart(c0_)); 1030 LiteralScope literal(this); 1031 // Scan identifier start character. 1032 if (c0_ == '\\') { 1033 uc32 c = ScanIdentifierUnicodeEscape(); 1034 // Only allow legal identifier start characters. 1035 if (c < 0 || 1036 c == '\\' || // No recursive escapes. 1037 !unicode_cache_->IsIdentifierStart(c)) { 1038 return Token::ILLEGAL; 1039 } 1040 AddLiteralChar(c); 1041 return ScanIdentifierSuffix(&literal); 1042 } 1043 1044 uc32 first_char = c0_; 1045 Advance(); 1046 AddLiteralChar(first_char); 1047 1048 // Scan the rest of the identifier characters. 1049 while (unicode_cache_->IsIdentifierPart(c0_)) { 1050 if (c0_ != '\\') { 1051 uc32 next_char = c0_; 1052 Advance(); 1053 AddLiteralChar(next_char); 1054 continue; 1055 } 1056 // Fallthrough if no longer able to complete keyword. 1057 return ScanIdentifierSuffix(&literal); 1058 } 1059 1060 literal.Complete(); 1061 1062 if (next_.literal_chars->is_one_byte()) { 1063 Vector<const uint8_t> chars = next_.literal_chars->one_byte_literal(); 1064 return KeywordOrIdentifierToken(chars.start(), 1065 chars.length(), 1066 harmony_scoping_, 1067 harmony_modules_, 1068 harmony_classes_); 1069 } 1070 1071 return Token::IDENTIFIER; 1072} 1073 1074 1075Token::Value Scanner::ScanIdentifierSuffix(LiteralScope* literal) { 1076 // Scan the rest of the identifier characters. 1077 while (unicode_cache_->IsIdentifierPart(c0_)) { 1078 if (c0_ == '\\') { 1079 uc32 c = ScanIdentifierUnicodeEscape(); 1080 // Only allow legal identifier part characters. 1081 if (c < 0 || 1082 c == '\\' || 1083 !unicode_cache_->IsIdentifierPart(c)) { 1084 return Token::ILLEGAL; 1085 } 1086 AddLiteralChar(c); 1087 } else { 1088 AddLiteralChar(c0_); 1089 Advance(); 1090 } 1091 } 1092 literal->Complete(); 1093 1094 return Token::IDENTIFIER; 1095} 1096 1097 1098bool Scanner::ScanRegExpPattern(bool seen_equal) { 1099 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags 1100 bool in_character_class = false; 1101 1102 // Previous token is either '/' or '/=', in the second case, the 1103 // pattern starts at =. 1104 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1); 1105 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0); 1106 1107 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5, 1108 // the scanner should pass uninterpreted bodies to the RegExp 1109 // constructor. 1110 LiteralScope literal(this); 1111 if (seen_equal) { 1112 AddLiteralChar('='); 1113 } 1114 1115 while (c0_ != '/' || in_character_class) { 1116 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false; 1117 if (c0_ == '\\') { // Escape sequence. 1118 AddLiteralCharAdvance(); 1119 if (unicode_cache_->IsLineTerminator(c0_) || c0_ < 0) return false; 1120 AddLiteralCharAdvance(); 1121 // If the escape allows more characters, i.e., \x??, \u????, or \c?, 1122 // only "safe" characters are allowed (letters, digits, underscore), 1123 // otherwise the escape isn't valid and the invalid character has 1124 // its normal meaning. I.e., we can just continue scanning without 1125 // worrying whether the following characters are part of the escape 1126 // or not, since any '/', '\\' or '[' is guaranteed to not be part 1127 // of the escape sequence. 1128 1129 // TODO(896): At some point, parse RegExps more throughly to capture 1130 // octal esacpes in strict mode. 1131 } else { // Unescaped character. 1132 if (c0_ == '[') in_character_class = true; 1133 if (c0_ == ']') in_character_class = false; 1134 AddLiteralCharAdvance(); 1135 } 1136 } 1137 Advance(); // consume '/' 1138 1139 literal.Complete(); 1140 1141 return true; 1142} 1143 1144 1145bool Scanner::ScanLiteralUnicodeEscape() { 1146 DCHECK(c0_ == '\\'); 1147 uc32 chars_read[6] = {'\\', 'u', 0, 0, 0, 0}; 1148 Advance(); 1149 int i = 1; 1150 if (c0_ == 'u') { 1151 i++; 1152 while (i < 6) { 1153 Advance(); 1154 if (!IsHexDigit(c0_)) break; 1155 chars_read[i] = c0_; 1156 i++; 1157 } 1158 } 1159 if (i < 6) { 1160 // Incomplete escape. Undo all advances and return false. 1161 while (i > 0) { 1162 i--; 1163 PushBack(chars_read[i]); 1164 } 1165 return false; 1166 } 1167 // Complete escape. Add all chars to current literal buffer. 1168 for (int i = 0; i < 6; i++) { 1169 AddLiteralChar(chars_read[i]); 1170 } 1171 return true; 1172} 1173 1174 1175bool Scanner::ScanRegExpFlags() { 1176 // Scan regular expression flags. 1177 LiteralScope literal(this); 1178 while (unicode_cache_->IsIdentifierPart(c0_)) { 1179 if (c0_ != '\\') { 1180 AddLiteralCharAdvance(); 1181 } else { 1182 if (!ScanLiteralUnicodeEscape()) { 1183 break; 1184 } 1185 Advance(); 1186 } 1187 } 1188 literal.Complete(); 1189 1190 next_.location.end_pos = source_pos() - 1; 1191 return true; 1192} 1193 1194 1195const AstRawString* Scanner::CurrentSymbol(AstValueFactory* ast_value_factory) { 1196 if (is_literal_one_byte()) { 1197 return ast_value_factory->GetOneByteString(literal_one_byte_string()); 1198 } 1199 return ast_value_factory->GetTwoByteString(literal_two_byte_string()); 1200} 1201 1202 1203const AstRawString* Scanner::NextSymbol(AstValueFactory* ast_value_factory) { 1204 if (is_next_literal_one_byte()) { 1205 return ast_value_factory->GetOneByteString(next_literal_one_byte_string()); 1206 } 1207 return ast_value_factory->GetTwoByteString(next_literal_two_byte_string()); 1208} 1209 1210 1211double Scanner::DoubleValue() { 1212 DCHECK(is_literal_one_byte()); 1213 return StringToDouble( 1214 unicode_cache_, 1215 literal_one_byte_string(), 1216 ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY); 1217} 1218 1219 1220int Scanner::FindNumber(DuplicateFinder* finder, int value) { 1221 return finder->AddNumber(literal_one_byte_string(), value); 1222} 1223 1224 1225int Scanner::FindSymbol(DuplicateFinder* finder, int value) { 1226 if (is_literal_one_byte()) { 1227 return finder->AddOneByteSymbol(literal_one_byte_string(), value); 1228 } 1229 return finder->AddTwoByteSymbol(literal_two_byte_string(), value); 1230} 1231 1232 1233int DuplicateFinder::AddOneByteSymbol(Vector<const uint8_t> key, int value) { 1234 return AddSymbol(key, true, value); 1235} 1236 1237 1238int DuplicateFinder::AddTwoByteSymbol(Vector<const uint16_t> key, int value) { 1239 return AddSymbol(Vector<const uint8_t>::cast(key), false, value); 1240} 1241 1242 1243int DuplicateFinder::AddSymbol(Vector<const uint8_t> key, 1244 bool is_one_byte, 1245 int value) { 1246 uint32_t hash = Hash(key, is_one_byte); 1247 byte* encoding = BackupKey(key, is_one_byte); 1248 HashMap::Entry* entry = map_.Lookup(encoding, hash, true); 1249 int old_value = static_cast<int>(reinterpret_cast<intptr_t>(entry->value)); 1250 entry->value = 1251 reinterpret_cast<void*>(static_cast<intptr_t>(value | old_value)); 1252 return old_value; 1253} 1254 1255 1256int DuplicateFinder::AddNumber(Vector<const uint8_t> key, int value) { 1257 DCHECK(key.length() > 0); 1258 // Quick check for already being in canonical form. 1259 if (IsNumberCanonical(key)) { 1260 return AddOneByteSymbol(key, value); 1261 } 1262 1263 int flags = ALLOW_HEX | ALLOW_OCTAL | ALLOW_IMPLICIT_OCTAL | ALLOW_BINARY; 1264 double double_value = StringToDouble( 1265 unicode_constants_, key, flags, 0.0); 1266 int length; 1267 const char* string; 1268 if (!std::isfinite(double_value)) { 1269 string = "Infinity"; 1270 length = 8; // strlen("Infinity"); 1271 } else { 1272 string = DoubleToCString(double_value, 1273 Vector<char>(number_buffer_, kBufferSize)); 1274 length = StrLength(string); 1275 } 1276 return AddSymbol(Vector<const byte>(reinterpret_cast<const byte*>(string), 1277 length), true, value); 1278} 1279 1280 1281bool DuplicateFinder::IsNumberCanonical(Vector<const uint8_t> number) { 1282 // Test for a safe approximation of number literals that are already 1283 // in canonical form: max 15 digits, no leading zeroes, except an 1284 // integer part that is a single zero, and no trailing zeros below 1285 // the decimal point. 1286 int pos = 0; 1287 int length = number.length(); 1288 if (number.length() > 15) return false; 1289 if (number[pos] == '0') { 1290 pos++; 1291 } else { 1292 while (pos < length && 1293 static_cast<unsigned>(number[pos] - '0') <= ('9' - '0')) pos++; 1294 } 1295 if (length == pos) return true; 1296 if (number[pos] != '.') return false; 1297 pos++; 1298 bool invalid_last_digit = true; 1299 while (pos < length) { 1300 uint8_t digit = number[pos] - '0'; 1301 if (digit > '9' - '0') return false; 1302 invalid_last_digit = (digit == 0); 1303 pos++; 1304 } 1305 return !invalid_last_digit; 1306} 1307 1308 1309uint32_t DuplicateFinder::Hash(Vector<const uint8_t> key, bool is_one_byte) { 1310 // Primitive hash function, almost identical to the one used 1311 // for strings (except that it's seeded by the length and representation). 1312 int length = key.length(); 1313 uint32_t hash = (length << 1) | (is_one_byte ? 1 : 0) ; 1314 for (int i = 0; i < length; i++) { 1315 uint32_t c = key[i]; 1316 hash = (hash + c) * 1025; 1317 hash ^= (hash >> 6); 1318 } 1319 return hash; 1320} 1321 1322 1323bool DuplicateFinder::Match(void* first, void* second) { 1324 // Decode lengths. 1325 // Length + representation is encoded as base 128, most significant heptet 1326 // first, with a 8th bit being non-zero while there are more heptets. 1327 // The value encodes the number of bytes following, and whether the original 1328 // was Latin1. 1329 byte* s1 = reinterpret_cast<byte*>(first); 1330 byte* s2 = reinterpret_cast<byte*>(second); 1331 uint32_t length_one_byte_field = 0; 1332 byte c1; 1333 do { 1334 c1 = *s1; 1335 if (c1 != *s2) return false; 1336 length_one_byte_field = (length_one_byte_field << 7) | (c1 & 0x7f); 1337 s1++; 1338 s2++; 1339 } while ((c1 & 0x80) != 0); 1340 int length = static_cast<int>(length_one_byte_field >> 1); 1341 return memcmp(s1, s2, length) == 0; 1342} 1343 1344 1345byte* DuplicateFinder::BackupKey(Vector<const uint8_t> bytes, 1346 bool is_one_byte) { 1347 uint32_t one_byte_length = (bytes.length() << 1) | (is_one_byte ? 1 : 0); 1348 backing_store_.StartSequence(); 1349 // Emit one_byte_length as base-128 encoded number, with the 7th bit set 1350 // on the byte of every heptet except the last, least significant, one. 1351 if (one_byte_length >= (1 << 7)) { 1352 if (one_byte_length >= (1 << 14)) { 1353 if (one_byte_length >= (1 << 21)) { 1354 if (one_byte_length >= (1 << 28)) { 1355 backing_store_.Add( 1356 static_cast<uint8_t>((one_byte_length >> 28) | 0x80)); 1357 } 1358 backing_store_.Add( 1359 static_cast<uint8_t>((one_byte_length >> 21) | 0x80u)); 1360 } 1361 backing_store_.Add( 1362 static_cast<uint8_t>((one_byte_length >> 14) | 0x80u)); 1363 } 1364 backing_store_.Add(static_cast<uint8_t>((one_byte_length >> 7) | 0x80u)); 1365 } 1366 backing_store_.Add(static_cast<uint8_t>(one_byte_length & 0x7f)); 1367 1368 backing_store_.AddBlock(bytes); 1369 return backing_store_.EndSequence().start(); 1370} 1371 1372} } // namespace v8::internal 1373