// scanner.cc revision 9ac36c9faca11611ada13b4054edbaa0738661d0
// Copyright 2006-2008 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 28#include "v8.h" 29 30#include "ast.h" 31#include "handles.h" 32#include "scanner.h" 33 34namespace v8 { 35namespace internal { 36 37// ---------------------------------------------------------------------------- 38// Character predicates 39 40 41unibrow::Predicate<IdentifierStart, 128> Scanner::kIsIdentifierStart; 42unibrow::Predicate<IdentifierPart, 128> Scanner::kIsIdentifierPart; 43unibrow::Predicate<unibrow::LineTerminator, 128> Scanner::kIsLineTerminator; 44unibrow::Predicate<unibrow::WhiteSpace, 128> Scanner::kIsWhiteSpace; 45 46 47StaticResource<Scanner::Utf8Decoder> Scanner::utf8_decoder_; 48 49 50// ---------------------------------------------------------------------------- 51// UTF8Buffer 52 53UTF8Buffer::UTF8Buffer() : buffer_(kInitialCapacity) { } 54 55 56UTF8Buffer::~UTF8Buffer() {} 57 58 59void UTF8Buffer::AddCharSlow(uc32 c) { 60 ASSERT(static_cast<unsigned>(c) > unibrow::Utf8::kMaxOneByteChar); 61 int length = unibrow::Utf8::Length(c); 62 Vector<char> block = buffer_.AddBlock(length, '\0'); 63#ifdef DEBUG 64 int written_length = unibrow::Utf8::Encode(block.start(), c); 65 CHECK_EQ(length, written_length); 66#else 67 unibrow::Utf8::Encode(block.start(), c); 68#endif 69} 70 71 72// ---------------------------------------------------------------------------- 73// UTF16Buffer 74 75 76UTF16Buffer::UTF16Buffer() 77 : pos_(0), end_(Scanner::kNoEndPosition) { } 78 79 80// CharacterStreamUTF16Buffer 81CharacterStreamUTF16Buffer::CharacterStreamUTF16Buffer() 82 : pushback_buffer_(0), last_(0), stream_(NULL) { } 83 84 85void CharacterStreamUTF16Buffer::Initialize(Handle<String> data, 86 unibrow::CharacterStream* input, 87 int start_position, 88 int end_position) { 89 stream_ = input; 90 if (start_position > 0) { 91 SeekForward(start_position); 92 } 93 end_ = end_position != Scanner::kNoEndPosition ? 
end_position : kMaxInt; 94} 95 96 97void CharacterStreamUTF16Buffer::PushBack(uc32 ch) { 98 pushback_buffer()->Add(last_); 99 last_ = ch; 100 pos_--; 101} 102 103 104uc32 CharacterStreamUTF16Buffer::Advance() { 105 ASSERT(end_ != Scanner::kNoEndPosition); 106 ASSERT(end_ >= 0); 107 // NOTE: It is of importance to Persian / Farsi resources that we do 108 // *not* strip format control characters in the scanner; see 109 // 110 // https://bugzilla.mozilla.org/show_bug.cgi?id=274152 111 // 112 // So, even though ECMA-262, section 7.1, page 11, dictates that we 113 // must remove Unicode format-control characters, we do not. This is 114 // in line with how IE and SpiderMonkey handles it. 115 if (!pushback_buffer()->is_empty()) { 116 pos_++; 117 return last_ = pushback_buffer()->RemoveLast(); 118 } else if (stream_->has_more() && pos_ < end_) { 119 pos_++; 120 uc32 next = stream_->GetNext(); 121 return last_ = next; 122 } else { 123 // Note: currently the following increment is necessary to avoid a 124 // test-parser problem! 125 pos_++; 126 return last_ = static_cast<uc32>(-1); 127 } 128} 129 130 131void CharacterStreamUTF16Buffer::SeekForward(int pos) { 132 pos_ = pos; 133 ASSERT(pushback_buffer()->is_empty()); 134 stream_->Seek(pos); 135} 136 137 138// ExternalStringUTF16Buffer 139template <typename StringType, typename CharType> 140ExternalStringUTF16Buffer<StringType, CharType>::ExternalStringUTF16Buffer() 141 : raw_data_(NULL) { } 142 143 144template <typename StringType, typename CharType> 145void ExternalStringUTF16Buffer<StringType, CharType>::Initialize( 146 Handle<StringType> data, 147 int start_position, 148 int end_position) { 149 ASSERT(!data.is_null()); 150 raw_data_ = data->resource()->data(); 151 152 ASSERT(end_position <= data->length()); 153 if (start_position > 0) { 154 SeekForward(start_position); 155 } 156 end_ = 157 end_position != Scanner::kNoEndPosition ? 
end_position : data->length(); 158} 159 160 161template <typename StringType, typename CharType> 162uc32 ExternalStringUTF16Buffer<StringType, CharType>::Advance() { 163 if (pos_ < end_) { 164 return raw_data_[pos_++]; 165 } else { 166 // note: currently the following increment is necessary to avoid a 167 // test-parser problem! 168 pos_++; 169 return static_cast<uc32>(-1); 170 } 171} 172 173 174template <typename StringType, typename CharType> 175void ExternalStringUTF16Buffer<StringType, CharType>::PushBack(uc32 ch) { 176 pos_--; 177 ASSERT(pos_ >= Scanner::kCharacterLookaheadBufferSize); 178 ASSERT(raw_data_[pos_ - Scanner::kCharacterLookaheadBufferSize] == ch); 179} 180 181 182template <typename StringType, typename CharType> 183void ExternalStringUTF16Buffer<StringType, CharType>::SeekForward(int pos) { 184 pos_ = pos; 185} 186 187 188// ---------------------------------------------------------------------------- 189// Keyword Matcher 190 191KeywordMatcher::FirstState KeywordMatcher::first_states_[] = { 192 { "break", KEYWORD_PREFIX, Token::BREAK }, 193 { NULL, C, Token::ILLEGAL }, 194 { NULL, D, Token::ILLEGAL }, 195 { "else", KEYWORD_PREFIX, Token::ELSE }, 196 { NULL, F, Token::ILLEGAL }, 197 { NULL, UNMATCHABLE, Token::ILLEGAL }, 198 { NULL, UNMATCHABLE, Token::ILLEGAL }, 199 { NULL, I, Token::ILLEGAL }, 200 { NULL, UNMATCHABLE, Token::ILLEGAL }, 201 { NULL, UNMATCHABLE, Token::ILLEGAL }, 202 { NULL, UNMATCHABLE, Token::ILLEGAL }, 203 { NULL, UNMATCHABLE, Token::ILLEGAL }, 204 { NULL, N, Token::ILLEGAL }, 205 { NULL, UNMATCHABLE, Token::ILLEGAL }, 206 { NULL, UNMATCHABLE, Token::ILLEGAL }, 207 { NULL, UNMATCHABLE, Token::ILLEGAL }, 208 { "return", KEYWORD_PREFIX, Token::RETURN }, 209 { "switch", KEYWORD_PREFIX, Token::SWITCH }, 210 { NULL, T, Token::ILLEGAL }, 211 { NULL, UNMATCHABLE, Token::ILLEGAL }, 212 { NULL, V, Token::ILLEGAL }, 213 { NULL, W, Token::ILLEGAL } 214}; 215 216 217void KeywordMatcher::Step(uc32 input) { 218 switch (state_) { 219 case 
INITIAL: { 220 // matching the first character is the only state with significant fanout. 221 // Match only lower-case letters in range 'b'..'w'. 222 unsigned int offset = input - kFirstCharRangeMin; 223 if (offset < kFirstCharRangeLength) { 224 state_ = first_states_[offset].state; 225 if (state_ == KEYWORD_PREFIX) { 226 keyword_ = first_states_[offset].keyword; 227 counter_ = 1; 228 keyword_token_ = first_states_[offset].token; 229 } 230 return; 231 } 232 break; 233 } 234 case KEYWORD_PREFIX: 235 if (keyword_[counter_] == input) { 236 ASSERT_NE(input, '\0'); 237 counter_++; 238 if (keyword_[counter_] == '\0') { 239 state_ = KEYWORD_MATCHED; 240 token_ = keyword_token_; 241 } 242 return; 243 } 244 break; 245 case KEYWORD_MATCHED: 246 token_ = Token::IDENTIFIER; 247 break; 248 case C: 249 if (MatchState(input, 'a', CA)) return; 250 if (MatchState(input, 'o', CO)) return; 251 break; 252 case CA: 253 if (MatchKeywordStart(input, "case", 2, Token::CASE)) return; 254 if (MatchKeywordStart(input, "catch", 2, Token::CATCH)) return; 255 break; 256 case CO: 257 if (MatchState(input, 'n', CON)) return; 258 break; 259 case CON: 260 if (MatchKeywordStart(input, "const", 3, Token::CONST)) return; 261 if (MatchKeywordStart(input, "continue", 3, Token::CONTINUE)) return; 262 break; 263 case D: 264 if (MatchState(input, 'e', DE)) return; 265 if (MatchKeyword(input, 'o', KEYWORD_MATCHED, Token::DO)) return; 266 break; 267 case DE: 268 if (MatchKeywordStart(input, "debugger", 2, Token::DEBUGGER)) return; 269 if (MatchKeywordStart(input, "default", 2, Token::DEFAULT)) return; 270 if (MatchKeywordStart(input, "delete", 2, Token::DELETE)) return; 271 break; 272 case F: 273 if (MatchKeywordStart(input, "false", 1, Token::FALSE_LITERAL)) return; 274 if (MatchKeywordStart(input, "finally", 1, Token::FINALLY)) return; 275 if (MatchKeywordStart(input, "for", 1, Token::FOR)) return; 276 if (MatchKeywordStart(input, "function", 1, Token::FUNCTION)) return; 277 break; 278 case I: 279 if 
(MatchKeyword(input, 'f', KEYWORD_MATCHED, Token::IF)) return; 280 if (MatchKeyword(input, 'n', IN, Token::IN)) return; 281 break; 282 case IN: 283 token_ = Token::IDENTIFIER; 284 if (MatchKeywordStart(input, "instanceof", 2, Token::INSTANCEOF)) { 285 return; 286 } 287 break; 288 case N: 289 if (MatchKeywordStart(input, "native", 1, Token::NATIVE)) return; 290 if (MatchKeywordStart(input, "new", 1, Token::NEW)) return; 291 if (MatchKeywordStart(input, "null", 1, Token::NULL_LITERAL)) return; 292 break; 293 case T: 294 if (MatchState(input, 'h', TH)) return; 295 if (MatchState(input, 'r', TR)) return; 296 if (MatchKeywordStart(input, "typeof", 1, Token::TYPEOF)) return; 297 break; 298 case TH: 299 if (MatchKeywordStart(input, "this", 2, Token::THIS)) return; 300 if (MatchKeywordStart(input, "throw", 2, Token::THROW)) return; 301 break; 302 case TR: 303 if (MatchKeywordStart(input, "true", 2, Token::TRUE_LITERAL)) return; 304 if (MatchKeyword(input, 'y', KEYWORD_MATCHED, Token::TRY)) return; 305 break; 306 case V: 307 if (MatchKeywordStart(input, "var", 1, Token::VAR)) return; 308 if (MatchKeywordStart(input, "void", 1, Token::VOID)) return; 309 break; 310 case W: 311 if (MatchKeywordStart(input, "while", 1, Token::WHILE)) return; 312 if (MatchKeywordStart(input, "with", 1, Token::WITH)) return; 313 break; 314 default: 315 UNREACHABLE(); 316 } 317 // On fallthrough, it's a failure. 
318 state_ = UNMATCHABLE; 319} 320 321 322 323// ---------------------------------------------------------------------------- 324// Scanner::LiteralScope 325 326Scanner::LiteralScope::LiteralScope(Scanner* self) 327 : scanner_(self), complete_(false) { 328 self->StartLiteral(); 329} 330 331 332Scanner::LiteralScope::~LiteralScope() { 333 if (!complete_) scanner_->DropLiteral(); 334} 335 336 337void Scanner::LiteralScope::Complete() { 338 scanner_->TerminateLiteral(); 339 complete_ = true; 340} 341 342// ---------------------------------------------------------------------------- 343// Scanner 344 345Scanner::Scanner(ParserMode pre) 346 : is_pre_parsing_(pre == PREPARSE), stack_overflow_(false) { } 347 348 349void Scanner::Initialize(Handle<String> source, 350 ParserLanguage language) { 351 Init(source, NULL, 0, source->length(), language); 352} 353 354 355void Scanner::Initialize(Handle<String> source, 356 unibrow::CharacterStream* stream, 357 ParserLanguage language) { 358 Init(source, stream, 0, kNoEndPosition, language); 359} 360 361 362void Scanner::Initialize(Handle<String> source, 363 int start_position, 364 int end_position, 365 ParserLanguage language) { 366 Init(source, NULL, start_position, end_position, language); 367} 368 369 370void Scanner::Init(Handle<String> source, 371 unibrow::CharacterStream* stream, 372 int start_position, 373 int end_position, 374 ParserLanguage language) { 375 // Either initialize the scanner from a character stream or from a 376 // string. 377 ASSERT(source.is_null() || stream == NULL); 378 379 // Initialize the source buffer. 
380 if (!source.is_null() && StringShape(*source).IsExternalTwoByte()) { 381 two_byte_string_buffer_.Initialize( 382 Handle<ExternalTwoByteString>::cast(source), 383 start_position, 384 end_position); 385 source_ = &two_byte_string_buffer_; 386 } else if (!source.is_null() && StringShape(*source).IsExternalAscii()) { 387 ascii_string_buffer_.Initialize( 388 Handle<ExternalAsciiString>::cast(source), 389 start_position, 390 end_position); 391 source_ = &ascii_string_buffer_; 392 } else { 393 if (!source.is_null()) { 394 safe_string_input_buffer_.Reset(source.location()); 395 stream = &safe_string_input_buffer_; 396 } 397 char_stream_buffer_.Initialize(source, 398 stream, 399 start_position, 400 end_position); 401 source_ = &char_stream_buffer_; 402 } 403 404 is_parsing_json_ = (language == JSON); 405 406 // Set c0_ (one character ahead) 407 ASSERT(kCharacterLookaheadBufferSize == 1); 408 Advance(); 409 // Initialize current_ to not refer to a literal. 410 current_.literal_chars = Vector<const char>(); 411 // Reset literal buffer. 412 literal_buffer_.Reset(); 413 414 // Skip initial whitespace allowing HTML comment ends just like 415 // after a newline and scan first token. 416 has_line_terminator_before_next_ = true; 417 SkipWhiteSpace(); 418 Scan(); 419} 420 421 422Token::Value Scanner::Next() { 423 // BUG 1215673: Find a thread safe way to set a stack limit in 424 // pre-parse mode. Otherwise, we cannot safely pre-parse from other 425 // threads. 426 current_ = next_; 427 // Check for stack-overflow before returning any tokens. 
428 StackLimitCheck check; 429 if (check.HasOverflowed()) { 430 stack_overflow_ = true; 431 next_.token = Token::ILLEGAL; 432 } else { 433 has_line_terminator_before_next_ = false; 434 Scan(); 435 } 436 return current_.token; 437} 438 439 440void Scanner::StartLiteral() { 441 literal_buffer_.StartLiteral(); 442} 443 444 445void Scanner::AddChar(uc32 c) { 446 literal_buffer_.AddChar(c); 447} 448 449 450void Scanner::TerminateLiteral() { 451 next_.literal_chars = literal_buffer_.EndLiteral(); 452} 453 454 455void Scanner::DropLiteral() { 456 literal_buffer_.DropLiteral(); 457} 458 459 460void Scanner::AddCharAdvance() { 461 AddChar(c0_); 462 Advance(); 463} 464 465 466static inline bool IsByteOrderMark(uc32 c) { 467 // The Unicode value U+FFFE is guaranteed never to be assigned as a 468 // Unicode character; this implies that in a Unicode context the 469 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF 470 // character expressed in little-endian byte order (since it could 471 // not be a U+FFFE character expressed in big-endian byte 472 // order). Nevertheless, we check for it to be compatible with 473 // Spidermonkey. 474 return c == 0xFEFF || c == 0xFFFE; 475} 476 477 478bool Scanner::SkipJsonWhiteSpace() { 479 int start_position = source_pos(); 480 // JSON WhiteSpace is tab, carrige-return, newline and space. 481 while (c0_ == ' ' || c0_ == '\n' || c0_ == '\r' || c0_ == '\t') { 482 Advance(); 483 } 484 return source_pos() != start_position; 485} 486 487 488bool Scanner::SkipJavaScriptWhiteSpace() { 489 int start_position = source_pos(); 490 491 while (true) { 492 // We treat byte-order marks (BOMs) as whitespace for better 493 // compatibility with Spidermonkey and other JavaScript engines. 494 while (kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) { 495 // IsWhiteSpace() includes line terminators! 496 if (kIsLineTerminator.get(c0_)) { 497 // Ignore line terminators, but remember them. This is necessary 498 // for automatic semicolon insertion. 
499 has_line_terminator_before_next_ = true; 500 } 501 Advance(); 502 } 503 504 // If there is an HTML comment end '-->' at the beginning of a 505 // line (with only whitespace in front of it), we treat the rest 506 // of the line as a comment. This is in line with the way 507 // SpiderMonkey handles it. 508 if (c0_ == '-' && has_line_terminator_before_next_) { 509 Advance(); 510 if (c0_ == '-') { 511 Advance(); 512 if (c0_ == '>') { 513 // Treat the rest of the line as a comment. 514 SkipSingleLineComment(); 515 // Continue skipping white space after the comment. 516 continue; 517 } 518 PushBack('-'); // undo Advance() 519 } 520 PushBack('-'); // undo Advance() 521 } 522 // Return whether or not we skipped any characters. 523 return source_pos() != start_position; 524 } 525} 526 527 528Token::Value Scanner::SkipSingleLineComment() { 529 Advance(); 530 531 // The line terminator at the end of the line is not considered 532 // to be part of the single-line comment; it is recognized 533 // separately by the lexical grammar and becomes part of the 534 // stream of input elements for the syntactic grammar (see 535 // ECMA-262, section 7.4, page 12). 536 while (c0_ >= 0 && !kIsLineTerminator.get(c0_)) { 537 Advance(); 538 } 539 540 return Token::WHITESPACE; 541} 542 543 544Token::Value Scanner::SkipMultiLineComment() { 545 ASSERT(c0_ == '*'); 546 Advance(); 547 548 while (c0_ >= 0) { 549 char ch = c0_; 550 Advance(); 551 // If we have reached the end of the multi-line comment, we 552 // consume the '/' and insert a whitespace. This way all 553 // multi-line comments are treated as whitespace - even the ones 554 // containing line terminators. This contradicts ECMA-262, section 555 // 7.4, page 12, that says that multi-line comments containing 556 // line terminators should be treated as a line terminator, but it 557 // matches the behaviour of SpiderMonkey and KJS. 
558 if (ch == '*' && c0_ == '/') { 559 c0_ = ' '; 560 return Token::WHITESPACE; 561 } 562 } 563 564 // Unterminated multi-line comment. 565 return Token::ILLEGAL; 566} 567 568 569Token::Value Scanner::ScanHtmlComment() { 570 // Check for <!-- comments. 571 ASSERT(c0_ == '!'); 572 Advance(); 573 if (c0_ == '-') { 574 Advance(); 575 if (c0_ == '-') return SkipSingleLineComment(); 576 PushBack('-'); // undo Advance() 577 } 578 PushBack('!'); // undo Advance() 579 ASSERT(c0_ == '!'); 580 return Token::LT; 581} 582 583 584 585void Scanner::ScanJson() { 586 next_.literal_chars = Vector<const char>(); 587 Token::Value token; 588 has_line_terminator_before_next_ = false; 589 do { 590 // Remember the position of the next token 591 next_.location.beg_pos = source_pos(); 592 switch (c0_) { 593 case '\t': 594 case '\r': 595 case '\n': 596 case ' ': 597 Advance(); 598 token = Token::WHITESPACE; 599 break; 600 case '{': 601 Advance(); 602 token = Token::LBRACE; 603 break; 604 case '}': 605 Advance(); 606 token = Token::RBRACE; 607 break; 608 case '[': 609 Advance(); 610 token = Token::LBRACK; 611 break; 612 case ']': 613 Advance(); 614 token = Token::RBRACK; 615 break; 616 case ':': 617 Advance(); 618 token = Token::COLON; 619 break; 620 case ',': 621 Advance(); 622 token = Token::COMMA; 623 break; 624 case '"': 625 token = ScanJsonString(); 626 break; 627 case '-': 628 case '0': 629 case '1': 630 case '2': 631 case '3': 632 case '4': 633 case '5': 634 case '6': 635 case '7': 636 case '8': 637 case '9': 638 token = ScanJsonNumber(); 639 break; 640 case 't': 641 token = ScanJsonIdentifier("true", Token::TRUE_LITERAL); 642 break; 643 case 'f': 644 token = ScanJsonIdentifier("false", Token::FALSE_LITERAL); 645 break; 646 case 'n': 647 token = ScanJsonIdentifier("null", Token::NULL_LITERAL); 648 break; 649 default: 650 if (c0_ < 0) { 651 Advance(); 652 token = Token::EOS; 653 } else { 654 Advance(); 655 token = Select(Token::ILLEGAL); 656 } 657 } 658 } while (token == 
Token::WHITESPACE); 659 660 next_.location.end_pos = source_pos(); 661 next_.token = token; 662} 663 664 665Token::Value Scanner::ScanJsonString() { 666 ASSERT_EQ('"', c0_); 667 Advance(); 668 LiteralScope literal(this); 669 while (c0_ != '"' && c0_ > 0) { 670 // Check for control character (0x00-0x1f) or unterminated string (<0). 671 if (c0_ < 0x20) return Token::ILLEGAL; 672 if (c0_ != '\\') { 673 AddCharAdvance(); 674 } else { 675 Advance(); 676 switch (c0_) { 677 case '"': 678 case '\\': 679 case '/': 680 AddChar(c0_); 681 break; 682 case 'b': 683 AddChar('\x08'); 684 break; 685 case 'f': 686 AddChar('\x0c'); 687 break; 688 case 'n': 689 AddChar('\x0a'); 690 break; 691 case 'r': 692 AddChar('\x0d'); 693 break; 694 case 't': 695 AddChar('\x09'); 696 break; 697 case 'u': { 698 uc32 value = 0; 699 for (int i = 0; i < 4; i++) { 700 Advance(); 701 int digit = HexValue(c0_); 702 if (digit < 0) { 703 return Token::ILLEGAL; 704 } 705 value = value * 16 + digit; 706 } 707 AddChar(value); 708 break; 709 } 710 default: 711 return Token::ILLEGAL; 712 } 713 Advance(); 714 } 715 } 716 if (c0_ != '"') { 717 return Token::ILLEGAL; 718 } 719 literal.Complete(); 720 Advance(); 721 return Token::STRING; 722} 723 724 725Token::Value Scanner::ScanJsonNumber() { 726 LiteralScope literal(this); 727 if (c0_ == '-') AddCharAdvance(); 728 if (c0_ == '0') { 729 AddCharAdvance(); 730 // Prefix zero is only allowed if it's the only digit before 731 // a decimal point or exponent. 
732 if ('0' <= c0_ && c0_ <= '9') return Token::ILLEGAL; 733 } else { 734 if (c0_ < '1' || c0_ > '9') return Token::ILLEGAL; 735 do { 736 AddCharAdvance(); 737 } while (c0_ >= '0' && c0_ <= '9'); 738 } 739 if (c0_ == '.') { 740 AddCharAdvance(); 741 if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL; 742 do { 743 AddCharAdvance(); 744 } while (c0_ >= '0' && c0_ <= '9'); 745 } 746 if (AsciiAlphaToLower(c0_) == 'e') { 747 AddCharAdvance(); 748 if (c0_ == '-' || c0_ == '+') AddCharAdvance(); 749 if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL; 750 do { 751 AddCharAdvance(); 752 } while (c0_ >= '0' && c0_ <= '9'); 753 } 754 literal.Complete(); 755 return Token::NUMBER; 756} 757 758 759Token::Value Scanner::ScanJsonIdentifier(const char* text, 760 Token::Value token) { 761 LiteralScope literal(this); 762 while (*text != '\0') { 763 if (c0_ != *text) return Token::ILLEGAL; 764 Advance(); 765 text++; 766 } 767 if (kIsIdentifierPart.get(c0_)) return Token::ILLEGAL; 768 literal.Complete(); 769 return token; 770} 771 772 773void Scanner::ScanJavaScript() { 774 next_.literal_chars = Vector<const char>(); 775 Token::Value token; 776 do { 777 // Remember the position of the next token 778 next_.location.beg_pos = source_pos(); 779 780 switch (c0_) { 781 case ' ': 782 case '\t': 783 Advance(); 784 token = Token::WHITESPACE; 785 break; 786 787 case '\n': 788 Advance(); 789 has_line_terminator_before_next_ = true; 790 token = Token::WHITESPACE; 791 break; 792 793 case '"': case '\'': 794 token = ScanString(); 795 break; 796 797 case '<': 798 // < <= << <<= <!-- 799 Advance(); 800 if (c0_ == '=') { 801 token = Select(Token::LTE); 802 } else if (c0_ == '<') { 803 token = Select('=', Token::ASSIGN_SHL, Token::SHL); 804 } else if (c0_ == '!') { 805 token = ScanHtmlComment(); 806 } else { 807 token = Token::LT; 808 } 809 break; 810 811 case '>': 812 // > >= >> >>= >>> >>>= 813 Advance(); 814 if (c0_ == '=') { 815 token = Select(Token::GTE); 816 } else if (c0_ == '>') { 817 // >> >>= 
>>> >>>= 818 Advance(); 819 if (c0_ == '=') { 820 token = Select(Token::ASSIGN_SAR); 821 } else if (c0_ == '>') { 822 token = Select('=', Token::ASSIGN_SHR, Token::SHR); 823 } else { 824 token = Token::SAR; 825 } 826 } else { 827 token = Token::GT; 828 } 829 break; 830 831 case '=': 832 // = == === 833 Advance(); 834 if (c0_ == '=') { 835 token = Select('=', Token::EQ_STRICT, Token::EQ); 836 } else { 837 token = Token::ASSIGN; 838 } 839 break; 840 841 case '!': 842 // ! != !== 843 Advance(); 844 if (c0_ == '=') { 845 token = Select('=', Token::NE_STRICT, Token::NE); 846 } else { 847 token = Token::NOT; 848 } 849 break; 850 851 case '+': 852 // + ++ += 853 Advance(); 854 if (c0_ == '+') { 855 token = Select(Token::INC); 856 } else if (c0_ == '=') { 857 token = Select(Token::ASSIGN_ADD); 858 } else { 859 token = Token::ADD; 860 } 861 break; 862 863 case '-': 864 // - -- --> -= 865 Advance(); 866 if (c0_ == '-') { 867 Advance(); 868 if (c0_ == '>' && has_line_terminator_before_next_) { 869 // For compatibility with SpiderMonkey, we skip lines that 870 // start with an HTML comment end '-->'. 
871 token = SkipSingleLineComment(); 872 } else { 873 token = Token::DEC; 874 } 875 } else if (c0_ == '=') { 876 token = Select(Token::ASSIGN_SUB); 877 } else { 878 token = Token::SUB; 879 } 880 break; 881 882 case '*': 883 // * *= 884 token = Select('=', Token::ASSIGN_MUL, Token::MUL); 885 break; 886 887 case '%': 888 // % %= 889 token = Select('=', Token::ASSIGN_MOD, Token::MOD); 890 break; 891 892 case '/': 893 // / // /* /= 894 Advance(); 895 if (c0_ == '/') { 896 token = SkipSingleLineComment(); 897 } else if (c0_ == '*') { 898 token = SkipMultiLineComment(); 899 } else if (c0_ == '=') { 900 token = Select(Token::ASSIGN_DIV); 901 } else { 902 token = Token::DIV; 903 } 904 break; 905 906 case '&': 907 // & && &= 908 Advance(); 909 if (c0_ == '&') { 910 token = Select(Token::AND); 911 } else if (c0_ == '=') { 912 token = Select(Token::ASSIGN_BIT_AND); 913 } else { 914 token = Token::BIT_AND; 915 } 916 break; 917 918 case '|': 919 // | || |= 920 Advance(); 921 if (c0_ == '|') { 922 token = Select(Token::OR); 923 } else if (c0_ == '=') { 924 token = Select(Token::ASSIGN_BIT_OR); 925 } else { 926 token = Token::BIT_OR; 927 } 928 break; 929 930 case '^': 931 // ^ ^= 932 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR); 933 break; 934 935 case '.': 936 // . 
Number 937 Advance(); 938 if (IsDecimalDigit(c0_)) { 939 token = ScanNumber(true); 940 } else { 941 token = Token::PERIOD; 942 } 943 break; 944 945 case ':': 946 token = Select(Token::COLON); 947 break; 948 949 case ';': 950 token = Select(Token::SEMICOLON); 951 break; 952 953 case ',': 954 token = Select(Token::COMMA); 955 break; 956 957 case '(': 958 token = Select(Token::LPAREN); 959 break; 960 961 case ')': 962 token = Select(Token::RPAREN); 963 break; 964 965 case '[': 966 token = Select(Token::LBRACK); 967 break; 968 969 case ']': 970 token = Select(Token::RBRACK); 971 break; 972 973 case '{': 974 token = Select(Token::LBRACE); 975 break; 976 977 case '}': 978 token = Select(Token::RBRACE); 979 break; 980 981 case '?': 982 token = Select(Token::CONDITIONAL); 983 break; 984 985 case '~': 986 token = Select(Token::BIT_NOT); 987 break; 988 989 default: 990 if (kIsIdentifierStart.get(c0_)) { 991 token = ScanIdentifier(); 992 } else if (IsDecimalDigit(c0_)) { 993 token = ScanNumber(false); 994 } else if (SkipWhiteSpace()) { 995 token = Token::WHITESPACE; 996 } else if (c0_ < 0) { 997 token = Token::EOS; 998 } else { 999 token = Select(Token::ILLEGAL); 1000 } 1001 break; 1002 } 1003 1004 // Continue scanning for tokens as long as we're just skipping 1005 // whitespace. 1006 } while (token == Token::WHITESPACE); 1007 1008 next_.location.end_pos = source_pos(); 1009 next_.token = token; 1010} 1011 1012 1013void Scanner::SeekForward(int pos) { 1014 source_->SeekForward(pos - 1); 1015 Advance(); 1016 // This function is only called to seek to the location 1017 // of the end of a function (at the "}" token). It doesn't matter 1018 // whether there was a line terminator in the part we skip. 
1019 has_line_terminator_before_next_ = false; 1020 Scan(); 1021} 1022 1023 1024uc32 Scanner::ScanHexEscape(uc32 c, int length) { 1025 ASSERT(length <= 4); // prevent overflow 1026 1027 uc32 digits[4]; 1028 uc32 x = 0; 1029 for (int i = 0; i < length; i++) { 1030 digits[i] = c0_; 1031 int d = HexValue(c0_); 1032 if (d < 0) { 1033 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes 1034 // should be illegal, but other JS VMs just return the 1035 // non-escaped version of the original character. 1036 1037 // Push back digits read, except the last one (in c0_). 1038 for (int j = i-1; j >= 0; j--) { 1039 PushBack(digits[j]); 1040 } 1041 // Notice: No handling of error - treat it as "\u"->"u". 1042 return c; 1043 } 1044 x = x * 16 + d; 1045 Advance(); 1046 } 1047 1048 return x; 1049} 1050 1051 1052// Octal escapes of the forms '\0xx' and '\xxx' are not a part of 1053// ECMA-262. Other JS VMs support them. 1054uc32 Scanner::ScanOctalEscape(uc32 c, int length) { 1055 uc32 x = c - '0'; 1056 for (int i = 0; i < length; i++) { 1057 int d = c0_ - '0'; 1058 if (d < 0 || d > 7) break; 1059 int nx = x * 8 + d; 1060 if (nx >= 256) break; 1061 x = nx; 1062 Advance(); 1063 } 1064 return x; 1065} 1066 1067 1068void Scanner::ScanEscape() { 1069 uc32 c = c0_; 1070 Advance(); 1071 1072 // Skip escaped newlines. 1073 if (kIsLineTerminator.get(c)) { 1074 // Allow CR+LF newlines in multiline string literals. 1075 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance(); 1076 // Allow LF+CR newlines in multiline string literals. 
1077 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance(); 1078 return; 1079 } 1080 1081 switch (c) { 1082 case '\'': // fall through 1083 case '"' : // fall through 1084 case '\\': break; 1085 case 'b' : c = '\b'; break; 1086 case 'f' : c = '\f'; break; 1087 case 'n' : c = '\n'; break; 1088 case 'r' : c = '\r'; break; 1089 case 't' : c = '\t'; break; 1090 case 'u' : c = ScanHexEscape(c, 4); break; 1091 case 'v' : c = '\v'; break; 1092 case 'x' : c = ScanHexEscape(c, 2); break; 1093 case '0' : // fall through 1094 case '1' : // fall through 1095 case '2' : // fall through 1096 case '3' : // fall through 1097 case '4' : // fall through 1098 case '5' : // fall through 1099 case '6' : // fall through 1100 case '7' : c = ScanOctalEscape(c, 2); break; 1101 } 1102 1103 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these 1104 // should be illegal, but they are commonly handled 1105 // as non-escaped characters by JS VMs. 1106 AddChar(c); 1107} 1108 1109 1110Token::Value Scanner::ScanString() { 1111 uc32 quote = c0_; 1112 Advance(); // consume quote 1113 1114 LiteralScope literal(this); 1115 while (c0_ != quote && c0_ >= 0 && !kIsLineTerminator.get(c0_)) { 1116 uc32 c = c0_; 1117 Advance(); 1118 if (c == '\\') { 1119 if (c0_ < 0) return Token::ILLEGAL; 1120 ScanEscape(); 1121 } else { 1122 AddChar(c); 1123 } 1124 } 1125 if (c0_ != quote) return Token::ILLEGAL; 1126 literal.Complete(); 1127 1128 Advance(); // consume quote 1129 return Token::STRING; 1130} 1131 1132 1133Token::Value Scanner::Select(Token::Value tok) { 1134 Advance(); 1135 return tok; 1136} 1137 1138 1139Token::Value Scanner::Select(uc32 next, Token::Value then, Token::Value else_) { 1140 Advance(); 1141 if (c0_ == next) { 1142 Advance(); 1143 return then; 1144 } else { 1145 return else_; 1146 } 1147} 1148 1149 1150// Returns true if any decimal digits were scanned, returns false otherwise. 
1151void Scanner::ScanDecimalDigits() { 1152 while (IsDecimalDigit(c0_)) 1153 AddCharAdvance(); 1154} 1155 1156 1157Token::Value Scanner::ScanNumber(bool seen_period) { 1158 ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction 1159 1160 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL; 1161 1162 LiteralScope literal(this); 1163 if (seen_period) { 1164 // we have already seen a decimal point of the float 1165 AddChar('.'); 1166 ScanDecimalDigits(); // we know we have at least one digit 1167 1168 } else { 1169 // if the first character is '0' we must check for octals and hex 1170 if (c0_ == '0') { 1171 AddCharAdvance(); 1172 1173 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number 1174 if (c0_ == 'x' || c0_ == 'X') { 1175 // hex number 1176 kind = HEX; 1177 AddCharAdvance(); 1178 if (!IsHexDigit(c0_)) { 1179 // we must have at least one hex digit after 'x'/'X' 1180 return Token::ILLEGAL; 1181 } 1182 while (IsHexDigit(c0_)) { 1183 AddCharAdvance(); 1184 } 1185 } else if ('0' <= c0_ && c0_ <= '7') { 1186 // (possible) octal number 1187 kind = OCTAL; 1188 while (true) { 1189 if (c0_ == '8' || c0_ == '9') { 1190 kind = DECIMAL; 1191 break; 1192 } 1193 if (c0_ < '0' || '7' < c0_) break; 1194 AddCharAdvance(); 1195 } 1196 } 1197 } 1198 1199 // Parse decimal digits and allow trailing fractional part. 
1200 if (kind == DECIMAL) { 1201 ScanDecimalDigits(); // optional 1202 if (c0_ == '.') { 1203 AddCharAdvance(); 1204 ScanDecimalDigits(); // optional 1205 } 1206 } 1207 } 1208 1209 // scan exponent, if any 1210 if (c0_ == 'e' || c0_ == 'E') { 1211 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number 1212 if (kind == OCTAL) return Token::ILLEGAL; // no exponent for octals allowed 1213 // scan exponent 1214 AddCharAdvance(); 1215 if (c0_ == '+' || c0_ == '-') 1216 AddCharAdvance(); 1217 if (!IsDecimalDigit(c0_)) { 1218 // we must have at least one decimal digit after 'e'/'E' 1219 return Token::ILLEGAL; 1220 } 1221 ScanDecimalDigits(); 1222 } 1223 1224 // The source character immediately following a numeric literal must 1225 // not be an identifier start or a decimal digit; see ECMA-262 1226 // section 7.8.3, page 17 (note that we read only one decimal digit 1227 // if the value is 0). 1228 if (IsDecimalDigit(c0_) || kIsIdentifierStart.get(c0_)) 1229 return Token::ILLEGAL; 1230 1231 literal.Complete(); 1232 1233 return Token::NUMBER; 1234} 1235 1236 1237uc32 Scanner::ScanIdentifierUnicodeEscape() { 1238 Advance(); 1239 if (c0_ != 'u') return unibrow::Utf8::kBadChar; 1240 Advance(); 1241 uc32 c = ScanHexEscape('u', 4); 1242 // We do not allow a unicode escape sequence to start another 1243 // unicode escape sequence. 1244 if (c == '\\') return unibrow::Utf8::kBadChar; 1245 return c; 1246} 1247 1248 1249Token::Value Scanner::ScanIdentifier() { 1250 ASSERT(kIsIdentifierStart.get(c0_)); 1251 1252 LiteralScope literal(this); 1253 KeywordMatcher keyword_match; 1254 1255 // Scan identifier start character. 1256 if (c0_ == '\\') { 1257 uc32 c = ScanIdentifierUnicodeEscape(); 1258 // Only allow legal identifier start characters. 
1259 if (!kIsIdentifierStart.get(c)) return Token::ILLEGAL; 1260 AddChar(c); 1261 keyword_match.Fail(); 1262 } else { 1263 AddChar(c0_); 1264 keyword_match.AddChar(c0_); 1265 Advance(); 1266 } 1267 1268 // Scan the rest of the identifier characters. 1269 while (kIsIdentifierPart.get(c0_)) { 1270 if (c0_ == '\\') { 1271 uc32 c = ScanIdentifierUnicodeEscape(); 1272 // Only allow legal identifier part characters. 1273 if (!kIsIdentifierPart.get(c)) return Token::ILLEGAL; 1274 AddChar(c); 1275 keyword_match.Fail(); 1276 } else { 1277 AddChar(c0_); 1278 keyword_match.AddChar(c0_); 1279 Advance(); 1280 } 1281 } 1282 literal.Complete(); 1283 1284 return keyword_match.token(); 1285} 1286 1287 1288 1289bool Scanner::IsIdentifier(unibrow::CharacterStream* buffer) { 1290 // Checks whether the buffer contains an identifier (no escape). 1291 if (!buffer->has_more()) return false; 1292 if (!kIsIdentifierStart.get(buffer->GetNext())) return false; 1293 while (buffer->has_more()) { 1294 if (!kIsIdentifierPart.get(buffer->GetNext())) return false; 1295 } 1296 return true; 1297} 1298 1299 1300bool Scanner::ScanRegExpPattern(bool seen_equal) { 1301 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags 1302 bool in_character_class = false; 1303 1304 // Previous token is either '/' or '/=', in the second case, the 1305 // pattern starts at =. 1306 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1); 1307 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0); 1308 1309 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5, 1310 // the scanner should pass uninterpreted bodies to the RegExp 1311 // constructor. 
1312 LiteralScope literal(this); 1313 if (seen_equal) 1314 AddChar('='); 1315 1316 while (c0_ != '/' || in_character_class) { 1317 if (kIsLineTerminator.get(c0_) || c0_ < 0) return false; 1318 if (c0_ == '\\') { // escaped character 1319 AddCharAdvance(); 1320 if (kIsLineTerminator.get(c0_) || c0_ < 0) return false; 1321 AddCharAdvance(); 1322 } else { // unescaped character 1323 if (c0_ == '[') in_character_class = true; 1324 if (c0_ == ']') in_character_class = false; 1325 AddCharAdvance(); 1326 } 1327 } 1328 Advance(); // consume '/' 1329 1330 literal.Complete(); 1331 1332 return true; 1333} 1334 1335bool Scanner::ScanRegExpFlags() { 1336 // Scan regular expression flags. 1337 LiteralScope literal(this); 1338 while (kIsIdentifierPart.get(c0_)) { 1339 if (c0_ == '\\') { 1340 uc32 c = ScanIdentifierUnicodeEscape(); 1341 if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) { 1342 // We allow any escaped character, unlike the restriction on 1343 // IdentifierPart when it is used to build an IdentifierName. 1344 AddChar(c); 1345 continue; 1346 } 1347 } 1348 AddCharAdvance(); 1349 } 1350 literal.Complete(); 1351 1352 next_.location.end_pos = source_pos() - 1; 1353 return true; 1354} 1355 1356} } // namespace v8::internal 1357