scanner.cc revision 6ded16be15dd865a9b21ea304d5273c8be299c87
// Copyright 2006-2008 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 28#include "v8.h" 29 30#include "ast.h" 31#include "handles.h" 32#include "scanner.h" 33 34namespace v8 { 35namespace internal { 36 37// ---------------------------------------------------------------------------- 38// Character predicates 39 40 41unibrow::Predicate<IdentifierStart, 128> Scanner::kIsIdentifierStart; 42unibrow::Predicate<IdentifierPart, 128> Scanner::kIsIdentifierPart; 43unibrow::Predicate<unibrow::LineTerminator, 128> Scanner::kIsLineTerminator; 44unibrow::Predicate<unibrow::WhiteSpace, 128> Scanner::kIsWhiteSpace; 45 46 47StaticResource<Scanner::Utf8Decoder> Scanner::utf8_decoder_; 48 49 50// ---------------------------------------------------------------------------- 51// UTF8Buffer 52 53UTF8Buffer::UTF8Buffer() : data_(NULL), limit_(NULL) { } 54 55 56UTF8Buffer::~UTF8Buffer() { 57 if (data_ != NULL) DeleteArray(data_); 58} 59 60 61void UTF8Buffer::AddCharSlow(uc32 c) { 62 static const int kCapacityGrowthLimit = 1 * MB; 63 if (cursor_ > limit_) { 64 int old_capacity = Capacity(); 65 int old_position = pos(); 66 int new_capacity = 67 Min(old_capacity * 3, old_capacity + kCapacityGrowthLimit); 68 char* new_data = NewArray<char>(new_capacity); 69 memcpy(new_data, data_, old_position); 70 DeleteArray(data_); 71 data_ = new_data; 72 cursor_ = new_data + old_position; 73 limit_ = ComputeLimit(new_data, new_capacity); 74 ASSERT(Capacity() == new_capacity && pos() == old_position); 75 } 76 if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) { 77 *cursor_++ = c; // Common case: 7-bit ASCII. 
78 } else { 79 cursor_ += unibrow::Utf8::Encode(cursor_, c); 80 } 81 ASSERT(pos() <= Capacity()); 82} 83 84 85// ---------------------------------------------------------------------------- 86// UTF16Buffer 87 88 89UTF16Buffer::UTF16Buffer() 90 : pos_(0), end_(Scanner::kNoEndPosition) { } 91 92 93// CharacterStreamUTF16Buffer 94CharacterStreamUTF16Buffer::CharacterStreamUTF16Buffer() 95 : pushback_buffer_(0), last_(0), stream_(NULL) { } 96 97 98void CharacterStreamUTF16Buffer::Initialize(Handle<String> data, 99 unibrow::CharacterStream* input, 100 int start_position, 101 int end_position) { 102 stream_ = input; 103 if (start_position > 0) { 104 SeekForward(start_position); 105 } 106 end_ = end_position != Scanner::kNoEndPosition ? end_position : kMaxInt; 107} 108 109 110void CharacterStreamUTF16Buffer::PushBack(uc32 ch) { 111 pushback_buffer()->Add(last_); 112 last_ = ch; 113 pos_--; 114} 115 116 117uc32 CharacterStreamUTF16Buffer::Advance() { 118 ASSERT(end_ != Scanner::kNoEndPosition); 119 ASSERT(end_ >= 0); 120 // NOTE: It is of importance to Persian / Farsi resources that we do 121 // *not* strip format control characters in the scanner; see 122 // 123 // https://bugzilla.mozilla.org/show_bug.cgi?id=274152 124 // 125 // So, even though ECMA-262, section 7.1, page 11, dictates that we 126 // must remove Unicode format-control characters, we do not. This is 127 // in line with how IE and SpiderMonkey handles it. 128 if (!pushback_buffer()->is_empty()) { 129 pos_++; 130 return last_ = pushback_buffer()->RemoveLast(); 131 } else if (stream_->has_more() && pos_ < end_) { 132 pos_++; 133 uc32 next = stream_->GetNext(); 134 return last_ = next; 135 } else { 136 // Note: currently the following increment is necessary to avoid a 137 // test-parser problem! 
138 pos_++; 139 return last_ = static_cast<uc32>(-1); 140 } 141} 142 143 144void CharacterStreamUTF16Buffer::SeekForward(int pos) { 145 pos_ = pos; 146 ASSERT(pushback_buffer()->is_empty()); 147 stream_->Seek(pos); 148} 149 150 151// ExternalStringUTF16Buffer 152template <typename StringType, typename CharType> 153ExternalStringUTF16Buffer<StringType, CharType>::ExternalStringUTF16Buffer() 154 : raw_data_(NULL) { } 155 156 157template <typename StringType, typename CharType> 158void ExternalStringUTF16Buffer<StringType, CharType>::Initialize( 159 Handle<StringType> data, 160 int start_position, 161 int end_position) { 162 ASSERT(!data.is_null()); 163 raw_data_ = data->resource()->data(); 164 165 ASSERT(end_position <= data->length()); 166 if (start_position > 0) { 167 SeekForward(start_position); 168 } 169 end_ = 170 end_position != Scanner::kNoEndPosition ? end_position : data->length(); 171} 172 173 174template <typename StringType, typename CharType> 175uc32 ExternalStringUTF16Buffer<StringType, CharType>::Advance() { 176 if (pos_ < end_) { 177 return raw_data_[pos_++]; 178 } else { 179 // note: currently the following increment is necessary to avoid a 180 // test-parser problem! 
181 pos_++; 182 return static_cast<uc32>(-1); 183 } 184} 185 186 187template <typename StringType, typename CharType> 188void ExternalStringUTF16Buffer<StringType, CharType>::PushBack(uc32 ch) { 189 pos_--; 190 ASSERT(pos_ >= Scanner::kCharacterLookaheadBufferSize); 191 ASSERT(raw_data_[pos_ - Scanner::kCharacterLookaheadBufferSize] == ch); 192} 193 194 195template <typename StringType, typename CharType> 196void ExternalStringUTF16Buffer<StringType, CharType>::SeekForward(int pos) { 197 pos_ = pos; 198} 199 200 201// ---------------------------------------------------------------------------- 202// Keyword Matcher 203KeywordMatcher::FirstState KeywordMatcher::first_states_[] = { 204 { "break", KEYWORD_PREFIX, Token::BREAK }, 205 { NULL, C, Token::ILLEGAL }, 206 { NULL, D, Token::ILLEGAL }, 207 { "else", KEYWORD_PREFIX, Token::ELSE }, 208 { NULL, F, Token::ILLEGAL }, 209 { NULL, UNMATCHABLE, Token::ILLEGAL }, 210 { NULL, UNMATCHABLE, Token::ILLEGAL }, 211 { NULL, I, Token::ILLEGAL }, 212 { NULL, UNMATCHABLE, Token::ILLEGAL }, 213 { NULL, UNMATCHABLE, Token::ILLEGAL }, 214 { NULL, UNMATCHABLE, Token::ILLEGAL }, 215 { NULL, UNMATCHABLE, Token::ILLEGAL }, 216 { NULL, N, Token::ILLEGAL }, 217 { NULL, UNMATCHABLE, Token::ILLEGAL }, 218 { NULL, UNMATCHABLE, Token::ILLEGAL }, 219 { NULL, UNMATCHABLE, Token::ILLEGAL }, 220 { "return", KEYWORD_PREFIX, Token::RETURN }, 221 { "switch", KEYWORD_PREFIX, Token::SWITCH }, 222 { NULL, T, Token::ILLEGAL }, 223 { NULL, UNMATCHABLE, Token::ILLEGAL }, 224 { NULL, V, Token::ILLEGAL }, 225 { NULL, W, Token::ILLEGAL } 226}; 227 228 229void KeywordMatcher::Step(uc32 input) { 230 switch (state_) { 231 case INITIAL: { 232 // matching the first character is the only state with significant fanout. 233 // Match only lower-case letters in range 'b'..'w'. 
234 unsigned int offset = input - kFirstCharRangeMin; 235 if (offset < kFirstCharRangeLength) { 236 state_ = first_states_[offset].state; 237 if (state_ == KEYWORD_PREFIX) { 238 keyword_ = first_states_[offset].keyword; 239 counter_ = 1; 240 keyword_token_ = first_states_[offset].token; 241 } 242 return; 243 } 244 break; 245 } 246 case KEYWORD_PREFIX: 247 if (keyword_[counter_] == input) { 248 ASSERT_NE(input, '\0'); 249 counter_++; 250 if (keyword_[counter_] == '\0') { 251 state_ = KEYWORD_MATCHED; 252 token_ = keyword_token_; 253 } 254 return; 255 } 256 break; 257 case KEYWORD_MATCHED: 258 token_ = Token::IDENTIFIER; 259 break; 260 case C: 261 if (MatchState(input, 'a', CA)) return; 262 if (MatchState(input, 'o', CO)) return; 263 break; 264 case CA: 265 if (MatchKeywordStart(input, "case", 2, Token::CASE)) return; 266 if (MatchKeywordStart(input, "catch", 2, Token::CATCH)) return; 267 break; 268 case CO: 269 if (MatchState(input, 'n', CON)) return; 270 break; 271 case CON: 272 if (MatchKeywordStart(input, "const", 3, Token::CONST)) return; 273 if (MatchKeywordStart(input, "continue", 3, Token::CONTINUE)) return; 274 break; 275 case D: 276 if (MatchState(input, 'e', DE)) return; 277 if (MatchKeyword(input, 'o', KEYWORD_MATCHED, Token::DO)) return; 278 break; 279 case DE: 280 if (MatchKeywordStart(input, "debugger", 2, Token::DEBUGGER)) return; 281 if (MatchKeywordStart(input, "default", 2, Token::DEFAULT)) return; 282 if (MatchKeywordStart(input, "delete", 2, Token::DELETE)) return; 283 break; 284 case F: 285 if (MatchKeywordStart(input, "false", 1, Token::FALSE_LITERAL)) return; 286 if (MatchKeywordStart(input, "finally", 1, Token::FINALLY)) return; 287 if (MatchKeywordStart(input, "for", 1, Token::FOR)) return; 288 if (MatchKeywordStart(input, "function", 1, Token::FUNCTION)) return; 289 break; 290 case I: 291 if (MatchKeyword(input, 'f', KEYWORD_MATCHED, Token::IF)) return; 292 if (MatchKeyword(input, 'n', IN, Token::IN)) return; 293 break; 294 case IN: 295 
token_ = Token::IDENTIFIER; 296 if (MatchKeywordStart(input, "instanceof", 2, Token::INSTANCEOF)) { 297 return; 298 } 299 break; 300 case N: 301 if (MatchKeywordStart(input, "native", 1, Token::NATIVE)) return; 302 if (MatchKeywordStart(input, "new", 1, Token::NEW)) return; 303 if (MatchKeywordStart(input, "null", 1, Token::NULL_LITERAL)) return; 304 break; 305 case T: 306 if (MatchState(input, 'h', TH)) return; 307 if (MatchState(input, 'r', TR)) return; 308 if (MatchKeywordStart(input, "typeof", 1, Token::TYPEOF)) return; 309 break; 310 case TH: 311 if (MatchKeywordStart(input, "this", 2, Token::THIS)) return; 312 if (MatchKeywordStart(input, "throw", 2, Token::THROW)) return; 313 break; 314 case TR: 315 if (MatchKeywordStart(input, "true", 2, Token::TRUE_LITERAL)) return; 316 if (MatchKeyword(input, 'y', KEYWORD_MATCHED, Token::TRY)) return; 317 break; 318 case V: 319 if (MatchKeywordStart(input, "var", 1, Token::VAR)) return; 320 if (MatchKeywordStart(input, "void", 1, Token::VOID)) return; 321 break; 322 case W: 323 if (MatchKeywordStart(input, "while", 1, Token::WHILE)) return; 324 if (MatchKeywordStart(input, "with", 1, Token::WITH)) return; 325 break; 326 default: 327 UNREACHABLE(); 328 } 329 // On fallthrough, it's a failure. 
330 state_ = UNMATCHABLE; 331} 332 333 334// ---------------------------------------------------------------------------- 335// Scanner 336 337Scanner::Scanner(ParserMode pre) 338 : stack_overflow_(false), is_pre_parsing_(pre == PREPARSE) { } 339 340 341void Scanner::Initialize(Handle<String> source, 342 ParserLanguage language) { 343 safe_string_input_buffer_.Reset(source.location()); 344 Init(source, &safe_string_input_buffer_, 0, source->length(), language); 345} 346 347 348void Scanner::Initialize(Handle<String> source, 349 unibrow::CharacterStream* stream, 350 ParserLanguage language) { 351 Init(source, stream, 0, kNoEndPosition, language); 352} 353 354 355void Scanner::Initialize(Handle<String> source, 356 int start_position, 357 int end_position, 358 ParserLanguage language) { 359 safe_string_input_buffer_.Reset(source.location()); 360 Init(source, &safe_string_input_buffer_, 361 start_position, end_position, language); 362} 363 364 365void Scanner::Init(Handle<String> source, 366 unibrow::CharacterStream* stream, 367 int start_position, 368 int end_position, 369 ParserLanguage language) { 370 // Initialize the source buffer. 371 if (!source.is_null() && StringShape(*source).IsExternalTwoByte()) { 372 two_byte_string_buffer_.Initialize( 373 Handle<ExternalTwoByteString>::cast(source), 374 start_position, 375 end_position); 376 source_ = &two_byte_string_buffer_; 377 } else if (!source.is_null() && StringShape(*source).IsExternalAscii()) { 378 ascii_string_buffer_.Initialize( 379 Handle<ExternalAsciiString>::cast(source), 380 start_position, 381 end_position); 382 source_ = &ascii_string_buffer_; 383 } else { 384 char_stream_buffer_.Initialize(source, 385 stream, 386 start_position, 387 end_position); 388 source_ = &char_stream_buffer_; 389 } 390 391 is_parsing_json_ = (language == JSON); 392 393 // Set c0_ (one character ahead) 394 ASSERT(kCharacterLookaheadBufferSize == 1); 395 Advance(); 396 // Initializer current_ to not refer to a literal buffer. 
397 current_.literal_buffer = NULL; 398 399 // Skip initial whitespace allowing HTML comment ends just like 400 // after a newline and scan first token. 401 has_line_terminator_before_next_ = true; 402 SkipWhiteSpace(); 403 Scan(); 404} 405 406 407Token::Value Scanner::Next() { 408 // BUG 1215673: Find a thread safe way to set a stack limit in 409 // pre-parse mode. Otherwise, we cannot safely pre-parse from other 410 // threads. 411 current_ = next_; 412 // Check for stack-overflow before returning any tokens. 413 StackLimitCheck check; 414 if (check.HasOverflowed()) { 415 stack_overflow_ = true; 416 next_.token = Token::ILLEGAL; 417 } else { 418 Scan(); 419 } 420 return current_.token; 421} 422 423 424void Scanner::StartLiteral() { 425 // Use the first buffer unless it's currently in use by the current_ token. 426 // In most cases we won't have two literals/identifiers in a row, so 427 // the second buffer won't be used very often and is unlikely to grow much. 428 UTF8Buffer* free_buffer = 429 (current_.literal_buffer != &literal_buffer_1_) ? &literal_buffer_1_ 430 : &literal_buffer_2_; 431 next_.literal_buffer = free_buffer; 432 free_buffer->Reset(); 433} 434 435 436void Scanner::AddChar(uc32 c) { 437 next_.literal_buffer->AddChar(c); 438} 439 440 441void Scanner::TerminateLiteral() { 442 AddChar(0); 443} 444 445 446void Scanner::AddCharAdvance() { 447 AddChar(c0_); 448 Advance(); 449} 450 451 452static inline bool IsByteOrderMark(uc32 c) { 453 // The Unicode value U+FFFE is guaranteed never to be assigned as a 454 // Unicode character; this implies that in a Unicode context the 455 // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF 456 // character expressed in little-endian byte order (since it could 457 // not be a U+FFFE character expressed in big-endian byte 458 // order). Nevertheless, we check for it to be compatible with 459 // Spidermonkey. 
460 return c == 0xFEFF || c == 0xFFFE; 461} 462 463 464bool Scanner::SkipJsonWhiteSpace() { 465 int start_position = source_pos(); 466 // JSON WhiteSpace is tab, carrige-return, newline and space. 467 while (c0_ == ' ' || c0_ == '\n' || c0_ == '\r' || c0_ == '\t') { 468 Advance(); 469 } 470 return source_pos() != start_position; 471} 472 473 474bool Scanner::SkipJavaScriptWhiteSpace() { 475 int start_position = source_pos(); 476 477 while (true) { 478 // We treat byte-order marks (BOMs) as whitespace for better 479 // compatibility with Spidermonkey and other JavaScript engines. 480 while (kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) { 481 // IsWhiteSpace() includes line terminators! 482 if (kIsLineTerminator.get(c0_)) { 483 // Ignore line terminators, but remember them. This is necessary 484 // for automatic semicolon insertion. 485 has_line_terminator_before_next_ = true; 486 } 487 Advance(); 488 } 489 490 // If there is an HTML comment end '-->' at the beginning of a 491 // line (with only whitespace in front of it), we treat the rest 492 // of the line as a comment. This is in line with the way 493 // SpiderMonkey handles it. 494 if (c0_ == '-' && has_line_terminator_before_next_) { 495 Advance(); 496 if (c0_ == '-') { 497 Advance(); 498 if (c0_ == '>') { 499 // Treat the rest of the line as a comment. 500 SkipSingleLineComment(); 501 // Continue skipping white space after the comment. 502 continue; 503 } 504 PushBack('-'); // undo Advance() 505 } 506 PushBack('-'); // undo Advance() 507 } 508 // Return whether or not we skipped any characters. 
509 return source_pos() != start_position; 510 } 511} 512 513 514Token::Value Scanner::SkipSingleLineComment() { 515 Advance(); 516 517 // The line terminator at the end of the line is not considered 518 // to be part of the single-line comment; it is recognized 519 // separately by the lexical grammar and becomes part of the 520 // stream of input elements for the syntactic grammar (see 521 // ECMA-262, section 7.4, page 12). 522 while (c0_ >= 0 && !kIsLineTerminator.get(c0_)) { 523 Advance(); 524 } 525 526 return Token::WHITESPACE; 527} 528 529 530Token::Value Scanner::SkipMultiLineComment() { 531 ASSERT(c0_ == '*'); 532 Advance(); 533 534 while (c0_ >= 0) { 535 char ch = c0_; 536 Advance(); 537 // If we have reached the end of the multi-line comment, we 538 // consume the '/' and insert a whitespace. This way all 539 // multi-line comments are treated as whitespace - even the ones 540 // containing line terminators. This contradicts ECMA-262, section 541 // 7.4, page 12, that says that multi-line comments containing 542 // line terminators should be treated as a line terminator, but it 543 // matches the behaviour of SpiderMonkey and KJS. 544 if (ch == '*' && c0_ == '/') { 545 c0_ = ' '; 546 return Token::WHITESPACE; 547 } 548 } 549 550 // Unterminated multi-line comment. 551 return Token::ILLEGAL; 552} 553 554 555Token::Value Scanner::ScanHtmlComment() { 556 // Check for <!-- comments. 
557 ASSERT(c0_ == '!'); 558 Advance(); 559 if (c0_ == '-') { 560 Advance(); 561 if (c0_ == '-') return SkipSingleLineComment(); 562 PushBack('-'); // undo Advance() 563 } 564 PushBack('!'); // undo Advance() 565 ASSERT(c0_ == '!'); 566 return Token::LT; 567} 568 569 570 571void Scanner::ScanJson() { 572 next_.literal_buffer = NULL; 573 Token::Value token; 574 has_line_terminator_before_next_ = false; 575 do { 576 // Remember the position of the next token 577 next_.location.beg_pos = source_pos(); 578 switch (c0_) { 579 case '\t': 580 case '\r': 581 case '\n': 582 case ' ': 583 Advance(); 584 token = Token::WHITESPACE; 585 break; 586 case '{': 587 Advance(); 588 token = Token::LBRACE; 589 break; 590 case '}': 591 Advance(); 592 token = Token::RBRACE; 593 break; 594 case '[': 595 Advance(); 596 token = Token::LBRACK; 597 break; 598 case ']': 599 Advance(); 600 token = Token::RBRACK; 601 break; 602 case ':': 603 Advance(); 604 token = Token::COLON; 605 break; 606 case ',': 607 Advance(); 608 token = Token::COMMA; 609 break; 610 case '"': 611 token = ScanJsonString(); 612 break; 613 case '-': 614 case '0': 615 case '1': 616 case '2': 617 case '3': 618 case '4': 619 case '5': 620 case '6': 621 case '7': 622 case '8': 623 case '9': 624 token = ScanJsonNumber(); 625 break; 626 case 't': 627 token = ScanJsonIdentifier("true", Token::TRUE_LITERAL); 628 break; 629 case 'f': 630 token = ScanJsonIdentifier("false", Token::FALSE_LITERAL); 631 break; 632 case 'n': 633 token = ScanJsonIdentifier("null", Token::NULL_LITERAL); 634 break; 635 default: 636 if (c0_ < 0) { 637 Advance(); 638 token = Token::EOS; 639 } else { 640 Advance(); 641 token = Select(Token::ILLEGAL); 642 } 643 } 644 } while (token == Token::WHITESPACE); 645 646 next_.location.end_pos = source_pos(); 647 next_.token = token; 648} 649 650 651Token::Value Scanner::ScanJsonString() { 652 ASSERT_EQ('"', c0_); 653 Advance(); 654 StartLiteral(); 655 while (c0_ != '"' && c0_ > 0) { 656 // Check for control character 
(0x00-0x1f) or unterminated string (<0). 657 if (c0_ < 0x20) return Token::ILLEGAL; 658 if (c0_ != '\\') { 659 AddCharAdvance(); 660 } else { 661 Advance(); 662 switch (c0_) { 663 case '"': 664 case '\\': 665 case '/': 666 AddChar(c0_); 667 break; 668 case 'b': 669 AddChar('\x08'); 670 break; 671 case 'f': 672 AddChar('\x0c'); 673 break; 674 case 'n': 675 AddChar('\x0a'); 676 break; 677 case 'r': 678 AddChar('\x0d'); 679 break; 680 case 't': 681 AddChar('\x09'); 682 break; 683 case 'u': { 684 uc32 value = 0; 685 for (int i = 0; i < 4; i++) { 686 Advance(); 687 int digit = HexValue(c0_); 688 if (digit < 0) return Token::ILLEGAL; 689 value = value * 16 + digit; 690 } 691 AddChar(value); 692 break; 693 } 694 default: 695 return Token::ILLEGAL; 696 } 697 Advance(); 698 } 699 } 700 if (c0_ != '"') { 701 return Token::ILLEGAL; 702 } 703 TerminateLiteral(); 704 Advance(); 705 return Token::STRING; 706} 707 708 709Token::Value Scanner::ScanJsonNumber() { 710 StartLiteral(); 711 if (c0_ == '-') AddCharAdvance(); 712 if (c0_ == '0') { 713 AddCharAdvance(); 714 // Prefix zero is only allowed if it's the only digit before 715 // a decimal point or exponent. 
716 if ('0' <= c0_ && c0_ <= '9') return Token::ILLEGAL; 717 } else { 718 if (c0_ < '1' || c0_ > '9') return Token::ILLEGAL; 719 do { 720 AddCharAdvance(); 721 } while (c0_ >= '0' && c0_ <= '9'); 722 } 723 if (c0_ == '.') { 724 AddCharAdvance(); 725 if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL; 726 do { 727 AddCharAdvance(); 728 } while (c0_ >= '0' && c0_ <= '9'); 729 } 730 if ((c0_ | 0x20) == 'e') { 731 AddCharAdvance(); 732 if (c0_ == '-' || c0_ == '+') AddCharAdvance(); 733 if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL; 734 do { 735 AddCharAdvance(); 736 } while (c0_ >= '0' && c0_ <= '9'); 737 } 738 TerminateLiteral(); 739 return Token::NUMBER; 740} 741 742 743Token::Value Scanner::ScanJsonIdentifier(const char* text, 744 Token::Value token) { 745 StartLiteral(); 746 while (*text != '\0') { 747 if (c0_ != *text) return Token::ILLEGAL; 748 Advance(); 749 text++; 750 } 751 if (kIsIdentifierPart.get(c0_)) return Token::ILLEGAL; 752 TerminateLiteral(); 753 return token; 754} 755 756 757void Scanner::ScanJavaScript() { 758 next_.literal_buffer = NULL; 759 Token::Value token; 760 has_line_terminator_before_next_ = false; 761 do { 762 // Remember the position of the next token 763 next_.location.beg_pos = source_pos(); 764 765 switch (c0_) { 766 case ' ': 767 case '\t': 768 Advance(); 769 token = Token::WHITESPACE; 770 break; 771 772 case '\n': 773 Advance(); 774 has_line_terminator_before_next_ = true; 775 token = Token::WHITESPACE; 776 break; 777 778 case '"': case '\'': 779 token = ScanString(); 780 break; 781 782 case '<': 783 // < <= << <<= <!-- 784 Advance(); 785 if (c0_ == '=') { 786 token = Select(Token::LTE); 787 } else if (c0_ == '<') { 788 token = Select('=', Token::ASSIGN_SHL, Token::SHL); 789 } else if (c0_ == '!') { 790 token = ScanHtmlComment(); 791 } else { 792 token = Token::LT; 793 } 794 break; 795 796 case '>': 797 // > >= >> >>= >>> >>>= 798 Advance(); 799 if (c0_ == '=') { 800 token = Select(Token::GTE); 801 } else if (c0_ == '>') { 802 // 
>> >>= >>> >>>= 803 Advance(); 804 if (c0_ == '=') { 805 token = Select(Token::ASSIGN_SAR); 806 } else if (c0_ == '>') { 807 token = Select('=', Token::ASSIGN_SHR, Token::SHR); 808 } else { 809 token = Token::SAR; 810 } 811 } else { 812 token = Token::GT; 813 } 814 break; 815 816 case '=': 817 // = == === 818 Advance(); 819 if (c0_ == '=') { 820 token = Select('=', Token::EQ_STRICT, Token::EQ); 821 } else { 822 token = Token::ASSIGN; 823 } 824 break; 825 826 case '!': 827 // ! != !== 828 Advance(); 829 if (c0_ == '=') { 830 token = Select('=', Token::NE_STRICT, Token::NE); 831 } else { 832 token = Token::NOT; 833 } 834 break; 835 836 case '+': 837 // + ++ += 838 Advance(); 839 if (c0_ == '+') { 840 token = Select(Token::INC); 841 } else if (c0_ == '=') { 842 token = Select(Token::ASSIGN_ADD); 843 } else { 844 token = Token::ADD; 845 } 846 break; 847 848 case '-': 849 // - -- --> -= 850 Advance(); 851 if (c0_ == '-') { 852 Advance(); 853 if (c0_ == '>' && has_line_terminator_before_next_) { 854 // For compatibility with SpiderMonkey, we skip lines that 855 // start with an HTML comment end '-->'. 
856 token = SkipSingleLineComment(); 857 } else { 858 token = Token::DEC; 859 } 860 } else if (c0_ == '=') { 861 token = Select(Token::ASSIGN_SUB); 862 } else { 863 token = Token::SUB; 864 } 865 break; 866 867 case '*': 868 // * *= 869 token = Select('=', Token::ASSIGN_MUL, Token::MUL); 870 break; 871 872 case '%': 873 // % %= 874 token = Select('=', Token::ASSIGN_MOD, Token::MOD); 875 break; 876 877 case '/': 878 // / // /* /= 879 Advance(); 880 if (c0_ == '/') { 881 token = SkipSingleLineComment(); 882 } else if (c0_ == '*') { 883 token = SkipMultiLineComment(); 884 } else if (c0_ == '=') { 885 token = Select(Token::ASSIGN_DIV); 886 } else { 887 token = Token::DIV; 888 } 889 break; 890 891 case '&': 892 // & && &= 893 Advance(); 894 if (c0_ == '&') { 895 token = Select(Token::AND); 896 } else if (c0_ == '=') { 897 token = Select(Token::ASSIGN_BIT_AND); 898 } else { 899 token = Token::BIT_AND; 900 } 901 break; 902 903 case '|': 904 // | || |= 905 Advance(); 906 if (c0_ == '|') { 907 token = Select(Token::OR); 908 } else if (c0_ == '=') { 909 token = Select(Token::ASSIGN_BIT_OR); 910 } else { 911 token = Token::BIT_OR; 912 } 913 break; 914 915 case '^': 916 // ^ ^= 917 token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR); 918 break; 919 920 case '.': 921 // . 
Number 922 Advance(); 923 if (IsDecimalDigit(c0_)) { 924 token = ScanNumber(true); 925 } else { 926 token = Token::PERIOD; 927 } 928 break; 929 930 case ':': 931 token = Select(Token::COLON); 932 break; 933 934 case ';': 935 token = Select(Token::SEMICOLON); 936 break; 937 938 case ',': 939 token = Select(Token::COMMA); 940 break; 941 942 case '(': 943 token = Select(Token::LPAREN); 944 break; 945 946 case ')': 947 token = Select(Token::RPAREN); 948 break; 949 950 case '[': 951 token = Select(Token::LBRACK); 952 break; 953 954 case ']': 955 token = Select(Token::RBRACK); 956 break; 957 958 case '{': 959 token = Select(Token::LBRACE); 960 break; 961 962 case '}': 963 token = Select(Token::RBRACE); 964 break; 965 966 case '?': 967 token = Select(Token::CONDITIONAL); 968 break; 969 970 case '~': 971 token = Select(Token::BIT_NOT); 972 break; 973 974 default: 975 if (kIsIdentifierStart.get(c0_)) { 976 token = ScanIdentifier(); 977 } else if (IsDecimalDigit(c0_)) { 978 token = ScanNumber(false); 979 } else if (SkipWhiteSpace()) { 980 token = Token::WHITESPACE; 981 } else if (c0_ < 0) { 982 token = Token::EOS; 983 } else { 984 token = Select(Token::ILLEGAL); 985 } 986 break; 987 } 988 989 // Continue scanning for tokens as long as we're just skipping 990 // whitespace. 991 } while (token == Token::WHITESPACE); 992 993 next_.location.end_pos = source_pos(); 994 next_.token = token; 995} 996 997 998void Scanner::SeekForward(int pos) { 999 source_->SeekForward(pos - 1); 1000 Advance(); 1001 Scan(); 1002} 1003 1004 1005uc32 Scanner::ScanHexEscape(uc32 c, int length) { 1006 ASSERT(length <= 4); // prevent overflow 1007 1008 uc32 digits[4]; 1009 uc32 x = 0; 1010 for (int i = 0; i < length; i++) { 1011 digits[i] = c0_; 1012 int d = HexValue(c0_); 1013 if (d < 0) { 1014 // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes 1015 // should be illegal, but other JS VMs just return the 1016 // non-escaped version of the original character. 
1017 1018 // Push back digits read, except the last one (in c0_). 1019 for (int j = i-1; j >= 0; j--) { 1020 PushBack(digits[j]); 1021 } 1022 // Notice: No handling of error - treat it as "\u"->"u". 1023 return c; 1024 } 1025 x = x * 16 + d; 1026 Advance(); 1027 } 1028 1029 return x; 1030} 1031 1032 1033// Octal escapes of the forms '\0xx' and '\xxx' are not a part of 1034// ECMA-262. Other JS VMs support them. 1035uc32 Scanner::ScanOctalEscape(uc32 c, int length) { 1036 uc32 x = c - '0'; 1037 for (int i = 0; i < length; i++) { 1038 int d = c0_ - '0'; 1039 if (d < 0 || d > 7) break; 1040 int nx = x * 8 + d; 1041 if (nx >= 256) break; 1042 x = nx; 1043 Advance(); 1044 } 1045 return x; 1046} 1047 1048 1049void Scanner::ScanEscape() { 1050 uc32 c = c0_; 1051 Advance(); 1052 1053 // Skip escaped newlines. 1054 if (kIsLineTerminator.get(c)) { 1055 // Allow CR+LF newlines in multiline string literals. 1056 if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance(); 1057 // Allow LF+CR newlines in multiline string literals. 1058 if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance(); 1059 return; 1060 } 1061 1062 switch (c) { 1063 case '\'': // fall through 1064 case '"' : // fall through 1065 case '\\': break; 1066 case 'b' : c = '\b'; break; 1067 case 'f' : c = '\f'; break; 1068 case 'n' : c = '\n'; break; 1069 case 'r' : c = '\r'; break; 1070 case 't' : c = '\t'; break; 1071 case 'u' : c = ScanHexEscape(c, 4); break; 1072 case 'v' : c = '\v'; break; 1073 case 'x' : c = ScanHexEscape(c, 2); break; 1074 case '0' : // fall through 1075 case '1' : // fall through 1076 case '2' : // fall through 1077 case '3' : // fall through 1078 case '4' : // fall through 1079 case '5' : // fall through 1080 case '6' : // fall through 1081 case '7' : c = ScanOctalEscape(c, 2); break; 1082 } 1083 1084 // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these 1085 // should be illegal, but they are commonly handled 1086 // as non-escaped characters by JS VMs. 
1087 AddChar(c); 1088} 1089 1090 1091Token::Value Scanner::ScanString() { 1092 uc32 quote = c0_; 1093 Advance(); // consume quote 1094 1095 StartLiteral(); 1096 while (c0_ != quote && c0_ >= 0 && !kIsLineTerminator.get(c0_)) { 1097 uc32 c = c0_; 1098 Advance(); 1099 if (c == '\\') { 1100 if (c0_ < 0) return Token::ILLEGAL; 1101 ScanEscape(); 1102 } else { 1103 AddChar(c); 1104 } 1105 } 1106 if (c0_ != quote) { 1107 return Token::ILLEGAL; 1108 } 1109 TerminateLiteral(); 1110 1111 Advance(); // consume quote 1112 return Token::STRING; 1113} 1114 1115 1116Token::Value Scanner::Select(Token::Value tok) { 1117 Advance(); 1118 return tok; 1119} 1120 1121 1122Token::Value Scanner::Select(uc32 next, Token::Value then, Token::Value else_) { 1123 Advance(); 1124 if (c0_ == next) { 1125 Advance(); 1126 return then; 1127 } else { 1128 return else_; 1129 } 1130} 1131 1132 1133// Returns true if any decimal digits were scanned, returns false otherwise. 1134void Scanner::ScanDecimalDigits() { 1135 while (IsDecimalDigit(c0_)) 1136 AddCharAdvance(); 1137} 1138 1139 1140Token::Value Scanner::ScanNumber(bool seen_period) { 1141 ASSERT(IsDecimalDigit(c0_)); // the first digit of the number or the fraction 1142 1143 enum { DECIMAL, HEX, OCTAL } kind = DECIMAL; 1144 1145 StartLiteral(); 1146 if (seen_period) { 1147 // we have already seen a decimal point of the float 1148 AddChar('.'); 1149 ScanDecimalDigits(); // we know we have at least one digit 1150 1151 } else { 1152 // if the first character is '0' we must check for octals and hex 1153 if (c0_ == '0') { 1154 AddCharAdvance(); 1155 1156 // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number 1157 if (c0_ == 'x' || c0_ == 'X') { 1158 // hex number 1159 kind = HEX; 1160 AddCharAdvance(); 1161 if (!IsHexDigit(c0_)) 1162 // we must have at least one hex digit after 'x'/'X' 1163 return Token::ILLEGAL; 1164 while (IsHexDigit(c0_)) 1165 AddCharAdvance(); 1166 1167 } else if ('0' <= c0_ && c0_ <= '7') { 1168 // (possible) octal 
number 1169 kind = OCTAL; 1170 while (true) { 1171 if (c0_ == '8' || c0_ == '9') { 1172 kind = DECIMAL; 1173 break; 1174 } 1175 if (c0_ < '0' || '7' < c0_) break; 1176 AddCharAdvance(); 1177 } 1178 } 1179 } 1180 1181 // Parse decimal digits and allow trailing fractional part. 1182 if (kind == DECIMAL) { 1183 ScanDecimalDigits(); // optional 1184 if (c0_ == '.') { 1185 AddCharAdvance(); 1186 ScanDecimalDigits(); // optional 1187 } 1188 } 1189 } 1190 1191 // scan exponent, if any 1192 if (c0_ == 'e' || c0_ == 'E') { 1193 ASSERT(kind != HEX); // 'e'/'E' must be scanned as part of the hex number 1194 if (kind == OCTAL) return Token::ILLEGAL; // no exponent for octals allowed 1195 // scan exponent 1196 AddCharAdvance(); 1197 if (c0_ == '+' || c0_ == '-') 1198 AddCharAdvance(); 1199 if (!IsDecimalDigit(c0_)) 1200 // we must have at least one decimal digit after 'e'/'E' 1201 return Token::ILLEGAL; 1202 ScanDecimalDigits(); 1203 } 1204 TerminateLiteral(); 1205 1206 // The source character immediately following a numeric literal must 1207 // not be an identifier start or a decimal digit; see ECMA-262 1208 // section 7.8.3, page 17 (note that we read only one decimal digit 1209 // if the value is 0). 1210 if (IsDecimalDigit(c0_) || kIsIdentifierStart.get(c0_)) 1211 return Token::ILLEGAL; 1212 1213 return Token::NUMBER; 1214} 1215 1216 1217uc32 Scanner::ScanIdentifierUnicodeEscape() { 1218 Advance(); 1219 if (c0_ != 'u') return unibrow::Utf8::kBadChar; 1220 Advance(); 1221 uc32 c = ScanHexEscape('u', 4); 1222 // We do not allow a unicode escape sequence to start another 1223 // unicode escape sequence. 1224 if (c == '\\') return unibrow::Utf8::kBadChar; 1225 return c; 1226} 1227 1228 1229Token::Value Scanner::ScanIdentifier() { 1230 ASSERT(kIsIdentifierStart.get(c0_)); 1231 1232 StartLiteral(); 1233 KeywordMatcher keyword_match; 1234 1235 // Scan identifier start character. 
1236 if (c0_ == '\\') { 1237 uc32 c = ScanIdentifierUnicodeEscape(); 1238 // Only allow legal identifier start characters. 1239 if (!kIsIdentifierStart.get(c)) return Token::ILLEGAL; 1240 AddChar(c); 1241 keyword_match.Fail(); 1242 } else { 1243 AddChar(c0_); 1244 keyword_match.AddChar(c0_); 1245 Advance(); 1246 } 1247 1248 // Scan the rest of the identifier characters. 1249 while (kIsIdentifierPart.get(c0_)) { 1250 if (c0_ == '\\') { 1251 uc32 c = ScanIdentifierUnicodeEscape(); 1252 // Only allow legal identifier part characters. 1253 if (!kIsIdentifierPart.get(c)) return Token::ILLEGAL; 1254 AddChar(c); 1255 keyword_match.Fail(); 1256 } else { 1257 AddChar(c0_); 1258 keyword_match.AddChar(c0_); 1259 Advance(); 1260 } 1261 } 1262 TerminateLiteral(); 1263 1264 return keyword_match.token(); 1265} 1266 1267 1268 1269bool Scanner::IsIdentifier(unibrow::CharacterStream* buffer) { 1270 // Checks whether the buffer contains an identifier (no escape). 1271 if (!buffer->has_more()) return false; 1272 if (!kIsIdentifierStart.get(buffer->GetNext())) return false; 1273 while (buffer->has_more()) { 1274 if (!kIsIdentifierPart.get(buffer->GetNext())) return false; 1275 } 1276 return true; 1277} 1278 1279 1280bool Scanner::ScanRegExpPattern(bool seen_equal) { 1281 // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags 1282 bool in_character_class = false; 1283 1284 // Previous token is either '/' or '/=', in the second case, the 1285 // pattern starts at =. 1286 next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1); 1287 next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0); 1288 1289 // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5, 1290 // the scanner should pass uninterpreted bodies to the RegExp 1291 // constructor. 
1292 StartLiteral(); 1293 if (seen_equal) 1294 AddChar('='); 1295 1296 while (c0_ != '/' || in_character_class) { 1297 if (kIsLineTerminator.get(c0_) || c0_ < 0) 1298 return false; 1299 if (c0_ == '\\') { // escaped character 1300 AddCharAdvance(); 1301 if (kIsLineTerminator.get(c0_) || c0_ < 0) 1302 return false; 1303 AddCharAdvance(); 1304 } else { // unescaped character 1305 if (c0_ == '[') 1306 in_character_class = true; 1307 if (c0_ == ']') 1308 in_character_class = false; 1309 AddCharAdvance(); 1310 } 1311 } 1312 Advance(); // consume '/' 1313 1314 TerminateLiteral(); 1315 1316 return true; 1317} 1318 1319bool Scanner::ScanRegExpFlags() { 1320 // Scan regular expression flags. 1321 StartLiteral(); 1322 while (kIsIdentifierPart.get(c0_)) { 1323 if (c0_ == '\\') { 1324 uc32 c = ScanIdentifierUnicodeEscape(); 1325 if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) { 1326 // We allow any escaped character, unlike the restriction on 1327 // IdentifierPart when it is used to build an IdentifierName. 1328 AddChar(c); 1329 continue; 1330 } 1331 } 1332 AddCharAdvance(); 1333 } 1334 TerminateLiteral(); 1335 1336 next_.location.end_pos = source_pos() - 1; 1337 return true; 1338} 1339 1340} } // namespace v8::internal 1341