scanner.h revision 0d5e116f6aee03185f237311a943491bb079a768
// Copyright 2010 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 28#ifndef V8_SCANNER_H_ 29#define V8_SCANNER_H_ 30 31#include "token.h" 32#include "char-predicates-inl.h" 33 34namespace v8 { 35namespace internal { 36 37 38class UTF8Buffer { 39 public: 40 UTF8Buffer(); 41 ~UTF8Buffer(); 42 43 inline void AddChar(uc32 c) { 44 if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) { 45 buffer_.Add(static_cast<char>(c)); 46 } else { 47 AddCharSlow(c); 48 } 49 } 50 51 void StartLiteral() { 52 buffer_.StartSequence(); 53 } 54 55 Vector<const char> EndLiteral() { 56 buffer_.Add(kEndMarker); 57 Vector<char> sequence = buffer_.EndSequence(); 58 return Vector<const char>(sequence.start(), sequence.length()); 59 } 60 61 void DropLiteral() { 62 buffer_.DropSequence(); 63 } 64 65 void Reset() { 66 buffer_.Reset(); 67 } 68 69 // The end marker added after a parsed literal. 70 // Using zero allows the usage of strlen and similar functions on 71 // identifiers and numbers (but not strings, since they may contain zero 72 // bytes). 73 // TODO(lrn): Use '\xff' as end marker, since it cannot occur inside 74 // an utf-8 string. This requires changes in all places that uses 75 // str-functions on the literals, but allows a single pointer to represent 76 // the literal, even if it contains embedded zeros. 77 static const char kEndMarker = '\x00'; 78 private: 79 static const int kInitialCapacity = 256; 80 SequenceCollector<char, 4> buffer_; 81 82 void AddCharSlow(uc32 c); 83}; 84 85 86// Interface through which the scanner reads characters from the input source. 87class UTF16Buffer { 88 public: 89 UTF16Buffer(); 90 virtual ~UTF16Buffer() {} 91 92 virtual void PushBack(uc32 ch) = 0; 93 // Returns a value < 0 when the buffer end is reached. 94 virtual uc32 Advance() = 0; 95 virtual void SeekForward(int pos) = 0; 96 97 int pos() const { return pos_; } 98 99 protected: 100 int pos_; // Current position in the buffer. 101 int end_; // Position where scanning should stop (EOF). 
102}; 103 104 105// UTF16 buffer to read characters from a character stream. 106class CharacterStreamUTF16Buffer: public UTF16Buffer { 107 public: 108 CharacterStreamUTF16Buffer(); 109 virtual ~CharacterStreamUTF16Buffer() {} 110 void Initialize(Handle<String> data, 111 unibrow::CharacterStream* stream, 112 int start_position, 113 int end_position); 114 virtual void PushBack(uc32 ch); 115 virtual uc32 Advance(); 116 virtual void SeekForward(int pos); 117 118 private: 119 List<uc32> pushback_buffer_; 120 uc32 last_; 121 unibrow::CharacterStream* stream_; 122 123 List<uc32>* pushback_buffer() { return &pushback_buffer_; } 124}; 125 126 127// UTF16 buffer to read characters from an external string. 128template <typename StringType, typename CharType> 129class ExternalStringUTF16Buffer: public UTF16Buffer { 130 public: 131 ExternalStringUTF16Buffer(); 132 virtual ~ExternalStringUTF16Buffer() {} 133 void Initialize(Handle<StringType> data, 134 int start_position, 135 int end_position); 136 virtual void PushBack(uc32 ch); 137 virtual uc32 Advance(); 138 virtual void SeekForward(int pos); 139 140 private: 141 const CharType* raw_data_; // Pointer to the actual array of characters. 142}; 143 144 145class KeywordMatcher { 146// Incrementally recognize keywords. 147// 148// Recognized keywords: 149// break case catch const* continue debugger* default delete do else 150// finally false for function if in instanceof native* new null 151// return switch this throw true try typeof var void while with 152// 153// *: Actually "future reserved keywords". These are the only ones we 154// recognized, the remaining are allowed as identifiers. 
155 public: 156 KeywordMatcher() 157 : state_(INITIAL), 158 token_(Token::IDENTIFIER), 159 keyword_(NULL), 160 counter_(0), 161 keyword_token_(Token::ILLEGAL) {} 162 163 Token::Value token() { return token_; } 164 165 inline void AddChar(uc32 input) { 166 if (state_ != UNMATCHABLE) { 167 Step(input); 168 } 169 } 170 171 void Fail() { 172 token_ = Token::IDENTIFIER; 173 state_ = UNMATCHABLE; 174 } 175 176 private: 177 enum State { 178 UNMATCHABLE, 179 INITIAL, 180 KEYWORD_PREFIX, 181 KEYWORD_MATCHED, 182 C, 183 CA, 184 CO, 185 CON, 186 D, 187 DE, 188 F, 189 I, 190 IN, 191 N, 192 T, 193 TH, 194 TR, 195 V, 196 W 197 }; 198 199 struct FirstState { 200 const char* keyword; 201 State state; 202 Token::Value token; 203 }; 204 205 // Range of possible first characters of a keyword. 206 static const unsigned int kFirstCharRangeMin = 'b'; 207 static const unsigned int kFirstCharRangeMax = 'w'; 208 static const unsigned int kFirstCharRangeLength = 209 kFirstCharRangeMax - kFirstCharRangeMin + 1; 210 // State map for first keyword character range. 211 static FirstState first_states_[kFirstCharRangeLength]; 212 213 // If input equals keyword's character at position, continue matching keyword 214 // from that position. 215 inline bool MatchKeywordStart(uc32 input, 216 const char* keyword, 217 int position, 218 Token::Value token_if_match) { 219 if (input == keyword[position]) { 220 state_ = KEYWORD_PREFIX; 221 this->keyword_ = keyword; 222 this->counter_ = position + 1; 223 this->keyword_token_ = token_if_match; 224 return true; 225 } 226 return false; 227 } 228 229 // If input equals match character, transition to new state and return true. 
230 inline bool MatchState(uc32 input, char match, State new_state) { 231 if (input == match) { 232 state_ = new_state; 233 return true; 234 } 235 return false; 236 } 237 238 inline bool MatchKeyword(uc32 input, 239 char match, 240 State new_state, 241 Token::Value keyword_token) { 242 if (input != match) { 243 return false; 244 } 245 state_ = new_state; 246 token_ = keyword_token; 247 return true; 248 } 249 250 void Step(uc32 input); 251 252 // Current state. 253 State state_; 254 // Token for currently added characters. 255 Token::Value token_; 256 257 // Matching a specific keyword string (there is only one possible valid 258 // keyword with the current prefix). 259 const char* keyword_; 260 int counter_; 261 Token::Value keyword_token_; 262}; 263 264 265enum ParserMode { PARSE, PREPARSE }; 266enum ParserLanguage { JAVASCRIPT, JSON }; 267 268 269class Scanner { 270 public: 271 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; 272 273 class LiteralScope { 274 public: 275 explicit LiteralScope(Scanner* self); 276 ~LiteralScope(); 277 void Complete(); 278 279 private: 280 Scanner* scanner_; 281 bool complete_; 282 }; 283 284 Scanner(); 285 286 // Initialize the Scanner to scan source. 287 void Initialize(Handle<String> source, 288 ParserLanguage language); 289 void Initialize(Handle<String> source, 290 unibrow::CharacterStream* stream, 291 ParserLanguage language); 292 void Initialize(Handle<String> source, 293 int start_position, int end_position, 294 ParserLanguage language); 295 296 // Returns the next token. 297 Token::Value Next(); 298 299 // One token look-ahead (past the token returned by Next()). 300 Token::Value peek() const { return next_.token; } 301 302 // Returns true if there was a line terminator before the peek'ed token. 
303 bool has_line_terminator_before_next() const { 304 return has_line_terminator_before_next_; 305 } 306 307 struct Location { 308 Location(int b, int e) : beg_pos(b), end_pos(e) { } 309 Location() : beg_pos(0), end_pos(0) { } 310 int beg_pos; 311 int end_pos; 312 }; 313 314 // Returns the location information for the current token 315 // (the token returned by Next()). 316 Location location() const { return current_.location; } 317 Location peek_location() const { return next_.location; } 318 319 // Returns the literal string, if any, for the current token (the 320 // token returned by Next()). The string is 0-terminated and in 321 // UTF-8 format; they may contain 0-characters. Literal strings are 322 // collected for identifiers, strings, and numbers. 323 // These functions only give the correct result if the literal 324 // was scanned between calls to StartLiteral() and TerminateLiteral(). 325 const char* literal_string() const { 326 return current_.literal_chars.start(); 327 } 328 329 int literal_length() const { 330 // Excluding terminal '\x00' added by TerminateLiteral(). 331 return current_.literal_chars.length() - 1; 332 } 333 334 Vector<const char> literal() const { 335 return Vector<const char>(literal_string(), literal_length()); 336 } 337 338 // Returns the literal string for the next token (the token that 339 // would be returned if Next() were called). 340 const char* next_literal_string() const { 341 return next_.literal_chars.start(); 342 } 343 344 345 // Returns the length of the next token (that would be returned if 346 // Next() were called). 347 int next_literal_length() const { 348 // Excluding terminal '\x00' added by TerminateLiteral(). 349 return next_.literal_chars.length() - 1; 350 } 351 352 Vector<const char> next_literal() const { 353 return Vector<const char>(next_literal_string(), next_literal_length()); 354 } 355 356 // Scans the input as a regular expression pattern, previous 357 // character(s) must be /(=). 
Returns true if a pattern is scanned. 358 bool ScanRegExpPattern(bool seen_equal); 359 // Returns true if regexp flags are scanned (always since flags can 360 // be empty). 361 bool ScanRegExpFlags(); 362 363 // Seek forward to the given position. This operation does not 364 // work in general, for instance when there are pushed back 365 // characters, but works for seeking forward until simple delimiter 366 // tokens, which is what it is used for. 367 void SeekForward(int pos); 368 369 bool stack_overflow() { return stack_overflow_; } 370 371 static StaticResource<Utf8Decoder>* utf8_decoder() { return &utf8_decoder_; } 372 373 // Tells whether the buffer contains an identifier (no escapes). 374 // Used for checking if a property name is an identifier. 375 static bool IsIdentifier(unibrow::CharacterStream* buffer); 376 377 static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart; 378 static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart; 379 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; 380 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace; 381 382 static const int kCharacterLookaheadBufferSize = 1; 383 static const int kNoEndPosition = 1; 384 385 private: 386 // The current and look-ahead token. 387 struct TokenDesc { 388 Token::Value token; 389 Location location; 390 Vector<const char> literal_chars; 391 }; 392 393 void Init(Handle<String> source, 394 unibrow::CharacterStream* stream, 395 int start_position, int end_position, 396 ParserLanguage language); 397 398 // Literal buffer support 399 inline void StartLiteral(); 400 inline void AddChar(uc32 ch); 401 inline void AddCharAdvance(); 402 inline void TerminateLiteral(); 403 // Stops scanning of a literal, e.g., due to an encountered error. 404 inline void DropLiteral(); 405 406 // Low-level scanning support. 
407 void Advance() { c0_ = source_->Advance(); } 408 void PushBack(uc32 ch) { 409 source_->PushBack(ch); 410 c0_ = ch; 411 } 412 413 bool SkipWhiteSpace() { 414 if (is_parsing_json_) { 415 return SkipJsonWhiteSpace(); 416 } else { 417 return SkipJavaScriptWhiteSpace(); 418 } 419 } 420 421 bool SkipJavaScriptWhiteSpace(); 422 bool SkipJsonWhiteSpace(); 423 Token::Value SkipSingleLineComment(); 424 Token::Value SkipMultiLineComment(); 425 426 inline Token::Value Select(Token::Value tok); 427 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_); 428 429 inline void Scan() { 430 if (is_parsing_json_) { 431 ScanJson(); 432 } else { 433 ScanJavaScript(); 434 } 435 } 436 437 // Scans a single JavaScript token. 438 void ScanJavaScript(); 439 440 // Scan a single JSON token. The JSON lexical grammar is specified in the 441 // ECMAScript 5 standard, section 15.12.1.1. 442 // Recognizes all of the single-character tokens directly, or calls a function 443 // to scan a number, string or identifier literal. 444 // The only allowed whitespace characters between tokens are tab, 445 // carrige-return, newline and space. 446 void ScanJson(); 447 448 // A JSON number (production JSONNumber) is a subset of the valid JavaScript 449 // decimal number literals. 450 // It includes an optional minus sign, must have at least one 451 // digit before and after a decimal point, may not have prefixed zeros (unless 452 // the integer part is zero), and may include an exponent part (e.g., "e-10"). 453 // Hexadecimal and octal numbers are not allowed. 454 Token::Value ScanJsonNumber(); 455 456 // A JSON string (production JSONString) is subset of valid JavaScript string 457 // literals. The string must only be double-quoted (not single-quoted), and 458 // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and 459 // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid. 
460 Token::Value ScanJsonString(); 461 462 // Used to recognizes one of the literals "true", "false", or "null". These 463 // are the only valid JSON identifiers (productions JSONBooleanLiteral, 464 // JSONNullLiteral). 465 Token::Value ScanJsonIdentifier(const char* text, Token::Value token); 466 467 void ScanDecimalDigits(); 468 Token::Value ScanNumber(bool seen_period); 469 Token::Value ScanIdentifier(); 470 uc32 ScanHexEscape(uc32 c, int length); 471 uc32 ScanOctalEscape(uc32 c, int length); 472 void ScanEscape(); 473 Token::Value ScanString(); 474 475 // Scans a possible HTML comment -- begins with '<!'. 476 Token::Value ScanHtmlComment(); 477 478 // Return the current source position. 479 int source_pos() { 480 return source_->pos() - kCharacterLookaheadBufferSize; 481 } 482 483 // Decodes a unicode escape-sequence which is part of an identifier. 484 // If the escape sequence cannot be decoded the result is kBadRune. 485 uc32 ScanIdentifierUnicodeEscape(); 486 487 TokenDesc current_; // desc for current token (as returned by Next()) 488 TokenDesc next_; // desc for next token (one token look-ahead) 489 bool has_line_terminator_before_next_; 490 bool is_parsing_json_; 491 492 // Different UTF16 buffers used to pull characters from. Based on input one of 493 // these will be initialized as the actual data source. 494 CharacterStreamUTF16Buffer char_stream_buffer_; 495 ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t> 496 two_byte_string_buffer_; 497 ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_; 498 499 // Source. Will point to one of the buffers declared above. 500 UTF16Buffer* source_; 501 502 // Used to convert the source string into a character stream when a stream 503 // is not passed to the scanner. 504 SafeStringInputBuffer safe_string_input_buffer_; 505 506 // Buffer to hold literal values (identifiers, strings, numbers) 507 // using '\x00'-terminated UTF-8 encoding. Handles allocation internally. 
508 UTF8Buffer literal_buffer_; 509 510 bool stack_overflow_; 511 static StaticResource<Utf8Decoder> utf8_decoder_; 512 513 // One Unicode character look-ahead; c0_ < 0 at the end of the input. 514 uc32 c0_; 515}; 516 517} } // namespace v8::internal 518 519#endif // V8_SCANNER_H_ 520