scanner.h revision 6ded16be15dd865a9b21ea304d5273c8be299c87
// Copyright 2006-2008 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 28#ifndef V8_SCANNER_H_ 29#define V8_SCANNER_H_ 30 31#include "token.h" 32#include "char-predicates-inl.h" 33 34namespace v8 { 35namespace internal { 36 37 38class UTF8Buffer { 39 public: 40 UTF8Buffer(); 41 ~UTF8Buffer(); 42 43 void AddChar(uc32 c) { 44 ASSERT_NOT_NULL(data_); 45 if (cursor_ <= limit_ && 46 static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) { 47 *cursor_++ = static_cast<char>(c); 48 } else { 49 AddCharSlow(c); 50 } 51 } 52 53 void Reset() { 54 if (data_ == NULL) { 55 data_ = NewArray<char>(kInitialCapacity); 56 limit_ = ComputeLimit(data_, kInitialCapacity); 57 } 58 cursor_ = data_; 59 } 60 61 int pos() const { 62 ASSERT_NOT_NULL(data_); 63 return static_cast<int>(cursor_ - data_); 64 } 65 66 char* data() const { return data_; } 67 68 private: 69 static const int kInitialCapacity = 256; 70 char* data_; 71 char* cursor_; 72 char* limit_; 73 74 int Capacity() const { 75 ASSERT_NOT_NULL(data_); 76 return static_cast<int>(limit_ - data_) + unibrow::Utf8::kMaxEncodedSize; 77 } 78 79 static char* ComputeLimit(char* data, int capacity) { 80 return (data + capacity) - unibrow::Utf8::kMaxEncodedSize; 81 } 82 83 void AddCharSlow(uc32 c); 84}; 85 86 87// Interface through which the scanner reads characters from the input source. 88class UTF16Buffer { 89 public: 90 UTF16Buffer(); 91 virtual ~UTF16Buffer() {} 92 93 virtual void PushBack(uc32 ch) = 0; 94 // Returns a value < 0 when the buffer end is reached. 95 virtual uc32 Advance() = 0; 96 virtual void SeekForward(int pos) = 0; 97 98 int pos() const { return pos_; } 99 100 protected: 101 int pos_; // Current position in the buffer. 102 int end_; // Position where scanning should stop (EOF). 103}; 104 105 106// UTF16 buffer to read characters from a character stream. 
107class CharacterStreamUTF16Buffer: public UTF16Buffer { 108 public: 109 CharacterStreamUTF16Buffer(); 110 virtual ~CharacterStreamUTF16Buffer() {} 111 void Initialize(Handle<String> data, 112 unibrow::CharacterStream* stream, 113 int start_position, 114 int end_position); 115 virtual void PushBack(uc32 ch); 116 virtual uc32 Advance(); 117 virtual void SeekForward(int pos); 118 119 private: 120 List<uc32> pushback_buffer_; 121 uc32 last_; 122 unibrow::CharacterStream* stream_; 123 124 List<uc32>* pushback_buffer() { return &pushback_buffer_; } 125}; 126 127 128// UTF16 buffer to read characters from an external string. 129template <typename StringType, typename CharType> 130class ExternalStringUTF16Buffer: public UTF16Buffer { 131 public: 132 ExternalStringUTF16Buffer(); 133 virtual ~ExternalStringUTF16Buffer() {} 134 void Initialize(Handle<StringType> data, 135 int start_position, 136 int end_position); 137 virtual void PushBack(uc32 ch); 138 virtual uc32 Advance(); 139 virtual void SeekForward(int pos); 140 141 private: 142 const CharType* raw_data_; // Pointer to the actual array of characters. 143}; 144 145 146class KeywordMatcher { 147// Incrementally recognize keywords. 148// 149// Recognized keywords: 150// break case catch const* continue debugger* default delete do else 151// finally false for function if in instanceof native* new null 152// return switch this throw true try typeof var void while with 153// 154// *: Actually "future reserved keywords". These are the only ones we 155// recognized, the remaining are allowed as identifiers. 
156 public: 157 KeywordMatcher() : state_(INITIAL), token_(Token::IDENTIFIER) {} 158 159 Token::Value token() { return token_; } 160 161 inline void AddChar(uc32 input) { 162 if (state_ != UNMATCHABLE) { 163 Step(input); 164 } 165 } 166 167 void Fail() { 168 token_ = Token::IDENTIFIER; 169 state_ = UNMATCHABLE; 170 } 171 172 private: 173 enum State { 174 UNMATCHABLE, 175 INITIAL, 176 KEYWORD_PREFIX, 177 KEYWORD_MATCHED, 178 C, 179 CA, 180 CO, 181 CON, 182 D, 183 DE, 184 F, 185 I, 186 IN, 187 N, 188 T, 189 TH, 190 TR, 191 V, 192 W 193 }; 194 195 struct FirstState { 196 const char* keyword; 197 State state; 198 Token::Value token; 199 }; 200 201 // Range of possible first characters of a keyword. 202 static const unsigned int kFirstCharRangeMin = 'b'; 203 static const unsigned int kFirstCharRangeMax = 'w'; 204 static const unsigned int kFirstCharRangeLength = 205 kFirstCharRangeMax - kFirstCharRangeMin + 1; 206 // State map for first keyword character range. 207 static FirstState first_states_[kFirstCharRangeLength]; 208 209 // Current state. 210 State state_; 211 // Token for currently added characters. 212 Token::Value token_; 213 214 // Matching a specific keyword string (there is only one possible valid 215 // keyword with the current prefix). 216 const char* keyword_; 217 int counter_; 218 Token::Value keyword_token_; 219 220 // If input equals keyword's character at position, continue matching keyword 221 // from that position. 222 inline bool MatchKeywordStart(uc32 input, 223 const char* keyword, 224 int position, 225 Token::Value token_if_match) { 226 if (input == keyword[position]) { 227 state_ = KEYWORD_PREFIX; 228 this->keyword_ = keyword; 229 this->counter_ = position + 1; 230 this->keyword_token_ = token_if_match; 231 return true; 232 } 233 return false; 234 } 235 236 // If input equals match character, transition to new state and return true. 
237 inline bool MatchState(uc32 input, char match, State new_state) { 238 if (input == match) { 239 state_ = new_state; 240 return true; 241 } 242 return false; 243 } 244 245 inline bool MatchKeyword(uc32 input, 246 char match, 247 State new_state, 248 Token::Value keyword_token) { 249 if (input == match) { // Matched "do". 250 state_ = new_state; 251 token_ = keyword_token; 252 return true; 253 } 254 return false; 255 } 256 257 void Step(uc32 input); 258}; 259 260 261enum ParserMode { PARSE, PREPARSE }; 262enum ParserLanguage { JAVASCRIPT, JSON }; 263 264 265class Scanner { 266 public: 267 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; 268 269 // Construction 270 explicit Scanner(ParserMode parse_mode); 271 272 // Initialize the Scanner to scan source. 273 void Initialize(Handle<String> source, 274 ParserLanguage language); 275 void Initialize(Handle<String> source, 276 unibrow::CharacterStream* stream, 277 ParserLanguage language); 278 void Initialize(Handle<String> source, 279 int start_position, int end_position, 280 ParserLanguage language); 281 282 // Returns the next token. 283 Token::Value Next(); 284 285 // One token look-ahead (past the token returned by Next()). 286 Token::Value peek() const { return next_.token; } 287 288 // Returns true if there was a line terminator before the peek'ed token. 289 bool has_line_terminator_before_next() const { 290 return has_line_terminator_before_next_; 291 } 292 293 struct Location { 294 Location(int b, int e) : beg_pos(b), end_pos(e) { } 295 Location() : beg_pos(0), end_pos(0) { } 296 int beg_pos; 297 int end_pos; 298 }; 299 300 // Returns the location information for the current token 301 // (the token returned by Next()). 302 Location location() const { return current_.location; } 303 Location peek_location() const { return next_.location; } 304 305 // Returns the literal string, if any, for the current token (the 306 // token returned by Next()). 
The string is 0-terminated and in 307 // UTF-8 format; they may contain 0-characters. Literal strings are 308 // collected for identifiers, strings, and numbers. 309 // These functions only give the correct result if the literal 310 // was scanned between calls to StartLiteral() and TerminateLiteral(). 311 const char* literal_string() const { 312 return current_.literal_buffer->data(); 313 } 314 int literal_length() const { 315 // Excluding terminal '\0' added by TerminateLiteral(). 316 return current_.literal_buffer->pos() - 1; 317 } 318 319 // Returns the literal string for the next token (the token that 320 // would be returned if Next() were called). 321 const char* next_literal_string() const { 322 return next_.literal_buffer->data(); 323 } 324 // Returns the length of the next token (that would be returned if 325 // Next() were called). 326 int next_literal_length() const { 327 return next_.literal_buffer->pos() - 1; 328 } 329 330 Vector<const char> next_literal() const { 331 return Vector<const char>(next_literal_string(), 332 next_literal_length()); 333 } 334 335 // Scans the input as a regular expression pattern, previous 336 // character(s) must be /(=). Returns true if a pattern is scanned. 337 bool ScanRegExpPattern(bool seen_equal); 338 // Returns true if regexp flags are scanned (always since flags can 339 // be empty). 340 bool ScanRegExpFlags(); 341 342 // Seek forward to the given position. This operation does not 343 // work in general, for instance when there are pushed back 344 // characters, but works for seeking forward until simple delimiter 345 // tokens, which is what it is used for. 346 void SeekForward(int pos); 347 348 bool stack_overflow() { return stack_overflow_; } 349 350 static StaticResource<Utf8Decoder>* utf8_decoder() { return &utf8_decoder_; } 351 352 // Tells whether the buffer contains an identifier (no escapes). 353 // Used for checking if a property name is an identifier. 
354 static bool IsIdentifier(unibrow::CharacterStream* buffer); 355 356 static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart; 357 static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart; 358 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; 359 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace; 360 361 static const int kCharacterLookaheadBufferSize = 1; 362 static const int kNoEndPosition = 1; 363 364 private: 365 void Init(Handle<String> source, 366 unibrow::CharacterStream* stream, 367 int start_position, int end_position, 368 ParserLanguage language); 369 370 371 // Different UTF16 buffers used to pull characters from. Based on input one of 372 // these will be initialized as the actual data source. 373 CharacterStreamUTF16Buffer char_stream_buffer_; 374 ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t> 375 two_byte_string_buffer_; 376 ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_; 377 378 // Source. Will point to one of the buffers declared above. 379 UTF16Buffer* source_; 380 381 // Used to convert the source string into a character stream when a stream 382 // is not passed to the scanner. 383 SafeStringInputBuffer safe_string_input_buffer_; 384 385 // Buffer to hold literal values (identifiers, strings, numbers) 386 // using 0-terminated UTF-8 encoding. 387 UTF8Buffer literal_buffer_1_; 388 UTF8Buffer literal_buffer_2_; 389 390 bool stack_overflow_; 391 static StaticResource<Utf8Decoder> utf8_decoder_; 392 393 // One Unicode character look-ahead; c0_ < 0 at the end of the input. 394 uc32 c0_; 395 396 // The current and look-ahead token. 
397 struct TokenDesc { 398 Token::Value token; 399 Location location; 400 UTF8Buffer* literal_buffer; 401 }; 402 403 TokenDesc current_; // desc for current token (as returned by Next()) 404 TokenDesc next_; // desc for next token (one token look-ahead) 405 bool has_line_terminator_before_next_; 406 bool is_pre_parsing_; 407 bool is_parsing_json_; 408 409 // Literal buffer support 410 void StartLiteral(); 411 void AddChar(uc32 ch); 412 void AddCharAdvance(); 413 void TerminateLiteral(); 414 415 // Low-level scanning support. 416 void Advance() { c0_ = source_->Advance(); } 417 void PushBack(uc32 ch) { 418 source_->PushBack(ch); 419 c0_ = ch; 420 } 421 422 bool SkipWhiteSpace() { 423 if (is_parsing_json_) { 424 return SkipJsonWhiteSpace(); 425 } else { 426 return SkipJavaScriptWhiteSpace(); 427 } 428 } 429 bool SkipJavaScriptWhiteSpace(); 430 bool SkipJsonWhiteSpace(); 431 Token::Value SkipSingleLineComment(); 432 Token::Value SkipMultiLineComment(); 433 434 inline Token::Value Select(Token::Value tok); 435 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_); 436 437 inline void Scan() { 438 if (is_parsing_json_) { 439 ScanJson(); 440 } else { 441 ScanJavaScript(); 442 } 443 } 444 445 // Scans a single JavaScript token. 446 void ScanJavaScript(); 447 448 // Scan a single JSON token. The JSON lexical grammar is specified in the 449 // ECMAScript 5 standard, section 15.12.1.1. 450 // Recognizes all of the single-character tokens directly, or calls a function 451 // to scan a number, string or identifier literal. 452 // The only allowed whitespace characters between tokens are tab, 453 // carrige-return, newline and space. 454 void ScanJson(); 455 456 // A JSON number (production JSONNumber) is a subset of the valid JavaScript 457 // decimal number literals. 
458 // It includes an optional minus sign, must have at least one 459 // digit before and after a decimal point, may not have prefixed zeros (unless 460 // the integer part is zero), and may include an exponent part (e.g., "e-10"). 461 // Hexadecimal and octal numbers are not allowed. 462 Token::Value ScanJsonNumber(); 463 // A JSON string (production JSONString) is subset of valid JavaScript string 464 // literals. The string must only be double-quoted (not single-quoted), and 465 // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and 466 // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid. 467 Token::Value ScanJsonString(); 468 // Used to recognizes one of the literals "true", "false", or "null". These 469 // are the only valid JSON identifiers (productions JSONBooleanLiteral, 470 // JSONNullLiteral). 471 Token::Value ScanJsonIdentifier(const char* text, Token::Value token); 472 473 void ScanDecimalDigits(); 474 Token::Value ScanNumber(bool seen_period); 475 Token::Value ScanIdentifier(); 476 uc32 ScanHexEscape(uc32 c, int length); 477 uc32 ScanOctalEscape(uc32 c, int length); 478 void ScanEscape(); 479 Token::Value ScanString(); 480 481 // Scans a possible HTML comment -- begins with '<!'. 482 Token::Value ScanHtmlComment(); 483 484 // Return the current source position. 485 int source_pos() { 486 return source_->pos() - kCharacterLookaheadBufferSize; 487 } 488 489 // Decodes a unicode escape-sequence which is part of an identifier. 490 // If the escape sequence cannot be decoded the result is kBadRune. 491 uc32 ScanIdentifierUnicodeEscape(); 492}; 493 494} } // namespace v8::internal 495 496#endif // V8_SCANNER_H_ 497