scanner.h revision 80d68eab642096c1a48b6474d6ec33064b0ad1f5
// Copyright 2006-2008 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 28#ifndef V8_SCANNER_H_ 29#define V8_SCANNER_H_ 30 31#include "token.h" 32#include "char-predicates-inl.h" 33 34namespace v8 { 35namespace internal { 36 37 38class UTF8Buffer { 39 public: 40 UTF8Buffer(); 41 ~UTF8Buffer(); 42 43 inline void AddChar(uc32 c) { 44 if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) { 45 buffer_.Add(static_cast<char>(c)); 46 } else { 47 AddCharSlow(c); 48 } 49 } 50 51 void StartLiteral() { 52 buffer_.StartSequence(); 53 } 54 55 Vector<const char> EndLiteral() { 56 buffer_.Add(kEndMarker); 57 Vector<char> sequence = buffer_.EndSequence(); 58 return Vector<const char>(sequence.start(), sequence.length()); 59 } 60 61 void DropLiteral() { 62 buffer_.DropSequence(); 63 } 64 65 void Reset() { 66 buffer_.Reset(); 67 } 68 69 // The end marker added after a parsed literal. 70 // Using zero allows the usage of strlen and similar functions on 71 // identifiers and numbers (but not strings, since they may contain zero 72 // bytes). 73 // TODO(lrn): Use '\xff' as end marker, since it cannot occur inside 74 // an utf-8 string. This requires changes in all places that uses 75 // str-functions on the literals, but allows a single pointer to represent 76 // the literal, even if it contains embedded zeros. 77 static const char kEndMarker = '\x00'; 78 private: 79 static const int kInitialCapacity = 256; 80 SequenceCollector<char, 4> buffer_; 81 82 void AddCharSlow(uc32 c); 83}; 84 85 86// Interface through which the scanner reads characters from the input source. 87class UTF16Buffer { 88 public: 89 UTF16Buffer(); 90 virtual ~UTF16Buffer() {} 91 92 virtual void PushBack(uc32 ch) = 0; 93 // Returns a value < 0 when the buffer end is reached. 94 virtual uc32 Advance() = 0; 95 virtual void SeekForward(int pos) = 0; 96 97 int pos() const { return pos_; } 98 99 protected: 100 int pos_; // Current position in the buffer. 101 int end_; // Position where scanning should stop (EOF). 
102}; 103 104 105// UTF16 buffer to read characters from a character stream. 106class CharacterStreamUTF16Buffer: public UTF16Buffer { 107 public: 108 CharacterStreamUTF16Buffer(); 109 virtual ~CharacterStreamUTF16Buffer() {} 110 void Initialize(Handle<String> data, 111 unibrow::CharacterStream* stream, 112 int start_position, 113 int end_position); 114 virtual void PushBack(uc32 ch); 115 virtual uc32 Advance(); 116 virtual void SeekForward(int pos); 117 118 private: 119 List<uc32> pushback_buffer_; 120 uc32 last_; 121 unibrow::CharacterStream* stream_; 122 123 List<uc32>* pushback_buffer() { return &pushback_buffer_; } 124}; 125 126 127// UTF16 buffer to read characters from an external string. 128template <typename StringType, typename CharType> 129class ExternalStringUTF16Buffer: public UTF16Buffer { 130 public: 131 ExternalStringUTF16Buffer(); 132 virtual ~ExternalStringUTF16Buffer() {} 133 void Initialize(Handle<StringType> data, 134 int start_position, 135 int end_position); 136 virtual void PushBack(uc32 ch); 137 virtual uc32 Advance(); 138 virtual void SeekForward(int pos); 139 140 private: 141 const CharType* raw_data_; // Pointer to the actual array of characters. 142}; 143 144 145class KeywordMatcher { 146// Incrementally recognize keywords. 147// 148// Recognized keywords: 149// break case catch const* continue debugger* default delete do else 150// finally false for function if in instanceof native* new null 151// return switch this throw true try typeof var void while with 152// 153// *: Actually "future reserved keywords". These are the only ones we 154// recognized, the remaining are allowed as identifiers. 
155 public: 156 KeywordMatcher() 157 : state_(INITIAL), 158 token_(Token::IDENTIFIER), 159 keyword_(NULL), 160 counter_(0), 161 keyword_token_(Token::ILLEGAL) {} 162 163 Token::Value token() { return token_; } 164 165 inline void AddChar(uc32 input) { 166 if (state_ != UNMATCHABLE) { 167 Step(input); 168 } 169 } 170 171 void Fail() { 172 token_ = Token::IDENTIFIER; 173 state_ = UNMATCHABLE; 174 } 175 176 private: 177 enum State { 178 UNMATCHABLE, 179 INITIAL, 180 KEYWORD_PREFIX, 181 KEYWORD_MATCHED, 182 C, 183 CA, 184 CO, 185 CON, 186 D, 187 DE, 188 F, 189 I, 190 IN, 191 N, 192 T, 193 TH, 194 TR, 195 V, 196 W 197 }; 198 199 struct FirstState { 200 const char* keyword; 201 State state; 202 Token::Value token; 203 }; 204 205 // Range of possible first characters of a keyword. 206 static const unsigned int kFirstCharRangeMin = 'b'; 207 static const unsigned int kFirstCharRangeMax = 'w'; 208 static const unsigned int kFirstCharRangeLength = 209 kFirstCharRangeMax - kFirstCharRangeMin + 1; 210 // State map for first keyword character range. 211 static FirstState first_states_[kFirstCharRangeLength]; 212 213 // If input equals keyword's character at position, continue matching keyword 214 // from that position. 215 inline bool MatchKeywordStart(uc32 input, 216 const char* keyword, 217 int position, 218 Token::Value token_if_match) { 219 if (input == keyword[position]) { 220 state_ = KEYWORD_PREFIX; 221 this->keyword_ = keyword; 222 this->counter_ = position + 1; 223 this->keyword_token_ = token_if_match; 224 return true; 225 } 226 return false; 227 } 228 229 // If input equals match character, transition to new state and return true. 
230 inline bool MatchState(uc32 input, char match, State new_state) { 231 if (input == match) { 232 state_ = new_state; 233 return true; 234 } 235 return false; 236 } 237 238 inline bool MatchKeyword(uc32 input, 239 char match, 240 State new_state, 241 Token::Value keyword_token) { 242 if (input != match) { 243 return false; 244 } 245 state_ = new_state; 246 token_ = keyword_token; 247 return true; 248 } 249 250 void Step(uc32 input); 251 252 // Current state. 253 State state_; 254 // Token for currently added characters. 255 Token::Value token_; 256 257 // Matching a specific keyword string (there is only one possible valid 258 // keyword with the current prefix). 259 const char* keyword_; 260 int counter_; 261 Token::Value keyword_token_; 262}; 263 264 265enum ParserMode { PARSE, PREPARSE }; 266enum ParserLanguage { JAVASCRIPT, JSON }; 267 268 269class Scanner { 270 public: 271 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; 272 273 class LiteralScope { 274 public: 275 explicit LiteralScope(Scanner* self); 276 ~LiteralScope(); 277 void Complete(); 278 279 private: 280 Scanner* scanner_; 281 bool complete_; 282 }; 283 284 // Construction 285 explicit Scanner(ParserMode parse_mode); 286 287 // Initialize the Scanner to scan source. 288 void Initialize(Handle<String> source, 289 ParserLanguage language); 290 void Initialize(Handle<String> source, 291 unibrow::CharacterStream* stream, 292 ParserLanguage language); 293 void Initialize(Handle<String> source, 294 int start_position, int end_position, 295 ParserLanguage language); 296 297 // Returns the next token. 298 Token::Value Next(); 299 300 // One token look-ahead (past the token returned by Next()). 301 Token::Value peek() const { return next_.token; } 302 303 // Returns true if there was a line terminator before the peek'ed token. 
304 bool has_line_terminator_before_next() const { 305 return has_line_terminator_before_next_; 306 } 307 308 struct Location { 309 Location(int b, int e) : beg_pos(b), end_pos(e) { } 310 Location() : beg_pos(0), end_pos(0) { } 311 int beg_pos; 312 int end_pos; 313 }; 314 315 // Returns the location information for the current token 316 // (the token returned by Next()). 317 Location location() const { return current_.location; } 318 Location peek_location() const { return next_.location; } 319 320 // Returns the literal string, if any, for the current token (the 321 // token returned by Next()). The string is 0-terminated and in 322 // UTF-8 format; they may contain 0-characters. Literal strings are 323 // collected for identifiers, strings, and numbers. 324 // These functions only give the correct result if the literal 325 // was scanned between calls to StartLiteral() and TerminateLiteral(). 326 const char* literal_string() const { 327 return current_.literal_chars.start(); 328 } 329 330 int literal_length() const { 331 // Excluding terminal '\x00' added by TerminateLiteral(). 332 return current_.literal_chars.length() - 1; 333 } 334 335 Vector<const char> literal() const { 336 return Vector<const char>(literal_string(), literal_length()); 337 } 338 339 // Returns the literal string for the next token (the token that 340 // would be returned if Next() were called). 341 const char* next_literal_string() const { 342 return next_.literal_chars.start(); 343 } 344 345 346 // Returns the length of the next token (that would be returned if 347 // Next() were called). 348 int next_literal_length() const { 349 // Excluding terminal '\x00' added by TerminateLiteral(). 350 return next_.literal_chars.length() - 1; 351 } 352 353 Vector<const char> next_literal() const { 354 return Vector<const char>(next_literal_string(), next_literal_length()); 355 } 356 357 // Scans the input as a regular expression pattern, previous 358 // character(s) must be /(=). 
Returns true if a pattern is scanned. 359 bool ScanRegExpPattern(bool seen_equal); 360 // Returns true if regexp flags are scanned (always since flags can 361 // be empty). 362 bool ScanRegExpFlags(); 363 364 // Seek forward to the given position. This operation does not 365 // work in general, for instance when there are pushed back 366 // characters, but works for seeking forward until simple delimiter 367 // tokens, which is what it is used for. 368 void SeekForward(int pos); 369 370 bool stack_overflow() { return stack_overflow_; } 371 372 static StaticResource<Utf8Decoder>* utf8_decoder() { return &utf8_decoder_; } 373 374 // Tells whether the buffer contains an identifier (no escapes). 375 // Used for checking if a property name is an identifier. 376 static bool IsIdentifier(unibrow::CharacterStream* buffer); 377 378 static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart; 379 static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart; 380 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; 381 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace; 382 383 static const int kCharacterLookaheadBufferSize = 1; 384 static const int kNoEndPosition = 1; 385 386 private: 387 // The current and look-ahead token. 388 struct TokenDesc { 389 Token::Value token; 390 Location location; 391 Vector<const char> literal_chars; 392 }; 393 394 void Init(Handle<String> source, 395 unibrow::CharacterStream* stream, 396 int start_position, int end_position, 397 ParserLanguage language); 398 399 // Literal buffer support 400 inline void StartLiteral(); 401 inline void AddChar(uc32 ch); 402 inline void AddCharAdvance(); 403 inline void TerminateLiteral(); 404 // Stops scanning of a literal, e.g., due to an encountered error. 405 inline void DropLiteral(); 406 407 // Low-level scanning support. 
408 void Advance() { c0_ = source_->Advance(); } 409 void PushBack(uc32 ch) { 410 source_->PushBack(ch); 411 c0_ = ch; 412 } 413 414 bool SkipWhiteSpace() { 415 if (is_parsing_json_) { 416 return SkipJsonWhiteSpace(); 417 } else { 418 return SkipJavaScriptWhiteSpace(); 419 } 420 } 421 422 bool SkipJavaScriptWhiteSpace(); 423 bool SkipJsonWhiteSpace(); 424 Token::Value SkipSingleLineComment(); 425 Token::Value SkipMultiLineComment(); 426 427 inline Token::Value Select(Token::Value tok); 428 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_); 429 430 inline void Scan() { 431 if (is_parsing_json_) { 432 ScanJson(); 433 } else { 434 ScanJavaScript(); 435 } 436 } 437 438 // Scans a single JavaScript token. 439 void ScanJavaScript(); 440 441 // Scan a single JSON token. The JSON lexical grammar is specified in the 442 // ECMAScript 5 standard, section 15.12.1.1. 443 // Recognizes all of the single-character tokens directly, or calls a function 444 // to scan a number, string or identifier literal. 445 // The only allowed whitespace characters between tokens are tab, 446 // carrige-return, newline and space. 447 void ScanJson(); 448 449 // A JSON number (production JSONNumber) is a subset of the valid JavaScript 450 // decimal number literals. 451 // It includes an optional minus sign, must have at least one 452 // digit before and after a decimal point, may not have prefixed zeros (unless 453 // the integer part is zero), and may include an exponent part (e.g., "e-10"). 454 // Hexadecimal and octal numbers are not allowed. 455 Token::Value ScanJsonNumber(); 456 457 // A JSON string (production JSONString) is subset of valid JavaScript string 458 // literals. The string must only be double-quoted (not single-quoted), and 459 // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and 460 // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid. 
461 Token::Value ScanJsonString(); 462 463 // Used to recognizes one of the literals "true", "false", or "null". These 464 // are the only valid JSON identifiers (productions JSONBooleanLiteral, 465 // JSONNullLiteral). 466 Token::Value ScanJsonIdentifier(const char* text, Token::Value token); 467 468 void ScanDecimalDigits(); 469 Token::Value ScanNumber(bool seen_period); 470 Token::Value ScanIdentifier(); 471 uc32 ScanHexEscape(uc32 c, int length); 472 uc32 ScanOctalEscape(uc32 c, int length); 473 void ScanEscape(); 474 Token::Value ScanString(); 475 476 // Scans a possible HTML comment -- begins with '<!'. 477 Token::Value ScanHtmlComment(); 478 479 // Return the current source position. 480 int source_pos() { 481 return source_->pos() - kCharacterLookaheadBufferSize; 482 } 483 484 // Decodes a unicode escape-sequence which is part of an identifier. 485 // If the escape sequence cannot be decoded the result is kBadRune. 486 uc32 ScanIdentifierUnicodeEscape(); 487 488 TokenDesc current_; // desc for current token (as returned by Next()) 489 TokenDesc next_; // desc for next token (one token look-ahead) 490 bool has_line_terminator_before_next_; 491 bool is_pre_parsing_; 492 bool is_parsing_json_; 493 494 // Different UTF16 buffers used to pull characters from. Based on input one of 495 // these will be initialized as the actual data source. 496 CharacterStreamUTF16Buffer char_stream_buffer_; 497 ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t> 498 two_byte_string_buffer_; 499 ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_; 500 501 // Source. Will point to one of the buffers declared above. 502 UTF16Buffer* source_; 503 504 // Used to convert the source string into a character stream when a stream 505 // is not passed to the scanner. 506 SafeStringInputBuffer safe_string_input_buffer_; 507 508 // Buffer to hold literal values (identifiers, strings, numbers) 509 // using '\x00'-terminated UTF-8 encoding. 
Handles allocation internally. 510 UTF8Buffer literal_buffer_; 511 512 bool stack_overflow_; 513 static StaticResource<Utf8Decoder> utf8_decoder_; 514 515 // One Unicode character look-ahead; c0_ < 0 at the end of the input. 516 uc32 c0_; 517}; 518 519} } // namespace v8::internal 520 521#endif // V8_SCANNER_H_ 522