scanner.h revision d0582a6c46733687d045e4188a1bcd0123c758a1
// Copyright 2006-2008 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 28#ifndef V8_SCANNER_H_ 29#define V8_SCANNER_H_ 30 31#include "token.h" 32#include "char-predicates-inl.h" 33 34namespace v8 { 35namespace internal { 36 37 38class UTF8Buffer { 39 public: 40 UTF8Buffer(); 41 ~UTF8Buffer(); 42 43 void AddChar(uc32 c) { 44 ASSERT_NOT_NULL(data_); 45 if (cursor_ <= limit_ && 46 static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) { 47 *cursor_++ = static_cast<char>(c); 48 } else { 49 AddCharSlow(c); 50 } 51 } 52 53 void Reset() { 54 if (data_ == NULL) { 55 data_ = NewArray<char>(kInitialCapacity); 56 limit_ = ComputeLimit(data_, kInitialCapacity); 57 } 58 cursor_ = data_; 59 } 60 61 int pos() const { 62 ASSERT_NOT_NULL(data_); 63 return static_cast<int>(cursor_ - data_); 64 } 65 66 char* data() const { return data_; } 67 68 private: 69 static const int kInitialCapacity = 256; 70 char* data_; 71 char* cursor_; 72 char* limit_; 73 74 int Capacity() const { 75 ASSERT_NOT_NULL(data_); 76 return static_cast<int>(limit_ - data_) + unibrow::Utf8::kMaxEncodedSize; 77 } 78 79 static char* ComputeLimit(char* data, int capacity) { 80 return (data + capacity) - unibrow::Utf8::kMaxEncodedSize; 81 } 82 83 void AddCharSlow(uc32 c); 84}; 85 86 87class UTF16Buffer { 88 public: 89 UTF16Buffer(); 90 virtual ~UTF16Buffer() {} 91 92 virtual void PushBack(uc32 ch) = 0; 93 // returns a value < 0 when the buffer end is reached 94 virtual uc32 Advance() = 0; 95 virtual void SeekForward(int pos) = 0; 96 97 int pos() const { return pos_; } 98 int size() const { return size_; } 99 Handle<String> SubString(int start, int end); 100 101 protected: 102 Handle<String> data_; 103 int pos_; 104 int size_; 105}; 106 107 108class CharacterStreamUTF16Buffer: public UTF16Buffer { 109 public: 110 CharacterStreamUTF16Buffer(); 111 virtual ~CharacterStreamUTF16Buffer() {} 112 void Initialize(Handle<String> data, unibrow::CharacterStream* stream); 113 virtual void PushBack(uc32 ch); 114 virtual uc32 Advance(); 115 virtual void SeekForward(int pos); 116 117 private: 
118 List<uc32> pushback_buffer_; 119 uc32 last_; 120 unibrow::CharacterStream* stream_; 121 122 List<uc32>* pushback_buffer() { return &pushback_buffer_; } 123}; 124 125 126class TwoByteStringUTF16Buffer: public UTF16Buffer { 127 public: 128 TwoByteStringUTF16Buffer(); 129 virtual ~TwoByteStringUTF16Buffer() {} 130 void Initialize(Handle<ExternalTwoByteString> data); 131 virtual void PushBack(uc32 ch); 132 virtual uc32 Advance(); 133 virtual void SeekForward(int pos); 134 135 private: 136 const uint16_t* raw_data_; 137}; 138 139 140class KeywordMatcher { 141// Incrementally recognize keywords. 142// 143// Recognized keywords: 144// break case catch const* continue debugger* default delete do else 145// finally false for function if in instanceof native* new null 146// return switch this throw true try typeof var void while with 147// 148// *: Actually "future reserved keywords". These are the only ones we 149// recognized, the remaining are allowed as identifiers. 150 public: 151 KeywordMatcher() : state_(INITIAL), token_(Token::IDENTIFIER) {} 152 153 Token::Value token() { return token_; } 154 155 inline void AddChar(uc32 input) { 156 if (state_ != UNMATCHABLE) { 157 Step(input); 158 } 159 } 160 161 void Fail() { 162 token_ = Token::IDENTIFIER; 163 state_ = UNMATCHABLE; 164 } 165 166 private: 167 enum State { 168 UNMATCHABLE, 169 INITIAL, 170 KEYWORD_PREFIX, 171 KEYWORD_MATCHED, 172 C, 173 CA, 174 CO, 175 CON, 176 D, 177 DE, 178 F, 179 I, 180 IN, 181 N, 182 T, 183 TH, 184 TR, 185 V, 186 W 187 }; 188 189 struct FirstState { 190 const char* keyword; 191 State state; 192 Token::Value token; 193 }; 194 195 // Range of possible first characters of a keyword. 196 static const unsigned int kFirstCharRangeMin = 'b'; 197 static const unsigned int kFirstCharRangeMax = 'w'; 198 static const unsigned int kFirstCharRangeLength = 199 kFirstCharRangeMax - kFirstCharRangeMin + 1; 200 // State map for first keyword character range. 
201 static FirstState first_states_[kFirstCharRangeLength]; 202 203 // Current state. 204 State state_; 205 // Token for currently added characters. 206 Token::Value token_; 207 208 // Matching a specific keyword string (there is only one possible valid 209 // keyword with the current prefix). 210 const char* keyword_; 211 int counter_; 212 Token::Value keyword_token_; 213 214 // If input equals keyword's character at position, continue matching keyword 215 // from that position. 216 inline bool MatchKeywordStart(uc32 input, 217 const char* keyword, 218 int position, 219 Token::Value token_if_match) { 220 if (input == keyword[position]) { 221 state_ = KEYWORD_PREFIX; 222 this->keyword_ = keyword; 223 this->counter_ = position + 1; 224 this->keyword_token_ = token_if_match; 225 return true; 226 } 227 return false; 228 } 229 230 // If input equals match character, transition to new state and return true. 231 inline bool MatchState(uc32 input, char match, State new_state) { 232 if (input == match) { 233 state_ = new_state; 234 return true; 235 } 236 return false; 237 } 238 239 inline bool MatchKeyword(uc32 input, 240 char match, 241 State new_state, 242 Token::Value keyword_token) { 243 if (input == match) { // Matched "do". 244 state_ = new_state; 245 token_ = keyword_token; 246 return true; 247 } 248 return false; 249 } 250 251 void Step(uc32 input); 252}; 253 254 255class Scanner { 256 public: 257 258 typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder; 259 260 // Construction 261 explicit Scanner(bool is_pre_parsing); 262 263 // Initialize the Scanner to scan source: 264 void Init(Handle<String> source, 265 unibrow::CharacterStream* stream, 266 int position); 267 268 // Returns the next token. 269 Token::Value Next(); 270 271 // One token look-ahead (past the token returned by Next()). 272 Token::Value peek() const { return next_.token; } 273 274 // Returns true if there was a line terminator before the peek'ed token. 
275 bool has_line_terminator_before_next() const { 276 return has_line_terminator_before_next_; 277 } 278 279 struct Location { 280 Location(int b, int e) : beg_pos(b), end_pos(e) { } 281 Location() : beg_pos(0), end_pos(0) { } 282 int beg_pos; 283 int end_pos; 284 }; 285 286 // Returns the location information for the current token 287 // (the token returned by Next()). 288 Location location() const { return current_.location; } 289 Location peek_location() const { return next_.location; } 290 291 // Returns the literal string, if any, for the current token (the 292 // token returned by Next()). The string is 0-terminated and in 293 // UTF-8 format; they may contain 0-characters. Literal strings are 294 // collected for identifiers, strings, and numbers. 295 // These functions only give the correct result if the literal 296 // was scanned between calls to StartLiteral() and TerminateLiteral(). 297 const char* literal_string() const { 298 return current_.literal_buffer->data(); 299 } 300 int literal_length() const { 301 // Excluding terminal '\0' added by TerminateLiteral(). 302 return current_.literal_buffer->pos() - 1; 303 } 304 305 // Returns the literal string for the next token (the token that 306 // would be returned if Next() were called). 307 const char* next_literal_string() const { 308 return next_.literal_buffer->data(); 309 } 310 // Returns the length of the next token (that would be returned if 311 // Next() were called). 312 int next_literal_length() const { 313 return next_.literal_buffer->pos() - 1; 314 } 315 316 Vector<const char> next_literal() const { 317 return Vector<const char>(next_literal_string(), 318 next_literal_length()); 319 } 320 321 // Scans the input as a regular expression pattern, previous 322 // character(s) must be /(=). Returns true if a pattern is scanned. 323 bool ScanRegExpPattern(bool seen_equal); 324 // Returns true if regexp flags are scanned (always since flags can 325 // be empty). 
326 bool ScanRegExpFlags(); 327 328 // Seek forward to the given position. This operation does not 329 // work in general, for instance when there are pushed back 330 // characters, but works for seeking forward until simple delimiter 331 // tokens, which is what it is used for. 332 void SeekForward(int pos); 333 334 Handle<String> SubString(int start_pos, int end_pos); 335 bool stack_overflow() { return stack_overflow_; } 336 337 static StaticResource<Utf8Decoder>* utf8_decoder() { return &utf8_decoder_; } 338 339 // Tells whether the buffer contains an identifier (no escapes). 340 // Used for checking if a property name is an identifier. 341 static bool IsIdentifier(unibrow::CharacterStream* buffer); 342 343 static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart; 344 static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart; 345 static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator; 346 static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace; 347 348 static const int kCharacterLookaheadBufferSize = 1; 349 350 private: 351 CharacterStreamUTF16Buffer char_stream_buffer_; 352 TwoByteStringUTF16Buffer two_byte_string_buffer_; 353 354 // Source. 355 UTF16Buffer* source_; 356 int position_; 357 358 // Buffer to hold literal values (identifiers, strings, numbers) 359 // using 0-terminated UTF-8 encoding. 360 UTF8Buffer literal_buffer_1_; 361 UTF8Buffer literal_buffer_2_; 362 363 bool stack_overflow_; 364 static StaticResource<Utf8Decoder> utf8_decoder_; 365 366 // One Unicode character look-ahead; c0_ < 0 at the end of the input. 367 uc32 c0_; 368 369 // The current and look-ahead token. 
370 struct TokenDesc { 371 Token::Value token; 372 Location location; 373 UTF8Buffer* literal_buffer; 374 }; 375 376 TokenDesc current_; // desc for current token (as returned by Next()) 377 TokenDesc next_; // desc for next token (one token look-ahead) 378 bool has_line_terminator_before_next_; 379 bool is_pre_parsing_; 380 381 // Literal buffer support 382 void StartLiteral(); 383 void AddChar(uc32 ch); 384 void AddCharAdvance(); 385 void TerminateLiteral(); 386 387 // Low-level scanning support. 388 void Advance() { c0_ = source_->Advance(); } 389 void PushBack(uc32 ch) { 390 source_->PushBack(ch); 391 c0_ = ch; 392 } 393 394 bool SkipWhiteSpace(); 395 Token::Value SkipSingleLineComment(); 396 Token::Value SkipMultiLineComment(); 397 398 inline Token::Value Select(Token::Value tok); 399 inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_); 400 401 void Scan(); 402 void ScanDecimalDigits(); 403 Token::Value ScanNumber(bool seen_period); 404 Token::Value ScanIdentifier(); 405 uc32 ScanHexEscape(uc32 c, int length); 406 uc32 ScanOctalEscape(uc32 c, int length); 407 void ScanEscape(); 408 Token::Value ScanString(); 409 410 // Scans a possible HTML comment -- begins with '<!'. 411 Token::Value ScanHtmlComment(); 412 413 // Return the current source position. 414 int source_pos() { 415 return source_->pos() - kCharacterLookaheadBufferSize + position_; 416 } 417 418 // Decodes a unicode escape-sequence which is part of an identifier. 419 // If the escape sequence cannot be decoded the result is kBadRune. 420 uc32 ScanIdentifierUnicodeEscape(); 421}; 422 423} } // namespace v8::internal 424 425#endif // V8_SCANNER_H_ 426