// scanner.h revision d0582a6c46733687d045e4188a1bcd0123c758a1
// Copyright 2006-2008 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#ifndef V8_SCANNER_H_
29#define V8_SCANNER_H_
30
31#include "token.h"
32#include "char-predicates-inl.h"
33
34namespace v8 {
35namespace internal {
36
37
38class UTF8Buffer {
39 public:
40  UTF8Buffer();
41  ~UTF8Buffer();
42
43  void AddChar(uc32 c) {
44    ASSERT_NOT_NULL(data_);
45    if (cursor_ <= limit_ &&
46        static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
47      *cursor_++ = static_cast<char>(c);
48    } else {
49      AddCharSlow(c);
50    }
51  }
52
53  void Reset() {
54    if (data_ == NULL) {
55      data_ = NewArray<char>(kInitialCapacity);
56      limit_ = ComputeLimit(data_, kInitialCapacity);
57    }
58    cursor_ = data_;
59  }
60
61  int pos() const {
62    ASSERT_NOT_NULL(data_);
63    return static_cast<int>(cursor_ - data_);
64  }
65
66  char* data() const { return data_; }
67
68 private:
69  static const int kInitialCapacity = 256;
70  char* data_;
71  char* cursor_;
72  char* limit_;
73
74  int Capacity() const {
75    ASSERT_NOT_NULL(data_);
76    return static_cast<int>(limit_ - data_) + unibrow::Utf8::kMaxEncodedSize;
77  }
78
79  static char* ComputeLimit(char* data, int capacity) {
80    return (data + capacity) - unibrow::Utf8::kMaxEncodedSize;
81  }
82
83  void AddCharSlow(uc32 c);
84};
85
86
87class UTF16Buffer {
88 public:
89  UTF16Buffer();
90  virtual ~UTF16Buffer() {}
91
92  virtual void PushBack(uc32 ch) = 0;
93  // returns a value < 0 when the buffer end is reached
94  virtual uc32 Advance() = 0;
95  virtual void SeekForward(int pos) = 0;
96
97  int pos() const { return pos_; }
98  int size() const { return size_; }
99  Handle<String> SubString(int start, int end);
100
101 protected:
102  Handle<String> data_;
103  int pos_;
104  int size_;
105};
106
107
108class CharacterStreamUTF16Buffer: public UTF16Buffer {
109 public:
110  CharacterStreamUTF16Buffer();
111  virtual ~CharacterStreamUTF16Buffer() {}
112  void Initialize(Handle<String> data, unibrow::CharacterStream* stream);
113  virtual void PushBack(uc32 ch);
114  virtual uc32 Advance();
115  virtual void SeekForward(int pos);
116
117 private:
118  List<uc32> pushback_buffer_;
119  uc32 last_;
120  unibrow::CharacterStream* stream_;
121
122  List<uc32>* pushback_buffer() { return &pushback_buffer_; }
123};
124
125
126class TwoByteStringUTF16Buffer: public UTF16Buffer {
127 public:
128  TwoByteStringUTF16Buffer();
129  virtual ~TwoByteStringUTF16Buffer() {}
130  void Initialize(Handle<ExternalTwoByteString> data);
131  virtual void PushBack(uc32 ch);
132  virtual uc32 Advance();
133  virtual void SeekForward(int pos);
134
135 private:
136  const uint16_t* raw_data_;
137};
138
139
140class KeywordMatcher {
141//  Incrementally recognize keywords.
142//
143//  Recognized keywords:
144//      break case catch const* continue debugger* default delete do else
145//      finally false for function if in instanceof native* new null
146//      return switch this throw true try typeof var void while with
147//
148//  *: Actually "future reserved keywords". These are the only ones we
149//     recognized, the remaining are allowed as identifiers.
150 public:
151  KeywordMatcher() : state_(INITIAL), token_(Token::IDENTIFIER) {}
152
153  Token::Value token() { return token_; }
154
155  inline void AddChar(uc32 input) {
156    if (state_ != UNMATCHABLE) {
157      Step(input);
158    }
159  }
160
161  void Fail() {
162    token_ = Token::IDENTIFIER;
163    state_ = UNMATCHABLE;
164  }
165
166 private:
167  enum State {
168    UNMATCHABLE,
169    INITIAL,
170    KEYWORD_PREFIX,
171    KEYWORD_MATCHED,
172    C,
173    CA,
174    CO,
175    CON,
176    D,
177    DE,
178    F,
179    I,
180    IN,
181    N,
182    T,
183    TH,
184    TR,
185    V,
186    W
187  };
188
189  struct FirstState {
190    const char* keyword;
191    State state;
192    Token::Value token;
193  };
194
195  // Range of possible first characters of a keyword.
196  static const unsigned int kFirstCharRangeMin = 'b';
197  static const unsigned int kFirstCharRangeMax = 'w';
198  static const unsigned int kFirstCharRangeLength =
199      kFirstCharRangeMax - kFirstCharRangeMin + 1;
200  // State map for first keyword character range.
201  static FirstState first_states_[kFirstCharRangeLength];
202
203  // Current state.
204  State state_;
205  // Token for currently added characters.
206  Token::Value token_;
207
208  // Matching a specific keyword string (there is only one possible valid
209  // keyword with the current prefix).
210  const char* keyword_;
211  int counter_;
212  Token::Value keyword_token_;
213
214  // If input equals keyword's character at position, continue matching keyword
215  // from that position.
216  inline bool MatchKeywordStart(uc32 input,
217                                const char* keyword,
218                                int position,
219                                Token::Value token_if_match) {
220    if (input == keyword[position]) {
221      state_ = KEYWORD_PREFIX;
222      this->keyword_ = keyword;
223      this->counter_ = position + 1;
224      this->keyword_token_ = token_if_match;
225      return true;
226    }
227    return false;
228  }
229
230  // If input equals match character, transition to new state and return true.
231  inline bool MatchState(uc32 input, char match, State new_state) {
232    if (input == match) {
233      state_ = new_state;
234      return true;
235    }
236    return false;
237  }
238
239  inline bool MatchKeyword(uc32 input,
240                           char match,
241                           State new_state,
242                           Token::Value keyword_token) {
243    if (input == match) {  // Matched "do".
244      state_ = new_state;
245      token_ = keyword_token;
246      return true;
247    }
248    return false;
249  }
250
251  void Step(uc32 input);
252};
253
254
255class Scanner {
256 public:
257
258  typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
259
260  // Construction
261  explicit Scanner(bool is_pre_parsing);
262
263  // Initialize the Scanner to scan source:
264  void Init(Handle<String> source,
265            unibrow::CharacterStream* stream,
266            int position);
267
268  // Returns the next token.
269  Token::Value Next();
270
271  // One token look-ahead (past the token returned by Next()).
272  Token::Value peek() const  { return next_.token; }
273
274  // Returns true if there was a line terminator before the peek'ed token.
275  bool has_line_terminator_before_next() const {
276    return has_line_terminator_before_next_;
277  }
278
279  struct Location {
280    Location(int b, int e) : beg_pos(b), end_pos(e) { }
281    Location() : beg_pos(0), end_pos(0) { }
282    int beg_pos;
283    int end_pos;
284  };
285
286  // Returns the location information for the current token
287  // (the token returned by Next()).
288  Location location() const  { return current_.location; }
289  Location peek_location() const  { return next_.location; }
290
291  // Returns the literal string, if any, for the current token (the
292  // token returned by Next()). The string is 0-terminated and in
293  // UTF-8 format; they may contain 0-characters. Literal strings are
294  // collected for identifiers, strings, and numbers.
295  // These functions only give the correct result if the literal
296  // was scanned between calls to StartLiteral() and TerminateLiteral().
297  const char* literal_string() const {
298    return current_.literal_buffer->data();
299  }
300  int literal_length() const {
301    // Excluding terminal '\0' added by TerminateLiteral().
302    return current_.literal_buffer->pos() - 1;
303  }
304
305  // Returns the literal string for the next token (the token that
306  // would be returned if Next() were called).
307  const char* next_literal_string() const {
308    return next_.literal_buffer->data();
309  }
310  // Returns the length of the next token (that would be returned if
311  // Next() were called).
312  int next_literal_length() const {
313    return next_.literal_buffer->pos() - 1;
314  }
315
316  Vector<const char> next_literal() const {
317    return Vector<const char>(next_literal_string(),
318                              next_literal_length());
319  }
320
321  // Scans the input as a regular expression pattern, previous
322  // character(s) must be /(=). Returns true if a pattern is scanned.
323  bool ScanRegExpPattern(bool seen_equal);
324  // Returns true if regexp flags are scanned (always since flags can
325  // be empty).
326  bool ScanRegExpFlags();
327
328  // Seek forward to the given position.  This operation does not
329  // work in general, for instance when there are pushed back
330  // characters, but works for seeking forward until simple delimiter
331  // tokens, which is what it is used for.
332  void SeekForward(int pos);
333
334  Handle<String> SubString(int start_pos, int end_pos);
335  bool stack_overflow() { return stack_overflow_; }
336
337  static StaticResource<Utf8Decoder>* utf8_decoder() { return &utf8_decoder_; }
338
339  // Tells whether the buffer contains an identifier (no escapes).
340  // Used for checking if a property name is an identifier.
341  static bool IsIdentifier(unibrow::CharacterStream* buffer);
342
343  static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
344  static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
345  static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
346  static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
347
348  static const int kCharacterLookaheadBufferSize = 1;
349
350 private:
351  CharacterStreamUTF16Buffer char_stream_buffer_;
352  TwoByteStringUTF16Buffer two_byte_string_buffer_;
353
354  // Source.
355  UTF16Buffer* source_;
356  int position_;
357
358  // Buffer to hold literal values (identifiers, strings, numbers)
359  // using 0-terminated UTF-8 encoding.
360  UTF8Buffer literal_buffer_1_;
361  UTF8Buffer literal_buffer_2_;
362
363  bool stack_overflow_;
364  static StaticResource<Utf8Decoder> utf8_decoder_;
365
366  // One Unicode character look-ahead; c0_ < 0 at the end of the input.
367  uc32 c0_;
368
369  // The current and look-ahead token.
370  struct TokenDesc {
371    Token::Value token;
372    Location location;
373    UTF8Buffer* literal_buffer;
374  };
375
376  TokenDesc current_;  // desc for current token (as returned by Next())
377  TokenDesc next_;     // desc for next token (one token look-ahead)
378  bool has_line_terminator_before_next_;
379  bool is_pre_parsing_;
380
381  // Literal buffer support
382  void StartLiteral();
383  void AddChar(uc32 ch);
384  void AddCharAdvance();
385  void TerminateLiteral();
386
387  // Low-level scanning support.
388  void Advance() { c0_ = source_->Advance(); }
389  void PushBack(uc32 ch) {
390    source_->PushBack(ch);
391    c0_ = ch;
392  }
393
394  bool SkipWhiteSpace();
395  Token::Value SkipSingleLineComment();
396  Token::Value SkipMultiLineComment();
397
398  inline Token::Value Select(Token::Value tok);
399  inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_);
400
401  void Scan();
402  void ScanDecimalDigits();
403  Token::Value ScanNumber(bool seen_period);
404  Token::Value ScanIdentifier();
405  uc32 ScanHexEscape(uc32 c, int length);
406  uc32 ScanOctalEscape(uc32 c, int length);
407  void ScanEscape();
408  Token::Value ScanString();
409
410  // Scans a possible HTML comment -- begins with '<!'.
411  Token::Value ScanHtmlComment();
412
413  // Return the current source position.
414  int source_pos() {
415    return source_->pos() - kCharacterLookaheadBufferSize + position_;
416  }
417
418  // Decodes a unicode escape-sequence which is part of an identifier.
419  // If the escape sequence cannot be decoded the result is kBadRune.
420  uc32 ScanIdentifierUnicodeEscape();
421};
422
423} }  // namespace v8::internal
424
425#endif  // V8_SCANNER_H_
426