scanner.h revision 6ded16be15dd865a9b21ea304d5273c8be299c87
1// Copyright 2006-2008 the V8 project authors. All rights reserved.
2// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6//     * Redistributions of source code must retain the above copyright
7//       notice, this list of conditions and the following disclaimer.
8//     * Redistributions in binary form must reproduce the above
9//       copyright notice, this list of conditions and the following
10//       disclaimer in the documentation and/or other materials provided
11//       with the distribution.
12//     * Neither the name of Google Inc. nor the names of its
13//       contributors may be used to endorse or promote products derived
14//       from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#ifndef V8_SCANNER_H_
29#define V8_SCANNER_H_
30
31#include "token.h"
32#include "char-predicates-inl.h"
33
34namespace v8 {
35namespace internal {
36
37
38class UTF8Buffer {
39 public:
40  UTF8Buffer();
41  ~UTF8Buffer();
42
43  void AddChar(uc32 c) {
44    ASSERT_NOT_NULL(data_);
45    if (cursor_ <= limit_ &&
46        static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
47      *cursor_++ = static_cast<char>(c);
48    } else {
49      AddCharSlow(c);
50    }
51  }
52
53  void Reset() {
54    if (data_ == NULL) {
55      data_ = NewArray<char>(kInitialCapacity);
56      limit_ = ComputeLimit(data_, kInitialCapacity);
57    }
58    cursor_ = data_;
59  }
60
61  int pos() const {
62    ASSERT_NOT_NULL(data_);
63    return static_cast<int>(cursor_ - data_);
64  }
65
66  char* data() const { return data_; }
67
68 private:
69  static const int kInitialCapacity = 256;
70  char* data_;
71  char* cursor_;
72  char* limit_;
73
74  int Capacity() const {
75    ASSERT_NOT_NULL(data_);
76    return static_cast<int>(limit_ - data_) + unibrow::Utf8::kMaxEncodedSize;
77  }
78
79  static char* ComputeLimit(char* data, int capacity) {
80    return (data + capacity) - unibrow::Utf8::kMaxEncodedSize;
81  }
82
83  void AddCharSlow(uc32 c);
84};
85
86
87// Interface through which the scanner reads characters from the input source.
88class UTF16Buffer {
89 public:
90  UTF16Buffer();
91  virtual ~UTF16Buffer() {}
92
93  virtual void PushBack(uc32 ch) = 0;
94  // Returns a value < 0 when the buffer end is reached.
95  virtual uc32 Advance() = 0;
96  virtual void SeekForward(int pos) = 0;
97
98  int pos() const { return pos_; }
99
100 protected:
101  int pos_;  // Current position in the buffer.
102  int end_;  // Position where scanning should stop (EOF).
103};
104
105
106// UTF16 buffer to read characters from a character stream.
107class CharacterStreamUTF16Buffer: public UTF16Buffer {
108 public:
109  CharacterStreamUTF16Buffer();
110  virtual ~CharacterStreamUTF16Buffer() {}
111  void Initialize(Handle<String> data,
112                  unibrow::CharacterStream* stream,
113                  int start_position,
114                  int end_position);
115  virtual void PushBack(uc32 ch);
116  virtual uc32 Advance();
117  virtual void SeekForward(int pos);
118
119 private:
120  List<uc32> pushback_buffer_;
121  uc32 last_;
122  unibrow::CharacterStream* stream_;
123
124  List<uc32>* pushback_buffer() { return &pushback_buffer_; }
125};
126
127
128// UTF16 buffer to read characters from an external string.
129template <typename StringType, typename CharType>
130class ExternalStringUTF16Buffer: public UTF16Buffer {
131 public:
132  ExternalStringUTF16Buffer();
133  virtual ~ExternalStringUTF16Buffer() {}
134  void Initialize(Handle<StringType> data,
135                  int start_position,
136                  int end_position);
137  virtual void PushBack(uc32 ch);
138  virtual uc32 Advance();
139  virtual void SeekForward(int pos);
140
141 private:
142  const CharType* raw_data_;  // Pointer to the actual array of characters.
143};
144
145
146class KeywordMatcher {
147//  Incrementally recognize keywords.
148//
149//  Recognized keywords:
150//      break case catch const* continue debugger* default delete do else
151//      finally false for function if in instanceof native* new null
152//      return switch this throw true try typeof var void while with
153//
154//  *: Actually "future reserved keywords". These are the only ones we
155//     recognized, the remaining are allowed as identifiers.
156 public:
157  KeywordMatcher() : state_(INITIAL), token_(Token::IDENTIFIER) {}
158
159  Token::Value token() { return token_; }
160
161  inline void AddChar(uc32 input) {
162    if (state_ != UNMATCHABLE) {
163      Step(input);
164    }
165  }
166
167  void Fail() {
168    token_ = Token::IDENTIFIER;
169    state_ = UNMATCHABLE;
170  }
171
172 private:
173  enum State {
174    UNMATCHABLE,
175    INITIAL,
176    KEYWORD_PREFIX,
177    KEYWORD_MATCHED,
178    C,
179    CA,
180    CO,
181    CON,
182    D,
183    DE,
184    F,
185    I,
186    IN,
187    N,
188    T,
189    TH,
190    TR,
191    V,
192    W
193  };
194
195  struct FirstState {
196    const char* keyword;
197    State state;
198    Token::Value token;
199  };
200
201  // Range of possible first characters of a keyword.
202  static const unsigned int kFirstCharRangeMin = 'b';
203  static const unsigned int kFirstCharRangeMax = 'w';
204  static const unsigned int kFirstCharRangeLength =
205      kFirstCharRangeMax - kFirstCharRangeMin + 1;
206  // State map for first keyword character range.
207  static FirstState first_states_[kFirstCharRangeLength];
208
209  // Current state.
210  State state_;
211  // Token for currently added characters.
212  Token::Value token_;
213
214  // Matching a specific keyword string (there is only one possible valid
215  // keyword with the current prefix).
216  const char* keyword_;
217  int counter_;
218  Token::Value keyword_token_;
219
220  // If input equals keyword's character at position, continue matching keyword
221  // from that position.
222  inline bool MatchKeywordStart(uc32 input,
223                                const char* keyword,
224                                int position,
225                                Token::Value token_if_match) {
226    if (input == keyword[position]) {
227      state_ = KEYWORD_PREFIX;
228      this->keyword_ = keyword;
229      this->counter_ = position + 1;
230      this->keyword_token_ = token_if_match;
231      return true;
232    }
233    return false;
234  }
235
236  // If input equals match character, transition to new state and return true.
237  inline bool MatchState(uc32 input, char match, State new_state) {
238    if (input == match) {
239      state_ = new_state;
240      return true;
241    }
242    return false;
243  }
244
245  inline bool MatchKeyword(uc32 input,
246                           char match,
247                           State new_state,
248                           Token::Value keyword_token) {
249    if (input == match) {  // Matched "do".
250      state_ = new_state;
251      token_ = keyword_token;
252      return true;
253    }
254    return false;
255  }
256
257  void Step(uc32 input);
258};
259
260
// Mode handed to the Scanner constructor.
enum ParserMode {
  PARSE,
  PREPARSE
};
// Language of the source text handed to Scanner::Initialize().
enum ParserLanguage {
  JAVASCRIPT,
  JSON
};
263
264
265class Scanner {
266 public:
267  typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
268
269  // Construction
270  explicit Scanner(ParserMode parse_mode);
271
272  // Initialize the Scanner to scan source.
273  void Initialize(Handle<String> source,
274                  ParserLanguage language);
275  void Initialize(Handle<String> source,
276                  unibrow::CharacterStream* stream,
277                  ParserLanguage language);
278  void Initialize(Handle<String> source,
279                  int start_position, int end_position,
280                  ParserLanguage language);
281
282  // Returns the next token.
283  Token::Value Next();
284
285  // One token look-ahead (past the token returned by Next()).
286  Token::Value peek() const  { return next_.token; }
287
288  // Returns true if there was a line terminator before the peek'ed token.
289  bool has_line_terminator_before_next() const {
290    return has_line_terminator_before_next_;
291  }
292
293  struct Location {
294    Location(int b, int e) : beg_pos(b), end_pos(e) { }
295    Location() : beg_pos(0), end_pos(0) { }
296    int beg_pos;
297    int end_pos;
298  };
299
300  // Returns the location information for the current token
301  // (the token returned by Next()).
302  Location location() const  { return current_.location; }
303  Location peek_location() const  { return next_.location; }
304
305  // Returns the literal string, if any, for the current token (the
306  // token returned by Next()). The string is 0-terminated and in
307  // UTF-8 format; they may contain 0-characters. Literal strings are
308  // collected for identifiers, strings, and numbers.
309  // These functions only give the correct result if the literal
310  // was scanned between calls to StartLiteral() and TerminateLiteral().
311  const char* literal_string() const {
312    return current_.literal_buffer->data();
313  }
314  int literal_length() const {
315    // Excluding terminal '\0' added by TerminateLiteral().
316    return current_.literal_buffer->pos() - 1;
317  }
318
319  // Returns the literal string for the next token (the token that
320  // would be returned if Next() were called).
321  const char* next_literal_string() const {
322    return next_.literal_buffer->data();
323  }
324  // Returns the length of the next token (that would be returned if
325  // Next() were called).
326  int next_literal_length() const {
327    return next_.literal_buffer->pos() - 1;
328  }
329
330  Vector<const char> next_literal() const {
331    return Vector<const char>(next_literal_string(),
332                              next_literal_length());
333  }
334
335  // Scans the input as a regular expression pattern, previous
336  // character(s) must be /(=). Returns true if a pattern is scanned.
337  bool ScanRegExpPattern(bool seen_equal);
338  // Returns true if regexp flags are scanned (always since flags can
339  // be empty).
340  bool ScanRegExpFlags();
341
342  // Seek forward to the given position.  This operation does not
343  // work in general, for instance when there are pushed back
344  // characters, but works for seeking forward until simple delimiter
345  // tokens, which is what it is used for.
346  void SeekForward(int pos);
347
348  bool stack_overflow() { return stack_overflow_; }
349
350  static StaticResource<Utf8Decoder>* utf8_decoder() { return &utf8_decoder_; }
351
352  // Tells whether the buffer contains an identifier (no escapes).
353  // Used for checking if a property name is an identifier.
354  static bool IsIdentifier(unibrow::CharacterStream* buffer);
355
356  static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
357  static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
358  static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
359  static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
360
361  static const int kCharacterLookaheadBufferSize = 1;
362  static const int kNoEndPosition = 1;
363
364 private:
365  void Init(Handle<String> source,
366            unibrow::CharacterStream* stream,
367            int start_position, int end_position,
368            ParserLanguage language);
369
370
371  // Different UTF16 buffers used to pull characters from. Based on input one of
372  // these will be initialized as the actual data source.
373  CharacterStreamUTF16Buffer char_stream_buffer_;
374  ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t>
375      two_byte_string_buffer_;
376  ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_;
377
378  // Source. Will point to one of the buffers declared above.
379  UTF16Buffer* source_;
380
381  // Used to convert the source string into a character stream when a stream
382  // is not passed to the scanner.
383  SafeStringInputBuffer safe_string_input_buffer_;
384
385  // Buffer to hold literal values (identifiers, strings, numbers)
386  // using 0-terminated UTF-8 encoding.
387  UTF8Buffer literal_buffer_1_;
388  UTF8Buffer literal_buffer_2_;
389
390  bool stack_overflow_;
391  static StaticResource<Utf8Decoder> utf8_decoder_;
392
393  // One Unicode character look-ahead; c0_ < 0 at the end of the input.
394  uc32 c0_;
395
396  // The current and look-ahead token.
397  struct TokenDesc {
398    Token::Value token;
399    Location location;
400    UTF8Buffer* literal_buffer;
401  };
402
403  TokenDesc current_;  // desc for current token (as returned by Next())
404  TokenDesc next_;     // desc for next token (one token look-ahead)
405  bool has_line_terminator_before_next_;
406  bool is_pre_parsing_;
407  bool is_parsing_json_;
408
409  // Literal buffer support
410  void StartLiteral();
411  void AddChar(uc32 ch);
412  void AddCharAdvance();
413  void TerminateLiteral();
414
415  // Low-level scanning support.
416  void Advance() { c0_ = source_->Advance(); }
417  void PushBack(uc32 ch) {
418    source_->PushBack(ch);
419    c0_ = ch;
420  }
421
422  bool SkipWhiteSpace() {
423    if (is_parsing_json_) {
424      return SkipJsonWhiteSpace();
425    } else {
426      return SkipJavaScriptWhiteSpace();
427    }
428  }
429  bool SkipJavaScriptWhiteSpace();
430  bool SkipJsonWhiteSpace();
431  Token::Value SkipSingleLineComment();
432  Token::Value SkipMultiLineComment();
433
434  inline Token::Value Select(Token::Value tok);
435  inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_);
436
437  inline void Scan() {
438    if (is_parsing_json_) {
439      ScanJson();
440    } else {
441      ScanJavaScript();
442    }
443  }
444
445  // Scans a single JavaScript token.
446  void ScanJavaScript();
447
448  // Scan a single JSON token. The JSON lexical grammar is specified in the
449  // ECMAScript 5 standard, section 15.12.1.1.
450  // Recognizes all of the single-character tokens directly, or calls a function
451  // to scan a number, string or identifier literal.
452  // The only allowed whitespace characters between tokens are tab,
453  // carrige-return, newline and space.
454  void ScanJson();
455
456  // A JSON number (production JSONNumber) is a subset of the valid JavaScript
457  // decimal number literals.
458  // It includes an optional minus sign, must have at least one
459  // digit before and after a decimal point, may not have prefixed zeros (unless
460  // the integer part is zero), and may include an exponent part (e.g., "e-10").
461  // Hexadecimal and octal numbers are not allowed.
462  Token::Value ScanJsonNumber();
463  // A JSON string (production JSONString) is subset of valid JavaScript string
464  // literals. The string must only be double-quoted (not single-quoted), and
465  // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and
466  // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid.
467  Token::Value ScanJsonString();
468  // Used to recognizes one of the literals "true", "false", or "null". These
469  // are the only valid JSON identifiers (productions JSONBooleanLiteral,
470  // JSONNullLiteral).
471  Token::Value ScanJsonIdentifier(const char* text, Token::Value token);
472
473  void ScanDecimalDigits();
474  Token::Value ScanNumber(bool seen_period);
475  Token::Value ScanIdentifier();
476  uc32 ScanHexEscape(uc32 c, int length);
477  uc32 ScanOctalEscape(uc32 c, int length);
478  void ScanEscape();
479  Token::Value ScanString();
480
481  // Scans a possible HTML comment -- begins with '<!'.
482  Token::Value ScanHtmlComment();
483
484  // Return the current source position.
485  int source_pos() {
486    return source_->pos() - kCharacterLookaheadBufferSize;
487  }
488
489  // Decodes a unicode escape-sequence which is part of an identifier.
490  // If the escape sequence cannot be decoded the result is kBadRune.
491  uc32 ScanIdentifierUnicodeEscape();
492};
493
494} }  // namespace v8::internal
495
496#endif  // V8_SCANNER_H_
497