scanner.h revision 80d68eab642096c1a48b6474d6ec33064b0ad1f5
// Copyright 2006-2008 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#ifndef V8_SCANNER_H_
29#define V8_SCANNER_H_
30
31#include "token.h"
32#include "char-predicates-inl.h"
33
34namespace v8 {
35namespace internal {
36
37
38class UTF8Buffer {
39 public:
40  UTF8Buffer();
41  ~UTF8Buffer();
42
43  inline void AddChar(uc32 c) {
44    if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
45      buffer_.Add(static_cast<char>(c));
46    } else {
47      AddCharSlow(c);
48    }
49  }
50
51  void StartLiteral() {
52    buffer_.StartSequence();
53  }
54
55  Vector<const char> EndLiteral() {
56    buffer_.Add(kEndMarker);
57    Vector<char> sequence = buffer_.EndSequence();
58    return Vector<const char>(sequence.start(), sequence.length());
59  }
60
61  void DropLiteral() {
62    buffer_.DropSequence();
63  }
64
65  void Reset() {
66    buffer_.Reset();
67  }
68
69  // The end marker added after a parsed literal.
70  // Using zero allows the usage of strlen and similar functions on
71  // identifiers and numbers (but not strings, since they may contain zero
72  // bytes).
73  // TODO(lrn): Use '\xff' as end marker, since it cannot occur inside
74  // an utf-8 string. This requires changes in all places that uses
75  // str-functions on the literals, but allows a single pointer to represent
76  // the literal, even if it contains embedded zeros.
77  static const char kEndMarker = '\x00';
78 private:
79  static const int kInitialCapacity = 256;
80  SequenceCollector<char, 4> buffer_;
81
82  void AddCharSlow(uc32 c);
83};
84
85
86// Interface through which the scanner reads characters from the input source.
87class UTF16Buffer {
88 public:
89  UTF16Buffer();
90  virtual ~UTF16Buffer() {}
91
92  virtual void PushBack(uc32 ch) = 0;
93  // Returns a value < 0 when the buffer end is reached.
94  virtual uc32 Advance() = 0;
95  virtual void SeekForward(int pos) = 0;
96
97  int pos() const { return pos_; }
98
99 protected:
100  int pos_;  // Current position in the buffer.
101  int end_;  // Position where scanning should stop (EOF).
102};
103
104
105// UTF16 buffer to read characters from a character stream.
106class CharacterStreamUTF16Buffer: public UTF16Buffer {
107 public:
108  CharacterStreamUTF16Buffer();
109  virtual ~CharacterStreamUTF16Buffer() {}
110  void Initialize(Handle<String> data,
111                  unibrow::CharacterStream* stream,
112                  int start_position,
113                  int end_position);
114  virtual void PushBack(uc32 ch);
115  virtual uc32 Advance();
116  virtual void SeekForward(int pos);
117
118 private:
119  List<uc32> pushback_buffer_;
120  uc32 last_;
121  unibrow::CharacterStream* stream_;
122
123  List<uc32>* pushback_buffer() { return &pushback_buffer_; }
124};
125
126
127// UTF16 buffer to read characters from an external string.
128template <typename StringType, typename CharType>
129class ExternalStringUTF16Buffer: public UTF16Buffer {
130 public:
131  ExternalStringUTF16Buffer();
132  virtual ~ExternalStringUTF16Buffer() {}
133  void Initialize(Handle<StringType> data,
134                  int start_position,
135                  int end_position);
136  virtual void PushBack(uc32 ch);
137  virtual uc32 Advance();
138  virtual void SeekForward(int pos);
139
140 private:
141  const CharType* raw_data_;  // Pointer to the actual array of characters.
142};
143
144
145class KeywordMatcher {
146//  Incrementally recognize keywords.
147//
148//  Recognized keywords:
149//      break case catch const* continue debugger* default delete do else
150//      finally false for function if in instanceof native* new null
151//      return switch this throw true try typeof var void while with
152//
153//  *: Actually "future reserved keywords". These are the only ones we
154//     recognized, the remaining are allowed as identifiers.
155 public:
156  KeywordMatcher()
157      : state_(INITIAL),
158        token_(Token::IDENTIFIER),
159        keyword_(NULL),
160        counter_(0),
161        keyword_token_(Token::ILLEGAL) {}
162
163  Token::Value token() { return token_; }
164
165  inline void AddChar(uc32 input) {
166    if (state_ != UNMATCHABLE) {
167      Step(input);
168    }
169  }
170
171  void Fail() {
172    token_ = Token::IDENTIFIER;
173    state_ = UNMATCHABLE;
174  }
175
176 private:
177  enum State {
178    UNMATCHABLE,
179    INITIAL,
180    KEYWORD_PREFIX,
181    KEYWORD_MATCHED,
182    C,
183    CA,
184    CO,
185    CON,
186    D,
187    DE,
188    F,
189    I,
190    IN,
191    N,
192    T,
193    TH,
194    TR,
195    V,
196    W
197  };
198
199  struct FirstState {
200    const char* keyword;
201    State state;
202    Token::Value token;
203  };
204
205  // Range of possible first characters of a keyword.
206  static const unsigned int kFirstCharRangeMin = 'b';
207  static const unsigned int kFirstCharRangeMax = 'w';
208  static const unsigned int kFirstCharRangeLength =
209      kFirstCharRangeMax - kFirstCharRangeMin + 1;
210  // State map for first keyword character range.
211  static FirstState first_states_[kFirstCharRangeLength];
212
213  // If input equals keyword's character at position, continue matching keyword
214  // from that position.
215  inline bool MatchKeywordStart(uc32 input,
216                                const char* keyword,
217                                int position,
218                                Token::Value token_if_match) {
219    if (input == keyword[position]) {
220      state_ = KEYWORD_PREFIX;
221      this->keyword_ = keyword;
222      this->counter_ = position + 1;
223      this->keyword_token_ = token_if_match;
224      return true;
225    }
226    return false;
227  }
228
229  // If input equals match character, transition to new state and return true.
230  inline bool MatchState(uc32 input, char match, State new_state) {
231    if (input == match) {
232      state_ = new_state;
233      return true;
234    }
235    return false;
236  }
237
238  inline bool MatchKeyword(uc32 input,
239                           char match,
240                           State new_state,
241                           Token::Value keyword_token) {
242    if (input != match) {
243      return false;
244    }
245    state_ = new_state;
246    token_ = keyword_token;
247    return true;
248  }
249
250  void Step(uc32 input);
251
252  // Current state.
253  State state_;
254  // Token for currently added characters.
255  Token::Value token_;
256
257  // Matching a specific keyword string (there is only one possible valid
258  // keyword with the current prefix).
259  const char* keyword_;
260  int counter_;
261  Token::Value keyword_token_;
262};
263
264
// Mode handed to the Scanner constructor.
enum ParserMode { PARSE, PREPARSE };
// Language handed to Scanner::Initialize(); selects between the JavaScript
// and the JSON scanning routines.
enum ParserLanguage { JAVASCRIPT, JSON };
267
268
269class Scanner {
270 public:
271  typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
272
273  class LiteralScope {
274   public:
275    explicit LiteralScope(Scanner* self);
276    ~LiteralScope();
277    void Complete();
278
279   private:
280    Scanner* scanner_;
281    bool complete_;
282  };
283
284  // Construction
285  explicit Scanner(ParserMode parse_mode);
286
287  // Initialize the Scanner to scan source.
288  void Initialize(Handle<String> source,
289                  ParserLanguage language);
290  void Initialize(Handle<String> source,
291                  unibrow::CharacterStream* stream,
292                  ParserLanguage language);
293  void Initialize(Handle<String> source,
294                  int start_position, int end_position,
295                  ParserLanguage language);
296
297  // Returns the next token.
298  Token::Value Next();
299
300  // One token look-ahead (past the token returned by Next()).
301  Token::Value peek() const  { return next_.token; }
302
303  // Returns true if there was a line terminator before the peek'ed token.
304  bool has_line_terminator_before_next() const {
305    return has_line_terminator_before_next_;
306  }
307
308  struct Location {
309    Location(int b, int e) : beg_pos(b), end_pos(e) { }
310    Location() : beg_pos(0), end_pos(0) { }
311    int beg_pos;
312    int end_pos;
313  };
314
315  // Returns the location information for the current token
316  // (the token returned by Next()).
317  Location location() const  { return current_.location; }
318  Location peek_location() const  { return next_.location; }
319
320  // Returns the literal string, if any, for the current token (the
321  // token returned by Next()). The string is 0-terminated and in
322  // UTF-8 format; they may contain 0-characters. Literal strings are
323  // collected for identifiers, strings, and numbers.
324  // These functions only give the correct result if the literal
325  // was scanned between calls to StartLiteral() and TerminateLiteral().
326  const char* literal_string() const {
327    return current_.literal_chars.start();
328  }
329
330  int literal_length() const {
331    // Excluding terminal '\x00' added by TerminateLiteral().
332    return current_.literal_chars.length() - 1;
333  }
334
335  Vector<const char> literal() const {
336    return Vector<const char>(literal_string(), literal_length());
337  }
338
339  // Returns the literal string for the next token (the token that
340  // would be returned if Next() were called).
341  const char* next_literal_string() const {
342    return next_.literal_chars.start();
343  }
344
345
346  // Returns the length of the next token (that would be returned if
347  // Next() were called).
348  int next_literal_length() const {
349    // Excluding terminal '\x00' added by TerminateLiteral().
350    return next_.literal_chars.length() - 1;
351  }
352
353  Vector<const char> next_literal() const {
354    return Vector<const char>(next_literal_string(), next_literal_length());
355  }
356
357  // Scans the input as a regular expression pattern, previous
358  // character(s) must be /(=). Returns true if a pattern is scanned.
359  bool ScanRegExpPattern(bool seen_equal);
360  // Returns true if regexp flags are scanned (always since flags can
361  // be empty).
362  bool ScanRegExpFlags();
363
364  // Seek forward to the given position.  This operation does not
365  // work in general, for instance when there are pushed back
366  // characters, but works for seeking forward until simple delimiter
367  // tokens, which is what it is used for.
368  void SeekForward(int pos);
369
370  bool stack_overflow() { return stack_overflow_; }
371
372  static StaticResource<Utf8Decoder>* utf8_decoder() { return &utf8_decoder_; }
373
374  // Tells whether the buffer contains an identifier (no escapes).
375  // Used for checking if a property name is an identifier.
376  static bool IsIdentifier(unibrow::CharacterStream* buffer);
377
378  static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
379  static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
380  static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
381  static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
382
383  static const int kCharacterLookaheadBufferSize = 1;
384  static const int kNoEndPosition = 1;
385
386 private:
387  // The current and look-ahead token.
388  struct TokenDesc {
389    Token::Value token;
390    Location location;
391    Vector<const char> literal_chars;
392  };
393
394  void Init(Handle<String> source,
395            unibrow::CharacterStream* stream,
396            int start_position, int end_position,
397            ParserLanguage language);
398
399  // Literal buffer support
400  inline void StartLiteral();
401  inline void AddChar(uc32 ch);
402  inline void AddCharAdvance();
403  inline void TerminateLiteral();
404  // Stops scanning of a literal, e.g., due to an encountered error.
405  inline void DropLiteral();
406
407  // Low-level scanning support.
408  void Advance() { c0_ = source_->Advance(); }
409  void PushBack(uc32 ch) {
410    source_->PushBack(ch);
411    c0_ = ch;
412  }
413
414  bool SkipWhiteSpace() {
415    if (is_parsing_json_) {
416      return SkipJsonWhiteSpace();
417    } else {
418      return SkipJavaScriptWhiteSpace();
419    }
420  }
421
422  bool SkipJavaScriptWhiteSpace();
423  bool SkipJsonWhiteSpace();
424  Token::Value SkipSingleLineComment();
425  Token::Value SkipMultiLineComment();
426
427  inline Token::Value Select(Token::Value tok);
428  inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_);
429
430  inline void Scan() {
431    if (is_parsing_json_) {
432      ScanJson();
433    } else {
434      ScanJavaScript();
435    }
436  }
437
438  // Scans a single JavaScript token.
439  void ScanJavaScript();
440
441  // Scan a single JSON token. The JSON lexical grammar is specified in the
442  // ECMAScript 5 standard, section 15.12.1.1.
443  // Recognizes all of the single-character tokens directly, or calls a function
444  // to scan a number, string or identifier literal.
445  // The only allowed whitespace characters between tokens are tab,
446  // carrige-return, newline and space.
447  void ScanJson();
448
449  // A JSON number (production JSONNumber) is a subset of the valid JavaScript
450  // decimal number literals.
451  // It includes an optional minus sign, must have at least one
452  // digit before and after a decimal point, may not have prefixed zeros (unless
453  // the integer part is zero), and may include an exponent part (e.g., "e-10").
454  // Hexadecimal and octal numbers are not allowed.
455  Token::Value ScanJsonNumber();
456
457  // A JSON string (production JSONString) is subset of valid JavaScript string
458  // literals. The string must only be double-quoted (not single-quoted), and
459  // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and
460  // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid.
461  Token::Value ScanJsonString();
462
463  // Used to recognizes one of the literals "true", "false", or "null". These
464  // are the only valid JSON identifiers (productions JSONBooleanLiteral,
465  // JSONNullLiteral).
466  Token::Value ScanJsonIdentifier(const char* text, Token::Value token);
467
468  void ScanDecimalDigits();
469  Token::Value ScanNumber(bool seen_period);
470  Token::Value ScanIdentifier();
471  uc32 ScanHexEscape(uc32 c, int length);
472  uc32 ScanOctalEscape(uc32 c, int length);
473  void ScanEscape();
474  Token::Value ScanString();
475
476  // Scans a possible HTML comment -- begins with '<!'.
477  Token::Value ScanHtmlComment();
478
479  // Return the current source position.
480  int source_pos() {
481    return source_->pos() - kCharacterLookaheadBufferSize;
482  }
483
484  // Decodes a unicode escape-sequence which is part of an identifier.
485  // If the escape sequence cannot be decoded the result is kBadRune.
486  uc32 ScanIdentifierUnicodeEscape();
487
488  TokenDesc current_;  // desc for current token (as returned by Next())
489  TokenDesc next_;     // desc for next token (one token look-ahead)
490  bool has_line_terminator_before_next_;
491  bool is_pre_parsing_;
492  bool is_parsing_json_;
493
494  // Different UTF16 buffers used to pull characters from. Based on input one of
495  // these will be initialized as the actual data source.
496  CharacterStreamUTF16Buffer char_stream_buffer_;
497  ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t>
498      two_byte_string_buffer_;
499  ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_;
500
501  // Source. Will point to one of the buffers declared above.
502  UTF16Buffer* source_;
503
504  // Used to convert the source string into a character stream when a stream
505  // is not passed to the scanner.
506  SafeStringInputBuffer safe_string_input_buffer_;
507
508  // Buffer to hold literal values (identifiers, strings, numbers)
509  // using '\x00'-terminated UTF-8 encoding. Handles allocation internally.
510  UTF8Buffer literal_buffer_;
511
512  bool stack_overflow_;
513  static StaticResource<Utf8Decoder> utf8_decoder_;
514
515  // One Unicode character look-ahead; c0_ < 0 at the end of the input.
516  uc32 c0_;
517};
518
519} }  // namespace v8::internal
520
521#endif  // V8_SCANNER_H_
522