scanner.h revision 0d5e116f6aee03185f237311a943491bb079a768
// Copyright 2010 the V8 project authors. All rights reserved.
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
//       notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials provided
//       with the distribution.
//     * Neither the name of Google Inc. nor the names of its
//       contributors may be used to endorse or promote products derived
//       from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#ifndef V8_SCANNER_H_
29#define V8_SCANNER_H_
30
31#include "token.h"
32#include "char-predicates-inl.h"
33
34namespace v8 {
35namespace internal {
36
37
38class UTF8Buffer {
39 public:
40  UTF8Buffer();
41  ~UTF8Buffer();
42
43  inline void AddChar(uc32 c) {
44    if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
45      buffer_.Add(static_cast<char>(c));
46    } else {
47      AddCharSlow(c);
48    }
49  }
50
51  void StartLiteral() {
52    buffer_.StartSequence();
53  }
54
55  Vector<const char> EndLiteral() {
56    buffer_.Add(kEndMarker);
57    Vector<char> sequence = buffer_.EndSequence();
58    return Vector<const char>(sequence.start(), sequence.length());
59  }
60
61  void DropLiteral() {
62    buffer_.DropSequence();
63  }
64
65  void Reset() {
66    buffer_.Reset();
67  }
68
69  // The end marker added after a parsed literal.
70  // Using zero allows the usage of strlen and similar functions on
71  // identifiers and numbers (but not strings, since they may contain zero
72  // bytes).
73  // TODO(lrn): Use '\xff' as end marker, since it cannot occur inside
74  // an utf-8 string. This requires changes in all places that uses
75  // str-functions on the literals, but allows a single pointer to represent
76  // the literal, even if it contains embedded zeros.
77  static const char kEndMarker = '\x00';
78 private:
79  static const int kInitialCapacity = 256;
80  SequenceCollector<char, 4> buffer_;
81
82  void AddCharSlow(uc32 c);
83};
84
85
86// Interface through which the scanner reads characters from the input source.
87class UTF16Buffer {
88 public:
89  UTF16Buffer();
90  virtual ~UTF16Buffer() {}
91
92  virtual void PushBack(uc32 ch) = 0;
93  // Returns a value < 0 when the buffer end is reached.
94  virtual uc32 Advance() = 0;
95  virtual void SeekForward(int pos) = 0;
96
97  int pos() const { return pos_; }
98
99 protected:
100  int pos_;  // Current position in the buffer.
101  int end_;  // Position where scanning should stop (EOF).
102};
103
104
105// UTF16 buffer to read characters from a character stream.
106class CharacterStreamUTF16Buffer: public UTF16Buffer {
107 public:
108  CharacterStreamUTF16Buffer();
109  virtual ~CharacterStreamUTF16Buffer() {}
110  void Initialize(Handle<String> data,
111                  unibrow::CharacterStream* stream,
112                  int start_position,
113                  int end_position);
114  virtual void PushBack(uc32 ch);
115  virtual uc32 Advance();
116  virtual void SeekForward(int pos);
117
118 private:
119  List<uc32> pushback_buffer_;
120  uc32 last_;
121  unibrow::CharacterStream* stream_;
122
123  List<uc32>* pushback_buffer() { return &pushback_buffer_; }
124};
125
126
127// UTF16 buffer to read characters from an external string.
128template <typename StringType, typename CharType>
129class ExternalStringUTF16Buffer: public UTF16Buffer {
130 public:
131  ExternalStringUTF16Buffer();
132  virtual ~ExternalStringUTF16Buffer() {}
133  void Initialize(Handle<StringType> data,
134                  int start_position,
135                  int end_position);
136  virtual void PushBack(uc32 ch);
137  virtual uc32 Advance();
138  virtual void SeekForward(int pos);
139
140 private:
141  const CharType* raw_data_;  // Pointer to the actual array of characters.
142};
143
144
145class KeywordMatcher {
146//  Incrementally recognize keywords.
147//
148//  Recognized keywords:
149//      break case catch const* continue debugger* default delete do else
150//      finally false for function if in instanceof native* new null
151//      return switch this throw true try typeof var void while with
152//
153//  *: Actually "future reserved keywords". These are the only ones we
154//     recognized, the remaining are allowed as identifiers.
155 public:
156  KeywordMatcher()
157      : state_(INITIAL),
158        token_(Token::IDENTIFIER),
159        keyword_(NULL),
160        counter_(0),
161        keyword_token_(Token::ILLEGAL) {}
162
163  Token::Value token() { return token_; }
164
165  inline void AddChar(uc32 input) {
166    if (state_ != UNMATCHABLE) {
167      Step(input);
168    }
169  }
170
171  void Fail() {
172    token_ = Token::IDENTIFIER;
173    state_ = UNMATCHABLE;
174  }
175
176 private:
177  enum State {
178    UNMATCHABLE,
179    INITIAL,
180    KEYWORD_PREFIX,
181    KEYWORD_MATCHED,
182    C,
183    CA,
184    CO,
185    CON,
186    D,
187    DE,
188    F,
189    I,
190    IN,
191    N,
192    T,
193    TH,
194    TR,
195    V,
196    W
197  };
198
199  struct FirstState {
200    const char* keyword;
201    State state;
202    Token::Value token;
203  };
204
205  // Range of possible first characters of a keyword.
206  static const unsigned int kFirstCharRangeMin = 'b';
207  static const unsigned int kFirstCharRangeMax = 'w';
208  static const unsigned int kFirstCharRangeLength =
209      kFirstCharRangeMax - kFirstCharRangeMin + 1;
210  // State map for first keyword character range.
211  static FirstState first_states_[kFirstCharRangeLength];
212
213  // If input equals keyword's character at position, continue matching keyword
214  // from that position.
215  inline bool MatchKeywordStart(uc32 input,
216                                const char* keyword,
217                                int position,
218                                Token::Value token_if_match) {
219    if (input == keyword[position]) {
220      state_ = KEYWORD_PREFIX;
221      this->keyword_ = keyword;
222      this->counter_ = position + 1;
223      this->keyword_token_ = token_if_match;
224      return true;
225    }
226    return false;
227  }
228
229  // If input equals match character, transition to new state and return true.
230  inline bool MatchState(uc32 input, char match, State new_state) {
231    if (input == match) {
232      state_ = new_state;
233      return true;
234    }
235    return false;
236  }
237
238  inline bool MatchKeyword(uc32 input,
239                           char match,
240                           State new_state,
241                           Token::Value keyword_token) {
242    if (input != match) {
243      return false;
244    }
245    state_ = new_state;
246    token_ = keyword_token;
247    return true;
248  }
249
250  void Step(uc32 input);
251
252  // Current state.
253  State state_;
254  // Token for currently added characters.
255  Token::Value token_;
256
257  // Matching a specific keyword string (there is only one possible valid
258  // keyword with the current prefix).
259  const char* keyword_;
260  int counter_;
261  Token::Value keyword_token_;
262};
263
264
// Parsing mode selector.
// NOTE(review): not used in this header; presumably distinguishes a full
// parse from a pre-parse pass -- confirm against the parser.
enum ParserMode {
  PARSE,
  PREPARSE
};

// Source language handed to Scanner::Initialize().
enum ParserLanguage {
  JAVASCRIPT,
  JSON
};
267
268
269class Scanner {
270 public:
271  typedef unibrow::Utf8InputBuffer<1024> Utf8Decoder;
272
273  class LiteralScope {
274   public:
275    explicit LiteralScope(Scanner* self);
276    ~LiteralScope();
277    void Complete();
278
279   private:
280    Scanner* scanner_;
281    bool complete_;
282  };
283
284  Scanner();
285
286  // Initialize the Scanner to scan source.
287  void Initialize(Handle<String> source,
288                  ParserLanguage language);
289  void Initialize(Handle<String> source,
290                  unibrow::CharacterStream* stream,
291                  ParserLanguage language);
292  void Initialize(Handle<String> source,
293                  int start_position, int end_position,
294                  ParserLanguage language);
295
296  // Returns the next token.
297  Token::Value Next();
298
299  // One token look-ahead (past the token returned by Next()).
300  Token::Value peek() const { return next_.token; }
301
302  // Returns true if there was a line terminator before the peek'ed token.
303  bool has_line_terminator_before_next() const {
304    return has_line_terminator_before_next_;
305  }
306
307  struct Location {
308    Location(int b, int e) : beg_pos(b), end_pos(e) { }
309    Location() : beg_pos(0), end_pos(0) { }
310    int beg_pos;
311    int end_pos;
312  };
313
314  // Returns the location information for the current token
315  // (the token returned by Next()).
316  Location location() const { return current_.location; }
317  Location peek_location() const { return next_.location; }
318
319  // Returns the literal string, if any, for the current token (the
320  // token returned by Next()). The string is 0-terminated and in
321  // UTF-8 format; they may contain 0-characters. Literal strings are
322  // collected for identifiers, strings, and numbers.
323  // These functions only give the correct result if the literal
324  // was scanned between calls to StartLiteral() and TerminateLiteral().
325  const char* literal_string() const {
326    return current_.literal_chars.start();
327  }
328
329  int literal_length() const {
330    // Excluding terminal '\x00' added by TerminateLiteral().
331    return current_.literal_chars.length() - 1;
332  }
333
334  Vector<const char> literal() const {
335    return Vector<const char>(literal_string(), literal_length());
336  }
337
338  // Returns the literal string for the next token (the token that
339  // would be returned if Next() were called).
340  const char* next_literal_string() const {
341    return next_.literal_chars.start();
342  }
343
344
345  // Returns the length of the next token (that would be returned if
346  // Next() were called).
347  int next_literal_length() const {
348    // Excluding terminal '\x00' added by TerminateLiteral().
349    return next_.literal_chars.length() - 1;
350  }
351
352  Vector<const char> next_literal() const {
353    return Vector<const char>(next_literal_string(), next_literal_length());
354  }
355
356  // Scans the input as a regular expression pattern, previous
357  // character(s) must be /(=). Returns true if a pattern is scanned.
358  bool ScanRegExpPattern(bool seen_equal);
359  // Returns true if regexp flags are scanned (always since flags can
360  // be empty).
361  bool ScanRegExpFlags();
362
363  // Seek forward to the given position.  This operation does not
364  // work in general, for instance when there are pushed back
365  // characters, but works for seeking forward until simple delimiter
366  // tokens, which is what it is used for.
367  void SeekForward(int pos);
368
369  bool stack_overflow() { return stack_overflow_; }
370
371  static StaticResource<Utf8Decoder>* utf8_decoder() { return &utf8_decoder_; }
372
373  // Tells whether the buffer contains an identifier (no escapes).
374  // Used for checking if a property name is an identifier.
375  static bool IsIdentifier(unibrow::CharacterStream* buffer);
376
377  static unibrow::Predicate<IdentifierStart, 128> kIsIdentifierStart;
378  static unibrow::Predicate<IdentifierPart, 128> kIsIdentifierPart;
379  static unibrow::Predicate<unibrow::LineTerminator, 128> kIsLineTerminator;
380  static unibrow::Predicate<unibrow::WhiteSpace, 128> kIsWhiteSpace;
381
382  static const int kCharacterLookaheadBufferSize = 1;
383  static const int kNoEndPosition = 1;
384
385 private:
386  // The current and look-ahead token.
387  struct TokenDesc {
388    Token::Value token;
389    Location location;
390    Vector<const char> literal_chars;
391  };
392
393  void Init(Handle<String> source,
394            unibrow::CharacterStream* stream,
395            int start_position, int end_position,
396            ParserLanguage language);
397
398  // Literal buffer support
399  inline void StartLiteral();
400  inline void AddChar(uc32 ch);
401  inline void AddCharAdvance();
402  inline void TerminateLiteral();
403  // Stops scanning of a literal, e.g., due to an encountered error.
404  inline void DropLiteral();
405
406  // Low-level scanning support.
407  void Advance() { c0_ = source_->Advance(); }
408  void PushBack(uc32 ch) {
409    source_->PushBack(ch);
410    c0_ = ch;
411  }
412
413  bool SkipWhiteSpace() {
414    if (is_parsing_json_) {
415      return SkipJsonWhiteSpace();
416    } else {
417      return SkipJavaScriptWhiteSpace();
418    }
419  }
420
421  bool SkipJavaScriptWhiteSpace();
422  bool SkipJsonWhiteSpace();
423  Token::Value SkipSingleLineComment();
424  Token::Value SkipMultiLineComment();
425
426  inline Token::Value Select(Token::Value tok);
427  inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_);
428
429  inline void Scan() {
430    if (is_parsing_json_) {
431      ScanJson();
432    } else {
433      ScanJavaScript();
434    }
435  }
436
437  // Scans a single JavaScript token.
438  void ScanJavaScript();
439
440  // Scan a single JSON token. The JSON lexical grammar is specified in the
441  // ECMAScript 5 standard, section 15.12.1.1.
442  // Recognizes all of the single-character tokens directly, or calls a function
443  // to scan a number, string or identifier literal.
444  // The only allowed whitespace characters between tokens are tab,
445  // carrige-return, newline and space.
446  void ScanJson();
447
448  // A JSON number (production JSONNumber) is a subset of the valid JavaScript
449  // decimal number literals.
450  // It includes an optional minus sign, must have at least one
451  // digit before and after a decimal point, may not have prefixed zeros (unless
452  // the integer part is zero), and may include an exponent part (e.g., "e-10").
453  // Hexadecimal and octal numbers are not allowed.
454  Token::Value ScanJsonNumber();
455
456  // A JSON string (production JSONString) is subset of valid JavaScript string
457  // literals. The string must only be double-quoted (not single-quoted), and
458  // the only allowed backslash-escapes are ", /, \, b, f, n, r, t and
459  // four-digit hex escapes (uXXXX). Any other use of backslashes is invalid.
460  Token::Value ScanJsonString();
461
462  // Used to recognizes one of the literals "true", "false", or "null". These
463  // are the only valid JSON identifiers (productions JSONBooleanLiteral,
464  // JSONNullLiteral).
465  Token::Value ScanJsonIdentifier(const char* text, Token::Value token);
466
467  void ScanDecimalDigits();
468  Token::Value ScanNumber(bool seen_period);
469  Token::Value ScanIdentifier();
470  uc32 ScanHexEscape(uc32 c, int length);
471  uc32 ScanOctalEscape(uc32 c, int length);
472  void ScanEscape();
473  Token::Value ScanString();
474
475  // Scans a possible HTML comment -- begins with '<!'.
476  Token::Value ScanHtmlComment();
477
478  // Return the current source position.
479  int source_pos() {
480    return source_->pos() - kCharacterLookaheadBufferSize;
481  }
482
483  // Decodes a unicode escape-sequence which is part of an identifier.
484  // If the escape sequence cannot be decoded the result is kBadRune.
485  uc32 ScanIdentifierUnicodeEscape();
486
487  TokenDesc current_;  // desc for current token (as returned by Next())
488  TokenDesc next_;     // desc for next token (one token look-ahead)
489  bool has_line_terminator_before_next_;
490  bool is_parsing_json_;
491
492  // Different UTF16 buffers used to pull characters from. Based on input one of
493  // these will be initialized as the actual data source.
494  CharacterStreamUTF16Buffer char_stream_buffer_;
495  ExternalStringUTF16Buffer<ExternalTwoByteString, uint16_t>
496      two_byte_string_buffer_;
497  ExternalStringUTF16Buffer<ExternalAsciiString, char> ascii_string_buffer_;
498
499  // Source. Will point to one of the buffers declared above.
500  UTF16Buffer* source_;
501
502  // Used to convert the source string into a character stream when a stream
503  // is not passed to the scanner.
504  SafeStringInputBuffer safe_string_input_buffer_;
505
506  // Buffer to hold literal values (identifiers, strings, numbers)
507  // using '\x00'-terminated UTF-8 encoding. Handles allocation internally.
508  UTF8Buffer literal_buffer_;
509
510  bool stack_overflow_;
511  static StaticResource<Utf8Decoder> utf8_decoder_;
512
513  // One Unicode character look-ahead; c0_ < 0 at the end of the input.
514  uc32 c0_;
515};
516
517} }  // namespace v8::internal
518
519#endif  // V8_SCANNER_H_
520