1// Copyright 2011 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Features shared by parsing and pre-parsing scanners.
6
7#ifndef V8_PARSING_SCANNER_H_
8#define V8_PARSING_SCANNER_H_
9
10#include "src/allocation.h"
11#include "src/base/logging.h"
12#include "src/char-predicates.h"
13#include "src/globals.h"
14#include "src/messages.h"
15#include "src/parsing/token.h"
16#include "src/unicode-decoder.h"
17#include "src/unicode.h"
18
19namespace v8 {
20namespace internal {
21
22
23class AstRawString;
24class AstValueFactory;
25class DuplicateFinder;
26class ExternalOneByteString;
27class ExternalTwoByteString;
28class ParserRecorder;
29class UnicodeCache;
30
31// ---------------------------------------------------------------------
// Buffered stream of UTF-16 code units, using an internal UTF-16 buffer.
// A code unit is a 16-bit value representing either a 16-bit code point
// or one half of a surrogate pair that makes up a single 21-bit code point.
35class Utf16CharacterStream {
36 public:
37  static const uc32 kEndOfInput = -1;
38
39  virtual ~Utf16CharacterStream() { }
40
41  // Returns and advances past the next UTF-16 code unit in the input
42  // stream. If there are no more code units it returns kEndOfInput.
43  inline uc32 Advance() {
44    if (V8_LIKELY(buffer_cursor_ < buffer_end_)) {
45      return static_cast<uc32>(*(buffer_cursor_++));
46    } else if (ReadBlock()) {
47      return static_cast<uc32>(*(buffer_cursor_++));
48    } else {
49      // Note: currently the following increment is necessary to avoid a
50      // parser problem! The scanner treats the final kEndOfInput as
51      // a code unit with a position, and does math relative to that
52      // position.
53      buffer_cursor_++;
54      return kEndOfInput;
55    }
56  }
57
58  // Go back one by one character in the input stream.
59  // This undoes the most recent Advance().
60  inline void Back() {
61    // The common case - if the previous character is within
62    // buffer_start_ .. buffer_end_ will be handles locally.
63    // Otherwise, a new block is requested.
64    if (V8_LIKELY(buffer_cursor_ > buffer_start_)) {
65      buffer_cursor_--;
66    } else {
67      ReadBlockAt(pos() - 1);
68    }
69  }
70
71  // Go back one by two characters in the input stream. (This is the same as
72  // calling Back() twice. But Back() may - in some instances - do substantial
73  // work. Back2() guarantees this work will be done only once.)
74  inline void Back2() {
75    if (V8_LIKELY(buffer_cursor_ - 2 >= buffer_start_)) {
76      buffer_cursor_ -= 2;
77    } else {
78      ReadBlockAt(pos() - 2);
79    }
80  }
81
82  inline size_t pos() const {
83    return buffer_pos_ + (buffer_cursor_ - buffer_start_);
84  }
85
86  inline void Seek(size_t pos) {
87    if (V8_LIKELY(pos >= buffer_pos_ &&
88                  pos < (buffer_pos_ + (buffer_end_ - buffer_start_)))) {
89      buffer_cursor_ = buffer_start_ + (pos - buffer_pos_);
90    } else {
91      ReadBlockAt(pos);
92    }
93  }
94
95 protected:
96  Utf16CharacterStream(const uint16_t* buffer_start,
97                       const uint16_t* buffer_cursor,
98                       const uint16_t* buffer_end, size_t buffer_pos)
99      : buffer_start_(buffer_start),
100        buffer_cursor_(buffer_cursor),
101        buffer_end_(buffer_end),
102        buffer_pos_(buffer_pos) {}
103  Utf16CharacterStream() : Utf16CharacterStream(nullptr, nullptr, nullptr, 0) {}
104
105  void ReadBlockAt(size_t new_pos) {
106    // The callers of this method (Back/Back2/Seek) should handle the easy
107    // case (seeking within the current buffer), and we should only get here
108    // if we actually require new data.
109    // (This is really an efficiency check, not a correctness invariant.)
110    DCHECK(new_pos < buffer_pos_ ||
111           new_pos >= buffer_pos_ + (buffer_end_ - buffer_start_));
112
113    // Change pos() to point to new_pos.
114    buffer_pos_ = new_pos;
115    buffer_cursor_ = buffer_start_;
116    bool success = ReadBlock();
117    USE(success);
118
119    // Post-conditions: 1, on success, we should be at the right position.
120    //                  2, success == we should have more characters available.
121    DCHECK_IMPLIES(success, pos() == new_pos);
122    DCHECK_EQ(success, buffer_cursor_ < buffer_end_);
123    DCHECK_EQ(success, buffer_start_ < buffer_end_);
124  }
125
126  // Read more data, and update buffer_*_ to point to it.
127  // Returns true if more data was available.
128  //
129  // ReadBlock() may modify any of the buffer_*_ members, but must sure that
130  // the result of pos() remains unaffected.
131  //
132  // Examples:
133  // - a stream could either fill a separate buffer. Then buffer_start_ and
134  //   buffer_cursor_ would point to the beginning of the buffer, and
135  //   buffer_pos would be the old pos().
136  // - a stream with existing buffer chunks would set buffer_start_ and
137  //   buffer_end_ to cover the full chunk, and then buffer_cursor_ would
138  //   point into the middle of the buffer, while buffer_pos_ would describe
139  //   the start of the buffer.
140  virtual bool ReadBlock() = 0;
141
142  const uint16_t* buffer_start_;
143  const uint16_t* buffer_cursor_;
144  const uint16_t* buffer_end_;
145  size_t buffer_pos_;
146};
147
148
149// ----------------------------------------------------------------------------
150// JavaScript Scanner.
151
152class Scanner {
153 public:
154  // Scoped helper for a re-settable bookmark.
155  class BookmarkScope {
156   public:
157    explicit BookmarkScope(Scanner* scanner)
158        : scanner_(scanner), bookmark_(kNoBookmark) {
159      DCHECK_NOT_NULL(scanner_);
160    }
161    ~BookmarkScope() {}
162
163    void Set();
164    void Apply();
165    bool HasBeenSet();
166    bool HasBeenApplied();
167
168   private:
169    static const size_t kNoBookmark;
170    static const size_t kBookmarkWasApplied;
171    static const size_t kBookmarkAtFirstPos;
172
173    Scanner* scanner_;
174    size_t bookmark_;
175
176    DISALLOW_COPY_AND_ASSIGN(BookmarkScope);
177  };
178
179  // Representation of an interval of source positions.
180  struct Location {
181    Location(int b, int e) : beg_pos(b), end_pos(e) { }
182    Location() : beg_pos(0), end_pos(0) { }
183
184    bool IsValid() const {
185      return beg_pos >= 0 && end_pos >= beg_pos;
186    }
187
188    static Location invalid() { return Location(-1, -1); }
189
190    int beg_pos;
191    int end_pos;
192  };
193
194  // -1 is outside of the range of any real source code.
195  static const int kNoOctalLocation = -1;
196  static const uc32 kEndOfInput = Utf16CharacterStream::kEndOfInput;
197
198  explicit Scanner(UnicodeCache* scanner_contants);
199
200  void Initialize(Utf16CharacterStream* source);
201
202  // Returns the next token and advances input.
203  Token::Value Next();
204  // Returns the token following peek()
205  Token::Value PeekAhead();
206  // Returns the current token again.
207  Token::Value current_token() { return current_.token; }
208  // Returns the location information for the current token
209  // (the token last returned by Next()).
210  Location location() const { return current_.location; }
211
212  // This error is specifically an invalid hex or unicode escape sequence.
213  bool has_error() const { return scanner_error_ != MessageTemplate::kNone; }
214  MessageTemplate::Template error() const { return scanner_error_; }
215  Location error_location() const { return scanner_error_location_; }
216
217  bool has_invalid_template_escape() const {
218    return invalid_template_escape_message_ != MessageTemplate::kNone;
219  }
220  MessageTemplate::Template invalid_template_escape_message() const {
221    return invalid_template_escape_message_;
222  }
223  Location invalid_template_escape_location() const {
224    return invalid_template_escape_location_;
225  }
226
227  void clear_invalid_template_escape() {
228    DCHECK(has_invalid_template_escape());
229    invalid_template_escape_message_ = MessageTemplate::kNone;
230    invalid_template_escape_location_ = Location::invalid();
231  }
232
233  // Similar functions for the upcoming token.
234
235  // One token look-ahead (past the token returned by Next()).
236  Token::Value peek() const { return next_.token; }
237
238  Location peek_location() const { return next_.location; }
239
240  bool literal_contains_escapes() const {
241    return LiteralContainsEscapes(current_);
242  }
243  bool is_literal_contextual_keyword(Vector<const char> keyword) {
244    DCHECK(current_.token == Token::IDENTIFIER ||
245           current_.token == Token::ESCAPED_STRICT_RESERVED_WORD);
246    DCHECK_NOT_NULL(current_.literal_chars);
247    return current_.literal_chars->is_contextual_keyword(keyword);
248  }
249  bool is_next_contextual_keyword(Vector<const char> keyword) {
250    DCHECK_NOT_NULL(next_.literal_chars);
251    return next_.literal_chars->is_contextual_keyword(keyword);
252  }
253
254  const AstRawString* CurrentSymbol(AstValueFactory* ast_value_factory);
255  const AstRawString* NextSymbol(AstValueFactory* ast_value_factory);
256  const AstRawString* CurrentRawSymbol(AstValueFactory* ast_value_factory);
257
258  double DoubleValue();
259  bool ContainsDot();
260  bool LiteralMatches(const char* data, int length, bool allow_escapes = true) {
261    if (!current_.literal_chars) {
262      return !strncmp(Token::Name(current_.token), data, length);
263    } else if (is_literal_one_byte() && literal_length() == length &&
264               (allow_escapes || !literal_contains_escapes())) {
265      const char* token =
266          reinterpret_cast<const char*>(literal_one_byte_string().start());
267      return !strncmp(token, data, length);
268    }
269    return false;
270  }
271  inline bool UnescapedLiteralMatches(const char* data, int length) {
272    return LiteralMatches(data, length, false);
273  }
274
275  bool IsGetOrSet(bool* is_get, bool* is_set) {
276    if (is_literal_one_byte() &&
277        literal_length() == 3 &&
278        !literal_contains_escapes()) {
279      const char* token =
280          reinterpret_cast<const char*>(literal_one_byte_string().start());
281      *is_get = strncmp(token, "get", 3) == 0;
282      *is_set = !*is_get && strncmp(token, "set", 3) == 0;
283      return *is_get || *is_set;
284    }
285    return false;
286  }
287
288  bool FindSymbol(DuplicateFinder* finder);
289
290  UnicodeCache* unicode_cache() { return unicode_cache_; }
291
292  // Returns the location of the last seen octal literal.
293  Location octal_position() const { return octal_pos_; }
294  void clear_octal_position() {
295    octal_pos_ = Location::invalid();
296    octal_message_ = MessageTemplate::kNone;
297  }
298  MessageTemplate::Template octal_message() const { return octal_message_; }
299
300  // Returns the value of the last smi that was scanned.
301  uint32_t smi_value() const { return current_.smi_value_; }
302
303  // Seek forward to the given position.  This operation does not
304  // work in general, for instance when there are pushed back
305  // characters, but works for seeking forward until simple delimiter
306  // tokens, which is what it is used for.
307  void SeekForward(int pos);
308
309  // Returns true if there was a line terminator before the peek'ed token,
310  // possibly inside a multi-line comment.
311  bool HasAnyLineTerminatorBeforeNext() const {
312    return has_line_terminator_before_next_ ||
313           has_multiline_comment_before_next_;
314  }
315
316  bool HasAnyLineTerminatorAfterNext() {
317    Token::Value ensure_next_next = PeekAhead();
318    USE(ensure_next_next);
319    return has_line_terminator_after_next_;
320  }
321
322  // Scans the input as a regular expression pattern, next token must be /(=).
323  // Returns true if a pattern is scanned.
324  bool ScanRegExpPattern();
325  // Scans the input as regular expression flags. Returns the flags on success.
326  Maybe<RegExp::Flags> ScanRegExpFlags();
327
328  // Scans the input as a template literal
329  Token::Value ScanTemplateStart();
330  Token::Value ScanTemplateContinuation();
331
332  Handle<String> SourceUrl(Isolate* isolate) const {
333    Handle<String> tmp;
334    if (source_url_.length() > 0) tmp = source_url_.Internalize(isolate);
335    return tmp;
336  }
337
338  Handle<String> SourceMappingUrl(Isolate* isolate) const {
339    Handle<String> tmp;
340    if (source_mapping_url_.length() > 0)
341      tmp = source_mapping_url_.Internalize(isolate);
342    return tmp;
343  }
344
345  bool FoundHtmlComment() const { return found_html_comment_; }
346
347 private:
348  // Scoped helper for literal recording. Automatically drops the literal
349  // if aborting the scanning before it's complete.
350  class LiteralScope {
351   public:
352    explicit LiteralScope(Scanner* self) : scanner_(self), complete_(false) {
353      scanner_->StartLiteral();
354    }
355    ~LiteralScope() {
356      if (!complete_) scanner_->DropLiteral();
357    }
358    void Complete() { complete_ = true; }
359
360   private:
361    Scanner* scanner_;
362    bool complete_;
363  };
364
365  // LiteralBuffer -  Collector of chars of literals.
366  class LiteralBuffer {
367   public:
368    LiteralBuffer() : is_one_byte_(true), position_(0), backing_store_() {}
369
370    ~LiteralBuffer() { backing_store_.Dispose(); }
371
372    INLINE(void AddChar(char code_unit)) {
373      DCHECK(IsValidAscii(code_unit));
374      AddOneByteChar(static_cast<byte>(code_unit));
375    }
376
377    INLINE(void AddChar(uc32 code_unit)) {
378      if (is_one_byte_ &&
379          code_unit <= static_cast<uc32>(unibrow::Latin1::kMaxChar)) {
380        AddOneByteChar(static_cast<byte>(code_unit));
381      } else {
382        AddCharSlow(code_unit);
383      }
384    }
385
386    bool is_one_byte() const { return is_one_byte_; }
387
388    bool is_contextual_keyword(Vector<const char> keyword) const {
389      return is_one_byte() && keyword.length() == position_ &&
390             (memcmp(keyword.start(), backing_store_.start(), position_) == 0);
391    }
392
393    Vector<const uint16_t> two_byte_literal() const {
394      DCHECK(!is_one_byte_);
395      DCHECK((position_ & 0x1) == 0);
396      return Vector<const uint16_t>(
397          reinterpret_cast<const uint16_t*>(backing_store_.start()),
398          position_ >> 1);
399    }
400
401    Vector<const uint8_t> one_byte_literal() const {
402      DCHECK(is_one_byte_);
403      return Vector<const uint8_t>(
404          reinterpret_cast<const uint8_t*>(backing_store_.start()), position_);
405    }
406
407    int length() const { return is_one_byte_ ? position_ : (position_ >> 1); }
408
409    void ReduceLength(int delta) {
410      position_ -= delta * (is_one_byte_ ? kOneByteSize : kUC16Size);
411    }
412
413    void Reset() {
414      position_ = 0;
415      is_one_byte_ = true;
416    }
417
418    Handle<String> Internalize(Isolate* isolate) const;
419
420   private:
421    static const int kInitialCapacity = 16;
422    static const int kGrowthFactory = 4;
423    static const int kMinConversionSlack = 256;
424    static const int kMaxGrowth = 1 * MB;
425
426    inline bool IsValidAscii(char code_unit) {
427      // Control characters and printable characters span the range of
428      // valid ASCII characters (0-127). Chars are unsigned on some
429      // platforms which causes compiler warnings if the validity check
430      // tests the lower bound >= 0 as it's always true.
431      return iscntrl(code_unit) || isprint(code_unit);
432    }
433
434    INLINE(void AddOneByteChar(byte one_byte_char)) {
435      DCHECK(is_one_byte_);
436      if (position_ >= backing_store_.length()) ExpandBuffer();
437      backing_store_[position_] = one_byte_char;
438      position_ += kOneByteSize;
439    }
440
441    void AddCharSlow(uc32 code_unit);
442    int NewCapacity(int min_capacity);
443    void ExpandBuffer();
444    void ConvertToTwoByte();
445
446    bool is_one_byte_;
447    int position_;
448    Vector<byte> backing_store_;
449
450    DISALLOW_COPY_AND_ASSIGN(LiteralBuffer);
451  };
452
453  // The current and look-ahead token.
454  struct TokenDesc {
455    Location location;
456    LiteralBuffer* literal_chars;
457    LiteralBuffer* raw_literal_chars;
458    uint32_t smi_value_;
459    Token::Value token;
460  };
461
462  static const int kCharacterLookaheadBufferSize = 1;
463  const int kMaxAscii = 127;
464
465  // Scans octal escape sequence. Also accepts "\0" decimal escape sequence.
466  template <bool capture_raw>
467  uc32 ScanOctalEscape(uc32 c, int length);
468
469  // Call this after setting source_ to the input.
470  void Init() {
471    // Set c0_ (one character ahead)
472    STATIC_ASSERT(kCharacterLookaheadBufferSize == 1);
473    Advance();
474    // Initialize current_ to not refer to a literal.
475    current_.token = Token::UNINITIALIZED;
476    current_.literal_chars = NULL;
477    current_.raw_literal_chars = NULL;
478    next_.token = Token::UNINITIALIZED;
479    next_.literal_chars = NULL;
480    next_.raw_literal_chars = NULL;
481    next_next_.token = Token::UNINITIALIZED;
482    next_next_.literal_chars = NULL;
483    next_next_.raw_literal_chars = NULL;
484    found_html_comment_ = false;
485    scanner_error_ = MessageTemplate::kNone;
486    invalid_template_escape_message_ = MessageTemplate::kNone;
487  }
488
489  void ReportScannerError(const Location& location,
490                          MessageTemplate::Template error) {
491    if (has_error()) return;
492    scanner_error_ = error;
493    scanner_error_location_ = location;
494  }
495
496  void ReportScannerError(int pos, MessageTemplate::Template error) {
497    if (has_error()) return;
498    scanner_error_ = error;
499    scanner_error_location_ = Location(pos, pos + 1);
500  }
501
502  // Seek to the next_ token at the given position.
503  void SeekNext(size_t position);
504
505  // Literal buffer support
506  inline void StartLiteral() {
507    LiteralBuffer* free_buffer =
508        (current_.literal_chars == &literal_buffer0_)
509            ? &literal_buffer1_
510            : (current_.literal_chars == &literal_buffer1_) ? &literal_buffer2_
511                                                            : &literal_buffer0_;
512    free_buffer->Reset();
513    next_.literal_chars = free_buffer;
514  }
515
516  inline void StartRawLiteral() {
517    LiteralBuffer* free_buffer =
518        (current_.raw_literal_chars == &raw_literal_buffer0_)
519            ? &raw_literal_buffer1_
520            : (current_.raw_literal_chars == &raw_literal_buffer1_)
521                  ? &raw_literal_buffer2_
522                  : &raw_literal_buffer0_;
523    free_buffer->Reset();
524    next_.raw_literal_chars = free_buffer;
525  }
526
527  INLINE(void AddLiteralChar(uc32 c)) {
528    DCHECK_NOT_NULL(next_.literal_chars);
529    next_.literal_chars->AddChar(c);
530  }
531
532  INLINE(void AddLiteralChar(char c)) {
533    DCHECK_NOT_NULL(next_.literal_chars);
534    next_.literal_chars->AddChar(c);
535  }
536
537  INLINE(void AddRawLiteralChar(uc32 c)) {
538    DCHECK_NOT_NULL(next_.raw_literal_chars);
539    next_.raw_literal_chars->AddChar(c);
540  }
541
542  INLINE(void ReduceRawLiteralLength(int delta)) {
543    DCHECK_NOT_NULL(next_.raw_literal_chars);
544    next_.raw_literal_chars->ReduceLength(delta);
545  }
546
547  // Stops scanning of a literal and drop the collected characters,
548  // e.g., due to an encountered error.
549  inline void DropLiteral() {
550    next_.literal_chars = NULL;
551    next_.raw_literal_chars = NULL;
552  }
553
554  inline void AddLiteralCharAdvance() {
555    AddLiteralChar(c0_);
556    Advance();
557  }
558
559  // Low-level scanning support.
560  template <bool capture_raw = false, bool check_surrogate = true>
561  void Advance() {
562    if (capture_raw) {
563      AddRawLiteralChar(c0_);
564    }
565    c0_ = source_->Advance();
566    if (check_surrogate) HandleLeadSurrogate();
567  }
568
569  void HandleLeadSurrogate() {
570    if (unibrow::Utf16::IsLeadSurrogate(c0_)) {
571      uc32 c1 = source_->Advance();
572      if (!unibrow::Utf16::IsTrailSurrogate(c1)) {
573        source_->Back();
574      } else {
575        c0_ = unibrow::Utf16::CombineSurrogatePair(c0_, c1);
576      }
577    }
578  }
579
580  void PushBack(uc32 ch) {
581    if (c0_ > static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
582      source_->Back2();
583    } else {
584      source_->Back();
585    }
586    c0_ = ch;
587  }
588
589  // Same as PushBack(ch1); PushBack(ch2).
590  // - Potentially more efficient as it uses Back2() on the stream.
591  // - Uses char as parameters, since we're only calling it with ASCII chars in
592  //   practice. This way, we can avoid a few edge cases.
593  void PushBack2(char ch1, char ch2) {
594    source_->Back2();
595    c0_ = ch2;
596  }
597
598  inline Token::Value Select(Token::Value tok) {
599    Advance();
600    return tok;
601  }
602
603  inline Token::Value Select(uc32 next, Token::Value then, Token::Value else_) {
604    Advance();
605    if (c0_ == next) {
606      Advance();
607      return then;
608    } else {
609      return else_;
610    }
611  }
612
613  // Returns the literal string, if any, for the current token (the
614  // token last returned by Next()). The string is 0-terminated.
615  // Literal strings are collected for identifiers, strings, numbers as well
616  // as for template literals. For template literals we also collect the raw
617  // form.
618  // These functions only give the correct result if the literal was scanned
619  // when a LiteralScope object is alive.
620  //
621  // Current usage of these functions is unfortunately a little undisciplined,
622  // and is_literal_one_byte() + is_literal_one_byte_string() is also
623  // requested for tokens that do not have a literal. Hence, we treat any
624  // token as a one-byte literal. E.g. Token::FUNCTION pretends to have a
625  // literal "function".
626  Vector<const uint8_t> literal_one_byte_string() {
627    if (current_.literal_chars)
628      return current_.literal_chars->one_byte_literal();
629    const char* str = Token::String(current_.token);
630    const uint8_t* str_as_uint8 = reinterpret_cast<const uint8_t*>(str);
631    return Vector<const uint8_t>(str_as_uint8,
632                                 Token::StringLength(current_.token));
633  }
634  Vector<const uint16_t> literal_two_byte_string() {
635    DCHECK_NOT_NULL(current_.literal_chars);
636    return current_.literal_chars->two_byte_literal();
637  }
638  bool is_literal_one_byte() {
639    return !current_.literal_chars || current_.literal_chars->is_one_byte();
640  }
641  int literal_length() const {
642    if (current_.literal_chars) return current_.literal_chars->length();
643    return Token::StringLength(current_.token);
644  }
645  // Returns the literal string for the next token (the token that
646  // would be returned if Next() were called).
647  Vector<const uint8_t> next_literal_one_byte_string() {
648    DCHECK_NOT_NULL(next_.literal_chars);
649    return next_.literal_chars->one_byte_literal();
650  }
651  Vector<const uint16_t> next_literal_two_byte_string() {
652    DCHECK_NOT_NULL(next_.literal_chars);
653    return next_.literal_chars->two_byte_literal();
654  }
655  bool is_next_literal_one_byte() {
656    DCHECK_NOT_NULL(next_.literal_chars);
657    return next_.literal_chars->is_one_byte();
658  }
659  Vector<const uint8_t> raw_literal_one_byte_string() {
660    DCHECK_NOT_NULL(current_.raw_literal_chars);
661    return current_.raw_literal_chars->one_byte_literal();
662  }
663  Vector<const uint16_t> raw_literal_two_byte_string() {
664    DCHECK_NOT_NULL(current_.raw_literal_chars);
665    return current_.raw_literal_chars->two_byte_literal();
666  }
667  bool is_raw_literal_one_byte() {
668    DCHECK_NOT_NULL(current_.raw_literal_chars);
669    return current_.raw_literal_chars->is_one_byte();
670  }
671
672  template <bool capture_raw, bool unicode = false>
673  uc32 ScanHexNumber(int expected_length);
674  // Scan a number of any length but not bigger than max_value. For example, the
675  // number can be 000000001, so it's very long in characters but its value is
676  // small.
677  template <bool capture_raw>
678  uc32 ScanUnlimitedLengthHexNumber(int max_value, int beg_pos);
679
680  // Scans a single JavaScript token.
681  void Scan();
682
683  bool SkipWhiteSpace();
684  Token::Value SkipSingleLineComment();
685  Token::Value SkipSourceURLComment();
686  void TryToParseSourceURLComment();
687  Token::Value SkipMultiLineComment();
688  // Scans a possible HTML comment -- begins with '<!'.
689  Token::Value ScanHtmlComment();
690
691  void ScanDecimalDigits();
692  Token::Value ScanNumber(bool seen_period);
693  Token::Value ScanIdentifierOrKeyword();
694  Token::Value ScanIdentifierSuffix(LiteralScope* literal, bool escaped);
695
696  Token::Value ScanString();
697
698  // Scans an escape-sequence which is part of a string and adds the
699  // decoded character to the current literal. Returns true if a pattern
700  // is scanned.
701  template <bool capture_raw, bool in_template_literal>
702  bool ScanEscape();
703
704  // Decodes a Unicode escape-sequence which is part of an identifier.
705  // If the escape sequence cannot be decoded the result is kBadChar.
706  uc32 ScanIdentifierUnicodeEscape();
707  // Helper for the above functions.
708  template <bool capture_raw>
709  uc32 ScanUnicodeEscape();
710
711  Token::Value ScanTemplateSpan();
712
713  // Return the current source position.
714  int source_pos() {
715    return static_cast<int>(source_->pos()) - kCharacterLookaheadBufferSize;
716  }
717
718  static bool LiteralContainsEscapes(const TokenDesc& token) {
719    Location location = token.location;
720    int source_length = (location.end_pos - location.beg_pos);
721    if (token.token == Token::STRING) {
722      // Subtract delimiters.
723      source_length -= 2;
724    }
725    return token.literal_chars &&
726           (token.literal_chars->length() != source_length);
727  }
728
729#ifdef DEBUG
730  void SanityCheckTokenDesc(const TokenDesc&) const;
731#endif
732
733  UnicodeCache* unicode_cache_;
734
735  // Buffers collecting literal strings, numbers, etc.
736  LiteralBuffer literal_buffer0_;
737  LiteralBuffer literal_buffer1_;
738  LiteralBuffer literal_buffer2_;
739
740  // Values parsed from magic comments.
741  LiteralBuffer source_url_;
742  LiteralBuffer source_mapping_url_;
743
744  // Buffer to store raw string values
745  LiteralBuffer raw_literal_buffer0_;
746  LiteralBuffer raw_literal_buffer1_;
747  LiteralBuffer raw_literal_buffer2_;
748
749  TokenDesc current_;    // desc for current token (as returned by Next())
750  TokenDesc next_;       // desc for next token (one token look-ahead)
751  TokenDesc next_next_;  // desc for the token after next (after PeakAhead())
752
753  // Input stream. Must be initialized to an Utf16CharacterStream.
754  Utf16CharacterStream* source_;
755
756  // Last-seen positions of potentially problematic tokens.
757  Location octal_pos_;
758  MessageTemplate::Template octal_message_;
759
760  // One Unicode character look-ahead; c0_ < 0 at the end of the input.
761  uc32 c0_;
762
763  // Whether there is a line terminator whitespace character after
764  // the current token, and  before the next. Does not count newlines
765  // inside multiline comments.
766  bool has_line_terminator_before_next_;
767  // Whether there is a multi-line comment that contains a
768  // line-terminator after the current token, and before the next.
769  bool has_multiline_comment_before_next_;
770  bool has_line_terminator_after_next_;
771
772  // Whether this scanner encountered an HTML comment.
773  bool found_html_comment_;
774
775  MessageTemplate::Template scanner_error_;
776  Location scanner_error_location_;
777
778  MessageTemplate::Template invalid_template_escape_message_;
779  Location invalid_template_escape_location_;
780};
781
782}  // namespace internal
783}  // namespace v8
784
785#endif  // V8_PARSING_SCANNER_H_
786