scanner.cc revision 6ded16be15dd865a9b21ea304d5273c8be299c87
1// Copyright 2006-2008 the V8 project authors. All rights reserved.
2// Redistribution and use in source and binary forms, with or without
3// modification, are permitted provided that the following conditions are
4// met:
5//
6//     * Redistributions of source code must retain the above copyright
7//       notice, this list of conditions and the following disclaimer.
8//     * Redistributions in binary form must reproduce the above
9//       copyright notice, this list of conditions and the following
10//       disclaimer in the documentation and/or other materials provided
11//       with the distribution.
12//     * Neither the name of Google Inc. nor the names of its
13//       contributors may be used to endorse or promote products derived
14//       from this software without specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28#include "v8.h"
29
30#include "ast.h"
31#include "handles.h"
32#include "scanner.h"
33
34namespace v8 {
35namespace internal {
36
37// ----------------------------------------------------------------------------
38// Character predicates
39
40
41unibrow::Predicate<IdentifierStart, 128> Scanner::kIsIdentifierStart;
42unibrow::Predicate<IdentifierPart, 128> Scanner::kIsIdentifierPart;
43unibrow::Predicate<unibrow::LineTerminator, 128> Scanner::kIsLineTerminator;
44unibrow::Predicate<unibrow::WhiteSpace, 128> Scanner::kIsWhiteSpace;
45
46
47StaticResource<Scanner::Utf8Decoder> Scanner::utf8_decoder_;
48
49
50// ----------------------------------------------------------------------------
51// UTF8Buffer
52
53UTF8Buffer::UTF8Buffer() : data_(NULL), limit_(NULL) { }
54
55
56UTF8Buffer::~UTF8Buffer() {
57  if (data_ != NULL) DeleteArray(data_);
58}
59
60
61void UTF8Buffer::AddCharSlow(uc32 c) {
62  static const int kCapacityGrowthLimit = 1 * MB;
63  if (cursor_ > limit_) {
64    int old_capacity = Capacity();
65    int old_position = pos();
66    int new_capacity =
67        Min(old_capacity * 3, old_capacity + kCapacityGrowthLimit);
68    char* new_data = NewArray<char>(new_capacity);
69    memcpy(new_data, data_, old_position);
70    DeleteArray(data_);
71    data_ = new_data;
72    cursor_ = new_data + old_position;
73    limit_ = ComputeLimit(new_data, new_capacity);
74    ASSERT(Capacity() == new_capacity && pos() == old_position);
75  }
76  if (static_cast<unsigned>(c) <= unibrow::Utf8::kMaxOneByteChar) {
77    *cursor_++ = c;  // Common case: 7-bit ASCII.
78  } else {
79    cursor_ += unibrow::Utf8::Encode(cursor_, c);
80  }
81  ASSERT(pos() <= Capacity());
82}
83
84
85// ----------------------------------------------------------------------------
86// UTF16Buffer
87
88
89UTF16Buffer::UTF16Buffer()
90    : pos_(0), end_(Scanner::kNoEndPosition) { }
91
92
93// CharacterStreamUTF16Buffer
94CharacterStreamUTF16Buffer::CharacterStreamUTF16Buffer()
95    : pushback_buffer_(0), last_(0), stream_(NULL) { }
96
97
98void CharacterStreamUTF16Buffer::Initialize(Handle<String> data,
99                                            unibrow::CharacterStream* input,
100                                            int start_position,
101                                            int end_position) {
102  stream_ = input;
103  if (start_position > 0) {
104    SeekForward(start_position);
105  }
106  end_ = end_position != Scanner::kNoEndPosition ? end_position : kMaxInt;
107}
108
109
110void CharacterStreamUTF16Buffer::PushBack(uc32 ch) {
111  pushback_buffer()->Add(last_);
112  last_ = ch;
113  pos_--;
114}
115
116
117uc32 CharacterStreamUTF16Buffer::Advance() {
118  ASSERT(end_ != Scanner::kNoEndPosition);
119  ASSERT(end_ >= 0);
120  // NOTE: It is of importance to Persian / Farsi resources that we do
121  // *not* strip format control characters in the scanner; see
122  //
123  //    https://bugzilla.mozilla.org/show_bug.cgi?id=274152
124  //
125  // So, even though ECMA-262, section 7.1, page 11, dictates that we
126  // must remove Unicode format-control characters, we do not. This is
127  // in line with how IE and SpiderMonkey handles it.
128  if (!pushback_buffer()->is_empty()) {
129    pos_++;
130    return last_ = pushback_buffer()->RemoveLast();
131  } else if (stream_->has_more() && pos_ < end_) {
132    pos_++;
133    uc32 next = stream_->GetNext();
134    return last_ = next;
135  } else {
136    // Note: currently the following increment is necessary to avoid a
137    // test-parser problem!
138    pos_++;
139    return last_ = static_cast<uc32>(-1);
140  }
141}
142
143
144void CharacterStreamUTF16Buffer::SeekForward(int pos) {
145  pos_ = pos;
146  ASSERT(pushback_buffer()->is_empty());
147  stream_->Seek(pos);
148}
149
150
151// ExternalStringUTF16Buffer
152template <typename StringType, typename CharType>
153ExternalStringUTF16Buffer<StringType, CharType>::ExternalStringUTF16Buffer()
154    : raw_data_(NULL) { }
155
156
157template <typename StringType, typename CharType>
158void ExternalStringUTF16Buffer<StringType, CharType>::Initialize(
159     Handle<StringType> data,
160     int start_position,
161     int end_position) {
162  ASSERT(!data.is_null());
163  raw_data_ = data->resource()->data();
164
165  ASSERT(end_position <= data->length());
166  if (start_position > 0) {
167    SeekForward(start_position);
168  }
169  end_ =
170      end_position != Scanner::kNoEndPosition ? end_position : data->length();
171}
172
173
174template <typename StringType, typename CharType>
175uc32 ExternalStringUTF16Buffer<StringType, CharType>::Advance() {
176  if (pos_ < end_) {
177    return raw_data_[pos_++];
178  } else {
179    // note: currently the following increment is necessary to avoid a
180    // test-parser problem!
181    pos_++;
182    return static_cast<uc32>(-1);
183  }
184}
185
186
187template <typename StringType, typename CharType>
188void ExternalStringUTF16Buffer<StringType, CharType>::PushBack(uc32 ch) {
189  pos_--;
190  ASSERT(pos_ >= Scanner::kCharacterLookaheadBufferSize);
191  ASSERT(raw_data_[pos_ - Scanner::kCharacterLookaheadBufferSize] == ch);
192}
193
194
195template <typename StringType, typename CharType>
196void ExternalStringUTF16Buffer<StringType, CharType>::SeekForward(int pos) {
197  pos_ = pos;
198}
199
200
201// ----------------------------------------------------------------------------
202// Keyword Matcher
203KeywordMatcher::FirstState KeywordMatcher::first_states_[] = {
204  { "break",  KEYWORD_PREFIX, Token::BREAK },
205  { NULL,     C,              Token::ILLEGAL },
206  { NULL,     D,              Token::ILLEGAL },
207  { "else",   KEYWORD_PREFIX, Token::ELSE },
208  { NULL,     F,              Token::ILLEGAL },
209  { NULL,     UNMATCHABLE,    Token::ILLEGAL },
210  { NULL,     UNMATCHABLE,    Token::ILLEGAL },
211  { NULL,     I,              Token::ILLEGAL },
212  { NULL,     UNMATCHABLE,    Token::ILLEGAL },
213  { NULL,     UNMATCHABLE,    Token::ILLEGAL },
214  { NULL,     UNMATCHABLE,    Token::ILLEGAL },
215  { NULL,     UNMATCHABLE,    Token::ILLEGAL },
216  { NULL,     N,              Token::ILLEGAL },
217  { NULL,     UNMATCHABLE,    Token::ILLEGAL },
218  { NULL,     UNMATCHABLE,    Token::ILLEGAL },
219  { NULL,     UNMATCHABLE,    Token::ILLEGAL },
220  { "return", KEYWORD_PREFIX, Token::RETURN },
221  { "switch", KEYWORD_PREFIX, Token::SWITCH },
222  { NULL,     T,              Token::ILLEGAL },
223  { NULL,     UNMATCHABLE,    Token::ILLEGAL },
224  { NULL,     V,              Token::ILLEGAL },
225  { NULL,     W,              Token::ILLEGAL }
226};
227
228
229void KeywordMatcher::Step(uc32 input) {
230  switch (state_) {
231    case INITIAL: {
232      // matching the first character is the only state with significant fanout.
233      // Match only lower-case letters in range 'b'..'w'.
234      unsigned int offset = input - kFirstCharRangeMin;
235      if (offset < kFirstCharRangeLength) {
236        state_ = first_states_[offset].state;
237        if (state_ == KEYWORD_PREFIX) {
238          keyword_ = first_states_[offset].keyword;
239          counter_ = 1;
240          keyword_token_ = first_states_[offset].token;
241        }
242        return;
243      }
244      break;
245    }
246    case KEYWORD_PREFIX:
247      if (keyword_[counter_] == input) {
248        ASSERT_NE(input, '\0');
249        counter_++;
250        if (keyword_[counter_] == '\0') {
251          state_ = KEYWORD_MATCHED;
252          token_ = keyword_token_;
253        }
254        return;
255      }
256      break;
257    case KEYWORD_MATCHED:
258      token_ = Token::IDENTIFIER;
259      break;
260    case C:
261      if (MatchState(input, 'a', CA)) return;
262      if (MatchState(input, 'o', CO)) return;
263      break;
264    case CA:
265      if (MatchKeywordStart(input, "case", 2, Token::CASE)) return;
266      if (MatchKeywordStart(input, "catch", 2, Token::CATCH)) return;
267      break;
268    case CO:
269      if (MatchState(input, 'n', CON)) return;
270      break;
271    case CON:
272      if (MatchKeywordStart(input, "const", 3, Token::CONST)) return;
273      if (MatchKeywordStart(input, "continue", 3, Token::CONTINUE)) return;
274      break;
275    case D:
276      if (MatchState(input, 'e', DE)) return;
277      if (MatchKeyword(input, 'o', KEYWORD_MATCHED, Token::DO)) return;
278      break;
279    case DE:
280      if (MatchKeywordStart(input, "debugger", 2, Token::DEBUGGER)) return;
281      if (MatchKeywordStart(input, "default", 2, Token::DEFAULT)) return;
282      if (MatchKeywordStart(input, "delete", 2, Token::DELETE)) return;
283      break;
284    case F:
285      if (MatchKeywordStart(input, "false", 1, Token::FALSE_LITERAL)) return;
286      if (MatchKeywordStart(input, "finally", 1, Token::FINALLY)) return;
287      if (MatchKeywordStart(input, "for", 1, Token::FOR)) return;
288      if (MatchKeywordStart(input, "function", 1, Token::FUNCTION)) return;
289      break;
290    case I:
291      if (MatchKeyword(input, 'f', KEYWORD_MATCHED, Token::IF)) return;
292      if (MatchKeyword(input, 'n', IN, Token::IN)) return;
293      break;
294    case IN:
295      token_ = Token::IDENTIFIER;
296      if (MatchKeywordStart(input, "instanceof", 2, Token::INSTANCEOF)) {
297        return;
298      }
299      break;
300    case N:
301      if (MatchKeywordStart(input, "native", 1, Token::NATIVE)) return;
302      if (MatchKeywordStart(input, "new", 1, Token::NEW)) return;
303      if (MatchKeywordStart(input, "null", 1, Token::NULL_LITERAL)) return;
304      break;
305    case T:
306      if (MatchState(input, 'h', TH)) return;
307      if (MatchState(input, 'r', TR)) return;
308      if (MatchKeywordStart(input, "typeof", 1, Token::TYPEOF)) return;
309      break;
310    case TH:
311      if (MatchKeywordStart(input, "this", 2, Token::THIS)) return;
312      if (MatchKeywordStart(input, "throw", 2, Token::THROW)) return;
313      break;
314    case TR:
315      if (MatchKeywordStart(input, "true", 2, Token::TRUE_LITERAL)) return;
316      if (MatchKeyword(input, 'y', KEYWORD_MATCHED, Token::TRY)) return;
317      break;
318    case V:
319      if (MatchKeywordStart(input, "var", 1, Token::VAR)) return;
320      if (MatchKeywordStart(input, "void", 1, Token::VOID)) return;
321      break;
322    case W:
323      if (MatchKeywordStart(input, "while", 1, Token::WHILE)) return;
324      if (MatchKeywordStart(input, "with", 1, Token::WITH)) return;
325      break;
326    default:
327      UNREACHABLE();
328  }
329  // On fallthrough, it's a failure.
330  state_ = UNMATCHABLE;
331}
332
333
334// ----------------------------------------------------------------------------
335// Scanner
336
337Scanner::Scanner(ParserMode pre)
338    : stack_overflow_(false), is_pre_parsing_(pre == PREPARSE) { }
339
340
341void Scanner::Initialize(Handle<String> source,
342                         ParserLanguage language) {
343  safe_string_input_buffer_.Reset(source.location());
344  Init(source, &safe_string_input_buffer_, 0, source->length(), language);
345}
346
347
348void Scanner::Initialize(Handle<String> source,
349                         unibrow::CharacterStream* stream,
350                         ParserLanguage language) {
351  Init(source, stream, 0, kNoEndPosition, language);
352}
353
354
355void Scanner::Initialize(Handle<String> source,
356                         int start_position,
357                         int end_position,
358                         ParserLanguage language) {
359  safe_string_input_buffer_.Reset(source.location());
360  Init(source, &safe_string_input_buffer_,
361       start_position, end_position, language);
362}
363
364
365void Scanner::Init(Handle<String> source,
366                   unibrow::CharacterStream* stream,
367                   int start_position,
368                   int end_position,
369                   ParserLanguage language) {
370  // Initialize the source buffer.
371  if (!source.is_null() && StringShape(*source).IsExternalTwoByte()) {
372    two_byte_string_buffer_.Initialize(
373        Handle<ExternalTwoByteString>::cast(source),
374        start_position,
375        end_position);
376    source_ = &two_byte_string_buffer_;
377  } else if (!source.is_null() && StringShape(*source).IsExternalAscii()) {
378    ascii_string_buffer_.Initialize(
379        Handle<ExternalAsciiString>::cast(source),
380        start_position,
381        end_position);
382    source_ = &ascii_string_buffer_;
383  } else {
384    char_stream_buffer_.Initialize(source,
385                                   stream,
386                                   start_position,
387                                   end_position);
388    source_ = &char_stream_buffer_;
389  }
390
391  is_parsing_json_ = (language == JSON);
392
393  // Set c0_ (one character ahead)
394  ASSERT(kCharacterLookaheadBufferSize == 1);
395  Advance();
396  // Initializer current_ to not refer to a literal buffer.
397  current_.literal_buffer = NULL;
398
399  // Skip initial whitespace allowing HTML comment ends just like
400  // after a newline and scan first token.
401  has_line_terminator_before_next_ = true;
402  SkipWhiteSpace();
403  Scan();
404}
405
406
407Token::Value Scanner::Next() {
408  // BUG 1215673: Find a thread safe way to set a stack limit in
409  // pre-parse mode. Otherwise, we cannot safely pre-parse from other
410  // threads.
411  current_ = next_;
412  // Check for stack-overflow before returning any tokens.
413  StackLimitCheck check;
414  if (check.HasOverflowed()) {
415    stack_overflow_ = true;
416    next_.token = Token::ILLEGAL;
417  } else {
418    Scan();
419  }
420  return current_.token;
421}
422
423
424void Scanner::StartLiteral() {
425  // Use the first buffer unless it's currently in use by the current_ token.
426  // In most cases we won't have two literals/identifiers in a row, so
427  // the second buffer won't be used very often and is unlikely to grow much.
428  UTF8Buffer* free_buffer =
429      (current_.literal_buffer != &literal_buffer_1_) ? &literal_buffer_1_
430                                                      : &literal_buffer_2_;
431  next_.literal_buffer = free_buffer;
432  free_buffer->Reset();
433}
434
435
436void Scanner::AddChar(uc32 c) {
437  next_.literal_buffer->AddChar(c);
438}
439
440
441void Scanner::TerminateLiteral() {
442  AddChar(0);
443}
444
445
446void Scanner::AddCharAdvance() {
447  AddChar(c0_);
448  Advance();
449}
450
451
452static inline bool IsByteOrderMark(uc32 c) {
453  // The Unicode value U+FFFE is guaranteed never to be assigned as a
454  // Unicode character; this implies that in a Unicode context the
455  // 0xFF, 0xFE byte pattern can only be interpreted as the U+FEFF
456  // character expressed in little-endian byte order (since it could
457  // not be a U+FFFE character expressed in big-endian byte
458  // order). Nevertheless, we check for it to be compatible with
459  // Spidermonkey.
460  return c == 0xFEFF || c == 0xFFFE;
461}
462
463
464bool Scanner::SkipJsonWhiteSpace() {
465  int start_position = source_pos();
466  // JSON WhiteSpace is tab, carrige-return, newline and space.
467  while (c0_ == ' ' || c0_ == '\n' || c0_ == '\r' || c0_ == '\t') {
468    Advance();
469  }
470  return source_pos() != start_position;
471}
472
473
474bool Scanner::SkipJavaScriptWhiteSpace() {
475  int start_position = source_pos();
476
477  while (true) {
478    // We treat byte-order marks (BOMs) as whitespace for better
479    // compatibility with Spidermonkey and other JavaScript engines.
480    while (kIsWhiteSpace.get(c0_) || IsByteOrderMark(c0_)) {
481      // IsWhiteSpace() includes line terminators!
482      if (kIsLineTerminator.get(c0_)) {
483        // Ignore line terminators, but remember them. This is necessary
484        // for automatic semicolon insertion.
485        has_line_terminator_before_next_ = true;
486      }
487      Advance();
488    }
489
490    // If there is an HTML comment end '-->' at the beginning of a
491    // line (with only whitespace in front of it), we treat the rest
492    // of the line as a comment. This is in line with the way
493    // SpiderMonkey handles it.
494    if (c0_ == '-' && has_line_terminator_before_next_) {
495      Advance();
496      if (c0_ == '-') {
497        Advance();
498        if (c0_ == '>') {
499          // Treat the rest of the line as a comment.
500          SkipSingleLineComment();
501          // Continue skipping white space after the comment.
502          continue;
503        }
504        PushBack('-');  // undo Advance()
505      }
506      PushBack('-');  // undo Advance()
507    }
508    // Return whether or not we skipped any characters.
509    return source_pos() != start_position;
510  }
511}
512
513
514Token::Value Scanner::SkipSingleLineComment() {
515  Advance();
516
517  // The line terminator at the end of the line is not considered
518  // to be part of the single-line comment; it is recognized
519  // separately by the lexical grammar and becomes part of the
520  // stream of input elements for the syntactic grammar (see
521  // ECMA-262, section 7.4, page 12).
522  while (c0_ >= 0 && !kIsLineTerminator.get(c0_)) {
523    Advance();
524  }
525
526  return Token::WHITESPACE;
527}
528
529
530Token::Value Scanner::SkipMultiLineComment() {
531  ASSERT(c0_ == '*');
532  Advance();
533
534  while (c0_ >= 0) {
535    char ch = c0_;
536    Advance();
537    // If we have reached the end of the multi-line comment, we
538    // consume the '/' and insert a whitespace. This way all
539    // multi-line comments are treated as whitespace - even the ones
540    // containing line terminators. This contradicts ECMA-262, section
541    // 7.4, page 12, that says that multi-line comments containing
542    // line terminators should be treated as a line terminator, but it
543    // matches the behaviour of SpiderMonkey and KJS.
544    if (ch == '*' && c0_ == '/') {
545      c0_ = ' ';
546      return Token::WHITESPACE;
547    }
548  }
549
550  // Unterminated multi-line comment.
551  return Token::ILLEGAL;
552}
553
554
555Token::Value Scanner::ScanHtmlComment() {
556  // Check for <!-- comments.
557  ASSERT(c0_ == '!');
558  Advance();
559  if (c0_ == '-') {
560    Advance();
561    if (c0_ == '-') return SkipSingleLineComment();
562    PushBack('-');  // undo Advance()
563  }
564  PushBack('!');  // undo Advance()
565  ASSERT(c0_ == '!');
566  return Token::LT;
567}
568
569
570
571void Scanner::ScanJson() {
572  next_.literal_buffer = NULL;
573  Token::Value token;
574  has_line_terminator_before_next_ = false;
575  do {
576    // Remember the position of the next token
577    next_.location.beg_pos = source_pos();
578    switch (c0_) {
579      case '\t':
580      case '\r':
581      case '\n':
582      case ' ':
583        Advance();
584        token = Token::WHITESPACE;
585        break;
586      case '{':
587        Advance();
588        token = Token::LBRACE;
589        break;
590      case '}':
591        Advance();
592        token = Token::RBRACE;
593        break;
594      case '[':
595        Advance();
596        token = Token::LBRACK;
597        break;
598      case ']':
599        Advance();
600        token = Token::RBRACK;
601        break;
602      case ':':
603        Advance();
604        token = Token::COLON;
605        break;
606      case ',':
607        Advance();
608        token = Token::COMMA;
609        break;
610      case '"':
611        token = ScanJsonString();
612        break;
613      case '-':
614      case '0':
615      case '1':
616      case '2':
617      case '3':
618      case '4':
619      case '5':
620      case '6':
621      case '7':
622      case '8':
623      case '9':
624        token = ScanJsonNumber();
625        break;
626      case 't':
627        token = ScanJsonIdentifier("true", Token::TRUE_LITERAL);
628        break;
629      case 'f':
630        token = ScanJsonIdentifier("false", Token::FALSE_LITERAL);
631        break;
632      case 'n':
633        token = ScanJsonIdentifier("null", Token::NULL_LITERAL);
634        break;
635      default:
636        if (c0_ < 0) {
637          Advance();
638          token = Token::EOS;
639        } else {
640          Advance();
641          token = Select(Token::ILLEGAL);
642        }
643    }
644  } while (token == Token::WHITESPACE);
645
646  next_.location.end_pos = source_pos();
647  next_.token = token;
648}
649
650
651Token::Value Scanner::ScanJsonString() {
652  ASSERT_EQ('"', c0_);
653  Advance();
654  StartLiteral();
655  while (c0_ != '"' && c0_ > 0) {
656    // Check for control character (0x00-0x1f) or unterminated string (<0).
657    if (c0_ < 0x20) return Token::ILLEGAL;
658    if (c0_ != '\\') {
659      AddCharAdvance();
660    } else {
661      Advance();
662      switch (c0_) {
663        case '"':
664        case '\\':
665        case '/':
666          AddChar(c0_);
667          break;
668        case 'b':
669          AddChar('\x08');
670          break;
671        case 'f':
672          AddChar('\x0c');
673          break;
674        case 'n':
675          AddChar('\x0a');
676          break;
677        case 'r':
678          AddChar('\x0d');
679          break;
680        case 't':
681          AddChar('\x09');
682          break;
683        case 'u': {
684          uc32 value = 0;
685          for (int i = 0; i < 4; i++) {
686            Advance();
687            int digit = HexValue(c0_);
688            if (digit < 0) return Token::ILLEGAL;
689            value = value * 16 + digit;
690          }
691          AddChar(value);
692          break;
693        }
694        default:
695          return Token::ILLEGAL;
696      }
697      Advance();
698    }
699  }
700  if (c0_ != '"') {
701    return Token::ILLEGAL;
702  }
703  TerminateLiteral();
704  Advance();
705  return Token::STRING;
706}
707
708
709Token::Value Scanner::ScanJsonNumber() {
710  StartLiteral();
711  if (c0_ == '-') AddCharAdvance();
712  if (c0_ == '0') {
713    AddCharAdvance();
714    // Prefix zero is only allowed if it's the only digit before
715    // a decimal point or exponent.
716    if ('0' <= c0_ && c0_ <= '9') return Token::ILLEGAL;
717  } else {
718    if (c0_ < '1' || c0_ > '9') return Token::ILLEGAL;
719    do {
720      AddCharAdvance();
721    } while (c0_ >= '0' && c0_ <= '9');
722  }
723  if (c0_ == '.') {
724    AddCharAdvance();
725    if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL;
726    do {
727      AddCharAdvance();
728    } while (c0_ >= '0' && c0_ <= '9');
729  }
730  if ((c0_ | 0x20) == 'e') {
731    AddCharAdvance();
732    if (c0_ == '-' || c0_ == '+') AddCharAdvance();
733    if (c0_ < '0' || c0_ > '9') return Token::ILLEGAL;
734    do {
735      AddCharAdvance();
736    } while (c0_ >= '0' && c0_ <= '9');
737  }
738  TerminateLiteral();
739  return Token::NUMBER;
740}
741
742
743Token::Value Scanner::ScanJsonIdentifier(const char* text,
744                                         Token::Value token) {
745  StartLiteral();
746  while (*text != '\0') {
747    if (c0_ != *text) return Token::ILLEGAL;
748    Advance();
749    text++;
750  }
751  if (kIsIdentifierPart.get(c0_)) return Token::ILLEGAL;
752  TerminateLiteral();
753  return token;
754}
755
756
757void Scanner::ScanJavaScript() {
758  next_.literal_buffer = NULL;
759  Token::Value token;
760  has_line_terminator_before_next_ = false;
761  do {
762    // Remember the position of the next token
763    next_.location.beg_pos = source_pos();
764
765    switch (c0_) {
766      case ' ':
767      case '\t':
768        Advance();
769        token = Token::WHITESPACE;
770        break;
771
772      case '\n':
773        Advance();
774        has_line_terminator_before_next_ = true;
775        token = Token::WHITESPACE;
776        break;
777
778      case '"': case '\'':
779        token = ScanString();
780        break;
781
782      case '<':
783        // < <= << <<= <!--
784        Advance();
785        if (c0_ == '=') {
786          token = Select(Token::LTE);
787        } else if (c0_ == '<') {
788          token = Select('=', Token::ASSIGN_SHL, Token::SHL);
789        } else if (c0_ == '!') {
790          token = ScanHtmlComment();
791        } else {
792          token = Token::LT;
793        }
794        break;
795
796      case '>':
797        // > >= >> >>= >>> >>>=
798        Advance();
799        if (c0_ == '=') {
800          token = Select(Token::GTE);
801        } else if (c0_ == '>') {
802          // >> >>= >>> >>>=
803          Advance();
804          if (c0_ == '=') {
805            token = Select(Token::ASSIGN_SAR);
806          } else if (c0_ == '>') {
807            token = Select('=', Token::ASSIGN_SHR, Token::SHR);
808          } else {
809            token = Token::SAR;
810          }
811        } else {
812          token = Token::GT;
813        }
814        break;
815
816      case '=':
817        // = == ===
818        Advance();
819        if (c0_ == '=') {
820          token = Select('=', Token::EQ_STRICT, Token::EQ);
821        } else {
822          token = Token::ASSIGN;
823        }
824        break;
825
826      case '!':
827        // ! != !==
828        Advance();
829        if (c0_ == '=') {
830          token = Select('=', Token::NE_STRICT, Token::NE);
831        } else {
832          token = Token::NOT;
833        }
834        break;
835
836      case '+':
837        // + ++ +=
838        Advance();
839        if (c0_ == '+') {
840          token = Select(Token::INC);
841        } else if (c0_ == '=') {
842          token = Select(Token::ASSIGN_ADD);
843        } else {
844          token = Token::ADD;
845        }
846        break;
847
848      case '-':
849        // - -- --> -=
850        Advance();
851        if (c0_ == '-') {
852          Advance();
853          if (c0_ == '>' && has_line_terminator_before_next_) {
854            // For compatibility with SpiderMonkey, we skip lines that
855            // start with an HTML comment end '-->'.
856            token = SkipSingleLineComment();
857          } else {
858            token = Token::DEC;
859          }
860        } else if (c0_ == '=') {
861          token = Select(Token::ASSIGN_SUB);
862        } else {
863          token = Token::SUB;
864        }
865        break;
866
867      case '*':
868        // * *=
869        token = Select('=', Token::ASSIGN_MUL, Token::MUL);
870        break;
871
872      case '%':
873        // % %=
874        token = Select('=', Token::ASSIGN_MOD, Token::MOD);
875        break;
876
877      case '/':
878        // /  // /* /=
879        Advance();
880        if (c0_ == '/') {
881          token = SkipSingleLineComment();
882        } else if (c0_ == '*') {
883          token = SkipMultiLineComment();
884        } else if (c0_ == '=') {
885          token = Select(Token::ASSIGN_DIV);
886        } else {
887          token = Token::DIV;
888        }
889        break;
890
891      case '&':
892        // & && &=
893        Advance();
894        if (c0_ == '&') {
895          token = Select(Token::AND);
896        } else if (c0_ == '=') {
897          token = Select(Token::ASSIGN_BIT_AND);
898        } else {
899          token = Token::BIT_AND;
900        }
901        break;
902
903      case '|':
904        // | || |=
905        Advance();
906        if (c0_ == '|') {
907          token = Select(Token::OR);
908        } else if (c0_ == '=') {
909          token = Select(Token::ASSIGN_BIT_OR);
910        } else {
911          token = Token::BIT_OR;
912        }
913        break;
914
915      case '^':
916        // ^ ^=
917        token = Select('=', Token::ASSIGN_BIT_XOR, Token::BIT_XOR);
918        break;
919
920      case '.':
921        // . Number
922        Advance();
923        if (IsDecimalDigit(c0_)) {
924          token = ScanNumber(true);
925        } else {
926          token = Token::PERIOD;
927        }
928        break;
929
930      case ':':
931        token = Select(Token::COLON);
932        break;
933
934      case ';':
935        token = Select(Token::SEMICOLON);
936        break;
937
938      case ',':
939        token = Select(Token::COMMA);
940        break;
941
942      case '(':
943        token = Select(Token::LPAREN);
944        break;
945
946      case ')':
947        token = Select(Token::RPAREN);
948        break;
949
950      case '[':
951        token = Select(Token::LBRACK);
952        break;
953
954      case ']':
955        token = Select(Token::RBRACK);
956        break;
957
958      case '{':
959        token = Select(Token::LBRACE);
960        break;
961
962      case '}':
963        token = Select(Token::RBRACE);
964        break;
965
966      case '?':
967        token = Select(Token::CONDITIONAL);
968        break;
969
970      case '~':
971        token = Select(Token::BIT_NOT);
972        break;
973
974      default:
975        if (kIsIdentifierStart.get(c0_)) {
976          token = ScanIdentifier();
977        } else if (IsDecimalDigit(c0_)) {
978          token = ScanNumber(false);
979        } else if (SkipWhiteSpace()) {
980          token = Token::WHITESPACE;
981        } else if (c0_ < 0) {
982          token = Token::EOS;
983        } else {
984          token = Select(Token::ILLEGAL);
985        }
986        break;
987    }
988
989    // Continue scanning for tokens as long as we're just skipping
990    // whitespace.
991  } while (token == Token::WHITESPACE);
992
993  next_.location.end_pos = source_pos();
994  next_.token = token;
995}
996
997
998void Scanner::SeekForward(int pos) {
999  source_->SeekForward(pos - 1);
1000  Advance();
1001  Scan();
1002}
1003
1004
1005uc32 Scanner::ScanHexEscape(uc32 c, int length) {
1006  ASSERT(length <= 4);  // prevent overflow
1007
1008  uc32 digits[4];
1009  uc32 x = 0;
1010  for (int i = 0; i < length; i++) {
1011    digits[i] = c0_;
1012    int d = HexValue(c0_);
1013    if (d < 0) {
1014      // According to ECMA-262, 3rd, 7.8.4, page 18, these hex escapes
1015      // should be illegal, but other JS VMs just return the
1016      // non-escaped version of the original character.
1017
1018      // Push back digits read, except the last one (in c0_).
1019      for (int j = i-1; j >= 0; j--) {
1020        PushBack(digits[j]);
1021      }
1022      // Notice: No handling of error - treat it as "\u"->"u".
1023      return c;
1024    }
1025    x = x * 16 + d;
1026    Advance();
1027  }
1028
1029  return x;
1030}
1031
1032
1033// Octal escapes of the forms '\0xx' and '\xxx' are not a part of
1034// ECMA-262. Other JS VMs support them.
1035uc32 Scanner::ScanOctalEscape(uc32 c, int length) {
1036  uc32 x = c - '0';
1037  for (int i = 0; i < length; i++) {
1038    int d = c0_ - '0';
1039    if (d < 0 || d > 7) break;
1040    int nx = x * 8 + d;
1041    if (nx >= 256) break;
1042    x = nx;
1043    Advance();
1044  }
1045  return x;
1046}
1047
1048
1049void Scanner::ScanEscape() {
1050  uc32 c = c0_;
1051  Advance();
1052
1053  // Skip escaped newlines.
1054  if (kIsLineTerminator.get(c)) {
1055    // Allow CR+LF newlines in multiline string literals.
1056    if (IsCarriageReturn(c) && IsLineFeed(c0_)) Advance();
1057    // Allow LF+CR newlines in multiline string literals.
1058    if (IsLineFeed(c) && IsCarriageReturn(c0_)) Advance();
1059    return;
1060  }
1061
1062  switch (c) {
1063    case '\'':  // fall through
1064    case '"' :  // fall through
1065    case '\\': break;
1066    case 'b' : c = '\b'; break;
1067    case 'f' : c = '\f'; break;
1068    case 'n' : c = '\n'; break;
1069    case 'r' : c = '\r'; break;
1070    case 't' : c = '\t'; break;
1071    case 'u' : c = ScanHexEscape(c, 4); break;
1072    case 'v' : c = '\v'; break;
1073    case 'x' : c = ScanHexEscape(c, 2); break;
1074    case '0' :  // fall through
1075    case '1' :  // fall through
1076    case '2' :  // fall through
1077    case '3' :  // fall through
1078    case '4' :  // fall through
1079    case '5' :  // fall through
1080    case '6' :  // fall through
1081    case '7' : c = ScanOctalEscape(c, 2); break;
1082  }
1083
1084  // According to ECMA-262, 3rd, 7.8.4 (p 18ff) these
1085  // should be illegal, but they are commonly handled
1086  // as non-escaped characters by JS VMs.
1087  AddChar(c);
1088}
1089
1090
1091Token::Value Scanner::ScanString() {
1092  uc32 quote = c0_;
1093  Advance();  // consume quote
1094
1095  StartLiteral();
1096  while (c0_ != quote && c0_ >= 0 && !kIsLineTerminator.get(c0_)) {
1097    uc32 c = c0_;
1098    Advance();
1099    if (c == '\\') {
1100      if (c0_ < 0) return Token::ILLEGAL;
1101      ScanEscape();
1102    } else {
1103      AddChar(c);
1104    }
1105  }
1106  if (c0_ != quote) {
1107    return Token::ILLEGAL;
1108  }
1109  TerminateLiteral();
1110
1111  Advance();  // consume quote
1112  return Token::STRING;
1113}
1114
1115
1116Token::Value Scanner::Select(Token::Value tok) {
1117  Advance();
1118  return tok;
1119}
1120
1121
1122Token::Value Scanner::Select(uc32 next, Token::Value then, Token::Value else_) {
1123  Advance();
1124  if (c0_ == next) {
1125    Advance();
1126    return then;
1127  } else {
1128    return else_;
1129  }
1130}
1131
1132
1133// Returns true if any decimal digits were scanned, returns false otherwise.
1134void Scanner::ScanDecimalDigits() {
1135  while (IsDecimalDigit(c0_))
1136    AddCharAdvance();
1137}
1138
1139
1140Token::Value Scanner::ScanNumber(bool seen_period) {
1141  ASSERT(IsDecimalDigit(c0_));  // the first digit of the number or the fraction
1142
1143  enum { DECIMAL, HEX, OCTAL } kind = DECIMAL;
1144
1145  StartLiteral();
1146  if (seen_period) {
1147    // we have already seen a decimal point of the float
1148    AddChar('.');
1149    ScanDecimalDigits();  // we know we have at least one digit
1150
1151  } else {
1152    // if the first character is '0' we must check for octals and hex
1153    if (c0_ == '0') {
1154      AddCharAdvance();
1155
1156      // either 0, 0exxx, 0Exxx, 0.xxx, an octal number, or a hex number
1157      if (c0_ == 'x' || c0_ == 'X') {
1158        // hex number
1159        kind = HEX;
1160        AddCharAdvance();
1161        if (!IsHexDigit(c0_))
1162          // we must have at least one hex digit after 'x'/'X'
1163          return Token::ILLEGAL;
1164        while (IsHexDigit(c0_))
1165          AddCharAdvance();
1166
1167      } else if ('0' <= c0_ && c0_ <= '7') {
1168        // (possible) octal number
1169        kind = OCTAL;
1170        while (true) {
1171          if (c0_ == '8' || c0_ == '9') {
1172            kind = DECIMAL;
1173            break;
1174          }
1175          if (c0_  < '0' || '7'  < c0_) break;
1176          AddCharAdvance();
1177        }
1178      }
1179    }
1180
1181    // Parse decimal digits and allow trailing fractional part.
1182    if (kind == DECIMAL) {
1183      ScanDecimalDigits();  // optional
1184      if (c0_ == '.') {
1185        AddCharAdvance();
1186        ScanDecimalDigits();  // optional
1187      }
1188    }
1189  }
1190
1191  // scan exponent, if any
1192  if (c0_ == 'e' || c0_ == 'E') {
1193    ASSERT(kind != HEX);  // 'e'/'E' must be scanned as part of the hex number
1194    if (kind == OCTAL) return Token::ILLEGAL;  // no exponent for octals allowed
1195    // scan exponent
1196    AddCharAdvance();
1197    if (c0_ == '+' || c0_ == '-')
1198      AddCharAdvance();
1199    if (!IsDecimalDigit(c0_))
1200      // we must have at least one decimal digit after 'e'/'E'
1201      return Token::ILLEGAL;
1202    ScanDecimalDigits();
1203  }
1204  TerminateLiteral();
1205
1206  // The source character immediately following a numeric literal must
1207  // not be an identifier start or a decimal digit; see ECMA-262
1208  // section 7.8.3, page 17 (note that we read only one decimal digit
1209  // if the value is 0).
1210  if (IsDecimalDigit(c0_) || kIsIdentifierStart.get(c0_))
1211    return Token::ILLEGAL;
1212
1213  return Token::NUMBER;
1214}
1215
1216
1217uc32 Scanner::ScanIdentifierUnicodeEscape() {
1218  Advance();
1219  if (c0_ != 'u') return unibrow::Utf8::kBadChar;
1220  Advance();
1221  uc32 c = ScanHexEscape('u', 4);
1222  // We do not allow a unicode escape sequence to start another
1223  // unicode escape sequence.
1224  if (c == '\\') return unibrow::Utf8::kBadChar;
1225  return c;
1226}
1227
1228
1229Token::Value Scanner::ScanIdentifier() {
1230  ASSERT(kIsIdentifierStart.get(c0_));
1231
1232  StartLiteral();
1233  KeywordMatcher keyword_match;
1234
1235  // Scan identifier start character.
1236  if (c0_ == '\\') {
1237    uc32 c = ScanIdentifierUnicodeEscape();
1238    // Only allow legal identifier start characters.
1239    if (!kIsIdentifierStart.get(c)) return Token::ILLEGAL;
1240    AddChar(c);
1241    keyword_match.Fail();
1242  } else {
1243    AddChar(c0_);
1244    keyword_match.AddChar(c0_);
1245    Advance();
1246  }
1247
1248  // Scan the rest of the identifier characters.
1249  while (kIsIdentifierPart.get(c0_)) {
1250    if (c0_ == '\\') {
1251      uc32 c = ScanIdentifierUnicodeEscape();
1252      // Only allow legal identifier part characters.
1253      if (!kIsIdentifierPart.get(c)) return Token::ILLEGAL;
1254      AddChar(c);
1255      keyword_match.Fail();
1256    } else {
1257      AddChar(c0_);
1258      keyword_match.AddChar(c0_);
1259      Advance();
1260    }
1261  }
1262  TerminateLiteral();
1263
1264  return keyword_match.token();
1265}
1266
1267
1268
1269bool Scanner::IsIdentifier(unibrow::CharacterStream* buffer) {
1270  // Checks whether the buffer contains an identifier (no escape).
1271  if (!buffer->has_more()) return false;
1272  if (!kIsIdentifierStart.get(buffer->GetNext())) return false;
1273  while (buffer->has_more()) {
1274    if (!kIsIdentifierPart.get(buffer->GetNext())) return false;
1275  }
1276  return true;
1277}
1278
1279
1280bool Scanner::ScanRegExpPattern(bool seen_equal) {
1281  // Scan: ('/' | '/=') RegularExpressionBody '/' RegularExpressionFlags
1282  bool in_character_class = false;
1283
1284  // Previous token is either '/' or '/=', in the second case, the
1285  // pattern starts at =.
1286  next_.location.beg_pos = source_pos() - (seen_equal ? 2 : 1);
1287  next_.location.end_pos = source_pos() - (seen_equal ? 1 : 0);
1288
1289  // Scan regular expression body: According to ECMA-262, 3rd, 7.8.5,
1290  // the scanner should pass uninterpreted bodies to the RegExp
1291  // constructor.
1292  StartLiteral();
1293  if (seen_equal)
1294    AddChar('=');
1295
1296  while (c0_ != '/' || in_character_class) {
1297    if (kIsLineTerminator.get(c0_) || c0_ < 0)
1298      return false;
1299    if (c0_ == '\\') {  // escaped character
1300      AddCharAdvance();
1301      if (kIsLineTerminator.get(c0_) || c0_ < 0)
1302        return false;
1303      AddCharAdvance();
1304    } else {  // unescaped character
1305      if (c0_ == '[')
1306        in_character_class = true;
1307      if (c0_ == ']')
1308        in_character_class = false;
1309      AddCharAdvance();
1310    }
1311  }
1312  Advance();  // consume '/'
1313
1314  TerminateLiteral();
1315
1316  return true;
1317}
1318
1319bool Scanner::ScanRegExpFlags() {
1320  // Scan regular expression flags.
1321  StartLiteral();
1322  while (kIsIdentifierPart.get(c0_)) {
1323    if (c0_ == '\\') {
1324      uc32 c = ScanIdentifierUnicodeEscape();
1325      if (c != static_cast<uc32>(unibrow::Utf8::kBadChar)) {
1326        // We allow any escaped character, unlike the restriction on
1327        // IdentifierPart when it is used to build an IdentifierName.
1328        AddChar(c);
1329        continue;
1330      }
1331    }
1332    AddCharAdvance();
1333  }
1334  TerminateLiteral();
1335
1336  next_.location.end_pos = source_pos() - 1;
1337  return true;
1338}
1339
1340} }  // namespace v8::internal
1341