1d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch// Copyright (c) 2013 The Chromium Authors. All rights reserved.
2d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch// Use of this source code is governed by a BSD-style license that can be
3d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch// found in the LICENSE file.
4d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
5d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch#include "tools/gn/tokenizer.h"
6d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
7d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch#include "base/logging.h"
81320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci#include "base/strings/string_util.h"
9d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch#include "tools/gn/input_file.h"
10d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
11d3868032626d59662ff73b372b5d584c1d144c53Ben Murdochnamespace {
12d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
// Returns true if |c| is a character that may start a two-character operator.
bool CouldBeTwoCharOperatorBegin(char c) {
  switch (c) {
    case '<':
    case '>':
    case '!':
    case '=':
    case '-':
    case '+':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
17d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
// Returns true if |c| is a character that may be the second character of a
// two-character operator.
bool CouldBeTwoCharOperatorEnd(char c) {
  switch (c) {
    case '=':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
21d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
// Returns true if |c| is a character that is accepted as a candidate for a
// single-character operator.
bool CouldBeOneCharOperator(char c) {
  switch (c) {
    case '=':
    case '<':
    case '>':
    case '+':
    case '!':
    case ':':
    case '|':
    case '&':
    case '-':
      return true;
    default:
      return false;
  }
}
26d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
27d3868032626d59662ff73b372b5d584c1d144c53Ben Murdochbool CouldBeOperator(char c) {
28d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c);
29d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch}
30d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
// Returns true if |c| is an opening or closing parenthesis, bracket or brace.
bool IsScoperChar(char c) {
  switch (c) {
    case '(':
    case ')':
    case '[':
    case ']':
    case '{':
    case '}':
      return true;
    default:
      return false;
  }
}
34d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
353551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)Token::Type GetSpecificOperatorType(base::StringPiece value) {
363551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  if (value == "=")
373551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    return Token::EQUAL;
383551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  if (value == "+")
393551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    return Token::PLUS;
403551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  if (value == "-")
413551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    return Token::MINUS;
423551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  if (value == "+=")
433551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    return Token::PLUS_EQUALS;
443551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  if (value == "-=")
453551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    return Token::MINUS_EQUALS;
463551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  if (value == "==")
473551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    return Token::EQUAL_EQUAL;
483551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  if (value == "!=")
493551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    return Token::NOT_EQUAL;
503551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  if (value == "<=")
513551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    return Token::LESS_EQUAL;
523551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  if (value == ">=")
533551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    return Token::GREATER_EQUAL;
543551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  if (value == "<")
553551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    return Token::LESS_THAN;
563551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  if (value == ">")
573551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    return Token::GREATER_THAN;
583551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  if (value == "&&")
593551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    return Token::BOOLEAN_AND;
603551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  if (value == "||")
613551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    return Token::BOOLEAN_OR;
623551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  if (value == "!")
633551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    return Token::BANG;
64effb81e5f8246d0db0270817048dc992db66e9fbBen Murdoch  if (value == ".")
65effb81e5f8246d0db0270817048dc992db66e9fbBen Murdoch    return Token::DOT;
663551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  return Token::INVALID;
673551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)}
683551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)
69d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch}  // namespace
70d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
71d3868032626d59662ff73b372b5d584c1d144c53Ben MurdochTokenizer::Tokenizer(const InputFile* input_file, Err* err)
72d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    : input_file_(input_file),
73d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      input_(input_file->contents()),
74d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      err_(err),
75d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      cur_(0),
76d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      line_number_(1),
77d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      char_in_line_(1) {
78d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch}
79d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
80d3868032626d59662ff73b372b5d584c1d144c53Ben MurdochTokenizer::~Tokenizer() {
81d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch}
82d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
83d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch// static
84d3868032626d59662ff73b372b5d584c1d144c53Ben Murdochstd::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
85d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  Tokenizer t(input_file, err);
86d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  return t.Run();
87d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch}
88d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
89d3868032626d59662ff73b372b5d584c1d144c53Ben Murdochstd::vector<Token> Tokenizer::Run() {
903551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  DCHECK(tokens_.empty());
91d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  while (!done()) {
92d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    AdvanceToNextToken();
93d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    if (done())
94d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      break;
95d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    Location location = GetCurrentLocation();
96d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
97d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    Token::Type type = ClassifyCurrent();
98d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    if (type == Token::INVALID) {
99d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      *err_ = GetErrorForInvalidToken(location);
100d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      break;
101d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    }
102d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    size_t token_begin = cur_;
103d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    AdvanceToEndOfToken(location, type);
104d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    if (has_error())
105d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      break;
106d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    size_t token_end = cur_;
107d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
1083551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    base::StringPiece token_value(&input_.data()[token_begin],
1093551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)                                  token_end - token_begin);
1103551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)
1111320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    if (type == Token::UNCLASSIFIED_OPERATOR) {
1123551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)      type = GetSpecificOperatorType(token_value);
1131320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    } else if (type == Token::IDENTIFIER) {
1143551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)      if (token_value == "if")
1153551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)        type = Token::IF;
1163551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)      else if (token_value == "else")
1173551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)        type = Token::ELSE;
1183551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)      else if (token_value == "true")
1193551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)        type = Token::TRUE_TOKEN;
1203551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)      else if (token_value == "false")
1213551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)        type = Token::FALSE_TOKEN;
1221320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    } else if (type == Token::UNCLASSIFIED_COMMENT) {
1231320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci      if (AtStartOfLine(token_begin) &&
1241320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci          // If it's a standalone comment, but is a continuation of a comment on
1251320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci          // a previous line, then instead make it a continued suffix comment.
1261320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci          (tokens_.empty() || tokens_.back().type() != Token::SUFFIX_COMMENT ||
1271320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci           tokens_.back().location().line_number() + 1 !=
1281320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci               location.line_number() ||
1291320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci           tokens_.back().location().char_offset() != location.char_offset())) {
1301320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        type = Token::LINE_COMMENT;
1311320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        Advance();  // The current \n.
1321320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        // If this comment is separated from the next syntax element, then we
1331320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        // want to tag it as a block comment. This will become a standalone
1341320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        // statement at the parser level to keep this comment separate, rather
1351320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        // than attached to the subsequent statement.
1361320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        while (!at_end() && IsCurrentWhitespace()) {
1371320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci          if (IsCurrentNewline()) {
1381320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci            type = Token::BLOCK_COMMENT;
1391320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci            break;
1401320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci          }
1411320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci          Advance();
1421320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        }
1431320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci      } else {
1441320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci        type = Token::SUFFIX_COMMENT;
1451320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci      }
1463551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    }
1473551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)
1481320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    tokens_.push_back(Token(location, type, token_value));
149d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  }
150d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  if (err_->has_error())
1513551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    tokens_.clear();
1523551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  return tokens_;
153d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch}
154d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
155d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch// static
156d3868032626d59662ff73b372b5d584c1d144c53Ben Murdochsize_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
157116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  DCHECK_GT(n, 0);
158d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
159d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  if (n == 1)
160d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    return 0;
161d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
162116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  int cur_line = 1;
163116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  size_t cur_byte = 0;
164d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  while (cur_byte < buf.size()) {
165d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    if (IsNewline(buf, cur_byte)) {
166d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      cur_line++;
167d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      if (cur_line == n)
168d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch        return cur_byte + 1;
169d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    }
170d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    cur_byte++;
171d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  }
172116680a4aac90f2aa7413d9095a592090648e557Ben Murdoch  return static_cast<size_t>(-1);
173d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch}
174d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
175d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch// static
176d3868032626d59662ff73b372b5d584c1d144c53Ben Murdochbool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
177d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  DCHECK(offset < buffer.size());
178d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  // We may need more logic here to handle different line ending styles.
179d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  return buffer[offset] == '\n';
180d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch}
181d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
182d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
183d3868032626d59662ff73b372b5d584c1d144c53Ben Murdochvoid Tokenizer::AdvanceToNextToken() {
184d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  while (!at_end() && IsCurrentWhitespace())
185d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    Advance();
186d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch}
187d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
188d3868032626d59662ff73b372b5d584c1d144c53Ben MurdochToken::Type Tokenizer::ClassifyCurrent() const {
189d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  DCHECK(!at_end());
190d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  char next_char = cur_char();
191cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)  if (IsAsciiDigit(next_char))
192d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    return Token::INTEGER;
193d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  if (next_char == '"')
194d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    return Token::STRING;
195d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
196d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  // Note: '-' handled specially below.
197d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  if (next_char != '-' && CouldBeOperator(next_char))
1983551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    return Token::UNCLASSIFIED_OPERATOR;
199d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
200d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  if (IsIdentifierFirstChar(next_char))
201d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    return Token::IDENTIFIER;
202d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
2033551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  if (next_char == '[')
2043551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    return Token::LEFT_BRACKET;
2053551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  if (next_char == ']')
2063551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    return Token::RIGHT_BRACKET;
2073551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  if (next_char == '(')
2083551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    return Token::LEFT_PAREN;
2093551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  if (next_char == ')')
2103551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    return Token::RIGHT_PAREN;
2113551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  if (next_char == '{')
2123551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    return Token::LEFT_BRACE;
2133551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  if (next_char == '}')
2143551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    return Token::RIGHT_BRACE;
2153551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)
216effb81e5f8246d0db0270817048dc992db66e9fbBen Murdoch  if (next_char == '.')
217effb81e5f8246d0db0270817048dc992db66e9fbBen Murdoch    return Token::DOT;
2183551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  if (next_char == ',')
2193551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    return Token::COMMA;
220d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
221d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  if (next_char == '#')
2221320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    return Token::UNCLASSIFIED_COMMENT;
223d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
224d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  // For the case of '-' differentiate between a negative number and anything
225d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  // else.
226d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  if (next_char == '-') {
227d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    if (!CanIncrement())
2283551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)      return Token::UNCLASSIFIED_OPERATOR;  // Just the minus before end of
2293551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)                                            // file.
230d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    char following_char = input_[cur_ + 1];
231cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)    if (IsAsciiDigit(following_char))
232d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      return Token::INTEGER;
2333551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    return Token::UNCLASSIFIED_OPERATOR;
234d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  }
235d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
236d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  return Token::INVALID;
237d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch}
238d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
239d3868032626d59662ff73b372b5d584c1d144c53Ben Murdochvoid Tokenizer::AdvanceToEndOfToken(const Location& location,
240d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch                                    Token::Type type) {
241d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  switch (type) {
242d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    case Token::INTEGER:
243d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      do {
244d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch        Advance();
245cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)      } while (!at_end() && IsAsciiDigit(cur_char()));
246d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      if (!at_end()) {
247d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch        // Require the char after a number to be some kind of space, scope,
248d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch        // or operator.
249d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch        char c = cur_char();
250d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch        if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
2513551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)            !IsScoperChar(c) && c != ',') {
252d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch          *err_ = Err(GetCurrentLocation(),
253cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)                      "This is not a valid number.",
254cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)                      "Learn to count.");
255d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch          // Highlight the number.
256d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch          err_->AppendRange(LocationRange(location, GetCurrentLocation()));
257d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch        }
258d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      }
259d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      break;
260d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
261d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    case Token::STRING: {
262d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      char initial = cur_char();
263d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      Advance();  // Advance past initial "
264d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      for (;;) {
265d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch        if (at_end()) {
266cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)          *err_ = Err(LocationRange(location, GetCurrentLocation()),
267cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)                      "Unterminated string literal.",
268cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)                      "Don't leave me hanging like this!");
269d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch          break;
270d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch        }
271d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch        if (IsCurrentStringTerminator(initial)) {
272d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch          Advance();  // Skip past last "
273d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch          break;
274d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch        } else if (cur_char() == '\n') {
275cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)          *err_ = Err(LocationRange(location, GetCurrentLocation()),
276cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)                      "Newline in string constant.");
277d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch        }
278d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch        Advance();
279d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      }
280d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      break;
281d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    }
282d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
2833551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    case Token::UNCLASSIFIED_OPERATOR:
284d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      // Some operators are two characters, some are one.
285d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      if (CouldBeTwoCharOperatorBegin(cur_char())) {
286d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch        if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
287d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch          Advance();
288d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      }
289d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      Advance();
290d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      break;
291d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
292d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    case Token::IDENTIFIER:
293d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      while (!at_end() && IsIdentifierContinuingChar(cur_char()))
294d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch        Advance();
295d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      break;
296d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
2973551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    case Token::LEFT_BRACKET:
2983551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    case Token::RIGHT_BRACKET:
2993551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    case Token::LEFT_BRACE:
3003551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    case Token::RIGHT_BRACE:
3013551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    case Token::LEFT_PAREN:
3023551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    case Token::RIGHT_PAREN:
303effb81e5f8246d0db0270817048dc992db66e9fbBen Murdoch    case Token::DOT:
3043551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    case Token::COMMA:
305d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      Advance();  // All are one char.
306d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      break;
307d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
3081320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    case Token::UNCLASSIFIED_COMMENT:
309d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      // Eat to EOL.
310d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      while (!at_end() && !IsCurrentNewline())
311d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch        Advance();
312d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      break;
313d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
314d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    case Token::INVALID:
3153551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)    default:
316d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      *err_ = Err(location, "Everything is all messed up",
317d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch                  "Please insert system disk in drive A: and press any key.");
318d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      NOTREACHED();
319d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      return;
320d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  }
321d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch}
322d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
3231320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tuccibool Tokenizer::AtStartOfLine(size_t location) const {
3241320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  while (location > 0) {
3251320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    --location;
3261320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    char c = input_[location];
3271320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    if (c == '\n')
3281320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci      return true;
3291320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci    if (c != ' ')
3301320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci      return false;
3311320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  }
3321320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  return true;
3331320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci}
3341320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci
335d3868032626d59662ff73b372b5d584c1d144c53Ben Murdochbool Tokenizer::IsCurrentWhitespace() const {
336d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  DCHECK(!at_end());
337d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  char c = input_[cur_];
3381320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  // Note that tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are illegal.
3391320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  return c == 0x0A || c == 0x0D || c == 0x20;
340d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch}
341d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
342d3868032626d59662ff73b372b5d584c1d144c53Ben Murdochbool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
343d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  DCHECK(!at_end());
344d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  if (cur_char() != quote_char)
345d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    return false;
346d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
347d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  // Check for escaping. \" is not a string terminator, but \\" is. Count
348d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  // the number of preceeding backslashes.
349d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  int num_backslashes = 0;
350d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
351d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    num_backslashes++;
352d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
353d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  // Even backslashes mean that they were escaping each other and don't count
354d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  // as escaping this quote.
355d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  return (num_backslashes % 2) == 0;
356d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch}
357d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
358d3868032626d59662ff73b372b5d584c1d144c53Ben Murdochbool Tokenizer::IsCurrentNewline() const {
359d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  return IsNewline(input_, cur_);
360d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch}
361d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
362d3868032626d59662ff73b372b5d584c1d144c53Ben Murdochvoid Tokenizer::Advance() {
363d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  DCHECK(cur_ < input_.size());
364d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  if (IsCurrentNewline()) {
365d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    line_number_++;
366d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    char_in_line_ = 1;
367d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  } else {
368d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    char_in_line_++;
369d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  }
370d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  cur_++;
371d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch}
372d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
373d3868032626d59662ff73b372b5d584c1d144c53Ben MurdochLocation Tokenizer::GetCurrentLocation() const {
3741320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  return Location(
3751320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci      input_file_, line_number_, char_in_line_, static_cast<int>(cur_));
376d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch}
377d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
378d3868032626d59662ff73b372b5d584c1d144c53Ben MurdochErr Tokenizer::GetErrorForInvalidToken(const Location& location) const {
379d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  std::string help;
380d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  if (cur_char() == ';') {
381d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    // Semicolon.
382d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    help = "Semicolons are not needed, delete this one.";
383d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  } else if (cur_char() == '\t') {
384d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    // Tab.
385d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    help = "You got a tab character in here. Tabs are evil. "
386d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch           "Convert to spaces.";
387d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  } else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
388d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch      (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) {
389d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    // Different types of comments.
390d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    help = "Comments should start with # instead";
391d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  } else {
392d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    help = "I have no idea what this is.";
393d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  }
394d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
395d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  return Err(location, "Invalid token.", help);
396d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch}
397