1// Copyright (c) 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "tools/gn/tokenizer.h"
6
7#include "base/logging.h"
8#include "base/strings/string_util.h"
9#include "tools/gn/input_file.h"
10
11namespace {
12
// Returns true when |c| may be the first character of a two-character
// operator (for example the '<' of "<=" or the '&' of "&&").
bool CouldBeTwoCharOperatorBegin(char c) {
  switch (c) {
    case '<':
    case '>':
    case '!':
    case '=':
    case '-':
    case '+':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
17
// Returns true when |c| may be the second character of a two-character
// operator: '=', '|' or '&'.
bool CouldBeTwoCharOperatorEnd(char c) {
  switch (c) {
    case '=':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
21
// Returns true when |c| is one of the characters accepted here as a possible
// single-character operator.
bool CouldBeOneCharOperator(char c) {
  switch (c) {
    case '=':
    case '<':
    case '>':
    case '+':
    case '!':
    case ':':
    case '|':
    case '&':
    case '-':
      return true;
    default:
      return false;
  }
}
26
27bool CouldBeOperator(char c) {
28  return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c);
29}
30
// Returns true when |c| is an opening or closing parenthesis, square bracket,
// or curly brace.
bool IsScoperChar(char c) {
  switch (c) {
    case '(':
    case ')':
    case '[':
    case ']':
    case '{':
    case '}':
      return true;
    default:
      return false;
  }
}
34
35Token::Type GetSpecificOperatorType(base::StringPiece value) {
36  if (value == "=")
37    return Token::EQUAL;
38  if (value == "+")
39    return Token::PLUS;
40  if (value == "-")
41    return Token::MINUS;
42  if (value == "+=")
43    return Token::PLUS_EQUALS;
44  if (value == "-=")
45    return Token::MINUS_EQUALS;
46  if (value == "==")
47    return Token::EQUAL_EQUAL;
48  if (value == "!=")
49    return Token::NOT_EQUAL;
50  if (value == "<=")
51    return Token::LESS_EQUAL;
52  if (value == ">=")
53    return Token::GREATER_EQUAL;
54  if (value == "<")
55    return Token::LESS_THAN;
56  if (value == ">")
57    return Token::GREATER_THAN;
58  if (value == "&&")
59    return Token::BOOLEAN_AND;
60  if (value == "||")
61    return Token::BOOLEAN_OR;
62  if (value == "!")
63    return Token::BANG;
64  if (value == ".")
65    return Token::DOT;
66  return Token::INVALID;
67}
68
69}  // namespace
70
71Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
72    : input_file_(input_file),
73      input_(input_file->contents()),
74      err_(err),
75      cur_(0),
76      line_number_(1),
77      char_in_line_(1) {
78}
79
80Tokenizer::~Tokenizer() {
81}
82
83// static
84std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
85  Tokenizer t(input_file, err);
86  return t.Run();
87}
88
89std::vector<Token> Tokenizer::Run() {
90  DCHECK(tokens_.empty());
91  while (!done()) {
92    AdvanceToNextToken();
93    if (done())
94      break;
95    Location location = GetCurrentLocation();
96
97    Token::Type type = ClassifyCurrent();
98    if (type == Token::INVALID) {
99      *err_ = GetErrorForInvalidToken(location);
100      break;
101    }
102    size_t token_begin = cur_;
103    AdvanceToEndOfToken(location, type);
104    if (has_error())
105      break;
106    size_t token_end = cur_;
107
108    base::StringPiece token_value(&input_.data()[token_begin],
109                                  token_end - token_begin);
110
111    if (type == Token::UNCLASSIFIED_OPERATOR) {
112      type = GetSpecificOperatorType(token_value);
113    } else if (type == Token::IDENTIFIER) {
114      if (token_value == "if")
115        type = Token::IF;
116      else if (token_value == "else")
117        type = Token::ELSE;
118      else if (token_value == "true")
119        type = Token::TRUE_TOKEN;
120      else if (token_value == "false")
121        type = Token::FALSE_TOKEN;
122    } else if (type == Token::UNCLASSIFIED_COMMENT) {
123      if (AtStartOfLine(token_begin) &&
124          // If it's a standalone comment, but is a continuation of a comment on
125          // a previous line, then instead make it a continued suffix comment.
126          (tokens_.empty() || tokens_.back().type() != Token::SUFFIX_COMMENT ||
127           tokens_.back().location().line_number() + 1 !=
128               location.line_number() ||
129           tokens_.back().location().char_offset() != location.char_offset())) {
130        type = Token::LINE_COMMENT;
131        Advance();  // The current \n.
132        // If this comment is separated from the next syntax element, then we
133        // want to tag it as a block comment. This will become a standalone
134        // statement at the parser level to keep this comment separate, rather
135        // than attached to the subsequent statement.
136        while (!at_end() && IsCurrentWhitespace()) {
137          if (IsCurrentNewline()) {
138            type = Token::BLOCK_COMMENT;
139            break;
140          }
141          Advance();
142        }
143      } else {
144        type = Token::SUFFIX_COMMENT;
145      }
146    }
147
148    tokens_.push_back(Token(location, type, token_value));
149  }
150  if (err_->has_error())
151    tokens_.clear();
152  return tokens_;
153}
154
155// static
156size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
157  DCHECK_GT(n, 0);
158
159  if (n == 1)
160    return 0;
161
162  int cur_line = 1;
163  size_t cur_byte = 0;
164  while (cur_byte < buf.size()) {
165    if (IsNewline(buf, cur_byte)) {
166      cur_line++;
167      if (cur_line == n)
168        return cur_byte + 1;
169    }
170    cur_byte++;
171  }
172  return static_cast<size_t>(-1);
173}
174
175// static
176bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
177  DCHECK(offset < buffer.size());
178  // We may need more logic here to handle different line ending styles.
179  return buffer[offset] == '\n';
180}
181
182
183void Tokenizer::AdvanceToNextToken() {
184  while (!at_end() && IsCurrentWhitespace())
185    Advance();
186}
187
188Token::Type Tokenizer::ClassifyCurrent() const {
189  DCHECK(!at_end());
190  char next_char = cur_char();
191  if (IsAsciiDigit(next_char))
192    return Token::INTEGER;
193  if (next_char == '"')
194    return Token::STRING;
195
196  // Note: '-' handled specially below.
197  if (next_char != '-' && CouldBeOperator(next_char))
198    return Token::UNCLASSIFIED_OPERATOR;
199
200  if (IsIdentifierFirstChar(next_char))
201    return Token::IDENTIFIER;
202
203  if (next_char == '[')
204    return Token::LEFT_BRACKET;
205  if (next_char == ']')
206    return Token::RIGHT_BRACKET;
207  if (next_char == '(')
208    return Token::LEFT_PAREN;
209  if (next_char == ')')
210    return Token::RIGHT_PAREN;
211  if (next_char == '{')
212    return Token::LEFT_BRACE;
213  if (next_char == '}')
214    return Token::RIGHT_BRACE;
215
216  if (next_char == '.')
217    return Token::DOT;
218  if (next_char == ',')
219    return Token::COMMA;
220
221  if (next_char == '#')
222    return Token::UNCLASSIFIED_COMMENT;
223
224  // For the case of '-' differentiate between a negative number and anything
225  // else.
226  if (next_char == '-') {
227    if (!CanIncrement())
228      return Token::UNCLASSIFIED_OPERATOR;  // Just the minus before end of
229                                            // file.
230    char following_char = input_[cur_ + 1];
231    if (IsAsciiDigit(following_char))
232      return Token::INTEGER;
233    return Token::UNCLASSIFIED_OPERATOR;
234  }
235
236  return Token::INVALID;
237}
238
239void Tokenizer::AdvanceToEndOfToken(const Location& location,
240                                    Token::Type type) {
241  switch (type) {
242    case Token::INTEGER:
243      do {
244        Advance();
245      } while (!at_end() && IsAsciiDigit(cur_char()));
246      if (!at_end()) {
247        // Require the char after a number to be some kind of space, scope,
248        // or operator.
249        char c = cur_char();
250        if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
251            !IsScoperChar(c) && c != ',') {
252          *err_ = Err(GetCurrentLocation(),
253                      "This is not a valid number.",
254                      "Learn to count.");
255          // Highlight the number.
256          err_->AppendRange(LocationRange(location, GetCurrentLocation()));
257        }
258      }
259      break;
260
261    case Token::STRING: {
262      char initial = cur_char();
263      Advance();  // Advance past initial "
264      for (;;) {
265        if (at_end()) {
266          *err_ = Err(LocationRange(location, GetCurrentLocation()),
267                      "Unterminated string literal.",
268                      "Don't leave me hanging like this!");
269          break;
270        }
271        if (IsCurrentStringTerminator(initial)) {
272          Advance();  // Skip past last "
273          break;
274        } else if (cur_char() == '\n') {
275          *err_ = Err(LocationRange(location, GetCurrentLocation()),
276                      "Newline in string constant.");
277        }
278        Advance();
279      }
280      break;
281    }
282
283    case Token::UNCLASSIFIED_OPERATOR:
284      // Some operators are two characters, some are one.
285      if (CouldBeTwoCharOperatorBegin(cur_char())) {
286        if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
287          Advance();
288      }
289      Advance();
290      break;
291
292    case Token::IDENTIFIER:
293      while (!at_end() && IsIdentifierContinuingChar(cur_char()))
294        Advance();
295      break;
296
297    case Token::LEFT_BRACKET:
298    case Token::RIGHT_BRACKET:
299    case Token::LEFT_BRACE:
300    case Token::RIGHT_BRACE:
301    case Token::LEFT_PAREN:
302    case Token::RIGHT_PAREN:
303    case Token::DOT:
304    case Token::COMMA:
305      Advance();  // All are one char.
306      break;
307
308    case Token::UNCLASSIFIED_COMMENT:
309      // Eat to EOL.
310      while (!at_end() && !IsCurrentNewline())
311        Advance();
312      break;
313
314    case Token::INVALID:
315    default:
316      *err_ = Err(location, "Everything is all messed up",
317                  "Please insert system disk in drive A: and press any key.");
318      NOTREACHED();
319      return;
320  }
321}
322
323bool Tokenizer::AtStartOfLine(size_t location) const {
324  while (location > 0) {
325    --location;
326    char c = input_[location];
327    if (c == '\n')
328      return true;
329    if (c != ' ')
330      return false;
331  }
332  return true;
333}
334
335bool Tokenizer::IsCurrentWhitespace() const {
336  DCHECK(!at_end());
337  char c = input_[cur_];
338  // Note that tab (0x09), vertical tab (0x0B), and formfeed (0x0C) are illegal.
339  return c == 0x0A || c == 0x0D || c == 0x20;
340}
341
342bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
343  DCHECK(!at_end());
344  if (cur_char() != quote_char)
345    return false;
346
347  // Check for escaping. \" is not a string terminator, but \\" is. Count
348  // the number of preceeding backslashes.
349  int num_backslashes = 0;
350  for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
351    num_backslashes++;
352
353  // Even backslashes mean that they were escaping each other and don't count
354  // as escaping this quote.
355  return (num_backslashes % 2) == 0;
356}
357
358bool Tokenizer::IsCurrentNewline() const {
359  return IsNewline(input_, cur_);
360}
361
362void Tokenizer::Advance() {
363  DCHECK(cur_ < input_.size());
364  if (IsCurrentNewline()) {
365    line_number_++;
366    char_in_line_ = 1;
367  } else {
368    char_in_line_++;
369  }
370  cur_++;
371}
372
373Location Tokenizer::GetCurrentLocation() const {
374  return Location(
375      input_file_, line_number_, char_in_line_, static_cast<int>(cur_));
376}
377
378Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
379  std::string help;
380  if (cur_char() == ';') {
381    // Semicolon.
382    help = "Semicolons are not needed, delete this one.";
383  } else if (cur_char() == '\t') {
384    // Tab.
385    help = "You got a tab character in here. Tabs are evil. "
386           "Convert to spaces.";
387  } else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
388      (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) {
389    // Different types of comments.
390    help = "Comments should start with # instead";
391  } else {
392    help = "I have no idea what this is.";
393  }
394
395  return Err(location, "Invalid token.", help);
396}
397