1// Copyright (c) 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "tools/gn/tokenizer.h"
6
7#include "base/logging.h"
8#include "tools/gn/input_file.h"
9
10namespace {
11
// Returns true when |c| is an ASCII decimal digit ('0' through '9').
bool IsNumberChar(char c) {
  return '0' <= c && c <= '9';
}
15
// Returns true when |c| is a character that may start a two-character
// operator: one of < > ! = - + | &.
bool CouldBeTwoCharOperatorBegin(char c) {
  switch (c) {
    case '<':
    case '>':
    case '!':
    case '=':
    case '-':
    case '+':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
20
// Returns true when |c| is a character that may be the second character of a
// two-character operator: one of = | &.
bool CouldBeTwoCharOperatorEnd(char c) {
  switch (c) {
    case '=':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
24
// Returns true when |c| is one of the characters this tokenizer treats as a
// candidate single-character operator: = < > + ! : | & -.
bool CouldBeOneCharOperator(char c) {
  switch (c) {
    case '=':
    case '<':
    case '>':
    case '+':
    case '!':
    case ':':
    case '|':
    case '&':
    case '-':
      return true;
    default:
      return false;
  }
}
29
30bool CouldBeOperator(char c) {
31  return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c);
32}
33
// Returns true when |c| is a parenthesis, square bracket, or curly brace
// (opening or closing).
bool IsScoperChar(char c) {
  switch (c) {
    case '(':
    case ')':
    case '[':
    case ']':
    case '{':
    case '}':
      return true;
    default:
      return false;
  }
}
37
38Token::Type GetSpecificOperatorType(base::StringPiece value) {
39  if (value == "=")
40    return Token::EQUAL;
41  if (value == "+")
42    return Token::PLUS;
43  if (value == "-")
44    return Token::MINUS;
45  if (value == "+=")
46    return Token::PLUS_EQUALS;
47  if (value == "-=")
48    return Token::MINUS_EQUALS;
49  if (value == "==")
50    return Token::EQUAL_EQUAL;
51  if (value == "!=")
52    return Token::NOT_EQUAL;
53  if (value == "<=")
54    return Token::LESS_EQUAL;
55  if (value == ">=")
56    return Token::GREATER_EQUAL;
57  if (value == "<")
58    return Token::LESS_THAN;
59  if (value == ">")
60    return Token::GREATER_THAN;
61  if (value == "&&")
62    return Token::BOOLEAN_AND;
63  if (value == "||")
64    return Token::BOOLEAN_OR;
65  if (value == "!")
66    return Token::BANG;
67  return Token::INVALID;
68}
69
70}  // namespace
71
72Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
73    : input_file_(input_file),
74      input_(input_file->contents()),
75      err_(err),
76      cur_(0),
77      line_number_(1),
78      char_in_line_(1) {
79}
80
81Tokenizer::~Tokenizer() {
82}
83
84// static
85std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
86  Tokenizer t(input_file, err);
87  return t.Run();
88}
89
90std::vector<Token> Tokenizer::Run() {
91  DCHECK(tokens_.empty());
92  while (!done()) {
93    AdvanceToNextToken();
94    if (done())
95      break;
96    Location location = GetCurrentLocation();
97
98    Token::Type type = ClassifyCurrent();
99    if (type == Token::INVALID) {
100      *err_ = GetErrorForInvalidToken(location);
101      break;
102    }
103    size_t token_begin = cur_;
104    AdvanceToEndOfToken(location, type);
105    if (has_error())
106      break;
107    size_t token_end = cur_;
108
109    base::StringPiece token_value(&input_.data()[token_begin],
110                                  token_end - token_begin);
111
112    if (type == Token::UNCLASSIFIED_OPERATOR)
113      type = GetSpecificOperatorType(token_value);
114    if (type == Token::IDENTIFIER) {
115      if (token_value == "if")
116        type = Token::IF;
117      else if (token_value == "else")
118        type = Token::ELSE;
119      else if (token_value == "true")
120        type = Token::TRUE_TOKEN;
121      else if (token_value == "false")
122        type = Token::FALSE_TOKEN;
123    }
124
125    // TODO(brettw) This just strips comments from the token stream. This
126    // is probably wrong, they should be removed at a later stage so we can
127    // do things like rewrite the file. But this makes the parser simpler and
128    // is OK for now.
129    if (type != Token::COMMENT)
130      tokens_.push_back(Token(location, type, token_value));
131  }
132  if (err_->has_error())
133    tokens_.clear();
134  return tokens_;
135}
136
137// static
138size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
139  int cur_line = 1;
140  size_t cur_byte = 0;
141
142  DCHECK(n > 0);
143
144  if (n == 1)
145    return 0;
146
147  while (cur_byte < buf.size()) {
148    if (IsNewline(buf, cur_byte)) {
149      cur_line++;
150      if (cur_line == n)
151        return cur_byte + 1;
152    }
153    cur_byte++;
154  }
155  return -1;
156}
157
158// static
159bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
160  DCHECK(offset < buffer.size());
161  // We may need more logic here to handle different line ending styles.
162  return buffer[offset] == '\n';
163}
164
165
166void Tokenizer::AdvanceToNextToken() {
167  while (!at_end() && IsCurrentWhitespace())
168    Advance();
169}
170
171Token::Type Tokenizer::ClassifyCurrent() const {
172  DCHECK(!at_end());
173  char next_char = cur_char();
174  if (next_char >= '0' && next_char <= '9')
175    return Token::INTEGER;
176  if (next_char == '"')
177    return Token::STRING;
178
179  // Note: '-' handled specially below.
180  if (next_char != '-' && CouldBeOperator(next_char))
181    return Token::UNCLASSIFIED_OPERATOR;
182
183  if (IsIdentifierFirstChar(next_char))
184    return Token::IDENTIFIER;
185
186  if (next_char == '[')
187    return Token::LEFT_BRACKET;
188  if (next_char == ']')
189    return Token::RIGHT_BRACKET;
190  if (next_char == '(')
191    return Token::LEFT_PAREN;
192  if (next_char == ')')
193    return Token::RIGHT_PAREN;
194  if (next_char == '{')
195    return Token::LEFT_BRACE;
196  if (next_char == '}')
197    return Token::RIGHT_BRACE;
198
199  if (next_char == ',')
200    return Token::COMMA;
201
202  if (next_char == '#')
203    return Token::COMMENT;
204
205  // For the case of '-' differentiate between a negative number and anything
206  // else.
207  if (next_char == '-') {
208    if (!CanIncrement())
209      return Token::UNCLASSIFIED_OPERATOR;  // Just the minus before end of
210                                            // file.
211    char following_char = input_[cur_ + 1];
212    if (following_char >= '0' && following_char <= '9')
213      return Token::INTEGER;
214    return Token::UNCLASSIFIED_OPERATOR;
215  }
216
217  return Token::INVALID;
218}
219
220void Tokenizer::AdvanceToEndOfToken(const Location& location,
221                                    Token::Type type) {
222  switch (type) {
223    case Token::INTEGER:
224      do {
225        Advance();
226      } while (!at_end() && IsNumberChar(cur_char()));
227      if (!at_end()) {
228        // Require the char after a number to be some kind of space, scope,
229        // or operator.
230        char c = cur_char();
231        if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
232            !IsScoperChar(c) && c != ',') {
233          *err_ = Err(GetCurrentLocation(),
234              "This is not a valid number.",
235              "Learn to count.");
236          // Highlight the number.
237          err_->AppendRange(LocationRange(location, GetCurrentLocation()));
238        }
239      }
240      break;
241
242    case Token::STRING: {
243      char initial = cur_char();
244      Advance();  // Advance past initial "
245      for (;;) {
246        if (at_end()) {
247          *err_ = Err(LocationRange(location,
248                          Location(input_file_, line_number_, char_in_line_)),
249                     "Unterminated string literal.",
250                     "Don't leave me hanging like this!");
251          break;
252        }
253        if (IsCurrentStringTerminator(initial)) {
254          Advance();  // Skip past last "
255          break;
256        } else if (cur_char() == '\n') {
257          *err_ = Err(LocationRange(location,
258                                   GetCurrentLocation()),
259                     "Newline in string constant.");
260        }
261        Advance();
262      }
263      break;
264    }
265
266    case Token::UNCLASSIFIED_OPERATOR:
267      // Some operators are two characters, some are one.
268      if (CouldBeTwoCharOperatorBegin(cur_char())) {
269        if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
270          Advance();
271      }
272      Advance();
273      break;
274
275    case Token::IDENTIFIER:
276      while (!at_end() && IsIdentifierContinuingChar(cur_char()))
277        Advance();
278      break;
279
280    case Token::LEFT_BRACKET:
281    case Token::RIGHT_BRACKET:
282    case Token::LEFT_BRACE:
283    case Token::RIGHT_BRACE:
284    case Token::LEFT_PAREN:
285    case Token::RIGHT_PAREN:
286    case Token::COMMA:
287      Advance();  // All are one char.
288      break;
289
290    case Token::COMMENT:
291      // Eat to EOL.
292      while (!at_end() && !IsCurrentNewline())
293        Advance();
294      break;
295
296    case Token::INVALID:
297    default:
298      *err_ = Err(location, "Everything is all messed up",
299                  "Please insert system disk in drive A: and press any key.");
300      NOTREACHED();
301      return;
302  }
303}
304
305bool Tokenizer::IsCurrentWhitespace() const {
306  DCHECK(!at_end());
307  char c = input_[cur_];
308  // Note that tab (0x09) is illegal.
309  return c == 0x0A || c == 0x0B || c == 0x0C || c == 0x0D || c == 0x20;
310}
311
312bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
313  DCHECK(!at_end());
314  if (cur_char() != quote_char)
315    return false;
316
317  // Check for escaping. \" is not a string terminator, but \\" is. Count
318  // the number of preceeding backslashes.
319  int num_backslashes = 0;
320  for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
321    num_backslashes++;
322
323  // Even backslashes mean that they were escaping each other and don't count
324  // as escaping this quote.
325  return (num_backslashes % 2) == 0;
326}
327
328bool Tokenizer::IsCurrentNewline() const {
329  return IsNewline(input_, cur_);
330}
331
332void Tokenizer::Advance() {
333  DCHECK(cur_ < input_.size());
334  if (IsCurrentNewline()) {
335    line_number_++;
336    char_in_line_ = 1;
337  } else {
338    char_in_line_++;
339  }
340  cur_++;
341}
342
343Location Tokenizer::GetCurrentLocation() const {
344  return Location(input_file_, line_number_, char_in_line_);
345}
346
347Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
348  std::string help;
349  if (cur_char() == ';') {
350    // Semicolon.
351    help = "Semicolons are not needed, delete this one.";
352  } else if (cur_char() == '\t') {
353    // Tab.
354    help = "You got a tab character in here. Tabs are evil. "
355           "Convert to spaces.";
356  } else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
357      (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) {
358    // Different types of comments.
359    help = "Comments should start with # instead";
360  } else {
361    help = "I have no idea what this is.";
362  }
363
364  return Err(location, "Invalid token.", help);
365}
366