1// Copyright (c) 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "tools/gn/tokenizer.h"
6
7#include "base/logging.h"
8#include "tools/gn/input_file.h"
9
10namespace {
11
// Returns true if |c| may continue an integer literal, i.e. it is an ASCII
// decimal digit.
//
// The sign is deliberately not accepted: the first character of an integer
// token (a digit or a leading '-') is consumed unconditionally by
// AdvanceToEndOfToken() before this predicate is consulted. Accepting '-'
// here would make input such as "1-2" scan as a single malformed integer.
bool IsNumberChar(char c) {
  return c >= '0' && c <= '9';
}
15
// Reports whether |c| may be the first character of a two-character
// operator. The second character is checked separately.
bool CouldBeTwoCharOperatorBegin(char c) {
  switch (c) {
    case '<':
    case '>':
    case '!':
    case '=':
    case '-':
    case '+':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
20
// Reports whether |c| may be the second character of a two-character
// operator.
bool CouldBeTwoCharOperatorEnd(char c) {
  switch (c) {
    case '=':
    case '|':
    case '&':
      return true;
    default:
      return false;
  }
}
24
// Reports whether |c| on its own may form a single-character operator.
bool CouldBeOneCharOperator(char c) {
  switch (c) {
    case '=':
    case '<':
    case '>':
    case '+':
    case '!':
    case ':':
    case '|':
    case '&':
    case '-':
      return true;
    default:
      return false;
  }
}
29
30bool CouldBeOperator(char c) {
31  return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c);
32}
33
// Reports whether |c| is the list separator character (a comma).
bool IsSeparatorChar(char c) {
  const char kSeparator = ',';
  return c == kSeparator;
}
37
// Reports whether |c| is one of the bracketing characters: parentheses,
// square brackets, or curly braces (opening or closing).
bool IsScoperChar(char c) {
  switch (c) {
    case '(':
    case ')':
    case '[':
    case ']':
    case '{':
    case '}':
      return true;
    default:
      return false;
  }
}
41
42}  // namespace
43
// Sets up a tokenizer over the contents of |input_file|, which is
// dereferenced here to obtain the text to scan. Scanning starts at byte
// offset 0, reported as line 1, character 1. Errors found while tokenizing
// are written through |err|.
Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
    : input_file_(input_file),
      input_(input_file->contents()),
      err_(err),
      cur_(0),
      line_number_(1),
      char_in_line_(1) {
}
52
// The destructor body is empty; no explicit cleanup is performed here.
Tokenizer::~Tokenizer() {
}
55
56// static
57std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
58  Tokenizer t(input_file, err);
59  return t.Run();
60}
61
62std::vector<Token> Tokenizer::Run() {
63  std::vector<Token> tokens;
64  while (!done()) {
65    AdvanceToNextToken();
66    if (done())
67      break;
68    Location location = GetCurrentLocation();
69
70    Token::Type type = ClassifyCurrent();
71    if (type == Token::INVALID) {
72      *err_ = GetErrorForInvalidToken(location);
73      break;
74    }
75    size_t token_begin = cur_;
76    AdvanceToEndOfToken(location, type);
77    if (has_error())
78      break;
79    size_t token_end = cur_;
80
81    // TODO(brettw) This just strips comments from the token stream. This
82    // is probably wrong, they should be removed at a later stage so we can
83    // do things like rewrite the file. But this makes the parser simpler and
84    // is OK for now.
85    if (type != Token::COMMENT) {
86      tokens.push_back(Token(
87          location,
88          type,
89          base::StringPiece(&input_.data()[token_begin],
90                            token_end - token_begin)));
91    }
92  }
93  if (err_->has_error())
94    tokens.clear();
95  return tokens;
96}
97
98// static
99size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
100  int cur_line = 1;
101  size_t cur_byte = 0;
102
103  DCHECK(n > 0);
104
105  if (n == 1)
106    return 0;
107
108  while (cur_byte < buf.size()) {
109    if (IsNewline(buf, cur_byte)) {
110      cur_line++;
111      if (cur_line == n)
112        return cur_byte + 1;
113    }
114    cur_byte++;
115  }
116  return -1;
117}
118
119// static
120bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
121  DCHECK(offset < buffer.size());
122  // We may need more logic here to handle different line ending styles.
123  return buffer[offset] == '\n';
124}
125
126
127void Tokenizer::AdvanceToNextToken() {
128  while (!at_end() && IsCurrentWhitespace())
129    Advance();
130}
131
132Token::Type Tokenizer::ClassifyCurrent() const {
133  DCHECK(!at_end());
134  char next_char = cur_char();
135  if (next_char >= '0' && next_char <= '9')
136    return Token::INTEGER;
137  if (next_char == '"')
138    return Token::STRING;
139
140  // Note: '-' handled specially below.
141  if (next_char != '-' && CouldBeOperator(next_char))
142    return Token::OPERATOR;
143
144  if (IsIdentifierFirstChar(next_char))
145    return Token::IDENTIFIER;
146
147  if (IsScoperChar(next_char))
148    return Token::SCOPER;
149
150  if (IsSeparatorChar(next_char))
151    return Token::SEPARATOR;
152
153  if (next_char == '#')
154    return Token::COMMENT;
155
156  // For the case of '-' differentiate between a negative number and anything
157  // else.
158  if (next_char == '-') {
159    if (!CanIncrement())
160      return Token::OPERATOR;  // Just the minus before end of file.
161    char following_char = input_[cur_ + 1];
162    if (following_char >= '0' && following_char <= '9')
163      return Token::INTEGER;
164    return Token::OPERATOR;
165  }
166
167  return Token::INVALID;
168}
169
170void Tokenizer::AdvanceToEndOfToken(const Location& location,
171                                    Token::Type type) {
172  switch (type) {
173    case Token::INTEGER:
174      do {
175        Advance();
176      } while (!at_end() && IsNumberChar(cur_char()));
177      if (!at_end()) {
178        // Require the char after a number to be some kind of space, scope,
179        // or operator.
180        char c = cur_char();
181        if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
182            !IsScoperChar(c) && !IsSeparatorChar(c)) {
183          *err_ = Err(GetCurrentLocation(),
184              "This is not a valid number.",
185              "Learn to count.");
186          // Highlight the number.
187          err_->AppendRange(LocationRange(location, GetCurrentLocation()));
188        }
189      }
190      break;
191
192    case Token::STRING: {
193      char initial = cur_char();
194      Advance();  // Advance past initial "
195      for (;;) {
196        if (at_end()) {
197          *err_ = Err(LocationRange(location,
198                          Location(input_file_, line_number_, char_in_line_)),
199                     "Unterminated string literal.",
200                     "Don't leave me hanging like this!");
201          break;
202        }
203        if (IsCurrentStringTerminator(initial)) {
204          Advance();  // Skip past last "
205          break;
206        } else if (cur_char() == '\n') {
207          *err_ = Err(LocationRange(location,
208                                   GetCurrentLocation()),
209                     "Newline in string constant.");
210        }
211        Advance();
212      }
213      break;
214    }
215
216    case Token::OPERATOR:
217      // Some operators are two characters, some are one.
218      if (CouldBeTwoCharOperatorBegin(cur_char())) {
219        if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
220          Advance();
221      }
222      Advance();
223      break;
224
225    case Token::IDENTIFIER:
226      while (!at_end() && IsIdentifierContinuingChar(cur_char()))
227        Advance();
228      break;
229
230    case Token::SCOPER:
231    case Token::SEPARATOR:
232      Advance();  // All are one char.
233      break;
234
235    case Token::COMMENT:
236      // Eat to EOL.
237      while (!at_end() && !IsCurrentNewline())
238        Advance();
239      break;
240
241    case Token::INVALID:
242      *err_ = Err(location, "Everything is all messed up",
243                  "Please insert system disk in drive A: and press any key.");
244      NOTREACHED();
245      return;
246  }
247}
248
249bool Tokenizer::IsCurrentWhitespace() const {
250  DCHECK(!at_end());
251  char c = input_[cur_];
252  // Note that tab (0x09) is illegal.
253  return c == 0x0A || c == 0x0B || c == 0x0C || c == 0x0D || c == 0x20;
254}
255
256bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
257  DCHECK(!at_end());
258  if (cur_char() != quote_char)
259    return false;
260
261  // Check for escaping. \" is not a string terminator, but \\" is. Count
262  // the number of preceeding backslashes.
263  int num_backslashes = 0;
264  for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--)
265    num_backslashes++;
266
267  // Even backslashes mean that they were escaping each other and don't count
268  // as escaping this quote.
269  return (num_backslashes % 2) == 0;
270}
271
272bool Tokenizer::IsCurrentNewline() const {
273  return IsNewline(input_, cur_);
274}
275
276void Tokenizer::Advance() {
277  DCHECK(cur_ < input_.size());
278  if (IsCurrentNewline()) {
279    line_number_++;
280    char_in_line_ = 1;
281  } else {
282    char_in_line_++;
283  }
284  cur_++;
285}
286
287Location Tokenizer::GetCurrentLocation() const {
288  return Location(input_file_, line_number_, char_in_line_);
289}
290
291Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
292  std::string help;
293  if (cur_char() == ';') {
294    // Semicolon.
295    help = "Semicolons are not needed, delete this one.";
296  } else if (cur_char() == '\t') {
297    // Tab.
298    help = "You got a tab character in here. Tabs are evil. "
299           "Convert to spaces.";
300  } else if (cur_char() == '/' && cur_ + 1 < input_.size() &&
301      (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) {
302    // Different types of comments.
303    help = "Comments should start with # instead";
304  } else {
305    help = "I have no idea what this is.";
306  }
307
308  return Err(location, "Invalid token.", help);
309}
310