// Copyright (c) 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
5d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch#ifndef TOOLS_GN_TOKENIZER_H_
6d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch#define TOOLS_GN_TOKENIZER_H_
7d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
8d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch#include <vector>
9d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
10d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch#include "base/basictypes.h"
11d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch#include "base/strings/string_piece.h"
12cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)#include "base/strings/string_util.h"
13d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch#include "tools/gn/err.h"
14d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch#include "tools/gn/token.h"
15d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
16d3868032626d59662ff73b372b5d584c1d144c53Ben Murdochclass InputFile;
17d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
18d3868032626d59662ff73b372b5d584c1d144c53Ben Murdochclass Tokenizer {
19d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch public:
20d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  static std::vector<Token> Tokenize(const InputFile* input_file, Err* err);
21d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
22d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  // Counts lines in the given buffer (the first line is "1") and returns
23d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  // the byte offset of the beginning of that line, or (size_t)-1 if there
24d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  // aren't that many lines in the file. Note that this will return the byte
25d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  // one past the end of the input if the last character is a newline.
26d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  //
27d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  // This is a helper function for error output so that the tokenizer's
28d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  // notion of lines can be used elsewhere.
29d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  static size_t ByteOffsetOfNthLine(const base::StringPiece& buf, int n);
30d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
31d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  // Returns true if the given offset of the string piece counts as a newline.
32d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  // The offset must be in the buffer.
33d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  static bool IsNewline(const base::StringPiece& buffer, size_t offset);
34d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
35d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  static bool IsIdentifierFirstChar(char c) {
36cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)    return IsAsciiAlpha(c) || c == '_';
37d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  }
38d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
39d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  static bool IsIdentifierContinuingChar(char c) {
40d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch    // Also allow digits after the first char.
41cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)    return IsIdentifierFirstChar(c) || IsAsciiDigit(c);
42d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  }
43d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
44d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch private:
45d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  // InputFile must outlive the tokenizer and all generated tokens.
46d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  explicit Tokenizer(const InputFile* input_file, Err* err);
47d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  ~Tokenizer();
48d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
49d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  std::vector<Token> Run();
50d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
51d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  void AdvanceToNextToken();
52d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  Token::Type ClassifyCurrent() const;
53d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  void AdvanceToEndOfToken(const Location& location, Token::Type type);
54d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
551320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  // Whether from this location back to the beginning of the line is only
561320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  // whitespace. |location| should be the first character of the token to be
571320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  // checked.
581320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci  bool AtStartOfLine(size_t location) const;
591320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci
60d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  bool IsCurrentWhitespace() const;
61d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  bool IsCurrentNewline() const;
62d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  bool IsCurrentStringTerminator(char quote_char) const;
63d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
64d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  bool CanIncrement() const { return cur_ < input_.size(); }
65d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
66d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  // Increments the current location by one.
67d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  void Advance();
68d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
69d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  // Returns the current character in the file as a location.
70d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  Location GetCurrentLocation() const;
71d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
72d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  Err GetErrorForInvalidToken(const Location& location) const;
73d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
74d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  bool done() const { return at_end() || has_error(); }
75d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
76d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  bool at_end() const { return cur_ == input_.size(); }
77d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  char cur_char() const { return input_[cur_]; }
78d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
79d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  bool has_error() const { return err_->has_error(); }
80d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
813551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)  std::vector<Token> tokens_;
823551c9c881056c480085172ff9840cab31610854Torne (Richard Coles)
83d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  const InputFile* input_file_;
84d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  const base::StringPiece input_;
85d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  Err* err_;
86d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  size_t cur_;  // Byte offset into input buffer.
87d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
88d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  int line_number_;
89d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  int char_in_line_;
90d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
91d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch  DISALLOW_COPY_AND_ASSIGN(Tokenizer);
92d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch};
93d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch
94d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch#endif  // TOOLS_GN_TOKENIZER_H_
95