1d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch// Copyright (c) 2013 The Chromium Authors. All rights reserved. 2d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch// Use of this source code is governed by a BSD-style license that can be 3d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch// found in the LICENSE file. 4d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 5d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch#ifndef TOOLS_GN_TOKENIZER_H_ 6d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch#define TOOLS_GN_TOKENIZER_H_ 7d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 8d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch#include <vector> 9d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 10d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch#include "base/basictypes.h" 11d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch#include "base/strings/string_piece.h" 12cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles)#include "base/strings/string_util.h" 13d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch#include "tools/gn/err.h" 14d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch#include "tools/gn/token.h" 15d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 16d3868032626d59662ff73b372b5d584c1d144c53Ben Murdochclass InputFile; 17d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 18d3868032626d59662ff73b372b5d584c1d144c53Ben Murdochclass Tokenizer { 19d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch public: 20d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch static std::vector<Token> Tokenize(const InputFile* input_file, Err* err); 21d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 22d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch // Counts lines in the given buffer (the first line is "1") and returns 23d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch // the byte offset of the beginning of that line, or (size_t)-1 if there 24d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch // aren't that many lines in the file. Note that this will return the byte 25d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch // one past the end of the input if the last character is a newline. 26d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch // 27d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch // This is a helper function for error output so that the tokenizer's 28d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch // notion of lines can be used elsewhere. 29d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch static size_t ByteOffsetOfNthLine(const base::StringPiece& buf, int n); 30d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 31d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch // Returns true if the given offset of the string piece counts as a newline. 32d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch // The offset must be in the buffer. 33d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch static bool IsNewline(const base::StringPiece& buffer, size_t offset); 34d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 35d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch static bool IsIdentifierFirstChar(char c) { 36cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) return IsAsciiAlpha(c) || c == '_'; 37d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch } 38d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 39d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch static bool IsIdentifierContinuingChar(char c) { 40d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch // Also allow digits after the first char. 41cedac228d2dd51db4b79ea1e72c7f249408ee061Torne (Richard Coles) return IsIdentifierFirstChar(c) || IsAsciiDigit(c); 42d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch } 43d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 44d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch private: 45d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch // InputFile must outlive the tokenizer and all generated tokens. 46d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch explicit Tokenizer(const InputFile* input_file, Err* err); 47d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch ~Tokenizer(); 48d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 49d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch std::vector<Token> Run(); 50d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 51d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch void AdvanceToNextToken(); 52d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch Token::Type ClassifyCurrent() const; 53d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch void AdvanceToEndOfToken(const Location& location, Token::Type type); 54d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 551320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci // Whether from this location back to the beginning of the line is only 561320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci // whitespace. |location| should be the first character of the token to be 571320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci // checked. 581320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci bool AtStartOfLine(size_t location) const; 591320f92c476a1ad9d19dba2a48c72b75566198e9Primiano Tucci 60d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch bool IsCurrentWhitespace() const; 61d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch bool IsCurrentNewline() const; 62d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch bool IsCurrentStringTerminator(char quote_char) const; 63d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 64d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch bool CanIncrement() const { return cur_ < input_.size(); } 65d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 66d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch // Increments the current location by one. 67d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch void Advance(); 68d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 69d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch // Returns the current character in the file as a location. 70d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch Location GetCurrentLocation() const; 71d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 72d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch Err GetErrorForInvalidToken(const Location& location) const; 73d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 74d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch bool done() const { return at_end() || has_error(); } 75d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 76d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch bool at_end() const { return cur_ == input_.size(); } 77d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch char cur_char() const { return input_[cur_]; } 78d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 79d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch bool has_error() const { return err_->has_error(); } 80d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 813551c9c881056c480085172ff9840cab31610854Torne (Richard Coles) std::vector<Token> tokens_; 823551c9c881056c480085172ff9840cab31610854Torne (Richard Coles) 83d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch const InputFile* input_file_; 84d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch const base::StringPiece input_; 85d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch Err* err_; 86d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch size_t cur_; // Byte offset into input buffer. 87d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 88d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch int line_number_; 89d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch int char_in_line_; 90d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 91d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch DISALLOW_COPY_AND_ASSIGN(Tokenizer); 92d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch}; 93d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch 94d3868032626d59662ff73b372b5d584c1d144c53Ben Murdoch#endif // TOOLS_GN_TOKENIZER_H_ 95