1// Copyright (c) 2013 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "tools/gn/tokenizer.h" 6 7#include "base/logging.h" 8#include "tools/gn/input_file.h" 9 10namespace { 11 12bool IsNumberChar(char c) { 13 return c == '-' || (c >= '0' && c <= '9'); 14} 15 16bool CouldBeTwoCharOperatorBegin(char c) { 17 return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' || 18 c == '+' || c == '|' || c == '&'; 19} 20 21bool CouldBeTwoCharOperatorEnd(char c) { 22 return c == '=' || c == '|' || c == '&'; 23} 24 25bool CouldBeOneCharOperator(char c) { 26 return c == '=' || c == '<' || c == '>' || c == '+' || c == '!' || 27 c == ':' || c == '|' || c == '&' || c == '-'; 28} 29 30bool CouldBeOperator(char c) { 31 return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c); 32} 33 34bool IsSeparatorChar(char c) { 35 return c == ','; 36} 37 38bool IsScoperChar(char c) { 39 return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}'; 40} 41 42} // namespace 43 44Tokenizer::Tokenizer(const InputFile* input_file, Err* err) 45 : input_file_(input_file), 46 input_(input_file->contents()), 47 err_(err), 48 cur_(0), 49 line_number_(1), 50 char_in_line_(1) { 51} 52 53Tokenizer::~Tokenizer() { 54} 55 56// static 57std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) { 58 Tokenizer t(input_file, err); 59 return t.Run(); 60} 61 62std::vector<Token> Tokenizer::Run() { 63 std::vector<Token> tokens; 64 while (!done()) { 65 AdvanceToNextToken(); 66 if (done()) 67 break; 68 Location location = GetCurrentLocation(); 69 70 Token::Type type = ClassifyCurrent(); 71 if (type == Token::INVALID) { 72 *err_ = GetErrorForInvalidToken(location); 73 break; 74 } 75 size_t token_begin = cur_; 76 AdvanceToEndOfToken(location, type); 77 if (has_error()) 78 break; 79 size_t token_end = cur_; 80 81 // TODO(brettw) This just strips comments from the token stream. This 82 // is probably wrong, they should be removed at a later stage so we can 83 // do things like rewrite the file. But this makes the parser simpler and 84 // is OK for now. 85 if (type != Token::COMMENT) { 86 tokens.push_back(Token( 87 location, 88 type, 89 base::StringPiece(&input_.data()[token_begin], 90 token_end - token_begin))); 91 } 92 } 93 if (err_->has_error()) 94 tokens.clear(); 95 return tokens; 96} 97 98// static 99size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) { 100 int cur_line = 1; 101 size_t cur_byte = 0; 102 103 DCHECK(n > 0); 104 105 if (n == 1) 106 return 0; 107 108 while (cur_byte < buf.size()) { 109 if (IsNewline(buf, cur_byte)) { 110 cur_line++; 111 if (cur_line == n) 112 return cur_byte + 1; 113 } 114 cur_byte++; 115 } 116 return -1; 117} 118 119// static 120bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) { 121 DCHECK(offset < buffer.size()); 122 // We may need more logic here to handle different line ending styles. 123 return buffer[offset] == '\n'; 124} 125 126 127void Tokenizer::AdvanceToNextToken() { 128 while (!at_end() && IsCurrentWhitespace()) 129 Advance(); 130} 131 132Token::Type Tokenizer::ClassifyCurrent() const { 133 DCHECK(!at_end()); 134 char next_char = cur_char(); 135 if (next_char >= '0' && next_char <= '9') 136 return Token::INTEGER; 137 if (next_char == '"') 138 return Token::STRING; 139 140 // Note: '-' handled specially below. 141 if (next_char != '-' && CouldBeOperator(next_char)) 142 return Token::OPERATOR; 143 144 if (IsIdentifierFirstChar(next_char)) 145 return Token::IDENTIFIER; 146 147 if (IsScoperChar(next_char)) 148 return Token::SCOPER; 149 150 if (IsSeparatorChar(next_char)) 151 return Token::SEPARATOR; 152 153 if (next_char == '#') 154 return Token::COMMENT; 155 156 // For the case of '-' differentiate between a negative number and anything 157 // else. 158 if (next_char == '-') { 159 if (!CanIncrement()) 160 return Token::OPERATOR; // Just the minus before end of file. 161 char following_char = input_[cur_ + 1]; 162 if (following_char >= '0' && following_char <= '9') 163 return Token::INTEGER; 164 return Token::OPERATOR; 165 } 166 167 return Token::INVALID; 168} 169 170void Tokenizer::AdvanceToEndOfToken(const Location& location, 171 Token::Type type) { 172 switch (type) { 173 case Token::INTEGER: 174 do { 175 Advance(); 176 } while (!at_end() && IsNumberChar(cur_char())); 177 if (!at_end()) { 178 // Require the char after a number to be some kind of space, scope, 179 // or operator. 180 char c = cur_char(); 181 if (!IsCurrentWhitespace() && !CouldBeOperator(c) && 182 !IsScoperChar(c) && !IsSeparatorChar(c)) { 183 *err_ = Err(GetCurrentLocation(), 184 "This is not a valid number.", 185 "Learn to count."); 186 // Highlight the number. 187 err_->AppendRange(LocationRange(location, GetCurrentLocation())); 188 } 189 } 190 break; 191 192 case Token::STRING: { 193 char initial = cur_char(); 194 Advance(); // Advance past initial " 195 for (;;) { 196 if (at_end()) { 197 *err_ = Err(LocationRange(location, 198 Location(input_file_, line_number_, char_in_line_)), 199 "Unterminated string literal.", 200 "Don't leave me hanging like this!"); 201 break; 202 } 203 if (IsCurrentStringTerminator(initial)) { 204 Advance(); // Skip past last " 205 break; 206 } else if (cur_char() == '\n') { 207 *err_ = Err(LocationRange(location, 208 GetCurrentLocation()), 209 "Newline in string constant."); 210 } 211 Advance(); 212 } 213 break; 214 } 215 216 case Token::OPERATOR: 217 // Some operators are two characters, some are one. 218 if (CouldBeTwoCharOperatorBegin(cur_char())) { 219 if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1])) 220 Advance(); 221 } 222 Advance(); 223 break; 224 225 case Token::IDENTIFIER: 226 while (!at_end() && IsIdentifierContinuingChar(cur_char())) 227 Advance(); 228 break; 229 230 case Token::SCOPER: 231 case Token::SEPARATOR: 232 Advance(); // All are one char. 233 break; 234 235 case Token::COMMENT: 236 // Eat to EOL. 237 while (!at_end() && !IsCurrentNewline()) 238 Advance(); 239 break; 240 241 case Token::INVALID: 242 *err_ = Err(location, "Everything is all messed up", 243 "Please insert system disk in drive A: and press any key."); 244 NOTREACHED(); 245 return; 246 } 247} 248 249bool Tokenizer::IsCurrentWhitespace() const { 250 DCHECK(!at_end()); 251 char c = input_[cur_]; 252 // Note that tab (0x09) is illegal. 253 return c == 0x0A || c == 0x0B || c == 0x0C || c == 0x0D || c == 0x20; 254} 255 256bool Tokenizer::IsCurrentStringTerminator(char quote_char) const { 257 DCHECK(!at_end()); 258 if (cur_char() != quote_char) 259 return false; 260 261 // Check for escaping. \" is not a string terminator, but \\" is. Count 262 // the number of preceeding backslashes. 263 int num_backslashes = 0; 264 for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--) 265 num_backslashes++; 266 267 // Even backslashes mean that they were escaping each other and don't count 268 // as escaping this quote. 269 return (num_backslashes % 2) == 0; 270} 271 272bool Tokenizer::IsCurrentNewline() const { 273 return IsNewline(input_, cur_); 274} 275 276void Tokenizer::Advance() { 277 DCHECK(cur_ < input_.size()); 278 if (IsCurrentNewline()) { 279 line_number_++; 280 char_in_line_ = 1; 281 } else { 282 char_in_line_++; 283 } 284 cur_++; 285} 286 287Location Tokenizer::GetCurrentLocation() const { 288 return Location(input_file_, line_number_, char_in_line_); 289} 290 291Err Tokenizer::GetErrorForInvalidToken(const Location& location) const { 292 std::string help; 293 if (cur_char() == ';') { 294 // Semicolon. 295 help = "Semicolons are not needed, delete this one."; 296 } else if (cur_char() == '\t') { 297 // Tab. 298 help = "You got a tab character in here. Tabs are evil. " 299 "Convert to spaces."; 300 } else if (cur_char() == '/' && cur_ + 1 < input_.size() && 301 (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) { 302 // Different types of comments. 303 help = "Comments should start with # instead"; 304 } else { 305 help = "I have no idea what this is."; 306 } 307 308 return Err(location, "Invalid token.", help); 309} 310