1// Copyright (c) 2013 The Chromium Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style license that can be 3// found in the LICENSE file. 4 5#include "tools/gn/tokenizer.h" 6 7#include "base/logging.h" 8#include "tools/gn/input_file.h" 9 10namespace { 11 12bool IsNumberChar(char c) { 13 return c >= '0' && c <= '9'; 14} 15 16bool CouldBeTwoCharOperatorBegin(char c) { 17 return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' || 18 c == '+' || c == '|' || c == '&'; 19} 20 21bool CouldBeTwoCharOperatorEnd(char c) { 22 return c == '=' || c == '|' || c == '&'; 23} 24 25bool CouldBeOneCharOperator(char c) { 26 return c == '=' || c == '<' || c == '>' || c == '+' || c == '!' || 27 c == ':' || c == '|' || c == '&' || c == '-'; 28} 29 30bool CouldBeOperator(char c) { 31 return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c); 32} 33 34bool IsScoperChar(char c) { 35 return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}'; 36} 37 38Token::Type GetSpecificOperatorType(base::StringPiece value) { 39 if (value == "=") 40 return Token::EQUAL; 41 if (value == "+") 42 return Token::PLUS; 43 if (value == "-") 44 return Token::MINUS; 45 if (value == "+=") 46 return Token::PLUS_EQUALS; 47 if (value == "-=") 48 return Token::MINUS_EQUALS; 49 if (value == "==") 50 return Token::EQUAL_EQUAL; 51 if (value == "!=") 52 return Token::NOT_EQUAL; 53 if (value == "<=") 54 return Token::LESS_EQUAL; 55 if (value == ">=") 56 return Token::GREATER_EQUAL; 57 if (value == "<") 58 return Token::LESS_THAN; 59 if (value == ">") 60 return Token::GREATER_THAN; 61 if (value == "&&") 62 return Token::BOOLEAN_AND; 63 if (value == "||") 64 return Token::BOOLEAN_OR; 65 if (value == "!") 66 return Token::BANG; 67 return Token::INVALID; 68} 69 70} // namespace 71 72Tokenizer::Tokenizer(const InputFile* input_file, Err* err) 73 : input_file_(input_file), 74 input_(input_file->contents()), 75 err_(err), 76 cur_(0), 77 line_number_(1), 78 char_in_line_(1) { 79} 80 81Tokenizer::~Tokenizer() { 82} 83 84// static 85std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) { 86 Tokenizer t(input_file, err); 87 return t.Run(); 88} 89 90std::vector<Token> Tokenizer::Run() { 91 DCHECK(tokens_.empty()); 92 while (!done()) { 93 AdvanceToNextToken(); 94 if (done()) 95 break; 96 Location location = GetCurrentLocation(); 97 98 Token::Type type = ClassifyCurrent(); 99 if (type == Token::INVALID) { 100 *err_ = GetErrorForInvalidToken(location); 101 break; 102 } 103 size_t token_begin = cur_; 104 AdvanceToEndOfToken(location, type); 105 if (has_error()) 106 break; 107 size_t token_end = cur_; 108 109 base::StringPiece token_value(&input_.data()[token_begin], 110 token_end - token_begin); 111 112 if (type == Token::UNCLASSIFIED_OPERATOR) 113 type = GetSpecificOperatorType(token_value); 114 if (type == Token::IDENTIFIER) { 115 if (token_value == "if") 116 type = Token::IF; 117 else if (token_value == "else") 118 type = Token::ELSE; 119 else if (token_value == "true") 120 type = Token::TRUE_TOKEN; 121 else if (token_value == "false") 122 type = Token::FALSE_TOKEN; 123 } 124 125 // TODO(brettw) This just strips comments from the token stream. This 126 // is probably wrong, they should be removed at a later stage so we can 127 // do things like rewrite the file. But this makes the parser simpler and 128 // is OK for now. 129 if (type != Token::COMMENT) 130 tokens_.push_back(Token(location, type, token_value)); 131 } 132 if (err_->has_error()) 133 tokens_.clear(); 134 return tokens_; 135} 136 137// static 138size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) { 139 int cur_line = 1; 140 size_t cur_byte = 0; 141 142 DCHECK(n > 0); 143 144 if (n == 1) 145 return 0; 146 147 while (cur_byte < buf.size()) { 148 if (IsNewline(buf, cur_byte)) { 149 cur_line++; 150 if (cur_line == n) 151 return cur_byte + 1; 152 } 153 cur_byte++; 154 } 155 return -1; 156} 157 158// static 159bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) { 160 DCHECK(offset < buffer.size()); 161 // We may need more logic here to handle different line ending styles. 162 return buffer[offset] == '\n'; 163} 164 165 166void Tokenizer::AdvanceToNextToken() { 167 while (!at_end() && IsCurrentWhitespace()) 168 Advance(); 169} 170 171Token::Type Tokenizer::ClassifyCurrent() const { 172 DCHECK(!at_end()); 173 char next_char = cur_char(); 174 if (next_char >= '0' && next_char <= '9') 175 return Token::INTEGER; 176 if (next_char == '"') 177 return Token::STRING; 178 179 // Note: '-' handled specially below. 180 if (next_char != '-' && CouldBeOperator(next_char)) 181 return Token::UNCLASSIFIED_OPERATOR; 182 183 if (IsIdentifierFirstChar(next_char)) 184 return Token::IDENTIFIER; 185 186 if (next_char == '[') 187 return Token::LEFT_BRACKET; 188 if (next_char == ']') 189 return Token::RIGHT_BRACKET; 190 if (next_char == '(') 191 return Token::LEFT_PAREN; 192 if (next_char == ')') 193 return Token::RIGHT_PAREN; 194 if (next_char == '{') 195 return Token::LEFT_BRACE; 196 if (next_char == '}') 197 return Token::RIGHT_BRACE; 198 199 if (next_char == ',') 200 return Token::COMMA; 201 202 if (next_char == '#') 203 return Token::COMMENT; 204 205 // For the case of '-' differentiate between a negative number and anything 206 // else. 207 if (next_char == '-') { 208 if (!CanIncrement()) 209 return Token::UNCLASSIFIED_OPERATOR; // Just the minus before end of 210 // file. 211 char following_char = input_[cur_ + 1]; 212 if (following_char >= '0' && following_char <= '9') 213 return Token::INTEGER; 214 return Token::UNCLASSIFIED_OPERATOR; 215 } 216 217 return Token::INVALID; 218} 219 220void Tokenizer::AdvanceToEndOfToken(const Location& location, 221 Token::Type type) { 222 switch (type) { 223 case Token::INTEGER: 224 do { 225 Advance(); 226 } while (!at_end() && IsNumberChar(cur_char())); 227 if (!at_end()) { 228 // Require the char after a number to be some kind of space, scope, 229 // or operator. 230 char c = cur_char(); 231 if (!IsCurrentWhitespace() && !CouldBeOperator(c) && 232 !IsScoperChar(c) && c != ',') { 233 *err_ = Err(GetCurrentLocation(), 234 "This is not a valid number.", 235 "Learn to count."); 236 // Highlight the number. 237 err_->AppendRange(LocationRange(location, GetCurrentLocation())); 238 } 239 } 240 break; 241 242 case Token::STRING: { 243 char initial = cur_char(); 244 Advance(); // Advance past initial " 245 for (;;) { 246 if (at_end()) { 247 *err_ = Err(LocationRange(location, 248 Location(input_file_, line_number_, char_in_line_)), 249 "Unterminated string literal.", 250 "Don't leave me hanging like this!"); 251 break; 252 } 253 if (IsCurrentStringTerminator(initial)) { 254 Advance(); // Skip past last " 255 break; 256 } else if (cur_char() == '\n') { 257 *err_ = Err(LocationRange(location, 258 GetCurrentLocation()), 259 "Newline in string constant."); 260 } 261 Advance(); 262 } 263 break; 264 } 265 266 case Token::UNCLASSIFIED_OPERATOR: 267 // Some operators are two characters, some are one. 268 if (CouldBeTwoCharOperatorBegin(cur_char())) { 269 if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1])) 270 Advance(); 271 } 272 Advance(); 273 break; 274 275 case Token::IDENTIFIER: 276 while (!at_end() && IsIdentifierContinuingChar(cur_char())) 277 Advance(); 278 break; 279 280 case Token::LEFT_BRACKET: 281 case Token::RIGHT_BRACKET: 282 case Token::LEFT_BRACE: 283 case Token::RIGHT_BRACE: 284 case Token::LEFT_PAREN: 285 case Token::RIGHT_PAREN: 286 case Token::COMMA: 287 Advance(); // All are one char. 288 break; 289 290 case Token::COMMENT: 291 // Eat to EOL. 292 while (!at_end() && !IsCurrentNewline()) 293 Advance(); 294 break; 295 296 case Token::INVALID: 297 default: 298 *err_ = Err(location, "Everything is all messed up", 299 "Please insert system disk in drive A: and press any key."); 300 NOTREACHED(); 301 return; 302 } 303} 304 305bool Tokenizer::IsCurrentWhitespace() const { 306 DCHECK(!at_end()); 307 char c = input_[cur_]; 308 // Note that tab (0x09) is illegal. 309 return c == 0x0A || c == 0x0B || c == 0x0C || c == 0x0D || c == 0x20; 310} 311 312bool Tokenizer::IsCurrentStringTerminator(char quote_char) const { 313 DCHECK(!at_end()); 314 if (cur_char() != quote_char) 315 return false; 316 317 // Check for escaping. \" is not a string terminator, but \\" is. Count 318 // the number of preceeding backslashes. 319 int num_backslashes = 0; 320 for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--) 321 num_backslashes++; 322 323 // Even backslashes mean that they were escaping each other and don't count 324 // as escaping this quote. 325 return (num_backslashes % 2) == 0; 326} 327 328bool Tokenizer::IsCurrentNewline() const { 329 return IsNewline(input_, cur_); 330} 331 332void Tokenizer::Advance() { 333 DCHECK(cur_ < input_.size()); 334 if (IsCurrentNewline()) { 335 line_number_++; 336 char_in_line_ = 1; 337 } else { 338 char_in_line_++; 339 } 340 cur_++; 341} 342 343Location Tokenizer::GetCurrentLocation() const { 344 return Location(input_file_, line_number_, char_in_line_); 345} 346 347Err Tokenizer::GetErrorForInvalidToken(const Location& location) const { 348 std::string help; 349 if (cur_char() == ';') { 350 // Semicolon. 351 help = "Semicolons are not needed, delete this one."; 352 } else if (cur_char() == '\t') { 353 // Tab. 354 help = "You got a tab character in here. Tabs are evil. " 355 "Convert to spaces."; 356 } else if (cur_char() == '/' && cur_ + 1 < input_.size() && 357 (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) { 358 // Different types of comments. 359 help = "Comments should start with # instead"; 360 } else { 361 help = "I have no idea what this is."; 362 } 363 364 return Err(location, "Invalid token.", help); 365} 366