tokenizer.cc revision fbaaef999ba563838ebd00874ed8a1c01fbf286d
1// Protocol Buffers - Google's data interchange format 2// Copyright 2008 Google Inc. All rights reserved. 3// http://code.google.com/p/protobuf/ 4// 5// Redistribution and use in source and binary forms, with or without 6// modification, are permitted provided that the following conditions are 7// met: 8// 9// * Redistributions of source code must retain the above copyright 10// notice, this list of conditions and the following disclaimer. 11// * Redistributions in binary form must reproduce the above 12// copyright notice, this list of conditions and the following disclaimer 13// in the documentation and/or other materials provided with the 14// distribution. 15// * Neither the name of Google Inc. nor the names of its 16// contributors may be used to endorse or promote products derived from 17// this software without specific prior written permission. 18// 19// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31// Author: kenton@google.com (Kenton Varda) 32// Based on original Protocol Buffers design by 33// Sanjay Ghemawat, Jeff Dean, and others. 34// 35// Here we have a hand-written lexer. At first you might ask yourself, 36// "Hand-written text processing? Is Kenton crazy?!" Well, first of all, 37// yes I am crazy, but that's beside the point. There are actually reasons 38// why I ended up writing this this way. 39// 40// The traditional approach to lexing is to use lex to generate a lexer for 41// you. Unfortunately, lex's output is ridiculously ugly and difficult to 42// integrate cleanly with C++ code, especially abstract code or code meant 43// as a library. Better parser-generators exist but would add dependencies 44// which most users won't already have, which we'd like to avoid. (GNU flex 45// has a C++ output option, but it's still ridiculously ugly, non-abstract, 46// and not library-friendly.) 47// 48// The next approach that any good software engineer should look at is to 49// use regular expressions. And, indeed, I did. I have code which 50// implements this same class using regular expressions. It's about 200 51// lines shorter. However: 52// - Rather than error messages telling you "This string has an invalid 53// escape sequence at line 5, column 45", you get error messages like 54// "Parse error on line 5". Giving more precise errors requires adding 55// a lot of code that ends up basically as complex as the hand-coded 56// version anyway. 57// - The regular expression to match a string literal looks like this: 58// kString = new RE("(\"([^\"\\\\]|" // non-escaped 59// "\\\\[abfnrtv?\"'\\\\0-7]|" // normal escape 60// "\\\\x[0-9a-fA-F])*\"|" // hex escape 61// "\'([^\'\\\\]|" // Also support single-quotes. 62// "\\\\[abfnrtv?\"'\\\\0-7]|" 63// "\\\\x[0-9a-fA-F])*\')"); 64// Verifying the correctness of this line noise is actually harder than 65// verifying the correctness of ConsumeString(), defined below. I'm not 66// even confident that the above is correct, after staring at it for some 67// time. 68// - PCRE is fast, but there's still more overhead involved than the code 69// below. 70// - Sadly, regular expressions are not part of the C standard library, so 71// using them would require depending on some other library. For the 72// open source release, this could be really annoying. Nobody likes 73// downloading one piece of software just to find that they need to 74// download something else to make it work, and in all likelihood 75// people downloading Protocol Buffers will already be doing so just 76// to make something else work. We could include a copy of PCRE with 77// our code, but that obligates us to keep it up-to-date and just seems 78// like a big waste just to save 200 lines of code. 79// 80// On a similar but unrelated note, I'm even scared to use ctype.h. 81// Apparently functions like isalpha() are locale-dependent. So, if we used 82// that, then if this code is being called from some program that doesn't 83// have its locale set to "C", it would behave strangely. We can't just set 84// the locale to "C" ourselves since we might break the calling program that 85// way, particularly if it is multi-threaded. WTF? Someone please let me 86// (Kenton) know if I'm missing something here... 87// 88// I'd love to hear about other alternatives, though, as this code isn't 89// exactly pretty. 90 91#include <google/protobuf/io/tokenizer.h> 92#include <google/protobuf/io/zero_copy_stream.h> 93#include <google/protobuf/stubs/strutil.h> 94 95namespace google { 96namespace protobuf { 97namespace io { 98namespace { 99 100// As mentioned above, I don't trust ctype.h due to the presence of "locales". 101// So, I have written replacement functions here. Someone please smack me if 102// this is a bad idea or if there is some way around this. 103// 104// These "character classes" are designed to be used in template methods. 105// For instance, Tokenizer::ConsumeZeroOrMore<Whitespace>() will eat 106// whitespace. 107 108// Note: No class is allowed to contain '\0', since this is used to mark end- 109// of-input and is handled specially. 110 111#define CHARACTER_CLASS(NAME, EXPRESSION) \ 112 class NAME { \ 113 public: \ 114 static inline bool InClass(char c) { \ 115 return EXPRESSION; \ 116 } \ 117 } 118 119CHARACTER_CLASS(Whitespace, c == ' ' || c == '\n' || c == '\t' || 120 c == '\r' || c == '\v'); 121 122CHARACTER_CLASS(Unprintable, c < ' ' && c != '\0'); 123 124CHARACTER_CLASS(Digit, '0' <= c && c <= '9'); 125CHARACTER_CLASS(OctalDigit, '0' <= c && c <= '7'); 126CHARACTER_CLASS(HexDigit, ('0' <= c && c <= '9') || 127 ('a' <= c && c <= 'f') || 128 ('A' <= c && c <= 'F')); 129 130CHARACTER_CLASS(Letter, ('a' <= c && c <= 'z') || 131 ('A' <= c && c <= 'Z') || 132 (c == '_')); 133 134CHARACTER_CLASS(Alphanumeric, ('a' <= c && c <= 'z') || 135 ('A' <= c && c <= 'Z') || 136 ('0' <= c && c <= '9') || 137 (c == '_')); 138 139CHARACTER_CLASS(Escape, c == 'a' || c == 'b' || c == 'f' || c == 'n' || 140 c == 'r' || c == 't' || c == 'v' || c == '\\' || 141 c == '?' || c == '\'' || c == '\"'); 142 143#undef CHARACTER_CLASS 144 145// Given a char, interpret it as a numeric digit and return its value. 146// This supports any number base up to 36. 147inline int DigitValue(char digit) { 148 if ('0' <= digit && digit <= '9') return digit - '0'; 149 if ('a' <= digit && digit <= 'z') return digit - 'a' + 10; 150 if ('A' <= digit && digit <= 'Z') return digit - 'A' + 10; 151 return -1; 152} 153 154// Inline because it's only used in one place. 155inline char TranslateEscape(char c) { 156 switch (c) { 157 case 'a': return '\a'; 158 case 'b': return '\b'; 159 case 'f': return '\f'; 160 case 'n': return '\n'; 161 case 'r': return '\r'; 162 case 't': return '\t'; 163 case 'v': return '\v'; 164 case '\\': return '\\'; 165 case '?': return '\?'; // Trigraphs = :( 166 case '\'': return '\''; 167 case '"': return '\"'; 168 169 // We expect escape sequences to have been validated separately. 170 default: return '?'; 171 } 172} 173 174} // anonymous namespace 175 176ErrorCollector::~ErrorCollector() {} 177 178// =================================================================== 179 180Tokenizer::Tokenizer(ZeroCopyInputStream* input, 181 ErrorCollector* error_collector) 182 : input_(input), 183 error_collector_(error_collector), 184 buffer_(NULL), 185 buffer_size_(0), 186 buffer_pos_(0), 187 read_error_(false), 188 line_(0), 189 column_(0), 190 token_start_(-1), 191 allow_f_after_float_(false), 192 comment_style_(CPP_COMMENT_STYLE) { 193 194 current_.line = 0; 195 current_.column = 0; 196 current_.type = TYPE_START; 197 198 Refresh(); 199} 200 201Tokenizer::~Tokenizer() { 202 // If we had any buffer left unread, return it to the underlying stream 203 // so that someone else can read it. 204 if (buffer_size_ > buffer_pos_) { 205 input_->BackUp(buffer_size_ - buffer_pos_); 206 } 207} 208 209// ------------------------------------------------------------------- 210// Internal helpers. 211 212void Tokenizer::NextChar() { 213 // Update our line and column counters based on the character being 214 // consumed. 215 if (current_char_ == '\n') { 216 ++line_; 217 column_ = 0; 218 } else if (current_char_ == '\t') { 219 column_ += kTabWidth - column_ % kTabWidth; 220 } else { 221 ++column_; 222 } 223 224 // Advance to the next character. 225 ++buffer_pos_; 226 if (buffer_pos_ < buffer_size_) { 227 current_char_ = buffer_[buffer_pos_]; 228 } else { 229 Refresh(); 230 } 231} 232 233void Tokenizer::Refresh() { 234 if (read_error_) { 235 current_char_ = '\0'; 236 return; 237 } 238 239 // If we're in a token, append the rest of the buffer to it. 240 if (token_start_ >= 0 && token_start_ < buffer_size_) { 241 current_.text.append(buffer_ + token_start_, buffer_size_ - token_start_); 242 token_start_ = 0; 243 } 244 245 const void* data = NULL; 246 buffer_ = NULL; 247 buffer_pos_ = 0; 248 do { 249 if (!input_->Next(&data, &buffer_size_)) { 250 // end of stream (or read error) 251 buffer_size_ = 0; 252 read_error_ = true; 253 current_char_ = '\0'; 254 return; 255 } 256 } while (buffer_size_ == 0); 257 258 buffer_ = static_cast<const char*>(data); 259 260 current_char_ = buffer_[0]; 261} 262 263inline void Tokenizer::StartToken() { 264 token_start_ = buffer_pos_; 265 current_.type = TYPE_START; // Just for the sake of initializing it. 266 current_.text.clear(); 267 current_.line = line_; 268 current_.column = column_; 269} 270 271inline void Tokenizer::EndToken() { 272 // Note: The if() is necessary because some STL implementations crash when 273 // you call string::append(NULL, 0), presumably because they are trying to 274 // be helpful by detecting the NULL pointer, even though there's nothing 275 // wrong with reading zero bytes from NULL. 276 if (buffer_pos_ != token_start_) { 277 current_.text.append(buffer_ + token_start_, buffer_pos_ - token_start_); 278 } 279 token_start_ = -1; 280} 281 282// ------------------------------------------------------------------- 283// Helper methods that consume characters. 284 285template<typename CharacterClass> 286inline bool Tokenizer::LookingAt() { 287 return CharacterClass::InClass(current_char_); 288} 289 290template<typename CharacterClass> 291inline bool Tokenizer::TryConsumeOne() { 292 if (CharacterClass::InClass(current_char_)) { 293 NextChar(); 294 return true; 295 } else { 296 return false; 297 } 298} 299 300inline bool Tokenizer::TryConsume(char c) { 301 if (current_char_ == c) { 302 NextChar(); 303 return true; 304 } else { 305 return false; 306 } 307} 308 309template<typename CharacterClass> 310inline void Tokenizer::ConsumeZeroOrMore() { 311 while (CharacterClass::InClass(current_char_)) { 312 NextChar(); 313 } 314} 315 316template<typename CharacterClass> 317inline void Tokenizer::ConsumeOneOrMore(const char* error) { 318 if (!CharacterClass::InClass(current_char_)) { 319 AddError(error); 320 } else { 321 do { 322 NextChar(); 323 } while (CharacterClass::InClass(current_char_)); 324 } 325} 326 327// ------------------------------------------------------------------- 328// Methods that read whole patterns matching certain kinds of tokens 329// or comments. 330 331void Tokenizer::ConsumeString(char delimiter) { 332 while (true) { 333 switch (current_char_) { 334 case '\0': 335 case '\n': { 336 AddError("String literals cannot cross line boundaries."); 337 return; 338 } 339 340 case '\\': { 341 // An escape sequence. 342 NextChar(); 343 if (TryConsumeOne<Escape>()) { 344 // Valid escape sequence. 345 } else if (TryConsumeOne<OctalDigit>()) { 346 // Possibly followed by two more octal digits, but these will 347 // just be consumed by the main loop anyway so we don't need 348 // to do so explicitly here. 349 } else if (TryConsume('x') || TryConsume('X')) { 350 if (!TryConsumeOne<HexDigit>()) { 351 AddError("Expected hex digits for escape sequence."); 352 } 353 // Possibly followed by another hex digit, but again we don't care. 354 } else { 355 AddError("Invalid escape sequence in string literal."); 356 } 357 break; 358 } 359 360 default: { 361 if (current_char_ == delimiter) { 362 NextChar(); 363 return; 364 } 365 NextChar(); 366 break; 367 } 368 } 369 } 370} 371 372Tokenizer::TokenType Tokenizer::ConsumeNumber(bool started_with_zero, 373 bool started_with_dot) { 374 bool is_float = false; 375 376 if (started_with_zero && (TryConsume('x') || TryConsume('X'))) { 377 // A hex number (started with "0x"). 378 ConsumeOneOrMore<HexDigit>("\"0x\" must be followed by hex digits."); 379 380 } else if (started_with_zero && LookingAt<Digit>()) { 381 // An octal number (had a leading zero). 382 ConsumeZeroOrMore<OctalDigit>(); 383 if (LookingAt<Digit>()) { 384 AddError("Numbers starting with leading zero must be in octal."); 385 ConsumeZeroOrMore<Digit>(); 386 } 387 388 } else { 389 // A decimal number. 390 if (started_with_dot) { 391 is_float = true; 392 ConsumeZeroOrMore<Digit>(); 393 } else { 394 ConsumeZeroOrMore<Digit>(); 395 396 if (TryConsume('.')) { 397 is_float = true; 398 ConsumeZeroOrMore<Digit>(); 399 } 400 } 401 402 if (TryConsume('e') || TryConsume('E')) { 403 is_float = true; 404 TryConsume('-') || TryConsume('+'); 405 ConsumeOneOrMore<Digit>("\"e\" must be followed by exponent."); 406 } 407 408 if (allow_f_after_float_ && (TryConsume('f') || TryConsume('F'))) { 409 is_float = true; 410 } 411 } 412 413 if (LookingAt<Letter>()) { 414 AddError("Need space between number and identifier."); 415 } else if (current_char_ == '.') { 416 if (is_float) { 417 AddError( 418 "Already saw decimal point or exponent; can't have another one."); 419 } else { 420 AddError("Hex and octal numbers must be integers."); 421 } 422 } 423 424 return is_float ? TYPE_FLOAT : TYPE_INTEGER; 425} 426 427void Tokenizer::ConsumeLineComment() { 428 while (current_char_ != '\0' && current_char_ != '\n') { 429 NextChar(); 430 } 431 TryConsume('\n'); 432} 433 434void Tokenizer::ConsumeBlockComment() { 435 int start_line = line_; 436 int start_column = column_ - 2; 437 438 while (true) { 439 while (current_char_ != '\0' && 440 current_char_ != '*' && 441 current_char_ != '/') { 442 NextChar(); 443 } 444 445 if (TryConsume('*') && TryConsume('/')) { 446 // End of comment. 447 break; 448 } else if (TryConsume('/') && current_char_ == '*') { 449 // Note: We didn't consume the '*' because if there is a '/' after it 450 // we want to interpret that as the end of the comment. 451 AddError( 452 "\"/*\" inside block comment. Block comments cannot be nested."); 453 } else if (current_char_ == '\0') { 454 AddError("End-of-file inside block comment."); 455 error_collector_->AddError( 456 start_line, start_column, " Comment started here."); 457 break; 458 } 459 } 460} 461 462// ------------------------------------------------------------------- 463 464bool Tokenizer::Next() { 465 TokenType last_token_type = current_.type; 466 467 // Did we skip any characters after the last token? 468 bool skipped_stuff = false; 469 470 while (!read_error_) { 471 if (TryConsumeOne<Whitespace>()) { 472 ConsumeZeroOrMore<Whitespace>(); 473 474 } else if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) { 475 // Starting a comment? 476 if (TryConsume('/')) { 477 ConsumeLineComment(); 478 } else if (TryConsume('*')) { 479 ConsumeBlockComment(); 480 } else { 481 // Oops, it was just a slash. Return it. 482 current_.type = TYPE_SYMBOL; 483 current_.text = "/"; 484 current_.line = line_; 485 current_.column = column_ - 1; 486 return true; 487 } 488 489 } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) { 490 ConsumeLineComment(); 491 492 } else if (LookingAt<Unprintable>() || current_char_ == '\0') { 493 AddError("Invalid control characters encountered in text."); 494 NextChar(); 495 // Skip more unprintable characters, too. But, remember that '\0' is 496 // also what current_char_ is set to after EOF / read error. We have 497 // to be careful not to go into an infinite loop of trying to consume 498 // it, so make sure to check read_error_ explicitly before consuming 499 // '\0'. 500 while (TryConsumeOne<Unprintable>() || 501 (!read_error_ && TryConsume('\0'))) { 502 // Ignore. 503 } 504 505 } else { 506 // Reading some sort of token. 507 StartToken(); 508 509 if (TryConsumeOne<Letter>()) { 510 ConsumeZeroOrMore<Alphanumeric>(); 511 current_.type = TYPE_IDENTIFIER; 512 } else if (TryConsume('0')) { 513 current_.type = ConsumeNumber(true, false); 514 } else if (TryConsume('.')) { 515 // This could be the beginning of a floating-point number, or it could 516 // just be a '.' symbol. 517 518 if (TryConsumeOne<Digit>()) { 519 // It's a floating-point number. 520 if (last_token_type == TYPE_IDENTIFIER && !skipped_stuff) { 521 // We don't accept syntax like "blah.123". 522 error_collector_->AddError(line_, column_ - 2, 523 "Need space between identifier and decimal point."); 524 } 525 current_.type = ConsumeNumber(false, true); 526 } else { 527 current_.type = TYPE_SYMBOL; 528 } 529 } else if (TryConsumeOne<Digit>()) { 530 current_.type = ConsumeNumber(false, false); 531 } else if (TryConsume('\"')) { 532 ConsumeString('\"'); 533 current_.type = TYPE_STRING; 534 } else if (TryConsume('\'')) { 535 ConsumeString('\''); 536 current_.type = TYPE_STRING; 537 } else { 538 NextChar(); 539 current_.type = TYPE_SYMBOL; 540 } 541 542 EndToken(); 543 return true; 544 } 545 546 skipped_stuff = true; 547 } 548 549 // EOF 550 current_.type = TYPE_END; 551 current_.text.clear(); 552 current_.line = line_; 553 current_.column = column_; 554 return false; 555} 556 557// ------------------------------------------------------------------- 558// Token-parsing helpers. Remember that these don't need to report 559// errors since any errors should already have been reported while 560// tokenizing. Also, these can assume that whatever text they 561// are given is text that the tokenizer actually parsed as a token 562// of the given type. 563 564bool Tokenizer::ParseInteger(const string& text, uint64 max_value, 565 uint64* output) { 566 // Sadly, we can't just use strtoul() since it is only 32-bit and strtoull() 567 // is non-standard. I hate the C standard library. :( 568 569// return strtoull(text.c_str(), NULL, 0); 570 571 const char* ptr = text.c_str(); 572 int base = 10; 573 if (ptr[0] == '0') { 574 if (ptr[1] == 'x' || ptr[1] == 'X') { 575 // This is hex. 576 base = 16; 577 ptr += 2; 578 } else { 579 // This is octal. 580 base = 8; 581 } 582 } 583 584 uint64 result = 0; 585 for (; *ptr != '\0'; ptr++) { 586 int digit = DigitValue(*ptr); 587 GOOGLE_LOG_IF(DFATAL, digit < 0 || digit >= base) 588 << " Tokenizer::ParseInteger() passed text that could not have been" 589 " tokenized as an integer: " << CEscape(text); 590 if (digit > max_value || result > (max_value - digit) / base) { 591 // Overflow. 592 return false; 593 } 594 result = result * base + digit; 595 } 596 597 *output = result; 598 return true; 599} 600 601double Tokenizer::ParseFloat(const string& text) { 602 const char* start = text.c_str(); 603 char* end; 604 double result = NoLocaleStrtod(start, &end); 605 606 // "1e" is not a valid float, but if the tokenizer reads it, it will 607 // report an error but still return it as a valid token. We need to 608 // accept anything the tokenizer could possibly return, error or not. 609 if (*end == 'e' || *end == 'E') { 610 ++end; 611 if (*end == '-' || *end == '+') ++end; 612 } 613 614 // If the Tokenizer had allow_f_after_float_ enabled, the float may be 615 // suffixed with the letter 'f'. 616 if (*end == 'f' || *end == 'F') { 617 ++end; 618 } 619 620 GOOGLE_LOG_IF(DFATAL, end - start != text.size() || *start == '-') 621 << " Tokenizer::ParseFloat() passed text that could not have been" 622 " tokenized as a float: " << CEscape(text); 623 return result; 624} 625 626void Tokenizer::ParseStringAppend(const string& text, string* output) { 627 // Reminder: text[0] is always the quote character. (If text is 628 // empty, it's invalid, so we'll just return.) 629 if (text.empty()) { 630 GOOGLE_LOG(DFATAL) 631 << " Tokenizer::ParseStringAppend() passed text that could not" 632 " have been tokenized as a string: " << CEscape(text); 633 return; 634 } 635 636 output->reserve(output->size() + text.size()); 637 638 // Loop through the string copying characters to "output" and 639 // interpreting escape sequences. Note that any invalid escape 640 // sequences or other errors were already reported while tokenizing. 641 // In this case we do not need to produce valid results. 642 for (const char* ptr = text.c_str() + 1; *ptr != '\0'; ptr++) { 643 if (*ptr == '\\' && ptr[1] != '\0') { 644 // An escape sequence. 645 ++ptr; 646 647 if (OctalDigit::InClass(*ptr)) { 648 // An octal escape. May one, two, or three digits. 649 int code = DigitValue(*ptr); 650 if (OctalDigit::InClass(ptr[1])) { 651 ++ptr; 652 code = code * 8 + DigitValue(*ptr); 653 } 654 if (OctalDigit::InClass(ptr[1])) { 655 ++ptr; 656 code = code * 8 + DigitValue(*ptr); 657 } 658 output->push_back(static_cast<char>(code)); 659 660 } else if (*ptr == 'x') { 661 // A hex escape. May zero, one, or two digits. (The zero case 662 // will have been caught as an error earlier.) 663 int code = 0; 664 if (HexDigit::InClass(ptr[1])) { 665 ++ptr; 666 code = DigitValue(*ptr); 667 } 668 if (HexDigit::InClass(ptr[1])) { 669 ++ptr; 670 code = code * 16 + DigitValue(*ptr); 671 } 672 output->push_back(static_cast<char>(code)); 673 674 } else { 675 // Some other escape code. 676 output->push_back(TranslateEscape(*ptr)); 677 } 678 679 } else if (*ptr == text[0]) { 680 // Ignore quote matching the starting quote. 681 } else { 682 output->push_back(*ptr); 683 } 684 } 685 686 return; 687} 688 689} // namespace io 690} // namespace protobuf 691} // namespace google 692