1// Protocol Buffers - Google's data interchange format 2// Copyright 2008 Google Inc. All rights reserved. 3// http://code.google.com/p/protobuf/ 4// 5// Redistribution and use in source and binary forms, with or without 6// modification, are permitted provided that the following conditions are 7// met: 8// 9// * Redistributions of source code must retain the above copyright 10// notice, this list of conditions and the following disclaimer. 11// * Redistributions in binary form must reproduce the above 12// copyright notice, this list of conditions and the following disclaimer 13// in the documentation and/or other materials provided with the 14// distribution. 15// * Neither the name of Google Inc. nor the names of its 16// contributors may be used to endorse or promote products derived from 17// this software without specific prior written permission. 18// 19// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31// Author: kenton@google.com (Kenton Varda) 32// Based on original Protocol Buffers design by 33// Sanjay Ghemawat, Jeff Dean, and others. 34// 35// Here we have a hand-written lexer. At first you might ask yourself, 36// "Hand-written text processing? Is Kenton crazy?!" 
Well, first of all, 37// yes I am crazy, but that's beside the point. There are actually reasons 38// why I ended up writing this this way. 39// 40// The traditional approach to lexing is to use lex to generate a lexer for 41// you. Unfortunately, lex's output is ridiculously ugly and difficult to 42// integrate cleanly with C++ code, especially abstract code or code meant 43// as a library. Better parser-generators exist but would add dependencies 44// which most users won't already have, which we'd like to avoid. (GNU flex 45// has a C++ output option, but it's still ridiculously ugly, non-abstract, 46// and not library-friendly.) 47// 48// The next approach that any good software engineer should look at is to 49// use regular expressions. And, indeed, I did. I have code which 50// implements this same class using regular expressions. It's about 200 51// lines shorter. However: 52// - Rather than error messages telling you "This string has an invalid 53// escape sequence at line 5, column 45", you get error messages like 54// "Parse error on line 5". Giving more precise errors requires adding 55// a lot of code that ends up basically as complex as the hand-coded 56// version anyway. 57// - The regular expression to match a string literal looks like this: 58// kString = new RE("(\"([^\"\\\\]|" // non-escaped 59// "\\\\[abfnrtv?\"'\\\\0-7]|" // normal escape 60// "\\\\x[0-9a-fA-F])*\"|" // hex escape 61// "\'([^\'\\\\]|" // Also support single-quotes. 62// "\\\\[abfnrtv?\"'\\\\0-7]|" 63// "\\\\x[0-9a-fA-F])*\')"); 64// Verifying the correctness of this line noise is actually harder than 65// verifying the correctness of ConsumeString(), defined below. I'm not 66// even confident that the above is correct, after staring at it for some 67// time. 68// - PCRE is fast, but there's still more overhead involved than the code 69// below. 
70// - Sadly, regular expressions are not part of the C standard library, so 71// using them would require depending on some other library. For the 72// open source release, this could be really annoying. Nobody likes 73// downloading one piece of software just to find that they need to 74// download something else to make it work, and in all likelihood 75// people downloading Protocol Buffers will already be doing so just 76// to make something else work. We could include a copy of PCRE with 77// our code, but that obligates us to keep it up-to-date and just seems 78// like a big waste just to save 200 lines of code. 79// 80// On a similar but unrelated note, I'm even scared to use ctype.h. 81// Apparently functions like isalpha() are locale-dependent. So, if we used 82// that, then if this code is being called from some program that doesn't 83// have its locale set to "C", it would behave strangely. We can't just set 84// the locale to "C" ourselves since we might break the calling program that 85// way, particularly if it is multi-threaded. WTF? Someone please let me 86// (Kenton) know if I'm missing something here... 87// 88// I'd love to hear about other alternatives, though, as this code isn't 89// exactly pretty. 90 91#include <google/protobuf/io/tokenizer.h> 92#include <google/protobuf/stubs/common.h> 93#include <google/protobuf/stubs/stringprintf.h> 94#include <google/protobuf/io/zero_copy_stream.h> 95#include <google/protobuf/stubs/strutil.h> 96#include <google/protobuf/stubs/stl_util.h> 97 98namespace google { 99namespace protobuf { 100namespace io { 101namespace { 102 103// As mentioned above, I don't trust ctype.h due to the presence of "locales". 104// So, I have written replacement functions here. Someone please smack me if 105// this is a bad idea or if there is some way around this. 106// 107// These "character classes" are designed to be used in template methods. 
// For instance, Tokenizer::ConsumeZeroOrMore<Whitespace>() will eat
// whitespace.

// Note: No class is allowed to contain '\0', since this is used to mark end-
// of-input and is handled specially.

// Each "character class" is a stateless type exposing a single predicate,
// InClass(c), so it can be passed as a template argument to the
// LookingAt/Consume* helper methods.  They are written out explicitly here
// for easier reading and debugging.

class Whitespace {
 public:
  static inline bool InClass(char c) {
    switch (c) {
      case ' ': case '\n': case '\t': case '\r': case '\v': case '\f':
        return true;
      default:
        return false;
    }
  }
};

// Same as Whitespace, minus '\n' (newlines are significant when scanning
// for comment attachment).
class WhitespaceNoNewline {
 public:
  static inline bool InClass(char c) {
    switch (c) {
      case ' ': case '\t': case '\r': case '\v': case '\f':
        return true;
      default:
        return false;
    }
  }
};

// Control characters other than '\0' (which marks end-of-input).
class Unprintable {
 public:
  static inline bool InClass(char c) {
    return c > '\0' && c < ' ';
  }
};

class Digit {
 public:
  static inline bool InClass(char c) {
    return c >= '0' && c <= '9';
  }
};

class OctalDigit {
 public:
  static inline bool InClass(char c) {
    return c >= '0' && c <= '7';
  }
};

class HexDigit {
 public:
  static inline bool InClass(char c) {
    return Digit::InClass(c) ||
           (c >= 'a' && c <= 'f') ||
           (c >= 'A' && c <= 'F');
  }
};

// Letters, for identifier purposes -- '_' counts as a letter.
class Letter {
 public:
  static inline bool InClass(char c) {
    return (c >= 'a' && c <= 'z') ||
           (c >= 'A' && c <= 'Z') ||
           c == '_';
  }
};

class Alphanumeric {
 public:
  static inline bool InClass(char c) {
    return Letter::InClass(c) || Digit::InClass(c);
  }
};

// Characters which may legally follow a backslash in a string literal.
class Escape {
 public:
  static inline bool InClass(char c) {
    switch (c) {
      case 'a': case 'b': case 'f': case 'n': case 'r': case 't': case 'v':
      case '\\': case '?': case '\'': case '\"':
        return true;
      default:
        return false;
    }
  }
};

// Given a char, interpret it as a numeric digit and return its value.
// This supports any number base up to 36.  Returns -1 if the character is
// not a digit in any base.
inline int DigitValue(char digit) {
  if (digit >= '0' && digit <= '9') return digit - '0';
  if (digit >= 'a' && digit <= 'z') return digit - 'a' + 10;
  if (digit >= 'A' && digit <= 'Z') return digit - 'A' + 10;
  return -1;
}

// Inline because it's only used in one place.
160inline char TranslateEscape(char c) { 161 switch (c) { 162 case 'a': return '\a'; 163 case 'b': return '\b'; 164 case 'f': return '\f'; 165 case 'n': return '\n'; 166 case 'r': return '\r'; 167 case 't': return '\t'; 168 case 'v': return '\v'; 169 case '\\': return '\\'; 170 case '?': return '\?'; // Trigraphs = :( 171 case '\'': return '\''; 172 case '"': return '\"'; 173 174 // We expect escape sequences to have been validated separately. 175 default: return '?'; 176 } 177} 178 179} // anonymous namespace 180 181ErrorCollector::~ErrorCollector() {} 182 183// =================================================================== 184 185Tokenizer::Tokenizer(ZeroCopyInputStream* input, 186 ErrorCollector* error_collector) 187 : input_(input), 188 error_collector_(error_collector), 189 buffer_(NULL), 190 buffer_size_(0), 191 buffer_pos_(0), 192 read_error_(false), 193 line_(0), 194 column_(0), 195 record_target_(NULL), 196 record_start_(-1), 197 allow_f_after_float_(false), 198 comment_style_(CPP_COMMENT_STYLE) { 199 200 current_.line = 0; 201 current_.column = 0; 202 current_.end_column = 0; 203 current_.type = TYPE_START; 204 205 Refresh(); 206} 207 208Tokenizer::~Tokenizer() { 209 // If we had any buffer left unread, return it to the underlying stream 210 // so that someone else can read it. 211 if (buffer_size_ > buffer_pos_) { 212 input_->BackUp(buffer_size_ - buffer_pos_); 213 } 214} 215 216// ------------------------------------------------------------------- 217// Internal helpers. 218 219void Tokenizer::NextChar() { 220 // Update our line and column counters based on the character being 221 // consumed. 222 if (current_char_ == '\n') { 223 ++line_; 224 column_ = 0; 225 } else if (current_char_ == '\t') { 226 column_ += kTabWidth - column_ % kTabWidth; 227 } else { 228 ++column_; 229 } 230 231 // Advance to the next character. 
232 ++buffer_pos_; 233 if (buffer_pos_ < buffer_size_) { 234 current_char_ = buffer_[buffer_pos_]; 235 } else { 236 Refresh(); 237 } 238} 239 240void Tokenizer::Refresh() { 241 if (read_error_) { 242 current_char_ = '\0'; 243 return; 244 } 245 246 // If we're in a token, append the rest of the buffer to it. 247 if (record_target_ != NULL && record_start_ < buffer_size_) { 248 record_target_->append(buffer_ + record_start_, buffer_size_ - record_start_); 249 record_start_ = 0; 250 } 251 252 const void* data = NULL; 253 buffer_ = NULL; 254 buffer_pos_ = 0; 255 do { 256 if (!input_->Next(&data, &buffer_size_)) { 257 // end of stream (or read error) 258 buffer_size_ = 0; 259 read_error_ = true; 260 current_char_ = '\0'; 261 return; 262 } 263 } while (buffer_size_ == 0); 264 265 buffer_ = static_cast<const char*>(data); 266 267 current_char_ = buffer_[0]; 268} 269 270inline void Tokenizer::RecordTo(string* target) { 271 record_target_ = target; 272 record_start_ = buffer_pos_; 273} 274 275inline void Tokenizer::StopRecording() { 276 // Note: The if() is necessary because some STL implementations crash when 277 // you call string::append(NULL, 0), presumably because they are trying to 278 // be helpful by detecting the NULL pointer, even though there's nothing 279 // wrong with reading zero bytes from NULL. 280 if (buffer_pos_ != record_start_) { 281 record_target_->append(buffer_ + record_start_, buffer_pos_ - record_start_); 282 } 283 record_target_ = NULL; 284 record_start_ = -1; 285} 286 287inline void Tokenizer::StartToken() { 288 current_.type = TYPE_START; // Just for the sake of initializing it. 289 current_.text.clear(); 290 current_.line = line_; 291 current_.column = column_; 292 RecordTo(¤t_.text); 293} 294 295inline void Tokenizer::EndToken() { 296 StopRecording(); 297 current_.end_column = column_; 298} 299 300// ------------------------------------------------------------------- 301// Helper methods that consume characters. 
// Returns true if the current character belongs to CharacterClass, without
// consuming it.
template<typename CharacterClass>
inline bool Tokenizer::LookingAt() {
  return CharacterClass::InClass(current_char_);
}

// If the current character belongs to CharacterClass, consume it and return
// true; otherwise leave the position unchanged and return false.
template<typename CharacterClass>
inline bool Tokenizer::TryConsumeOne() {
  if (CharacterClass::InClass(current_char_)) {
    NextChar();
    return true;
  } else {
    return false;
  }
}

// Like TryConsumeOne(), but matches one specific character.
inline bool Tokenizer::TryConsume(char c) {
  if (current_char_ == c) {
    NextChar();
    return true;
  } else {
    return false;
  }
}

// Consumes characters as long as they belong to CharacterClass; may consume
// none at all.
template<typename CharacterClass>
inline void Tokenizer::ConsumeZeroOrMore() {
  while (CharacterClass::InClass(current_char_)) {
    NextChar();
  }
}

// Consumes one or more characters belonging to CharacterClass.  If the very
// first character does not match, reports |error| and consumes nothing.
template<typename CharacterClass>
inline void Tokenizer::ConsumeOneOrMore(const char* error) {
  if (!CharacterClass::InClass(current_char_)) {
    AddError(error);
  } else {
    do {
      NextChar();
    } while (CharacterClass::InClass(current_char_));
  }
}

// -------------------------------------------------------------------
// Methods that read whole patterns matching certain kinds of tokens
// or comments.

// Consumes the body of a string literal.  The opening |delimiter| (single or
// double quote) has already been consumed; this eats everything up to and
// including the matching closing delimiter, validating escape sequences
// along the way.  Stops with an error at end-of-line or end-of-input.
void Tokenizer::ConsumeString(char delimiter) {
  while (true) {
    switch (current_char_) {
      case '\0':
      case '\n': {
        AddError("String literals cannot cross line boundaries.");
        return;
      }

      case '\\': {
        // An escape sequence.
        NextChar();
        if (TryConsumeOne<Escape>()) {
          // Valid escape sequence.
        } else if (TryConsumeOne<OctalDigit>()) {
          // Possibly followed by two more octal digits, but these will
          // just be consumed by the main loop anyway so we don't need
          // to do so explicitly here.
        } else if (TryConsume('x') || TryConsume('X')) {
          if (!TryConsumeOne<HexDigit>()) {
            AddError("Expected hex digits for escape sequence.");
          }
          // Possibly followed by another hex digit, but again we don't care.
        } else if (TryConsume('u')) {
          // \u must be followed by exactly four hex digits.
          if (!TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>()) {
            AddError("Expected four hex digits for \\u escape sequence.");
          }
        } else if (TryConsume('U')) {
          // We expect 8 hex digits; but only the range up to 0x10ffff is
          // legal.
          if (!TryConsume('0') ||
              !TryConsume('0') ||
              !(TryConsume('0') || TryConsume('1')) ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>()) {
            AddError("Expected eight hex digits up to 10ffff for \\U escape "
                     "sequence");
          }
        } else {
          AddError("Invalid escape sequence in string literal.");
        }
        break;
      }

      default: {
        if (current_char_ == delimiter) {
          // Closing quote: consume it and stop.
          NextChar();
          return;
        }
        NextChar();
        break;
      }
    }
  }
}

// Consumes the remainder of a numeric literal.  The first digit (a '0' if
// started_with_zero) or the leading '.' (if started_with_dot) has already
// been consumed.  Returns TYPE_FLOAT or TYPE_INTEGER accordingly.
Tokenizer::TokenType Tokenizer::ConsumeNumber(bool started_with_zero,
                                              bool started_with_dot) {
  bool is_float = false;

  if (started_with_zero && (TryConsume('x') || TryConsume('X'))) {
    // A hex number (started with "0x").
    ConsumeOneOrMore<HexDigit>("\"0x\" must be followed by hex digits.");

  } else if (started_with_zero && LookingAt<Digit>()) {
    // An octal number (had a leading zero).
    ConsumeZeroOrMore<OctalDigit>();
    if (LookingAt<Digit>()) {
      // An 8 or 9 appeared after the octal digits.
      AddError("Numbers starting with leading zero must be in octal.");
      ConsumeZeroOrMore<Digit>();
    }

  } else {
    // A decimal number.
    if (started_with_dot) {
      is_float = true;
      ConsumeZeroOrMore<Digit>();
    } else {
      ConsumeZeroOrMore<Digit>();

      if (TryConsume('.')) {
        is_float = true;
        ConsumeZeroOrMore<Digit>();
      }
    }

    if (TryConsume('e') || TryConsume('E')) {
      is_float = true;
      // Optional sign on the exponent.
      TryConsume('-') || TryConsume('+');
      ConsumeOneOrMore<Digit>("\"e\" must be followed by exponent.");
    }

    if (allow_f_after_float_ && (TryConsume('f') || TryConsume('F'))) {
      is_float = true;
    }
  }

  if (LookingAt<Letter>()) {
    AddError("Need space between number and identifier.");
  } else if (current_char_ == '.') {
    if (is_float) {
      AddError(
        "Already saw decimal point or exponent; can't have another one.");
    } else {
      AddError("Hex and octal numbers must be integers.");
    }
  }

  return is_float ? TYPE_FLOAT : TYPE_INTEGER;
}

// Consumes the rest of a line comment (the leading "//" or "#" has already
// been consumed), including the terminating newline if present.  If content
// is non-NULL, the consumed text is recorded into it.
void Tokenizer::ConsumeLineComment(string* content) {
  if (content != NULL) RecordTo(content);

  while (current_char_ != '\0' && current_char_ != '\n') {
    NextChar();
  }
  TryConsume('\n');

  if (content != NULL) StopRecording();
}

// Consumes a block comment (the leading "/*" has already been consumed),
// through the closing "*/".  If content is non-NULL, the comment text is
// recorded into it, with leading whitespace-plus-'*' stripped from
// continuation lines and the trailing "*/" removed.
void Tokenizer::ConsumeBlockComment(string* content) {
  // Where the comment began, for the error message on unterminated comments.
  int start_line = line_;
  int start_column = column_ - 2;

  if (content != NULL) RecordTo(content);

  while (true) {
    // Scan forward to the next character that could matter: end-of-input,
    // a potential comment terminator, a potential nested "/*", or a newline.
    while (current_char_ != '\0' &&
           current_char_ != '*' &&
           current_char_ != '/' &&
           current_char_ != '\n') {
      NextChar();
    }

    if (TryConsume('\n')) {
      if (content != NULL) StopRecording();

      // Consume leading whitespace and asterisk.
      ConsumeZeroOrMore<WhitespaceNoNewline>();
      if (TryConsume('*')) {
        if (TryConsume('/')) {
          // End of comment.
          break;
        }
      }

      if (content != NULL) RecordTo(content);
    } else if (TryConsume('*') && TryConsume('/')) {
      // End of comment.
      if (content != NULL) {
        StopRecording();
        // Strip trailing "*/".
        content->erase(content->size() - 2);
      }
      break;
    } else if (TryConsume('/') && current_char_ == '*') {
      // Note:  We didn't consume the '*' because if there is a '/' after it
      //   we want to interpret that as the end of the comment.
      AddError(
        "\"/*\" inside block comment.  Block comments cannot be nested.");
    } else if (current_char_ == '\0') {
      AddError("End-of-file inside block comment.");
      error_collector_->AddError(
        start_line, start_column, "  Comment started here.");
      if (content != NULL) StopRecording();
      break;
    }
  }
}

// If we are at the start of a comment (per comment_style_), consume the
// comment introducer and report which kind it was.  If we consumed a lone
// '/' that turned out not to start a comment, current_ is filled in with the
// '/' symbol token and SLASH_NOT_COMMENT is returned.
Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() {
  if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) {
    if (TryConsume('/')) {
      return LINE_COMMENT;
    } else if (TryConsume('*')) {
      return BLOCK_COMMENT;
    } else {
      // Oops, it was just a slash.  Return it.
      current_.type = TYPE_SYMBOL;
      current_.text = "/";
      current_.line = line_;
      current_.column = column_ - 1;
      current_.end_column = column_;
      return SLASH_NOT_COMMENT;
    }
  } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) {
    return LINE_COMMENT;
  } else {
    return NO_COMMENT;
  }
}

// -------------------------------------------------------------------

// Advances to the next token, skipping whitespace and comments.  Returns
// false at end of input, in which case current_ is set to a TYPE_END token.
bool Tokenizer::Next() {
  previous_ = current_;

  while (!read_error_) {
    ConsumeZeroOrMore<Whitespace>();

    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(NULL);
        continue;
      case BLOCK_COMMENT:
        ConsumeBlockComment(NULL);
        continue;
      case SLASH_NOT_COMMENT:
        // TryConsumeCommentStart() already filled in current_ as a '/'
        // symbol token.
        return true;
      case NO_COMMENT:
        break;
    }

    // Check for EOF before continuing.
    if (read_error_) break;

    if (LookingAt<Unprintable>() || current_char_ == '\0') {
      AddError("Invalid control characters encountered in text.");
      NextChar();
      // Skip more unprintable characters, too.  But, remember that '\0' is
      // also what current_char_ is set to after EOF / read error.  We have
      // to be careful not to go into an infinite loop of trying to consume
      // it, so make sure to check read_error_ explicitly before consuming
      // '\0'.
      while (TryConsumeOne<Unprintable>() ||
             (!read_error_ && TryConsume('\0'))) {
        // Ignore.
      }

    } else {
      // Reading some sort of token.
      StartToken();

      if (TryConsumeOne<Letter>()) {
        ConsumeZeroOrMore<Alphanumeric>();
        current_.type = TYPE_IDENTIFIER;
      } else if (TryConsume('0')) {
        current_.type = ConsumeNumber(true, false);
      } else if (TryConsume('.')) {
        // This could be the beginning of a floating-point number, or it could
        // just be a '.' symbol.

        if (TryConsumeOne<Digit>()) {
          // It's a floating-point number.
          if (previous_.type == TYPE_IDENTIFIER &&
              current_.line == previous_.line &&
              current_.column == previous_.end_column) {
            // We don't accept syntax like "blah.123".
            error_collector_->AddError(line_, column_ - 2,
              "Need space between identifier and decimal point.");
          }
          current_.type = ConsumeNumber(false, true);
        } else {
          current_.type = TYPE_SYMBOL;
        }
      } else if (TryConsumeOne<Digit>()) {
        current_.type = ConsumeNumber(false, false);
      } else if (TryConsume('\"')) {
        ConsumeString('\"');
        current_.type = TYPE_STRING;
      } else if (TryConsume('\'')) {
        ConsumeString('\'');
        current_.type = TYPE_STRING;
      } else {
        // Any other single character is its own symbol token.
        NextChar();
        current_.type = TYPE_SYMBOL;
      }

      EndToken();
      return true;
    }
  }

  // EOF
  current_.type = TYPE_END;
  current_.text.clear();
  current_.line = line_;
  current_.column = column_;
  current_.end_column = column_;
  return false;
}

namespace {

// Helper class for collecting comments and putting them in the right places.
//
// This basically just buffers the most recent comment until it can be decided
// exactly where that comment should be placed.  When Flush() is called, the
// current comment goes into either prev_trailing_comments or detached_comments.
// When the CommentCollector is destroyed, the last buffered comment goes into
// next_leading_comments.
class CommentCollector {
 public:
  // Any of the three output pointers may be NULL if the caller is not
  // interested in that category of comment; all non-NULL outputs are
  // cleared up front.
  CommentCollector(string* prev_trailing_comments,
                   vector<string>* detached_comments,
                   string* next_leading_comments)
      : prev_trailing_comments_(prev_trailing_comments),
        detached_comments_(detached_comments),
        next_leading_comments_(next_leading_comments),
        has_comment_(false),
        is_line_comment_(false),
        can_attach_to_prev_(true) {
    if (prev_trailing_comments != NULL) prev_trailing_comments->clear();
    if (detached_comments != NULL) detached_comments->clear();
    if (next_leading_comments != NULL) next_leading_comments->clear();
  }

  ~CommentCollector() {
    // Whatever is in the buffer is a leading comment.
    if (next_leading_comments_ != NULL && has_comment_) {
      comment_buffer_.swap(*next_leading_comments_);
    }
  }

  // About to read a line comment.  Get the comment buffer pointer in order to
  // read into it.
  string* GetBufferForLineComment() {
    // We want to combine with previous line comments, but not block comments.
    if (has_comment_ && !is_line_comment_) {
      Flush();
    }
    has_comment_ = true;
    is_line_comment_ = true;
    return &comment_buffer_;
  }

  // About to read a block comment.  Get the comment buffer pointer in order to
  // read into it.
  string* GetBufferForBlockComment() {
    // Block comments never merge with a previously-buffered comment.
    if (has_comment_) {
      Flush();
    }
    has_comment_ = true;
    is_line_comment_ = false;
    return &comment_buffer_;
  }

  // Discard whatever is currently buffered.
  void ClearBuffer() {
    comment_buffer_.clear();
    has_comment_ = false;
  }

  // Called once we know that the comment buffer is complete and is *not*
  // connected to the next token.
  void Flush() {
    if (has_comment_) {
      if (can_attach_to_prev_) {
        // The first flushed comment trails the previous token; anything
        // after that is detached.
        if (prev_trailing_comments_ != NULL) {
          prev_trailing_comments_->append(comment_buffer_);
        }
        can_attach_to_prev_ = false;
      } else {
        if (detached_comments_ != NULL) {
          detached_comments_->push_back(comment_buffer_);
        }
      }
      ClearBuffer();
    }
  }

  // Future comments can no longer be trailing comments of the previous token.
  void DetachFromPrev() {
    can_attach_to_prev_ = false;
  }

 private:
  string* prev_trailing_comments_;
  vector<string>* detached_comments_;
  string* next_leading_comments_;

  string comment_buffer_;

  // True if any comments were read into comment_buffer_.  This can be true even
  // if comment_buffer_ is empty, namely if the comment was "/**/".
  bool has_comment_;

  // Is the comment in the comment buffer a line comment?
  bool is_line_comment_;

  // Is it still possible that we could be reading a comment attached to the
  // previous token?
  bool can_attach_to_prev_;
};

}  // namespace

// Like Next(), but also gathers the comments between the previous token and
// the new one, classifying each as trailing the previous token, detached, or
// leading the next token.
bool Tokenizer::NextWithComments(string* prev_trailing_comments,
                                 vector<string>* detached_comments,
                                 string* next_leading_comments) {
  CommentCollector collector(prev_trailing_comments, detached_comments,
                             next_leading_comments);

  if (current_.type == TYPE_START) {
    // At the start of input there is no previous token to attach to.
    collector.DetachFromPrev();
  } else {
    // A comment appearing on the same line must be attached to the previous
    // declaration.
    ConsumeZeroOrMore<WhitespaceNoNewline>();
    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(collector.GetBufferForLineComment());

        // Don't allow comments on subsequent lines to be attached to a trailing
        // comment.
        collector.Flush();
        break;
      case BLOCK_COMMENT:
        ConsumeBlockComment(collector.GetBufferForBlockComment());

        ConsumeZeroOrMore<WhitespaceNoNewline>();
        if (!TryConsume('\n')) {
          // Oops, the next token is on the same line.  If we recorded a comment
          // we really have no idea which token it should be attached to.
          collector.ClearBuffer();
          return Next();
        }

        // Don't allow comments on subsequent lines to be attached to a trailing
        // comment.
        collector.Flush();
        break;
      case SLASH_NOT_COMMENT:
        return true;
      case NO_COMMENT:
        if (!TryConsume('\n')) {
          // The next token is on the same line.  There are no comments.
          return Next();
        }
        break;
    }
  }

  // OK, we are now on the line *after* the previous token.
  while (true) {
    ConsumeZeroOrMore<WhitespaceNoNewline>();

    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(collector.GetBufferForLineComment());
        break;
      case BLOCK_COMMENT:
        ConsumeBlockComment(collector.GetBufferForBlockComment());

        // Consume the rest of the line so that we don't interpret it as a
        // blank line the next time around the loop.
        ConsumeZeroOrMore<WhitespaceNoNewline>();
        TryConsume('\n');
        break;
      case SLASH_NOT_COMMENT:
        return true;
      case NO_COMMENT:
        if (TryConsume('\n')) {
          // Completely blank line.
          collector.Flush();
          collector.DetachFromPrev();
        } else {
          bool result = Next();
          if (!result ||
              current_.text == "}" ||
              current_.text == "]" ||
              current_.text == ")") {
            // It looks like we're at the end of a scope.  In this case it
            // makes no sense to attach a comment to the following token.
            collector.Flush();
          }
          return result;
        }
        break;
    }
  }
}

// -------------------------------------------------------------------
// Token-parsing helpers.  Remember that these don't need to report
// errors since any errors should already have been reported while
// tokenizing.  Also, these can assume that whatever text they
// are given is text that the tokenizer actually parsed as a token
// of the given type.

// Parses decimal, hex ("0x"), or octal (leading "0") integer text into
// *output.  Returns false if the value would exceed max_value.
bool Tokenizer::ParseInteger(const string& text, uint64 max_value,
                             uint64* output) {
  // Sadly, we can't just use strtoul() since it is only 32-bit and strtoull()
  // is non-standard.  I hate the C standard library.  :(

//   return strtoull(text.c_str(), NULL, 0);

  const char* ptr = text.c_str();
  int base = 10;
  if (ptr[0] == '0') {
    if (ptr[1] == 'x' || ptr[1] == 'X') {
      // This is hex.
      base = 16;
      ptr += 2;
    } else {
      // This is octal.
      base = 8;
    }
  }

  uint64 result = 0;
  for (; *ptr != '\0'; ptr++) {
    int digit = DigitValue(*ptr);
    GOOGLE_LOG_IF(DFATAL, digit < 0 || digit >= base)
      << " Tokenizer::ParseInteger() passed text that could not have been"
         " tokenized as an integer: " << CEscape(text);
    // Detect overflow before it happens: fail if result * base + digit
    // would exceed max_value.
    if (digit > max_value || result > (max_value - digit) / base) {
      // Overflow.
      return false;
    }
    result = result * base + digit;
  }

  *output = result;
  return true;
}

// Parses floating-point token text.  Accepts anything the tokenizer could
// have returned as TYPE_FLOAT, including malformed text (e.g. "1e") that was
// already reported as an error during tokenizing.
double Tokenizer::ParseFloat(const string& text) {
  const char* start = text.c_str();
  char* end;
  double result = NoLocaleStrtod(start, &end);

  // "1e" is not a valid float, but if the tokenizer reads it, it will
  // report an error but still return it as a valid token.  We need to
  // accept anything the tokenizer could possibly return, error or not.
  if (*end == 'e' || *end == 'E') {
    ++end;
    if (*end == '-' || *end == '+') ++end;
  }

  // If the Tokenizer had allow_f_after_float_ enabled, the float may be
  // suffixed with the letter 'f'.
  if (*end == 'f' || *end == 'F') {
    ++end;
  }

  GOOGLE_LOG_IF(DFATAL, end - start != text.size() || *start == '-')
    << " Tokenizer::ParseFloat() passed text that could not have been"
       " tokenized as a float: " << CEscape(text);
  return result;
}

// Helper to append a Unicode code point to a string as UTF8, without bringing
// in any external dependencies.
static void AppendUTF8(uint32 code_point, string* output) {
  uint32 tmp = 0;
  int len = 0;
  if (code_point <= 0x7f) {
    // One byte: the code point as-is.
    tmp = code_point;
    len = 1;
  } else if (code_point <= 0x07ff) {
    // Two bytes: 110xxxxx 10xxxxxx.
    tmp = 0x0000c080 |
        ((code_point & 0x07c0) << 2) |
        (code_point & 0x003f);
    len = 2;
  } else if (code_point <= 0xffff) {
    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
    tmp = 0x00e08080 |
        ((code_point & 0xf000) << 4) |
        ((code_point & 0x0fc0) << 2) |
        (code_point & 0x003f);
    len = 3;
  } else if (code_point <= 0x1fffff) {
    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    tmp = 0xf0808080 |
        ((code_point & 0x1c0000) << 6) |
        ((code_point & 0x03f000) << 4) |
        ((code_point & 0x000fc0) << 2) |
        (code_point & 0x003f);
    len = 4;
  } else {
    // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
    // normally only defined up to there as well.
    StringAppendF(output, "\\U%08x", code_point);
    return;
  }
  // Write the encoded bytes in big-endian order, taking only the low |len|
  // bytes of the network-order word.
  tmp = ghtonl(tmp);
  output->append(reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
}

// Try to read <len> hex digits from ptr, and stuff the numeric result into
// *result.  Returns true if that many digits were successfully consumed.
// Note: the characters are assumed to be valid hex digits (which the
// tokenizer presumably guaranteed); only a premature '\0' terminator is
// detected here.
static bool ReadHexDigits(const char* ptr, int len, uint32* result) {
  *result = 0;
  if (len == 0) return false;
  for (const char* end = ptr + len; ptr < end; ++ptr) {
    if (*ptr == '\0') return false;
    *result = (*result << 4) + DigitValue(*ptr);
  }
  return true;
}

// Handling UTF-16 surrogate pairs.  UTF-16 encodes code points in the range
// 0x10000...0x10ffff as a pair of numbers, a head surrogate followed by a trail
// surrogate.  These numbers are in a reserved range of Unicode code points, so
// if we encounter such a pair we know how to parse it and convert it into a
// single code point.
// Note: the "max" constants are exclusive upper bounds.
static const uint32 kMinHeadSurrogate = 0xd800;
static const uint32 kMaxHeadSurrogate = 0xdc00;
static const uint32 kMinTrailSurrogate = 0xdc00;
static const uint32 kMaxTrailSurrogate = 0xe000;

static inline bool IsHeadSurrogate(uint32 code_point) {
  return (code_point >= kMinHeadSurrogate) && (code_point < kMaxHeadSurrogate);
}

static inline bool IsTrailSurrogate(uint32 code_point) {
  return (code_point >= kMinTrailSurrogate) &&
      (code_point < kMaxTrailSurrogate);
}

// Combine a head and trail surrogate into a single Unicode code point.
static uint32 AssembleUTF16(uint32 head_surrogate, uint32 trail_surrogate) {
  GOOGLE_DCHECK(IsHeadSurrogate(head_surrogate));
  GOOGLE_DCHECK(IsTrailSurrogate(trail_surrogate));
  return 0x10000 + (((head_surrogate - kMinHeadSurrogate) << 10) |
      (trail_surrogate - kMinTrailSurrogate));
}

// Convert the escape sequence parameter to a number of expected hex digits.
static inline int UnicodeLength(char key) {
  if (key == 'u') return 4;
  if (key == 'U') return 8;
  return 0;
}

// Given a pointer to the 'u' or 'U' starting a Unicode escape sequence, attempt
// to parse that sequence.  On success, returns a pointer to the first char
// beyond that sequence, and fills in *code_point.  On failure, returns ptr
// itself.
static const char* FetchUnicodePoint(const char* ptr, uint32* code_point) {
  const char* p = ptr;
  // Fetch the code point.
  const int len = UnicodeLength(*p++);
  if (!ReadHexDigits(p, len, code_point))
    return ptr;
  p += len;

  // Check if the code point we read is a "head surrogate." If so, then we
  // expect it to be immediately followed by another code point which is a valid
  // "trail surrogate," and together they form a UTF-16 pair which decodes into
  // a single Unicode point.  Trail surrogates may only use \u, not \U.
  if (IsHeadSurrogate(*code_point) && *p == '\\' && *(p + 1) == 'u') {
    uint32 trail_surrogate;
    if (ReadHexDigits(p + 2, 4, &trail_surrogate) &&
        IsTrailSurrogate(trail_surrogate)) {
      *code_point = AssembleUTF16(*code_point, trail_surrogate);
      p += 6;  // Skip the "\uXXXX" of the trail surrogate.
    }
    // If this failed, then we just emit the head surrogate as a code point.
    // It's bogus, but so is the string.
  }

  return p;
}

// The text string must begin and end with single or double quote
// characters.
void Tokenizer::ParseStringAppend(const string& text, string* output) {
  // Reminder: text[0] is always a quote character.  (If text is
  // empty, it's invalid, so we'll just return.)
  const size_t text_size = text.size();
  if (text_size == 0) {
    GOOGLE_LOG(DFATAL)
      << " Tokenizer::ParseStringAppend() passed text that could not"
         " have been tokenized as a string: " << CEscape(text);
    return;
  }

  // Reserve room for new string.  The branch is necessary because if
  // there is already space available the reserve() call might
  // downsize the output.
  const size_t new_len = text_size + output->size();
  if (new_len > output->capacity()) {
    output->reserve(new_len);
  }

  // Loop through the string copying characters to "output" and
  // interpreting escape sequences.  Note that any invalid escape
  // sequences or other errors were already reported while tokenizing.
  // In this case we do not need to produce valid results.
  for (const char* ptr = text.c_str() + 1; *ptr != '\0'; ptr++) {
    if (*ptr == '\\' && ptr[1] != '\0') {
      // An escape sequence.
      ++ptr;

      if (OctalDigit::InClass(*ptr)) {
        // An octal escape.  May one, two, or three digits.
        int code = DigitValue(*ptr);
        if (OctalDigit::InClass(ptr[1])) {
          ++ptr;
          code = code * 8 + DigitValue(*ptr);
        }
        if (OctalDigit::InClass(ptr[1])) {
          ++ptr;
          code = code * 8 + DigitValue(*ptr);
        }
        output->push_back(static_cast<char>(code));

      } else if (*ptr == 'x') {
        // A hex escape.  May zero, one, or two digits.  (The zero case
        // will have been caught as an error earlier.)
        int code = 0;
        if (HexDigit::InClass(ptr[1])) {
          ++ptr;
          code = DigitValue(*ptr);
        }
        if (HexDigit::InClass(ptr[1])) {
          ++ptr;
          code = code * 16 + DigitValue(*ptr);
        }
        output->push_back(static_cast<char>(code));

      } else if (*ptr == 'u' || *ptr == 'U') {
        // A Unicode escape; emit it as UTF-8.
        uint32 unicode;
        const char* end = FetchUnicodePoint(ptr, &unicode);
        if (end == ptr) {
          // Failure: Just dump out what we saw, don't try to parse it.
          output->push_back(*ptr);
        } else {
          AppendUTF8(unicode, output);
          ptr = end - 1;  // Because we're about to ++ptr.
        }
      } else {
        // Some other escape code.
        output->push_back(TranslateEscape(*ptr));
      }

    } else if (*ptr == text[0] && ptr[1] == '\0') {
      // Ignore final quote matching the starting quote.
    } else {
      output->push_back(*ptr);
    }
  }
}

}  // namespace io
}  // namespace protobuf
}  // namespace google