// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.

#include <limits.h>
#include <math.h>

#include <vector>

#include <google/protobuf/io/tokenizer.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>

#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/stubs/substitute.h>
#include <google/protobuf/testing/googletest.h>
#include <gtest/gtest.h>

namespace google {
namespace protobuf {
namespace io {
namespace {

// ===================================================================
// Data-Driven Test Infrastructure

// TODO(kenton):  This is copied from coded_stream_unittest.  This is
//   temporary until these features are integrated into gTest itself.

// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gTest.  These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array.  TEST_1D
// tests all cases in a single input array.  TEST_2D tests all
// combinations of cases from two arrays.  The arrays must be statically
// defined such that the GOOGLE_ARRAYSIZE() macro works on them.  Example:
//
//   int kCases[] = {1, 2, 3, 4};
//   TEST_1D(MyFixture, MyTest, kCases) {
//     EXPECT_GT(kCases_case, 0);
//   }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero.  In case of failure, the exact case
// which failed will be printed.  The case type must be printable using
// ostream::operator<<.
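//
// TEST_2D works the same way, but iterates over every combination of cases
// from two arrays.  As a purely illustrative sketch (kNames below is a
// hypothetical array, not one defined in this file), a TEST_2D test might
// look like:
//
//   const char* kNames[] = {"foo", "bar"};
//   TEST_2D(MyFixture, MyCombinedTest, kCases, kNames) {
//     EXPECT_GT(kCases_case, 0);
//     EXPECT_FALSE(string(kNames_case).empty());
//   }
//
// The body runs once for each (kCases[i], kNames[j]) pair, and each case
// value is exposed to the body under the array's name with a "_case" suffix.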

#define TEST_1D(FIXTURE, NAME, CASES)                                      \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType>                                           \
    void DoSingleCase(const CaseType& CASES##_case);                       \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) {                    \
      SCOPED_TRACE(testing::Message()                                      \
                   << #CASES " case #" << i << ": " << CASES[i]);          \
      DoSingleCase(CASES[i]);                                              \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType>                                             \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)

#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                             \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType1, typename CaseType2>                      \
    void DoSingleCase(const CaseType1& CASES1##_case,                      \
                      const CaseType2& CASES2##_case);                     \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) {                   \
      for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) {                 \
        SCOPED_TRACE(testing::Message()                                    \
                     << #CASES1 " case #" << i << ": " << CASES1[i]        \
                     << ", " << #CASES2 " case #" << j << ": "             \
                     << CASES2[j]);                                        \
        DoSingleCase(CASES1[i], CASES2[j]);                                \
      }                                                                    \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType1, typename CaseType2>                        \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case, \
                                           const CaseType2& CASES2##_case)

// -------------------------------------------------------------------

// An input stream that is basically like an ArrayInputStream but sometimes
// returns empty buffers, just to throw us off.
class TestInputStream : public ZeroCopyInputStream {
 public:
  TestInputStream(const void* data, int size, int block_size)
      : array_stream_(data, size, block_size), counter_(0) {}
  ~TestInputStream() {}

  // implements ZeroCopyInputStream ----------------------------------
  bool Next(const void** data, int* size) {
    // We'll return empty buffers starting with the first buffer, and every
    // 3 and 5 buffers after that.
    if (counter_ % 3 == 0 || counter_ % 5 == 0) {
      *data = NULL;
      *size = 0;
      ++counter_;
      return true;
    } else {
      ++counter_;
      return array_stream_.Next(data, size);
    }
  }

  void BackUp(int count) { return array_stream_.BackUp(count); }
  bool Skip(int count) { return array_stream_.Skip(count); }
  int64 ByteCount() const { return array_stream_.ByteCount(); }

 private:
  ArrayInputStream array_stream_;
  int counter_;
};

// -------------------------------------------------------------------

// An error collector which simply concatenates all its errors into a big
// block of text which can be checked.
class TestErrorCollector : public ErrorCollector {
 public:
  TestErrorCollector() {}
  ~TestErrorCollector() {}

  string text_;

  // implements ErrorCollector ---------------------------------------
  void AddError(int line, int column, const string& message) {
    strings::SubstituteAndAppend(&text_, "$0:$1: $2\n",
                                 line, column, message);
  }
};

// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't.  This is sort of a brute-force approach to this,
// but it's easy to write and easy to understand.
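// Note that a block size of 1 forces every multi-character token to span
// several Next() calls, and TestInputStream additionally injects empty
// reads, so the tokenizer's buffer-boundary handling is exercised at
// essentially every character position.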
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};

class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.
  uint64 ParseInteger(const string& text) {
    uint64 result;
    EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
    return result;
  }
};

// ===================================================================

// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  string input;
  Tokenizer::TokenType type;
};

inline ostream& operator<<(ostream& out,
                           const SimpleTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

SimpleTokenCase kSimpleTokenCases[] = {
  // Test identifiers.
  { "hello",       Tokenizer::TYPE_IDENTIFIER },

  // Test integers.
  { "123",         Tokenizer::TYPE_INTEGER },
  { "0xab6",       Tokenizer::TYPE_INTEGER },
  { "0XAB6",       Tokenizer::TYPE_INTEGER },
  { "0X1234567",   Tokenizer::TYPE_INTEGER },
  { "0x89abcdef",  Tokenizer::TYPE_INTEGER },
  { "0x89ABCDEF",  Tokenizer::TYPE_INTEGER },
  { "01234567",    Tokenizer::TYPE_INTEGER },

  // Test floats.
  { "123.45",      Tokenizer::TYPE_FLOAT },
  { "1.",          Tokenizer::TYPE_FLOAT },
  { "1e3",         Tokenizer::TYPE_FLOAT },
  { "1E3",         Tokenizer::TYPE_FLOAT },
  { "1e-3",        Tokenizer::TYPE_FLOAT },
  { "1e+3",        Tokenizer::TYPE_FLOAT },
  { "1.e3",        Tokenizer::TYPE_FLOAT },
  { "1.2e3",       Tokenizer::TYPE_FLOAT },
  { ".1",          Tokenizer::TYPE_FLOAT },
  { ".1e3",        Tokenizer::TYPE_FLOAT },
  { ".1e-3",       Tokenizer::TYPE_FLOAT },
  { ".1e+3",       Tokenizer::TYPE_FLOAT },

  // Test strings.
  { "'hello'",     Tokenizer::TYPE_STRING },
  { "\"foo\"",     Tokenizer::TYPE_STRING },
  { "'a\"b'",      Tokenizer::TYPE_STRING },
  { "\"a'b\"",     Tokenizer::TYPE_STRING },
  { "'a\\'b'",     Tokenizer::TYPE_STRING },
  { "\"a\\\"b\"",  Tokenizer::TYPE_STRING },
  { "'\\xf'",      Tokenizer::TYPE_STRING },
  { "'\\0'",       Tokenizer::TYPE_STRING },

  // Test symbols.
  { "+",           Tokenizer::TYPE_SYMBOL },
  { ".",           Tokenizer::TYPE_SYMBOL },
};

TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kSimpleTokenCases_case.input.data(),
                        kSimpleTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Parse the token.
  ASSERT_TRUE(tokenizer.Next());

  // Check that it has the right type.
  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
  // Check that it contains the complete input text.
  EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
  // Check that it is located at the beginning of the input.
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());

  // After Next() returns false, the token should have type TYPE_END.
  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option.

  // Set up the tokenizer.
  const char* text = "1f 2.5f 6e3f 7F";
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_allow_f_after_float(true);

  // Advance through tokens and check that they are parsed as expected.
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "1f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "2.5f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "6e3f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "7F");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is parsed to produce a list of tokens.  The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  string input;
  Tokenizer::Token output[10];  // The compiler wants a constant array
                                // size for initialization to work.  There
                                // is no reason this can't be increased if
                                // needed.
};

inline ostream& operator<<(ostream& out,
                           const MultiTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

MultiTokenCase kMultiTokenCases[] = {
  // Test empty input.
  { "", {
    { Tokenizer::TYPE_END       , ""     , 0, 0 },
  }},

  // Test all token types at the same time.
  { "foo 1 1.2 + 'bar'", {
    { Tokenizer::TYPE_IDENTIFIER, "foo"  , 0,  0,  3 },
    { Tokenizer::TYPE_INTEGER   , "1"    , 0,  4,  5 },
    { Tokenizer::TYPE_FLOAT     , "1.2"  , 0,  6,  9 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 10, 11 },
    { Tokenizer::TYPE_STRING    , "'bar'", 0, 12, 17 },
    { Tokenizer::TYPE_END       , ""     , 0, 17, 17 },
  }},

  // Test that consecutive symbols are parsed as separate tokens.
  { "!@+%", {
    { Tokenizer::TYPE_SYMBOL    , "!"  , 0, 0, 1 },
    { Tokenizer::TYPE_SYMBOL    , "@"  , 0, 1, 2 },
    { Tokenizer::TYPE_SYMBOL    , "+"  , 0, 2, 3 },
    { Tokenizer::TYPE_SYMBOL    , "%"  , 0, 3, 4 },
    { Tokenizer::TYPE_END       , ""   , 0, 4, 4 },
  }},

  // Test that newlines affect line numbers correctly.
  { "foo bar\nrab oof", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 4, 7 },
    { Tokenizer::TYPE_IDENTIFIER, "rab", 1, 0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "oof", 1, 4, 7 },
    { Tokenizer::TYPE_END       , ""   , 1, 7, 7 },
  }},

  // Test that tabs affect column numbers correctly.
  { "foo\tbar \tbaz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  8, 11 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16, 19 },
    { Tokenizer::TYPE_END       , ""   , 0, 19, 19 },
  }},

  // Test that tabs in string literals affect column numbers correctly.
  { "\"foo\tbar\" baz", {
    { Tokenizer::TYPE_STRING    , "\"foo\tbar\"", 0,  0, 12 },
    { Tokenizer::TYPE_IDENTIFIER, "baz"         , 0, 13, 16 },
    { Tokenizer::TYPE_END       , ""            , 0, 16, 16 },
  }},

  // Test that line comments are ignored.
  { "foo // This is a comment\n"
    "bar // This is another comment", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1,  0,  3 },
    { Tokenizer::TYPE_END       , ""   , 1, 30, 30 },
  }},

  // Test that block comments are ignored.
  { "foo /* This is a block comment */ bar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34, 37 },
    { Tokenizer::TYPE_END       , ""   , 0, 37, 37 },
  }},

  // Test that sh-style comments are not ignored by default.
  { "foo # bar\n"
    "baz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
    { Tokenizer::TYPE_SYMBOL    , "#"  , 0, 4, 5 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6, 9 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0, 3 },
    { Tokenizer::TYPE_END       , ""   , 1, 3, 3 },
  }},

  // Test all whitespace chars.
  { "foo\n\t\r\v\fbar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11, 14 },
    { Tokenizer::TYPE_END       , ""   , 1, 14, 14 },
  }},
};

TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kMultiTokenCases_case.input.data(),
                        kMultiTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Loop through all expected tokens.
  int i = 0;
  Tokenizer::Token token;
  do {
    token = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);

    Tokenizer::Token previous = tokenizer.current();

    // Next() should only return false when it hits the end token.
    if (token.type != Tokenizer::TYPE_END) {
      ASSERT_TRUE(tokenizer.Next());
    } else {
      ASSERT_FALSE(tokenizer.Next());
    }

    // Check that the previous token is set correctly.
    EXPECT_EQ(previous.type, tokenizer.previous().type);
    EXPECT_EQ(previous.text, tokenizer.previous().text);
    EXPECT_EQ(previous.line, tokenizer.previous().line);
    EXPECT_EQ(previous.column, tokenizer.previous().column);
    EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);

    // Check that the token matches the expected one.
    EXPECT_EQ(token.type, tokenizer.current().type);
    EXPECT_EQ(token.text, tokenizer.current().text);
    EXPECT_EQ(token.line, tokenizer.current().line);
    EXPECT_EQ(token.column, tokenizer.current().column);
    EXPECT_EQ(token.end_column, tokenizer.current().end_column);

  } while (token.type != Tokenizer::TYPE_END);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option.

  const char* text = "foo # bar\n"
                     "baz // qux\n"
                     "corge /* grault */\n"
                     "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/", "/", "qux",
                                 "corge", "/", "*", "grault", "*", "/",
                                 "garply"};

  // Set up the tokenizer.
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);

  // Advance through tokens and check that they are parsed as expected.
  for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
    EXPECT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, kTokens[i]);
  }

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is expected to have two tokens named "prev" and
// "next" with comments in between.
struct DocCommentCase {
  string input;

  const char* prev_trailing_comments;
  const char* detached_comments[10];
  const char* next_leading_comments;
};

inline ostream& operator<<(ostream& out,
                           const DocCommentCase& test_case) {
  return out << CEscape(test_case.input);
}

DocCommentCase kDocCommentCases[] = {
  {
    "prev next",

    "",
    {},
    ""
  },

  {
    "prev /* ignored */ next",

    "",
    {},
    ""
  },

  {
    "prev // trailing comment\n"
    "next",

    " trailing comment\n",
    {},
    ""
  },

  {
    "prev\n"
    "// leading comment\n"
    "// line 2\n"
    "next",

    "",
    {},
    " leading comment\n"
    " line 2\n"
  },

  {
    "prev\n"
    "// trailing comment\n"
    "// line 2\n"
    "\n"
    "next",

    " trailing comment\n"
    " line 2\n",
    {},
    ""
  },

  {
    "prev // trailing comment\n"
    "// leading comment\n"
    "// line 2\n"
    "next",

    " trailing comment\n",
    {},
    " leading comment\n"
    " line 2\n"
  },

  {
    "prev /* trailing block comment */\n"
    "/* leading block comment\n"
    " * line 2\n"
    " * line 3 */"
    "next",

    " trailing block comment ",
    {},
    " leading block comment\n"
    " line 2\n"
    " line 3 "
  },

  {
    "prev\n"
    "/* trailing block comment\n"
    " * line 2\n"
    " * line 3\n"
    " */\n"
    "/* leading block comment\n"
    " * line 2\n"
    " * line 3 */"
    "next",

    " trailing block comment\n"
    " line 2\n"
    " line 3\n",
    {},
    " leading block comment\n"
    " line 2\n"
    " line 3 "
  },

  {
    "prev\n"
    "// trailing comment\n"
    "\n"
    "// detached comment\n"
    "// line 2\n"
    "\n"
    "// second detached comment\n"
    "/* third detached comment\n"
    " * line 2 */\n"
    "// leading comment\n"
    "next",

    " trailing comment\n",
    {
      " detached comment\n"
      " line 2\n",
      " second detached comment\n",
      " third detached comment\n"
      " line 2 "
    },
    " leading comment\n"
  },

  {
    "prev /**/\n"
    "\n"
    "// detached comment\n"
    "\n"
    "// leading comment\n"
    "next",

    "",
    {
      " detached comment\n"
    },
    " leading comment\n"
  },

  {
    "prev /**/\n"
    "// leading comment\n"
    "next",

    "",
    {},
    " leading comment\n"
  },
};

TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kDocCommentCases_case.input.data(),
                        kDocCommentCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Set up a second tokenizer where we'll pass all NULLs to NextWithComments().
  TestInputStream input2(kDocCommentCases_case.input.data(),
                         kDocCommentCases_case.input.size(),
                         kBlockSizes_case);
  Tokenizer tokenizer2(&input2, &error_collector);

  tokenizer.Next();
  tokenizer2.Next();

  EXPECT_EQ("prev", tokenizer.current().text);
  EXPECT_EQ("prev", tokenizer2.current().text);

  string prev_trailing_comments;
  vector<string> detached_comments;
  string next_leading_comments;
  tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
                             &next_leading_comments);
  tokenizer2.NextWithComments(NULL, NULL, NULL);
  EXPECT_EQ("next", tokenizer.current().text);
  EXPECT_EQ("next", tokenizer2.current().text);

  EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
            prev_trailing_comments);

  for (int i = 0; i < detached_comments.size(); i++) {
    // Bounds-check against the detached_comments array itself so we never
    // read past its end.
    ASSERT_LT(i, GOOGLE_ARRAYSIZE(kDocCommentCases_case.detached_comments));
    ASSERT_TRUE(kDocCommentCases_case.detached_comments[i] != NULL);
    EXPECT_EQ(kDocCommentCases_case.detached_comments[i],
              detached_comments[i]);
  }

  // Verify that we matched all the detached comments.
  EXPECT_EQ(NULL,
      kDocCommentCases_case.detached_comments[detached_comments.size()]);

  EXPECT_EQ(kDocCommentCases_case.next_leading_comments,
            next_leading_comments);
}

// -------------------------------------------------------------------

// Test parse helpers.  It's not really worth setting up a full data-driven
// test here.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64 i;
#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid integers that will never be tokenized as integers.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("zxy", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("1.2", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("08", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("0xg", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("-1", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
#endif  // PROTOBUF_HAS_DEATH_TEST

  // Test overflows.
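  // ParseInteger()'s second argument is the maximum value to accept; the
  // cases below check that values above that limit are rejected (the call
  // returns false) while values at or below it are accepted.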
  EXPECT_TRUE (Tokenizer::ParseInteger("0", 0, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("1", 1, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("12345", 12345, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF" , kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
}

TEST_F(TokenizerTest, ParseFloat) {
  EXPECT_DOUBLE_EQ(1    , Tokenizer::ParseFloat("1."));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1e3"));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
  EXPECT_DOUBLE_EQ(.1   , Tokenizer::ParseFloat(".1"));
  EXPECT_DOUBLE_EQ(.25  , Tokenizer::ParseFloat(".25"));
  EXPECT_DOUBLE_EQ(.1e3 , Tokenizer::ParseFloat(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
  EXPECT_DOUBLE_EQ(5    , Tokenizer::ParseFloat("5"));
  EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
  EXPECT_DOUBLE_EQ(1.2  , Tokenizer::ParseFloat("1.2"));
  EXPECT_DOUBLE_EQ(1.e2 , Tokenizer::ParseFloat("1.e2"));

  // Test invalid floats that may still be tokenized as floats.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));

  // Test 'f' suffix.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(     0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));

#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid floats that will never be tokenized as floats.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("1-e0"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"),
    "passed text that could not have been tokenized as a float");
#endif  // PROTOBUF_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseString) {
  string output;
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
  EXPECT_EQ("blah\nblah2", output);
  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
  Tokenizer::ParseString("'\\x20\\x4'", &output);
  EXPECT_EQ("\x20\x4", output);

  // Test invalid strings that may still be tokenized as strings.
  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
  EXPECT_EQ("\a?\v\t", output);
  Tokenizer::ParseString("'", &output);
  EXPECT_EQ("", output);
  Tokenizer::ParseString("'\\", &output);
  EXPECT_EQ("\\", output);

  // Experiment with Unicode escapes.  Here are one-, two-, three- and
  // four-byte Unicode characters.
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\U00024b62XX'", &output);
  EXPECT_EQ("$¢€𤭢XX", output);
  // Same thing encoded using UTF16.
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", &output);
  EXPECT_EQ("$¢€𤭢XX", output);
  // Here's some broken UTF16; there's a head surrogate with no tail surrogate.
  // We just output this as if it were UTF8; it's not a defined code point, but
  // it has a defined encoding.
  Tokenizer::ParseString("'\\ud852XX'", &output);
  EXPECT_EQ("\xed\xa1\x92XX", output);
  // Malformed escape: Demons may fly out of the nose.
  Tokenizer::ParseString("\\u0", &output);
  EXPECT_EQ("u0", output);

  // Test invalid strings that will never be tokenized as strings.
#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
    "passed text that could not have been tokenized as a string");
#endif  // PROTOBUF_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseStringAppend) {
  // Check that ParseString and ParseStringAppend differ.
  string output("stuff+");
  Tokenizer::ParseStringAppend("'hello'", &output);
  EXPECT_EQ("stuff+hello", output);
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
}

// -------------------------------------------------------------------

// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  string input;
  bool recoverable;  // True if the tokenizer should be able to recover and
                     // parse more tokens after seeing this error.  Cases
                     // for which this is true must end with "foo" as
                     // the last token, which the test will check for.
  const char* errors;
};

inline ostream& operator<<(ostream& out,
                           const ErrorCase& test_case) {
  return out << CEscape(test_case.input);
}

ErrorCase kErrorCases[] = {
  // String errors.
  { "'\\l' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\x' foo", true,
    "0:3: Expected hex digits for escape sequence.\n" },
  { "'foo", false,
    "0:4: Unexpected end of string.\n" },
  { "'bar\nfoo", true,
    "0:4: String literals cannot cross line boundaries.\n" },
  { "'\\u01' foo", true,
    "0:5: Expected four hex digits for \\u escape sequence.\n" },
  { "'\\u01' foo", true,
    "0:5: Expected four hex digits for \\u escape sequence.\n" },
  { "'\\uXYZ' foo", true,
    "0:3: Expected four hex digits for \\u escape sequence.\n" },

  // Integer errors.
  { "123foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Hex/octal errors.
  { "0x foo", true,
    "0:2: \"0x\" must be followed by hex digits.\n" },
  { "0541823 foo", true,
    "0:4: Numbers starting with leading zero must be in octal.\n" },
  { "0x123z foo", true,
    "0:5: Need space between number and identifier.\n" },
  { "0x123.4 foo", true,
    "0:5: Hex and octal numbers must be integers.\n" },
  { "0123.4 foo", true,
    "0:4: Hex and octal numbers must be integers.\n" },

  // Float errors.
  { "1e foo", true,
    "0:2: \"e\" must be followed by exponent.\n" },
  { "1e- foo", true,
    "0:3: \"e\" must be followed by exponent.\n" },
  { "1.2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "1e2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "a.1 foo", true,
    "0:1: Need space between identifier and decimal point.\n" },
  // allow_f_after_float not enabled, so this should be an error.
  { "1.0f foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Block comment errors.
  { "/*", false,
    "0:2: End-of-file inside block comment.\n"
    "0:0: Comment started here.\n" },
  { "/*/*/ foo", true,
    "0:3: \"/*\" inside block comment.  Block comments cannot be nested.\n" },

  // Control characters.  Multiple consecutive control characters should only
  // produce one error.
  { "\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },
  { "\b\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check that control characters at end of input don't result in an
  // infinite loop.
  { "\b", false,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check recovery from '\0'.  We have to explicitly specify the length of
  // these strings because otherwise the string constructor will just call
  // strlen() which will see the first '\0' and think that is the end of the
  // string.
  { string("\0foo", 4), true,
    "0:0: Invalid control characters encountered in text.\n" },
  { string("\0\0foo", 5), true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check error from high order bits set.
  { "\300foo", true,
    "0:0: Interpreting non ascii codepoint 192.\n" },
};

TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kErrorCases_case.input.data(),
                        kErrorCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Ignore all input, except remember if the last token was "foo".
  bool last_was_foo = false;
  while (tokenizer.Next()) {
    last_was_foo = tokenizer.current().text == "foo";
  }

  // Check that the errors match what was expected.
  EXPECT_EQ(kErrorCases_case.errors, error_collector.text_);

  // If the error was recoverable, make sure we saw "foo" after it.
  if (kErrorCases_case.recoverable) {
    EXPECT_TRUE(last_was_foo);
  }
}

// -------------------------------------------------------------------

TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  string text = "foo bar";
  TestInputStream input(text.data(), text.size(), kBlockSizes_case);

  // Create a tokenizer, read one token, then destroy it.
  {
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);

    tokenizer.Next();
  }

  // Only "foo" should have been read.
  EXPECT_EQ(strlen("foo"), input.ByteCount());
}

}  // namespace
}  // namespace io
}  // namespace protobuf
}  // namespace google