// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// http://code.google.com/p/protobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
// Based on original Protocol Buffers design by
// Sanjay Ghemawat, Jeff Dean, and others.

#include <vector>
#include <math.h>
#include <limits.h>

#include <google/protobuf/io/tokenizer.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>

#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/stubs/substitute.h>
#include <google/protobuf/testing/googletest.h>
#include <gtest/gtest.h>

namespace google {
namespace protobuf {
namespace io {
namespace {

// ===================================================================
// Data-Driven Test Infrastructure

// TODO(kenton):  This is copied from coded_stream_unittest.  This is
//   temporary until these features are integrated into gTest itself.

// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gTest.  These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array.  TEST_1D
// tests all cases in a single input array.  TEST_2D tests all
// combinations of cases from two arrays.  The arrays must be statically
// defined such that the GOOGLE_ARRAYSIZE() macro works on them.  Example:
//
// int kCases[] = {1, 2, 3, 4};
// TEST_1D(MyFixture, MyTest, kCases) {
//   EXPECT_GT(kCases_case, 0);
// }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero.  In case of failure, the exact case
// which failed will be printed.  The case type must be printable using
// ostream::operator<<.
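//
// Similarly, a TEST_2D body runs once for every pair drawn from two
// arrays.  An illustrative (hypothetical) usage, with made-up names
// MyFixture, kCases1, and kCases2:
//
// int kCases1[] = {1, 2};
// int kCases2[] = {10, 20};
// TEST_2D(MyFixture, MyPairTest, kCases1, kCases2) {
//   EXPECT_LT(kCases1_case, kCases2_case);
// }
//
// The body sees the current items as kCases1_case and kCases2_case, and
// on failure SCOPED_TRACE prints exactly which pair failed.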

#define TEST_1D(FIXTURE, NAME, CASES)                                     \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                          \
   protected:                                                             \
    template <typename CaseType>                                          \
    void DoSingleCase(const CaseType& CASES##_case);                      \
  };                                                                      \
                                                                          \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                   \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) {                   \
      SCOPED_TRACE(testing::Message()                                     \
                   << #CASES " case #" << i << ": " << CASES[i]);         \
      DoSingleCase(CASES[i]);                                             \
    }                                                                     \
  }                                                                       \
                                                                          \
  template <typename CaseType>                                            \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)

#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                             \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType1, typename CaseType2>                      \
    void DoSingleCase(const CaseType1& CASES1##_case,                      \
                      const CaseType2& CASES2##_case);                     \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) {                   \
      for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) {                 \
        SCOPED_TRACE(testing::Message()                                    \
                     << #CASES1 " case #" << i << ": " << CASES1[i]        \
                     << ", " #CASES2 " case #" << j << ": " << CASES2[j]); \
        DoSingleCase(CASES1[i], CASES2[j]);                                \
      }                                                                    \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType1, typename CaseType2>                        \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case, \
                                           const CaseType2& CASES2##_case)

// -------------------------------------------------------------------

// An input stream that is basically like an ArrayInputStream but sometimes
// returns empty buffers, just to throw us off.
class TestInputStream : public ZeroCopyInputStream {
 public:
  TestInputStream(const void* data, int size, int block_size)
    : array_stream_(data, size, block_size), counter_(0) {}
  ~TestInputStream() {}

  // implements ZeroCopyInputStream ----------------------------------
  bool Next(const void** data, int* size) {
    // We'll return empty buffers starting with the first buffer, and again
    // every time the buffer index is divisible by 3 or 5.
    if (counter_ % 3 == 0 || counter_ % 5 == 0) {
      *data = NULL;
      *size = 0;
      ++counter_;
      return true;
    } else {
      ++counter_;
      return array_stream_.Next(data, size);
    }
  }

  void BackUp(int count)  { return array_stream_.BackUp(count); }
  bool Skip(int count)    { return array_stream_.Skip(count); }
  int64 ByteCount() const { return array_stream_.ByteCount(); }

 private:
  ArrayInputStream array_stream_;
  int counter_;
};

// -------------------------------------------------------------------

// An error collector which simply concatenates all its errors into a big
// block of text which can be checked.
class TestErrorCollector : public ErrorCollector {
 public:
  TestErrorCollector() {}
  ~TestErrorCollector() {}

  string text_;

  // implements ErrorCollector ---------------------------------------
  void AddError(int line, int column, const string& message) {
    strings::SubstituteAndAppend(&text_, "$0:$1: $2\n",
                                 line, column, message);
  }
};

// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't.  This is a brute-force approach, but it's easy to
// write and easy to understand.
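// kBlockSizes below ranges from 1 byte (so every token is split across
// many buffers) up to 1024 bytes (so most test inputs fit in a single
// buffer), with several small odd sizes in between.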
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};

class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.
  uint64 ParseInteger(const string& text) {
    uint64 result;
    EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
    return result;
  }
};

// ===================================================================

// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  string input;
  Tokenizer::TokenType type;
};

inline ostream& operator<<(ostream& out,
                           const SimpleTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

SimpleTokenCase kSimpleTokenCases[] = {
  // Test identifiers.
  { "hello",       Tokenizer::TYPE_IDENTIFIER },

  // Test integers.
  { "123",         Tokenizer::TYPE_INTEGER },
  { "0xab6",       Tokenizer::TYPE_INTEGER },
  { "0XAB6",       Tokenizer::TYPE_INTEGER },
  { "0X1234567",   Tokenizer::TYPE_INTEGER },
  { "0x89abcdef",  Tokenizer::TYPE_INTEGER },
  { "0x89ABCDEF",  Tokenizer::TYPE_INTEGER },
  { "01234567",    Tokenizer::TYPE_INTEGER },

  // Test floats.
  { "123.45",      Tokenizer::TYPE_FLOAT },
  { "1.",          Tokenizer::TYPE_FLOAT },
  { "1e3",         Tokenizer::TYPE_FLOAT },
  { "1E3",         Tokenizer::TYPE_FLOAT },
  { "1e-3",        Tokenizer::TYPE_FLOAT },
  { "1e+3",        Tokenizer::TYPE_FLOAT },
  { "1.e3",        Tokenizer::TYPE_FLOAT },
  { "1.2e3",       Tokenizer::TYPE_FLOAT },
  { ".1",          Tokenizer::TYPE_FLOAT },
  { ".1e3",        Tokenizer::TYPE_FLOAT },
  { ".1e-3",       Tokenizer::TYPE_FLOAT },
  { ".1e+3",       Tokenizer::TYPE_FLOAT },

  // Test strings.
  { "'hello'",     Tokenizer::TYPE_STRING },
  { "\"foo\"",     Tokenizer::TYPE_STRING },
  { "'a\"b'",      Tokenizer::TYPE_STRING },
  { "\"a'b\"",     Tokenizer::TYPE_STRING },
  { "'a\\'b'",     Tokenizer::TYPE_STRING },
  { "\"a\\\"b\"",  Tokenizer::TYPE_STRING },
  { "'\\xf'",      Tokenizer::TYPE_STRING },
  { "'\\0'",       Tokenizer::TYPE_STRING },

  // Test symbols.
  { "+",           Tokenizer::TYPE_SYMBOL },
  { ".",           Tokenizer::TYPE_SYMBOL },
};

TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kSimpleTokenCases_case.input.data(),
                        kSimpleTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);

  // Parse the token.
  ASSERT_TRUE(tokenizer.Next());

  // Check that it has the right type.
  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
  // Check that it contains the complete input text.
  EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
  // Check that it is located at the beginning of the input.
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());

  // After Next() returns false, the token should have type TYPE_END.
  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option.

  // Set up the tokenizer.
  const char* text = "1f 2.5f 6e3f 7F";
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_allow_f_after_float(true);

  // Advance through tokens and check that they are parsed as expected.
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "1f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "2.5f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "6e3f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "7F");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is parsed to produce a list of tokens.  The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  string input;
  Tokenizer::Token output[10];  // The compiler wants a constant array
                                // size for initialization to work.  There
                                // is no reason this can't be increased if
                                // needed.
};

inline ostream& operator<<(ostream& out,
                           const MultiTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

MultiTokenCase kMultiTokenCases[] = {
  // Test empty input.
  { "", {
    { Tokenizer::TYPE_END       , ""     , 0,  0 },
  }},

  // Test all token types at the same time.
  { "foo 1 1.2 + 'bar'", {
    { Tokenizer::TYPE_IDENTIFIER, "foo"  , 0,  0 },
    { Tokenizer::TYPE_INTEGER   , "1"    , 0,  4 },
    { Tokenizer::TYPE_FLOAT     , "1.2"  , 0,  6 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 10 },
    { Tokenizer::TYPE_STRING    , "'bar'", 0, 12 },
    { Tokenizer::TYPE_END       , ""     , 0, 17 },
  }},

  // Test that consecutive symbols are parsed as separate tokens.
  { "!@+%", {
    { Tokenizer::TYPE_SYMBOL    , "!"    , 0, 0 },
    { Tokenizer::TYPE_SYMBOL    , "@"    , 0, 1 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 2 },
    { Tokenizer::TYPE_SYMBOL    , "%"    , 0, 3 },
    { Tokenizer::TYPE_END       , ""     , 0, 4 },
  }},

  // Test that newlines affect line numbers correctly.
  { "foo bar\nrab oof", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 4 },
    { Tokenizer::TYPE_IDENTIFIER, "rab", 1, 0 },
    { Tokenizer::TYPE_IDENTIFIER, "oof", 1, 4 },
    { Tokenizer::TYPE_END       , ""   , 1, 7 },
  }},

  // Test that tabs affect column numbers correctly.
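  // (Note: judging from the expected columns below, a tab advances the
  // column to the next multiple of 8.)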
  { "foo\tbar \tbaz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  8 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16 },
    { Tokenizer::TYPE_END       , ""   , 0, 19 },
  }},

  // Test that line comments are ignored.
  { "foo // This is a comment\n"
    "bar // This is another comment", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1,  0 },
    { Tokenizer::TYPE_END       , ""   , 1, 30 },
  }},

  // Test that block comments are ignored.
  { "foo /* This is a block comment */ bar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34 },
    { Tokenizer::TYPE_END       , ""   , 0, 37 },
  }},

  // Test that sh-style comments are not ignored by default.
  { "foo # bar\n"
    "baz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0 },
    { Tokenizer::TYPE_SYMBOL    , "#"  , 0, 4 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0 },
    { Tokenizer::TYPE_END       , ""   , 1, 3 },
  }},

  // Bytes with the high-order bit set should not be seen as control
  // characters.
  { "\300", {
    { Tokenizer::TYPE_SYMBOL, "\300", 0, 0 },
    { Tokenizer::TYPE_END   , ""    , 0, 1 },
  }},

  // Test all whitespace chars.
  { "foo\n\t\r\v\fbar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11 },
    { Tokenizer::TYPE_END       , ""   , 1, 14 },
  }},
};

TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kMultiTokenCases_case.input.data(),
                        kMultiTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);

  // Loop through all expected tokens.
  int i = 0;
  Tokenizer::Token token;
  do {
    token = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);

    // Next() should only return false when it hits the end token.
    if (token.type != Tokenizer::TYPE_END) {
      ASSERT_TRUE(tokenizer.Next());
    } else {
      ASSERT_FALSE(tokenizer.Next());
    }

    // Check that the token matches the expected one.
    EXPECT_EQ(token.type, tokenizer.current().type);
    EXPECT_EQ(token.text, tokenizer.current().text);
    EXPECT_EQ(token.line, tokenizer.current().line);
    EXPECT_EQ(token.column, tokenizer.current().column);

  } while (token.type != Tokenizer::TYPE_END);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option.
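  // In SH_COMMENT_STYLE, '#' starts a line comment, while "//" and "/*"
  // lose their special meaning and come back as individual symbol tokens,
  // as the expected token list below reflects.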

  const char* text = "foo # bar\n"
                     "baz // qux\n"
                     "corge /* grault */\n"
                     "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/", "/", "qux",
                                 "corge", "/", "*", "grault", "*", "/",
                                 "garply"};

  // Set up the tokenizer.
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);

  // Advance through tokens and check that they are parsed as expected.
  for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
    EXPECT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, kTokens[i]);
  }

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// Test parse helpers.  It's not really worth setting up a full data-driven
// test here.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64 i;
#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid integers that will never be tokenized as integers.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("zxy", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("1.2", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("08", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("0xg", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("-1", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
#endif  // GTEST_HAS_DEATH_TEST

  // Test overflows.
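  // (ParseInteger's second argument is the maximum value to accept; the
  // call returns false when the parsed value would exceed it, as the
  // EXPECT_TRUE/EXPECT_FALSE pairs below demonstrate.)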
  EXPECT_TRUE (Tokenizer::ParseInteger("0", 0, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("1", 1, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("12345", 12345, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF" , kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
}

TEST_F(TokenizerTest, ParseFloat) {
  EXPECT_DOUBLE_EQ(1    , Tokenizer::ParseFloat("1."));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1e3"));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
  EXPECT_DOUBLE_EQ(.1   , Tokenizer::ParseFloat(".1"));
  EXPECT_DOUBLE_EQ(.25  , Tokenizer::ParseFloat(".25"));
  EXPECT_DOUBLE_EQ(.1e3 , Tokenizer::ParseFloat(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
  EXPECT_DOUBLE_EQ(5    , Tokenizer::ParseFloat("5"));
  EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
  EXPECT_DOUBLE_EQ(1.2  , Tokenizer::ParseFloat("1.2"));
  EXPECT_DOUBLE_EQ(1.e2 , Tokenizer::ParseFloat("1.e2"));

  // Test invalid floats that may still be tokenized as floats.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));

  // Test 'f' suffix.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(     0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));

#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid floats that will never be tokenized as floats.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("1-e0"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"),
    "passed text that could not have been tokenized as a float");
#endif  // GTEST_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseString) {
  string output;
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
  EXPECT_EQ("blah\nblah2", output);
  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
  Tokenizer::ParseString("'\\x20\\x4'", &output);
  EXPECT_EQ("\x20\x4", output);

  // Test invalid strings that may still be tokenized as strings.
  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
  EXPECT_EQ("\a?\v\t", output);
  Tokenizer::ParseString("'", &output);
  EXPECT_EQ("", output);
  Tokenizer::ParseString("'\\", &output);
  EXPECT_EQ("\\", output);

  // Test invalid strings that will never be tokenized as strings.
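  // (An empty string could never have been produced as a string token, so
  // debug builds should abort on it, as checked below.)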
#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
    "passed text that could not have been tokenized as a string");
#endif  // GTEST_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseStringAppend) {
  // Check that ParseString and ParseStringAppend differ.
  string output("stuff+");
  Tokenizer::ParseStringAppend("'hello'", &output);
  EXPECT_EQ("stuff+hello", output);
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
}

// -------------------------------------------------------------------

// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  string input;
  bool recoverable;  // True if the tokenizer should be able to recover and
                     // parse more tokens after seeing this error.  Cases
                     // for which this is true must end with "foo" as
                     // the last token, which the test will check for.
  const char* errors;
};

inline ostream& operator<<(ostream& out,
                           const ErrorCase& test_case) {
  return out << CEscape(test_case.input);
}

ErrorCase kErrorCases[] = {
  // String errors.
  { "'\\l' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\x' foo", true,
    "0:3: Expected hex digits for escape sequence.\n" },
  { "'foo", false,
    "0:4: String literals cannot cross line boundaries.\n" },
  { "'bar\nfoo", true,
    "0:4: String literals cannot cross line boundaries.\n" },

  // Integer errors.
  { "123foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Hex/octal errors.
  { "0x foo", true,
    "0:2: \"0x\" must be followed by hex digits.\n" },
  { "0541823 foo", true,
    "0:4: Numbers starting with leading zero must be in octal.\n" },
  { "0x123z foo", true,
    "0:5: Need space between number and identifier.\n" },
  { "0x123.4 foo", true,
    "0:5: Hex and octal numbers must be integers.\n" },
  { "0123.4 foo", true,
    "0:4: Hex and octal numbers must be integers.\n" },

  // Float errors.
  { "1e foo", true,
    "0:2: \"e\" must be followed by exponent.\n" },
  { "1e- foo", true,
    "0:3: \"e\" must be followed by exponent.\n" },
  { "1.2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "1e2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "a.1 foo", true,
    "0:1: Need space between identifier and decimal point.\n" },
  // allow_f_after_float not enabled, so this should be an error.
  { "1.0f foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Block comment errors.
  { "/*", false,
    "0:2: End-of-file inside block comment.\n"
    "0:0: Comment started here.\n" },
  { "/*/*/ foo", true,
    "0:3: \"/*\" inside block comment. Block comments cannot be nested.\n" },

  // Control characters.  Multiple consecutive control characters should only
  // produce one error.
  { "\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },
  { "\b\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check that control characters at end of input don't result in an
  // infinite loop.
  { "\b", false,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check recovery from '\0'.
  // We have to explicitly specify the length of these strings because
  // otherwise the string constructor will just call strlen(), which will
  // see the first '\0' and think that is the end of the string.
  { string("\0foo", 4), true,
    "0:0: Invalid control characters encountered in text.\n" },
  { string("\0\0foo", 5), true,
    "0:0: Invalid control characters encountered in text.\n" },
};

TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kErrorCases_case.input.data(),
                        kErrorCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Ignore all input, except remember if the last token was "foo".
  bool last_was_foo = false;
  while (tokenizer.Next()) {
    last_was_foo = tokenizer.current().text == "foo";
  }

  // Check that the errors match what was expected.
  EXPECT_EQ(error_collector.text_, kErrorCases_case.errors);

  // If the error was recoverable, make sure we saw "foo" after it.
  if (kErrorCases_case.recoverable) {
    EXPECT_TRUE(last_was_foo);
  }
}

// -------------------------------------------------------------------

TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  string text = "foo bar";
  TestInputStream input(text.data(), text.size(), kBlockSizes_case);

  // Create a tokenizer, read one token, then destroy it.
  {
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);

    tokenizer.Next();
  }

  // Only "foo" should have been read.
  EXPECT_EQ(strlen("foo"), input.ByteCount());
}

}  // namespace
}  // namespace io
}  // namespace protobuf
}  // namespace google