// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// http://code.google.com/p/protobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.

#include <vector>
#include <math.h>
#include <limits.h>

#include <google/protobuf/io/tokenizer.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>

#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/stubs/substitute.h>
#include <google/protobuf/testing/googletest.h>
#include <gtest/gtest.h>

namespace google {
namespace protobuf {
namespace io {
namespace {

// ===================================================================
// Data-Driven Test Infrastructure

// TODO(kenton):  This is copied from coded_stream_unittest.  This is
//   temporary until these features are integrated into gTest itself.

// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gTest.  These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array.  TEST_1D
// tests all cases in a single input array.  TEST_2D tests all
// combinations of cases from two arrays.  The arrays must be statically
// defined such that the GOOGLE_ARRAYSIZE() macro works on them.  Example:
//
// int kCases[] = {1, 2, 3, 4};
// TEST_1D(MyFixture, MyTest, kCases) {
//   EXPECT_GT(kCases_case, 0);
// }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero.  In case of failure, the exact case
// which failed will be printed.  The case type must be printable using
// ostream::operator<<.
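//
// TEST_2D works the same way but iterates over the cross product of two
// arrays.  A rough sketch of its usage (the fixture and array names below
// are invented for illustration; real uses appear later in this file):
//
// int kSizes[] = {1, 2};
// int kValues[] = {3, 4};
// TEST_2D(MyFixture, MyTest, kSizes, kValues) {
//   EXPECT_LT(kSizes_case, kValues_case * 10);
// }
//
// The body runs once for each (kSizes[i], kValues[j]) pair, and the
// SCOPED_TRACE in the generated TEST_F prints both case indices and values
// when an expectation fails.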

#define TEST_1D(FIXTURE, NAME, CASES)                                      \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                               \
    template <typename CaseType>                                           \
    void DoSingleCase(const CaseType& CASES##_case);                       \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) {                    \
      SCOPED_TRACE(testing::Message()                                      \
        << #CASES " case #" << i << ": " << CASES[i]);                     \
      DoSingleCase(CASES[i]);                                              \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType>                                             \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)

#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                             \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                               \
    template <typename CaseType1, typename CaseType2>                      \
    void DoSingleCase(const CaseType1& CASES1##_case,                      \
                      const CaseType2& CASES2##_case);                     \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) {                   \
      for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) {                 \
        SCOPED_TRACE(testing::Message()                                    \
          << #CASES1 " case #" << i << ": " << CASES1[i] << ", "           \
          << #CASES2 " case #" << j << ": " << CASES2[j]);                 \
        DoSingleCase(CASES1[i], CASES2[j]);                                \
      }                                                                    \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType1, typename CaseType2>                        \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case, \
                                           const CaseType2& CASES2##_case)

// -------------------------------------------------------------------

// An input stream that is basically like an ArrayInputStream but sometimes
// returns empty buffers, just to throw us off.
class TestInputStream : public ZeroCopyInputStream {
 public:
  TestInputStream(const void* data, int size, int block_size)
    : array_stream_(data, size, block_size), counter_(0) {}
  ~TestInputStream() {}

  // implements ZeroCopyInputStream ----------------------------------
  bool Next(const void** data, int* size) {
    // Return an empty buffer on the first call and on every call whose
    // counter is divisible by 3 or 5; otherwise delegate to the array stream.
    if (counter_ % 3 == 0 || counter_ % 5 == 0) {
      *data = NULL;
      *size = 0;
      ++counter_;
      return true;
    } else {
      ++counter_;
      return array_stream_.Next(data, size);
    }
  }

  void BackUp(int count)  { return array_stream_.BackUp(count); }
  bool Skip(int count)    { return array_stream_.Skip(count);   }
  int64 ByteCount() const { return array_stream_.ByteCount();   }

 private:
  ArrayInputStream array_stream_;
  int counter_;
};

// -------------------------------------------------------------------

// An error collector which simply concatenates all its errors into a big
// block of text which can be checked.
class TestErrorCollector : public ErrorCollector {
 public:
  TestErrorCollector() {}
  ~TestErrorCollector() {}

  string text_;

  // implements ErrorCollector ---------------------------------------
  void AddError(int line, int column, const string& message) {
    strings::SubstituteAndAppend(&text_, "$0:$1: $2\n",
                                 line, column, message);
  }
};

// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't.  This is a brute-force approach, but it's easy to
// write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};

class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.
  uint64 ParseInteger(const string& text) {
    uint64 result;
    EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
    return result;
  }
};

// ===================================================================

// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  string input;
  Tokenizer::TokenType type;
};

inline ostream& operator<<(ostream& out,
                           const SimpleTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

SimpleTokenCase kSimpleTokenCases[] = {
  // Test identifiers.
  { "hello",       Tokenizer::TYPE_IDENTIFIER },

  // Test integers.
  { "123",         Tokenizer::TYPE_INTEGER },
  { "0xab6",       Tokenizer::TYPE_INTEGER },
  { "0XAB6",       Tokenizer::TYPE_INTEGER },
  { "0X1234567",   Tokenizer::TYPE_INTEGER },
  { "0x89abcdef",  Tokenizer::TYPE_INTEGER },
  { "0x89ABCDEF",  Tokenizer::TYPE_INTEGER },
  { "01234567",    Tokenizer::TYPE_INTEGER },

  // Test floats.
  { "123.45",      Tokenizer::TYPE_FLOAT },
  { "1.",          Tokenizer::TYPE_FLOAT },
  { "1e3",         Tokenizer::TYPE_FLOAT },
  { "1E3",         Tokenizer::TYPE_FLOAT },
  { "1e-3",        Tokenizer::TYPE_FLOAT },
  { "1e+3",        Tokenizer::TYPE_FLOAT },
  { "1.e3",        Tokenizer::TYPE_FLOAT },
  { "1.2e3",       Tokenizer::TYPE_FLOAT },
  { ".1",          Tokenizer::TYPE_FLOAT },
  { ".1e3",        Tokenizer::TYPE_FLOAT },
  { ".1e-3",       Tokenizer::TYPE_FLOAT },
  { ".1e+3",       Tokenizer::TYPE_FLOAT },

  // Test strings.
  { "'hello'",     Tokenizer::TYPE_STRING },
  { "\"foo\"",     Tokenizer::TYPE_STRING },
  { "'a\"b'",      Tokenizer::TYPE_STRING },
  { "\"a'b\"",     Tokenizer::TYPE_STRING },
  { "'a\\'b'",     Tokenizer::TYPE_STRING },
  { "\"a\\\"b\"",  Tokenizer::TYPE_STRING },
  { "'\\xf'",      Tokenizer::TYPE_STRING },
  { "'\\0'",       Tokenizer::TYPE_STRING },

  // Test symbols.
  { "+",           Tokenizer::TYPE_SYMBOL },
  { ".",           Tokenizer::TYPE_SYMBOL },
};

TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kSimpleTokenCases_case.input.data(),
                        kSimpleTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);

  // Parse the token.
  ASSERT_TRUE(tokenizer.Next());

  // Check that it has the right type.
  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
  // Check that it contains the complete input text.
  EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
  // Check that it is located at the beginning of the input.
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());

  // After Next() returns false, the token should have type TYPE_END.
  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option.

  // Set up the tokenizer.
  const char* text = "1f 2.5f 6e3f 7F";
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_allow_f_after_float(true);

  // Advance through tokens and check that they are parsed as expected.
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "1f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "2.5f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "6e3f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "7F");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is parsed to produce a list of tokens.  The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  string input;
  Tokenizer::Token output[10];  // The compiler wants a constant array
                                // size for initialization to work.  There
                                // is no reason this can't be increased if
                                // needed.
};

inline ostream& operator<<(ostream& out,
                           const MultiTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

MultiTokenCase kMultiTokenCases[] = {
  // Test empty input.
  { "", {
    { Tokenizer::TYPE_END       , ""     , 0,  0 },
  }},

  // Test all token types at the same time.
  { "foo 1 1.2 + 'bar'", {
    { Tokenizer::TYPE_IDENTIFIER, "foo"  , 0,  0 },
    { Tokenizer::TYPE_INTEGER   , "1"    , 0,  4 },
    { Tokenizer::TYPE_FLOAT     , "1.2"  , 0,  6 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 10 },
    { Tokenizer::TYPE_STRING    , "'bar'", 0, 12 },
    { Tokenizer::TYPE_END       , ""     , 0, 17 },
  }},

  // Test that consecutive symbols are parsed as separate tokens.
  { "!@+%", {
    { Tokenizer::TYPE_SYMBOL    , "!"    , 0, 0 },
    { Tokenizer::TYPE_SYMBOL    , "@"    , 0, 1 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 2 },
    { Tokenizer::TYPE_SYMBOL    , "%"    , 0, 3 },
    { Tokenizer::TYPE_END       , ""     , 0, 4 },
  }},

  // Test that newlines affect line numbers correctly.
  { "foo bar\nrab oof", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  4 },
    { Tokenizer::TYPE_IDENTIFIER, "rab", 1,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "oof", 1,  4 },
    { Tokenizer::TYPE_END       , ""   , 1,  7 },
  }},

  // Test that tabs affect column numbers correctly.
  { "foo\tbar  \tbaz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  8 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16 },
    { Tokenizer::TYPE_END       , ""   , 0, 19 },
  }},

  // Test that line comments are ignored.
  { "foo // This is a comment\n"
    "bar // This is another comment", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1,  0 },
    { Tokenizer::TYPE_END       , ""   , 1, 30 },
  }},

  // Test that block comments are ignored.
  { "foo /* This is a block comment */ bar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34 },
    { Tokenizer::TYPE_END       , ""   , 0, 37 },
  }},

  // Test that sh-style comments are not ignored by default.
  { "foo # bar\n"
    "baz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_SYMBOL    , "#"  , 0,  4 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  6 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 1,  0 },
    { Tokenizer::TYPE_END       , ""   , 1,  3 },
  }},

  // Bytes with the high-order bit set should not be seen as control characters.
  { "\300", {
    { Tokenizer::TYPE_SYMBOL, "\300", 0, 0 },
    { Tokenizer::TYPE_END   , ""    , 0, 1 },
  }},

  // Test all whitespace chars.
  { "foo\n\t\r\v\fbar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11 },
    { Tokenizer::TYPE_END       , ""   , 1, 14 },
  }},
};

TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kMultiTokenCases_case.input.data(),
                        kMultiTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);

  // Loop through all expected tokens.
  int i = 0;
  Tokenizer::Token token;
  do {
    token = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);

    // Next() should only return false when it hits the end token.
    if (token.type != Tokenizer::TYPE_END) {
      ASSERT_TRUE(tokenizer.Next());
    } else {
      ASSERT_FALSE(tokenizer.Next());
    }

    // Check that the token matches the expected one.
    EXPECT_EQ(token.type, tokenizer.current().type);
    EXPECT_EQ(token.text, tokenizer.current().text);
    EXPECT_EQ(token.line, tokenizer.current().line);
    EXPECT_EQ(token.column, tokenizer.current().column);

  } while (token.type != Tokenizer::TYPE_END);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option.

  const char* text = "foo # bar\n"
                     "baz // qux\n"
                     "corge /* grault */\n"
                     "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/", "/", "qux",
                                 "corge", "/", "*", "grault", "*", "/",
                                 "garply"};

  // Set up the tokenizer.
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);

  // Advance through tokens and check that they are parsed as expected.
  for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
    EXPECT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, kTokens[i]);
  }

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// Test parse helpers.  It's not really worth setting up a full data-driven
// test here.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64 i;
#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid integers that will never be tokenized as integers.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("zxy", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("1.2", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("08", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("0xg", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("-1", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
#endif  // GTEST_HAS_DEATH_TEST

  // Test overflows.
  EXPECT_TRUE (Tokenizer::ParseInteger("0", 0, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("1", 1, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("12345", 12345, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF" , kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
}

TEST_F(TokenizerTest, ParseFloat) {
  EXPECT_DOUBLE_EQ(1    , Tokenizer::ParseFloat("1."));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1e3"));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
  EXPECT_DOUBLE_EQ(.1   , Tokenizer::ParseFloat(".1"));
  EXPECT_DOUBLE_EQ(.25  , Tokenizer::ParseFloat(".25"));
  EXPECT_DOUBLE_EQ(.1e3 , Tokenizer::ParseFloat(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
  EXPECT_DOUBLE_EQ(5    , Tokenizer::ParseFloat("5"));
  EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
  EXPECT_DOUBLE_EQ(1.2  , Tokenizer::ParseFloat("1.2"));
  EXPECT_DOUBLE_EQ(1.e2 , Tokenizer::ParseFloat("1.e2"));

  // Test invalid floats that may still be tokenized as floats.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));

  // Test 'f' suffix.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(     0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));

#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid floats that will never be tokenized as floats.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("1-e0"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"),
    "passed text that could not have been tokenized as a float");
#endif  // GTEST_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseString) {
  string output;
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
  EXPECT_EQ("blah\nblah2", output);
  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
  Tokenizer::ParseString("'\\x20\\x4'", &output);
  EXPECT_EQ("\x20\x4", output);

  // Test invalid strings that may still be tokenized as strings.
  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
  EXPECT_EQ("\a?\v\t", output);
  Tokenizer::ParseString("'", &output);
  EXPECT_EQ("", output);
  Tokenizer::ParseString("'\\", &output);
  EXPECT_EQ("\\", output);

  // Test invalid strings that will never be tokenized as strings.
#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
    "passed text that could not have been tokenized as a string");
#endif  // GTEST_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseStringAppend) {
  // Check that ParseString and ParseStringAppend differ.
  string output("stuff+");
  Tokenizer::ParseStringAppend("'hello'", &output);
  EXPECT_EQ("stuff+hello", output);
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
}

// -------------------------------------------------------------------

// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  string input;
  bool recoverable;  // True if the tokenizer should be able to recover and
                     // parse more tokens after seeing this error.  Cases
                     // for which this is true must end with "foo" as
                     // the last token, which the test will check for.
  const char* errors;
};

inline ostream& operator<<(ostream& out,
                           const ErrorCase& test_case) {
  return out << CEscape(test_case.input);
}

ErrorCase kErrorCases[] = {
  // String errors.
  { "'\\l' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\x' foo", true,
    "0:3: Expected hex digits for escape sequence.\n" },
  { "'foo", false,
    "0:4: String literals cannot cross line boundaries.\n" },
  { "'bar\nfoo", true,
    "0:4: String literals cannot cross line boundaries.\n" },

  // Integer errors.
  { "123foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Hex/octal errors.
  { "0x foo", true,
    "0:2: \"0x\" must be followed by hex digits.\n" },
  { "0541823 foo", true,
    "0:4: Numbers starting with leading zero must be in octal.\n" },
  { "0x123z foo", true,
    "0:5: Need space between number and identifier.\n" },
  { "0x123.4 foo", true,
    "0:5: Hex and octal numbers must be integers.\n" },
  { "0123.4 foo", true,
    "0:4: Hex and octal numbers must be integers.\n" },

  // Float errors.
  { "1e foo", true,
    "0:2: \"e\" must be followed by exponent.\n" },
  { "1e- foo", true,
    "0:3: \"e\" must be followed by exponent.\n" },
  { "1.2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "1e2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "a.1 foo", true,
    "0:1: Need space between identifier and decimal point.\n" },
  // allow_f_after_float not enabled, so this should be an error.
  { "1.0f foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Block comment errors.
  { "/*", false,
    "0:2: End-of-file inside block comment.\n"
    "0:0:   Comment started here.\n"},
  { "/*/*/ foo", true,
    "0:3: \"/*\" inside block comment.  Block comments cannot be nested.\n"},

  // Control characters.  Multiple consecutive control characters should only
  // produce one error.
  { "\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },
  { "\b\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check that control characters at end of input don't result in an
  // infinite loop.
  { "\b", false,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check recovery from '\0'.  We have to explicitly specify the length of
  // these strings because otherwise the string constructor will just call
  // strlen() which will see the first '\0' and think that is the end of the
  // string.
  { string("\0foo", 4), true,
    "0:0: Invalid control characters encountered in text.\n" },
  { string("\0\0foo", 5), true,
    "0:0: Invalid control characters encountered in text.\n" },
};

TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kErrorCases_case.input.data(),
                        kErrorCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Ignore all input, except remember if the last token was "foo".
  bool last_was_foo = false;
  while (tokenizer.Next()) {
    last_was_foo = tokenizer.current().text == "foo";
  }

  // Check that the errors match what was expected.
  EXPECT_EQ(error_collector.text_, kErrorCases_case.errors);

  // If the error was recoverable, make sure we saw "foo" after it.
  if (kErrorCases_case.recoverable) {
    EXPECT_TRUE(last_was_foo);
  }
}

// -------------------------------------------------------------------

TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  string text = "foo bar";
  TestInputStream input(text.data(), text.size(), kBlockSizes_case);

  // Create a tokenizer, read one token, then destroy it.
  {
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);

    tokenizer.Next();
  }

  // Only "foo" should have been read.
  EXPECT_EQ(strlen("foo"), input.ByteCount());
}

}  // namespace
}  // namespace io
}  // namespace protobuf
}  // namespace google