tokenizer_unittest.cc revision fbaaef999ba563838ebd00874ed8a1c01fbf286d
// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// http://code.google.com/p/protobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.

#include <vector>
#include <math.h>
#include <limits.h>

#include <google/protobuf/io/tokenizer.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>

#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/stubs/substitute.h>
#include <google/protobuf/testing/googletest.h>
#include <gtest/gtest.h>

namespace google {
namespace protobuf {
namespace io {
namespace {

// ===================================================================
// Data-Driven Test Infrastructure
// TODO(kenton):  This is copied from coded_stream_unittest.  This is
//   temporary until these features are integrated into gTest itself.

// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gTest.  These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array.  TEST_1D
// tests all cases in a single input array.  TEST_2D tests all
// combinations of cases from two arrays.  The arrays must be statically
// defined such that the GOOGLE_ARRAYSIZE() macro works on them.  Example:
//
// int kCases[] = {1, 2, 3, 4};
// TEST_1D(MyFixture, MyTest, kCases) {
//   EXPECT_GT(kCases_case, 0);
// }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero.  In case of failure, the exact case
// which failed will be printed.  The case type must be printable using
// ostream::operator<<.
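//
// TEST_2D works the same way but iterates over the cross product of two
// arrays, passing one case from each to the test body.  A sketch of how it
// might be used (the fixture and array names below are illustrative only,
// not defined in this file):
//
// int kSizes[] = {1, 2, 4};
// const char* kNames[] = {"foo", "bar"};
// TEST_2D(MyFixture, MyCombinedTest, kSizes, kNames) {
//   // Each case parameter is named after its array with a "_case" suffix.
//   EXPECT_GT(kSizes_case, 0);
//   EXPECT_NE(kNames_case[0], '\0');
// }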

#define TEST_1D(FIXTURE, NAME, CASES)                                      \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType>                                           \
    void DoSingleCase(const CaseType& CASES##_case);                       \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) {                    \
      SCOPED_TRACE(testing::Message()                                      \
        << #CASES " case #" << i << ": " << CASES[i]);                     \
      DoSingleCase(CASES[i]);                                              \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType>                                             \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)

#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                             \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType1, typename CaseType2>                      \
    void DoSingleCase(const CaseType1& CASES1##_case,                      \
                      const CaseType2& CASES2##_case);                     \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) {                   \
      for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) {                 \
        SCOPED_TRACE(testing::Message()                                    \
          << #CASES1 " case #" << i << ": " << CASES1[i] << ", "           \
          << #CASES2 " case #" << j << ": " << CASES2[j]);                 \
        DoSingleCase(CASES1[i], CASES2[j]);                                \
      }                                                                    \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType1, typename CaseType2>                        \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case, \
                                           const CaseType2& CASES2##_case)

// -------------------------------------------------------------------

// An input stream that is basically like an ArrayInputStream but sometimes
// returns empty buffers, just to throw us off.
class TestInputStream : public ZeroCopyInputStream {
 public:
  TestInputStream(const void* data, int size, int block_size)
    : array_stream_(data, size, block_size), counter_(0) {}
  ~TestInputStream() {}

  // implements ZeroCopyInputStream ----------------------------------
  bool Next(const void** data, int* size) {
    // We'll return an empty buffer on the first call and on every call
    // whose index is a multiple of 3 or 5 thereafter.
    if (counter_ % 3 == 0 || counter_ % 5 == 0) {
      *data = NULL;
      *size = 0;
      ++counter_;
      return true;
    } else {
      ++counter_;
      return array_stream_.Next(data, size);
    }
  }

  void BackUp(int count)  { return array_stream_.BackUp(count); }
  bool Skip(int count)    { return array_stream_.Skip(count);   }
  int64 ByteCount() const { return array_stream_.ByteCount();   }

 private:
  ArrayInputStream array_stream_;
  int counter_;
};

// -------------------------------------------------------------------

// An error collector which simply concatenates all its errors into a big
// block of text which can be checked.
class TestErrorCollector : public ErrorCollector {
 public:
  TestErrorCollector() {}
  ~TestErrorCollector() {}

  string text_;

  // implements ErrorCollector ---------------------------------------
  void AddError(int line, int column, const string& message) {
    strings::SubstituteAndAppend(&text_, "$0:$1: $2\n",
                                 line, column, message);
  }
};

// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't.  This is sort of a brute-force approach to this,
// but it's easy to write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};

class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.
  uint64 ParseInteger(const string& text) {
    uint64 result;
    EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
    return result;
  }
};

// ===================================================================

// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  string input;
  Tokenizer::TokenType type;
};

inline ostream& operator<<(ostream& out,
                           const SimpleTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

SimpleTokenCase kSimpleTokenCases[] = {
  // Test identifiers.
  { "hello",       Tokenizer::TYPE_IDENTIFIER },

  // Test integers.
  { "123",         Tokenizer::TYPE_INTEGER },
  { "0xab6",       Tokenizer::TYPE_INTEGER },
  { "0XAB6",       Tokenizer::TYPE_INTEGER },
  { "0X1234567",   Tokenizer::TYPE_INTEGER },
  { "0x89abcdef",  Tokenizer::TYPE_INTEGER },
  { "0x89ABCDEF",  Tokenizer::TYPE_INTEGER },
  { "01234567",    Tokenizer::TYPE_INTEGER },

  // Test floats.
  { "123.45",      Tokenizer::TYPE_FLOAT },
  { "1.",          Tokenizer::TYPE_FLOAT },
  { "1e3",         Tokenizer::TYPE_FLOAT },
  { "1E3",         Tokenizer::TYPE_FLOAT },
  { "1e-3",        Tokenizer::TYPE_FLOAT },
  { "1e+3",        Tokenizer::TYPE_FLOAT },
  { "1.e3",        Tokenizer::TYPE_FLOAT },
  { "1.2e3",       Tokenizer::TYPE_FLOAT },
  { ".1",          Tokenizer::TYPE_FLOAT },
  { ".1e3",        Tokenizer::TYPE_FLOAT },
  { ".1e-3",       Tokenizer::TYPE_FLOAT },
  { ".1e+3",       Tokenizer::TYPE_FLOAT },

  // Test strings.
  { "'hello'",     Tokenizer::TYPE_STRING },
  { "\"foo\"",     Tokenizer::TYPE_STRING },
  { "'a\"b'",      Tokenizer::TYPE_STRING },
  { "\"a'b\"",     Tokenizer::TYPE_STRING },
  { "'a\\'b'",     Tokenizer::TYPE_STRING },
  { "\"a\\\"b\"",  Tokenizer::TYPE_STRING },
  { "'\\xf'",      Tokenizer::TYPE_STRING },
  { "'\\0'",       Tokenizer::TYPE_STRING },

  // Test symbols.
  { "+",           Tokenizer::TYPE_SYMBOL },
  { ".",           Tokenizer::TYPE_SYMBOL },
};

TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kSimpleTokenCases_case.input.data(),
                        kSimpleTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);

  // Parse the token.
  ASSERT_TRUE(tokenizer.Next());

  // Check that it has the right type.
  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
  // Check that it contains the complete input text.
  EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
  // Check that it is located at the beginning of the input
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());

  // After Next() returns false, the token should have type TYPE_END.
  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option.

  // Set up the tokenizer.
  const char* text = "1f 2.5f 6e3f 7F";
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_allow_f_after_float(true);

  // Advance through tokens and check that they are parsed as expected.
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "1f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "2.5f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "6e3f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "7F");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is parsed to produce a list of tokens.  The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  string input;
  Tokenizer::Token output[10];  // The compiler wants a constant array
                                // size for initialization to work.  There
                                // is no reason this can't be increased if
                                // needed.
};

inline ostream& operator<<(ostream& out,
                           const MultiTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

MultiTokenCase kMultiTokenCases[] = {
  // Test empty input.
  { "", {
    { Tokenizer::TYPE_END       , ""     , 0,  0 },
  }},

  // Test all token types at the same time.
  { "foo 1 1.2 + 'bar'", {
    { Tokenizer::TYPE_IDENTIFIER, "foo"  , 0,  0 },
    { Tokenizer::TYPE_INTEGER   , "1"    , 0,  4 },
    { Tokenizer::TYPE_FLOAT     , "1.2"  , 0,  6 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 10 },
    { Tokenizer::TYPE_STRING    , "'bar'", 0, 12 },
    { Tokenizer::TYPE_END       , ""     , 0, 17 },
  }},

  // Test that consecutive symbols are parsed as separate tokens.
  { "!@+%", {
    { Tokenizer::TYPE_SYMBOL    , "!"    , 0, 0 },
    { Tokenizer::TYPE_SYMBOL    , "@"    , 0, 1 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 2 },
    { Tokenizer::TYPE_SYMBOL    , "%"    , 0, 3 },
    { Tokenizer::TYPE_END       , ""     , 0, 4 },
  }},

  // Test that newlines affect line numbers correctly.
  { "foo bar\nrab oof", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  4 },
    { Tokenizer::TYPE_IDENTIFIER, "rab", 1,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "oof", 1,  4 },
    { Tokenizer::TYPE_END       , ""   , 1,  7 },
  }},

  // Test that tabs affect column numbers correctly.
  { "foo\tbar  \tbaz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  8 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16 },
    { Tokenizer::TYPE_END       , ""   , 0, 19 },
  }},

  // Test that line comments are ignored.
  { "foo // This is a comment\n"
    "bar // This is another comment", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1,  0 },
    { Tokenizer::TYPE_END       , ""   , 1, 30 },
  }},

  // Test that block comments are ignored.
  { "foo /* This is a block comment */ bar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34 },
    { Tokenizer::TYPE_END       , ""   , 0, 37 },
  }},

  // Test that sh-style comments are not ignored by default.
  { "foo # bar\n"
    "baz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0 },
    { Tokenizer::TYPE_SYMBOL    , "#"  , 0,  4 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  6 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 1,  0 },
    { Tokenizer::TYPE_END       , ""   , 1,  3 },
  }},
};

TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kMultiTokenCases_case.input.data(),
                        kMultiTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);

  // Loop through all expected tokens.
  int i = 0;
  Tokenizer::Token token;
  do {
    token = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);

    // Next() should only return false when it hits the end token.
    if (token.type != Tokenizer::TYPE_END) {
      ASSERT_TRUE(tokenizer.Next());
    } else {
      ASSERT_FALSE(tokenizer.Next());
    }

    // Check that the token matches the expected one.
    EXPECT_EQ(token.type, tokenizer.current().type);
    EXPECT_EQ(token.text, tokenizer.current().text);
    EXPECT_EQ(token.line, tokenizer.current().line);
    EXPECT_EQ(token.column, tokenizer.current().column);

  } while (token.type != Tokenizer::TYPE_END);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option.

  const char* text = "foo # bar\n"
                     "baz // qux\n"
                     "corge /* grault */\n"
                     "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/", "/", "qux",
                                 "corge", "/", "*", "grault", "*", "/",
                                 "garply"};

  // Set up the tokenizer.
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);

  // Advance through tokens and check that they are parsed as expected.
  for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
    EXPECT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, kTokens[i]);
  }

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// Test parse helpers.  It's not really worth setting up a full data-driven
// test here.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64 i;
#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid integers that will never be tokenized as integers.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("zxy", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("1.2", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("08", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("0xg", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("-1", kuint64max, &i),
    "passed text that could not have been tokenized as an integer");
#endif  // GTEST_HAS_DEATH_TEST

  // Test overflows.
  EXPECT_TRUE (Tokenizer::ParseInteger("0", 0, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("1", 1, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("12345", 12345, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF" , kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
}

TEST_F(TokenizerTest, ParseFloat) {
  EXPECT_DOUBLE_EQ(1    , Tokenizer::ParseFloat("1."));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1e3"));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
  EXPECT_DOUBLE_EQ(.1   , Tokenizer::ParseFloat(".1"));
  EXPECT_DOUBLE_EQ(.25  , Tokenizer::ParseFloat(".25"));
  EXPECT_DOUBLE_EQ(.1e3 , Tokenizer::ParseFloat(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
  EXPECT_DOUBLE_EQ(5    , Tokenizer::ParseFloat("5"));
  EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
  EXPECT_DOUBLE_EQ(1.2  , Tokenizer::ParseFloat("1.2"));
  EXPECT_DOUBLE_EQ(1.e2 , Tokenizer::ParseFloat("1.e2"));

  // Test invalid floats that may still be tokenized as floats.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));

  // Test 'f' suffix.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(     0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));

#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid floats that will never be tokenized as floats.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("1-e0"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"),
    "passed text that could not have been tokenized as a float");
#endif  // GTEST_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseString) {
  string output;
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
  EXPECT_EQ("blah\nblah2", output);
  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
  Tokenizer::ParseString("'\\x20\\x4'", &output);
  EXPECT_EQ("\x20\x4", output);

  // Test invalid strings that may still be tokenized as strings.
  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
  EXPECT_EQ("\a?\v\t", output);
  Tokenizer::ParseString("'", &output);
  EXPECT_EQ("", output);
  Tokenizer::ParseString("'\\", &output);
  EXPECT_EQ("\\", output);

  // Test invalid strings that will never be tokenized as strings.
#ifdef GTEST_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
    "passed text that could not have been tokenized as a string");
#endif  // GTEST_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseStringAppend) {
  // Check that ParseString and ParseStringAppend differ.
  string output("stuff+");
  Tokenizer::ParseStringAppend("'hello'", &output);
  EXPECT_EQ("stuff+hello", output);
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
}

// -------------------------------------------------------------------

// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  string input;
  bool recoverable;  // True if the tokenizer should be able to recover and
                     // parse more tokens after seeing this error.  Cases
                     // for which this is true must end with "foo" as
                     // the last token, which the test will check for.
  const char* errors;
};

inline ostream& operator<<(ostream& out,
                           const ErrorCase& test_case) {
  return out << CEscape(test_case.input);
}

ErrorCase kErrorCases[] = {
  // String errors.
  { "'\\l' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\x' foo", true,
    "0:3: Expected hex digits for escape sequence.\n" },
  { "'foo", false,
    "0:4: String literals cannot cross line boundaries.\n" },
  { "'bar\nfoo", true,
    "0:4: String literals cannot cross line boundaries.\n" },

  // Integer errors.
  { "123foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Hex/octal errors.
  { "0x foo", true,
    "0:2: \"0x\" must be followed by hex digits.\n" },
  { "0541823 foo", true,
    "0:4: Numbers starting with leading zero must be in octal.\n" },
  { "0x123z foo", true,
    "0:5: Need space between number and identifier.\n" },
  { "0x123.4 foo", true,
    "0:5: Hex and octal numbers must be integers.\n" },
  { "0123.4 foo", true,
    "0:4: Hex and octal numbers must be integers.\n" },

  // Float errors.
  { "1e foo", true,
    "0:2: \"e\" must be followed by exponent.\n" },
  { "1e- foo", true,
    "0:3: \"e\" must be followed by exponent.\n" },
  { "1.2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "1e2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "a.1 foo", true,
    "0:1: Need space between identifier and decimal point.\n" },
  // allow_f_after_float not enabled, so this should be an error.
  { "1.0f foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Block comment errors.
  { "/*", false,
    "0:2: End-of-file inside block comment.\n"
    "0:0:   Comment started here.\n"},
  { "/*/*/ foo", true,
    "0:3: \"/*\" inside block comment.  Block comments cannot be nested.\n"},

  // Control characters.  Multiple consecutive control characters should only
  // produce one error.
  { "\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },
  { "\b\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check that control characters at end of input don't result in an
  // infinite loop.
  { "\b", false,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check recovery from '\0'.  We have to explicitly specify the length of
  // these strings because otherwise the string constructor will just call
  // strlen() which will see the first '\0' and think that is the end of the
  // string.
  { string("\0foo", 4), true,
    "0:0: Invalid control characters encountered in text.\n" },
  { string("\0\0foo", 5), true,
    "0:0: Invalid control characters encountered in text.\n" },
};

TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kErrorCases_case.input.data(),
                        kErrorCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Ignore all input, except remember if the last token was "foo".
  bool last_was_foo = false;
  while (tokenizer.Next()) {
    last_was_foo = tokenizer.current().text == "foo";
  }

  // Check that the errors match what was expected.
  EXPECT_EQ(error_collector.text_, kErrorCases_case.errors);

  // If the error was recoverable, make sure we saw "foo" after it.
  if (kErrorCases_case.recoverable) {
    EXPECT_TRUE(last_was_foo);
  }
}

// -------------------------------------------------------------------

TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  string text = "foo bar";
  TestInputStream input(text.data(), text.size(), kBlockSizes_case);

  // Create a tokenizer, read one token, then destroy it.
  {
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);

    tokenizer.Next();
  }

  // Only "foo" should have been read.
  EXPECT_EQ(strlen("foo"), input.ByteCount());
}

}  // namespace
}  // namespace io
}  // namespace protobuf
}  // namespace google