// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.

#include <limits.h>
#include <math.h>

#include <vector>

#include <google/protobuf/io/tokenizer.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>

#include <google/protobuf/stubs/common.h>
#include <google/protobuf/stubs/logging.h>
#include <google/protobuf/stubs/strutil.h>
#include <google/protobuf/stubs/substitute.h>
#include <google/protobuf/testing/googletest.h>
#include <gtest/gtest.h>

namespace google {
namespace protobuf {
namespace io {
namespace {

// ===================================================================
// Data-Driven Test Infrastructure

// TODO(kenton):  This is copied from coded_stream_unittest.  It is temporary
//   until these features are integrated into gTest itself.

// TEST_1D and TEST_2D are macros I'd eventually like to see added to
// gTest.  These macros can be used to declare tests which should be
// run multiple times, once for each item in some input array.  TEST_1D
// tests all cases in a single input array.  TEST_2D tests all
// combinations of cases from two arrays.  The arrays must be statically
// defined such that the GOOGLE_ARRAYSIZE() macro works on them.  Example:
//
// int kCases[] = {1, 2, 3, 4};
// TEST_1D(MyFixture, MyTest, kCases) {
//   EXPECT_GT(kCases_case, 0);
// }
//
// This test iterates through the numbers 1, 2, 3, and 4 and tests that
// they are all greater than zero.  In case of failure, the exact case
// which failed will be printed.  The case type must be printable using
// ostream::operator<<.

#define TEST_1D(FIXTURE, NAME, CASES)                                      \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType>                                           \
    void DoSingleCase(const CaseType& CASES##_case);                       \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) {                    \
      SCOPED_TRACE(testing::Message()                                      \
        << #CASES " case #" << i << ": " << CASES[i]);                     \
      DoSingleCase(CASES[i]);                                              \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType>                                             \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)

#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                             \
  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
   protected:                                                              \
    template <typename CaseType1, typename CaseType2>                      \
    void DoSingleCase(const CaseType1& CASES1##_case,                      \
                      const CaseType2& CASES2##_case);                     \
  };                                                                       \
                                                                           \
  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) {                   \
      for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) {                 \
        SCOPED_TRACE(testing::Message()                                    \
          << #CASES1 " case #" << i << ": " << CASES1[i] << ", "           \
          << #CASES2 " case #" << j << ": " << CASES2[j]);                 \
        DoSingleCase(CASES1[i], CASES2[j]);                                \
      }                                                                    \
    }                                                                      \
  }                                                                        \
                                                                           \
  template <typename CaseType1, typename CaseType2>                        \
  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case, \
                                           const CaseType2& CASES2##_case)

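// For symmetry with the TEST_1D example above, here is an illustrative sketch
// of TEST_2D usage.  The fixture and arrays (MyFixture, kValues, kNames) are
// hypothetical and exist only to show the expansion pattern:
//
//   int kValues[] = {1, 2, 3};
//   const char* kNames[] = {"a", "b"};
//   TEST_2D(MyFixture, MyCombinedTest, kValues, kNames) {
//     // The body runs once per (kValues[i], kNames[j]) pair, with the
//     // current pair available as kValues_case and kNames_case.
//     EXPECT_GT(kValues_case, 0);
//     EXPECT_TRUE(kNames_case != NULL);
//   }
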
// -------------------------------------------------------------------

// An input stream that is basically like an ArrayInputStream but sometimes
// returns empty buffers, just to throw us off.
class TestInputStream : public ZeroCopyInputStream {
 public:
  TestInputStream(const void* data, int size, int block_size)
    : array_stream_(data, size, block_size), counter_(0) {}
  ~TestInputStream() {}

  // implements ZeroCopyInputStream ----------------------------------
  bool Next(const void** data, int* size) {
    // Return an empty buffer whenever the buffer index is a multiple of 3
    // or 5 (including the very first buffer), just to throw the caller off.
    if (counter_ % 3 == 0 || counter_ % 5 == 0) {
      *data = NULL;
      *size = 0;
      ++counter_;
      return true;
    } else {
      ++counter_;
      return array_stream_.Next(data, size);
    }
  }

  void BackUp(int count)  { return array_stream_.BackUp(count); }
  bool Skip(int count)    { return array_stream_.Skip(count);   }
  int64 ByteCount() const { return array_stream_.ByteCount();   }

 private:
  ArrayInputStream array_stream_;
  int counter_;
};

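// Returning true from Next() with *size == 0 is a legal ZeroCopyInputStream
// result, so a consumer (such as the Tokenizer under test) has to skip such
// buffers and call Next() again.  A minimal consumer sketch, assuming only
// the Next() contract (Process() is a hypothetical callback):
//
//   const void* data;
//   int size;
//   while (stream.Next(&data, &size)) {
//     if (size == 0) continue;  // tolerate empty buffers
//     Process(data, size);
//   }
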
// -------------------------------------------------------------------

// An error collector which simply concatenates all its errors into a big
// block of text which can be checked.
class TestErrorCollector : public ErrorCollector {
 public:
  TestErrorCollector() {}
  ~TestErrorCollector() {}

  string text_;

  // implements ErrorCollector ---------------------------------------
  void AddError(int line, int column, const string& message) {
    strings::SubstituteAndAppend(&text_, "$0:$1: $2\n",
                                 line, column, message);
  }
};

// -------------------------------------------------------------------

// We test each operation over a variety of block sizes to ensure that
// we test cases where reads cross buffer boundaries as well as cases
// where they don't.  This is sort of a brute-force approach, but it's
// easy to write and easy to understand.
const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};
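
// As an illustrative sketch (no test below uses these exact values), a block
// size of 3 makes ArrayInputStream hand out the input "foo bar" as the
// buffers "foo", " ba", "r", since each call to Next() returns at most
// block_size bytes.  Reads of the token "bar" therefore cross a buffer
// boundary:
//
//   ArrayInputStream stream("foo bar", 7, /*block_size=*/3);
//   const void* data;
//   int size;
//   stream.Next(&data, &size);  // data points at "foo", size == 3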

class TokenizerTest : public testing::Test {
 protected:
  // For easy testing.
  uint64 ParseInteger(const string& text) {
    uint64 result;
    EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
    return result;
  }
};

// ===================================================================

// These tests cause gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

// In each test case, the entire input text should parse as a single token
// of the given type.
struct SimpleTokenCase {
  string input;
  Tokenizer::TokenType type;
};

inline ostream& operator<<(ostream& out,
                           const SimpleTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

SimpleTokenCase kSimpleTokenCases[] = {
  // Test identifiers.
  { "hello",       Tokenizer::TYPE_IDENTIFIER },

  // Test integers.
  { "123",         Tokenizer::TYPE_INTEGER },
  { "0xab6",       Tokenizer::TYPE_INTEGER },
  { "0XAB6",       Tokenizer::TYPE_INTEGER },
  { "0X1234567",   Tokenizer::TYPE_INTEGER },
  { "0x89abcdef",  Tokenizer::TYPE_INTEGER },
  { "0x89ABCDEF",  Tokenizer::TYPE_INTEGER },
  { "01234567",    Tokenizer::TYPE_INTEGER },

  // Test floats.
  { "123.45",      Tokenizer::TYPE_FLOAT },
  { "1.",          Tokenizer::TYPE_FLOAT },
  { "1e3",         Tokenizer::TYPE_FLOAT },
  { "1E3",         Tokenizer::TYPE_FLOAT },
  { "1e-3",        Tokenizer::TYPE_FLOAT },
  { "1e+3",        Tokenizer::TYPE_FLOAT },
  { "1.e3",        Tokenizer::TYPE_FLOAT },
  { "1.2e3",       Tokenizer::TYPE_FLOAT },
  { ".1",          Tokenizer::TYPE_FLOAT },
  { ".1e3",        Tokenizer::TYPE_FLOAT },
  { ".1e-3",       Tokenizer::TYPE_FLOAT },
  { ".1e+3",       Tokenizer::TYPE_FLOAT },

  // Test strings.
  { "'hello'",     Tokenizer::TYPE_STRING },
  { "\"foo\"",     Tokenizer::TYPE_STRING },
  { "'a\"b'",      Tokenizer::TYPE_STRING },
  { "\"a'b\"",     Tokenizer::TYPE_STRING },
  { "'a\\'b'",     Tokenizer::TYPE_STRING },
  { "\"a\\\"b\"",  Tokenizer::TYPE_STRING },
  { "'\\xf'",      Tokenizer::TYPE_STRING },
  { "'\\0'",       Tokenizer::TYPE_STRING },

  // Test symbols.
  { "+",           Tokenizer::TYPE_SYMBOL },
  { ".",           Tokenizer::TYPE_SYMBOL },
};

TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kSimpleTokenCases_case.input.data(),
                        kSimpleTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Parse the token.
  ASSERT_TRUE(tokenizer.Next());

  // Check that it has the right type.
  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
  // Check that it contains the complete input text.
  EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
  // Check that it is located at the beginning of the input.
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());

  // After Next() returns false, the token should have type TYPE_END.
  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);
  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
            tokenizer.current().end_column);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
  // Test the "allow_f_after_float" option.

  // Set up the tokenizer.
  const char* text = "1f 2.5f 6e3f 7F";
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_allow_f_after_float(true);

  // Advance through tokens and check that they are parsed as expected.
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "1f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "2.5f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "6e3f");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
  ASSERT_TRUE(tokenizer.Next());
  EXPECT_EQ(tokenizer.current().text, "7F");
  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is parsed to produce a list of tokens.  The
// last token in "output" must have type TYPE_END.
struct MultiTokenCase {
  string input;
  Tokenizer::Token output[10];  // The compiler wants a constant array
                                // size for initialization to work.  There
                                // is no reason this can't be increased if
                                // needed.
};

inline ostream& operator<<(ostream& out,
                           const MultiTokenCase& test_case) {
  return out << CEscape(test_case.input);
}

MultiTokenCase kMultiTokenCases[] = {
  // Test empty input.
  { "", {
    { Tokenizer::TYPE_END       , ""     , 0,  0 },
  }},

  // Test all token types at the same time.
  { "foo 1 1.2 + 'bar'", {
    { Tokenizer::TYPE_IDENTIFIER, "foo"  , 0,  0,  3 },
    { Tokenizer::TYPE_INTEGER   , "1"    , 0,  4,  5 },
    { Tokenizer::TYPE_FLOAT     , "1.2"  , 0,  6,  9 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 10, 11 },
    { Tokenizer::TYPE_STRING    , "'bar'", 0, 12, 17 },
    { Tokenizer::TYPE_END       , ""     , 0, 17, 17 },
  }},

  // Test that consecutive symbols are parsed as separate tokens.
  { "!@+%", {
    { Tokenizer::TYPE_SYMBOL    , "!"    , 0, 0, 1 },
    { Tokenizer::TYPE_SYMBOL    , "@"    , 0, 1, 2 },
    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 2, 3 },
    { Tokenizer::TYPE_SYMBOL    , "%"    , 0, 3, 4 },
    { Tokenizer::TYPE_END       , ""     , 0, 4, 4 },
  }},

  // Test that newlines affect line numbers correctly.
  { "foo bar\nrab oof", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  4, 7 },
    { Tokenizer::TYPE_IDENTIFIER, "rab", 1,  0, 3 },
    { Tokenizer::TYPE_IDENTIFIER, "oof", 1,  4, 7 },
    { Tokenizer::TYPE_END       , ""   , 1,  7, 7 },
  }},

  // Test that tabs affect column numbers correctly.
  { "foo\tbar  \tbaz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  8, 11 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16, 19 },
    { Tokenizer::TYPE_END       , ""   , 0, 19, 19 },
  }},

  // Test that tabs in string literals affect column numbers correctly.
  { "\"foo\tbar\" baz", {
    { Tokenizer::TYPE_STRING    , "\"foo\tbar\"", 0,  0, 12 },
    { Tokenizer::TYPE_IDENTIFIER, "baz"         , 0, 13, 16 },
    { Tokenizer::TYPE_END       , ""            , 0, 16, 16 },
  }},

  // Test that line comments are ignored.
  { "foo // This is a comment\n"
    "bar // This is another comment", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1,  0,  3 },
    { Tokenizer::TYPE_END       , ""   , 1, 30, 30 },
  }},

  // Test that block comments are ignored.
  { "foo /* This is a block comment */ bar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34, 37 },
    { Tokenizer::TYPE_END       , ""   , 0, 37, 37 },
  }},

  // Test that sh-style comments are not ignored by default.
  { "foo # bar\n"
    "baz", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
    { Tokenizer::TYPE_SYMBOL    , "#"  , 0, 4, 5 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6, 9 },
    { Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0, 3 },
    { Tokenizer::TYPE_END       , ""   , 1, 3, 3 },
  }},

  // Test all whitespace chars.
  { "foo\n\t\r\v\fbar", {
    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
    { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11, 14 },
    { Tokenizer::TYPE_END       , ""   , 1, 14, 14 },
  }},
};

TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kMultiTokenCases_case.input.data(),
                        kMultiTokenCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Before Next() is called, the initial token should always be TYPE_START.
  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
  EXPECT_EQ("", tokenizer.current().text);
  EXPECT_EQ(0, tokenizer.current().line);
  EXPECT_EQ(0, tokenizer.current().column);
  EXPECT_EQ(0, tokenizer.current().end_column);

  // Loop through all expected tokens.
  int i = 0;
  Tokenizer::Token token;
  do {
    token = kMultiTokenCases_case.output[i++];

    SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);

    Tokenizer::Token previous = tokenizer.current();

    // Next() should only return false when it hits the end token.
    if (token.type != Tokenizer::TYPE_END) {
      ASSERT_TRUE(tokenizer.Next());
    } else {
      ASSERT_FALSE(tokenizer.Next());
    }

    // Check that the previous token is set correctly.
    EXPECT_EQ(previous.type, tokenizer.previous().type);
    EXPECT_EQ(previous.text, tokenizer.previous().text);
    EXPECT_EQ(previous.line, tokenizer.previous().line);
    EXPECT_EQ(previous.column, tokenizer.previous().column);
    EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);

    // Check that the token matches the expected one.
    EXPECT_EQ(token.type, tokenizer.current().type);
    EXPECT_EQ(token.text, tokenizer.current().text);
    EXPECT_EQ(token.line, tokenizer.current().line);
    EXPECT_EQ(token.column, tokenizer.current().column);
    EXPECT_EQ(token.end_column, tokenizer.current().end_column);

  } while (token.type != Tokenizer::TYPE_END);

  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)

TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
  // Test the "comment_style" option.

  const char* text = "foo # bar\n"
                     "baz // qux\n"
                     "corge /* grault */\n"
                     "garply";
  const char* const kTokens[] = {"foo",  // "# bar" is ignored
                                 "baz", "/", "/", "qux",
                                 "corge", "/", "*", "grault", "*", "/",
                                 "garply"};

  // Set up the tokenizer.
  TestInputStream input(text, strlen(text), kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);
  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);

  // Advance through tokens and check that they are parsed as expected.
  for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
    EXPECT_TRUE(tokenizer.Next());
    EXPECT_EQ(tokenizer.current().text, kTokens[i]);
  }

  // There should be no more input.
  EXPECT_FALSE(tokenizer.Next());
  // There should be no errors.
  EXPECT_TRUE(error_collector.text_.empty());
}

#endif

// -------------------------------------------------------------------

// In each case, the input is expected to have two tokens named "prev" and
// "next" with comments in between.
struct DocCommentCase {
  string input;

  const char* prev_trailing_comments;
  const char* detached_comments[10];
  const char* next_leading_comments;
};

inline ostream& operator<<(ostream& out,
                           const DocCommentCase& test_case) {
  return out << CEscape(test_case.input);
}

DocCommentCase kDocCommentCases[] = {
  {
    "prev next",

    "",
    {},
    ""
  },

  {
    "prev /* ignored */ next",

    "",
    {},
    ""
  },

  {
    "prev // trailing comment\n"
    "next",

    " trailing comment\n",
    {},
    ""
  },

  {
    "prev\n"
    "// leading comment\n"
    "// line 2\n"
    "next",

    "",
    {},
    " leading comment\n"
    " line 2\n"
  },

  {
    "prev\n"
    "// trailing comment\n"
    "// line 2\n"
    "\n"
    "next",

    " trailing comment\n"
    " line 2\n",
    {},
    ""
  },

  {
    "prev // trailing comment\n"
    "// leading comment\n"
    "// line 2\n"
    "next",

    " trailing comment\n",
    {},
    " leading comment\n"
    " line 2\n"
  },

  {
    "prev /* trailing block comment */\n"
    "/* leading block comment\n"
    " * line 2\n"
    " * line 3 */"
    "next",

    " trailing block comment ",
    {},
    " leading block comment\n"
    " line 2\n"
    " line 3 "
  },

  {
    "prev\n"
    "/* trailing block comment\n"
    " * line 2\n"
    " * line 3\n"
    " */\n"
    "/* leading block comment\n"
    " * line 2\n"
    " * line 3 */"
    "next",

    " trailing block comment\n"
    " line 2\n"
    " line 3\n",
    {},
    " leading block comment\n"
    " line 2\n"
    " line 3 "
  },

  {
    "prev\n"
    "// trailing comment\n"
    "\n"
    "// detached comment\n"
    "// line 2\n"
    "\n"
    "// second detached comment\n"
    "/* third detached comment\n"
    " * line 2 */\n"
    "// leading comment\n"
    "next",

    " trailing comment\n",
    {
      " detached comment\n"
      " line 2\n",
      " second detached comment\n",
      " third detached comment\n"
      " line 2 "
    },
    " leading comment\n"
  },

  {
    "prev /**/\n"
    "\n"
    "// detached comment\n"
    "\n"
    "// leading comment\n"
    "next",

    "",
    {
      " detached comment\n"
    },
    " leading comment\n"
  },

  {
    "prev /**/\n"
    "// leading comment\n"
    "next",

    "",
    {},
    " leading comment\n"
  },
};

TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kDocCommentCases_case.input.data(),
                        kDocCommentCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Set up a second tokenizer where we'll pass all NULLs to NextWithComments().
  TestInputStream input2(kDocCommentCases_case.input.data(),
                         kDocCommentCases_case.input.size(),
                         kBlockSizes_case);
  Tokenizer tokenizer2(&input2, &error_collector);

  tokenizer.Next();
  tokenizer2.Next();

  EXPECT_EQ("prev", tokenizer.current().text);
  EXPECT_EQ("prev", tokenizer2.current().text);

  string prev_trailing_comments;
  vector<string> detached_comments;
  string next_leading_comments;
  tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
                             &next_leading_comments);
  tokenizer2.NextWithComments(NULL, NULL, NULL);
  EXPECT_EQ("next", tokenizer.current().text);
  EXPECT_EQ("next", tokenizer2.current().text);

  EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
            prev_trailing_comments);

  for (int i = 0; i < detached_comments.size(); i++) {
    // Guard the index into the fixed-size detached_comments array.
    ASSERT_LT(i, GOOGLE_ARRAYSIZE(kDocCommentCases_case.detached_comments));
    ASSERT_TRUE(kDocCommentCases_case.detached_comments[i] != NULL);
    EXPECT_EQ(kDocCommentCases_case.detached_comments[i],
              detached_comments[i]);
  }

  // Verify that we matched all the detached comments.
  EXPECT_EQ(NULL,
      kDocCommentCases_case.detached_comments[detached_comments.size()]);

  EXPECT_EQ(kDocCommentCases_case.next_leading_comments,
            next_leading_comments);
}

// -------------------------------------------------------------------

// Test parse helpers.  It's not really worth setting up a full data-driven
// test here.
TEST_F(TokenizerTest, ParseInteger) {
  EXPECT_EQ(0, ParseInteger("0"));
  EXPECT_EQ(123, ParseInteger("123"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
  EXPECT_EQ(01234567, ParseInteger("01234567"));
  EXPECT_EQ(0X123, ParseInteger("0X123"));

  // Test invalid integers that may still be tokenized as integers.
  EXPECT_EQ(0, ParseInteger("0x"));

  uint64 i;

  // Test invalid integers that will never be tokenized as integers.
  EXPECT_FALSE(Tokenizer::ParseInteger("zxy", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1.2", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("08", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0xg", kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("-1", kuint64max, &i));

  // Test overflows.
  EXPECT_TRUE (Tokenizer::ParseInteger("0", 0, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("1", 1, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("12345", 12345, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
  EXPECT_TRUE (Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF" , kuint64max, &i));
  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
}

TEST_F(TokenizerTest, ParseFloat) {
  EXPECT_DOUBLE_EQ(1    , Tokenizer::ParseFloat("1."));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1e3"));
  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1E3"));
  EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
  EXPECT_DOUBLE_EQ(.1   , Tokenizer::ParseFloat(".1"));
  EXPECT_DOUBLE_EQ(.25  , Tokenizer::ParseFloat(".25"));
  EXPECT_DOUBLE_EQ(.1e3 , Tokenizer::ParseFloat(".1e3"));
  EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
  EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
  EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
  EXPECT_DOUBLE_EQ(5    , Tokenizer::ParseFloat("5"));
  EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
  EXPECT_DOUBLE_EQ(1.2  , Tokenizer::ParseFloat("1.2"));
  EXPECT_DOUBLE_EQ(1.e2 , Tokenizer::ParseFloat("1.e2"));

  // Test invalid floats that may still be tokenized as floats.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));

  // Test 'f' suffix.
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));

  // These should parse successfully even though they are out of range.
  // Overflows become infinity and underflows become zero.
  EXPECT_EQ(     0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
  EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));

#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  // Test invalid floats that will never be tokenized as floats.
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("1-e0"),
    "passed text that could not have been tokenized as a float");
  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"),
    "passed text that could not have been tokenized as a float");
#endif  // PROTOBUF_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseString) {
  string output;
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
  EXPECT_EQ("blah\nblah2", output);
  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
  Tokenizer::ParseString("'\\x20\\x4'", &output);
  EXPECT_EQ("\x20\x4", output);

  // Test invalid strings that may still be tokenized as strings.
  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
  EXPECT_EQ("\a?\v\t", output);
  Tokenizer::ParseString("'", &output);
  EXPECT_EQ("", output);
  Tokenizer::ParseString("'\\", &output);
  EXPECT_EQ("\\", output);

  // Experiment with Unicode escapes.  Here are one-, two-, three- and
  // four-byte (in UTF-8) Unicode characters.
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\U00024b62XX'", &output);
  EXPECT_EQ("$¢€𤭢XX", output);
  // Same thing encoded using UTF16.
  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", &output);
  EXPECT_EQ("$¢€𤭢XX", output);
  // Here's some broken UTF16; there's a head surrogate with no tail surrogate.
  // We just output this as if it were UTF8; it's not a defined code point, but
  // it has a defined encoding.
  Tokenizer::ParseString("'\\ud852XX'", &output);
  EXPECT_EQ("\xed\xa1\x92XX", output);
  // Malformed escape: Demons may fly out of the nose.
  Tokenizer::ParseString("\\u0", &output);
  EXPECT_EQ("u0", output);

  // Test invalid strings that will never be tokenized as strings.
#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
  EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
    "passed text that could not have been tokenized as a string");
#endif  // PROTOBUF_HAS_DEATH_TEST
}

TEST_F(TokenizerTest, ParseStringAppend) {
  // Check that ParseString and ParseStringAppend differ.
  string output("stuff+");
  Tokenizer::ParseStringAppend("'hello'", &output);
  EXPECT_EQ("stuff+hello", output);
  Tokenizer::ParseString("'hello'", &output);
  EXPECT_EQ("hello", output);
}

// -------------------------------------------------------------------

// Each case parses some input text, ignoring the tokens produced, and
// checks that the error output matches what is expected.
struct ErrorCase {
  string input;
  bool recoverable;  // True if the tokenizer should be able to recover and
                     // parse more tokens after seeing this error.  Cases
                     // for which this is true must end with "foo" as
                     // the last token, which the test will check for.
  const char* errors;
};

inline ostream& operator<<(ostream& out,
                           const ErrorCase& test_case) {
  return out << CEscape(test_case.input);
}

ErrorCase kErrorCases[] = {
  // String errors.
  { "'\\l' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\X' foo", true,
    "0:2: Invalid escape sequence in string literal.\n" },
  { "'\\x' foo", true,
    "0:3: Expected hex digits for escape sequence.\n" },
  { "'foo", false,
    "0:4: Unexpected end of string.\n" },
  { "'bar\nfoo", true,
    "0:4: String literals cannot cross line boundaries.\n" },
  { "'\\u01' foo", true,
    "0:5: Expected four hex digits for \\u escape sequence.\n" },
  { "'\\u01' foo", true,
    "0:5: Expected four hex digits for \\u escape sequence.\n" },
  { "'\\uXYZ' foo", true,
    "0:3: Expected four hex digits for \\u escape sequence.\n" },

  // Integer errors.
  { "123foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Hex/octal errors.
  { "0x foo", true,
    "0:2: \"0x\" must be followed by hex digits.\n" },
  { "0541823 foo", true,
    "0:4: Numbers starting with leading zero must be in octal.\n" },
  { "0x123z foo", true,
    "0:5: Need space between number and identifier.\n" },
  { "0x123.4 foo", true,
    "0:5: Hex and octal numbers must be integers.\n" },
  { "0123.4 foo", true,
    "0:4: Hex and octal numbers must be integers.\n" },

  // Float errors.
  { "1e foo", true,
    "0:2: \"e\" must be followed by exponent.\n" },
  { "1e- foo", true,
    "0:3: \"e\" must be followed by exponent.\n" },
  { "1.2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "1e2.3 foo", true,
    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
  { "a.1 foo", true,
    "0:1: Need space between identifier and decimal point.\n" },
  // allow_f_after_float not enabled, so this should be an error.
  { "1.0f foo", true,
    "0:3: Need space between number and identifier.\n" },

  // Block comment errors.
  { "/*", false,
    "0:2: End-of-file inside block comment.\n"
    "0:0:   Comment started here.\n"},
  { "/*/*/ foo", true,
    "0:3: \"/*\" inside block comment.  Block comments cannot be nested.\n"},

  // Control characters.  Multiple consecutive control characters should only
  // produce one error.
  { "\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },
  { "\b\b foo", true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check that control characters at end of input don't result in an
  // infinite loop.
  { "\b", false,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check recovery from '\0'.  We have to explicitly specify the length of
  // these strings because otherwise the string constructor will just call
  // strlen() which will see the first '\0' and think that is the end of the
  // string.
  { string("\0foo", 4), true,
    "0:0: Invalid control characters encountered in text.\n" },
  { string("\0\0foo", 5), true,
    "0:0: Invalid control characters encountered in text.\n" },

  // Check error from high order bits set.
  { "\300foo", true,
    "0:0: Interpreting non ascii codepoint 192.\n" },
};


TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
  // Set up the tokenizer.
  TestInputStream input(kErrorCases_case.input.data(),
                        kErrorCases_case.input.size(),
                        kBlockSizes_case);
  TestErrorCollector error_collector;
  Tokenizer tokenizer(&input, &error_collector);

  // Ignore all input, except remember if the last token was "foo".
  bool last_was_foo = false;
  while (tokenizer.Next()) {
    last_was_foo = tokenizer.current().text == "foo";
  }

  // Check that the errors match what was expected.
  EXPECT_EQ(kErrorCases_case.errors, error_collector.text_);

  // If the error was recoverable, make sure we saw "foo" after it.
  if (kErrorCases_case.recoverable) {
    EXPECT_TRUE(last_was_foo);
  }
}

// -------------------------------------------------------------------

TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
  string text = "foo bar";
  TestInputStream input(text.data(), text.size(), kBlockSizes_case);

  // Create a tokenizer, read one token, then destroy it.
  {
    TestErrorCollector error_collector;
    Tokenizer tokenizer(&input, &error_collector);

    tokenizer.Next();
  }

  // Only "foo" should have been read.
  EXPECT_EQ(strlen("foo"), input.ByteCount());
}


}  // namespace
}  // namespace io
}  // namespace protobuf
}  // namespace google