1// Protocol Buffers - Google's data interchange format
2// Copyright 2008 Google Inc.  All rights reserved.
3// https://developers.google.com/protocol-buffers/
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are
7// met:
8//
9//     * Redistributions of source code must retain the above copyright
10// notice, this list of conditions and the following disclaimer.
11//     * Redistributions in binary form must reproduce the above
12// copyright notice, this list of conditions and the following disclaimer
13// in the documentation and/or other materials provided with the
14// distribution.
15//     * Neither the name of Google Inc. nor the names of its
16// contributors may be used to endorse or promote products derived from
17// this software without specific prior written permission.
18//
19// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31// Author: kenton@google.com (Kenton Varda)
32//  Based on original Protocol Buffers design by
33//  Sanjay Ghemawat, Jeff Dean, and others.
34
35#include <limits.h>
36#include <math.h>
37
38#include <vector>
39
40#include <google/protobuf/io/tokenizer.h>
41#include <google/protobuf/io/zero_copy_stream_impl.h>
42
43#include <google/protobuf/stubs/common.h>
44#include <google/protobuf/stubs/strutil.h>
45#include <google/protobuf/stubs/substitute.h>
46#include <google/protobuf/testing/googletest.h>
47#include <gtest/gtest.h>
48
49namespace google {
50namespace protobuf {
51namespace io {
52namespace {
53
54// ===================================================================
55// Data-Driven Test Infrastructure
56
57// TODO(kenton):  This is copied from coded_stream_unittest.  This is
58//   temporary until these fetaures are integrated into gTest itself.
59
60// TEST_1D and TEST_2D are macros I'd eventually like to see added to
61// gTest.  These macros can be used to declare tests which should be
62// run multiple times, once for each item in some input array.  TEST_1D
63// tests all cases in a single input array.  TEST_2D tests all
64// combinations of cases from two arrays.  The arrays must be statically
65// defined such that the GOOGLE_ARRAYSIZE() macro works on them.  Example:
66//
67// int kCases[] = {1, 2, 3, 4}
68// TEST_1D(MyFixture, MyTest, kCases) {
69//   EXPECT_GT(kCases_case, 0);
70// }
71//
72// This test iterates through the numbers 1, 2, 3, and 4 and tests that
73// they are all grater than zero.  In case of failure, the exact case
74// which failed will be printed.  The case type must be printable using
75// ostream::operator<<.
76
77#define TEST_1D(FIXTURE, NAME, CASES)                                      \
78  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
79   protected:                                                              \
80    template <typename CaseType>                                           \
81    void DoSingleCase(const CaseType& CASES##_case);                       \
82  };                                                                       \
83                                                                           \
84  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
85    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES); i++) {                           \
86      SCOPED_TRACE(testing::Message()                                      \
87        << #CASES " case #" << i << ": " << CASES[i]);                     \
88      DoSingleCase(CASES[i]);                                              \
89    }                                                                      \
90  }                                                                        \
91                                                                           \
92  template <typename CaseType>                                             \
93  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType& CASES##_case)
94
95#define TEST_2D(FIXTURE, NAME, CASES1, CASES2)                             \
96  class FIXTURE##_##NAME##_DD : public FIXTURE {                           \
97   protected:                                                              \
98    template <typename CaseType1, typename CaseType2>                      \
99    void DoSingleCase(const CaseType1& CASES1##_case,                      \
100                      const CaseType2& CASES2##_case);                     \
101  };                                                                       \
102                                                                           \
103  TEST_F(FIXTURE##_##NAME##_DD, NAME) {                                    \
104    for (int i = 0; i < GOOGLE_ARRAYSIZE(CASES1); i++) {                          \
105      for (int j = 0; j < GOOGLE_ARRAYSIZE(CASES2); j++) {                        \
106        SCOPED_TRACE(testing::Message()                                    \
107          << #CASES1 " case #" << i << ": " << CASES1[i] << ", "           \
108          << #CASES2 " case #" << j << ": " << CASES2[j]);                 \
109        DoSingleCase(CASES1[i], CASES2[j]);                                \
110      }                                                                    \
111    }                                                                      \
112  }                                                                        \
113                                                                           \
114  template <typename CaseType1, typename CaseType2>                        \
115  void FIXTURE##_##NAME##_DD::DoSingleCase(const CaseType1& CASES1##_case, \
116                                           const CaseType2& CASES2##_case)
117
118// -------------------------------------------------------------------
119
120// An input stream that is basically like an ArrayInputStream but sometimes
121// returns empty buffers, just to throw us off.
122class TestInputStream : public ZeroCopyInputStream {
123 public:
124  TestInputStream(const void* data, int size, int block_size)
125    : array_stream_(data, size, block_size), counter_(0) {}
126  ~TestInputStream() {}
127
128  // implements ZeroCopyInputStream ----------------------------------
129  bool Next(const void** data, int* size) {
130    // We'll return empty buffers starting with the first buffer, and every
131    // 3 and 5 buffers after that.
132    if (counter_ % 3 == 0 || counter_ % 5 == 0) {
133      *data = NULL;
134      *size = 0;
135      ++counter_;
136      return true;
137    } else {
138      ++counter_;
139      return array_stream_.Next(data, size);
140    }
141  }
142
143  void BackUp(int count)  { return array_stream_.BackUp(count); }
144  bool Skip(int count)    { return array_stream_.Skip(count);   }
145  int64 ByteCount() const { return array_stream_.ByteCount();   }
146
147 private:
148  ArrayInputStream array_stream_;
149  int counter_;
150};
151
152// -------------------------------------------------------------------
153
154// An error collector which simply concatenates all its errors into a big
155// block of text which can be checked.
156class TestErrorCollector : public ErrorCollector {
157 public:
158  TestErrorCollector() {}
159  ~TestErrorCollector() {}
160
161  string text_;
162
163  // implements ErrorCollector ---------------------------------------
164  void AddError(int line, int column, const string& message) {
165    strings::SubstituteAndAppend(&text_, "$0:$1: $2\n",
166                                 line, column, message);
167  }
168};
169
170// -------------------------------------------------------------------
171
172// We test each operation over a variety of block sizes to insure that
173// we test cases where reads cross buffer boundaries as well as cases
174// where they don't.  This is sort of a brute-force approach to this,
175// but it's easy to write and easy to understand.
176const int kBlockSizes[] = {1, 2, 3, 5, 7, 13, 32, 1024};
177
178class TokenizerTest : public testing::Test {
179 protected:
180  // For easy testing.
181  uint64 ParseInteger(const string& text) {
182    uint64 result;
183    EXPECT_TRUE(Tokenizer::ParseInteger(text, kuint64max, &result));
184    return result;
185  }
186};
187
188// ===================================================================
189
190// These tests causes gcc 3.3.5 (and earlier?) to give the cryptic error:
191//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
192#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
193
194// In each test case, the entire input text should parse as a single token
195// of the given type.
196struct SimpleTokenCase {
197  string input;
198  Tokenizer::TokenType type;
199};
200
201inline ostream& operator<<(ostream& out,
202                           const SimpleTokenCase& test_case) {
203  return out << CEscape(test_case.input);
204}
205
206SimpleTokenCase kSimpleTokenCases[] = {
207  // Test identifiers.
208  { "hello",       Tokenizer::TYPE_IDENTIFIER },
209
210  // Test integers.
211  { "123",         Tokenizer::TYPE_INTEGER },
212  { "0xab6",       Tokenizer::TYPE_INTEGER },
213  { "0XAB6",       Tokenizer::TYPE_INTEGER },
214  { "0X1234567",   Tokenizer::TYPE_INTEGER },
215  { "0x89abcdef",  Tokenizer::TYPE_INTEGER },
216  { "0x89ABCDEF",  Tokenizer::TYPE_INTEGER },
217  { "01234567",    Tokenizer::TYPE_INTEGER },
218
219  // Test floats.
220  { "123.45",      Tokenizer::TYPE_FLOAT },
221  { "1.",          Tokenizer::TYPE_FLOAT },
222  { "1e3",         Tokenizer::TYPE_FLOAT },
223  { "1E3",         Tokenizer::TYPE_FLOAT },
224  { "1e-3",        Tokenizer::TYPE_FLOAT },
225  { "1e+3",        Tokenizer::TYPE_FLOAT },
226  { "1.e3",        Tokenizer::TYPE_FLOAT },
227  { "1.2e3",       Tokenizer::TYPE_FLOAT },
228  { ".1",          Tokenizer::TYPE_FLOAT },
229  { ".1e3",        Tokenizer::TYPE_FLOAT },
230  { ".1e-3",       Tokenizer::TYPE_FLOAT },
231  { ".1e+3",       Tokenizer::TYPE_FLOAT },
232
233  // Test strings.
234  { "'hello'",     Tokenizer::TYPE_STRING },
235  { "\"foo\"",     Tokenizer::TYPE_STRING },
236  { "'a\"b'",      Tokenizer::TYPE_STRING },
237  { "\"a'b\"",     Tokenizer::TYPE_STRING },
238  { "'a\\'b'",     Tokenizer::TYPE_STRING },
239  { "\"a\\\"b\"",  Tokenizer::TYPE_STRING },
240  { "'\\xf'",      Tokenizer::TYPE_STRING },
241  { "'\\0'",       Tokenizer::TYPE_STRING },
242
243  // Test symbols.
244  { "+",           Tokenizer::TYPE_SYMBOL },
245  { ".",           Tokenizer::TYPE_SYMBOL },
246};
247
248TEST_2D(TokenizerTest, SimpleTokens, kSimpleTokenCases, kBlockSizes) {
249  // Set up the tokenizer.
250  TestInputStream input(kSimpleTokenCases_case.input.data(),
251                        kSimpleTokenCases_case.input.size(),
252                        kBlockSizes_case);
253  TestErrorCollector error_collector;
254  Tokenizer tokenizer(&input, &error_collector);
255
256  // Before Next() is called, the initial token should always be TYPE_START.
257  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
258  EXPECT_EQ("", tokenizer.current().text);
259  EXPECT_EQ(0, tokenizer.current().line);
260  EXPECT_EQ(0, tokenizer.current().column);
261  EXPECT_EQ(0, tokenizer.current().end_column);
262
263  // Parse the token.
264  ASSERT_TRUE(tokenizer.Next());
265
266  // Check that it has the right type.
267  EXPECT_EQ(kSimpleTokenCases_case.type, tokenizer.current().type);
268  // Check that it contains the complete input text.
269  EXPECT_EQ(kSimpleTokenCases_case.input, tokenizer.current().text);
270  // Check that it is located at the beginning of the input
271  EXPECT_EQ(0, tokenizer.current().line);
272  EXPECT_EQ(0, tokenizer.current().column);
273  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
274            tokenizer.current().end_column);
275
276  // There should be no more input.
277  EXPECT_FALSE(tokenizer.Next());
278
279  // After Next() returns false, the token should have type TYPE_END.
280  EXPECT_EQ(Tokenizer::TYPE_END, tokenizer.current().type);
281  EXPECT_EQ("", tokenizer.current().text);
282  EXPECT_EQ(0, tokenizer.current().line);
283  EXPECT_EQ(kSimpleTokenCases_case.input.size(), tokenizer.current().column);
284  EXPECT_EQ(kSimpleTokenCases_case.input.size(),
285            tokenizer.current().end_column);
286
287  // There should be no errors.
288  EXPECT_TRUE(error_collector.text_.empty());
289}
290
291TEST_1D(TokenizerTest, FloatSuffix, kBlockSizes) {
292  // Test the "allow_f_after_float" option.
293
294  // Set up the tokenizer.
295  const char* text = "1f 2.5f 6e3f 7F";
296  TestInputStream input(text, strlen(text), kBlockSizes_case);
297  TestErrorCollector error_collector;
298  Tokenizer tokenizer(&input, &error_collector);
299  tokenizer.set_allow_f_after_float(true);
300
301  // Advance through tokens and check that they are parsed as expected.
302  ASSERT_TRUE(tokenizer.Next());
303  EXPECT_EQ(tokenizer.current().text, "1f");
304  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
305  ASSERT_TRUE(tokenizer.Next());
306  EXPECT_EQ(tokenizer.current().text, "2.5f");
307  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
308  ASSERT_TRUE(tokenizer.Next());
309  EXPECT_EQ(tokenizer.current().text, "6e3f");
310  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
311  ASSERT_TRUE(tokenizer.Next());
312  EXPECT_EQ(tokenizer.current().text, "7F");
313  EXPECT_EQ(tokenizer.current().type, Tokenizer::TYPE_FLOAT);
314
315  // There should be no more input.
316  EXPECT_FALSE(tokenizer.Next());
317  // There should be no errors.
318  EXPECT_TRUE(error_collector.text_.empty());
319}
320
321#endif
322
323// -------------------------------------------------------------------
324
325// In each case, the input is parsed to produce a list of tokens.  The
326// last token in "output" must have type TYPE_END.
327struct MultiTokenCase {
328  string input;
329  Tokenizer::Token output[10];  // The compiler wants a constant array
330                                // size for initialization to work.  There
331                                // is no reason this can't be increased if
332                                // needed.
333};
334
335inline ostream& operator<<(ostream& out,
336                           const MultiTokenCase& test_case) {
337  return out << CEscape(test_case.input);
338}
339
340MultiTokenCase kMultiTokenCases[] = {
341  // Test empty input.
342  { "", {
343    { Tokenizer::TYPE_END       , ""     , 0,  0 },
344  }},
345
346  // Test all token types at the same time.
347  { "foo 1 1.2 + 'bar'", {
348    { Tokenizer::TYPE_IDENTIFIER, "foo"  , 0,  0,  3 },
349    { Tokenizer::TYPE_INTEGER   , "1"    , 0,  4,  5 },
350    { Tokenizer::TYPE_FLOAT     , "1.2"  , 0,  6,  9 },
351    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 10, 11 },
352    { Tokenizer::TYPE_STRING    , "'bar'", 0, 12, 17 },
353    { Tokenizer::TYPE_END       , ""     , 0, 17, 17 },
354  }},
355
356  // Test that consecutive symbols are parsed as separate tokens.
357  { "!@+%", {
358    { Tokenizer::TYPE_SYMBOL    , "!"    , 0, 0, 1 },
359    { Tokenizer::TYPE_SYMBOL    , "@"    , 0, 1, 2 },
360    { Tokenizer::TYPE_SYMBOL    , "+"    , 0, 2, 3 },
361    { Tokenizer::TYPE_SYMBOL    , "%"    , 0, 3, 4 },
362    { Tokenizer::TYPE_END       , ""     , 0, 4, 4 },
363  }},
364
365  // Test that newlines affect line numbers correctly.
366  { "foo bar\nrab oof", {
367    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0, 3 },
368    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  4, 7 },
369    { Tokenizer::TYPE_IDENTIFIER, "rab", 1,  0, 3 },
370    { Tokenizer::TYPE_IDENTIFIER, "oof", 1,  4, 7 },
371    { Tokenizer::TYPE_END       , ""   , 1,  7, 7 },
372  }},
373
374  // Test that tabs affect column numbers correctly.
375  { "foo\tbar  \tbaz", {
376    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
377    { Tokenizer::TYPE_IDENTIFIER, "bar", 0,  8, 11 },
378    { Tokenizer::TYPE_IDENTIFIER, "baz", 0, 16, 19 },
379    { Tokenizer::TYPE_END       , ""   , 0, 19, 19 },
380  }},
381
382  // Test that tabs in string literals affect column numbers correctly.
383  { "\"foo\tbar\" baz", {
384    { Tokenizer::TYPE_STRING    , "\"foo\tbar\"", 0,  0, 12 },
385    { Tokenizer::TYPE_IDENTIFIER, "baz"         , 0, 13, 16 },
386    { Tokenizer::TYPE_END       , ""            , 0, 16, 16 },
387  }},
388
389  // Test that line comments are ignored.
390  { "foo // This is a comment\n"
391    "bar // This is another comment", {
392    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
393    { Tokenizer::TYPE_IDENTIFIER, "bar", 1,  0,  3 },
394    { Tokenizer::TYPE_END       , ""   , 1, 30, 30 },
395  }},
396
397  // Test that block comments are ignored.
398  { "foo /* This is a block comment */ bar", {
399    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
400    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 34, 37 },
401    { Tokenizer::TYPE_END       , ""   , 0, 37, 37 },
402  }},
403
404  // Test that sh-style comments are not ignored by default.
405  { "foo # bar\n"
406    "baz", {
407    { Tokenizer::TYPE_IDENTIFIER, "foo", 0, 0, 3 },
408    { Tokenizer::TYPE_SYMBOL    , "#"  , 0, 4, 5 },
409    { Tokenizer::TYPE_IDENTIFIER, "bar", 0, 6, 9 },
410    { Tokenizer::TYPE_IDENTIFIER, "baz", 1, 0, 3 },
411    { Tokenizer::TYPE_END       , ""   , 1, 3, 3 },
412  }},
413
414  // Test all whitespace chars
415  { "foo\n\t\r\v\fbar", {
416    { Tokenizer::TYPE_IDENTIFIER, "foo", 0,  0,  3 },
417    { Tokenizer::TYPE_IDENTIFIER, "bar", 1, 11, 14 },
418    { Tokenizer::TYPE_END       , ""   , 1, 14, 14 },
419  }},
420};
421
422TEST_2D(TokenizerTest, MultipleTokens, kMultiTokenCases, kBlockSizes) {
423  // Set up the tokenizer.
424  TestInputStream input(kMultiTokenCases_case.input.data(),
425                        kMultiTokenCases_case.input.size(),
426                        kBlockSizes_case);
427  TestErrorCollector error_collector;
428  Tokenizer tokenizer(&input, &error_collector);
429
430  // Before Next() is called, the initial token should always be TYPE_START.
431  EXPECT_EQ(Tokenizer::TYPE_START, tokenizer.current().type);
432  EXPECT_EQ("", tokenizer.current().text);
433  EXPECT_EQ(0, tokenizer.current().line);
434  EXPECT_EQ(0, tokenizer.current().column);
435  EXPECT_EQ(0, tokenizer.current().end_column);
436
437  // Loop through all expected tokens.
438  int i = 0;
439  Tokenizer::Token token;
440  do {
441    token = kMultiTokenCases_case.output[i++];
442
443    SCOPED_TRACE(testing::Message() << "Token #" << i << ": " << token.text);
444
445    Tokenizer::Token previous = tokenizer.current();
446
447    // Next() should only return false when it hits the end token.
448    if (token.type != Tokenizer::TYPE_END) {
449      ASSERT_TRUE(tokenizer.Next());
450    } else {
451      ASSERT_FALSE(tokenizer.Next());
452    }
453
454    // Check that the previous token is set correctly.
455    EXPECT_EQ(previous.type, tokenizer.previous().type);
456    EXPECT_EQ(previous.text, tokenizer.previous().text);
457    EXPECT_EQ(previous.line, tokenizer.previous().line);
458    EXPECT_EQ(previous.column, tokenizer.previous().column);
459    EXPECT_EQ(previous.end_column, tokenizer.previous().end_column);
460
461    // Check that the token matches the expected one.
462    EXPECT_EQ(token.type, tokenizer.current().type);
463    EXPECT_EQ(token.text, tokenizer.current().text);
464    EXPECT_EQ(token.line, tokenizer.current().line);
465    EXPECT_EQ(token.column, tokenizer.current().column);
466    EXPECT_EQ(token.end_column, tokenizer.current().end_column);
467
468  } while (token.type != Tokenizer::TYPE_END);
469
470  // There should be no errors.
471  EXPECT_TRUE(error_collector.text_.empty());
472}
473
474// This test causes gcc 3.3.5 (and earlier?) to give the cryptic error:
475//   "sorry, unimplemented: `method_call_expr' not supported by dump_expr"
476#if !defined(__GNUC__) || __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 3)
477
478TEST_1D(TokenizerTest, ShCommentStyle, kBlockSizes) {
479  // Test the "comment_style" option.
480
481  const char* text = "foo # bar\n"
482                     "baz // qux\n"
483                     "corge /* grault */\n"
484                     "garply";
485  const char* const kTokens[] = {"foo",  // "# bar" is ignored
486                                 "baz", "/", "/", "qux",
487                                 "corge", "/", "*", "grault", "*", "/",
488                                 "garply"};
489
490  // Set up the tokenizer.
491  TestInputStream input(text, strlen(text), kBlockSizes_case);
492  TestErrorCollector error_collector;
493  Tokenizer tokenizer(&input, &error_collector);
494  tokenizer.set_comment_style(Tokenizer::SH_COMMENT_STYLE);
495
496  // Advance through tokens and check that they are parsed as expected.
497  for (int i = 0; i < GOOGLE_ARRAYSIZE(kTokens); i++) {
498    EXPECT_TRUE(tokenizer.Next());
499    EXPECT_EQ(tokenizer.current().text, kTokens[i]);
500  }
501
502  // There should be no more input.
503  EXPECT_FALSE(tokenizer.Next());
504  // There should be no errors.
505  EXPECT_TRUE(error_collector.text_.empty());
506}
507
508#endif
509
510// -------------------------------------------------------------------
511
512// In each case, the input is expected to have two tokens named "prev" and
513// "next" with comments in between.
514struct DocCommentCase {
515  string input;
516
517  const char* prev_trailing_comments;
518  const char* detached_comments[10];
519  const char* next_leading_comments;
520};
521
522inline ostream& operator<<(ostream& out,
523                           const DocCommentCase& test_case) {
524  return out << CEscape(test_case.input);
525}
526
527DocCommentCase kDocCommentCases[] = {
528  {
529    "prev next",
530
531    "",
532    {},
533    ""
534      },
535
536        {
537      "prev /* ignored */ next",
538
539      "",
540      {},
541      ""
542        },
543
544          {
545        "prev // trailing comment\n"
546            "next",
547
548            " trailing comment\n",
549            {},
550            ""
551          },
552
553            {
554          "prev\n"
555              "// leading comment\n"
556              "// line 2\n"
557              "next",
558
559              "",
560              {},
561              " leading comment\n"
562              " line 2\n"
563            },
564
565              {
566            "prev\n"
567                "// trailing comment\n"
568                "// line 2\n"
569                "\n"
570                "next",
571
572                " trailing comment\n"
573                " line 2\n",
574                {},
575                ""
576              },
577
578                {
579              "prev // trailing comment\n"
580                  "// leading comment\n"
581                  "// line 2\n"
582                  "next",
583
584                  " trailing comment\n",
585                  {},
586                  " leading comment\n"
587                  " line 2\n"
588                },
589
590                  {
591                "prev /* trailing block comment */\n"
592                    "/* leading block comment\n"
593                    " * line 2\n"
594                    " * line 3 */"
595                    "next",
596
597                    " trailing block comment ",
598                    {},
599                    " leading block comment\n"
600                    " line 2\n"
601                    " line 3 "
602                  },
603
604                    {
605                  "prev\n"
606                      "/* trailing block comment\n"
607                      " * line 2\n"
608                      " * line 3\n"
609                      " */\n"
610                      "/* leading block comment\n"
611                      " * line 2\n"
612                      " * line 3 */"
613                      "next",
614
615                      " trailing block comment\n"
616                      " line 2\n"
617                      " line 3\n",
618                      {},
619                      " leading block comment\n"
620                      " line 2\n"
621                      " line 3 "
622                    },
623
624                      {
625                    "prev\n"
626                        "// trailing comment\n"
627                        "\n"
628                        "// detached comment\n"
629                        "// line 2\n"
630                        "\n"
631                        "// second detached comment\n"
632                        "/* third detached comment\n"
633                        " * line 2 */\n"
634                        "// leading comment\n"
635                        "next",
636
637                        " trailing comment\n",
638                        {
639                      " detached comment\n"
640                          " line 2\n",
641                          " second detached comment\n",
642                          " third detached comment\n"
643                          " line 2 "
644                        },
645                          " leading comment\n"
646                        },
647
648                          {
649                        "prev /**/\n"
650                            "\n"
651                            "// detached comment\n"
652                            "\n"
653                            "// leading comment\n"
654                            "next",
655
656                            "",
657                            {
658                          " detached comment\n"
659                            },
660                              " leading comment\n"
661                            },
662
663                              {
664                            "prev /**/\n"
665                                "// leading comment\n"
666                                "next",
667
668                                "",
669                                {},
670                                " leading comment\n"
671                              },
672                              };
673
674TEST_2D(TokenizerTest, DocComments, kDocCommentCases, kBlockSizes) {
675  // Set up the tokenizer.
676  TestInputStream input(kDocCommentCases_case.input.data(),
677                        kDocCommentCases_case.input.size(),
678                        kBlockSizes_case);
679  TestErrorCollector error_collector;
680  Tokenizer tokenizer(&input, &error_collector);
681
682  // Set up a second tokenizer where we'll pass all NULLs to NextWithComments().
683  TestInputStream input2(kDocCommentCases_case.input.data(),
684                        kDocCommentCases_case.input.size(),
685                        kBlockSizes_case);
686  Tokenizer tokenizer2(&input2, &error_collector);
687
688  tokenizer.Next();
689  tokenizer2.Next();
690
691  EXPECT_EQ("prev", tokenizer.current().text);
692  EXPECT_EQ("prev", tokenizer2.current().text);
693
694  string prev_trailing_comments;
695  vector<string> detached_comments;
696  string next_leading_comments;
697  tokenizer.NextWithComments(&prev_trailing_comments, &detached_comments,
698                             &next_leading_comments);
699  tokenizer2.NextWithComments(NULL, NULL, NULL);
700  EXPECT_EQ("next", tokenizer.current().text);
701  EXPECT_EQ("next", tokenizer2.current().text);
702
703  EXPECT_EQ(kDocCommentCases_case.prev_trailing_comments,
704            prev_trailing_comments);
705
706  for (int i = 0; i < detached_comments.size(); i++) {
707    ASSERT_LT(i, GOOGLE_ARRAYSIZE(kDocCommentCases));
708    ASSERT_TRUE(kDocCommentCases_case.detached_comments[i] != NULL);
709    EXPECT_EQ(kDocCommentCases_case.detached_comments[i],
710              detached_comments[i]);
711  }
712
713  // Verify that we matched all the detached comments.
714  EXPECT_EQ(NULL,
715      kDocCommentCases_case.detached_comments[detached_comments.size()]);
716
717  EXPECT_EQ(kDocCommentCases_case.next_leading_comments,
718            next_leading_comments);
719}
720
721// -------------------------------------------------------------------
722
723// Test parse helpers.  It's not really worth setting up a full data-driven
724// test here.
725TEST_F(TokenizerTest, ParseInteger) {
726  EXPECT_EQ(0, ParseInteger("0"));
727  EXPECT_EQ(123, ParseInteger("123"));
728  EXPECT_EQ(0xabcdef12u, ParseInteger("0xabcdef12"));
729  EXPECT_EQ(0xabcdef12u, ParseInteger("0xABCDEF12"));
730  EXPECT_EQ(kuint64max, ParseInteger("0xFFFFFFFFFFFFFFFF"));
731  EXPECT_EQ(01234567, ParseInteger("01234567"));
732  EXPECT_EQ(0X123, ParseInteger("0X123"));
733
734  // Test invalid integers that may still be tokenized as integers.
735  EXPECT_EQ(0, ParseInteger("0x"));
736
737  uint64 i;
738#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
739  // Test invalid integers that will never be tokenized as integers.
740  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("zxy", kuint64max, &i),
741    "passed text that could not have been tokenized as an integer");
742  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("1.2", kuint64max, &i),
743    "passed text that could not have been tokenized as an integer");
744  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("08", kuint64max, &i),
745    "passed text that could not have been tokenized as an integer");
746  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("0xg", kuint64max, &i),
747    "passed text that could not have been tokenized as an integer");
748  EXPECT_DEBUG_DEATH(Tokenizer::ParseInteger("-1", kuint64max, &i),
749    "passed text that could not have been tokenized as an integer");
750#endif  // PROTOBUF_HAS_DEATH_TEST
751
752  // Test overflows.
753  EXPECT_TRUE (Tokenizer::ParseInteger("0", 0, &i));
754  EXPECT_FALSE(Tokenizer::ParseInteger("1", 0, &i));
755  EXPECT_TRUE (Tokenizer::ParseInteger("1", 1, &i));
756  EXPECT_TRUE (Tokenizer::ParseInteger("12345", 12345, &i));
757  EXPECT_FALSE(Tokenizer::ParseInteger("12346", 12345, &i));
758  EXPECT_TRUE (Tokenizer::ParseInteger("0xFFFFFFFFFFFFFFFF" , kuint64max, &i));
759  EXPECT_FALSE(Tokenizer::ParseInteger("0x10000000000000000", kuint64max, &i));
760}
761
762TEST_F(TokenizerTest, ParseFloat) {
763  EXPECT_DOUBLE_EQ(1    , Tokenizer::ParseFloat("1."));
764  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1e3"));
765  EXPECT_DOUBLE_EQ(1e3  , Tokenizer::ParseFloat("1E3"));
766  EXPECT_DOUBLE_EQ(1.5e3, Tokenizer::ParseFloat("1.5e3"));
767  EXPECT_DOUBLE_EQ(.1   , Tokenizer::ParseFloat(".1"));
768  EXPECT_DOUBLE_EQ(.25  , Tokenizer::ParseFloat(".25"));
769  EXPECT_DOUBLE_EQ(.1e3 , Tokenizer::ParseFloat(".1e3"));
770  EXPECT_DOUBLE_EQ(.25e3, Tokenizer::ParseFloat(".25e3"));
771  EXPECT_DOUBLE_EQ(.1e+3, Tokenizer::ParseFloat(".1e+3"));
772  EXPECT_DOUBLE_EQ(.1e-3, Tokenizer::ParseFloat(".1e-3"));
773  EXPECT_DOUBLE_EQ(5    , Tokenizer::ParseFloat("5"));
774  EXPECT_DOUBLE_EQ(6e-12, Tokenizer::ParseFloat("6e-12"));
775  EXPECT_DOUBLE_EQ(1.2  , Tokenizer::ParseFloat("1.2"));
776  EXPECT_DOUBLE_EQ(1.e2 , Tokenizer::ParseFloat("1.e2"));
777
778  // Test invalid integers that may still be tokenized as integers.
779  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e"));
780  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1e-"));
781  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.e"));
782
783  // Test 'f' suffix.
784  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1f"));
785  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1.0f"));
786  EXPECT_DOUBLE_EQ(1, Tokenizer::ParseFloat("1F"));
787
788  // These should parse successfully even though they are out of range.
789  // Overflows become infinity and underflows become zero.
790  EXPECT_EQ(     0.0, Tokenizer::ParseFloat("1e-9999999999999999999999999999"));
791  EXPECT_EQ(HUGE_VAL, Tokenizer::ParseFloat("1e+9999999999999999999999999999"));
792
793#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
794  // Test invalid integers that will never be tokenized as integers.
795  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("zxy"),
796    "passed text that could not have been tokenized as a float");
797  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("1-e0"),
798    "passed text that could not have been tokenized as a float");
799  EXPECT_DEBUG_DEATH(Tokenizer::ParseFloat("-1.0"),
800    "passed text that could not have been tokenized as a float");
801#endif  // PROTOBUF_HAS_DEATH_TEST
802}
803
804TEST_F(TokenizerTest, ParseString) {
805  string output;
806  Tokenizer::ParseString("'hello'", &output);
807  EXPECT_EQ("hello", output);
808  Tokenizer::ParseString("\"blah\\nblah2\"", &output);
809  EXPECT_EQ("blah\nblah2", output);
810  Tokenizer::ParseString("'\\1x\\1\\123\\739\\52\\334n\\3'", &output);
811  EXPECT_EQ("\1x\1\123\739\52\334n\3", output);
812  Tokenizer::ParseString("'\\x20\\x4'", &output);
813  EXPECT_EQ("\x20\x4", output);
814
815  // Test invalid strings that may still be tokenized as strings.
816  Tokenizer::ParseString("\"\\a\\l\\v\\t", &output);  // \l is invalid
817  EXPECT_EQ("\a?\v\t", output);
818  Tokenizer::ParseString("'", &output);
819  EXPECT_EQ("", output);
820  Tokenizer::ParseString("'\\", &output);
821  EXPECT_EQ("\\", output);
822
823  // Experiment with Unicode escapes. Here are one-, two- and three-byte Unicode
824  // characters.
825  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\U00024b62XX'", &output);
826  EXPECT_EQ("$¢€��XX", output);
827  // Same thing encoded using UTF16.
828  Tokenizer::ParseString("'\\u0024\\u00a2\\u20ac\\ud852\\udf62XX'", &output);
829  EXPECT_EQ("$¢€��XX", output);
830  // Here's some broken UTF16; there's a head surrogate with no tail surrogate.
831  // We just output this as if it were UTF8; it's not a defined code point, but
832  // it has a defined encoding.
833  Tokenizer::ParseString("'\\ud852XX'", &output);
834  EXPECT_EQ("\xed\xa1\x92XX", output);
835  // Malformed escape: Demons may fly out of the nose.
836  Tokenizer::ParseString("\\u0", &output);
837  EXPECT_EQ("u0", output);
838
839  // Test invalid strings that will never be tokenized as strings.
840#ifdef PROTOBUF_HAS_DEATH_TEST  // death tests do not work on Windows yet
841  EXPECT_DEBUG_DEATH(Tokenizer::ParseString("", &output),
842    "passed text that could not have been tokenized as a string");
843#endif  // PROTOBUF_HAS_DEATH_TEST
844}
845
846TEST_F(TokenizerTest, ParseStringAppend) {
847  // Check that ParseString and ParseStringAppend differ.
848  string output("stuff+");
849  Tokenizer::ParseStringAppend("'hello'", &output);
850  EXPECT_EQ("stuff+hello", output);
851  Tokenizer::ParseString("'hello'", &output);
852  EXPECT_EQ("hello", output);
853}
854
855// -------------------------------------------------------------------
856
857// Each case parses some input text, ignoring the tokens produced, and
858// checks that the error output matches what is expected.
859struct ErrorCase {
860  string input;
861  bool recoverable;  // True if the tokenizer should be able to recover and
862                     // parse more tokens after seeing this error.  Cases
863                     // for which this is true must end with "foo" as
864                     // the last token, which the test will check for.
865  const char* errors;
866};
867
868inline ostream& operator<<(ostream& out,
869                           const ErrorCase& test_case) {
870  return out << CEscape(test_case.input);
871}
872
873ErrorCase kErrorCases[] = {
874  // String errors.
875  { "'\\l' foo", true,
876    "0:2: Invalid escape sequence in string literal.\n" },
877  { "'\\x' foo", true,
878    "0:3: Expected hex digits for escape sequence.\n" },
879  { "'foo", false,
880    "0:4: Unexpected end of string.\n" },
881  { "'bar\nfoo", true,
882    "0:4: String literals cannot cross line boundaries.\n" },
883  { "'\\u01' foo", true,
884    "0:5: Expected four hex digits for \\u escape sequence.\n" },
885  { "'\\u01' foo", true,
886    "0:5: Expected four hex digits for \\u escape sequence.\n" },
887  { "'\\uXYZ' foo", true,
888    "0:3: Expected four hex digits for \\u escape sequence.\n" },
889
890  // Integer errors.
891  { "123foo", true,
892    "0:3: Need space between number and identifier.\n" },
893
894  // Hex/octal errors.
895  { "0x foo", true,
896    "0:2: \"0x\" must be followed by hex digits.\n" },
897  { "0541823 foo", true,
898    "0:4: Numbers starting with leading zero must be in octal.\n" },
899  { "0x123z foo", true,
900    "0:5: Need space between number and identifier.\n" },
901  { "0x123.4 foo", true,
902    "0:5: Hex and octal numbers must be integers.\n" },
903  { "0123.4 foo", true,
904    "0:4: Hex and octal numbers must be integers.\n" },
905
906  // Float errors.
907  { "1e foo", true,
908    "0:2: \"e\" must be followed by exponent.\n" },
909  { "1e- foo", true,
910    "0:3: \"e\" must be followed by exponent.\n" },
911  { "1.2.3 foo", true,
912    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
913  { "1e2.3 foo", true,
914    "0:3: Already saw decimal point or exponent; can't have another one.\n" },
915  { "a.1 foo", true,
916    "0:1: Need space between identifier and decimal point.\n" },
917  // allow_f_after_float not enabled, so this should be an error.
918  { "1.0f foo", true,
919    "0:3: Need space between number and identifier.\n" },
920
921  // Block comment errors.
922  { "/*", false,
923    "0:2: End-of-file inside block comment.\n"
924    "0:0:   Comment started here.\n"},
925  { "/*/*/ foo", true,
926    "0:3: \"/*\" inside block comment.  Block comments cannot be nested.\n"},
927
928  // Control characters.  Multiple consecutive control characters should only
929  // produce one error.
930  { "\b foo", true,
931    "0:0: Invalid control characters encountered in text.\n" },
932  { "\b\b foo", true,
933    "0:0: Invalid control characters encountered in text.\n" },
934
935  // Check that control characters at end of input don't result in an
936  // infinite loop.
937  { "\b", false,
938    "0:0: Invalid control characters encountered in text.\n" },
939
940  // Check recovery from '\0'.  We have to explicitly specify the length of
941  // these strings because otherwise the string constructor will just call
942  // strlen() which will see the first '\0' and think that is the end of the
943  // string.
944  { string("\0foo", 4), true,
945    "0:0: Invalid control characters encountered in text.\n" },
946  { string("\0\0foo", 5), true,
947    "0:0: Invalid control characters encountered in text.\n" },
948
949  // Check error from high order bits set
950  { "\300foo", true,
951    "0:0: Interpreting non ascii codepoint 192.\n" },
952};
953
954TEST_2D(TokenizerTest, Errors, kErrorCases, kBlockSizes) {
955  // Set up the tokenizer.
956  TestInputStream input(kErrorCases_case.input.data(),
957                        kErrorCases_case.input.size(),
958                        kBlockSizes_case);
959  TestErrorCollector error_collector;
960  Tokenizer tokenizer(&input, &error_collector);
961
962  // Ignore all input, except remember if the last token was "foo".
963  bool last_was_foo = false;
964  while (tokenizer.Next()) {
965    last_was_foo = tokenizer.current().text == "foo";
966  }
967
968  // Check that the errors match what was expected.
969  EXPECT_EQ(kErrorCases_case.errors, error_collector.text_);
970
971  // If the error was recoverable, make sure we saw "foo" after it.
972  if (kErrorCases_case.recoverable) {
973    EXPECT_TRUE(last_was_foo);
974  }
975}
976
977// -------------------------------------------------------------------
978
979TEST_1D(TokenizerTest, BackUpOnDestruction, kBlockSizes) {
980  string text = "foo bar";
981  TestInputStream input(text.data(), text.size(), kBlockSizes_case);
982
983  // Create a tokenizer, read one token, then destroy it.
984  {
985    TestErrorCollector error_collector;
986    Tokenizer tokenizer(&input, &error_collector);
987
988    tokenizer.Next();
989  }
990
991  // Only "foo" should have been read.
992  EXPECT_EQ(strlen("foo"), input.ByteCount());
993}
994
995
996}  // namespace
997}  // namespace io
998}  // namespace protobuf
999}  // namespace google
1000