15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Protocol Buffers - Google's data interchange format
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright 2008 Google Inc.  All rights reserved.
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// http://code.google.com/p/protobuf/
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Redistribution and use in source and binary forms, with or without
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// modification, are permitted provided that the following conditions are
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// met:
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//     * Redistributions of source code must retain the above copyright
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// notice, this list of conditions and the following disclaimer.
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//     * Redistributions in binary form must reproduce the above
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// copyright notice, this list of conditions and the following disclaimer
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// in the documentation and/or other materials provided with the
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// distribution.
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//     * Neither the name of Google Inc. nor the names of its
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// contributors may be used to endorse or promote products derived from
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// this software without specific prior written permission.
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Author: kenton@google.com (Kenton Varda)
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//  Based on original Protocol Buffers design by
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//  Sanjay Ghemawat, Jeff Dean, and others.
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Class for parsing tokenized text from a ZeroCopyInputStream.
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define GOOGLE_PROTOBUF_IO_TOKENIZER_H__
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string>
413d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch#include <vector>
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <google/protobuf/stubs/common.h>
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace google {
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace protobuf {
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace io {
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class ZeroCopyInputStream;     // zero_copy_stream.h
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Defined in this file.
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class ErrorCollector;
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class Tokenizer;
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Abstract interface for an object which collects the errors that occur
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// during parsing.  A typical implementation might simply print the errors
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// to stdout.
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class LIBPROTOBUF_EXPORT ErrorCollector {
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inline ErrorCollector() {}
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual ~ErrorCollector();
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Indicates that there was an error in the input at the given line and
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // column numbers.  The numbers are zero-based, so you may want to add
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // 1 to each before printing them.
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual void AddError(int line, int column, const string& message) = 0;
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Indicates that there was a warning in the input at the given line and
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // column numbers.  The numbers are zero-based, so you may want to add
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // 1 to each before printing them.
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual void AddWarning(int line, int column, const string& message) { }
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector);
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This class converts a stream of raw text into a stream of tokens for
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the protocol definition parser to parse.  The tokens recognized are
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// similar to those that make up the C language; see the TokenType enum for
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// precise descriptions.  Whitespace and comments are skipped.  By default,
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// C- and C++-style comments are recognized, but other styles can be used by
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// calling set_comment_style().
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class LIBPROTOBUF_EXPORT Tokenizer {
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Construct a Tokenizer that reads and tokenizes text from the given
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // input stream and writes errors to the given error_collector.
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The caller keeps ownership of input and error_collector.
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector);
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ~Tokenizer();
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
enum TokenType {
  // Next() has not been called yet.
  TYPE_START,
  // The end of the input was reached.  The token's "text" is empty.
  TYPE_END,

  // A run of letters, digits, and underscores that does not begin with a
  // digit.  A number immediately followed by an identifier, with no space
  // in between, is an error.
  TYPE_IDENTIFIER,
  // A sequence of digits forming an integer.  The digits are normally
  // decimal; a "0x" prefix marks a hex number and a leading zero marks
  // octal, just as in C numeric literals.  A leading minus sign is NOT part
  // of the token: the parser must interpret the unary minus operator itself.
  TYPE_INTEGER,
  // A floating point literal with a fractional part and/or an exponent.
  // Always written in decimal and, like integers, never negative.
  TYPE_FLOAT,
  // A quoted sequence of escaped characters.  Either single or double
  // quotes may be used, but the opening and closing quotes must match.  A
  // string literal cannot cross a line break.
  TYPE_STRING,
  // Any other printable character, such as '!' or '+'.  A symbol is always
  // exactly one character, so "!+$%" is four tokens.
  TYPE_SYMBOL
};
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Structure representing a token read from the token stream.
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  struct Token {
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    TokenType type;
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    string text;       // The exact text of the token as it appeared in
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                       // the input.  e.g. tokens of TYPE_STRING will still
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                       // be escaped and in quotes.
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // "line" and "column" specify the position of the first character of
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // the token within the input stream.  They are zero-based.
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int line;
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int column;
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int end_column;
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  };
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Get the current token.  This is updated when Next() is called.  Before
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the first call to Next(), current() has type TYPE_START and no contents.
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const Token& current();
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Return the previous token -- i.e. what current() returned before the
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // previous call to Next().
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const Token& previous();
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Advance to the next token.  Returns false if the end of the input is
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // reached.
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool Next();
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1413d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  // Like Next(), but also collects comments which appear between the previous
1423d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  // and next tokens.
1433d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //
1443d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  // Comments which appear to be attached to the previous token are stored
1453d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  // in *prev_trailing_comments.  Comments which appear to be attached to the
1463d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  // next token are stored in *next_leading_comments.  Comments appearing in
1473d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  // between which do not appear to be attached to either will be added to
1483d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  // *detached_comments.  Any of these parameters can be NULL to simply discard
1493d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  // the comments.
1503d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //
1513d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  // A series of line comments appearing on consecutive lines, with no other
1523d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  // tokens appearing on those lines, will be treated as a single comment.
1533d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //
1543d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  // Only the comment content is returned; comment markers (e.g. //) are
1553d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  // stripped out.  For block comments, leading whitespace and an asterisk will
1563d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  // be stripped from the beginning of each line other than the first.  Newlines
1573d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  // are included in the output.
1583d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //
1593d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  // Examples:
1603d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //
1613d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //   optional int32 foo = 1;  // Comment attached to foo.
1623d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //   // Comment attached to bar.
1633d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //   optional int32 bar = 2;
1643d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //
1653d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //   optional string baz = 3;
1663d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //   // Comment attached to baz.
1673d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //   // Another line attached to baz.
1683d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //
1693d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //   // Comment attached to qux.
1703d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //   //
1713d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //   // Another line attached to qux.
1723d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //   optional double qux = 4;
1733d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //
1743d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //   // Detached comment.  This is not attached to qux or corge
1753d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //   // because there are blank lines separating it from both.
1763d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //
1773d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //   optional string corge = 5;
1783d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //   /* Block comment attached
1793d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //    * to corge.  Leading asterisks
1803d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //    * will be removed. */
1813d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //   /* Block comment attached to
1823d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //    * grault. */
1833d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  //   optional int32 grault = 6;
1843d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  bool NextWithComments(string* prev_trailing_comments,
1853d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch                        vector<string>* detached_comments,
1863d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch                        string* next_leading_comments);
1873d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch
1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Parse helpers ---------------------------------------------------
1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Parses a TYPE_FLOAT token.  This never fails, so long as the text actually
1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // comes from a TYPE_FLOAT token parsed by Tokenizer.  If it doesn't, the
1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // result is undefined (possibly an assert failure).
1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static double ParseFloat(const string& text);
1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Parses a TYPE_STRING token.  This never fails, so long as the text actually
1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // comes from a TYPE_STRING token parsed by Tokenizer.  If it doesn't, the
1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // result is undefined (possibly an assert failure).
1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static void ParseString(const string& text, string* output);
1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Identical to ParseString, but appends to output.
2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static void ParseStringAppend(const string& text, string* output);
2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Parses a TYPE_INTEGER token.  Returns false if the result would be
2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // greater than max_value.  Otherwise, returns true and sets *output to the
2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // result.  If the text is not from a Token of type TYPE_INTEGER originally
2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // parsed by a Tokenizer, the result is undefined (possibly an assert
2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // failure).
2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static bool ParseInteger(const string& text, uint64 max_value,
2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                           uint64* output);
2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Options ---------------------------------------------------------
2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Set true to allow floats to be suffixed with the letter 'f'.  Tokens
2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // which would otherwise be integers but which have the 'f' suffix will be
2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // forced to be interpreted as floats.  For all other purposes, the 'f' is
2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // ignored.
2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; }
2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Valid values for set_comment_style().
enum CommentStyle {
  // Accepted arguments for set_comment_style().

  // Line comments start with "//"; block comments are enclosed in "/*" and
  // "*/".
  CPP_COMMENT_STYLE,
  // Line comments start with "#".  Block comments cannot be written in this
  // style.
  SH_COMMENT_STYLE
};
2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Sets the comment style.
2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void set_comment_style(CommentStyle style) { comment_style_ = style; }
2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // -----------------------------------------------------------------
2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer);
2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Token current_;           // Returned by current().
2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Token previous_;          // Returned by previous().
2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ZeroCopyInputStream* input_;
2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ErrorCollector* error_collector_;
2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  char current_char_;       // == buffer_[buffer_pos_], updated by NextChar().
2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* buffer_;      // Current buffer returned from input_.
2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int buffer_size_;         // Size of buffer_.
2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int buffer_pos_;          // Current position within the buffer.
2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool read_error_;         // Did we previously encounter a read error?
2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Line and column number of current_char_ within the whole input stream.
2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int line_;
2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int column_;
2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2513d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  // String to which text should be appended as we advance through it.
2523d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  // Call RecordTo(&str) to start recording and StopRecording() to stop.
2533d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  // E.g. StartToken() calls RecordTo(&current_.text).  record_start_ is the
2543d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  // position within the current buffer where recording started.
2553d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  string* record_target_;
2563d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  int record_start_;
2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Options.
2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool allow_f_after_float_;
2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  CommentStyle comment_style_;
2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Since we count columns we need to interpret tabs somehow.  We'll take
2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the standard 8-character definition for lack of any way to do better.
2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static const int kTabWidth = 8;
2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // -----------------------------------------------------------------
2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Helper methods.
2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Consume this character and advance to the next one.
2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void NextChar();
2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Read a new buffer from the input.
2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void Refresh();
2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2753d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  inline void RecordTo(string* target);
2763d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  inline void StopRecording();
2773d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch
2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Called when the current character is the first character of a new
2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // token (not including whitespace or comments).
2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inline void StartToken();
2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Called when the current character is the first character after the
2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // end of the last token.  After this returns, current_.text will
2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // contain all text consumed since StartToken() was called.
2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inline void EndToken();
2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Convenience method to add an error at the current line and column.
2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void AddError(const string& message) {
2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    error_collector_->AddError(line_, column_, message);
2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // -----------------------------------------------------------------
2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The following four methods are used to consume tokens of specific
2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // types.  They are actually used to consume all characters *after*
2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the first, since the calling function consumes the first character
2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // in order to decide what kind of token is being read.
2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Read and consume a string, ending when the given delimiter is
2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // consumed.
2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void ConsumeString(char delimiter);
3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER
3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // depending on what was read.  This needs to know if the first
3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // character was a zero in order to correctly recognize hex and octal
3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // numbers.
3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // It also needs to know if the first character was a '.' to parse floating
3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // point correctly.
3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);
3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Consumes the rest of a line (the body of a line comment).
  // NOTE(review): `content` presumably receives the consumed comment text --
  // confirm against the definition in the .cc file.
3103d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  void ConsumeLineComment(string* content);
3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Consumes input until "*/" (the end of a block comment).
  // NOTE(review): `content` presumably receives the consumed comment text --
  // confirm against the definition in the .cc file.
3123d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  void ConsumeBlockComment(string* content);
3133d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch
3143d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  enum NextCommentStatus {  // Outcome reported by TryConsumeCommentStart().
3153d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch    // Started a line comment.
3163d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch    LINE_COMMENT,
3173d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch
3183d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch    // Started a block comment.
3193d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch    BLOCK_COMMENT,
3203d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch
3213d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch    // Consumed a slash, then realized it wasn't a comment.  current_ has
3223d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch    // been filled in with a slash token.  The caller should return it.
3233d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch    SLASH_NOT_COMMENT,
3243d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch
3253d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch    // We do not appear to be starting a comment here.
3263d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch    NO_COMMENT
3273d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  };
3283d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch
3293d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  // If we're at the start of a new comment, consume it and return what kind
3303d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  // of comment it is; otherwise return SLASH_NOT_COMMENT or NO_COMMENT.
3313d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch  NextCommentStatus TryConsumeCommentStart();
3325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // -----------------------------------------------------------------
3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // These helper methods make the parsing code more readable.  The
3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // "character classes" referred to are defined at the top of the .cc file.
3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Each character class is a C++ class with one method:
3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   static bool InClass(char c);
3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The method returns true if c is a member of that "class", like "Letter"
3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // or "Digit".
3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns true if the current character belongs to CharacterClass, but
3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // does not consume anything.  e.g. LookingAt<Digit>()
3435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  template<typename CharacterClass>
3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inline bool LookingAt();
3455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // If the current character is in CharacterClass, consumes it and returns
3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // true.  Otherwise returns false.
3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // e.g. TryConsumeOne<Letter>()
3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  template<typename CharacterClass>
3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inline bool TryConsumeOne();
3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Like TryConsumeOne(), but tries to consume the specific character c.
3535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inline bool TryConsume(char c);
3545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Consumes zero or more characters of CharacterClass.
3565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  template<typename CharacterClass>
3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inline void ConsumeZeroOrMore();
3585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Consumes one or more characters of CharacterClass, or logs the given
3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // error message.
3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // e.g. ConsumeOneOrMore<Digit>("Expected digits.");
3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  template<typename CharacterClass>
3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  inline void ConsumeOneOrMore(const char* error);
3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// inline methods ====================================================
// Accessor: returns a const reference to the tokenizer's current_ member, so
// no Token is copied.
3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)inline const Tokenizer::Token& Tokenizer::current() {
3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return current_;
3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Accessor: returns a const reference to the tokenizer's previous_ member, so
// no Token is copied.
3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)inline const Tokenizer::Token& Tokenizer::previous() {
3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return previous_;
3735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// Convenience wrapper around ParseStringAppend(): empties *output first and
// then forwards `text` and `output` to ParseStringAppend(), so any previous
// contents of *output are discarded.  `output` is dereferenced unconditionally
// and therefore must not be null.
3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)inline void Tokenizer::ParseString(const string& text, string* output) {
3765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  output->clear();
3775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ParseStringAppend(text, output);
3785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
3795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace io
3815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace protobuf
3825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace google
3845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif  // GOOGLE_PROTOBUF_IO_TOKENIZER_H__
385