1fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Protocol Buffers - Google's data interchange format
2fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Copyright 2008 Google Inc.  All rights reserved.
3fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// http://code.google.com/p/protobuf/
4fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville//
5fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Redistribution and use in source and binary forms, with or without
6fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// modification, are permitted provided that the following conditions are
7fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// met:
8fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville//
9fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville//     * Redistributions of source code must retain the above copyright
10fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// notice, this list of conditions and the following disclaimer.
11fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville//     * Redistributions in binary form must reproduce the above
12fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// copyright notice, this list of conditions and the following disclaimer
13fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// in the documentation and/or other materials provided with the
14fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// distribution.
15fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville//     * Neither the name of Google Inc. nor the names of its
16fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// contributors may be used to endorse or promote products derived from
17fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// this software without specific prior written permission.
18fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville//
19fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
31fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Author: kenton@google.com (Kenton Varda)
32fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville//  Based on original Protocol Buffers design by
33fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville//  Sanjay Ghemawat, Jeff Dean, and others.
34fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville//
35fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Class for parsing tokenized text from a ZeroCopyInputStream.
36fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
37fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__
38fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#define GOOGLE_PROTOBUF_IO_TOKENIZER_H__
39fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
40fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#include <string>
41fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#include <google/protobuf/stubs/common.h>
42fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
43fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillenamespace google {
44fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillenamespace protobuf {
45fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savillenamespace io {
46fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
47fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savilleclass ZeroCopyInputStream;     // zero_copy_stream.h
48fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
49fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Defined in this file.
50fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savilleclass ErrorCollector;
51fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savilleclass Tokenizer;
52fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
53fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// Abstract interface for an object which collects the errors that occur
54fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// during parsing.  A typical implementation might simply print the errors
55fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// to stdout.
56fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savilleclass LIBPROTOBUF_EXPORT ErrorCollector {
57fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville public:
58fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  inline ErrorCollector() {}
59fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  virtual ~ErrorCollector();
60fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
61fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Indicates that there was an error in the input at the given line and
62fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // column numbers.  The numbers are zero-based, so you may want to add
63fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // 1 to each before printing them.
64fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  virtual void AddError(int line, int column, const string& message) = 0;
65fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
66d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville  // Indicates that there was a warning in the input at the given line and
67d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville  // column numbers.  The numbers are zero-based, so you may want to add
68d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville  // 1 to each before printing them.
69d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville  virtual void AddWarning(int line, int column, const string& message) { }
70d0332953cda33fb4f8e24ebff9c49159b69c43d6Wink Saville
71fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville private:
72fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector);
73fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville};
74fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
75fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// This class converts a stream of raw text into a stream of tokens for
76fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// the protocol definition parser to parse.  The tokens recognized are
77fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// similar to those that make up the C language; see the TokenType enum for
78fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// precise descriptions.  Whitespace and comments are skipped.  By default,
79fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// C- and C++-style comments are recognized, but other styles can be used by
80fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// calling set_comment_style().
81fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savilleclass LIBPROTOBUF_EXPORT Tokenizer {
82fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville public:
83fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Construct a Tokenizer that reads and tokenizes text from the given
84fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // input stream and writes errors to the given error_collector.
85fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // The caller keeps ownership of input and error_collector.
86fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector);
87fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  ~Tokenizer();
88fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
89fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  enum TokenType {
90fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    TYPE_START,       // Next() has not yet been called.
91fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    TYPE_END,         // End of input reached.  "text" is empty.
92fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
93fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    TYPE_IDENTIFIER,  // A sequence of letters, digits, and underscores, not
94fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                      // starting with a digit.  It is an error for a number
95fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                      // to be followed by an identifier with no space in
96fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                      // between.
97fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    TYPE_INTEGER,     // A sequence of digits representing an integer.  Normally
98fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                      // the digits are decimal, but a prefix of "0x" indicates
99fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                      // a hex number and a leading zero indicates octal, just
100fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                      // like with C numeric literals.  A leading negative sign
101fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                      // is NOT included in the token; it's up to the parser to
102fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                      // interpret the unary minus operator on its own.
103fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    TYPE_FLOAT,       // A floating point literal, with a fractional part and/or
104fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                      // an exponent.  Always in decimal.  Again, never
105fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                      // negative.
106fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    TYPE_STRING,      // A quoted sequence of escaped characters.  Either single
107fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                      // or double quotes can be used, but they must match.
108fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                      // A string literal cannot cross a line break.
109fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    TYPE_SYMBOL,      // Any other printable character, like '!' or '+'.
110fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                      // Symbols are always a single character, so "!+$%" is
111fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                      // four tokens.
112fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  };
113fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
114fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Structure representing a token read from the token stream.
115fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  struct Token {
116fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    TokenType type;
117fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    string text;       // The exact text of the token as it appeared in
118fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                       // the input.  e.g. tokens of TYPE_STRING will still
119fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                       // be escaped and in quotes.
120fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
121fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    // "line" and "column" specify the position of the first character of
122fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    // the token within the input stream.  They are zero-based.
123fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    int line;
124fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    int column;
125fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  };
126fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
127fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Get the current token.  This is updated when Next() is called.  Before
128fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // the first call to Next(), current() has type TYPE_START and no contents.
129fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const Token& current();
130fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
131fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Advance to the next token.  Returns false if the end of the input is
132fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // reached.
133fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  bool Next();
134fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
135fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Parse helpers ---------------------------------------------------
136fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
137fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Parses a TYPE_FLOAT token.  This never fails, so long as the text actually
138fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // comes from a TYPE_FLOAT token parsed by Tokenizer.  If it doesn't, the
139fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // result is undefined (possibly an assert failure).
140fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  static double ParseFloat(const string& text);
141fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
142fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Parses a TYPE_STRING token.  This never fails, so long as the text actually
143fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // comes from a TYPE_STRING token parsed by Tokenizer.  If it doesn't, the
144fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // result is undefined (possibly an assert failure).
145fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  static void ParseString(const string& text, string* output);
146fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
147fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Identical to ParseString, but appends to output.
148fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  static void ParseStringAppend(const string& text, string* output);
149fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
150fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Parses a TYPE_INTEGER token.  Returns false if the result would be
151fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // greater than max_value.  Otherwise, returns true and sets *output to the
152fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // result.  If the text is not from a Token of type TYPE_INTEGER originally
153fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // parsed by a Tokenizer, the result is undefined (possibly an assert
154fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // failure).
155fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  static bool ParseInteger(const string& text, uint64 max_value,
156fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville                           uint64* output);
157fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
158fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Options ---------------------------------------------------------
159fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
160fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Set true to allow floats to be suffixed with the letter 'f'.  Tokens
161fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // which would otherwise be integers but which have the 'f' suffix will be
162fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // forced to be interpreted as floats.  For all other purposes, the 'f' is
163fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // ignored.
164fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; }
165fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
166fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Valid values for set_comment_style().
167fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  enum CommentStyle {
168fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    // Line comments begin with "//", block comments are delimited by "/*" and
169fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    // "*/".
170fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    CPP_COMMENT_STYLE,
171fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    // Line comments begin with "#".  No way to write block comments.
172fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    SH_COMMENT_STYLE
173fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  };
174fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
175fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Sets the comment style.
176fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  void set_comment_style(CommentStyle style) { comment_style_ = style; }
177fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
178fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // -----------------------------------------------------------------
179fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville private:
180fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer);
181fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
182fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  Token current_;           // Returned by current().
183fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
184fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  ZeroCopyInputStream* input_;
185fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  ErrorCollector* error_collector_;
186fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
187fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  char current_char_;       // == buffer_[buffer_pos_], updated by NextChar().
188fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  const char* buffer_;      // Current buffer returned from input_.
189fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  int buffer_size_;         // Size of buffer_.
190fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  int buffer_pos_;          // Current position within the buffer.
191fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  bool read_error_;         // Did we previously encounter a read error?
192fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
193fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Line and column number of current_char_ within the whole input stream.
194fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  int line_;
195fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  int column_;
196fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
197fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Position in buffer_ where StartToken() was called.  If the token
198fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // started in the previous buffer, this is zero, and current_.text already
199fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // contains the part of the token from the previous buffer.  If not
200fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // currently parsing a token, this is -1.
201fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  int token_start_;
202fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
203fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Options.
204fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  bool allow_f_after_float_;
205fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  CommentStyle comment_style_;
206fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
207fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Since we count columns we need to interpret tabs somehow.  We'll take
208fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // the standard 8-character definition for lack of any way to do better.
209fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  static const int kTabWidth = 8;
210fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
211fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // -----------------------------------------------------------------
212fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Helper methods.
213fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
214fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Consume this character and advance to the next one.
215fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  void NextChar();
216fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
217fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Read a new buffer from the input.
218fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  void Refresh();
219fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
220fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Called when the current character is the first character of a new
221fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // token (not including whitespace or comments).
222fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  inline void StartToken();
223fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Called when the current character is the first character after the
224fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // end of the last token.  After this returns, current_.text will
225fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // contain all text consumed since StartToken() was called.
226fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  inline void EndToken();
227fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
228fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Convenience method to add an error at the current line and column.
229fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  void AddError(const string& message) {
230fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville    error_collector_->AddError(line_, column_, message);
231fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  }
232fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
233fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // -----------------------------------------------------------------
234fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // The following four methods are used to consume tokens of specific
235fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // types.  They are actually used to consume all characters *after*
236fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // the first, since the calling function consumes the first character
237fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // in order to decide what kind of token is being read.
238fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
239fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Read and consume a string, ending when the given delimiter is
240fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // consumed.
241fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  void ConsumeString(char delimiter);
242fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
243fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER
244fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // depending on what was read.  This needs to know if the first
245fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // character was a zero in order to correctly recognize hex and octal
246fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // numbers.
247fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // It also needs to know if the first characted was a . to parse floating
248fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // point correctly.
249fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);
250fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
251fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Consume the rest of a line.
252fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  void ConsumeLineComment();
253fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Consume until "*/".
254fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  void ConsumeBlockComment();
255fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
256fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // -----------------------------------------------------------------
257fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // These helper methods make the parsing code more readable.  The
258fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // "character classes" refered to are defined at the top of the .cc file.
259fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Basically it is a C++ class with one method:
260fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  //   static bool InClass(char c);
261fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // The method returns true if c is a member of this "class", like "Letter"
262fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // or "Digit".
263fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
264fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Returns true if the current character is of the given character
265fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // class, but does not consume anything.
266fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  template<typename CharacterClass>
267fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  inline bool LookingAt();
268fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
269fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // If the current character is in the given class, consume it and return
270fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // true.  Otherwise return false.
271fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // e.g. TryConsumeOne<Letter>()
272fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  template<typename CharacterClass>
273fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  inline bool TryConsumeOne();
274fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
275fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Like above, but try to consume the specific character indicated.
276fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  inline bool TryConsume(char c);
277fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
278fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Consume zero or more of the given character class.
279fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  template<typename CharacterClass>
280fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  inline void ConsumeZeroOrMore();
281fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
282fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // Consume one or more of the given character class or log the given
283fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // error message.
284fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  // e.g. ConsumeOneOrMore<Digit>("Expected digits.");
285fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  template<typename CharacterClass>
286fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  inline void ConsumeOneOrMore(const char* error);
287fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville};
288fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
289fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville// inline methods ====================================================
290fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savilleinline const Tokenizer::Token& Tokenizer::current() {
291fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  return current_;
292fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville}
293fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
294fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Savilleinline void Tokenizer::ParseString(const string& text, string* output) {
295fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  output->clear();
296fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville  ParseStringAppend(text, output);
297fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville}
298fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
299fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville}  // namespace io
300fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville}  // namespace protobuf
301fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville
302fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville}  // namespace google
303fbaaef999ba563838ebd00874ed8a1c01fbf286dWink Saville#endif  // GOOGLE_PROTOBUF_IO_TOKENIZER_H__
304