1// Protocol Buffers - Google's data interchange format
2// Copyright 2008 Google Inc.  All rights reserved.
3// http://code.google.com/p/protobuf/
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are
7// met:
8//
9//     * Redistributions of source code must retain the above copyright
10// notice, this list of conditions and the following disclaimer.
11//     * Redistributions in binary form must reproduce the above
12// copyright notice, this list of conditions and the following disclaimer
13// in the documentation and/or other materials provided with the
14// distribution.
15//     * Neither the name of Google Inc. nor the names of its
16// contributors may be used to endorse or promote products derived from
17// this software without specific prior written permission.
18//
19// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31// Author: kenton@google.com (Kenton Varda)
32//  Based on original Protocol Buffers design by
33//  Sanjay Ghemawat, Jeff Dean, and others.
34//
35// Class for parsing tokenized text from a ZeroCopyInputStream.
36
37#ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__
38#define GOOGLE_PROTOBUF_IO_TOKENIZER_H__
39
40#include <string>
41#include <google/protobuf/stubs/common.h>
42
43namespace google {
44namespace protobuf {
45namespace io {
46
47class ZeroCopyInputStream;     // zero_copy_stream.h
48
49// Defined in this file.
50class ErrorCollector;
51class Tokenizer;
52
53// Abstract interface for an object which collects the errors that occur
54// during parsing.  A typical implementation might simply print the errors
55// to stdout.
56class LIBPROTOBUF_EXPORT ErrorCollector {
57 public:
58  inline ErrorCollector() {}
59  virtual ~ErrorCollector();
60
61  // Indicates that there was an error in the input at the given line and
62  // column numbers.  The numbers are zero-based, so you may want to add
63  // 1 to each before printing them.
64  virtual void AddError(int line, int column, const string& message) = 0;
65
66  // Indicates that there was a warning in the input at the given line and
67  // column numbers.  The numbers are zero-based, so you may want to add
68  // 1 to each before printing them.
69  virtual void AddWarning(int line, int column, const string& message) { }
70
71 private:
72  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector);
73};
74
75// This class converts a stream of raw text into a stream of tokens for
76// the protocol definition parser to parse.  The tokens recognized are
77// similar to those that make up the C language; see the TokenType enum for
78// precise descriptions.  Whitespace and comments are skipped.  By default,
79// C- and C++-style comments are recognized, but other styles can be used by
80// calling set_comment_style().
class LIBPROTOBUF_EXPORT Tokenizer {
 public:
  // Construct a Tokenizer that reads and tokenizes text from the given
  // input stream and writes errors to the given error_collector.
  // The caller keeps ownership of input and error_collector.
  Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector);
  ~Tokenizer();

  // The kinds of token the Tokenizer can report in Token::type.
  enum TokenType {
    TYPE_START,       // Next() has not yet been called.
    TYPE_END,         // End of input reached.  "text" is empty.

    TYPE_IDENTIFIER,  // A sequence of letters, digits, and underscores, not
                      // starting with a digit.  It is an error for a number
                      // to be followed by an identifier with no space in
                      // between.
    TYPE_INTEGER,     // A sequence of digits representing an integer.  Normally
                      // the digits are decimal, but a prefix of "0x" indicates
                      // a hex number and a leading zero indicates octal, just
                      // like with C numeric literals.  A leading negative sign
                      // is NOT included in the token; it's up to the parser to
                      // interpret the unary minus operator on its own.
    TYPE_FLOAT,       // A floating point literal, with a fractional part and/or
                      // an exponent.  Always in decimal.  Again, never
                      // negative.
    TYPE_STRING,      // A quoted sequence of escaped characters.  Either single
                      // or double quotes can be used, but they must match.
                      // A string literal cannot cross a line break.
    TYPE_SYMBOL,      // Any other printable character, like '!' or '+'.
                      // Symbols are always a single character, so "!+$%" is
                      // four tokens.
  };

  // Structure representing a token read from the token stream.
  struct Token {
    TokenType type;
    string text;       // The exact text of the token as it appeared in
                       // the input.  e.g. tokens of TYPE_STRING will still
                       // be escaped and in quotes.

    // "line" and "column" specify the position of the first character of
    // the token within the input stream.  They are zero-based.
    int line;
    int column;
  };

  // Get the current token.  This is updated when Next() is called.  Before
  // the first call to Next(), current() has type TYPE_START and no contents.
  // The returned reference refers to the tokenizer's own current_ member.
  const Token& current();

  // Advance to the next token.  Returns false if the end of the input is
  // reached.
  bool Next();

  // Parse helpers ---------------------------------------------------
  // These are static: they operate only on the token text passed in and do
  // not need a Tokenizer instance.

  // Parses a TYPE_FLOAT token.  This never fails, so long as the text actually
  // comes from a TYPE_FLOAT token parsed by Tokenizer.  If it doesn't, the
  // result is undefined (possibly an assert failure).
  static double ParseFloat(const string& text);

  // Parses a TYPE_STRING token.  This never fails, so long as the text actually
  // comes from a TYPE_STRING token parsed by Tokenizer.  If it doesn't, the
  // result is undefined (possibly an assert failure).  Any previous contents
  // of *output are replaced.
  static void ParseString(const string& text, string* output);

  // Identical to ParseString, but appends to output.
  static void ParseStringAppend(const string& text, string* output);

  // Parses a TYPE_INTEGER token.  Returns false if the result would be
  // greater than max_value.  Otherwise, returns true and sets *output to the
  // result.  If the text is not from a Token of type TYPE_INTEGER originally
  // parsed by a Tokenizer, the result is undefined (possibly an assert
  // failure).
  static bool ParseInteger(const string& text, uint64 max_value,
                           uint64* output);

  // Options ---------------------------------------------------------

  // Set true to allow floats to be suffixed with the letter 'f'.  Tokens
  // which would otherwise be integers but which have the 'f' suffix will be
  // forced to be interpreted as floats.  For all other purposes, the 'f' is
  // ignored.
  void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; }

  // Valid values for set_comment_style().
  enum CommentStyle {
    // Line comments begin with "//", block comments are delimited by "/*" and
    // "*/".
    CPP_COMMENT_STYLE,
    // Line comments begin with "#".  No way to write block comments.
    SH_COMMENT_STYLE
  };

  // Sets the comment style.
  void set_comment_style(CommentStyle style) { comment_style_ = style; }

  // -----------------------------------------------------------------
 private:
  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer);

  Token current_;           // Returned by current().

  // Both pointers are supplied to the constructor; per the constructor's
  // contract the caller keeps ownership of them.
  ZeroCopyInputStream* input_;
  ErrorCollector* error_collector_;

  char current_char_;       // == buffer_[buffer_pos_], updated by NextChar().
  const char* buffer_;      // Current buffer returned from input_.
  int buffer_size_;         // Size of buffer_.
  int buffer_pos_;          // Current position within the buffer.
  bool read_error_;         // Did we previously encounter a read error?

  // Line and column number of current_char_ within the whole input stream.
  int line_;
  int column_;

  // Position in buffer_ where StartToken() was called.  If the token
  // started in the previous buffer, this is zero, and current_.text already
  // contains the part of the token from the previous buffer.  If not
  // currently parsing a token, this is -1.
  int token_start_;

  // Options.
  bool allow_f_after_float_;
  CommentStyle comment_style_;

  // Since we count columns we need to interpret tabs somehow.  We'll take
  // the standard 8-character definition for lack of any way to do better.
  static const int kTabWidth = 8;

  // -----------------------------------------------------------------
  // Helper methods.

  // Consume this character and advance to the next one.
  void NextChar();

  // Read a new buffer from the input.
  void Refresh();

  // Called when the current character is the first character of a new
  // token (not including whitespace or comments).
  inline void StartToken();
  // Called when the current character is the first character after the
  // end of the last token.  After this returns, current_.text will
  // contain all text consumed since StartToken() was called.
  inline void EndToken();

  // Convenience method to add an error at the current line and column.
  // Forwards to error_collector_ with line_ and column_; error_collector_ is
  // dereferenced unconditionally here, so it must not be null.
  void AddError(const string& message) {
    error_collector_->AddError(line_, column_, message);
  }

  // -----------------------------------------------------------------
  // The following four methods are used to consume tokens of specific
  // types.  They are actually used to consume all characters *after*
  // the first, since the calling function consumes the first character
  // in order to decide what kind of token is being read.

  // Read and consume a string, ending when the given delimiter is
  // consumed.
  void ConsumeString(char delimiter);

  // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER
  // depending on what was read.  This needs to know if the first
  // character was a zero in order to correctly recognize hex and octal
  // numbers.
  // It also needs to know if the first character was a '.' in order to
  // parse floating point numbers correctly.
  TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);

  // Consume the rest of a line.
  void ConsumeLineComment();
  // Consume until "*/".
  void ConsumeBlockComment();

  // -----------------------------------------------------------------
  // These helper methods make the parsing code more readable.  The
  // "character classes" referred to are defined at the top of the .cc file.
  // Basically it is a C++ class with one method:
  //   static bool InClass(char c);
  // The method returns true if c is a member of this "class", like "Letter"
  // or "Digit".

  // Returns true if the current character is of the given character
  // class, but does not consume anything.
  template<typename CharacterClass>
  inline bool LookingAt();

  // If the current character is in the given class, consume it and return
  // true.  Otherwise return false.
  // e.g. TryConsumeOne<Letter>()
  template<typename CharacterClass>
  inline bool TryConsumeOne();

  // Like above, but try to consume the specific character indicated.
  inline bool TryConsume(char c);

  // Consume zero or more of the given character class.
  template<typename CharacterClass>
  inline void ConsumeZeroOrMore();

  // Consume one or more of the given character class or log the given
  // error message.
  // e.g. ConsumeOneOrMore<Digit>("Expected digits.");
  template<typename CharacterClass>
  inline void ConsumeOneOrMore(const char* error);
};
288
289// inline methods ====================================================
290inline const Tokenizer::Token& Tokenizer::current() {
291  return current_;
292}
293
294inline void Tokenizer::ParseString(const string& text, string* output) {
295  output->clear();
296  ParseStringAppend(text, output);
297}
298
299}  // namespace io
300}  // namespace protobuf
301
302}  // namespace google
303#endif  // GOOGLE_PROTOBUF_IO_TOKENIZER_H__
304