1// Protocol Buffers - Google's data interchange format
2// Copyright 2008 Google Inc.  All rights reserved.
3// http://code.google.com/p/protobuf/
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are
7// met:
8//
9//     * Redistributions of source code must retain the above copyright
10// notice, this list of conditions and the following disclaimer.
11//     * Redistributions in binary form must reproduce the above
12// copyright notice, this list of conditions and the following disclaimer
13// in the documentation and/or other materials provided with the
14// distribution.
15//     * Neither the name of Google Inc. nor the names of its
16// contributors may be used to endorse or promote products derived from
17// this software without specific prior written permission.
18//
19// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31// Author: kenton@google.com (Kenton Varda)
32//  Based on original Protocol Buffers design by
33//  Sanjay Ghemawat, Jeff Dean, and others.
34//
35// Class for parsing tokenized text from a ZeroCopyInputStream.
36
37#ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__
38#define GOOGLE_PROTOBUF_IO_TOKENIZER_H__
39
40#include <string>
41#include <vector>
42#include <google/protobuf/stubs/common.h>
43
44namespace google {
45namespace protobuf {
46namespace io {
47
48class ZeroCopyInputStream;     // zero_copy_stream.h
49
50// Defined in this file.
51class ErrorCollector;
52class Tokenizer;
53
// Abstract interface for an object which collects the errors and warnings
// that occur during parsing.  A typical implementation might simply print
// them to stdout.
57class LIBPROTOBUF_EXPORT ErrorCollector {
58 public:
59  inline ErrorCollector() {}
60  virtual ~ErrorCollector();
61
62  // Indicates that there was an error in the input at the given line and
63  // column numbers.  The numbers are zero-based, so you may want to add
64  // 1 to each before printing them.
65  virtual void AddError(int line, int column, const string& message) = 0;
66
67  // Indicates that there was a warning in the input at the given line and
68  // column numbers.  The numbers are zero-based, so you may want to add
69  // 1 to each before printing them.
70  virtual void AddWarning(int line, int column, const string& message) { }
71
72 private:
73  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector);
74};
75
// This class converts a stream of raw text into a stream of tokens for
// the protocol definition parser to parse.  The tokens recognized are
// similar to those that make up the C language; see the TokenType enum for
// precise descriptions.  Whitespace and comments are skipped by Next(); use
// NextWithComments() to collect the comments instead.  By default, C- and
// C++-style comments are recognized, but other styles can be used by
// calling set_comment_style().
82class LIBPROTOBUF_EXPORT Tokenizer {
83 public:
84  // Construct a Tokenizer that reads and tokenizes text from the given
85  // input stream and writes errors to the given error_collector.
86  // The caller keeps ownership of input and error_collector.
87  Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector);
88  ~Tokenizer();
89
90  enum TokenType {
91    TYPE_START,       // Next() has not yet been called.
92    TYPE_END,         // End of input reached.  "text" is empty.
93
94    TYPE_IDENTIFIER,  // A sequence of letters, digits, and underscores, not
95                      // starting with a digit.  It is an error for a number
96                      // to be followed by an identifier with no space in
97                      // between.
98    TYPE_INTEGER,     // A sequence of digits representing an integer.  Normally
99                      // the digits are decimal, but a prefix of "0x" indicates
100                      // a hex number and a leading zero indicates octal, just
101                      // like with C numeric literals.  A leading negative sign
102                      // is NOT included in the token; it's up to the parser to
103                      // interpret the unary minus operator on its own.
104    TYPE_FLOAT,       // A floating point literal, with a fractional part and/or
105                      // an exponent.  Always in decimal.  Again, never
106                      // negative.
107    TYPE_STRING,      // A quoted sequence of escaped characters.  Either single
108                      // or double quotes can be used, but they must match.
109                      // A string literal cannot cross a line break.
110    TYPE_SYMBOL,      // Any other printable character, like '!' or '+'.
111                      // Symbols are always a single character, so "!+$%" is
112                      // four tokens.
113  };
114
115  // Structure representing a token read from the token stream.
116  struct Token {
117    TokenType type;
118    string text;       // The exact text of the token as it appeared in
119                       // the input.  e.g. tokens of TYPE_STRING will still
120                       // be escaped and in quotes.
121
122    // "line" and "column" specify the position of the first character of
123    // the token within the input stream.  They are zero-based.
124    int line;
125    int column;
126    int end_column;
127  };
128
129  // Get the current token.  This is updated when Next() is called.  Before
130  // the first call to Next(), current() has type TYPE_START and no contents.
131  const Token& current();
132
133  // Return the previous token -- i.e. what current() returned before the
134  // previous call to Next().
135  const Token& previous();
136
137  // Advance to the next token.  Returns false if the end of the input is
138  // reached.
139  bool Next();
140
141  // Like Next(), but also collects comments which appear between the previous
142  // and next tokens.
143  //
144  // Comments which appear to be attached to the previous token are stored
145  // in *prev_tailing_comments.  Comments which appear to be attached to the
146  // next token are stored in *next_leading_comments.  Comments appearing in
147  // between which do not appear to be attached to either will be added to
148  // detached_comments.  Any of these parameters can be NULL to simply discard
149  // the comments.
150  //
151  // A series of line comments appearing on consecutive lines, with no other
152  // tokens appearing on those lines, will be treated as a single comment.
153  //
154  // Only the comment content is returned; comment markers (e.g. //) are
155  // stripped out.  For block comments, leading whitespace and an asterisk will
156  // be stripped from the beginning of each line other than the first.  Newlines
157  // are included in the output.
158  //
159  // Examples:
160  //
161  //   optional int32 foo = 1;  // Comment attached to foo.
162  //   // Comment attached to bar.
163  //   optional int32 bar = 2;
164  //
165  //   optional string baz = 3;
166  //   // Comment attached to baz.
167  //   // Another line attached to baz.
168  //
169  //   // Comment attached to qux.
170  //   //
171  //   // Another line attached to qux.
172  //   optional double qux = 4;
173  //
174  //   // Detached comment.  This is not attached to qux or corge
175  //   // because there are blank lines separating it from both.
176  //
177  //   optional string corge = 5;
178  //   /* Block comment attached
179  //    * to corge.  Leading asterisks
180  //    * will be removed. */
181  //   /* Block comment attached to
182  //    * grault. */
183  //   optional int32 grault = 6;
184  bool NextWithComments(string* prev_trailing_comments,
185                        vector<string>* detached_comments,
186                        string* next_leading_comments);
187
188  // Parse helpers ---------------------------------------------------
189
190  // Parses a TYPE_FLOAT token.  This never fails, so long as the text actually
191  // comes from a TYPE_FLOAT token parsed by Tokenizer.  If it doesn't, the
192  // result is undefined (possibly an assert failure).
193  static double ParseFloat(const string& text);
194
195  // Parses a TYPE_STRING token.  This never fails, so long as the text actually
196  // comes from a TYPE_STRING token parsed by Tokenizer.  If it doesn't, the
197  // result is undefined (possibly an assert failure).
198  static void ParseString(const string& text, string* output);
199
200  // Identical to ParseString, but appends to output.
201  static void ParseStringAppend(const string& text, string* output);
202
203  // Parses a TYPE_INTEGER token.  Returns false if the result would be
204  // greater than max_value.  Otherwise, returns true and sets *output to the
205  // result.  If the text is not from a Token of type TYPE_INTEGER originally
206  // parsed by a Tokenizer, the result is undefined (possibly an assert
207  // failure).
208  static bool ParseInteger(const string& text, uint64 max_value,
209                           uint64* output);
210
211  // Options ---------------------------------------------------------
212
213  // Set true to allow floats to be suffixed with the letter 'f'.  Tokens
214  // which would otherwise be integers but which have the 'f' suffix will be
215  // forced to be interpreted as floats.  For all other purposes, the 'f' is
216  // ignored.
217  void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; }
218
219  // Valid values for set_comment_style().
220  enum CommentStyle {
221    // Line comments begin with "//", block comments are delimited by "/*" and
222    // "*/".
223    CPP_COMMENT_STYLE,
224    // Line comments begin with "#".  No way to write block comments.
225    SH_COMMENT_STYLE
226  };
227
228  // Sets the comment style.
229  void set_comment_style(CommentStyle style) { comment_style_ = style; }
230
231  // -----------------------------------------------------------------
232 private:
233  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer);
234
235  Token current_;           // Returned by current().
236  Token previous_;          // Returned by previous().
237
238  ZeroCopyInputStream* input_;
239  ErrorCollector* error_collector_;
240
241  char current_char_;       // == buffer_[buffer_pos_], updated by NextChar().
242  const char* buffer_;      // Current buffer returned from input_.
243  int buffer_size_;         // Size of buffer_.
244  int buffer_pos_;          // Current position within the buffer.
245  bool read_error_;         // Did we previously encounter a read error?
246
247  // Line and column number of current_char_ within the whole input stream.
248  int line_;
249  int column_;
250
251  // String to which text should be appended as we advance through it.
252  // Call RecordTo(&str) to start recording and StopRecording() to stop.
253  // E.g. StartToken() calls RecordTo(&current_.text).  record_start_ is the
254  // position within the current buffer where recording started.
255  string* record_target_;
256  int record_start_;
257
258  // Options.
259  bool allow_f_after_float_;
260  CommentStyle comment_style_;
261
262  // Since we count columns we need to interpret tabs somehow.  We'll take
263  // the standard 8-character definition for lack of any way to do better.
264  static const int kTabWidth = 8;
265
266  // -----------------------------------------------------------------
267  // Helper methods.
268
269  // Consume this character and advance to the next one.
270  void NextChar();
271
272  // Read a new buffer from the input.
273  void Refresh();
274
275  inline void RecordTo(string* target);
276  inline void StopRecording();
277
278  // Called when the current character is the first character of a new
279  // token (not including whitespace or comments).
280  inline void StartToken();
281  // Called when the current character is the first character after the
282  // end of the last token.  After this returns, current_.text will
283  // contain all text consumed since StartToken() was called.
284  inline void EndToken();
285
286  // Convenience method to add an error at the current line and column.
287  void AddError(const string& message) {
288    error_collector_->AddError(line_, column_, message);
289  }
290
291  // -----------------------------------------------------------------
292  // The following four methods are used to consume tokens of specific
293  // types.  They are actually used to consume all characters *after*
294  // the first, since the calling function consumes the first character
295  // in order to decide what kind of token is being read.
296
297  // Read and consume a string, ending when the given delimiter is
298  // consumed.
299  void ConsumeString(char delimiter);
300
301  // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER
302  // depending on what was read.  This needs to know if the first
303  // character was a zero in order to correctly recognize hex and octal
304  // numbers.
305  // It also needs to know if the first characted was a . to parse floating
306  // point correctly.
307  TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);
308
309  // Consume the rest of a line.
310  void ConsumeLineComment(string* content);
311  // Consume until "*/".
312  void ConsumeBlockComment(string* content);
313
314  enum NextCommentStatus {
315    // Started a line comment.
316    LINE_COMMENT,
317
318    // Started a block comment.
319    BLOCK_COMMENT,
320
321    // Consumed a slash, then realized it wasn't a comment.  current_ has
322    // been filled in with a slash token.  The caller should return it.
323    SLASH_NOT_COMMENT,
324
325    // We do not appear to be starting a comment here.
326    NO_COMMENT
327  };
328
329  // If we're at the start of a new comment, consume it and return what kind
330  // of comment it is.
331  NextCommentStatus TryConsumeCommentStart();
332
333  // -----------------------------------------------------------------
334  // These helper methods make the parsing code more readable.  The
335  // "character classes" refered to are defined at the top of the .cc file.
336  // Basically it is a C++ class with one method:
337  //   static bool InClass(char c);
338  // The method returns true if c is a member of this "class", like "Letter"
339  // or "Digit".
340
341  // Returns true if the current character is of the given character
342  // class, but does not consume anything.
343  template<typename CharacterClass>
344  inline bool LookingAt();
345
346  // If the current character is in the given class, consume it and return
347  // true.  Otherwise return false.
348  // e.g. TryConsumeOne<Letter>()
349  template<typename CharacterClass>
350  inline bool TryConsumeOne();
351
352  // Like above, but try to consume the specific character indicated.
353  inline bool TryConsume(char c);
354
355  // Consume zero or more of the given character class.
356  template<typename CharacterClass>
357  inline void ConsumeZeroOrMore();
358
359  // Consume one or more of the given character class or log the given
360  // error message.
361  // e.g. ConsumeOneOrMore<Digit>("Expected digits.");
362  template<typename CharacterClass>
363  inline void ConsumeOneOrMore(const char* error);
364};
365
366// inline methods ====================================================
367inline const Tokenizer::Token& Tokenizer::current() {
368  return current_;
369}
370
371inline const Tokenizer::Token& Tokenizer::previous() {
372  return previous_;
373}
374
375inline void Tokenizer::ParseString(const string& text, string* output) {
376  output->clear();
377  ParseStringAppend(text, output);
378}
379
380}  // namespace io
381}  // namespace protobuf
382
383}  // namespace google
384#endif  // GOOGLE_PROTOBUF_IO_TOKENIZER_H__
385