protobuf/io/tokenizer.h

// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.
//
// Class for parsing tokenized text from a ZeroCopyInputStream.

#ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__
#define GOOGLE_PROTOBUF_IO_TOKENIZER_H__

#include <string>
#include <vector>
#include <google/protobuf/stubs/common.h>

namespace google {
namespace protobuf {
namespace io {

class ZeroCopyInputStream;     // zero_copy_stream.h

// Defined in this file.
class ErrorCollector;
class Tokenizer;

// Abstract interface for an object which collects the errors that occur
// during parsing.  A typical implementation might simply print the errors
// to stdout.
class LIBPROTOBUF_EXPORT ErrorCollector {
 public:
  inline ErrorCollector() {}
  virtual ~ErrorCollector();

  // Indicates that there was an error in the input at the given line and
  // column numbers.  The numbers are zero-based, so you may want to add
  // 1 to each before printing them.
  virtual void AddError(int line, int column, const string& message) = 0;

  // Indicates that there was a warning in the input at the given line and
  // column numbers.  The numbers are zero-based, so you may want to add
  // 1 to each before printing them.
  virtual void AddWarning(int /* line */, int /* column */,
                          const string& /* message */) { }

 private:
  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector);
};

// This class converts a stream of raw text into a stream of tokens for
// the protocol definition parser to parse.  The tokens recognized are
// similar to those that make up the C language; see the TokenType enum for
// precise descriptions.  Whitespace and comments are skipped.  By default,
// C- and C++-style comments are recognized, but other styles can be used by
// calling set_comment_style().
class LIBPROTOBUF_EXPORT Tokenizer {
 public:
  // Construct a Tokenizer that reads and tokenizes text from the given
  // input stream and writes errors to the given error_collector.
  // The caller keeps ownership of input and error_collector.
  Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector);
  ~Tokenizer();

  enum TokenType {
    TYPE_START,       // Next() has not yet been called.
    TYPE_END,         // End of input reached.  "text" is empty.

    TYPE_IDENTIFIER,  // A sequence of letters, digits, and underscores, not
                      // starting with a digit.  It is an error for a number
                      // to be followed by an identifier with no space in
                      // between.
    TYPE_INTEGER,     // A sequence of digits representing an integer.  Normally
                      // the digits are decimal, but a prefix of "0x" indicates
                      // a hex number and a leading zero indicates octal, just
                      // like with C numeric literals.  A leading negative sign
                      // is NOT included in the token; it's up to the parser to
                      // interpret the unary minus operator on its own.
    TYPE_FLOAT,       // A floating point literal, with a fractional part and/or
                      // an exponent.  Always in decimal.  Again, never
                      // negative.
    TYPE_STRING,      // A quoted sequence of escaped characters.  Either single
                      // or double quotes can be used, but they must match.
                      // A string literal cannot cross a line break.
    TYPE_SYMBOL,      // Any other printable character, like '!' or '+'.
                      // Symbols are always a single character, so "!+$%" is
                      // four tokens.
  };

  // Structure representing a token read from the token stream.
  struct Token {
    TokenType type;
    string text;       // The exact text of the token as it appeared in
                       // the input.  e.g. tokens of TYPE_STRING will still
                       // be escaped and in quotes.

    // "line" and "column" specify the position of the first character of
    // the token within the input stream.  They are zero-based.
    int line;
    int column;
    int end_column;
  };

  // Get the current token.  This is updated when Next() is called.  Before
  // the first call to Next(), current() has type TYPE_START and no contents.
  const Token& current();

  // Return the previous token -- i.e. what current() returned before the
  // previous call to Next().
  const Token& previous();

  // Advance to the next token.  Returns false if the end of the input is
  // reached.
  bool Next();

  // Like Next(), but also collects comments which appear between the previous
  // and next tokens.
  //
  // Comments which appear to be attached to the previous token are stored
  // in *prev_tailing_comments.  Comments which appear to be attached to the
  // next token are stored in *next_leading_comments.  Comments appearing in
  // between which do not appear to be attached to either will be added to
  // detached_comments.  Any of these parameters can be NULL to simply discard
  // the comments.
  //
  // A series of line comments appearing on consecutive lines, with no other
  // tokens appearing on those lines, will be treated as a single comment.
  //
  // Only the comment content is returned; comment markers (e.g. //) are
  // stripped out.  For block comments, leading whitespace and an asterisk will
  // be stripped from the beginning of each line other than the first.  Newlines
  // are included in the output.
  //
  // Examples:
  //
  //   optional int32 foo = 1;  // Comment attached to foo.
  //   // Comment attached to bar.
  //   optional int32 bar = 2;
  //
  //   optional string baz = 3;
  //   // Comment attached to baz.
  //   // Another line attached to baz.
  //
  //   // Comment attached to qux.
  //   //
  //   // Another line attached to qux.
  //   optional double qux = 4;
  //
  //   // Detached comment.  This is not attached to qux or corge
  //   // because there are blank lines separating it from both.
  //
  //   optional string corge = 5;
  //   /* Block comment attached
  //    * to corge.  Leading asterisks
  //    * will be removed. */
  //   /* Block comment attached to
  //    * grault. */
  //   optional int32 grault = 6;
  bool NextWithComments(string* prev_trailing_comments,
                        vector<string>* detached_comments,
                        string* next_leading_comments);

  // Parse helpers ---------------------------------------------------

  // Parses a TYPE_FLOAT token.  This never fails, so long as the text actually
  // comes from a TYPE_FLOAT token parsed by Tokenizer.  If it doesn't, the
  // result is undefined (possibly an assert failure).
  static double ParseFloat(const string& text);

  // Parses a TYPE_STRING token.  This never fails, so long as the text actually
  // comes from a TYPE_STRING token parsed by Tokenizer.  If it doesn't, the
  // result is undefined (possibly an assert failure).
  static void ParseString(const string& text, string* output);

  // Identical to ParseString, but appends to output.
  static void ParseStringAppend(const string& text, string* output);

  // Parses a TYPE_INTEGER token.  Returns false if the result would be
  // greater than max_value.  Otherwise, returns true and sets *output to the
  // result.  If the text is not from a Token of type TYPE_INTEGER originally
  // parsed by a Tokenizer, the result is undefined (possibly an assert
  // failure).
  static bool ParseInteger(const string& text, uint64 max_value,
                           uint64* output);

  // Options ---------------------------------------------------------

  // Set true to allow floats to be suffixed with the letter 'f'.  Tokens
  // which would otherwise be integers but which have the 'f' suffix will be
  // forced to be interpreted as floats.  For all other purposes, the 'f' is
  // ignored.
  void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; }

  // Valid values for set_comment_style().
  enum CommentStyle {
    // Line comments begin with "//", block comments are delimited by "/*" and
    // "*/".
    CPP_COMMENT_STYLE,
    // Line comments begin with "#".  No way to write block comments.
    SH_COMMENT_STYLE
  };

  // Sets the comment style.
  void set_comment_style(CommentStyle style) { comment_style_ = style; }

  // Whether to require whitespace between a number and a field name.
  // Default is true. Do not use this; for Google-internal cleanup only.
  void set_require_space_after_number(bool require) {
    require_space_after_number_ = require;
  }

  // Whether to allow string literals to span multiple lines. Default is false.
  // Do not use this; for Google-internal cleanup only.
  void set_allow_multiline_strings(bool allow) {
    allow_multiline_strings_ = allow;
  }

  // External helper: validate an identifier.
  static bool IsIdentifier(const string& text);

  // -----------------------------------------------------------------
 private:
  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer);

  Token current_;           // Returned by current().
  Token previous_;          // Returned by previous().

  ZeroCopyInputStream* input_;
  ErrorCollector* error_collector_;

  char current_char_;       // == buffer_[buffer_pos_], updated by NextChar().
  const char* buffer_;      // Current buffer returned from input_.
  int buffer_size_;         // Size of buffer_.
  int buffer_pos_;          // Current position within the buffer.
  bool read_error_;         // Did we previously encounter a read error?

  // Line and column number of current_char_ within the whole input stream.
  int line_;
  int column_;

  // String to which text should be appended as we advance through it.
  // Call RecordTo(&str) to start recording and StopRecording() to stop.
  // E.g. StartToken() calls RecordTo(&current_.text).  record_start_ is the
  // position within the current buffer where recording started.
  string* record_target_;
  int record_start_;

  // Options.
  bool allow_f_after_float_;
  CommentStyle comment_style_;
  bool require_space_after_number_;
  bool allow_multiline_strings_;

  // Since we count columns we need to interpret tabs somehow.  We'll take
  // the standard 8-character definition for lack of any way to do better.
  static const int kTabWidth = 8;

  // -----------------------------------------------------------------
  // Helper methods.

  // Consume this character and advance to the next one.
  void NextChar();

  // Read a new buffer from the input.
  void Refresh();

  inline void RecordTo(string* target);
  inline void StopRecording();

  // Called when the current character is the first character of a new
  // token (not including whitespace or comments).
  inline void StartToken();
  // Called when the current character is the first character after the
  // end of the last token.  After this returns, current_.text will
  // contain all text consumed since StartToken() was called.
  inline void EndToken();

  // Convenience method to add an error at the current line and column.
  void AddError(const string& message) {
    error_collector_->AddError(line_, column_, message);
  }

  // -----------------------------------------------------------------
  // The following four methods are used to consume tokens of specific
  // types.  They are actually used to consume all characters *after*
  // the first, since the calling function consumes the first character
  // in order to decide what kind of token is being read.

  // Read and consume a string, ending when the given delimiter is
  // consumed.
  void ConsumeString(char delimiter);

  // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER
  // depending on what was read.  This needs to know if the first
  // character was a zero in order to correctly recognize hex and octal
  // numbers.
  // It also needs to know if the first characted was a . to parse floating
  // point correctly.
  TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);

  // Consume the rest of a line.
  void ConsumeLineComment(string* content);
  // Consume until "*/".
  void ConsumeBlockComment(string* content);

  enum NextCommentStatus {
    // Started a line comment.
    LINE_COMMENT,

    // Started a block comment.
    BLOCK_COMMENT,

    // Consumed a slash, then realized it wasn't a comment.  current_ has
    // been filled in with a slash token.  The caller should return it.
    SLASH_NOT_COMMENT,

    // We do not appear to be starting a comment here.
    NO_COMMENT
  };

  // If we're at the start of a new comment, consume it and return what kind
  // of comment it is.
  NextCommentStatus TryConsumeCommentStart();

  // -----------------------------------------------------------------
  // These helper methods make the parsing code more readable.  The
  // "character classes" refered to are defined at the top of the .cc file.
  // Basically it is a C++ class with one method:
  //   static bool InClass(char c);
  // The method returns true if c is a member of this "class", like "Letter"
  // or "Digit".

  // Returns true if the current character is of the given character
  // class, but does not consume anything.
  template<typename CharacterClass>
  inline bool LookingAt();

  // If the current character is in the given class, consume it and return
  // true.  Otherwise return false.
  // e.g. TryConsumeOne<Letter>()
  template<typename CharacterClass>
  inline bool TryConsumeOne();

  // Like above, but try to consume the specific character indicated.
  inline bool TryConsume(char c);

  // Consume zero or more of the given character class.
  template<typename CharacterClass>
  inline void ConsumeZeroOrMore();

  // Consume one or more of the given character class or log the given
  // error message.
  // e.g. ConsumeOneOrMore<Digit>("Expected digits.");
  template<typename CharacterClass>
  inline void ConsumeOneOrMore(const char* error);
};

// inline methods ====================================================
inline const Tokenizer::Token& Tokenizer::current() {
  return current_;
}

inline const Tokenizer::Token& Tokenizer::previous() {
  return previous_;
}

inline void Tokenizer::ParseString(const string& text, string* output) {
  output->clear();
  ParseStringAppend(text, output);
}

}  // namespace io
}  // namespace protobuf

}  // namespace google
#endif  // GOOGLE_PROTOBUF_IO_TOKENIZER_H__