// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc.  All rights reserved.
// http://code.google.com/p/protobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton@google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.
//
// Class for parsing tokenized text from a ZeroCopyInputStream.
36 37#ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__ 38#define GOOGLE_PROTOBUF_IO_TOKENIZER_H__ 39 40#include <string> 41#include <google/protobuf/stubs/common.h> 42 43namespace google { 44namespace protobuf { 45namespace io { 46 47class ZeroCopyInputStream; // zero_copy_stream.h 48 49// Defined in this file. 50class ErrorCollector; 51class Tokenizer; 52 53// Abstract interface for an object which collects the errors that occur 54// during parsing. A typical implementation might simply print the errors 55// to stdout. 56class LIBPROTOBUF_EXPORT ErrorCollector { 57 public: 58 inline ErrorCollector() {} 59 virtual ~ErrorCollector(); 60 61 // Indicates that there was an error in the input at the given line and 62 // column numbers. The numbers are zero-based, so you may want to add 63 // 1 to each before printing them. 64 virtual void AddError(int line, int column, const string& message) = 0; 65 66 // Indicates that there was a warning in the input at the given line and 67 // column numbers. The numbers are zero-based, so you may want to add 68 // 1 to each before printing them. 69 virtual void AddWarning(int line, int column, const string& message) { } 70 71 private: 72 GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector); 73}; 74 75// This class converts a stream of raw text into a stream of tokens for 76// the protocol definition parser to parse. The tokens recognized are 77// similar to those that make up the C language; see the TokenType enum for 78// precise descriptions. Whitespace and comments are skipped. By default, 79// C- and C++-style comments are recognized, but other styles can be used by 80// calling set_comment_style(). 81class LIBPROTOBUF_EXPORT Tokenizer { 82 public: 83 // Construct a Tokenizer that reads and tokenizes text from the given 84 // input stream and writes errors to the given error_collector. 85 // The caller keeps ownership of input and error_collector. 
86 Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector); 87 ~Tokenizer(); 88 89 enum TokenType { 90 TYPE_START, // Next() has not yet been called. 91 TYPE_END, // End of input reached. "text" is empty. 92 93 TYPE_IDENTIFIER, // A sequence of letters, digits, and underscores, not 94 // starting with a digit. It is an error for a number 95 // to be followed by an identifier with no space in 96 // between. 97 TYPE_INTEGER, // A sequence of digits representing an integer. Normally 98 // the digits are decimal, but a prefix of "0x" indicates 99 // a hex number and a leading zero indicates octal, just 100 // like with C numeric literals. A leading negative sign 101 // is NOT included in the token; it's up to the parser to 102 // interpret the unary minus operator on its own. 103 TYPE_FLOAT, // A floating point literal, with a fractional part and/or 104 // an exponent. Always in decimal. Again, never 105 // negative. 106 TYPE_STRING, // A quoted sequence of escaped characters. Either single 107 // or double quotes can be used, but they must match. 108 // A string literal cannot cross a line break. 109 TYPE_SYMBOL, // Any other printable character, like '!' or '+'. 110 // Symbols are always a single character, so "!+$%" is 111 // four tokens. 112 }; 113 114 // Structure representing a token read from the token stream. 115 struct Token { 116 TokenType type; 117 string text; // The exact text of the token as it appeared in 118 // the input. e.g. tokens of TYPE_STRING will still 119 // be escaped and in quotes. 120 121 // "line" and "column" specify the position of the first character of 122 // the token within the input stream. They are zero-based. 123 int line; 124 int column; 125 }; 126 127 // Get the current token. This is updated when Next() is called. Before 128 // the first call to Next(), current() has type TYPE_START and no contents. 129 const Token& current(); 130 131 // Advance to the next token. 
Returns false if the end of the input is 132 // reached. 133 bool Next(); 134 135 // Parse helpers --------------------------------------------------- 136 137 // Parses a TYPE_FLOAT token. This never fails, so long as the text actually 138 // comes from a TYPE_FLOAT token parsed by Tokenizer. If it doesn't, the 139 // result is undefined (possibly an assert failure). 140 static double ParseFloat(const string& text); 141 142 // Parses a TYPE_STRING token. This never fails, so long as the text actually 143 // comes from a TYPE_STRING token parsed by Tokenizer. If it doesn't, the 144 // result is undefined (possibly an assert failure). 145 static void ParseString(const string& text, string* output); 146 147 // Identical to ParseString, but appends to output. 148 static void ParseStringAppend(const string& text, string* output); 149 150 // Parses a TYPE_INTEGER token. Returns false if the result would be 151 // greater than max_value. Otherwise, returns true and sets *output to the 152 // result. If the text is not from a Token of type TYPE_INTEGER originally 153 // parsed by a Tokenizer, the result is undefined (possibly an assert 154 // failure). 155 static bool ParseInteger(const string& text, uint64 max_value, 156 uint64* output); 157 158 // Options --------------------------------------------------------- 159 160 // Set true to allow floats to be suffixed with the letter 'f'. Tokens 161 // which would otherwise be integers but which have the 'f' suffix will be 162 // forced to be interpreted as floats. For all other purposes, the 'f' is 163 // ignored. 164 void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; } 165 166 // Valid values for set_comment_style(). 167 enum CommentStyle { 168 // Line comments begin with "//", block comments are delimited by "/*" and 169 // "*/". 170 CPP_COMMENT_STYLE, 171 // Line comments begin with "#". No way to write block comments. 172 SH_COMMENT_STYLE 173 }; 174 175 // Sets the comment style. 
176 void set_comment_style(CommentStyle style) { comment_style_ = style; } 177 178 // ----------------------------------------------------------------- 179 private: 180 GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer); 181 182 Token current_; // Returned by current(). 183 184 ZeroCopyInputStream* input_; 185 ErrorCollector* error_collector_; 186 187 char current_char_; // == buffer_[buffer_pos_], updated by NextChar(). 188 const char* buffer_; // Current buffer returned from input_. 189 int buffer_size_; // Size of buffer_. 190 int buffer_pos_; // Current position within the buffer. 191 bool read_error_; // Did we previously encounter a read error? 192 193 // Line and column number of current_char_ within the whole input stream. 194 int line_; 195 int column_; 196 197 // Position in buffer_ where StartToken() was called. If the token 198 // started in the previous buffer, this is zero, and current_.text already 199 // contains the part of the token from the previous buffer. If not 200 // currently parsing a token, this is -1. 201 int token_start_; 202 203 // Options. 204 bool allow_f_after_float_; 205 CommentStyle comment_style_; 206 207 // Since we count columns we need to interpret tabs somehow. We'll take 208 // the standard 8-character definition for lack of any way to do better. 209 static const int kTabWidth = 8; 210 211 // ----------------------------------------------------------------- 212 // Helper methods. 213 214 // Consume this character and advance to the next one. 215 void NextChar(); 216 217 // Read a new buffer from the input. 218 void Refresh(); 219 220 // Called when the current character is the first character of a new 221 // token (not including whitespace or comments). 222 inline void StartToken(); 223 // Called when the current character is the first character after the 224 // end of the last token. After this returns, current_.text will 225 // contain all text consumed since StartToken() was called. 
226 inline void EndToken(); 227 228 // Convenience method to add an error at the current line and column. 229 void AddError(const string& message) { 230 error_collector_->AddError(line_, column_, message); 231 } 232 233 // ----------------------------------------------------------------- 234 // The following four methods are used to consume tokens of specific 235 // types. They are actually used to consume all characters *after* 236 // the first, since the calling function consumes the first character 237 // in order to decide what kind of token is being read. 238 239 // Read and consume a string, ending when the given delimiter is 240 // consumed. 241 void ConsumeString(char delimiter); 242 243 // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER 244 // depending on what was read. This needs to know if the first 245 // character was a zero in order to correctly recognize hex and octal 246 // numbers. 247 // It also needs to know if the first characted was a . to parse floating 248 // point correctly. 249 TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot); 250 251 // Consume the rest of a line. 252 void ConsumeLineComment(); 253 // Consume until "*/". 254 void ConsumeBlockComment(); 255 256 // ----------------------------------------------------------------- 257 // These helper methods make the parsing code more readable. The 258 // "character classes" refered to are defined at the top of the .cc file. 259 // Basically it is a C++ class with one method: 260 // static bool InClass(char c); 261 // The method returns true if c is a member of this "class", like "Letter" 262 // or "Digit". 263 264 // Returns true if the current character is of the given character 265 // class, but does not consume anything. 266 template<typename CharacterClass> 267 inline bool LookingAt(); 268 269 // If the current character is in the given class, consume it and return 270 // true. Otherwise return false. 271 // e.g. 
TryConsumeOne<Letter>() 272 template<typename CharacterClass> 273 inline bool TryConsumeOne(); 274 275 // Like above, but try to consume the specific character indicated. 276 inline bool TryConsume(char c); 277 278 // Consume zero or more of the given character class. 279 template<typename CharacterClass> 280 inline void ConsumeZeroOrMore(); 281 282 // Consume one or more of the given character class or log the given 283 // error message. 284 // e.g. ConsumeOneOrMore<Digit>("Expected digits."); 285 template<typename CharacterClass> 286 inline void ConsumeOneOrMore(const char* error); 287}; 288 289// inline methods ==================================================== 290inline const Tokenizer::Token& Tokenizer::current() { 291 return current_; 292} 293 294inline void Tokenizer::ParseString(const string& text, string* output) { 295 output->clear(); 296 ParseStringAppend(text, output); 297} 298 299} // namespace io 300} // namespace protobuf 301 302} // namespace google 303#endif // GOOGLE_PROTOBUF_IO_TOKENIZER_H__ 304