1// Protocol Buffers - Google's data interchange format 2// Copyright 2008 Google Inc. All rights reserved. 3// https://developers.google.com/protocol-buffers/ 4// 5// Redistribution and use in source and binary forms, with or without 6// modification, are permitted provided that the following conditions are 7// met: 8// 9// * Redistributions of source code must retain the above copyright 10// notice, this list of conditions and the following disclaimer. 11// * Redistributions in binary form must reproduce the above 12// copyright notice, this list of conditions and the following disclaimer 13// in the documentation and/or other materials provided with the 14// distribution. 15// * Neither the name of Google Inc. nor the names of its 16// contributors may be used to endorse or promote products derived from 17// this software without specific prior written permission. 18// 19// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31// Author: kenton@google.com (Kenton Varda) 32// Based on original Protocol Buffers design by 33// Sanjay Ghemawat, Jeff Dean, and others. 34// 35// Class for parsing tokenized text from a ZeroCopyInputStream. 
#ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__
#define GOOGLE_PROTOBUF_IO_TOKENIZER_H__

#include <string>
#include <vector>
#include <google/protobuf/stubs/common.h>

namespace google {
namespace protobuf {
namespace io {

class ZeroCopyInputStream;     // zero_copy_stream.h

// Defined in this file.
class ErrorCollector;
class Tokenizer;

// Abstract interface for an object which collects the errors that occur
// during parsing.  A typical implementation might simply print the errors
// to stdout.
class LIBPROTOBUF_EXPORT ErrorCollector {
 public:
  inline ErrorCollector() {}
  virtual ~ErrorCollector();

  // Indicates that there was an error in the input at the given line and
  // column numbers.  The numbers are zero-based, so you may want to add
  // 1 to each before printing them.
  virtual void AddError(int line, int column, const string& message) = 0;

  // Indicates that there was a warning in the input at the given line and
  // column numbers.  The numbers are zero-based, so you may want to add
  // 1 to each before printing them.
  virtual void AddWarning(int /* line */, int /* column */,
                          const string& /* message */) { }

 private:
  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector);
};

// This class converts a stream of raw text into a stream of tokens for
// the protocol definition parser to parse.  The tokens recognized are
// similar to those that make up the C language; see the TokenType enum for
// precise descriptions.  Whitespace and comments are skipped.  By default,
// C- and C++-style comments are recognized, but other styles can be used by
// calling set_comment_style().
class LIBPROTOBUF_EXPORT Tokenizer {
 public:
  // Construct a Tokenizer that reads and tokenizes text from the given
  // input stream and writes errors to the given error_collector.
  // The caller keeps ownership of input and error_collector.
  Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector);
  ~Tokenizer();

  enum TokenType {
    TYPE_START,       // Next() has not yet been called.
    TYPE_END,         // End of input reached.  "text" is empty.

    TYPE_IDENTIFIER,  // A sequence of letters, digits, and underscores, not
                      // starting with a digit.  It is an error for a number
                      // to be followed by an identifier with no space in
                      // between.
    TYPE_INTEGER,     // A sequence of digits representing an integer.  Normally
                      // the digits are decimal, but a prefix of "0x" indicates
                      // a hex number and a leading zero indicates octal, just
                      // like with C numeric literals.  A leading negative sign
                      // is NOT included in the token; it's up to the parser to
                      // interpret the unary minus operator on its own.
    TYPE_FLOAT,       // A floating point literal, with a fractional part and/or
                      // an exponent.  Always in decimal.  Again, never
                      // negative.
    TYPE_STRING,      // A quoted sequence of escaped characters.  Either single
                      // or double quotes can be used, but they must match.
                      // A string literal cannot cross a line break.
    TYPE_SYMBOL,      // Any other printable character, like '!' or '+'.
                      // Symbols are always a single character, so "!+$%" is
                      // four tokens.
  };

  // Structure representing a token read from the token stream.
  struct Token {
    TokenType type;
    string text;       // The exact text of the token as it appeared in
                       // the input.  e.g. tokens of TYPE_STRING will still
                       // be escaped and in quotes.

    // "line" and "column" specify the position of the first character of
    // the token within the input stream.  They are zero-based.
    int line;
    int column;
    // NOTE(review): end_column appears to be the column just past the last
    // character of the token (exclusive end) -- confirm against EndToken()
    // in tokenizer.cc.
    int end_column;
  };

  // Get the current token.  This is updated when Next() is called.  Before
  // the first call to Next(), current() has type TYPE_START and no contents.
  const Token& current();

  // Return the previous token -- i.e. what current() returned before the
  // previous call to Next().
  const Token& previous();

  // Advance to the next token.  Returns false if the end of the input is
  // reached.
  bool Next();

  // Like Next(), but also collects comments which appear between the previous
  // and next tokens.
  //
  // Comments which appear to be attached to the previous token are stored
  // in *prev_trailing_comments.  Comments which appear to be attached to the
  // next token are stored in *next_leading_comments.  Comments appearing in
  // between which do not appear to be attached to either will be added to
  // detached_comments.  Any of these parameters can be NULL to simply discard
  // the comments.
  //
  // A series of line comments appearing on consecutive lines, with no other
  // tokens appearing on those lines, will be treated as a single comment.
  //
  // Only the comment content is returned; comment markers (e.g. //) are
  // stripped out.  For block comments, leading whitespace and an asterisk will
  // be stripped from the beginning of each line other than the first.  Newlines
  // are included in the output.
  //
  // Examples:
  //
  //   optional int32 foo = 1;  // Comment attached to foo.
  //   // Comment attached to bar.
  //   optional int32 bar = 2;
  //
  //   optional string baz = 3;
  //   // Comment attached to baz.
  //   // Another line attached to baz.
  //
  //   // Comment attached to qux.
  //   //
  //   // Another line attached to qux.
  //   optional double qux = 4;
  //
  //   // Detached comment.  This is not attached to qux or corge
  //   // because there are blank lines separating it from both.
  //
  //   optional string corge = 5;
  //   /* Block comment attached
  //    * to corge.  Leading asterisks
  //    * will be removed. */
  //   /* Block comment attached to
  //    * grault. */
  //   optional int32 grault = 6;
  bool NextWithComments(string* prev_trailing_comments,
                        vector<string>* detached_comments,
                        string* next_leading_comments);

  // Parse helpers ---------------------------------------------------

  // Parses a TYPE_FLOAT token.  This never fails, so long as the text actually
  // comes from a TYPE_FLOAT token parsed by Tokenizer.  If it doesn't, the
  // result is undefined (possibly an assert failure).
  static double ParseFloat(const string& text);

  // Parses a TYPE_STRING token.  This never fails, so long as the text actually
  // comes from a TYPE_STRING token parsed by Tokenizer.  If it doesn't, the
  // result is undefined (possibly an assert failure).
  static void ParseString(const string& text, string* output);

  // Identical to ParseString, but appends to output.
  static void ParseStringAppend(const string& text, string* output);

  // Parses a TYPE_INTEGER token.  Returns false if the result would be
  // greater than max_value.  Otherwise, returns true and sets *output to the
  // result.  If the text is not from a Token of type TYPE_INTEGER originally
  // parsed by a Tokenizer, the result is undefined (possibly an assert
  // failure).
  static bool ParseInteger(const string& text, uint64 max_value,
                           uint64* output);

  // Options ---------------------------------------------------------

  // Set true to allow floats to be suffixed with the letter 'f'.  Tokens
  // which would otherwise be integers but which have the 'f' suffix will be
  // forced to be interpreted as floats.  For all other purposes, the 'f' is
  // ignored.
  void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; }

  // Valid values for set_comment_style().
  enum CommentStyle {
    // Line comments begin with "//", block comments are delimited by "/*" and
    // "*/".
    CPP_COMMENT_STYLE,
    // Line comments begin with "#".  No way to write block comments.
    SH_COMMENT_STYLE
  };

  // Sets the comment style.
  void set_comment_style(CommentStyle style) { comment_style_ = style; }

  // Whether to require whitespace between a number and a field name.
  // Default is true.  Do not use this; for Google-internal cleanup only.
  void set_require_space_after_number(bool require) {
    require_space_after_number_ = require;
  }

  // Whether to allow string literals to span multiple lines.  Default is false.
  // Do not use this; for Google-internal cleanup only.
  void set_allow_multiline_strings(bool allow) {
    allow_multiline_strings_ = allow;
  }

  // External helper: validate an identifier.
  static bool IsIdentifier(const string& text);

  // -----------------------------------------------------------------
 private:
  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer);

  Token current_;           // Returned by current().
  Token previous_;          // Returned by previous().

  ZeroCopyInputStream* input_;
  ErrorCollector* error_collector_;

  char current_char_;       // == buffer_[buffer_pos_], updated by NextChar().
  const char* buffer_;      // Current buffer returned from input_.
  int buffer_size_;         // Size of buffer_.
  int buffer_pos_;          // Current position within the buffer.
  bool read_error_;         // Did we previously encounter a read error?

  // Line and column number of current_char_ within the whole input stream.
  int line_;
  int column_;

  // String to which text should be appended as we advance through it.
  // Call RecordTo(&str) to start recording and StopRecording() to stop.
  // E.g. StartToken() calls RecordTo(&current_.text).  record_start_ is the
  // position within the current buffer where recording started.
  string* record_target_;
  int record_start_;

  // Options.
  bool allow_f_after_float_;
  CommentStyle comment_style_;
  bool require_space_after_number_;
  bool allow_multiline_strings_;

  // Since we count columns we need to interpret tabs somehow.  We'll take
  // the standard 8-character definition for lack of any way to do better.
  static const int kTabWidth = 8;

  // -----------------------------------------------------------------
  // Helper methods.

  // Consume this character and advance to the next one.
  void NextChar();

  // Read a new buffer from the input.
  void Refresh();

  inline void RecordTo(string* target);
  inline void StopRecording();

  // Called when the current character is the first character of a new
  // token (not including whitespace or comments).
  inline void StartToken();
  // Called when the current character is the first character after the
  // end of the last token.  After this returns, current_.text will
  // contain all text consumed since StartToken() was called.
  inline void EndToken();

  // Convenience method to add an error at the current line and column.
  void AddError(const string& message) {
    error_collector_->AddError(line_, column_, message);
  }

  // -----------------------------------------------------------------
  // The following four methods are used to consume tokens of specific
  // types.  They are actually used to consume all characters *after*
  // the first, since the calling function consumes the first character
  // in order to decide what kind of token is being read.

  // Read and consume a string, ending when the given delimiter is
  // consumed.
  void ConsumeString(char delimiter);

  // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER
  // depending on what was read.  This needs to know if the first
  // character was a zero in order to correctly recognize hex and octal
  // numbers.
  // It also needs to know if the first character was a . to parse floating
  // point correctly.
  TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot);

  // Consume the rest of a line.
  void ConsumeLineComment(string* content);
  // Consume until "*/".
  void ConsumeBlockComment(string* content);

  enum NextCommentStatus {
    // Started a line comment.
    LINE_COMMENT,

    // Started a block comment.
    BLOCK_COMMENT,

    // Consumed a slash, then realized it wasn't a comment.  current_ has
    // been filled in with a slash token.  The caller should return it.
    SLASH_NOT_COMMENT,

    // We do not appear to be starting a comment here.
    NO_COMMENT
  };

  // If we're at the start of a new comment, consume it and return what kind
  // of comment it is.
  NextCommentStatus TryConsumeCommentStart();

  // -----------------------------------------------------------------
  // These helper methods make the parsing code more readable.  The
  // "character classes" referred to are defined at the top of the .cc file.
  // Basically it is a C++ class with one method:
  //   static bool InClass(char c);
  // The method returns true if c is a member of this "class", like "Letter"
  // or "Digit".

  // Returns true if the current character is of the given character
  // class, but does not consume anything.
  template<typename CharacterClass>
  inline bool LookingAt();

  // If the current character is in the given class, consume it and return
  // true.  Otherwise return false.
  // e.g. TryConsumeOne<Letter>()
  template<typename CharacterClass>
  inline bool TryConsumeOne();

  // Like above, but try to consume the specific character indicated.
  inline bool TryConsume(char c);

  // Consume zero or more of the given character class.
  template<typename CharacterClass>
  inline void ConsumeZeroOrMore();

  // Consume one or more of the given character class or log the given
  // error message.
  // e.g. ConsumeOneOrMore<Digit>("Expected digits.");
  template<typename CharacterClass>
  inline void ConsumeOneOrMore(const char* error);
};

// inline methods ====================================================
inline const Tokenizer::Token& Tokenizer::current() {
  return current_;
}

inline const Tokenizer::Token& Tokenizer::previous() {
  return previous_;
}

inline void Tokenizer::ParseString(const string& text, string* output) {
  output->clear();
  ParseStringAppend(text, output);
}

}  // namespace io
}  // namespace protobuf

}  // namespace google
#endif  // GOOGLE_PROTOBUF_IO_TOKENIZER_H__