15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Protocol Buffers - Google's data interchange format 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright 2008 Google Inc. All rights reserved. 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// http://code.google.com/p/protobuf/ 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Redistribution and use in source and binary forms, with or without 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// modification, are permitted provided that the following conditions are 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// met: 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// * Redistributions of source code must retain the above copyright 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// notice, this list of conditions and the following disclaimer. 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// * Redistributions in binary form must reproduce the above 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// copyright notice, this list of conditions and the following disclaimer 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// in the documentation and/or other materials provided with the 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// distribution. 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// * Neither the name of Google Inc. nor the names of its 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// contributors may be used to endorse or promote products derived from 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// this software without specific prior written permission. 
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Author: kenton@google.com (Kenton Varda) 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Based on original Protocol Buffers design by 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Sanjay Ghemawat, Jeff Dean, and others. 
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Class for parsing tokenized text from a ZeroCopyInputStream. 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef GOOGLE_PROTOBUF_IO_TOKENIZER_H__ 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define GOOGLE_PROTOBUF_IO_TOKENIZER_H__ 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string> 413d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch#include <vector> 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <google/protobuf/stubs/common.h> 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace google { 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace protobuf { 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace io { 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class ZeroCopyInputStream; // zero_copy_stream.h 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Defined in this file. 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class ErrorCollector; 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class Tokenizer; 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Abstract interface for an object which collects the errors that occur 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// during parsing. 
A typical implementation might simply print the errors 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// to stdout. 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class LIBPROTOBUF_EXPORT ErrorCollector { 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inline ErrorCollector() {} 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) virtual ~ErrorCollector(); 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Indicates that there was an error in the input at the given line and 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // column numbers. The numbers are zero-based, so you may want to add 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 1 to each before printing them. 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) virtual void AddError(int line, int column, const string& message) = 0; 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Indicates that there was a warning in the input at the given line and 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // column numbers. The numbers are zero-based, so you may want to add 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 1 to each before printing them. 
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) virtual void AddWarning(int line, int column, const string& message) { } 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ErrorCollector); 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This class converts a stream of raw text into a stream of tokens for 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the protocol definition parser to parse. The tokens recognized are 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// similar to those that make up the C language; see the TokenType enum for 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// precise descriptions. Whitespace and comments are skipped. By default, 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// C- and C++-style comments are recognized, but other styles can be used by 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// calling set_comment_style(). 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class LIBPROTOBUF_EXPORT Tokenizer { 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Construct a Tokenizer that reads and tokenizes text from the given 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // input stream and writes errors to the given error_collector. 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The caller keeps ownership of input and error_collector. 
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Tokenizer(ZeroCopyInputStream* input, ErrorCollector* error_collector); 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ~Tokenizer(); 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) enum TokenType { 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) TYPE_START, // Next() has not yet been called. 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) TYPE_END, // End of input reached. "text" is empty. 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) TYPE_IDENTIFIER, // A sequence of letters, digits, and underscores, not 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // starting with a digit. It is an error for a number 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // to be followed by an identifier with no space in 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // between. 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) TYPE_INTEGER, // A sequence of digits representing an integer. Normally 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the digits are decimal, but a prefix of "0x" indicates 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // a hex number and a leading zero indicates octal, just 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // like with C numeric literals. A leading negative sign 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // is NOT included in the token; it's up to the parser to 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // interpret the unary minus operator on its own. 
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) TYPE_FLOAT, // A floating point literal, with a fractional part and/or 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // an exponent. Always in decimal. Again, never 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // negative. 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) TYPE_STRING, // A quoted sequence of escaped characters. Either single 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // or double quotes can be used, but they must match. 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // A string literal cannot cross a line break. 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) TYPE_SYMBOL, // Any other printable character, like '!' or '+'. 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Symbols are always a single character, so "!+$%" is 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // four tokens. 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) }; 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Structure representing a token read from the token stream. 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) struct Token { 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) TokenType type; 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) string text; // The exact text of the token as it appeared in 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the input. e.g. tokens of TYPE_STRING will still 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // be escaped and in quotes. 
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // "line" and "column" specify the position of the first character of 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the token within the input stream. They are zero-based. 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int line; 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int column; 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int end_column; 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) }; 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Get the current token. This is updated when Next() is called. Before 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the first call to Next(), current() has type TYPE_START and no contents. 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const Token& current(); 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Return the previous token -- i.e. what current() returned before the 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // previous call to Next(). 1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const Token& previous(); 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Advance to the next token. Returns false if the end of the input is 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // reached. 
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool Next(); 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1413d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // Like Next(), but also collects comments which appear between the previous 1423d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // and next tokens. 1433d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // 1443d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // Comments which appear to be attached to the previous token are stored 1453d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // in *prev_trailing_comments. Comments which appear to be attached to the 1463d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // next token are stored in *next_leading_comments. Comments appearing in 1473d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // between which do not appear to be attached to either will be added to 1483d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // *detached_comments. Any of these parameters can be NULL to simply discard 1493d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // the comments. 1503d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // 1513d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // A series of line comments appearing on consecutive lines, with no other 1523d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // tokens appearing on those lines, will be treated as a single comment. 1533d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // 1543d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // Only the comment content is returned; comment markers (e.g. //) are 1553d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // stripped out. For block comments, leading whitespace and an asterisk will 1563d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // be stripped from the beginning of each line other than the first. Newlines 1573d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // are included in the output. 
1583d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // 1593d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // Examples: 1603d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // 1613d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // optional int32 foo = 1; // Comment attached to foo. 1623d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // // Comment attached to bar. 1633d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // optional int32 bar = 2; 1643d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // 1653d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // optional string baz = 3; 1663d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // // Comment attached to baz. 1673d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // // Another line attached to baz. 1683d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // 1693d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // // Comment attached to qux. 1703d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // // 1713d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // // Another line attached to qux. 1723d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // optional double qux = 4; 1733d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // 1743d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // // Detached comment. This is not attached to qux or corge 1753d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // // because there are blank lines separating it from both. 1763d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // 1773d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // optional string corge = 5; 1783d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // /* Block comment attached 1793d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // * to corge. Leading asterisks 1803d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // * will be removed. */ 1813d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // /* Block comment attached to 1823d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // * grault. 
*/ 1833d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // optional int32 grault = 6; 1843d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch bool NextWithComments(string* prev_trailing_comments, 1853d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch vector<string>* detached_comments, 1863d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch string* next_leading_comments); 1873d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch 1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Parse helpers --------------------------------------------------- 1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Parses a TYPE_FLOAT token. This never fails, so long as the text actually 1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // comes from a TYPE_FLOAT token parsed by Tokenizer. If it doesn't, the 1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // result is undefined (possibly an assert failure). 1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static double ParseFloat(const string& text); 1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Parses a TYPE_STRING token. This never fails, so long as the text actually 1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // comes from a TYPE_STRING token parsed by Tokenizer. If it doesn't, the 1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // result is undefined (possibly an assert failure). 1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static void ParseString(const string& text, string* output); 1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Identical to ParseString, but appends to output. 
2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static void ParseStringAppend(const string& text, string* output); 2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Parses a TYPE_INTEGER token. Returns false if the result would be 2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // greater than max_value. Otherwise, returns true and sets *output to the 2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // result. If the text is not from a Token of type TYPE_INTEGER originally 2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // parsed by a Tokenizer, the result is undefined (possibly an assert 2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // failure). 2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static bool ParseInteger(const string& text, uint64 max_value, 2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint64* output); 2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Options --------------------------------------------------------- 2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Set true to allow floats to be suffixed with the letter 'f'. Tokens 2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // which would otherwise be integers but which have the 'f' suffix will be 2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // forced to be interpreted as floats. For all other purposes, the 'f' is 2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ignored. 
2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void set_allow_f_after_float(bool value) { allow_f_after_float_ = value; } 2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Valid values for set_comment_style(). 2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) enum CommentStyle { 2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Line comments begin with "//", block comments are delimited by "/*" and 2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // "*/". 2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CPP_COMMENT_STYLE, 2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Line comments begin with "#". No way to write block comments. 2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) SH_COMMENT_STYLE 2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) }; 2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Sets the comment style. 2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void set_comment_style(CommentStyle style) { comment_style_ = style; } 2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ----------------------------------------------------------------- 2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Tokenizer); 2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Token current_; // Returned by current(). 2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Token previous_; // Returned by previous(). 
2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ZeroCopyInputStream* input_; 2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ErrorCollector* error_collector_; 2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) char current_char_; // == buffer_[buffer_pos_], updated by NextChar(). 2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* buffer_; // Current buffer returned from input_. 2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int buffer_size_; // Size of buffer_. 2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int buffer_pos_; // Current position within the buffer. 2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool read_error_; // Did we previously encounter a read error? 2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Line and column number of current_char_ within the whole input stream. 2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int line_; 2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int column_; 2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2513d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // String to which text should be appended as we advance through it. 2523d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // Call RecordTo(&str) to start recording and StopRecording() to stop. 2533d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // E.g. StartToken() calls RecordTo(&current_.text). record_start_ is the 2543d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // position within the current buffer where recording started. 
2553d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch string* record_target_; 2563d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch int record_start_; 2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Options. 2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool allow_f_after_float_; 2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) CommentStyle comment_style_; 2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Since we count columns we need to interpret tabs somehow. We'll take 2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the standard 8-character definition for lack of any way to do better. 2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static const int kTabWidth = 8; 2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ----------------------------------------------------------------- 2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Helper methods. 2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Consume this character and advance to the next one. 2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void NextChar(); 2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Read a new buffer from the input. 
2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void Refresh(); 2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2753d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch inline void RecordTo(string* target); 2763d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch inline void StopRecording(); 2773d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch 2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Called when the current character is the first character of a new 2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // token (not including whitespace or comments). 2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inline void StartToken(); 2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Called when the current character is the first character after the 2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // end of the last token. After this returns, current_.text will 2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // contain all text consumed since StartToken() was called. 2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inline void EndToken(); 2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Convenience method to add an error at the current line and column. 
2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void AddError(const string& message) { 2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) error_collector_->AddError(line_, column_, message); 2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ----------------------------------------------------------------- 2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The following four methods are used to consume tokens of specific 2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // types. They are actually used to consume all characters *after* 2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the first, since the calling function consumes the first character 2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // in order to decide what kind of token is being read. 2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Read and consume a string, ending when the given delimiter is 2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // consumed. 2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void ConsumeString(char delimiter); 3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Read and consume a number, returning TYPE_FLOAT or TYPE_INTEGER 3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // depending on what was read. This needs to know if the first 3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // character was a zero in order to correctly recognize hex and octal 3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // numbers. 
3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // It also needs to know if the first character was a . to parse floating 3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // point correctly. 3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) TokenType ConsumeNumber(bool started_with_zero, bool started_with_dot); 3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Consume the rest of a line. 3103d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch void ConsumeLineComment(string* content); 3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Consume until "*/". 3123d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch void ConsumeBlockComment(string* content); 3133d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch 3143d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch enum NextCommentStatus { 3153d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // Started a line comment. 3163d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch LINE_COMMENT, 3173d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch 3183d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // Started a block comment. 3193d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch BLOCK_COMMENT, 3203d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch 3213d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // Consumed a slash, then realized it wasn't a comment. current_ has 3223d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // been filled in with a slash token. The caller should return it. 3233d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch SLASH_NOT_COMMENT, 3243d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch 3253d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // We do not appear to be starting a comment here. 
3263d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch NO_COMMENT 3273d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch }; 3283d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch 3293d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // If we're at the start of a new comment, consume it and return what kind 3303d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch // of comment it is. 3313d4dfb6f11fb4e934d658743a8efc26d5490fdb0Ben Murdoch NextCommentStatus TryConsumeCommentStart(); 3325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ----------------------------------------------------------------- 3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // These helper methods make the parsing code more readable. The 3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // "character classes" refered to are defined at the top of the .cc file. 3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Basically it is a C++ class with one method: 3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // static bool InClass(char c); 3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The method returns true if c is a member of this "class", like "Letter" 3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // or "Digit". 3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns true if the current character is of the given character 3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // class, but does not consume anything. 
3435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) template<typename CharacterClass> 3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inline bool LookingAt(); 3455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If the current character is in the given class, consume it and return 3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // true. Otherwise return false. 3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // e.g. TryConsumeOne<Letter>() 3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) template<typename CharacterClass> 3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inline bool TryConsumeOne(); 3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Like above, but try to consume the specific character indicated. 3535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inline bool TryConsume(char c); 3545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Consume zero or more of the given character class. 3565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) template<typename CharacterClass> 3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inline void ConsumeZeroOrMore(); 3585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Consume one or more of the given character class or log the given 3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // error message. 3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // e.g. 
ConsumeOneOrMore<Digit>("Expected digits."); 3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) template<typename CharacterClass> 3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inline void ConsumeOneOrMore(const char* error); 3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// inline methods ==================================================== 3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)inline const Tokenizer::Token& Tokenizer::current() { 3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return current_; 3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)inline const Tokenizer::Token& Tokenizer::previous() { 3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return previous_; 3735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)inline void Tokenizer::ParseString(const string& text, string* output) { 3765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) output->clear(); 3775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ParseStringAppend(text, output); 3785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace io 3815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace protobuf 3825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace google 
3845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif // GOOGLE_PROTOBUF_IO_TOKENIZER_H__ 385