1// Protocol Buffers - Google's data interchange format
2// Copyright 2008 Google Inc.  All rights reserved.
3// http://code.google.com/p/protobuf/
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are
7// met:
8//
9//     * Redistributions of source code must retain the above copyright
10// notice, this list of conditions and the following disclaimer.
11//     * Redistributions in binary form must reproduce the above
12// copyright notice, this list of conditions and the following disclaimer
13// in the documentation and/or other materials provided with the
14// distribution.
15//     * Neither the name of Google Inc. nor the names of its
16// contributors may be used to endorse or promote products derived from
17// this software without specific prior written permission.
18//
19// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31// Author: kenton@google.com (Kenton Varda)
32//  Based on original Protocol Buffers design by
33//  Sanjay Ghemawat, Jeff Dean, and others.
34//
35// Implements parsing of .proto files to FileDescriptorProtos.
36
37#ifndef GOOGLE_PROTOBUF_COMPILER_PARSER_H__
38#define GOOGLE_PROTOBUF_COMPILER_PARSER_H__
39
40#include <map>
41#include <string>
42#include <utility>
43#include <google/protobuf/stubs/common.h>
44#include <google/protobuf/descriptor.h>
45#include <google/protobuf/descriptor.pb.h>
46#include <google/protobuf/repeated_field.h>
47#include <google/protobuf/io/tokenizer.h>
48
49namespace google {
50namespace protobuf { class Message; }
51
52namespace protobuf {
53namespace compiler {
54
55// Defined in this file.
56class Parser;
57class SourceLocationTable;
58
59// Implements parsing of protocol definitions (such as .proto files).
60//
61// Note that most users will be more interested in the Importer class.
62// Parser is a lower-level class which simply converts a single .proto file
63// to a FileDescriptorProto.  It does not resolve import directives or perform
64// many other kinds of validation needed to construct a complete
65// FileDescriptor.
66class LIBPROTOBUF_EXPORT Parser {
67 public:
68  Parser();
69  ~Parser();
70
71  // Parse the entire input and construct a FileDescriptorProto representing
72  // it.  Returns true if no errors occurred, false otherwise.
73  bool Parse(io::Tokenizer* input, FileDescriptorProto* file);
74
75  // Optional fetaures:
76
77  // Requests that locations of certain definitions be recorded to the given
78  // SourceLocationTable while parsing.  This can be used to look up exact line
79  // and column numbers for errors reported by DescriptorPool during validation.
80  // Set to NULL (the default) to discard source location information.
81  void RecordSourceLocationsTo(SourceLocationTable* location_table) {
82    source_location_table_ = location_table;
83  }
84
85  // Requsets that errors be recorded to the given ErrorCollector while
86  // parsing.  Set to NULL (the default) to discard error messages.
87  void RecordErrorsTo(io::ErrorCollector* error_collector) {
88    error_collector_ = error_collector;
89  }
90
91  // Returns the identifier used in the "syntax = " declaration, if one was
92  // seen during the last call to Parse(), or the empty string otherwise.
93  const string& GetSyntaxIdentifier() { return syntax_identifier_; }
94
95  // If set true, input files will be required to begin with a syntax
96  // identifier.  Otherwise, files may omit this.  If a syntax identifier
97  // is provided, it must be 'syntax = "proto2";' and must appear at the
98  // top of this file regardless of whether or not it was required.
99  void SetRequireSyntaxIdentifier(bool value) {
100    require_syntax_identifier_ = value;
101  }
102
103  // Call SetStopAfterSyntaxIdentifier(true) to tell the parser to stop
104  // parsing as soon as it has seen the syntax identifier, or lack thereof.
105  // This is useful for quickly identifying the syntax of the file without
106  // parsing the whole thing.  If this is enabled, no error will be recorded
107  // if the syntax identifier is something other than "proto2" (since
108  // presumably the caller intends to deal with that), but other kinds of
109  // errors (e.g. parse errors) will still be reported.  When this is enabled,
110  // you may pass a NULL FileDescriptorProto to Parse().
111  void SetStopAfterSyntaxIdentifier(bool value) {
112    stop_after_syntax_identifier_ = value;
113  }
114
115 private:
116  // =================================================================
117  // Error recovery helpers
118
119  // Consume the rest of the current statement.  This consumes tokens
120  // until it sees one of:
121  //   ';'  Consumes the token and returns.
122  //   '{'  Consumes the brace then calls SkipRestOfBlock().
123  //   '}'  Returns without consuming.
124  //   EOF  Returns (can't consume).
125  // The Parser often calls SkipStatement() after encountering a syntax
126  // error.  This allows it to go on parsing the following lines, allowing
127  // it to report more than just one error in the file.
128  void SkipStatement();
129
130  // Consume the rest of the current block, including nested blocks,
131  // ending after the closing '}' is encountered and consumed, or at EOF.
132  void SkipRestOfBlock();
133
134  // -----------------------------------------------------------------
135  // Single-token consuming helpers
136  //
137  // These make parsing code more readable.
138
139  // True if the current token is TYPE_END.
140  inline bool AtEnd();
141
142  // True if the next token matches the given text.
143  inline bool LookingAt(const char* text);
144  // True if the next token is of the given type.
145  inline bool LookingAtType(io::Tokenizer::TokenType token_type);
146
147  // If the next token exactly matches the text given, consume it and return
148  // true.  Otherwise, return false without logging an error.
149  bool TryConsume(const char* text);
150
151  // These attempt to read some kind of token from the input.  If successful,
152  // they return true.  Otherwise they return false and add the given error
153  // to the error list.
154
155  // Consume a token with the exact text given.
156  bool Consume(const char* text, const char* error);
157  // Same as above, but automatically generates the error "Expected \"text\".",
158  // where "text" is the expected token text.
159  bool Consume(const char* text);
160  // Consume a token of type IDENTIFIER and store its text in "output".
161  bool ConsumeIdentifier(string* output, const char* error);
162  // Consume an integer and store its value in "output".
163  bool ConsumeInteger(int* output, const char* error);
164  // Consume a 64-bit integer and store its value in "output".  If the value
165  // is greater than max_value, an error will be reported.
166  bool ConsumeInteger64(uint64 max_value, uint64* output, const char* error);
167  // Consume a number and store its value in "output".  This will accept
168  // tokens of either INTEGER or FLOAT type.
169  bool ConsumeNumber(double* output, const char* error);
170  // Consume a string literal and store its (unescaped) value in "output".
171  bool ConsumeString(string* output, const char* error);
172
173  // -----------------------------------------------------------------
174  // Error logging helpers
175
176  // Invokes error_collector_->AddError(), if error_collector_ is not NULL.
177  void AddError(int line, int column, const string& error);
178
179  // Invokes error_collector_->AddError() with the line and column number
180  // of the current token.
181  void AddError(const string& error);
182
183  // Record the given line and column and associate it with this descriptor
184  // in the SourceLocationTable.
185  void RecordLocation(const Message* descriptor,
186                      DescriptorPool::ErrorCollector::ErrorLocation location,
187                      int line, int column);
188
189  // Record the current line and column and associate it with this descriptor
190  // in the SourceLocationTable.
191  void RecordLocation(const Message* descriptor,
192                      DescriptorPool::ErrorCollector::ErrorLocation location);
193
194  // =================================================================
195  // Parsers for various language constructs
196
197  // Parses the "syntax = \"proto2\";" line at the top of the file.  Returns
198  // false if it failed to parse or if the syntax identifier was not
199  // recognized.
200  bool ParseSyntaxIdentifier();
201
202  // These methods parse various individual bits of code.  They return
203  // false if they completely fail to parse the construct.  In this case,
204  // it is probably necessary to skip the rest of the statement to recover.
205  // However, if these methods return true, it does NOT mean that there
206  // were no errors; only that there were no *syntax* errors.  For instance,
207  // if a service method is defined using proper syntax but uses a primitive
208  // type as its input or output, ParseMethodField() still returns true
209  // and only reports the error by calling AddError().  In practice, this
210  // makes logic much simpler for the caller.
211
212  // Parse a top-level message, enum, service, etc.
213  bool ParseTopLevelStatement(FileDescriptorProto* file);
214
215  // Parse various language high-level language construrcts.
216  bool ParseMessageDefinition(DescriptorProto* message);
217  bool ParseEnumDefinition(EnumDescriptorProto* enum_type);
218  bool ParseServiceDefinition(ServiceDescriptorProto* service);
219  bool ParsePackage(FileDescriptorProto* file);
220  bool ParseImport(string* import_filename);
221  bool ParseOption(Message* options);
222
223  // These methods parse the contents of a message, enum, or service type and
224  // add them to the given object.  They consume the entire block including
225  // the beginning and ending brace.
226  bool ParseMessageBlock(DescriptorProto* message);
227  bool ParseEnumBlock(EnumDescriptorProto* enum_type);
228  bool ParseServiceBlock(ServiceDescriptorProto* service);
229
230  // Parse one statement within a message, enum, or service block, inclunding
231  // final semicolon.
232  bool ParseMessageStatement(DescriptorProto* message);
233  bool ParseEnumStatement(EnumDescriptorProto* message);
234  bool ParseServiceStatement(ServiceDescriptorProto* message);
235
236  // Parse a field of a message.  If the field is a group, its type will be
237  // added to "messages".
238  bool ParseMessageField(FieldDescriptorProto* field,
239                         RepeatedPtrField<DescriptorProto>* messages);
240
241  // Parse an "extensions" declaration.
242  bool ParseExtensions(DescriptorProto* message);
243
244  // Parse an "extend" declaration.
245  bool ParseExtend(RepeatedPtrField<FieldDescriptorProto>* extensions,
246                   RepeatedPtrField<DescriptorProto>* messages);
247
248  // Parse a single enum value within an enum block.
249  bool ParseEnumConstant(EnumValueDescriptorProto* enum_value);
250
251  // Parse enum constant options, i.e. the list in square brackets at the end
252  // of the enum constant value definition.
253  bool ParseEnumConstantOptions(EnumValueDescriptorProto* value);
254
255  // Parse a single method within a service definition.
256  bool ParseServiceMethod(MethodDescriptorProto* method);
257
258  // Parse "required", "optional", or "repeated" and fill in "label"
259  // with the value.
260  bool ParseLabel(FieldDescriptorProto::Label* label);
261
262  // Parse a type name and fill in "type" (if it is a primitive) or
263  // "type_name" (if it is not) with the type parsed.
264  bool ParseType(FieldDescriptorProto::Type* type,
265                 string* type_name);
266  // Parse a user-defined type and fill in "type_name" with the name.
267  // If a primitive type is named, it is treated as an error.
268  bool ParseUserDefinedType(string* type_name);
269
270  // Parses field options, i.e. the stuff in square brackets at the end
271  // of a field definition.  Also parses default value.
272  bool ParseFieldOptions(FieldDescriptorProto* field);
273
274  // Parse the "default" option.  This needs special handling because its
275  // type is the field's type.
276  bool ParseDefaultAssignment(FieldDescriptorProto* field);
277
278  // Parse a single option name/value pair, e.g. "ctype = CORD".  The name
279  // identifies a field of the given Message, and the value of that field
280  // is set to the parsed value.
281  bool ParseOptionAssignment(Message* options);
282
283  // Parses a single part of a multipart option name. A multipart name consists
284  // of names separated by dots. Each name is either an identifier or a series
285  // of identifiers separated by dots and enclosed in parentheses. E.g.,
286  // "foo.(bar.baz).qux".
287  bool ParseOptionNamePart(UninterpretedOption* uninterpreted_option);
288
289  // =================================================================
290
291  io::Tokenizer* input_;
292  io::ErrorCollector* error_collector_;
293  SourceLocationTable* source_location_table_;
294  bool had_errors_;
295  bool require_syntax_identifier_;
296  bool stop_after_syntax_identifier_;
297  string syntax_identifier_;
298
299  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Parser);
300};
301
302// A table mapping (descriptor, ErrorLocation) pairs -- as reported by
303// DescriptorPool when validating descriptors -- to line and column numbers
304// within the original source code.
305class LIBPROTOBUF_EXPORT SourceLocationTable {
306 public:
307  SourceLocationTable();
308  ~SourceLocationTable();
309
310  // Finds the precise location of the given error and fills in *line and
311  // *column with the line and column numbers.  If not found, sets *line to
312  // -1 and *column to 0 (since line = -1 is used to mean "error has no exact
313  // location" in the ErrorCollector interface).  Returns true if found, false
314  // otherwise.
315  bool Find(const Message* descriptor,
316            DescriptorPool::ErrorCollector::ErrorLocation location,
317            int* line, int* column) const;
318
319  // Adds a location to the table.
320  void Add(const Message* descriptor,
321           DescriptorPool::ErrorCollector::ErrorLocation location,
322           int line, int column);
323
324  // Clears the contents of the table.
325  void Clear();
326
327 private:
328  typedef map<
329    pair<const Message*, DescriptorPool::ErrorCollector::ErrorLocation>,
330    pair<int, int> > LocationMap;
331  LocationMap location_map_;
332};
333
334}  // namespace compiler
335}  // namespace protobuf
336
337}  // namespace google
338#endif  // GOOGLE_PROTOBUF_COMPILER_PARSER_H__
339