1// Protocol Buffers - Google's data interchange format
2// Copyright 2008 Google Inc.  All rights reserved.
3// http://code.google.com/p/protobuf/
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are
7// met:
8//
9//     * Redistributions of source code must retain the above copyright
10// notice, this list of conditions and the following disclaimer.
11//     * Redistributions in binary form must reproduce the above
12// copyright notice, this list of conditions and the following disclaimer
13// in the documentation and/or other materials provided with the
14// distribution.
15//     * Neither the name of Google Inc. nor the names of its
16// contributors may be used to endorse or promote products derived from
17// this software without specific prior written permission.
18//
19// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31// Author: kenton@google.com (Kenton Varda)
32//  Based on original Protocol Buffers design by
33//  Sanjay Ghemawat, Jeff Dean, and others.
34//
35// Implements parsing of .proto files to FileDescriptorProtos.
36
37#ifndef GOOGLE_PROTOBUF_COMPILER_PARSER_H__
38#define GOOGLE_PROTOBUF_COMPILER_PARSER_H__
39
40#include <map>
41#include <string>
42#include <utility>
43#include <google/protobuf/stubs/common.h>
44#include <google/protobuf/descriptor.h>
45#include <google/protobuf/descriptor.pb.h>
46#include <google/protobuf/repeated_field.h>
47#include <google/protobuf/io/tokenizer.h>
48
49namespace google {
50namespace protobuf { class Message; }
51
52namespace protobuf {
53namespace compiler {
54
55// Defined in this file.
56class Parser;
57class SourceLocationTable;
58
59// Implements parsing of protocol definitions (such as .proto files).
60//
61// Note that most users will be more interested in the Importer class.
62// Parser is a lower-level class which simply converts a single .proto file
63// to a FileDescriptorProto.  It does not resolve import directives or perform
64// many other kinds of validation needed to construct a complete
65// FileDescriptor.
66class LIBPROTOBUF_EXPORT Parser {
67 public:
68  Parser();
69  ~Parser();
70
71  // Parse the entire input and construct a FileDescriptorProto representing
72  // it.  Returns true if no errors occurred, false otherwise.
73  bool Parse(io::Tokenizer* input, FileDescriptorProto* file);
74
75  // Optional fetaures:
76
77  // DEPRECATED:  New code should use the SourceCodeInfo embedded in the
78  //   FileDescriptorProto.
79  //
80  // Requests that locations of certain definitions be recorded to the given
81  // SourceLocationTable while parsing.  This can be used to look up exact line
82  // and column numbers for errors reported by DescriptorPool during validation.
83  // Set to NULL (the default) to discard source location information.
84  void RecordSourceLocationsTo(SourceLocationTable* location_table) {
85    source_location_table_ = location_table;
86  }
87
88  // Requests that errors be recorded to the given ErrorCollector while
89  // parsing.  Set to NULL (the default) to discard error messages.
90  void RecordErrorsTo(io::ErrorCollector* error_collector) {
91    error_collector_ = error_collector;
92  }
93
94  // Returns the identifier used in the "syntax = " declaration, if one was
95  // seen during the last call to Parse(), or the empty string otherwise.
96  const string& GetSyntaxIdentifier() { return syntax_identifier_; }
97
98  // If set true, input files will be required to begin with a syntax
99  // identifier.  Otherwise, files may omit this.  If a syntax identifier
100  // is provided, it must be 'syntax = "proto2";' and must appear at the
101  // top of this file regardless of whether or not it was required.
102  void SetRequireSyntaxIdentifier(bool value) {
103    require_syntax_identifier_ = value;
104  }
105
106  // Call SetStopAfterSyntaxIdentifier(true) to tell the parser to stop
107  // parsing as soon as it has seen the syntax identifier, or lack thereof.
108  // This is useful for quickly identifying the syntax of the file without
109  // parsing the whole thing.  If this is enabled, no error will be recorded
110  // if the syntax identifier is something other than "proto2" (since
111  // presumably the caller intends to deal with that), but other kinds of
112  // errors (e.g. parse errors) will still be reported.  When this is enabled,
113  // you may pass a NULL FileDescriptorProto to Parse().
114  void SetStopAfterSyntaxIdentifier(bool value) {
115    stop_after_syntax_identifier_ = value;
116  }
117
118 private:
119  class LocationRecorder;
120
121  // =================================================================
122  // Error recovery helpers
123
124  // Consume the rest of the current statement.  This consumes tokens
125  // until it sees one of:
126  //   ';'  Consumes the token and returns.
127  //   '{'  Consumes the brace then calls SkipRestOfBlock().
128  //   '}'  Returns without consuming.
129  //   EOF  Returns (can't consume).
130  // The Parser often calls SkipStatement() after encountering a syntax
131  // error.  This allows it to go on parsing the following lines, allowing
132  // it to report more than just one error in the file.
133  void SkipStatement();
134
135  // Consume the rest of the current block, including nested blocks,
136  // ending after the closing '}' is encountered and consumed, or at EOF.
137  void SkipRestOfBlock();
138
139  // -----------------------------------------------------------------
140  // Single-token consuming helpers
141  //
142  // These make parsing code more readable.
143
144  // True if the current token is TYPE_END.
145  inline bool AtEnd();
146
147  // True if the next token matches the given text.
148  inline bool LookingAt(const char* text);
149  // True if the next token is of the given type.
150  inline bool LookingAtType(io::Tokenizer::TokenType token_type);
151
152  // If the next token exactly matches the text given, consume it and return
153  // true.  Otherwise, return false without logging an error.
154  bool TryConsume(const char* text);
155
156  // These attempt to read some kind of token from the input.  If successful,
157  // they return true.  Otherwise they return false and add the given error
158  // to the error list.
159
160  // Consume a token with the exact text given.
161  bool Consume(const char* text, const char* error);
162  // Same as above, but automatically generates the error "Expected \"text\".",
163  // where "text" is the expected token text.
164  bool Consume(const char* text);
165  // Consume a token of type IDENTIFIER and store its text in "output".
166  bool ConsumeIdentifier(string* output, const char* error);
167  // Consume an integer and store its value in "output".
168  bool ConsumeInteger(int* output, const char* error);
169  // Consume a signed integer and store its value in "output".
170  bool ConsumeSignedInteger(int* output, const char* error);
171  // Consume a 64-bit integer and store its value in "output".  If the value
172  // is greater than max_value, an error will be reported.
173  bool ConsumeInteger64(uint64 max_value, uint64* output, const char* error);
174  // Consume a number and store its value in "output".  This will accept
175  // tokens of either INTEGER or FLOAT type.
176  bool ConsumeNumber(double* output, const char* error);
177  // Consume a string literal and store its (unescaped) value in "output".
178  bool ConsumeString(string* output, const char* error);
179
180  // Consume a token representing the end of the statement.  Comments between
181  // this token and the next will be harvested for documentation.  The given
182  // LocationRecorder should refer to the declaration that was just parsed;
183  // it will be populated with these comments.
184  //
185  // TODO(kenton):  The LocationRecorder is const because historically locations
186  //   have been passed around by const reference, for no particularly good
187  //   reason.  We should probably go through and change them all to mutable
188  //   pointer to make this more intuitive.
189  bool TryConsumeEndOfDeclaration(const char* text,
190                                  const LocationRecorder* location);
191  bool ConsumeEndOfDeclaration(const char* text,
192                               const LocationRecorder* location);
193
194  // -----------------------------------------------------------------
195  // Error logging helpers
196
197  // Invokes error_collector_->AddError(), if error_collector_ is not NULL.
198  void AddError(int line, int column, const string& error);
199
200  // Invokes error_collector_->AddError() with the line and column number
201  // of the current token.
202  void AddError(const string& error);
203
204  // Records a location in the SourceCodeInfo.location table (see
205  // descriptor.proto).  We use RAII to ensure that the start and end locations
206  // are recorded -- the constructor records the start location and the
207  // destructor records the end location.  Since the parser is
208  // recursive-descent, this works out beautifully.
209  class LIBPROTOBUF_EXPORT LocationRecorder {
210   public:
211    // Construct the file's "root" location.
212    LocationRecorder(Parser* parser);
213
214    // Construct a location that represents a declaration nested within the
215    // given parent.  E.g. a field's location is nested within the location
216    // for a message type.  The parent's path will be copied, so you should
217    // call AddPath() only to add the path components leading from the parent
218    // to the child (as opposed to leading from the root to the child).
219    LocationRecorder(const LocationRecorder& parent);
220
221    // Convenience constructors that call AddPath() one or two times.
222    LocationRecorder(const LocationRecorder& parent, int path1);
223    LocationRecorder(const LocationRecorder& parent, int path1, int path2);
224
225    ~LocationRecorder();
226
227    // Add a path component.  See SourceCodeInfo.Location.path in
228    // descriptor.proto.
229    void AddPath(int path_component);
230
231    // By default the location is considered to start at the current token at
232    // the time the LocationRecorder is created.  StartAt() sets the start
233    // location to the given token instead.
234    void StartAt(const io::Tokenizer::Token& token);
235
236    // By default the location is considered to end at the previous token at
237    // the time the LocationRecorder is destroyed.  EndAt() sets the end
238    // location to the given token instead.
239    void EndAt(const io::Tokenizer::Token& token);
240
241    // Records the start point of this location to the SourceLocationTable that
242    // was passed to RecordSourceLocationsTo(), if any.  SourceLocationTable
243    // is an older way of keeping track of source locations which is still
244    // used in some places.
245    void RecordLegacyLocation(const Message* descriptor,
246        DescriptorPool::ErrorCollector::ErrorLocation location);
247
248    // Attaches leading and trailing comments to the location.  The two strings
249    // will be swapped into place, so after this is called *leading and
250    // *trailing will be empty.
251    //
252    // TODO(kenton):  See comment on TryConsumeEndOfDeclaration(), above, for
253    //   why this is const.
254    void AttachComments(string* leading, string* trailing) const;
255
256   private:
257    Parser* parser_;
258    SourceCodeInfo::Location* location_;
259
260    void Init(const LocationRecorder& parent);
261  };
262
263  // =================================================================
264  // Parsers for various language constructs
265
266  // Parses the "syntax = \"proto2\";" line at the top of the file.  Returns
267  // false if it failed to parse or if the syntax identifier was not
268  // recognized.
269  bool ParseSyntaxIdentifier();
270
271  // These methods parse various individual bits of code.  They return
272  // false if they completely fail to parse the construct.  In this case,
273  // it is probably necessary to skip the rest of the statement to recover.
274  // However, if these methods return true, it does NOT mean that there
275  // were no errors; only that there were no *syntax* errors.  For instance,
276  // if a service method is defined using proper syntax but uses a primitive
277  // type as its input or output, ParseMethodField() still returns true
278  // and only reports the error by calling AddError().  In practice, this
279  // makes logic much simpler for the caller.
280
281  // Parse a top-level message, enum, service, etc.
282  bool ParseTopLevelStatement(FileDescriptorProto* file,
283                              const LocationRecorder& root_location);
284
285  // Parse various language high-level language construrcts.
286  bool ParseMessageDefinition(DescriptorProto* message,
287                              const LocationRecorder& message_location);
288  bool ParseEnumDefinition(EnumDescriptorProto* enum_type,
289                           const LocationRecorder& enum_location);
290  bool ParseServiceDefinition(ServiceDescriptorProto* service,
291                              const LocationRecorder& service_location);
292  bool ParsePackage(FileDescriptorProto* file,
293                    const LocationRecorder& root_location);
294  bool ParseImport(RepeatedPtrField<string>* dependency,
295                   RepeatedField<int32>* public_dependency,
296                   RepeatedField<int32>* weak_dependency,
297                   const LocationRecorder& root_location);
298  bool ParseOption(Message* options,
299                   const LocationRecorder& options_location);
300
301  // These methods parse the contents of a message, enum, or service type and
302  // add them to the given object.  They consume the entire block including
303  // the beginning and ending brace.
304  bool ParseMessageBlock(DescriptorProto* message,
305                         const LocationRecorder& message_location);
306  bool ParseEnumBlock(EnumDescriptorProto* enum_type,
307                      const LocationRecorder& enum_location);
308  bool ParseServiceBlock(ServiceDescriptorProto* service,
309                         const LocationRecorder& service_location);
310
311  // Parse one statement within a message, enum, or service block, inclunding
312  // final semicolon.
313  bool ParseMessageStatement(DescriptorProto* message,
314                             const LocationRecorder& message_location);
315  bool ParseEnumStatement(EnumDescriptorProto* message,
316                          const LocationRecorder& enum_location);
317  bool ParseServiceStatement(ServiceDescriptorProto* message,
318                             const LocationRecorder& service_location);
319
320  // Parse a field of a message.  If the field is a group, its type will be
321  // added to "messages".
322  //
323  // parent_location and location_field_number_for_nested_type are needed when
324  // parsing groups -- we need to generate a nested message type within the
325  // parent and record its location accordingly.  Since the parent could be
326  // either a FileDescriptorProto or a DescriptorProto, we must pass in the
327  // correct field number to use.
328  bool ParseMessageField(FieldDescriptorProto* field,
329                         RepeatedPtrField<DescriptorProto>* messages,
330                         const LocationRecorder& parent_location,
331                         int location_field_number_for_nested_type,
332                         const LocationRecorder& field_location);
333
334  // Parse an "extensions" declaration.
335  bool ParseExtensions(DescriptorProto* message,
336                       const LocationRecorder& extensions_location);
337
338  // Parse an "extend" declaration.  (See also comments for
339  // ParseMessageField().)
340  bool ParseExtend(RepeatedPtrField<FieldDescriptorProto>* extensions,
341                   RepeatedPtrField<DescriptorProto>* messages,
342                   const LocationRecorder& parent_location,
343                   int location_field_number_for_nested_type,
344                   const LocationRecorder& extend_location);
345
346  // Parse a single enum value within an enum block.
347  bool ParseEnumConstant(EnumValueDescriptorProto* enum_value,
348                         const LocationRecorder& enum_value_location);
349
350  // Parse enum constant options, i.e. the list in square brackets at the end
351  // of the enum constant value definition.
352  bool ParseEnumConstantOptions(EnumValueDescriptorProto* value,
353                                const LocationRecorder& enum_value_location);
354
355  // Parse a single method within a service definition.
356  bool ParseServiceMethod(MethodDescriptorProto* method,
357                          const LocationRecorder& method_location);
358
359
360  // Parse options of a single method or stream.
361  bool ParseOptions(const LocationRecorder& parent_location,
362                    const int optionsFieldNumber,
363                    Message* mutable_options);
364
365  // Parse "required", "optional", or "repeated" and fill in "label"
366  // with the value.
367  bool ParseLabel(FieldDescriptorProto::Label* label);
368
369  // Parse a type name and fill in "type" (if it is a primitive) or
370  // "type_name" (if it is not) with the type parsed.
371  bool ParseType(FieldDescriptorProto::Type* type,
372                 string* type_name);
373  // Parse a user-defined type and fill in "type_name" with the name.
374  // If a primitive type is named, it is treated as an error.
375  bool ParseUserDefinedType(string* type_name);
376
377  // Parses field options, i.e. the stuff in square brackets at the end
378  // of a field definition.  Also parses default value.
379  bool ParseFieldOptions(FieldDescriptorProto* field,
380                         const LocationRecorder& field_location);
381
382  // Parse the "default" option.  This needs special handling because its
383  // type is the field's type.
384  bool ParseDefaultAssignment(FieldDescriptorProto* field,
385                              const LocationRecorder& field_location);
386
387  enum OptionStyle {
388    OPTION_ASSIGNMENT,  // just "name = value"
389    OPTION_STATEMENT    // "option name = value;"
390  };
391
392  // Parse a single option name/value pair, e.g. "ctype = CORD".  The name
393  // identifies a field of the given Message, and the value of that field
394  // is set to the parsed value.
395  bool ParseOption(Message* options,
396                   const LocationRecorder& options_location,
397                   OptionStyle style);
398
399  // Parses a single part of a multipart option name. A multipart name consists
400  // of names separated by dots. Each name is either an identifier or a series
401  // of identifiers separated by dots and enclosed in parentheses. E.g.,
402  // "foo.(bar.baz).qux".
403  bool ParseOptionNamePart(UninterpretedOption* uninterpreted_option,
404                           const LocationRecorder& part_location);
405
406  // Parses a string surrounded by balanced braces.  Strips off the outer
407  // braces and stores the enclosed string in *value.
408  // E.g.,
409  //     { foo }                     *value gets 'foo'
410  //     { foo { bar: box } }        *value gets 'foo { bar: box }'
411  //     {}                          *value gets ''
412  //
413  // REQUIRES: LookingAt("{")
414  // When finished successfully, we are looking at the first token past
415  // the ending brace.
416  bool ParseUninterpretedBlock(string* value);
417
418  // =================================================================
419
420  io::Tokenizer* input_;
421  io::ErrorCollector* error_collector_;
422  SourceCodeInfo* source_code_info_;
423  SourceLocationTable* source_location_table_;  // legacy
424  bool had_errors_;
425  bool require_syntax_identifier_;
426  bool stop_after_syntax_identifier_;
427  string syntax_identifier_;
428
429  // Leading doc comments for the next declaration.  These are not complete
430  // yet; use ConsumeEndOfDeclaration() to get the complete comments.
431  string upcoming_doc_comments_;
432
433  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Parser);
434};
435
436// A table mapping (descriptor, ErrorLocation) pairs -- as reported by
437// DescriptorPool when validating descriptors -- to line and column numbers
438// within the original source code.
439//
440// This is semi-obsolete:  FileDescriptorProto.source_code_info now contains
441// far more complete information about source locations.  However, as of this
442// writing you still need to use SourceLocationTable when integrating with
443// DescriptorPool.
444class LIBPROTOBUF_EXPORT SourceLocationTable {
445 public:
446  SourceLocationTable();
447  ~SourceLocationTable();
448
449  // Finds the precise location of the given error and fills in *line and
450  // *column with the line and column numbers.  If not found, sets *line to
451  // -1 and *column to 0 (since line = -1 is used to mean "error has no exact
452  // location" in the ErrorCollector interface).  Returns true if found, false
453  // otherwise.
454  bool Find(const Message* descriptor,
455            DescriptorPool::ErrorCollector::ErrorLocation location,
456            int* line, int* column) const;
457
458  // Adds a location to the table.
459  void Add(const Message* descriptor,
460           DescriptorPool::ErrorCollector::ErrorLocation location,
461           int line, int column);
462
463  // Clears the contents of the table.
464  void Clear();
465
466 private:
467  typedef map<
468    pair<const Message*, DescriptorPool::ErrorCollector::ErrorLocation>,
469    pair<int, int> > LocationMap;
470  LocationMap location_map_;
471};
472
473}  // namespace compiler
474}  // namespace protobuf
475
476}  // namespace google
477#endif  // GOOGLE_PROTOBUF_COMPILER_PARSER_H__
478