1// Copyright (c) 2005, Google Inc.
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are
6// met:
7//
8//     * Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10//     * Redistributions in binary form must reproduce the above
11// copyright notice, this list of conditions and the following disclaimer
12// in the documentation and/or other materials provided with the
13// distribution.
14//     * Neither the name of Google Inc. nor the names of its
15// contributors may be used to endorse or promote products derived from
16// this software without specific prior written permission.
17//
18// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29//
30// Author: Sanjay Ghemawat
31//
32// Regular-expression based scanner for parsing an input stream.
33//
34// Example 1: parse a sequence of "var = number" entries from input:
35//
36//      Scanner scanner(input);
37//      string var;
38//      int number;
39//      scanner.SetSkipExpression("\\s+"); // Skip any white space we encounter
40//      while (scanner.Consume("(\\w+) = (\\d+)", &var, &number)) {
41//        ...;
42//      }
43
44#ifndef _PCRE_SCANNER_H
45#define _PCRE_SCANNER_H
46
47#include <assert.h>
48#include <string>
49#include <vector>
50
51#include <pcrecpp.h>
52#include <pcre_stringpiece.h>
53
54namespace pcrecpp {
55
56class PCRECPP_EXP_DEFN Scanner {
57 public:
58  Scanner();
59  explicit Scanner(const std::string& input);
60  ~Scanner();
61
62  // Return current line number.  The returned line-number is
63  // one-based.  I.e. it returns 1 + the number of consumed newlines.
64  //
65  // Note: this method may be slow.  It may take time proportional to
66  // the size of the input.
67  int LineNumber() const;
68
69  // Return the byte-offset that the scanner is looking in the
70  // input data;
71  int Offset() const;
72
73  // Return true iff the start of the remaining input matches "re"
74  bool LookingAt(const RE& re) const;
75
76  // Return true iff all of the following are true
77  //    a. the start of the remaining input matches "re",
78  //    b. if any arguments are supplied, matched sub-patterns can be
79  //       parsed and stored into the arguments.
80  // If it returns true, it skips over the matched input and any
81  // following input that matches the "skip" regular expression.
82  bool Consume(const RE& re,
83               const Arg& arg0 = RE::no_arg,
84               const Arg& arg1 = RE::no_arg,
85               const Arg& arg2 = RE::no_arg
86               // TODO: Allow more arguments?
87               );
88
89  // Set the "skip" regular expression.  If after consuming some data,
90  // a prefix of the input matches this RE, it is automatically
91  // skipped.  For example, a programming language scanner would use
92  // a skip RE that matches white space and comments.
93  //
94  //    scanner.SetSkipExpression("\\s+|//.*|/[*](.|\n)*?[*]/");
95  //
96  // Skipping repeats as long as it succeeds.  We used to let people do
97  // this by writing "(...)*" in the regular expression, but that added
98  // up to lots of recursive calls within the pcre library, so now we
99  // control repetition explicitly via the function call API.
100  //
101  // You can pass NULL for "re" if you do not want any data to be skipped.
102  void Skip(const char* re);   // DEPRECATED; does *not* repeat
103  void SetSkipExpression(const char* re);
104
105  // Temporarily pause "skip"ing. This
106  //   Skip("Foo"); code ; DisableSkip(); code; EnableSkip()
107  // is similar to
108  //   Skip("Foo"); code ; Skip(NULL); code ; Skip("Foo");
109  // but avoids creating/deleting new RE objects.
110  void DisableSkip();
111
112  // Reenable previously paused skipping.  Any prefix of the input
113  // that matches the skip pattern is immediately dropped.
114  void EnableSkip();
115
116  /***** Special wrappers around SetSkip() for some common idioms *****/
117
118  // Arranges to skip whitespace, C comments, C++ comments.
119  // The overall RE is a disjunction of the following REs:
120  //    \\s                     whitespace
121  //    //.*\n                  C++ comment
122  //    /[*](.|\n)*?[*]/        C comment (x*? means minimal repetitions of x)
123  // We get repetition via the semantics of SetSkipExpression, not by using *
124  void SkipCXXComments() {
125    SetSkipExpression("\\s|//.*\n|/[*](?:\n|.)*?[*]/");
126  }
127
128  void set_save_comments(bool comments) {
129    save_comments_ = comments;
130  }
131
132  bool save_comments() {
133    return save_comments_;
134  }
135
136  // Append to vector ranges the comments found in the
137  // byte range [start,end] (inclusive) of the input data.
138  // Only comments that were extracted entirely within that
139  // range are returned: no range splitting of atomically-extracted
140  // comments is performed.
141  void GetComments(int start, int end, std::vector<StringPiece> *ranges);
142
143  // Append to vector ranges the comments added
144  // since the last time this was called. This
145  // functionality is provided for efficiency when
146  // interleaving scanning with parsing.
147  void GetNextComments(std::vector<StringPiece> *ranges);
148
149 private:
150  std::string   data_;          // All the input data
151  StringPiece   input_;         // Unprocessed input
152  RE*           skip_;          // If non-NULL, RE for skipping input
153  bool          should_skip_;   // If true, use skip_
154  bool          skip_repeat_;   // If true, repeat skip_ as long as it works
155  bool          save_comments_; // If true, aggregate the skip expression
156
157  // the skipped comments
158  // TODO: later consider requiring that the StringPieces be added
159  // in order by their start position
160  std::vector<StringPiece> *comments_;
161
162  // the offset into comments_ that has been returned by GetNextComments
163  int           comments_offset_;
164
165  // helper function to consume *skip_ and honour
166  // save_comments_
167  void ConsumeSkip();
168};
169
170}   // namespace pcrecpp
171
172#endif /* _PCRE_SCANNER_H */
173