1// Copyright (c) 2005, Google Inc.
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are
6// met:
7//
8//     * Redistributions of source code must retain the above copyright
9// notice, this list of conditions and the following disclaimer.
10//     * Redistributions in binary form must reproduce the above
11// copyright notice, this list of conditions and the following disclaimer
12// in the documentation and/or other materials provided with the
13// distribution.
14//     * Neither the name of Google Inc. nor the names of its
15// contributors may be used to endorse or promote products derived from
16// this software without specific prior written permission.
17//
18// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29//
30// Author: Sanjay Ghemawat
31//
32// Regular-expression based scanner for parsing an input stream.
33//
34// Example 1: parse a sequence of "var = number" entries from input:
35//
36//      Scanner scanner(input);
37//      string var;
38//      int number;
39//      scanner.SetSkipExpression("\\s+"); // Skip any white space we encounter
40//      while (scanner.Consume("(\\w+) = (\\d+)", &var, &number)) {
41//        ...;
42//      }
43
44#ifndef _PCRE_SCANNER_H
45#define _PCRE_SCANNER_H
46
#include <assert.h>
#include <string>
#include <utility>
#include <vector>

#include <pcrecpp.h>
#include <pcre_stringpiece.h>
53
54namespace pcrecpp {
55
56class Scanner {
57 public:
58  Scanner();
59  explicit Scanner(const std::string& input);
60  ~Scanner();
61
62  // Return current line number.  The returned line-number is
63  // one-based.  I.e. it returns 1 + the number of consumed newlines.
64  //
65  // Note: this method may be slow.  It may take time proportional to
66  // the size of the input.
67  int LineNumber() const;
68
69  // Return the byte-offset that the scanner is looking in the
70  // input data;
71  int Offset() const;
72
73  // Return true iff the start of the remaining input matches "re"
74  bool LookingAt(const RE& re) const;
75
76  // Return true iff all of the following are true
77  //    a. the start of the remaining input matches "re",
78  //    b. if any arguments are supplied, matched sub-patterns can be
79  //       parsed and stored into the arguments.
80  // If it returns true, it skips over the matched input and any
81  // following input that matches the "skip" regular expression.
82  template<typename ... ARGS>
83  bool Consume(const RE& re, ARGS && ... args) {
84    const bool result = re.Consume(&input_, args...);
85    if (result && should_skip_)
86      ConsumeSkip();
87    return result;
88  }
89
90  // Set the "skip" regular expression.  If after consuming some data,
91  // a prefix of the input matches this RE, it is automatically
92  // skipped.  For example, a programming language scanner would use
93  // a skip RE that matches white space and comments.
94  //
95  //    scanner.SetSkipExpression("\\s+|//.*|/[*](.|\n)*?[*]/");
96  //
97  // Skipping repeats as long as it succeeds.  We used to let people do
98  // this by writing "(...)*" in the regular expression, but that added
99  // up to lots of recursive calls within the pcre library, so now we
100  // control repetition explicitly via the function call API.
101  //
102  // You can pass NULL for "re" if you do not want any data to be skipped.
103  void Skip(const char* re);   // DEPRECATED; does *not* repeat
104  void SetSkipExpression(const char* re);
105
106  // Temporarily pause "skip"ing. This
107  //   Skip("Foo"); code ; DisableSkip(); code; EnableSkip()
108  // is similar to
109  //   Skip("Foo"); code ; Skip(NULL); code ; Skip("Foo");
110  // but avoids creating/deleting new RE objects.
111  void DisableSkip();
112
113  // Reenable previously paused skipping.  Any prefix of the input
114  // that matches the skip pattern is immediately dropped.
115  void EnableSkip();
116
117  /***** Special wrappers around SetSkip() for some common idioms *****/
118
119  // Arranges to skip whitespace, C comments, C++ comments.
120  // The overall RE is a disjunction of the following REs:
121  //    \\s                     whitespace
122  //    //.*\n                  C++ comment
123  //    /[*](.|\n)*?[*]/        C comment (x*? means minimal repetitions of x)
124  // We get repetition via the semantics of SetSkipExpression, not by using *
125  void SkipCXXComments() {
126    SetSkipExpression("\\s|//.*\n|/[*](?:\n|.)*?[*]/");
127  }
128
129  void set_save_comments(bool comments) {
130    save_comments_ = comments;
131  }
132
133  bool save_comments() {
134    return save_comments_;
135  }
136
137  // Append to vector ranges the comments found in the
138  // byte range [start,end] (inclusive) of the input data.
139  // Only comments that were extracted entirely within that
140  // range are returned: no range splitting of atomically-extracted
141  // comments is performed.
142  void GetComments(int start, int end, std::vector<StringPiece> *ranges);
143
144  // Append to vector ranges the comments added
145  // since the last time this was called. This
146  // functionality is provided for efficiency when
147  // interleaving scanning with parsing.
148  void GetNextComments(std::vector<StringPiece> *ranges);
149
150 private:
151  std::string   data_;          // All the input data
152  StringPiece   input_;         // Unprocessed input
153  RE*           skip_;          // If non-NULL, RE for skipping input
154  bool          should_skip_;   // If true, use skip_
155  bool          skip_repeat_;   // If true, repeat skip_ as long as it works
156  bool          save_comments_; // If true, aggregate the skip expression
157
158  // the skipped comments
159  // TODO: later consider requiring that the StringPieces be added
160  // in order by their start position
161  std::vector<StringPiece> *comments_;
162
163  // the offset into comments_ that has been returned by GetNextComments
164  int           comments_offset_;
165
166  // helper function to consume *skip_ and honour
167  // save_comments_
168  void ConsumeSkip();
169};
170
171}   // namespace pcrecpp
172
173#endif /* _PCRE_SCANNER_H */
174