1// Copyright (c) 2005, Google Inc. 2// All rights reserved. 3// 4// Redistribution and use in source and binary forms, with or without 5// modification, are permitted provided that the following conditions are 6// met: 7// 8// * Redistributions of source code must retain the above copyright 9// notice, this list of conditions and the following disclaimer. 10// * Redistributions in binary form must reproduce the above 11// copyright notice, this list of conditions and the following disclaimer 12// in the documentation and/or other materials provided with the 13// distribution. 14// * Neither the name of Google Inc. nor the names of its 15// contributors may be used to endorse or promote products derived from 16// this software without specific prior written permission. 17// 18// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29// 30// Author: Sanjay Ghemawat 31// 32// Regular-expression based scanner for parsing an input stream. 33// 34// Example 1: parse a sequence of "var = number" entries from input: 35// 36// Scanner scanner(input); 37// string var; 38// int number; 39// scanner.SetSkipExpression("\\s+"); // Skip any white space we encounter 40// while (scanner.Consume("(\\w+) = (\\d+)", &var, &number)) { 41// ...; 42// } 43 44#ifndef _PCRE_SCANNER_H 45#define _PCRE_SCANNER_H 46 47#include <assert.h> 48#include <string> 49#include <vector> 50 51#include <pcrecpp.h> 52#include <pcre_stringpiece.h> 53 54namespace pcrecpp { 55 56class PCRECPP_EXP_DEFN Scanner { 57 public: 58 Scanner(); 59 explicit Scanner(const std::string& input); 60 ~Scanner(); 61 62 // Return current line number. The returned line-number is 63 // one-based. I.e. it returns 1 + the number of consumed newlines. 64 // 65 // Note: this method may be slow. It may take time proportional to 66 // the size of the input. 67 int LineNumber() const; 68 69 // Return the byte-offset that the scanner is looking in the 70 // input data; 71 int Offset() const; 72 73 // Return true iff the start of the remaining input matches "re" 74 bool LookingAt(const RE& re) const; 75 76 // Return true iff all of the following are true 77 // a. the start of the remaining input matches "re", 78 // b. if any arguments are supplied, matched sub-patterns can be 79 // parsed and stored into the arguments. 80 // If it returns true, it skips over the matched input and any 81 // following input that matches the "skip" regular expression. 82 bool Consume(const RE& re, 83 const Arg& arg0 = RE::no_arg, 84 const Arg& arg1 = RE::no_arg, 85 const Arg& arg2 = RE::no_arg 86 // TODO: Allow more arguments? 87 ); 88 89 // Set the "skip" regular expression. If after consuming some data, 90 // a prefix of the input matches this RE, it is automatically 91 // skipped. For example, a programming language scanner would use 92 // a skip RE that matches white space and comments. 93 // 94 // scanner.SetSkipExpression("\\s+|//.*|/[*](.|\n)*?[*]/"); 95 // 96 // Skipping repeats as long as it succeeds. We used to let people do 97 // this by writing "(...)*" in the regular expression, but that added 98 // up to lots of recursive calls within the pcre library, so now we 99 // control repetition explicitly via the function call API. 100 // 101 // You can pass NULL for "re" if you do not want any data to be skipped. 102 void Skip(const char* re); // DEPRECATED; does *not* repeat 103 void SetSkipExpression(const char* re); 104 105 // Temporarily pause "skip"ing. This 106 // Skip("Foo"); code ; DisableSkip(); code; EnableSkip() 107 // is similar to 108 // Skip("Foo"); code ; Skip(NULL); code ; Skip("Foo"); 109 // but avoids creating/deleting new RE objects. 110 void DisableSkip(); 111 112 // Reenable previously paused skipping. Any prefix of the input 113 // that matches the skip pattern is immediately dropped. 114 void EnableSkip(); 115 116 /***** Special wrappers around SetSkip() for some common idioms *****/ 117 118 // Arranges to skip whitespace, C comments, C++ comments. 119 // The overall RE is a disjunction of the following REs: 120 // \\s whitespace 121 // //.*\n C++ comment 122 // /[*](.|\n)*?[*]/ C comment (x*? means minimal repetitions of x) 123 // We get repetition via the semantics of SetSkipExpression, not by using * 124 void SkipCXXComments() { 125 SetSkipExpression("\\s|//.*\n|/[*](?:\n|.)*?[*]/"); 126 } 127 128 void set_save_comments(bool comments) { 129 save_comments_ = comments; 130 } 131 132 bool save_comments() { 133 return save_comments_; 134 } 135 136 // Append to vector ranges the comments found in the 137 // byte range [start,end] (inclusive) of the input data. 138 // Only comments that were extracted entirely within that 139 // range are returned: no range splitting of atomically-extracted 140 // comments is performed. 141 void GetComments(int start, int end, std::vector<StringPiece> *ranges); 142 143 // Append to vector ranges the comments added 144 // since the last time this was called. This 145 // functionality is provided for efficiency when 146 // interleaving scanning with parsing. 147 void GetNextComments(std::vector<StringPiece> *ranges); 148 149 private: 150 std::string data_; // All the input data 151 StringPiece input_; // Unprocessed input 152 RE* skip_; // If non-NULL, RE for skipping input 153 bool should_skip_; // If true, use skip_ 154 bool skip_repeat_; // If true, repeat skip_ as long as it works 155 bool save_comments_; // If true, aggregate the skip expression 156 157 // the skipped comments 158 // TODO: later consider requiring that the StringPieces be added 159 // in order by their start position 160 std::vector<StringPiece> *comments_; 161 162 // the offset into comments_ that has been returned by GetNextComments 163 int comments_offset_; 164 165 // helper function to consume *skip_ and honour 166 // save_comments_ 167 void ConsumeSkip(); 168}; 169 170} // namespace pcrecpp 171 172#endif /* _PCRE_SCANNER_H */ 173