15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright 2009 The RE2 Authors. All Rights Reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// license that can be found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps. 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// It provides a prefilter mechanism that helps in cutting down the 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// number of regexps that need to be actually searched. 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// By design, it does not include a string matching engine. This is to 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// allow the user of the class to use their favorite string match 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// engine. The overall flow is: Add all the regexps using Add, then 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Compile the FilteredRE2. The compile returns strings that need to 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// be matched. Note that all returned strings are lowercase. For 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// applying regexps to a search text, the caller does the string 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// matching using the strings returned. When doing the string match, 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// note that the caller has to do that on lower cased version of the 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// search text. Then call FirstMatch or AllMatches with a vector of 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// indices of strings that were found in the text to get the actual 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// regexp matches. 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef RE2_FILTERED_RE2_H_ 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define RE2_FILTERED_RE2_H_ 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <vector> 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/re2.h" 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace re2 { 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)using std::vector; 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class PrefilterTree; 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class FilteredRE2 { 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FilteredRE2(); 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ~FilteredRE2(); 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Uses RE2 constructor to create a RE2 object (re). Returns 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // re->error_code(). If error_code is other than NoError, then re is 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // deleted and not added to re2_vec_. 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RE2::ErrorCode Add(const StringPiece& pattern, 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const RE2::Options& options, 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int *id); 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Prepares the regexps added by Add for filtering. Returns a set 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // of strings that the caller should check for in candidate texts. 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The returned strings are lowercased. When doing string matching, 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the search text should be lowercased first to find matching 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // strings from the set of strings returned by Compile. Call after 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // all Add calls are done. 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void Compile(vector<string>* strings_to_match); 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns the index of the first matching regexp. 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns -1 on no match. Can be called prior to Compile. 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Does not do any filtering: simply tries to Match the 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // regexps in a loop. 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int SlowFirstMatch(const StringPiece& text) const; 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns the index of the first matching regexp. 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns -1 on no match. Compile has to be called before 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // calling this. 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int FirstMatch(const StringPiece& text, 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const vector<int>& atoms) const; 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns the indices of all matching regexps, after first clearing 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // matched_regexps. 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool AllMatches(const StringPiece& text, 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const vector<int>& atoms, 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) vector<int>* matching_regexps) const; 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The number of regexps added. 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int NumRegexps() const { return re2_vec_.size(); } 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Get the individual RE2 objects. Useful for testing. 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RE2* GetRE2(int regexpid) const { return re2_vec_[regexpid]; } 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Print prefilter. 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void PrintPrefilter(int regexpid); 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Useful for testing and debugging. 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void RegexpsGivenStrings(const vector<int>& matched_atoms, 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) vector<int>* passed_regexps); 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // All the regexps in the FilteredRE2. 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) vector<RE2*> re2_vec_; 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Has the FilteredRE2 been compiled using Compile() 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool compiled_; 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // An AND-OR tree of string atoms used for filtering regexps. 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) PrefilterTree* prefilter_tree_; 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) //DISALLOW_EVIL_CONSTRUCTORS(FilteredRE2); 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FilteredRE2(const FilteredRE2&); 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void operator=(const FilteredRE2&); 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace re2 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif // RE2_FILTERED_RE2_H_ 102