15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright 2009 The RE2 Authors.  All Rights Reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// license that can be found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// It provides a prefilter mechanism that helps in cutting down the
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// number of regexps that need to be actually searched.
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// By design, it does not include a string matching engine. This is to
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// allow the user of the class to use their favorite string match
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// engine. The overall flow is: Add all the regexps using Add, then
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Compile the FilteredRE2. The compile returns strings that need to
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// be matched. Note that all returned strings are lowercase. For
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// applying regexps to a search text, the caller does the string
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// matching using the strings returned. When doing the string match,
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// note that the caller has to do that on lower cased version of the
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// search text. Then call FirstMatch or AllMatches with a vector of
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// indices of strings that were found in the text to get the actual
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// regexp matches.
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef RE2_FILTERED_RE2_H_
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define RE2_FILTERED_RE2_H_
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <vector>
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/re2.h"
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace re2 {
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)using std::vector;
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class PrefilterTree;
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class FilteredRE2 {
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FilteredRE2();
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ~FilteredRE2();
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Uses RE2 constructor to create a RE2 object (re). Returns
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // re->error_code(). If error_code is other than NoError, then re is
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // deleted and not added to re2_vec_.
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  RE2::ErrorCode Add(const StringPiece& pattern,
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                     const RE2::Options& options,
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                     int *id);
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Prepares the regexps added by Add for filtering.  Returns a set
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // of strings that the caller should check for in candidate texts.
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The returned strings are lowercased. When doing string matching,
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // the search text should be lowercased first to find matching
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // strings from the set of strings returned by Compile.  Call after
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // all Add calls are done.
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void Compile(vector<string>* strings_to_match);
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns the index of the first matching regexp.
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns -1 on no match. Can be called prior to Compile.
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Does not do any filtering: simply tries to Match the
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // regexps in a loop.
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int SlowFirstMatch(const StringPiece& text) const;
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns the index of the first matching regexp.
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns -1 on no match. Compile has to be called before
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // calling this.
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int FirstMatch(const StringPiece& text,
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                 const vector<int>& atoms) const;
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Returns the indices of all matching regexps, after first clearing
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // matched_regexps.
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool AllMatches(const StringPiece& text,
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                  const vector<int>& atoms,
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                  vector<int>* matching_regexps) const;
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The number of regexps added.
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int NumRegexps() const { return re2_vec_.size(); }
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Get the individual RE2 objects. Useful for testing.
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  RE2* GetRE2(int regexpid) const { return re2_vec_[regexpid]; }
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Print prefilter.
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void PrintPrefilter(int regexpid);
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Useful for testing and debugging.
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void RegexpsGivenStrings(const vector<int>& matched_atoms,
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                           vector<int>* passed_regexps);
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // All the regexps in the FilteredRE2.
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  vector<RE2*> re2_vec_;
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Has the FilteredRE2 been compiled using Compile()
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool compiled_;
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // An AND-OR tree of string atoms used for filtering regexps.
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  PrefilterTree* prefilter_tree_;
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //DISALLOW_EVIL_CONSTRUCTORS(FilteredRE2);
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FilteredRE2(const FilteredRE2&);
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void operator=(const FilteredRE2&);
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace re2
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif  // RE2_FILTERED_RE2_H_
102