dfa.cc revision 0d4c52358a1af421705c54bd8a9fdd8a30558a2e
12ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Copyright 2008 The RE2 Authors. All Rights Reserved. 22ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Use of this source code is governed by a BSD-style 32ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// license that can be found in the LICENSE file. 42ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 52ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// A DFA (deterministic finite automaton)-based regular expression search. 62ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 72ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The DFA search has two main parts: the construction of the automaton, 82ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// which is represented by a graph of State structures, and the execution 92ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// of the automaton over a given input string. 102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The basic idea is that the State graph is constructed so that the 122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// execution can simply start with a state s, and then for each byte c in 132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// the input string, execute "s = s->next[c]", checking at each point whether 142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// the current s represents a matching state. 152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The simple explanation just given does convey the essence of this code, 172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// but it omits the details of how the State graph gets constructed as well 182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// as some performance-driven optimizations to the execution of the automaton. 192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// All these details are explained in the comments for the code following 202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// the definition of class DFA. 212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// See http://swtch.com/~rsc/regexp/ for a very bare-bones equivalent. 232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/prog.h" 252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "re2/stringpiece.h" 262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "util/atomicops.h" 272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "util/flags.h" 282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#include "util/sparse_set.h" 292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonDEFINE_bool(re2_dfa_bail_when_slow, true, 312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson "Whether the RE2 DFA should bail out early " 322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson "if the NFA would be faster (for testing)."); 332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonnamespace re2 { 352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#if !defined(__linux__) /* only Linux seems to have memrchr */ 372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic void* memrchr(const void* s, int c, size_t n) { 382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const unsigned char* p = (const unsigned char*)s; 392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (p += n; n > 0; n--) 402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (*--p == c) 412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return (void*)p; 422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#endif 462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Changing this to true compiles in prints that trace execution of the DFA. 482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Generates a lot of output -- only useful for debugging. 492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic const bool DebugDFA = false; 502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// A DFA implementation of a regular expression program. 522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Since this is entirely a forward declaration mandated by C++, 532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// some of the comments here are better understood after reading 542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// the comments in the sections that follow the DFA definition. 552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass DFA { 562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson public: 572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem); 582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ~DFA(); 592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool ok() const { return !init_failed_; } 602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog::MatchKind kind() { return kind_; } 612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Searches for the regular expression in text, which is considered 632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // as a subsection of context for the purposes of interpreting flags 642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // like ^ and $ and \A and \z. 652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Returns whether a match was found. 662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If a match is found, sets *ep to the end point of the best match in text. 672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If "anchored", the match must begin at the start of text. 682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If "want_earliest_match", the match that ends first is used, not 692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // necessarily the best one. 702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If "run_forward" is true, the DFA runs from text.begin() to text.end(). 712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If it is false, the DFA runs from text.end() to text.begin(), 722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // returning the leftmost end of the match instead of the rightmost one. 732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If the DFA cannot complete the search (for example, if it is out of 742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // memory), it sets *failed and returns false. 752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool Search(const StringPiece& text, const StringPiece& context, 762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool anchored, bool want_earliest_match, bool run_forward, 772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool* failed, const char** ep, vector<int>* matches); 782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Builds out all states for the entire DFA. FOR TESTING ONLY 802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Returns number of states. 812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int BuildAllStates(); 822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Computes min and max for matching strings. Won't return strings 842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // bigger than maxlen. 852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool PossibleMatchRange(string* min, string* max, int maxlen); 862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // These data structures are logically private, but C++ makes it too 882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // difficult to mark them as such. 892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson class Workq; 902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson class RWLocker; 912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson class StateSaver; 922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // A single DFA state. The DFA is represented as a graph of these 942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // States, linked by the next_ pointers. If in state s and reading 952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // byte c, the next state should be s->next_[c]. 962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson struct State { 972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inline bool IsMatch() const { return flag_ & kFlagMatch; } 982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void SaveMatch(vector<int>* v); 992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int* inst_; // Instruction pointers in the state. 1012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int ninst_; // # of inst_ pointers. 1022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint flag_; // Empty string bitfield flags in effect on the way 1032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // into this state, along with kFlagMatch if this 1042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // is a matching state. 1052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson State** next_; // Outgoing arrows from State, 1062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // one per input byte class 1072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson }; 1082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson enum { 1102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kByteEndText = 256, // imaginary byte at end of text 1112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kFlagEmptyMask = 0xFFF, // State.flag_: bits holding kEmptyXXX flags 1132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kFlagMatch = 0x1000, // State.flag_: this is a matching state 1142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kFlagLastWord = 0x2000, // State.flag_: last byte was a word char 1152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kFlagNeedShift = 16, // needed kEmpty bits are or'ed in shifted left 1162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson }; 1172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1180d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#ifndef STL_MSVC 1192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // STL function structures for use with unordered_set. 1202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson struct StateEqual { 1212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool operator()(const State* a, const State* b) const { 1222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (a == b) 1232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 1242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (a == NULL || b == NULL) 1252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 1262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (a->ninst_ != b->ninst_) 1272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 1282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (a->flag_ != b->flag_) 1292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 1302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < a->ninst_; i++) 1312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (a->inst_[i] != b->inst_[i]) 1322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 1332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; // they're equal 1342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson }; 1360d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#endif // STL_MSVC 1372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson struct StateHash { 1382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson size_t operator()(const State* a) const { 1392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (a == NULL) 1402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return 0; 1412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char* s = reinterpret_cast<const char*>(a->inst_); 1422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int len = a->ninst_ * sizeof a->inst_[0]; 1432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (sizeof(size_t) == sizeof(uint32)) 1442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Hash32StringWithSeed(s, len, a->flag_); 1452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 1462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return Hash64StringWithSeed(s, len, a->flag_); 1472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 1480d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#ifdef STL_MSVC 1490d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // Less than operator. 1500d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin bool operator()(const State* a, const State* b) const { 1510d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (a == b) 1520d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin return false; 1530d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (a == NULL || b == NULL) 1540d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin return a == NULL; 1550d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (a->ninst_ != b->ninst_) 1560d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin return a->ninst_ < b->ninst_; 1570d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (a->flag_ != b->flag_) 1580d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin return a->flag_ < b->flag_; 1590d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin for (int i = 0; i < a->ninst_; ++i) 1600d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (a->inst_[i] != b->inst_[i]) 1610d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin return a->inst_[i] < b->inst_[i]; 1620d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin return false; // they're equal 1630d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } 1640d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // The two public members are required by msvc. 4 and 8 are default values. 1650d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // Reference: http://msdn.microsoft.com/en-us/library/1s1byw77.aspx 1660d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin static const size_t bucket_size = 4; 1670d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin static const size_t min_buckets = 8; 1680d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#endif // STL_MSVC 1692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson }; 1702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1710d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#ifdef STL_MSVC 1720d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin typedef unordered_set<State*, StateHash> StateSet; 1730d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#else // !STL_MSVC 1742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson typedef unordered_set<State*, StateHash, StateEqual> StateSet; 1750d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin#endif // STL_MSVC 1762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson private: 1792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Special "firstbyte" values for a state. (Values >= 0 denote actual bytes.) 1802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson enum { 1812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kFbUnknown = -1, // No analysis has been performed. 1822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kFbMany = -2, // Many bytes will lead out of this state. 1832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kFbNone = -3, // No bytes lead out of this state. 1842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson }; 1852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson enum { 1872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Indices into start_ for unanchored searches. 1882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Add kStartAnchored for anchored searches. 1892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kStartBeginText = 0, // text at beginning of context 1902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kStartBeginLine = 2, // text at beginning of line 1912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kStartAfterWordChar = 4, // text follows a word character 1922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kStartAfterNonWordChar = 6, // text follows non-word character 1932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kMaxStart = 8, 1942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kStartAnchored = 1, 1962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson }; 1972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Resets the DFA State cache, flushing all saved State* information. 1992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Releases and reacquires cache_mutex_ via cache_lock, so any 2002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // State* existing before the call are not valid after the call. 2012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Use a StateSaver to preserve important states across the call. 2022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // cache_mutex_.r <= L < mutex_ 2032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // After: cache_mutex_.w <= L < mutex_ 2042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void ResetCache(RWLocker* cache_lock); 2052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Looks up and returns the State corresponding to a Workq. 2072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // L >= mutex_ 2082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson State* WorkqToCachedState(Workq* q, uint flag); 2092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Looks up and returns a State matching the inst, ninst, and flag. 2112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // L >= mutex_ 2122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson State* CachedState(int* inst, int ninst, uint flag); 2132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Clear the cache entirely. 2152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Must hold cache_mutex_.w or be in destructor. 2162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void ClearCache(); 2172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Converts a State into a Workq: the opposite of WorkqToCachedState. 2192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // L >= mutex_ 2202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static void StateToWorkq(State* s, Workq* q); 2212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Runs a State on a given byte, returning the next state. 2232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson State* RunStateOnByteUnlocked(State*, int); // cache_mutex_.r <= L < mutex_ 2242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson State* RunStateOnByte(State*, int); // L >= mutex_ 2252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Runs a Workq on a given byte followed by a set of empty-string flags, 2272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // producing a new Workq in nq. If a match instruction is encountered, 2282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // sets *ismatch to true. 2292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // L >= mutex_ 2302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void RunWorkqOnByte(Workq* q, Workq* nq, 2312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int c, uint flag, bool* ismatch, 2322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog::MatchKind kind, 2332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int new_byte_loop); 2342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Runs a Workq on a set of empty-string flags, producing a new Workq in nq. 2362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // L >= mutex_ 2372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void RunWorkqOnEmptyString(Workq* q, Workq* nq, uint flag); 2382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Adds the instruction id to the Workq, following empty arrows 2402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // according to flag. 2412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // L >= mutex_ 2422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void AddToQueue(Workq* q, int id, uint flag); 2432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // For debugging, returns a text representation of State. 2452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static string DumpState(State* state); 2462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // For debugging, returns a text representation of a Workq. 2482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static string DumpWorkq(Workq* q); 2492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Search parameters 2512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson struct SearchParams { 2522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson SearchParams(const StringPiece& text, const StringPiece& context, 2532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RWLocker* cache_lock) 2542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson : text(text), context(context), 2552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson anchored(false), 2562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson want_earliest_match(false), 2572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson run_forward(false), 2582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson start(NULL), 2592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson firstbyte(kFbUnknown), 2602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson cache_lock(cache_lock), 2612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson failed(false), 2622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ep(NULL), 2632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson matches(NULL) { } 2642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece text; 2662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece context; 2672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool anchored; 2682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool want_earliest_match; 2692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool run_forward; 2702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson State* start; 2712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int firstbyte; 2722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RWLocker *cache_lock; 2732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool failed; // "out" parameter: whether search gave up 2742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char* ep; // "out" parameter: end pointer for match 2752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson vector<int>* matches; 2762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson private: 2782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DISALLOW_EVIL_CONSTRUCTORS(SearchParams); 2792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson }; 2802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Before each search, the parameters to Search are analyzed by 2822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // AnalyzeSearch to determine the state in which to start and the 2832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // "firstbyte" for that state, if any. 2842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson struct StartInfo { 2852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StartInfo() : start(NULL), firstbyte(kFbUnknown) { } 2862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson State* start; 2872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson volatile int firstbyte; 2882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson }; 2892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Fills in params->start and params->firstbyte using 2912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // the other search parameters. Returns true on success, 2922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // false on failure. 2932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // cache_mutex_.r <= L < mutex_ 2942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool AnalyzeSearch(SearchParams* params); 2952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool AnalyzeSearchHelper(SearchParams* params, StartInfo* info, uint flags); 2962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The generic search loop, inlined to create specialized versions. 2982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // cache_mutex_.r <= L < mutex_ 2992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Might unlock and relock cache_mutex_ via params->cache_lock. 3002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inline bool InlinedSearchLoop(SearchParams* params, 3012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool have_firstbyte, 3022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool want_earliest_match, 3032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool run_forward); 3042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The specialized versions of InlinedSearchLoop. The three letters 3062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // at the ends of the name denote the true/false values used as the 3072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // last three parameters of InlinedSearchLoop. 3082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // cache_mutex_.r <= L < mutex_ 3092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Might unlock and relock cache_mutex_ via params->cache_lock. 3102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool SearchFFF(SearchParams* params); 3112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool SearchFFT(SearchParams* params); 3122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool SearchFTF(SearchParams* params); 3132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool SearchFTT(SearchParams* params); 3142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool SearchTFF(SearchParams* params); 3152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool SearchTFT(SearchParams* params); 3162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool SearchTTF(SearchParams* params); 3172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool SearchTTT(SearchParams* params); 3182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The main search loop: calls an appropriate specialized version of 3202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // InlinedSearchLoop. 3212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // cache_mutex_.r <= L < mutex_ 3222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Might unlock and relock cache_mutex_ via params->cache_lock. 3232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool FastSearchLoop(SearchParams* params); 3242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // For debugging, a slow search loop that calls InlinedSearchLoop 3262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // directly -- because the booleans passed are not constants, the 3272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // loop is not specialized like the SearchFFF etc. versions, so it 3282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // runs much more slowly. Useful only for debugging. 3292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // cache_mutex_.r <= L < mutex_ 3302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Might unlock and relock cache_mutex_ via params->cache_lock. 3312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool SlowSearchLoop(SearchParams* params); 3322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Looks up bytes in bytemap_ but handles case c == kByteEndText too. 3342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int ByteMap(int c) { 3352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (c == kByteEndText) 3362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return prog_->bytemap_range(); 3372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return prog_->bytemap()[c]; 3382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Constant after initialization. 3412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog* prog_; // The regular expression program to run. 3422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog::MatchKind kind_; // The kind of DFA. 3432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int start_unanchored_; // start of unanchored program 3442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool init_failed_; // initialization failed (out of memory) 3452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Mutex mutex_; // mutex_ >= cache_mutex_.r 3472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Scratch areas, protected by mutex_. 3492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Workq* q0_; // Two pre-allocated work queues. 3502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Workq* q1_; 3512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int* astack_; // Pre-allocated stack for AddToQueue 3522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int nastack_; 3532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // State* cache. Many threads use and add to the cache simultaneously, 3552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // holding cache_mutex_ for reading and mutex_ (above) when adding. 3562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If the cache fills and needs to be discarded, the discarding is done 3572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // while holding cache_mutex_ for writing, to avoid interrupting other 3582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // readers. Any State* pointers are only valid while cache_mutex_ 3592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // is held. 3602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Mutex cache_mutex_; 3612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int64 mem_budget_; // Total memory budget for all States. 3622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int64 state_budget_; // Amount of memory remaining for new States. 3632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StateSet state_cache_; // All States computed so far. 3642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StartInfo start_[kMaxStart]; 3652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool cache_warned_; // have printed to LOG(INFO) about the cache 3662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}; 3672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Shorthand for casting to uint8*. 3692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic inline const uint8* BytePtr(const void* v) { 3702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return reinterpret_cast<const uint8*>(v); 3712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 3722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Work queues 3742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Marks separate thread groups of different priority 3762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// in the work queue when in leftmost-longest matching mode. 3772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#define Mark (-1) 3782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Internally, the DFA uses a sparse array of 3802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// program instruction pointers as a work queue. 3812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// In leftmost longest mode, marks separate sections 3822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// of workq that started executing at different 3832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// locations in the string (earlier locations first). 3842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass DFA::Workq : public SparseSet { 3852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson public: 3862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Constructor: n is number of normal slots, maxmark number of mark slots. 3872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Workq(int n, int maxmark) : 3882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson SparseSet(n+maxmark), 3892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson n_(n), 3902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson maxmark_(maxmark), 3912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nextmark_(n), 3922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson last_was_mark_(true) { 3932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 3942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool is_mark(int i) { return i >= n_; } 3962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int maxmark() { return maxmark_; } 3982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 3992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void clear() { 4002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson SparseSet::clear(); 4012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nextmark_ = n_; 4022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 4032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void mark() { 4052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (last_was_mark_) 4062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 4072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson last_was_mark_ = false; 4082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson SparseSet::insert_new(nextmark_++); 4092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 4102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int size() { 4122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return n_ + maxmark_; 4132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 4142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void insert(int id) { 4162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (contains(id)) 4172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 4182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson insert_new(id); 4192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 4202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void insert_new(int id) { 4222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson last_was_mark_ = false; 4232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson SparseSet::insert_new(id); 4242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 4252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson private: 4272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int n_; // size excluding marks 4282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int maxmark_; // maximum number of marks 4292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int nextmark_; // id of next mark 4302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool last_was_mark_; // last inserted was mark 4312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DISALLOW_EVIL_CONSTRUCTORS(Workq); 4322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}; 4332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonDFA::DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem) 4352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson : prog_(prog), 4362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kind_(kind), 4372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson init_failed_(false), 4382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson q0_(NULL), 4392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson q1_(NULL), 4402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson astack_(NULL), 4412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson mem_budget_(max_mem), 4422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson cache_warned_(false) { 4432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (DebugDFA) 4442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fprintf(stderr, "\nkind %d\n%s\n", (int)kind_, prog_->DumpUnanchored().c_str()); 4452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int nmark = 0; 4462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson start_unanchored_ = 0; 4472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (kind_ == Prog::kLongestMatch) { 4482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nmark = prog->size(); 4492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson start_unanchored_ = prog->start_unanchored(); 4502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 4512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson nastack_ = 2 * prog->size() + nmark; 4522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Account for space needed for DFA, q0, q1, astack. 4542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson mem_budget_ -= sizeof(DFA); 4552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson mem_budget_ -= (prog_->size() + nmark) * 4562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (sizeof(int)+sizeof(int)) * 2; // q0, q1 4572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson mem_budget_ -= nastack_ * sizeof(int); // astack 4582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (mem_budget_ < 0) { 4592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(INFO) << StringPrintf("DFA out of memory: prog size %lld mem %lld", 4602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson prog_->size(), max_mem); 4612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson init_failed_ = true; 4622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 4632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 4642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson state_budget_ = mem_budget_; 4662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Make sure there is a reasonable amount of working room left. 4682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // At minimum, the search requires room for two states in order 4692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // to limp along, restarting frequently. We'll get better performance 4702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // if there is room for a larger number of states, say 20. 4710d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin int64 one_state = sizeof(State) + (prog_->size()+nmark)*sizeof(int) + 4720d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin (prog_->bytemap_range()+1)*sizeof(State*); 4732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (state_budget_ < 20*one_state) { 4742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(INFO) << StringPrintf("DFA out of memory: prog size %lld mem %lld", 4752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson prog_->size(), max_mem); 4762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson init_failed_ = true; 4772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 4782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 4792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson q0_ = new Workq(prog->size(), nmark); 4812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson q1_ = new Workq(prog->size(), nmark); 4822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson astack_ = new int[nastack_]; 4832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 4842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonDFA::~DFA() { 4862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete q0_; 4872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete q1_; 4882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] astack_; 4892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ClearCache(); 4902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 4912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// In the DFA state graph, s->next[c] == NULL means that the 4932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// state has not yet been computed and needs to be. We need 4942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// a different special value to signal that s->next[c] is a 4952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// state that can never lead to a match (and thus the search 4962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// can be called off). Hence DeadState. 4972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#define DeadState reinterpret_cast<State*>(1) 4982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 4992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Signals that the rest of the string matches no matter what it is. 5002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#define FullMatchState reinterpret_cast<State*>(2) 5012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson#define SpecialStateMax FullMatchState 5032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Debugging printouts 5052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// For debugging, returns a string representation of the work queue. 5072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstring DFA::DumpWorkq(Workq* q) { 5082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson string s; 5092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char* sep = ""; 5102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (DFA::Workq::iterator it = q->begin(); it != q->end(); ++it) { 5112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (q->is_mark(*it)) { 5122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringAppendF(&s, "|"); 5132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sep = ""; 5142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 5152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringAppendF(&s, "%s%d", sep, *it); 5162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sep = ","; 5172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 5182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 5192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return s; 5202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 5212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// For debugging, returns a string representation of the state. 5232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstring DFA::DumpState(State* state) { 5242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (state == NULL) 5252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return "_"; 5262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (state == DeadState) 5272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return "X"; 5282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (state == FullMatchState) 5292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return "*"; 5302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson string s; 5312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char* sep = ""; 5322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringAppendF(&s, "(%p)", state); 5332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < state->ninst_; i++) { 5342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (state->inst_[i] == Mark) { 5352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringAppendF(&s, "|"); 5362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sep = ""; 5372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 5382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringAppendF(&s, "%s%d", sep, state->inst_[i]); 5392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sep = ","; 5402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 5412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 5422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringAppendF(&s, " flag=%#x", state->flag_); 5432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return s; 5442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 5452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson////////////////////////////////////////////////////////////////////// 5472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 5482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// DFA state graph construction. 5492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 5502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The DFA state graph is a heavily-linked collection of State* structures. 5512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The state_cache_ is a set of all the State structures ever allocated, 5522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// so that if the same state is reached by two different paths, 5532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// the same State structure can be used. This reduces allocation 5542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// requirements and also avoids duplication of effort across the two 5552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// identical states. 5562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 5572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// A State is defined by an ordered list of instruction ids and a flag word. 5582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 5592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The choice of an ordered list of instructions differs from a typical 5602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// textbook DFA implementation, which would use an unordered set. 5612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Textbook descriptions, however, only care about whether 5622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// the DFA matches, not where it matches in the text. To decide where the 5632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// DFA matches, we need to mimic the behavior of the dominant backtracking 5642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// implementations like PCRE, which try one possible regular expression 5652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// execution, then another, then another, stopping when one of them succeeds. 5662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The DFA execution tries these many executions in parallel, representing 5672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// each by an instruction id. These pointers are ordered in the State.inst_ 5682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// list in the same order that the executions would happen in a backtracking 5692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// search: if a match is found during execution of inst_[2], inst_[i] for i>=3 5702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// can be discarded. 5712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 5722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Textbooks also typically do not consider context-aware empty string operators 5732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// like ^ or $. These are handled by the flag word, which specifies the set 5742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// of empty-string operators that should be matched when executing at the 5752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// current text position. These flag bits are defined in prog.h. 5762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The flag word also contains two DFA-specific bits: kFlagMatch if the state 5772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// is a matching state (one that reached a kInstMatch in the program) 5782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// and kFlagLastWord if the last processed byte was a word character, for the 5792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// implementation of \B and \b. 5802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 5812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The flag word also contains, shifted up 16 bits, the bits looked for by 5822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// any kInstEmptyWidth instructions in the state. These provide a useful 5832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// summary indicating when new flags might be useful. 5842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 5852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The permanent representation of a State's instruction ids is just an array, 5862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// but while a state is being analyzed, these instruction ids are represented 5872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// as a Workq, which is an array that allows iteration in insertion order. 5882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 5892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// NOTE(rsc): The choice of State construction determines whether the DFA 5902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// mimics backtracking implementations (so-called leftmost first matching) or 5912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// traditional DFA implementations (so-called leftmost longest matching as 5922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// prescribed by POSIX). This implementation chooses to mimic the 5932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// backtracking implementations, because we want to replace PCRE. To get 5942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// POSIX behavior, the states would need to be considered not as a simple 5952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// ordered list of instruction ids, but as a list of unordered sets of instruction 5962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// ids. A match by a state in one set would inhibit the running of sets 5972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// farther down the list but not other instruction ids in the same set. Each 5982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// set would correspond to matches beginning at a given point in the string. 5992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// This is implemented by separating different sets with Mark pointers. 6002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Looks in the State cache for a State matching q, flag. 6022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// If one is found, returns it. If one is not found, allocates one, 6032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// inserts it in the cache, and returns it. 6042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonDFA::State* DFA::WorkqToCachedState(Workq* q, uint flag) { 6052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (DEBUG_MODE) 6062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson mutex_.AssertHeld(); 6072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Construct array of instruction ids for the new state. 6092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Only ByteRange, EmptyWidth, and Match instructions are useful to keep: 6102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // those are the only operators with any effect in 6112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // RunWorkqOnEmptyString or RunWorkqOnByte. 6122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int* inst = new int[q->size()]; 6132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int n = 0; 6142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint needflags = 0; // flags needed by kInstEmptyWidth instructions 6152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool sawmatch = false; // whether queue contains guaranteed kInstMatch 6162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool sawmark = false; // whether queue contains a Mark 6172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (DebugDFA) 6182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fprintf(stderr, "WorkqToCachedState %s [%#x]", DumpWorkq(q).c_str(), flag); 6192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (Workq::iterator it = q->begin(); it != q->end(); ++it) { 6202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id = *it; 6212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (sawmatch && (kind_ == Prog::kFirstMatch || q->is_mark(id))) 6222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 6232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (q->is_mark(id)) { 6242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (n > 0 && inst[n-1] != Mark) { 6252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sawmark = true; 6262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst[n++] = Mark; 6272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 6292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog::Inst* ip = prog_->inst(id); 6312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (ip->opcode()) { 6322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstAltMatch: 6332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // This state will continue to a match no matter what 6342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // the rest of the input is. If it is the highest priority match 6352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // being considered, return the special FullMatchState 6362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // to indicate that it's all matches from here out. 6372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (kind_ != Prog::kManyMatch && 6382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (kind_ != Prog::kFirstMatch || 6392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (it == q->begin() && ip->greedy(prog_))) && 6402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (kind_ != Prog::kLongestMatch || !sawmark) && 6412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (flag & kFlagMatch)) { 6422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] inst; 6432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (DebugDFA) 6442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fprintf(stderr, " -> FullMatchState\n"); 6452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return FullMatchState; 6462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Fall through. 6482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstByteRange: // These are useful. 6492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstEmptyWidth: 6502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstMatch: 6512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstAlt: // Not useful, but necessary [*] 6522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst[n++] = *it; 6532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ip->opcode() == kInstEmptyWidth) 6542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson needflags |= ip->empty(); 6552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ip->opcode() == kInstMatch && !prog_->anchor_end()) 6562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sawmatch = true; 6572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 6582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson default: // The rest are not. 6602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 6612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // [*] kInstAlt would seem useless to record in a state, since 6642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // we've already followed both its arrows and saved all the 6652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // interesting states we can reach from there. The problem 6662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // is that one of the empty-width instructions might lead 6672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // back to the same kInstAlt (if an empty-width operator is starred), 6682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // producing a different evaluation order depending on whether 6692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // we keep the kInstAlt to begin with. Sigh. 6702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // A specific case that this affects is /(^|a)+/ matching "a". 6712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If we don't save the kInstAlt, we will match the whole "a" (0,1) 6722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // but in fact the correct leftmost-first match is the leading "" (0,0). 6732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 6742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DCHECK_LE(n, q->size()); 6752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (n > 0 && inst[n-1] == Mark) 6762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson n--; 6772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If there are no empty-width instructions waiting to execute, 6792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // then the extra flag bits will not be used, so there is no 6802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // point in saving them. (Discarding them reduces the number 6812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // of distinct states.) 6822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (needflags == 0) 6832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flag &= kFlagMatch; 6842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // NOTE(rsc): The code above cannot do flag &= needflags, 6862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // because if the right flags were present to pass the current 6872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // kInstEmptyWidth instructions, new kInstEmptyWidth instructions 6882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // might be reached that in turn need different flags. 6892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The only sure thing is that if there are no kInstEmptyWidth 6902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // instructions at all, no flags will be needed. 6912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // We could do the extra work to figure out the full set of 6922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // possibly needed flags by exploring past the kInstEmptyWidth 6932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // instructions, but the check above -- are any flags needed 6942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // at all? -- handles the most common case. More fine-grained 6952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // analysis can only be justified by measurements showing that 6962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // too many redundant states are being allocated. 6972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 6982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If there are no Insts in the list, it's a dead state, 6992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // which is useful to signal with a special pointer so that 7002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // the execution loop can stop early. This is only okay 7012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // if the state is *not* a matching state. 7022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (n == 0 && flag == 0) { 7032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] inst; 7042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (DebugDFA) 7052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fprintf(stderr, " -> DeadState\n"); 7062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return DeadState; 7072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 7082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If we're in longest match mode, the state is a sequence of 7102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // unordered state sets separated by Marks. Sort each set 7112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // to canonicalize, to reduce the number of distinct sets stored. 7122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (kind_ == Prog::kLongestMatch) { 7132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int* ip = inst; 7142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int* ep = ip + n; 7152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson while (ip < ep) { 7162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int* markp = ip; 7172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson while (markp < ep && *markp != Mark) 7182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson markp++; 7192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson sort(ip, markp); 7202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (markp < ep) 7212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson markp++; 7222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ip = markp; 7232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 7242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 7252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Save the needed empty-width flags in the top bits for use later. 7272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flag |= needflags << kFlagNeedShift; 7282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson State* state = CachedState(inst, n, flag); 7302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] inst; 7312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return state; 7322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 7332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Looks in the State cache for a State matching inst, ninst, flag. 7352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// If one is found, returns it. If one is not found, allocates one, 7362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// inserts it in the cache, and returns it. 7372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonDFA::State* DFA::CachedState(int* inst, int ninst, uint flag) { 7382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (DEBUG_MODE) 7392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson mutex_.AssertHeld(); 7402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Look in the cache for a pre-existing state. 7422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson State state = { inst, ninst, flag, NULL }; 7432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StateSet::iterator it = state_cache_.find(&state); 7442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (it != state_cache_.end()) { 7452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (DebugDFA) 7462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fprintf(stderr, " -cached-> %s\n", DumpState(*it).c_str()); 7472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return *it; 7482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 7492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Must have enough memory for new state. 7512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // In addition to what we're going to allocate, 7522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // the state cache hash table seems to incur about 32 bytes per 7532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // State*, empirically. 7542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const int kStateCacheOverhead = 32; 7552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot 7562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int mem = sizeof(State) + nnext*sizeof(State*) + ninst*sizeof(int); 7572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (mem_budget_ < mem + kStateCacheOverhead) { 7582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson mem_budget_ = -1; 7592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 7602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 7612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson mem_budget_ -= mem + kStateCacheOverhead; 7622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Allocate new state, along with room for next and inst. 7642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson char* space = new char[mem]; 7652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson State* s = reinterpret_cast<State*>(space); 7662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s->next_ = reinterpret_cast<State**>(s + 1); 7672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s->inst_ = reinterpret_cast<int*>(s->next_ + nnext); 7682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson memset(s->next_, 0, nnext*sizeof s->next_[0]); 7692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson memmove(s->inst_, inst, ninst*sizeof s->inst_[0]); 7702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s->ninst_ = ninst; 7712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s->flag_ = flag; 7722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (DebugDFA) 7732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fprintf(stderr, " -> %s\n", DumpState(s).c_str()); 7742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Put state in cache and return it. 7762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson state_cache_.insert(s); 7772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return s; 7782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 7792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Clear the cache. Must hold cache_mutex_.w or be in destructor. 7812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid DFA::ClearCache() { 7822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // In case state_cache_ doesn't support deleting entries 7832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // during iteration, copy into a vector and then delete. 7842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson vector<State*> v; 7852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson v.reserve(state_cache_.size()); 7862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (StateSet::iterator it = state_cache_.begin(); 7872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson it != state_cache_.end(); ++it) 7882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson v.push_back(*it); 7892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson state_cache_.clear(); 7902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < v.size(); i++) 7912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] reinterpret_cast<const char*>(v[i]); 7922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 7932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 7942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Copies insts in state s to the work queue q. 7952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid DFA::StateToWorkq(State* s, Workq* q) { 7962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson q->clear(); 7972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < s->ninst_; i++) { 7982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s->inst_[i] == Mark) 7992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson q->mark(); 8002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 8012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson q->insert_new(s->inst_[i]); 8022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 8032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 8042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Adds ip to the work queue, following empty arrows according to flag 8062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// and expanding kInstAlt instructions (two-target gotos). 8072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid DFA::AddToQueue(Workq* q, int id, uint flag) { 8082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Use astack_ to hold our stack of states yet to process. 8102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // It is sized to have room for nastack_ == 2*prog->size() + nmark 8112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // instructions, which is enough: each instruction can be 8122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // processed by the switch below only once, and the processing 8132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // pushes at most two instructions plus maybe a mark. 8142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // (If we're using marks, nmark == prog->size(); otherwise nmark == 0.) 8152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int* stk = astack_; 8162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int nstk = 0; 8172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stk[nstk++] = id; 8192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson while (nstk > 0) { 8202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DCHECK_LE(nstk, nastack_); 8212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson id = stk[--nstk]; 8222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (id == Mark) { 8242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson q->mark(); 8252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 8262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 8272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (id == 0) 8292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 8302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If ip is already on the queue, nothing to do. 8322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Otherwise add it. We don't actually keep all the ones 8332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // that get added -- for example, kInstAlt is ignored 8342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // when on a work queue -- but adding all ip's here 8352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // increases the likelihood of q->contains(id), 8362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // reducing the amount of duplicated work. 8372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (q->contains(id)) 8382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 8392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson q->insert_new(id); 8402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Process instruction. 8422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog::Inst* ip = prog_->inst(id); 8432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (ip->opcode()) { 8442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstFail: // can't happen: discarded above 8452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 8462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstByteRange: // just save these on the queue 8482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstMatch: 8492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 8502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstCapture: // DFA treats captures as no-ops. 8522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstNop: 8532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stk[nstk++] = ip->out(); 8542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 8552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstAlt: // two choices: expand both, in order 8572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstAltMatch: 8582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Want to visit out then out1, so push on stack in reverse order. 8592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // This instruction is the [00-FF]* loop at the beginning of 8602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // a leftmost-longest unanchored search, separate out from out1 8612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // with a Mark, so that out1's threads (which will start farther 8622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // to the right in the string being searched) are lower priority 8632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // than the current ones. 8642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stk[nstk++] = ip->out1(); 8652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (q->maxmark() > 0 && 8662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson id == prog_->start_unanchored() && id != prog_->start()) 8672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stk[nstk++] = Mark; 8682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stk[nstk++] = ip->out(); 8692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 8702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstEmptyWidth: 8722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((ip->empty() & flag) == ip->empty()) 8732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson stk[nstk++] = ip->out(); 8742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 8752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 8762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 8772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 8782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Running of work queues. In the work queue, order matters: 8802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// the queue is sorted in priority order. If instruction i comes before j, 8812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// then the instructions that i produces during the run must come before 8822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// the ones that j produces. In order to keep this invariant, all the 8832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// work queue runners have to take an old queue to process and then 8842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// also a new queue to fill in. It's not acceptable to add to the end of 8852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// an existing queue, because new instructions will not end up in the 8862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// correct position. 8872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 8882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Runs the work queue, processing the empty strings indicated by flag. 8892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// For example, flag == kEmptyBeginLine|kEmptyEndLine means to match 8902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// both ^ and $. It is important that callers pass all flags at once: 8912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// processing both ^ and $ is not the same as first processing only ^ 8922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// and then processing only $. Doing the two-step sequence won't match 8932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// ^$^$^$ but processing ^ and $ simultaneously will (and is the behavior 8942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// exhibited by existing implementations). 8952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid DFA::RunWorkqOnEmptyString(Workq* oldq, Workq* newq, uint flag) { 8962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson newq->clear(); 8972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) { 8982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (oldq->is_mark(*i)) 8992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddToQueue(newq, Mark, flag); 9002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 9012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddToQueue(newq, *i, flag); 9022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 9032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 9042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Runs the work queue, processing the single byte c followed by any empty 9062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// strings indicated by flag. For example, c == 'a' and flag == kEmptyEndLine, 9072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// means to match c$. Sets the bool *ismatch to true if the end of the 9082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// regular expression program has been reached (the regexp has matched). 9092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid DFA::RunWorkqOnByte(Workq* oldq, Workq* newq, 9102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int c, uint flag, bool* ismatch, 9112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog::MatchKind kind, 9122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int new_byte_loop) { 9132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (DEBUG_MODE) 9142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson mutex_.AssertHeld(); 9152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson newq->clear(); 9172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) { 9182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (oldq->is_mark(*i)) { 9192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (*ismatch) 9202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 9212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson newq->mark(); 9222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 9232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 9242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int id = *i; 9252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Prog::Inst* ip = prog_->inst(id); 9262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson switch (ip->opcode()) { 9272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstFail: // never succeeds 9282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstCapture: // already followed 9292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstNop: // already followed 9302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstAlt: // already followed 9312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstAltMatch: // already followed 9322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstEmptyWidth: // already followed 9332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 9342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstByteRange: // can follow if c is in range 9362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ip->Matches(c)) 9372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddToQueue(newq, ip->out(), flag); 9382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 9392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson case kInstMatch: 9412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (prog_->anchor_end() && c != kByteEndText) 9422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 9432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *ismatch = true; 9442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (kind == Prog::kFirstMatch) { 9452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Can stop processing work queue since we found a match. 9462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 9472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 9482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 9492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 9502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 9512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (DebugDFA) 9532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fprintf(stderr, "%s on %d[%#x] -> %s [%d]\n", DumpWorkq(oldq).c_str(), 9542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c, flag, DumpWorkq(newq).c_str(), *ismatch); 9552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 9562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Processes input byte c in state, returning new state. 9582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Caller does not hold mutex. 9592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonDFA::State* DFA::RunStateOnByteUnlocked(State* state, int c) { 9602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Keep only one RunStateOnByte going 9612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // even if the DFA is being run by multiple threads. 9622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson MutexLock l(&mutex_); 9632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return RunStateOnByte(state, c); 9642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 9652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Processes input byte c in state, returning new state. 9672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonDFA::State* DFA::RunStateOnByte(State* state, int c) { 9682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (DEBUG_MODE) 9692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson mutex_.AssertHeld(); 9702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (state <= SpecialStateMax) { 9712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (state == FullMatchState) { 9722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // It is convenient for routines like PossibleMatchRange 9732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // if we implement RunStateOnByte for FullMatchState: 9742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // once you get into this state you never get out, 9752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // so it's pretty easy. 9762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return FullMatchState; 9772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 9782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (state == DeadState) { 9792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "DeadState in RunStateOnByte"; 9802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 9812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 9822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (state == NULL) { 9832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "NULL state in RunStateOnByte"; 9842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 9852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 9862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "Unexpected special state in RunStateOnByte"; 9872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return NULL; 9882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 9892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If someone else already computed this, return it. 9912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson MaybeReadMemoryBarrier(); // On alpha we need to ensure read ordering 9920d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin State* ns = state->next_[ByteMap(c)]; 9930d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ANNOTATE_HAPPENS_AFTER(ns); 9940d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (ns != NULL) 9950d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin return ns; 9962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 9972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Convert state into Workq. 9982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StateToWorkq(state, q0_); 9992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Flags marking the kinds of empty-width things (^ $ etc) 10012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // around this byte. Before the byte we have the flags recorded 10022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // in the State structure itself. After the byte we have 10032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // nothing yet (but that will change: read on). 10042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint needflag = state->flag_ >> kFlagNeedShift; 10052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint beforeflag = state->flag_ & kFlagEmptyMask; 10062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint oldbeforeflag = beforeflag; 10072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint afterflag = 0; 10082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (c == '\n') { 10102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Insert implicit $ and ^ around \n 10112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson beforeflag |= kEmptyEndLine; 10122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson afterflag |= kEmptyBeginLine; 10132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 10142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (c == kByteEndText) { 10162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Insert implicit $ and \z before the fake "end text" byte. 10172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson beforeflag |= kEmptyEndLine | kEmptyEndText; 10182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 10192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The state flag kFlagLastWord says whether the last 10212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // byte processed was a word character. Use that info to 10222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // insert empty-width (non-)word boundaries. 10232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool islastword = state->flag_ & kFlagLastWord; 10242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool isword = (c != kByteEndText && Prog::IsWordChar(c)); 10252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (isword == islastword) 10262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson beforeflag |= kEmptyNonWordBoundary; 10272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 10282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson beforeflag |= kEmptyWordBoundary; 10292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Okay, finally ready to run. 10312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Only useful to rerun on empty string if there are new, useful flags. 10322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (beforeflag & ~oldbeforeflag & needflag) { 10332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RunWorkqOnEmptyString(q0_, q1_, beforeflag); 10342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson swap(q0_, q1_); 10352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 10362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool ismatch = false; 10372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch, kind_, start_unanchored_); 10380d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin 10390d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // Most of the time, we build the state from the output of 10400d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // RunWorkqOnByte, so swap q0_ and q1_ here. However, so that 10410d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // RE2::Set can tell exactly which match instructions 10420d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // contributed to the match, don't swap if c is kByteEndText. 10430d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // The resulting state wouldn't be correct for further processing 10440d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // of the string, but we're at the end of the text so that's okay. 10450d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // Leaving q0_ alone preseves the match instructions that led to 10460d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin // the current setting of ismatch. 10470d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (c != kByteEndText || kind_ != Prog::kManyMatch) 10480d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin swap(q0_, q1_); 10492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Save afterflag along with ismatch and isword in new state. 10512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint flag = afterflag; 10522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ismatch) 10532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flag |= kFlagMatch; 10542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (isword) 10552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flag |= kFlagLastWord; 10562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10570d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ns = WorkqToCachedState(q0_, flag); 10582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Write barrier before updating state->next_ so that the 10602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // main search loop can proceed without any locking, for speed. 10612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // (Otherwise it would need one mutex operation per input byte.) 10622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The annotations below tell race detectors that: 10632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // a) the access to next_ should be ignored, 10642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // b) 'ns' is properly published. 10652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson WriteMemoryBarrier(); // Flush ns before linking to it. 10662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ANNOTATE_IGNORE_WRITES_BEGIN(); 10680d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ANNOTATE_HAPPENS_BEFORE(ns); 10692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson state->next_[ByteMap(c)] = ns; 10702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ANNOTATE_IGNORE_WRITES_END(); 10712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return ns; 10722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 10732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson////////////////////////////////////////////////////////////////////// 10762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// DFA cache reset. 10772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Reader-writer lock helper. 10792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 10802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The DFA uses a reader-writer mutex to protect the state graph itself. 10812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Traversing the state graph requires holding the mutex for reading, 10822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// and discarding the state graph and starting over requires holding the 10832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// lock for writing. If a search needs to expand the graph but is out 10842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// of memory, it will need to drop its read lock and then acquire the 10852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// write lock. Since it cannot then atomically downgrade from write lock 10862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// to read lock, it runs the rest of the search holding the write lock. 10872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// (This probably helps avoid repeated contention, but really the decision 10882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// is forced by the Mutex interface.) It's a bit complicated to keep 10892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// track of whether the lock is held for reading or writing and thread 10902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// that through the search, so instead we encapsulate it in the RWLocker 10912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// and pass that around. 10922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass DFA::RWLocker { 10942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson public: 10952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson explicit RWLocker(Mutex* mu); 10962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ~RWLocker(); 10972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 10982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If the lock is only held for reading right now, 10992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // drop the read lock and re-acquire for writing. 11002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Subsequent calls to LockForWriting are no-ops. 11012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Notice that the lock is *released* temporarily. 11022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson void LockForWriting(); 11032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Returns whether the lock is already held for writing. 11052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool IsLockedForWriting() { 11062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return writing_; 11072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 11082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson private: 11102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Mutex* mu_; 11112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool writing_; 11122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DISALLOW_EVIL_CONSTRUCTORS(RWLocker); 11142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}; 11152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonDFA::RWLocker::RWLocker(Mutex* mu) 11172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson : mu_(mu), writing_(false) { 11182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson mu_->ReaderLock(); 11202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 11212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// This function is marked as NO_THREAD_SAFETY_ANALYSIS because the annotations 11232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// does not support lock upgrade. 11242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid DFA::RWLocker::LockForWriting() NO_THREAD_SAFETY_ANALYSIS { 11252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!writing_) { 11262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson mu_->ReaderUnlock(); 11272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson mu_->Lock(); 11282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson writing_ = true; 11292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 11302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 11312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonDFA::RWLocker::~RWLocker() { 11332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (writing_) 11342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson mu_->WriterUnlock(); 11352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 11362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson mu_->ReaderUnlock(); 11372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 11382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// When the DFA's State cache fills, we discard all the states in the 11412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// cache and start over. Many threads can be using and adding to the 11422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// cache at the same time, so we synchronize using the cache_mutex_ 11432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// to keep from stepping on other threads. Specifically, all the 11442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// threads using the current cache hold cache_mutex_ for reading. 11452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// When a thread decides to flush the cache, it drops cache_mutex_ 11462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// and then re-acquires it for writing. That ensures there are no 11472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// other threads accessing the cache anymore. The rest of the search 11482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// runs holding cache_mutex_ for writing, avoiding any contention 11492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// with or cache pollution caused by other threads. 11502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonvoid DFA::ResetCache(RWLocker* cache_lock) { 11522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Re-acquire the cache_mutex_ for writing (exclusive use). 11532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool was_writing = cache_lock->IsLockedForWriting(); 11542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson cache_lock->LockForWriting(); 11552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If we already held cache_mutex_ for writing, it means 11572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // this invocation of Search() has already reset the 11582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // cache once already. That's a pretty clear indication 11592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // that the cache is too small. Warn about that, once. 11602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // TODO(rsc): Only warn if state_cache_.size() < some threshold. 11612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (was_writing && !cache_warned_) { 11622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(INFO) << "DFA memory cache could be too small: " 11632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson << "only room for " << state_cache_.size() << " states."; 11642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson cache_warned_ = true; 11652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 11662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Clear the cache, reset the memory budget. 11682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < kMaxStart; i++) { 11692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson start_[i].start = NULL; 11702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson start_[i].firstbyte = kFbUnknown; 11712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 11722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ClearCache(); 11732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson mem_budget_ = state_budget_; 11742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 11752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Typically, a couple States do need to be preserved across a cache 11772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// reset, like the State at the current point in the search. 11782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The StateSaver class helps keep States across cache resets. 11792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// It makes a copy of the state's guts outside the cache (before the reset) 11802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// and then can be asked, after the reset, to recreate the State 11812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// in the new cache. For example, in a DFA method ("this" is a DFA): 11822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 11832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// StateSaver saver(this, s); 11842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// ResetCache(cache_lock); 11852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// s = saver.Restore(); 11862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 11872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The saver should always have room in the cache to re-create the state, 11882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// because resetting the cache locks out all other threads, and the cache 11892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// is known to have room for at least a couple states (otherwise the DFA 11902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// constructor fails). 11912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonclass DFA::StateSaver { 11932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson public: 11942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson explicit StateSaver(DFA* dfa, State* state); 11952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ~StateSaver(); 11962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 11972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Recreates and returns a state equivalent to the 11982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // original state passed to the constructor. 11992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Returns NULL if the cache has filled, but 12002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // since the DFA guarantees to have room in the cache 12012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // for a couple states, should never return NULL 12022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // if used right after ResetCache. 12032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson State* Restore(); 12042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 12052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson private: 12062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DFA* dfa_; // the DFA to use 12072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int* inst_; // saved info from State 12082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int ninst_; 12092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint flag_; 12102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool is_special_; // whether original state was special 12112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson State* special_; // if is_special_, the original state 12122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 12132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DISALLOW_EVIL_CONSTRUCTORS(StateSaver); 12142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson}; 12152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 12162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonDFA::StateSaver::StateSaver(DFA* dfa, State* state) { 12172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson dfa_ = dfa; 12182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (state <= SpecialStateMax) { 12192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_ = NULL; 12202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ninst_ = 0; 12212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flag_ = 0; 12222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson is_special_ = true; 12232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson special_ = state; 12242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return; 12252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 12262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson is_special_ = false; 12272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson special_ = NULL; 12282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flag_ = state->flag_; 12292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ninst_ = state->ninst_; 12302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson inst_ = new int[ninst_]; 12312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson memmove(inst_, state->inst_, ninst_*sizeof inst_[0]); 12322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 12332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 12342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonDFA::StateSaver::~StateSaver() { 12352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!is_special_) 12362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete[] inst_; 12372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 12382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 12392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonDFA::State* DFA::StateSaver::Restore() { 12402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (is_special_) 12412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return special_; 12422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson MutexLock l(&dfa_->mutex_); 12432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson State* s = dfa_->CachedState(inst_, ninst_, flag_); 12442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s == NULL) 12452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "StateSaver failed to restore state."; 12462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return s; 12472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 12482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 12492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 12502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson////////////////////////////////////////////////////////////////////// 12512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 12522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// DFA execution. 12532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 12542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The basic search loop is easy: start in a state s and then for each 12552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// byte c in the input, s = s->next[c]. 12562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 12572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// This simple description omits a few efficiency-driven complications. 12582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 12592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// First, the State graph is constructed incrementally: it is possible 12602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// that s->next[c] is null, indicating that that state has not been 12612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// fully explored. In this case, RunStateOnByte must be invoked to 12622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// determine the next state, which is cached in s->next[c] to save 12632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// future effort. An alternative reason for s->next[c] to be null is 12642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// that the DFA has reached a so-called "dead state", in which any match 12652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// is no longer possible. In this case RunStateOnByte will return NULL 12662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// and the processing of the string can stop early. 12672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 12682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Second, a 256-element pointer array for s->next_ makes each State 12692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// quite large (2kB on 64-bit machines). Instead, dfa->bytemap_[] 12702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// maps from bytes to "byte classes" and then next_ only needs to have 12712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// as many pointers as there are byte classes. A byte class is simply a 12722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// range of bytes that the regexp never distinguishes between. 12732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// A regexp looking for a[abc] would have four byte ranges -- 0 to 'a'-1, 12742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 'a', 'b' to 'c', and 'c' to 0xFF. The bytemap slows us a little bit 12752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// but in exchange we typically cut the size of a State (and thus our 12762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// memory footprint) by about 5-10x. The comments still refer to 12772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// s->next[c] for simplicity, but code should refer to s->next_[bytemap_[c]]. 12782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 12792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Third, it is common for a DFA for an unanchored match to begin in a 12802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// state in which only one particular byte value can take the DFA to a 12812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// different state. That is, s->next[c] != s for only one c. In this 12822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// situation, the DFA can do better than executing the simple loop. 12832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Instead, it can call memchr to search very quickly for the byte c. 12842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Whether the start state has this property is determined during a 12852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// pre-compilation pass, and if so, the byte b is passed to the search 12862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// loop as the "firstbyte" argument, along with a boolean "have_firstbyte". 12872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 12882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Fourth, the desired behavior is to search for the leftmost-best match 12892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// (approximately, the same one that Perl would find), which is not 12902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// necessarily the match ending earliest in the string. Each time a 12912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// match is found, it must be noted, but the DFA must continue on in 12922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// hope of finding a higher-priority match. In some cases, the caller only 12932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// cares whether there is any match at all, not which one is found. 12942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The "want_earliest_match" flag causes the search to stop at the first 12952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// match found. 12962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 12972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Fifth, one algorithm that uses the DFA needs it to run over the 12982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// input string backward, beginning at the end and ending at the beginning. 12992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Passing false for the "run_forward" flag causes the DFA to run backward. 13002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 13012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The checks for these last three cases, which in a naive implementation 13022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// would be performed once per input byte, slow the general loop enough 13032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// to merit specialized versions of the search loop for each of the 13042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// eight possible settings of the three booleans. Rather than write 13052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// eight different functions, we write one general implementation and then 13062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// inline it to create the specialized ones. 13072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 13082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Note that matches are delayed by one byte, to make it easier to 13092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// accomodate match conditions depending on the next input byte (like $ and \b). 13102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// When s->next[c]->IsMatch(), it means that there is a match ending just 13112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// *before* byte c. 13122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 13132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The generic search loop. Searches text for a match, returning 13142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// the pointer to the end of the chosen match, or NULL if no match. 13152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The bools are equal to the same-named variables in params, but 13162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// making them function arguments lets the inliner specialize 13172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// this function to each combination (see two paragraphs above). 13182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsoninline bool DFA::InlinedSearchLoop(SearchParams* params, 13192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool have_firstbyte, 13202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool want_earliest_match, 13212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool run_forward) { 13222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson State* start = params->start; 13232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const uint8* bp = BytePtr(params->text.begin()); // start of text 13242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const uint8* p = bp; // text scanning point 13252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const uint8* ep = BytePtr(params->text.end()); // end of text 13262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const uint8* resetp = NULL; // p at last cache reset 13272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!run_forward) 13282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson swap(p, ep); 13292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 13302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const uint8* bytemap = prog_->bytemap(); 13312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const uint8* lastmatch = NULL; // most recent matching position in text 13322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool matched = false; 13332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson State* s = start; 13342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 13352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s->IsMatch()) { 13362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson matched = true; 13372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson lastmatch = p; 13382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (want_earliest_match) { 13392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params->ep = reinterpret_cast<const char*>(lastmatch); 13402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 13412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 13422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 13432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 13442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson while (p != ep) { 13452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (DebugDFA) 13462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fprintf(stderr, "@%d: %s\n", static_cast<int>(p - bp), 13472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DumpState(s).c_str()); 13482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (have_firstbyte && s == start) { 13492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // In start state, only way out is to find firstbyte, 13502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // so use optimized assembly in memchr to skip ahead. 13512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If firstbyte isn't found, we can skip to the end 13522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // of the string. 13532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (run_forward) { 13542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((p = BytePtr(memchr(p, params->firstbyte, ep - p))) == NULL) { 13552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson p = ep; 13562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 13572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 13582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 13592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((p = BytePtr(memrchr(ep, params->firstbyte, p - ep))) == NULL) { 13602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson p = ep; 13612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 13622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 13632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson p++; 13642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 13652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 13662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 13672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int c; 13682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (run_forward) 13692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c = *p++; 13702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 13712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson c = *--p; 13722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 13732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Note that multiple threads might be consulting 13742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // s->next_[bytemap[c]] simultaneously. 13752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // RunStateOnByte takes care of the appropriate locking, 13762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // including a memory barrier so that the unlocked access 13772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // (sometimes known as "double-checked locking") is safe. 13782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The alternative would be either one DFA per thread 13792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // or one mutex operation per input byte. 13802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // 13812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // ns == DeadState means the state is known to be dead 13822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // (no more matches are possible). 13832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // ns == NULL means the state has not yet been computed 13842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // (need to call RunStateOnByteUnlocked). 13852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // RunStateOnByte returns ns == NULL if it is out of memory. 13862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // ns == FullMatchState means the rest of the string matches. 13872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // 13882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Okay to use bytemap[] not ByteMap() here, because 13892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // c is known to be an actual byte and not kByteEndText. 13902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 13912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson MaybeReadMemoryBarrier(); // On alpha we need to ensure read ordering 13922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson State* ns = s->next_[bytemap[c]]; 13930d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ANNOTATE_HAPPENS_AFTER(ns); 13942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ns == NULL) { 13952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ns = RunStateOnByteUnlocked(s, c); 13962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ns == NULL) { 13972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // After we reset the cache, we hold cache_mutex exclusively, 13982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // so if resetp != NULL, it means we filled the DFA state 13992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // cache with this search alone (without any other threads). 14002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Benchmarks show that doing a state computation on every 14012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // byte runs at about 0.2 MB/s, while the NFA (nfa.cc) can do the 14022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // same at about 2 MB/s. Unless we're processing an average 14032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // of 10 bytes per state computation, fail so that RE2 can 14042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // fall back to the NFA. 14052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (FLAGS_re2_dfa_bail_when_slow && resetp != NULL && 14062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (p - resetp) < 10*state_cache_.size()) { 14072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params->failed = true; 14082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 14092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 14102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson resetp = p; 14112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 14122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Prepare to save start and s across the reset. 14132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StateSaver save_start(this, start); 14142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StateSaver save_s(this, s); 14152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 14162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Discard all the States in the cache. 14172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ResetCache(params->cache_lock); 14182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 14192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Restore start and s so we can continue. 14202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((start = save_start.Restore()) == NULL || 14212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (s = save_s.Restore()) == NULL) { 14222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Restore already did LOG(DFATAL). 14232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params->failed = true; 14242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 14252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 14262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ns = RunStateOnByteUnlocked(s, c); 14272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ns == NULL) { 14282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "RunStateOnByteUnlocked failed after ResetCache"; 14292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params->failed = true; 14302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 14312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 14322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 14332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 14342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ns <= SpecialStateMax) { 14352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ns == DeadState) { 14362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params->ep = reinterpret_cast<const char*>(lastmatch); 14372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return matched; 14382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 14392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // FullMatchState 14402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params->ep = reinterpret_cast<const char*>(ep); 14412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 14422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 14432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s = ns; 14442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 14452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s->IsMatch()) { 14462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson matched = true; 14472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The DFA notices the match one byte late, 14482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // so adjust p before using it in the match. 14492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (run_forward) 14502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson lastmatch = p - 1; 14512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 14522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson lastmatch = p + 1; 14532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (DebugDFA) 14542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fprintf(stderr, "match @%d! [%s]\n", 14552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static_cast<int>(lastmatch - bp), 14562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DumpState(s).c_str()); 14572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 14582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (want_earliest_match) { 14592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params->ep = reinterpret_cast<const char*>(lastmatch); 14602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 14612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 14622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 14632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 14642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 14652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Process one more byte to see if it triggers a match. 14662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // (Remember, matches are delayed one byte.) 14672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int lastbyte; 14682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (run_forward) { 14692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (params->text.end() == params->context.end()) 14702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson lastbyte = kByteEndText; 14712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 14722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson lastbyte = params->text.end()[0] & 0xFF; 14732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 14742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (params->text.begin() == params->context.begin()) 14752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson lastbyte = kByteEndText; 14762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 14772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson lastbyte = params->text.begin()[-1] & 0xFF; 14782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 14792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 14802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson MaybeReadMemoryBarrier(); // On alpha we need to ensure read ordering 14812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson State* ns = s->next_[ByteMap(lastbyte)]; 14820d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ANNOTATE_HAPPENS_AFTER(ns); 14832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ns == NULL) { 14842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ns = RunStateOnByteUnlocked(s, lastbyte); 14852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ns == NULL) { 14862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StateSaver save_s(this, s); 14872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ResetCache(params->cache_lock); 14882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if ((s = save_s.Restore()) == NULL) { 14892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params->failed = true; 14902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 14912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 14922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ns = RunStateOnByteUnlocked(s, lastbyte); 14932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ns == NULL) { 14942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "RunStateOnByteUnlocked failed after Reset"; 14952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params->failed = true; 14962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 14972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 14982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 14992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 15002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s = ns; 15012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (DebugDFA) 15022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fprintf(stderr, "@_: %s\n", DumpState(s).c_str()); 15032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s == FullMatchState) { 15042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params->ep = reinterpret_cast<const char*>(ep); 15052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 15062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 15072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s > SpecialStateMax && s->IsMatch()) { 15082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson matched = true; 15092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson lastmatch = p; 15100d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (params->matches && kind_ == Prog::kManyMatch) { 15110d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin vector<int>* v = params->matches; 15120d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin v->clear(); 15130d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin for (int i = 0; i < s->ninst_; i++) { 15140d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin Prog::Inst* ip = prog_->inst(s->inst_[i]); 15150d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (ip->opcode() == kInstMatch) 15160d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin v->push_back(ip->match_id()); 15170d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } 15180d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } 15192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (DebugDFA) 15202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fprintf(stderr, "match @%d! [%s]\n", static_cast<int>(lastmatch - bp), 15212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DumpState(s).c_str()); 15222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 15232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params->ep = reinterpret_cast<const char*>(lastmatch); 15242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return matched; 15252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 15262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 15272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Inline specializations of the general loop. 15282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool DFA::SearchFFF(SearchParams* params) { 15292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return InlinedSearchLoop(params, 0, 0, 0); 15302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 15312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool DFA::SearchFFT(SearchParams* params) { 15322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return InlinedSearchLoop(params, 0, 0, 1); 15332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 15342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool DFA::SearchFTF(SearchParams* params) { 15352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return InlinedSearchLoop(params, 0, 1, 0); 15362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 15372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool DFA::SearchFTT(SearchParams* params) { 15382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return InlinedSearchLoop(params, 0, 1, 1); 15392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 15402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool DFA::SearchTFF(SearchParams* params) { 15412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return InlinedSearchLoop(params, 1, 0, 0); 15422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 15432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool DFA::SearchTFT(SearchParams* params) { 15442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return InlinedSearchLoop(params, 1, 0, 1); 15452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 15462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool DFA::SearchTTF(SearchParams* params) { 15472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return InlinedSearchLoop(params, 1, 1, 0); 15482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 15492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool DFA::SearchTTT(SearchParams* params) { 15502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return InlinedSearchLoop(params, 1, 1, 1); 15512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 15522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 15532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// For debugging, calls the general code directly. 15542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool DFA::SlowSearchLoop(SearchParams* params) { 15552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return InlinedSearchLoop(params, 15562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params->firstbyte >= 0, 15572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params->want_earliest_match, 15582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params->run_forward); 15592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 15602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 15612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// For performance, calls the appropriate specialized version 15622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// of InlinedSearchLoop. 15632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool DFA::FastSearchLoop(SearchParams* params) { 15642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Because the methods are private, the Searches array 15652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // cannot be declared at top level. 15662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static bool (DFA::*Searches[])(SearchParams*) = { 15672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson &DFA::SearchFFF, 15682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson &DFA::SearchFFT, 15692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson &DFA::SearchFTF, 15702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson &DFA::SearchFTT, 15712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson &DFA::SearchTFF, 15722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson &DFA::SearchTFT, 15732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson &DFA::SearchTTF, 15742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson &DFA::SearchTTT, 15752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson }; 15762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 15772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool have_firstbyte = (params->firstbyte >= 0); 15782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int index = 4 * have_firstbyte + 15792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 2 * params->want_earliest_match + 15802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 1 * params->run_forward; 15812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return (this->*Searches[index])(params); 15822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 15832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 15842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 15852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The discussion of DFA execution above ignored the question of how 15862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// to determine the initial state for the search loop. There are two 15872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// factors that influence the choice of start state. 15882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 15892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The first factor is whether the search is anchored or not. 15902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The regexp program (Prog*) itself has 15912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// two different entry points: one for anchored searches and one for 15922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// unanchored searches. (The unanchored version starts with a leading ".*?" 15932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// and then jumps to the anchored one.) 15942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 15952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The second factor is where text appears in the larger context, which 15962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// determines which empty-string operators can be matched at the beginning 15972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// of execution. If text is at the very beginning of context, \A and ^ match. 15982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Otherwise if text is at the beginning of a line, then ^ matches. 15992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Otherwise it matters whether the character before text is a word character 16002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// or a non-word character. 16012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 16022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The two cases (unanchored vs not) and four cases (empty-string flags) 16032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// combine to make the eight cases recorded in the DFA's begin_text_[2], 16042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// begin_line_[2], after_wordchar_[2], and after_nonwordchar_[2] cached 16052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// StartInfos. The start state for each is filled in the first time it 16062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// is used for an actual search. 16072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 16082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Examines text, context, and anchored to determine the right start 16092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// state for the DFA search loop. Fills in params and returns true on success. 16102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Returns false on failure. 16112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool DFA::AnalyzeSearch(SearchParams* params) { 16122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const StringPiece& text = params->text; 16132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const StringPiece& context = params->context; 16142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 16152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Sanity check: make sure that text lies within context. 16162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (text.begin() < context.begin() || text.end() > context.end()) { 16172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "Text is not inside context."; 16182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params->start = DeadState; 16192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 16202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 16212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 16222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Determine correct search type. 16232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int start; 16242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint flags; 16252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (params->run_forward) { 16262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (text.begin() == context.begin()) { 16272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson start = kStartBeginText; 16282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flags = kEmptyBeginText|kEmptyBeginLine; 16292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else if (text.begin()[-1] == '\n') { 16302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson start = kStartBeginLine; 16312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flags = kEmptyBeginLine; 16322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else if (Prog::IsWordChar(text.begin()[-1] & 0xFF)) { 16332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson start = kStartAfterWordChar; 16342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flags = kFlagLastWord; 16352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 16362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson start = kStartAfterNonWordChar; 16372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flags = 0; 16382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 16392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 16402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (text.end() == context.end()) { 16412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson start = kStartBeginText; 16422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flags = kEmptyBeginText|kEmptyBeginLine; 16432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else if (text.end()[0] == '\n') { 16442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson start = kStartBeginLine; 16452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flags = kEmptyBeginLine; 16462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else if (Prog::IsWordChar(text.end()[0] & 0xFF)) { 16472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson start = kStartAfterWordChar; 16482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flags = kFlagLastWord; 16492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 16502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson start = kStartAfterNonWordChar; 16512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flags = 0; 16522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 16532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 16542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (params->anchored || prog_->anchor_start()) 16552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson start |= kStartAnchored; 16562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StartInfo* info = &start_[start]; 16572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 16582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Try once without cache_lock for writing. 16592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Try again after resetting the cache 16602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // (ResetCache will relock cache_lock for writing). 16612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!AnalyzeSearchHelper(params, info, flags)) { 16622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ResetCache(params->cache_lock); 16632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!AnalyzeSearchHelper(params, info, flags)) { 16642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson LOG(DFATAL) << "Failed to analyze start state."; 16652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params->failed = true; 16662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 16672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 16682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 16692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 16702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (DebugDFA) 16712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s firstbyte=%d\n", 16722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params->anchored, params->run_forward, flags, 16732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DumpState(info->start).c_str(), info->firstbyte); 16742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 16752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params->start = info->start; 16760d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin params->firstbyte = ANNOTATE_UNPROTECTED_READ(info->firstbyte); 16772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 16782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 16792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 16802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 16812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Fills in info if needed. Returns true on success, false on failure. 16822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, 16832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson uint flags) { 16842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Quick check; okay because of memory barriers below. 16850d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (ANNOTATE_UNPROTECTED_READ(info->firstbyte) != kFbUnknown) { 16860d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ANNOTATE_HAPPENS_AFTER(&info->firstbyte); 16872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 16880d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } 16892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 16902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson MutexLock l(&mutex_); 16910d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (info->firstbyte != kFbUnknown) { 16920d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ANNOTATE_HAPPENS_AFTER(&info->firstbyte); 16932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 16940d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } 16952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 16962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson q0_->clear(); 16972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson AddToQueue(q0_, 16982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params->anchored ? prog_->start() : prog_->start_unanchored(), 16992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson flags); 17002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson info->start = WorkqToCachedState(q0_, flags); 17012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (info->start == NULL) 17022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 17032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 17042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (info->start == DeadState) { 17050d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ANNOTATE_HAPPENS_BEFORE(&info->firstbyte); 17062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson WriteMemoryBarrier(); // Synchronize with "quick check" above. 17072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson info->firstbyte = kFbNone; 17082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 17092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 17102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 17112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (info->start == FullMatchState) { 17120d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ANNOTATE_HAPPENS_BEFORE(&info->firstbyte); 17132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson WriteMemoryBarrier(); // Synchronize with "quick check" above. 17142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson info->firstbyte = kFbNone; // will be ignored 17152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 17162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 17172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 17182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Compute info->firstbyte by running state on all 17192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // possible byte values, looking for a single one that 17202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // leads to a different state. 17212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int firstbyte = kFbNone; 17222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < 256; i++) { 17232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson State* s = RunStateOnByte(info->start, i); 17242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s == NULL) { 17250d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ANNOTATE_HAPPENS_BEFORE(&info->firstbyte); 17262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson WriteMemoryBarrier(); // Synchronize with "quick check" above. 17272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson info->firstbyte = firstbyte; 17282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 17292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 17302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (s == info->start) 17312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson continue; 17322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Goes to new state... 17332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (firstbyte == kFbNone) { 17342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson firstbyte = i; // ... first one 17352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 17362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson firstbyte = kFbMany; // ... too many 17372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 17382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 17392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 17400d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ANNOTATE_HAPPENS_BEFORE(&info->firstbyte); 17412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson WriteMemoryBarrier(); // Synchronize with "quick check" above. 17422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson info->firstbyte = firstbyte; 17432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 17442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 17452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 17462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// The actual DFA search: calls AnalyzeSearch and then FastSearchLoop. 17472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool DFA::Search(const StringPiece& text, 17482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const StringPiece& context, 17492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool anchored, 17502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool want_earliest_match, 17512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool run_forward, 17522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool* failed, 17532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char** epp, 17542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson vector<int>* matches) { 17552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *epp = NULL; 17562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ok()) { 17572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *failed = true; 17582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 17592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 17602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *failed = false; 17612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 17622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (DebugDFA) { 17632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fprintf(stderr, "\nprogram:\n%s\n", prog_->DumpUnanchored().c_str()); 17642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fprintf(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n", 17652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson text.as_string().c_str(), anchored, want_earliest_match, 17662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson run_forward, kind_); 17672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 17682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 17692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RWLocker l(&cache_mutex_); 17702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson SearchParams params(text, context, &l); 17712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params.anchored = anchored; 17722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params.want_earliest_match = want_earliest_match; 17732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params.run_forward = run_forward; 17742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params.matches = matches; 17752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 17762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!AnalyzeSearch(¶ms)) { 17772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *failed = true; 17782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 17792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 17802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (params.start == DeadState) 17810d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin return false; 17822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (params.start == FullMatchState) { 17832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (run_forward == want_earliest_match) 17842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *epp = text.begin(); 17852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 17862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *epp = text.end(); 17872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 17882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 17892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (DebugDFA) 17902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson fprintf(stderr, "start %s\n", DumpState(params.start).c_str()); 17912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool ret = FastSearchLoop(¶ms); 17922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (params.failed) { 17932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *failed = true; 17942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 17952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 17962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *epp = params.ep; 17972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return ret; 17982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 17992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 18002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Deletes dfa. 18012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 18022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// This is a separate function so that 18032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// prog.h can be used without moving the definition of 18042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// class DFA out of this file. If you set 18052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// prog->dfa_ = dfa; 18062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// then you also have to set 18072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// prog->delete_dfa_ = DeleteDFA; 18082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// so that ~Prog can delete the dfa. 18092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonstatic void DeleteDFA(DFA* dfa) { 18102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete dfa; 18112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 18122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 18132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian HodsonDFA* Prog::GetDFA(MatchKind kind) { 18142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DFA*volatile* pdfa; 18152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (kind == kFirstMatch || kind == kManyMatch) { 18162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson pdfa = &dfa_first_; 18172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else { 18182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kind = kLongestMatch; 18192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson pdfa = &dfa_longest_; 18202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 18212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 18222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Quick check; okay because of memory barrier below. 18230d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin DFA *dfa = ANNOTATE_UNPROTECTED_READ(*pdfa); 18242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (dfa != NULL) { 18252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ANNOTATE_HAPPENS_AFTER(dfa); 18262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return dfa; 18272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 18282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 18292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson MutexLock l(&dfa_mutex_); 18302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson dfa = *pdfa; 18310d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin if (dfa != NULL) { 18320d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin ANNOTATE_HAPPENS_AFTER(dfa); 18332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return dfa; 18340d4c52358a1af421705c54bd8a9fdd8a30558a2eAlexander Gutkin } 18352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 18362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // For a forward DFA, half the memory goes to each DFA. 18372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // For a reverse DFA, all the memory goes to the 18382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // "longest match" DFA, because RE2 never does reverse 18392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // "first match" searches. 18402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson int64 m = dfa_mem_/2; 18412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (reversed_) { 18422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (kind == kLongestMatch || kind == kManyMatch) 18432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson m = dfa_mem_; 18442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 18452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson m = 0; 18462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 18472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson dfa = new DFA(this, kind, m); 18482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete_dfa_ = DeleteDFA; 18492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 18502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Synchronize with "quick check" above. 18512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ANNOTATE_HAPPENS_BEFORE(dfa); 18522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson WriteMemoryBarrier(); 18532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *pdfa = dfa; 18542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 18552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return dfa; 18562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 18572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 18582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 18592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Executes the regexp program to search in text, 18602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// which itself is inside the larger context. (As a convenience, 18612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// passing a NULL context is equivalent to passing text.) 18622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Returns true if a match is found, false if not. 18632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// If a match is found, fills in match0->end() to point at the end of the match 18642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// and sets match0->begin() to text.begin(), since the DFA can't track 18652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// where the match actually began. 18662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 18672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// This is the only external interface (class DFA only exists in this file). 18682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// 18692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, 18702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson Anchor anchor, MatchKind kind, 18712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece* match0, bool* failed, vector<int>* matches) { 18722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *failed = false; 18732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 18742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StringPiece context = const_context; 18752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (context.begin() == NULL) 18762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson context = text; 18772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool carat = anchor_start(); 18782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool dollar = anchor_end(); 18792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (reversed_) { 18802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool t = carat; 18812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson carat = dollar; 18822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson dollar = t; 18832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 18842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (carat && context.begin() != text.begin()) 18852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 18862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (dollar && context.end() != text.end()) 18872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 18882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 18892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Handle full match by running an anchored longest match 18902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // and then checking if it covers all of text. 18912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool anchored = anchor == kAnchored || anchor_start() || kind == kFullMatch; 18922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool endmatch = false; 18932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (kind == kManyMatch) { 18942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson endmatch = true; 18952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } else if (kind == kFullMatch || anchor_end()) { 18962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson endmatch = true; 18972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kind = kLongestMatch; 18982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 18992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If the caller doesn't care where the match is (just whether one exists), 19012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // then we can stop at the very first match we find, the so-called 19022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // "shortest match". 19032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool want_shortest_match = false; 19042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (match0 == NULL && !endmatch) { 19052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson want_shortest_match = true; 19062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson kind = kLongestMatch; 19072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 19082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DFA* dfa = GetDFA(kind); 19102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson const char* ep; 19112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool matched = dfa->Search(text, context, anchored, 19122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson want_shortest_match, !reversed_, 19132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson failed, &ep, matches); 19142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (*failed) 19152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 19162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!matched) 19172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 19182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (endmatch && ep != (reversed_ ? text.begin() : text.end())) 19192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 19202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If caller cares, record the boundary of the match. 19222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // We only know where it ends, so use the boundary of text 19232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // as the beginning. 19242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (match0) { 19252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (reversed_) 19262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *match0 = StringPiece(ep, text.end() - ep); 19272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson else 19282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *match0 = StringPiece(text.begin(), ep - text.begin()); 19292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 19302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 19312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 19322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Build out all states in DFA. Returns number of states. 19342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonint DFA::BuildAllStates() { 19352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ok()) 19362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return 0; 19372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Pick out start state for unanchored search 19392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // at beginning of text. 19402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RWLocker l(&cache_mutex_); 19412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson SearchParams params(NULL, NULL, &l); 19422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params.anchored = false; 19432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!AnalyzeSearch(¶ms) || params.start <= SpecialStateMax) 19442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return 0; 19452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Add start state to work queue. 19472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson StateSet queued; 19482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson vector<State*> q; 19492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson queued.insert(params.start); 19502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson q.push_back(params.start); 19512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Flood to expand every state. 19532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < q.size(); i++) { 19542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson State* s = q[i]; 19552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int c = 0; c < 257; c++) { 19562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson State* ns = RunStateOnByteUnlocked(s, c); 19572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ns > SpecialStateMax && queued.find(ns) == queued.end()) { 19582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson queued.insert(ns); 19592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson q.push_back(ns); 19602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 19612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 19622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 19632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return q.size(); 19652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 19662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Build out all states in DFA for kind. Returns number of states. 19682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonint Prog::BuildEntireDFA(MatchKind kind) { 19692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson //LOG(ERROR) << "BuildEntireDFA is only for testing."; 19702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return GetDFA(kind)->BuildAllStates(); 19712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 19722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Computes min and max for matching string. 19742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// Won't return strings bigger than maxlen. 19752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool DFA::PossibleMatchRange(string* min, string* max, int maxlen) { 19762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!ok()) 19772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 19782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // NOTE: if future users of PossibleMatchRange want more precision when 19802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // presented with infinitely repeated elements, consider making this a 19812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // parameter to PossibleMatchRange. 19822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson static int kMaxEltRepetitions = 0; 19832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Keep track of the number of times we've visited states previously. We only 19852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // revisit a given state if it's part of a repeated group, so if the value 19862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // portion of the map tuple exceeds kMaxEltRepetitions we bail out and set 19872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // |*max| to |PrefixSuccessor(*max)|. 19882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // 19892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Also note that previously_visited_states[UnseenStatePtr] will, in the STL 19902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // tradition, implicitly insert a '0' value at first use. We take advantage 19912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // of that property below. 19922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson map<State*, int> previously_visited_states; 19932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 19942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Pick out start state for anchored search at beginning of text. 19952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson RWLocker l(&cache_mutex_); 19962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson SearchParams params(NULL, NULL, &l); 19972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson params.anchored = true; 19982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!AnalyzeSearch(¶ms)) 19992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 20002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (params.start == DeadState) { // No matching strings 20012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *min = ""; 20022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *max = ""; 20032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 20042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 20052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (params.start == FullMatchState) // Every string matches: no max 20062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 20072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 20082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The DFA is essentially a big graph rooted at params.start, 20092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // and paths in the graph correspond to accepted strings. 20102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Each node in the graph has potentially 256+1 arrows 20112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // coming out, one for each byte plus the magic end of 20122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // text character kByteEndText. 20132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 20142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // To find the smallest possible prefix of an accepted 20152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // string, we just walk the graph preferring to follow 20162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // arrows with the lowest bytes possible. To find the 20172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // largest possible prefix, we follow the largest bytes 20182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // possible. 20192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 20202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The test for whether there is an arrow from s on byte j is 20212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // ns = RunStateOnByteUnlocked(s, j); 20222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // if (ns == NULL) 20232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // return false; 20242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // if (ns != DeadState && ns->ninst > 0) 20252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The RunStateOnByteUnlocked call asks the DFA to build out the graph. 20262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // It returns NULL only if the DFA has run out of memory, 20272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // in which case we can't be sure of anything. 20282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // The second check sees whether there was graph built 20292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // and whether it is interesting graph. Nodes might have 20302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // ns->ninst == 0 if they exist only to represent the fact 20312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // that a match was found on the previous byte. 20322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 20332ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Build minimum prefix. 20342ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson State* s = params.start; 20352ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson min->clear(); 20362ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < maxlen; i++) { 20372ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (previously_visited_states[s] > kMaxEltRepetitions) { 20382ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson VLOG(2) << "Hit kMaxEltRepetitions=" << kMaxEltRepetitions 20392ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson << " for state s=" << s << " and min=" << CEscape(*min); 20402ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 20412ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 20422ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson previously_visited_states[s]++; 20432ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 20442ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Stop if min is a match. 20452ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson State* ns = RunStateOnByteUnlocked(s, kByteEndText); 20462ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ns == NULL) // DFA out of memory 20472ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 20482ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ns != DeadState && (ns == FullMatchState || ns->IsMatch())) 20492ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 20502ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 20512ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Try to extend the string with low bytes. 20522ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool extended = false; 20532ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int j = 0; j < 256; j++) { 20542ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson ns = RunStateOnByteUnlocked(s, j); 20552ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ns == NULL) // DFA out of memory 20562ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 20572ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ns == FullMatchState || 20582ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (ns > SpecialStateMax && ns->ninst_ > 0)) { 20592ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson extended = true; 20602ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson min->append(1, j); 20612ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s = ns; 20622ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 20632ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 20642ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 20652ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!extended) 20662ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 20672ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 20682ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 20692ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Build maximum prefix. 20702ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson previously_visited_states.clear(); 20712ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s = params.start; 20722ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson max->clear(); 20732ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int i = 0; i < maxlen; i++) { 20742ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (previously_visited_states[s] > kMaxEltRepetitions) { 20752ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson VLOG(2) << "Hit kMaxEltRepetitions=" << kMaxEltRepetitions 20762ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson << " for state s=" << s << " and max=" << CEscape(*max); 20772ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 20782ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 20792ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson previously_visited_states[s] += 1; 20802ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 20812ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Try to extend the string with high bytes. 20822ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson bool extended = false; 20832ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson for (int j = 255; j >= 0; j--) { 20842ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson State* ns = RunStateOnByteUnlocked(s, j); 20852ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ns == NULL) 20862ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 20872ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (ns == FullMatchState || 20882ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson (ns > SpecialStateMax && ns->ninst_ > 0)) { 20892ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson extended = true; 20902ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson max->append(1, j); 20912ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson s = ns; 20922ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson break; 20932ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 20942ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 20952ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (!extended) { 20962ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Done, no need for PrefixSuccessor. 20972ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 20982ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 20992ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 21002ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 21012ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Stopped while still adding to *max - round aaaaaaaaaa... to aaaa...b 21022ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson *max = PrefixSuccessor(*max); 21032ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 21042ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // If there are no bytes left, we have no way to say "there is no maximum 21052ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // string". We could make the interface more complicated and be able to 21062ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // return "there is no maximum but here is a minimum", but that seems like 21072ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // overkill -- the most common no-max case is all possible strings, so not 21082ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // telling the caller that the empty string is the minimum match isn't a 21092ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // great loss. 21102ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (max->empty()) 21112ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return false; 21122ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 21132ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return true; 21142ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 21152ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 21162ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson// PossibleMatchRange for a Prog. 21172ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodsonbool Prog::PossibleMatchRange(string* min, string* max, int maxlen) { 21182ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson DFA* dfa = NULL; 21192ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson { 21202ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson MutexLock l(&dfa_mutex_); 21212ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // Have to use dfa_longest_ to get all strings for full matches. 21222ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson // For example, (a|aa) never matches aa in first-match mode. 21232ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson if (dfa_longest_ == NULL) { 21242ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson dfa_longest_ = new DFA(this, Prog::kLongestMatch, dfa_mem_/2); 21252ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson delete_dfa_ = DeleteDFA; 21262ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 21272ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson dfa = dfa_longest_; 21282ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson } 21292ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson return dfa->PossibleMatchRange(min, max, maxlen); 21302ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} 21312ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson 21322ee91b4af4353b9e6a9d591c32fedfc58fd4ef35Ian Hodson} // namespace re2 2133