15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright 2008 The RE2 Authors. All Rights Reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// license that can be found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// A DFA (deterministic finite automaton)-based regular expression search. 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The DFA search has two main parts: the construction of the automaton, 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// which is represented by a graph of State structures, and the execution 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// of the automaton over a given input string. 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The basic idea is that the State graph is constructed so that the 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// execution can simply start with a state s, and then for each byte c in 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the input string, execute "s = s->next[c]", checking at each point whether 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the current s represents a matching state. 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The simple explanation just given does convey the essence of this code, 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// but it omits the details of how the State graph gets constructed as well 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// as some performance-driven optimizations to the execution of the automaton. 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// All these details are explained in the comments for the code following 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the definition of class DFA. 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// See http://swtch.com/~rsc/regexp/ for a very bare-bones equivalent. 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/prog.h" 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/stringpiece.h" 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "util/atomicops.h" 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "util/flags.h" 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "util/sparse_set.h" 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define NO_THREAD_SAFETY_ANALYSIS 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DEFINE_bool(re2_dfa_bail_when_slow, true, 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "Whether the RE2 DFA should bail out early " 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "if the NFA would be faster (for testing)."); 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace re2 { 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#if !defined(__linux__) /* only Linux seems to have memrchr */ 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static void* memrchr(const void* s, int c, size_t n) { 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const unsigned char* p = (const unsigned char*)s; 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (p += n; n > 0; n--) 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (*--p == c) 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return (void*)p; 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Changing this to true compiles in prints that trace execution of the DFA. 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Generates a lot of output -- only useful for debugging. 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const bool DebugDFA = false; 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// A DFA implementation of a regular expression program. 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Since this is entirely a forward declaration mandated by C++, 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// some of the comments here are better understood after reading 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the comments in the sections that follow the DFA definition. 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class DFA { 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem); 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ~DFA(); 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool ok() const { return !init_failed_; } 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Prog::MatchKind kind() { return kind_; } 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Searches for the regular expression in text, which is considered 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // as a subsection of context for the purposes of interpreting flags 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // like ^ and $ and \A and \z. 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns whether a match was found. 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If a match is found, sets *ep to the end point of the best match in text. 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If "anchored", the match must begin at the start of text. 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If "want_earliest_match", the match that ends first is used, not 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // necessarily the best one. 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If "run_forward" is true, the DFA runs from text.begin() to text.end(). 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If it is false, the DFA runs from text.end() to text.begin(), 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // returning the leftmost end of the match instead of the rightmost one. 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If the DFA cannot complete the search (for example, if it is out of 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // memory), it sets *failed and returns false. 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool Search(const StringPiece& text, const StringPiece& context, 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool anchored, bool want_earliest_match, bool run_forward, 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool* failed, const char** ep, vector<int>* matches); 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Builds out all states for the entire DFA. FOR TESTING ONLY 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns number of states. 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int BuildAllStates(); 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Computes min and max for matching strings. Won't return strings 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // bigger than maxlen. 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool PossibleMatchRange(string* min, string* max, int maxlen); 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // These data structures are logically private, but C++ makes it too 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // difficult to mark them as such. 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) class Workq; 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) class RWLocker; 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) class StateSaver; 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // A single DFA state. The DFA is represented as a graph of these 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // States, linked by the next_ pointers. If in state s and reading 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // byte c, the next state should be s->next_[c]. 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) struct State { 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inline bool IsMatch() const { return flag_ & kFlagMatch; } 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void SaveMatch(vector<int>* v); 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int* inst_; // Instruction pointers in the state. 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int ninst_; // # of inst_ pointers. 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint flag_; // Empty string bitfield flags in effect on the way 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // into this state, along with kFlagMatch if this 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // is a matching state. 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State** next_; // Outgoing arrows from State, 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // one per input byte class 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) }; 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) enum { 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kByteEndText = 256, // imaginary byte at end of text 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kFlagEmptyMask = 0xFFF, // State.flag_: bits holding kEmptyXXX flags 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kFlagMatch = 0x1000, // State.flag_: this is a matching state 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kFlagLastWord = 0x2000, // State.flag_: last byte was a word char 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kFlagNeedShift = 16, // needed kEmpty bits are or'ed in shifted left 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) }; 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef STL_MSVC 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // STL function structures for use with unordered_set. 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) struct StateEqual { 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool operator()(const State* a, const State* b) const { 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (a == b) 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (a == NULL || b == NULL) 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (a->ninst_ != b->ninst_) 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (a->flag_ != b->flag_) 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < a->ninst_; i++) 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (a->inst_[i] != b->inst_[i]) 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; // they're equal 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) }; 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif // STL_MSVC 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) struct StateHash { 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t operator()(const State* a) const { 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (a == NULL) 1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return 0; 1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* s = reinterpret_cast<const char*>(a->inst_); 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int len = a->ninst_ * sizeof a->inst_[0]; 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (sizeof(size_t) == sizeof(uint32)) 1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Hash32StringWithSeed(s, len, a->flag_); 1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return Hash64StringWithSeed(s, len, a->flag_); 1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifdef STL_MSVC 1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Less than operator. 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool operator()(const State* a, const State* b) const { 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (a == b) 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (a == NULL || b == NULL) 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return a == NULL; 1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (a->ninst_ != b->ninst_) 1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return a->ninst_ < b->ninst_; 1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (a->flag_ != b->flag_) 1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return a->flag_ < b->flag_; 1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < a->ninst_; ++i) 1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (a->inst_[i] != b->inst_[i]) 1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return a->inst_[i] < b->inst_[i]; 1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; // they're equal 1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The two public members are required by msvc. 4 and 8 are default values. 1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Reference: http://msdn.microsoft.com/en-us/library/1s1byw77.aspx 1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static const size_t bucket_size = 4; 1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static const size_t min_buckets = 8; 1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif // STL_MSVC 1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) }; 1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifdef STL_MSVC 1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) typedef unordered_set<State*, StateHash> StateSet; 1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#else // !STL_MSVC 1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) typedef unordered_set<State*, StateHash, StateEqual> StateSet; 1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif // STL_MSVC 1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Special "firstbyte" values for a state. (Values >= 0 denote actual bytes.) 1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) enum { 1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kFbUnknown = -1, // No analysis has been performed. 1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kFbMany = -2, // Many bytes will lead out of this state. 1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kFbNone = -3, // No bytes lead out of this state. 1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) }; 1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) enum { 1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Indices into start_ for unanchored searches. 1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Add kStartAnchored for anchored searches. 1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kStartBeginText = 0, // text at beginning of context 1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kStartBeginLine = 2, // text at beginning of line 1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kStartAfterWordChar = 4, // text follows a word character 1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kStartAfterNonWordChar = 6, // text follows non-word character 1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kMaxStart = 8, 1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kStartAnchored = 1, 1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) }; 1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Resets the DFA State cache, flushing all saved State* information. 2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Releases and reacquires cache_mutex_ via cache_lock, so any 2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // State* existing before the call are not valid after the call. 2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Use a StateSaver to preserve important states across the call. 2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // cache_mutex_.r <= L < mutex_ 2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // After: cache_mutex_.w <= L < mutex_ 2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void ResetCache(RWLocker* cache_lock); 2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Looks up and returns the State corresponding to a Workq. 2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // L >= mutex_ 2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State* WorkqToCachedState(Workq* q, uint flag); 2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Looks up and returns a State matching the inst, ninst, and flag. 2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // L >= mutex_ 2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State* CachedState(int* inst, int ninst, uint flag); 2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Clear the cache entirely. 2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Must hold cache_mutex_.w or be in destructor. 2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void ClearCache(); 2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Converts a State into a Workq: the opposite of WorkqToCachedState. 2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // L >= mutex_ 2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static void StateToWorkq(State* s, Workq* q); 2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Runs a State on a given byte, returning the next state. 2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State* RunStateOnByteUnlocked(State*, int); // cache_mutex_.r <= L < mutex_ 2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State* RunStateOnByte(State*, int); // L >= mutex_ 2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Runs a Workq on a given byte followed by a set of empty-string flags, 2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // producing a new Workq in nq. If a match instruction is encountered, 2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // sets *ismatch to true. 2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // L >= mutex_ 2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void RunWorkqOnByte(Workq* q, Workq* nq, 2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int c, uint flag, bool* ismatch, 2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Prog::MatchKind kind, 2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int new_byte_loop); 2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Runs a Workq on a set of empty-string flags, producing a new Workq in nq. 2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // L >= mutex_ 2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void RunWorkqOnEmptyString(Workq* q, Workq* nq, uint flag); 2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Adds the instruction id to the Workq, following empty arrows 2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // according to flag. 2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // L >= mutex_ 2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void AddToQueue(Workq* q, int id, uint flag); 2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // For debugging, returns a text representation of State. 2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static string DumpState(State* state); 2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // For debugging, returns a text representation of a Workq. 2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static string DumpWorkq(Workq* q); 2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Search parameters 2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) struct SearchParams { 2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) SearchParams(const StringPiece& text, const StringPiece& context, 2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RWLocker* cache_lock) 2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : text(text), context(context), 2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) anchored(false), 2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) want_earliest_match(false), 2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) run_forward(false), 2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start(NULL), 2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) firstbyte(kFbUnknown), 2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cache_lock(cache_lock), 2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) failed(false), 2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ep(NULL), 2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) matches(NULL) { } 2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece text; 2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece context; 2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool anchored; 2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool want_earliest_match; 2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool run_forward; 2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State* start; 2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int firstbyte; 2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RWLocker *cache_lock; 2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool failed; // "out" parameter: whether search gave up 2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* ep; // "out" parameter: end pointer for match 2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) vector<int>* matches; 2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DISALLOW_EVIL_CONSTRUCTORS(SearchParams); 2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) }; 2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Before each search, the parameters to Search are analyzed by 2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // AnalyzeSearch to determine the state in which to start and the 2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // "firstbyte" for that state, if any. 2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) struct StartInfo { 2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StartInfo() : start(NULL), firstbyte(kFbUnknown) { } 2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State* start; 2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) volatile int firstbyte; 2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) }; 2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Fills in params->start and params->firstbyte using 2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the other search parameters. Returns true on success, 2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // false on failure. 2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // cache_mutex_.r <= L < mutex_ 2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool AnalyzeSearch(SearchParams* params); 2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool AnalyzeSearchHelper(SearchParams* params, StartInfo* info, uint flags); 2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The generic search loop, inlined to create specialized versions. 3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // cache_mutex_.r <= L < mutex_ 3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Might unlock and relock cache_mutex_ via params->cache_lock. 3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inline bool InlinedSearchLoop(SearchParams* params, 3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool have_firstbyte, 3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool want_earliest_match, 3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool run_forward); 3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The specialized versions of InlinedSearchLoop. The three letters 3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // at the ends of the name denote the true/false values used as the 3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // last three parameters of InlinedSearchLoop. 3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // cache_mutex_.r <= L < mutex_ 3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Might unlock and relock cache_mutex_ via params->cache_lock. 3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool SearchFFF(SearchParams* params); 3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool SearchFFT(SearchParams* params); 3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool SearchFTF(SearchParams* params); 3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool SearchFTT(SearchParams* params); 3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool SearchTFF(SearchParams* params); 3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool SearchTFT(SearchParams* params); 3185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool SearchTTF(SearchParams* params); 3195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool SearchTTT(SearchParams* params); 3205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The main search loop: calls an appropriate specialized version of 3225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // InlinedSearchLoop. 3235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // cache_mutex_.r <= L < mutex_ 3245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Might unlock and relock cache_mutex_ via params->cache_lock. 3255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool FastSearchLoop(SearchParams* params); 3265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // For debugging, a slow search loop that calls InlinedSearchLoop 3285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // directly -- because the booleans passed are not constants, the 3295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // loop is not specialized like the SearchFFF etc. versions, so it 3305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // runs much more slowly. Useful only for debugging. 3315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // cache_mutex_.r <= L < mutex_ 3325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Might unlock and relock cache_mutex_ via params->cache_lock. 3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool SlowSearchLoop(SearchParams* params); 3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Looks up bytes in bytemap_ but handles case c == kByteEndText too. 3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int ByteMap(int c) { 3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (c == kByteEndText) 3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return prog_->bytemap_range(); 3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return prog_->bytemap()[c]; 3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Constant after initialization. 3435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Prog* prog_; // The regular expression program to run. 3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Prog::MatchKind kind_; // The kind of DFA. 3455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int start_unanchored_; // start of unanchored program 3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool init_failed_; // initialization failed (out of memory) 3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Mutex mutex_; // mutex_ >= cache_mutex_.r 3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Scratch areas, protected by mutex_. 3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Workq* q0_; // Two pre-allocated work queues. 3525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Workq* q1_; 3535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int* astack_; // Pre-allocated stack for AddToQueue 3545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int nastack_; 3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // State* cache. Many threads use and add to the cache simultaneously, 3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // holding cache_mutex_ for reading and mutex_ (above) when adding. 3585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If the cache fills and needs to be discarded, the discarding is done 3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // while holding cache_mutex_ for writing, to avoid interrupting other 3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // readers. Any State* pointers are only valid while cache_mutex_ 3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // is held. 3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Mutex cache_mutex_; 3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int64 mem_budget_; // Total memory budget for all States. 3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int64 state_budget_; // Amount of memory remaining for new States. 3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StateSet state_cache_; // All States computed so far. 3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StartInfo start_[kMaxStart]; 3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool cache_warned_; // have printed to LOG(INFO) about the cache 3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Shorthand for casting to uint8*. 3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static inline const uint8* BytePtr(const void* v) { 3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return reinterpret_cast<const uint8*>(v); 3735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Work queues 3765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Marks separate thread groups of different priority 3785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// in the work queue when in leftmost-longest matching mode. 3795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define Mark (-1) 3805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Internally, the DFA uses a sparse array of 3825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// program instruction pointers as a work queue. 3835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// In leftmost longest mode, marks separate sections 3845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// of workq that started executing at different 3855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// locations in the string (earlier locations first). 3865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class DFA::Workq : public SparseSet { 3875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 3885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Constructor: n is number of normal slots, maxmark number of mark slots. 3895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Workq(int n, int maxmark) : 3905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) SparseSet(n+maxmark), 3915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) n_(n), 3925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) maxmark_(maxmark), 3935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nextmark_(n), 3945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) last_was_mark_(true) { 3955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool is_mark(int i) { return i >= n_; } 3985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int maxmark() { return maxmark_; } 4005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void clear() { 4025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) SparseSet::clear(); 4035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nextmark_ = n_; 4045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void mark() { 4075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (last_was_mark_) 4085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 4095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) last_was_mark_ = false; 4105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) SparseSet::insert_new(nextmark_++); 4115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int size() { 4145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return n_ + maxmark_; 4155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void insert(int id) { 4185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (contains(id)) 4195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 4205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) insert_new(id); 4215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void insert_new(int id) { 4245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) last_was_mark_ = false; 4255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) SparseSet::insert_new(id); 4265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 4295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int n_; // size excluding marks 4305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int maxmark_; // maximum number of marks 4315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int nextmark_; // id of next mark 4325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool last_was_mark_; // last inserted was mark 4335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DISALLOW_EVIL_CONSTRUCTORS(Workq); 4345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 4355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DFA::DFA(Prog* prog, Prog::MatchKind kind, int64 max_mem) 4375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : prog_(prog), 4385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kind_(kind), 4395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) init_failed_(false), 4405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) q0_(NULL), 4415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) q1_(NULL), 4425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) astack_(NULL), 4435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) mem_budget_(max_mem), 4445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cache_warned_(false) { 4455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (DebugDFA) 4465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fprintf(stderr, "\nkind %d\n%s\n", (int)kind_, prog_->DumpUnanchored().c_str()); 4475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int nmark = 0; 4485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start_unanchored_ = 0; 4495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (kind_ == Prog::kLongestMatch) { 4505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nmark = prog->size(); 4515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start_unanchored_ = prog->start_unanchored(); 4525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nastack_ = 2 * prog->size() + nmark; 4545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Account for space needed for DFA, q0, q1, astack. 4565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) mem_budget_ -= sizeof(DFA); 4575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) mem_budget_ -= (prog_->size() + nmark) * 4585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (sizeof(int)+sizeof(int)) * 2; // q0, q1 4595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) mem_budget_ -= nastack_ * sizeof(int); // astack 4605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (mem_budget_ < 0) { 4615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(INFO) << StringPrintf("DFA out of memory: prog size %lld mem %lld", 4625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) prog_->size(), max_mem); 4635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) init_failed_ = true; 4645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 4655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) state_budget_ = mem_budget_; 4685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Make sure there is a reasonable amount of working room left. 4705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // At minimum, the search requires room for two states in order 4715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // to limp along, restarting frequently. We'll get better performance 4725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // if there is room for a larger number of states, say 20. 4732a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) int64 one_state = sizeof(State) + (prog_->size()+nmark)*sizeof(int) + 4742a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles) (prog_->bytemap_range()+1)*sizeof(State*); 4755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (state_budget_ < 20*one_state) { 4765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(INFO) << StringPrintf("DFA out of memory: prog size %lld mem %lld", 4775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) prog_->size(), max_mem); 4785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) init_failed_ = true; 4795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 4805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) q0_ = new Workq(prog->size(), nmark); 4835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) q1_ = new Workq(prog->size(), nmark); 4845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) astack_ = new int[nastack_]; 4855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DFA::~DFA() { 4885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete q0_; 4895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete q1_; 4905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete[] astack_; 4915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ClearCache(); 4925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 4935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// In the DFA state graph, s->next[c] == NULL means that the 4955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// state has not yet been computed and needs to be. We need 4965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// a different special value to signal that s->next[c] is a 4975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// state that can never lead to a match (and thus the search 4985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// can be called off). Hence DeadState. 4995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define DeadState reinterpret_cast<State*>(1) 5005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Signals that the rest of the string matches no matter what it is. 5025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define FullMatchState reinterpret_cast<State*>(2) 5035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define SpecialStateMax FullMatchState 5055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Debugging printouts 5075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For debugging, returns a string representation of the work queue. 5095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)string DFA::DumpWorkq(Workq* q) { 5105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) string s; 5115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* sep = ""; 5125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (DFA::Workq::iterator it = q->begin(); it != q->end(); ++it) { 5135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (q->is_mark(*it)) { 5145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringAppendF(&s, "|"); 5155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sep = ""; 5165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 5175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringAppendF(&s, "%s%d", sep, *it); 5185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sep = ","; 5195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return s; 5225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 5235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For debugging, returns a string representation of the state. 5255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)string DFA::DumpState(State* state) { 5265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (state == NULL) 5275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return "_"; 5285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (state == DeadState) 5295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return "X"; 5305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (state == FullMatchState) 5315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return "*"; 5325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) string s; 5335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* sep = ""; 5345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringAppendF(&s, "(%p)", state); 5355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < state->ninst_; i++) { 5365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (state->inst_[i] == Mark) { 5375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringAppendF(&s, "|"); 5385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sep = ""; 5395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 5405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringAppendF(&s, "%s%d", sep, state->inst_[i]); 5415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sep = ","; 5425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringAppendF(&s, " flag=%#x", state->flag_); 5455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return s; 5465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 5475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)////////////////////////////////////////////////////////////////////// 5495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 5505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// DFA state graph construction. 5515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 5525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The DFA state graph is a heavily-linked collection of State* structures. 5535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The state_cache_ is a set of all the State structures ever allocated, 5545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// so that if the same state is reached by two different paths, 5555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the same State structure can be used. This reduces allocation 5565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// requirements and also avoids duplication of effort across the two 5575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// identical states. 5585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 5595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// A State is defined by an ordered list of instruction ids and a flag word. 5605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 5615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The choice of an ordered list of instructions differs from a typical 5625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// textbook DFA implementation, which would use an unordered set. 5635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Textbook descriptions, however, only care about whether 5645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the DFA matches, not where it matches in the text. To decide where the 5655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// DFA matches, we need to mimic the behavior of the dominant backtracking 5665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// implementations like PCRE, which try one possible regular expression 5675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// execution, then another, then another, stopping when one of them succeeds. 5685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The DFA execution tries these many executions in parallel, representing 5695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// each by an instruction id. These pointers are ordered in the State.inst_ 5705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// list in the same order that the executions would happen in a backtracking 5715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// search: if a match is found during execution of inst_[2], inst_[i] for i>=3 5725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// can be discarded. 5735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 5745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Textbooks also typically do not consider context-aware empty string operators 5755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// like ^ or $. These are handled by the flag word, which specifies the set 5765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// of empty-string operators that should be matched when executing at the 5775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// current text position. These flag bits are defined in prog.h. 5785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The flag word also contains two DFA-specific bits: kFlagMatch if the state 5795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// is a matching state (one that reached a kInstMatch in the program) 5805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and kFlagLastWord if the last processed byte was a word character, for the 5815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// implementation of \B and \b. 5825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 5835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The flag word also contains, shifted up 16 bits, the bits looked for by 5845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// any kInstEmptyWidth instructions in the state. These provide a useful 5855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// summary indicating when new flags might be useful. 5865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 5875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The permanent representation of a State's instruction ids is just an array, 5885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// but while a state is being analyzed, these instruction ids are represented 5895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// as a Workq, which is an array that allows iteration in insertion order. 5905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// NOTE(rsc): The choice of State construction determines whether the DFA 5925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// mimics backtracking implementations (so-called leftmost first matching) or 5935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// traditional DFA implementations (so-called leftmost longest matching as 5945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// prescribed by POSIX). This implementation chooses to mimic the 5955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// backtracking implementations, because we want to replace PCRE. To get 5965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// POSIX behavior, the states would need to be considered not as a simple 5975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ordered list of instruction ids, but as a list of unordered sets of instruction 5985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ids. A match by a state in one set would inhibit the running of sets 5995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// farther down the list but not other instruction ids in the same set. Each 6005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// set would correspond to matches beginning at a given point in the string. 6015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This is implemented by separating different sets with Mark pointers. 6025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Looks in the State cache for a State matching q, flag. 6045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If one is found, returns it. If one is not found, allocates one, 6055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// inserts it in the cache, and returns it. 6065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DFA::State* DFA::WorkqToCachedState(Workq* q, uint flag) { 6075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (DEBUG_MODE) 6085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) mutex_.AssertHeld(); 6095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Construct array of instruction ids for the new state. 6115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Only ByteRange, EmptyWidth, and Match instructions are useful to keep: 6125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // those are the only operators with any effect in 6135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // RunWorkqOnEmptyString or RunWorkqOnByte. 6145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int* inst = new int[q->size()]; 6155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int n = 0; 6165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint needflags = 0; // flags needed by kInstEmptyWidth instructions 6175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool sawmatch = false; // whether queue contains guaranteed kInstMatch 6185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool sawmark = false; // whether queue contains a Mark 6195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (DebugDFA) 6205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fprintf(stderr, "WorkqToCachedState %s [%#x]", DumpWorkq(q).c_str(), flag); 6215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (Workq::iterator it = q->begin(); it != q->end(); ++it) { 6225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int id = *it; 6235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (sawmatch && (kind_ == Prog::kFirstMatch || q->is_mark(id))) 6245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 6255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (q->is_mark(id)) { 6265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (n > 0 && inst[n-1] != Mark) { 6275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sawmark = true; 6285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst[n++] = Mark; 6295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 6315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Prog::Inst* ip = prog_->inst(id); 6335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (ip->opcode()) { 6345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstAltMatch: 6355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // This state will continue to a match no matter what 6365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the rest of the input is. If it is the highest priority match 6375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // being considered, return the special FullMatchState 6385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // to indicate that it's all matches from here out. 6395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (kind_ != Prog::kManyMatch && 6405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (kind_ != Prog::kFirstMatch || 6415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (it == q->begin() && ip->greedy(prog_))) && 6425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (kind_ != Prog::kLongestMatch || !sawmark) && 6435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (flag & kFlagMatch)) { 6445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete[] inst; 6455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (DebugDFA) 6465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fprintf(stderr, " -> FullMatchState\n"); 6475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return FullMatchState; 6485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Fall through. 6505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstByteRange: // These are useful. 6515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstEmptyWidth: 6525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstMatch: 6535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstAlt: // Not useful, but necessary [*] 6545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst[n++] = *it; 6555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ip->opcode() == kInstEmptyWidth) 6565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) needflags |= ip->empty(); 6575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ip->opcode() == kInstMatch && !prog_->anchor_end()) 6585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sawmatch = true; 6595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 6605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) default: // The rest are not. 6625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 6635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // [*] kInstAlt would seem useless to record in a state, since 6665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // we've already followed both its arrows and saved all the 6675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // interesting states we can reach from there. The problem 6685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // is that one of the empty-width instructions might lead 6695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // back to the same kInstAlt (if an empty-width operator is starred), 6705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // producing a different evaluation order depending on whether 6715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // we keep the kInstAlt to begin with. Sigh. 6725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // A specific case that this affects is /(^|a)+/ matching "a". 6735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If we don't save the kInstAlt, we will match the whole "a" (0,1) 6745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // but in fact the correct leftmost-first match is the leading "" (0,0). 6755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 6765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK_LE(n, q->size()); 6775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (n > 0 && inst[n-1] == Mark) 6785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) n--; 6795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If there are no empty-width instructions waiting to execute, 6815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // then the extra flag bits will not be used, so there is no 6825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // point in saving them. (Discarding them reduces the number 6835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // of distinct states.) 6845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (needflags == 0) 6855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) flag &= kFlagMatch; 6865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // NOTE(rsc): The code above cannot do flag &= needflags, 6885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // because if the right flags were present to pass the current 6895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // kInstEmptyWidth instructions, new kInstEmptyWidth instructions 6905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // might be reached that in turn need different flags. 6915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The only sure thing is that if there are no kInstEmptyWidth 6925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // instructions at all, no flags will be needed. 6935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We could do the extra work to figure out the full set of 6945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // possibly needed flags by exploring past the kInstEmptyWidth 6955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // instructions, but the check above -- are any flags needed 6965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // at all? -- handles the most common case. More fine-grained 6975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // analysis can only be justified by measurements showing that 6985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // too many redundant states are being allocated. 6995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If there are no Insts in the list, it's a dead state, 7015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // which is useful to signal with a special pointer so that 7025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the execution loop can stop early. This is only okay 7035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // if the state is *not* a matching state. 7045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (n == 0 && flag == 0) { 7055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete[] inst; 7065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (DebugDFA) 7075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fprintf(stderr, " -> DeadState\n"); 7085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return DeadState; 7095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If we're in longest match mode, the state is a sequence of 7125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // unordered state sets separated by Marks. Sort each set 7135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // to canonicalize, to reduce the number of distinct sets stored. 7145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (kind_ == Prog::kLongestMatch) { 7155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int* ip = inst; 7165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int* ep = ip + n; 7175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (ip < ep) { 7185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int* markp = ip; 7195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (markp < ep && *markp != Mark) 7205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) markp++; 7215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) sort(ip, markp); 7225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (markp < ep) 7235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) markp++; 7245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ip = markp; 7255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Save the needed empty-width flags in the top bits for use later. 7295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) flag |= needflags << kFlagNeedShift; 7305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State* state = CachedState(inst, n, flag); 7325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete[] inst; 7335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return state; 7345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 7355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Looks in the State cache for a State matching inst, ninst, flag. 7375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If one is found, returns it. If one is not found, allocates one, 7385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// inserts it in the cache, and returns it. 7395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DFA::State* DFA::CachedState(int* inst, int ninst, uint flag) { 7405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (DEBUG_MODE) 7415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) mutex_.AssertHeld(); 7425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Look in the cache for a pre-existing state. 7445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State state = { inst, ninst, flag, NULL }; 7455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StateSet::iterator it = state_cache_.find(&state); 7465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (it != state_cache_.end()) { 7475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (DebugDFA) 7485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fprintf(stderr, " -cached-> %s\n", DumpState(*it).c_str()); 7495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return *it; 7505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Must have enough memory for new state. 7535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // In addition to what we're going to allocate, 7545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the state cache hash table seems to incur about 32 bytes per 7555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // State*, empirically. 7565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const int kStateCacheOverhead = 32; 7575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int nnext = prog_->bytemap_range() + 1; // + 1 for kByteEndText slot 7585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int mem = sizeof(State) + nnext*sizeof(State*) + ninst*sizeof(int); 7595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (mem_budget_ < mem + kStateCacheOverhead) { 7605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) mem_budget_ = -1; 7615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 7625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 7635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) mem_budget_ -= mem + kStateCacheOverhead; 7645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Allocate new state, along with room for next and inst. 7665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) char* space = new char[mem]; 7675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State* s = reinterpret_cast<State*>(space); 7685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s->next_ = reinterpret_cast<State**>(s + 1); 7695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s->inst_ = reinterpret_cast<int*>(s->next_ + nnext); 7705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) memset(s->next_, 0, nnext*sizeof s->next_[0]); 7715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) memmove(s->inst_, inst, ninst*sizeof s->inst_[0]); 7725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s->ninst_ = ninst; 7735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s->flag_ = flag; 7745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (DebugDFA) 7755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fprintf(stderr, " -> %s\n", DumpState(s).c_str()); 7765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Put state in cache and return it. 7785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) state_cache_.insert(s); 7795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return s; 7805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 7815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Clear the cache. Must hold cache_mutex_.w or be in destructor. 7835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void DFA::ClearCache() { 7845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // In case state_cache_ doesn't support deleting entries 7855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // during iteration, copy into a vector and then delete. 7865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) vector<State*> v; 7875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) v.reserve(state_cache_.size()); 7885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (StateSet::iterator it = state_cache_.begin(); 7895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) it != state_cache_.end(); ++it) 7905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) v.push_back(*it); 7915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) state_cache_.clear(); 7925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < v.size(); i++) 7935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete[] reinterpret_cast<const char*>(v[i]); 7945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 7955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 7965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copies insts in state s to the work queue q. 7975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void DFA::StateToWorkq(State* s, Workq* q) { 7985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) q->clear(); 7995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < s->ninst_; i++) { 8005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s->inst_[i] == Mark) 8015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) q->mark(); 8025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 8035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) q->insert_new(s->inst_[i]); 8045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 8065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Adds ip to the work queue, following empty arrows according to flag 8085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and expanding kInstAlt instructions (two-target gotos). 8095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void DFA::AddToQueue(Workq* q, int id, uint flag) { 8105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Use astack_ to hold our stack of states yet to process. 8125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // It is sized to have room for nastack_ == 2*prog->size() + nmark 8135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // instructions, which is enough: each instruction can be 8145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // processed by the switch below only once, and the processing 8155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // pushes at most two instructions plus maybe a mark. 8165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (If we're using marks, nmark == prog->size(); otherwise nmark == 0.) 8175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int* stk = astack_; 8185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int nstk = 0; 8195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stk[nstk++] = id; 8215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (nstk > 0) { 8225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK_LE(nstk, nastack_); 8235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) id = stk[--nstk]; 8245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (id == Mark) { 8265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) q->mark(); 8275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 8285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (id == 0) 8315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 8325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If ip is already on the queue, nothing to do. 8345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Otherwise add it. We don't actually keep all the ones 8355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // that get added -- for example, kInstAlt is ignored 8365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // when on a work queue -- but adding all ip's here 8375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // increases the likelihood of q->contains(id), 8385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // reducing the amount of duplicated work. 8395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (q->contains(id)) 8405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 8415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) q->insert_new(id); 8425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Process instruction. 8445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Prog::Inst* ip = prog_->inst(id); 8455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (ip->opcode()) { 8465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstFail: // can't happen: discarded above 8475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 8485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstByteRange: // just save these on the queue 8505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstMatch: 8515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 8525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstCapture: // DFA treats captures as no-ops. 8545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstNop: 8555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stk[nstk++] = ip->out(); 8565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 8575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstAlt: // two choices: expand both, in order 8595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstAltMatch: 8605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Want to visit out then out1, so push on stack in reverse order. 8615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // This instruction is the [00-FF]* loop at the beginning of 8625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // a leftmost-longest unanchored search, separate out from out1 8635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // with a Mark, so that out1's threads (which will start farther 8645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // to the right in the string being searched) are lower priority 8655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // than the current ones. 8665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stk[nstk++] = ip->out1(); 8675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (q->maxmark() > 0 && 8685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) id == prog_->start_unanchored() && id != prog_->start()) 8695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stk[nstk++] = Mark; 8705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stk[nstk++] = ip->out(); 8715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 8725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstEmptyWidth: 8745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((ip->empty() & flag) == ip->empty()) 8755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stk[nstk++] = ip->out(); 8765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 8775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 8795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 8805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Running of work queues. In the work queue, order matters: 8825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the queue is sorted in priority order. If instruction i comes before j, 8835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// then the instructions that i produces during the run must come before 8845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the ones that j produces. In order to keep this invariant, all the 8855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// work queue runners have to take an old queue to process and then 8865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// also a new queue to fill in. It's not acceptable to add to the end of 8875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// an existing queue, because new instructions will not end up in the 8885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// correct position. 8895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 8905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Runs the work queue, processing the empty strings indicated by flag. 8915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For example, flag == kEmptyBeginLine|kEmptyEndLine means to match 8925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// both ^ and $. It is important that callers pass all flags at once: 8935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// processing both ^ and $ is not the same as first processing only ^ 8945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and then processing only $. Doing the two-step sequence won't match 8955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ^$^$^$ but processing ^ and $ simultaneously will (and is the behavior 8965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// exhibited by existing implementations). 8975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void DFA::RunWorkqOnEmptyString(Workq* oldq, Workq* newq, uint flag) { 8985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) newq->clear(); 8995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) { 9005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (oldq->is_mark(*i)) 9015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddToQueue(newq, Mark, flag); 9025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 9035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddToQueue(newq, *i, flag); 9045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 9065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Runs the work queue, processing the single byte c followed by any empty 9085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// strings indicated by flag. For example, c == 'a' and flag == kEmptyEndLine, 9095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// means to match c$. Sets the bool *ismatch to true if the end of the 9105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// regular expression program has been reached (the regexp has matched). 9115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void DFA::RunWorkqOnByte(Workq* oldq, Workq* newq, 9125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int c, uint flag, bool* ismatch, 9135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Prog::MatchKind kind, 9145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int new_byte_loop) { 9155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (DEBUG_MODE) 9165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) mutex_.AssertHeld(); 9175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) newq->clear(); 9195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (Workq::iterator i = oldq->begin(); i != oldq->end(); ++i) { 9205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (oldq->is_mark(*i)) { 9215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (*ismatch) 9225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 9235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) newq->mark(); 9245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 9255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int id = *i; 9275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Prog::Inst* ip = prog_->inst(id); 9285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (ip->opcode()) { 9295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstFail: // never succeeds 9305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstCapture: // already followed 9315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstNop: // already followed 9325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstAlt: // already followed 9335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstAltMatch: // already followed 9345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstEmptyWidth: // already followed 9355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 9365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstByteRange: // can follow if c is in range 9385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ip->Matches(c)) 9395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddToQueue(newq, ip->out(), flag); 9405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 9415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstMatch: 9435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (prog_->anchor_end() && c != kByteEndText) 9445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 9455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *ismatch = true; 9465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (kind == Prog::kFirstMatch) { 9475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Can stop processing work queue since we found a match. 9485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 9495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 9515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (DebugDFA) 9555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fprintf(stderr, "%s on %d[%#x] -> %s [%d]\n", DumpWorkq(oldq).c_str(), 9565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c, flag, DumpWorkq(newq).c_str(), *ismatch); 9575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 9585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Processes input byte c in state, returning new state. 9605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Caller does not hold mutex. 9615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DFA::State* DFA::RunStateOnByteUnlocked(State* state, int c) { 9625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Keep only one RunStateOnByte going 9635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // even if the DFA is being run by multiple threads. 9645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) MutexLock l(&mutex_); 9655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return RunStateOnByte(state, c); 9665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 9675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Processes input byte c in state, returning new state. 9695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DFA::State* DFA::RunStateOnByte(State* state, int c) { 9705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (DEBUG_MODE) 9715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) mutex_.AssertHeld(); 9725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (state <= SpecialStateMax) { 9735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (state == FullMatchState) { 9745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // It is convenient for routines like PossibleMatchRange 9755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // if we implement RunStateOnByte for FullMatchState: 9765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // once you get into this state you never get out, 9775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // so it's pretty easy. 9785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return FullMatchState; 9795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (state == DeadState) { 9815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(DFATAL) << "DeadState in RunStateOnByte"; 9825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 9835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (state == NULL) { 9855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(DFATAL) << "NULL state in RunStateOnByte"; 9865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 9875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(DFATAL) << "Unexpected special state in RunStateOnByte"; 9895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return NULL; 9905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 9915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If someone else already computed this, return it. 9935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) MaybeReadMemoryBarrier(); // On alpha we need to ensure read ordering 9945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State* ns = state->next_[ByteMap(c)]; 9955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ANNOTATE_HAPPENS_AFTER(ns); 9965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ns != NULL) 9975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return ns; 9985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 9995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Convert state into Workq. 10005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StateToWorkq(state, q0_); 10015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Flags marking the kinds of empty-width things (^ $ etc) 10035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // around this byte. Before the byte we have the flags recorded 10045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // in the State structure itself. After the byte we have 10055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // nothing yet (but that will change: read on). 10065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint needflag = state->flag_ >> kFlagNeedShift; 10075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint beforeflag = state->flag_ & kFlagEmptyMask; 10085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint oldbeforeflag = beforeflag; 10095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint afterflag = 0; 10105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (c == '\n') { 10125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Insert implicit $ and ^ around \n 10135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) beforeflag |= kEmptyEndLine; 10145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) afterflag |= kEmptyBeginLine; 10155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 10165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (c == kByteEndText) { 10185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Insert implicit $ and \z before the fake "end text" byte. 10195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) beforeflag |= kEmptyEndLine | kEmptyEndText; 10205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 10215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The state flag kFlagLastWord says whether the last 10235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // byte processed was a word character. Use that info to 10245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // insert empty-width (non-)word boundaries. 10255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool islastword = state->flag_ & kFlagLastWord; 10265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool isword = (c != kByteEndText && Prog::IsWordChar(c)); 10275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (isword == islastword) 10285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) beforeflag |= kEmptyNonWordBoundary; 10295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 10305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) beforeflag |= kEmptyWordBoundary; 10315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Okay, finally ready to run. 10335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Only useful to rerun on empty string if there are new, useful flags. 10345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (beforeflag & ~oldbeforeflag & needflag) { 10355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RunWorkqOnEmptyString(q0_, q1_, beforeflag); 10365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) swap(q0_, q1_); 10375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 10385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool ismatch = false; 10395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RunWorkqOnByte(q0_, q1_, c, afterflag, &ismatch, kind_, start_unanchored_); 10405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Most of the time, we build the state from the output of 10425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // RunWorkqOnByte, so swap q0_ and q1_ here. However, so that 10435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // RE2::Set can tell exactly which match instructions 10445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // contributed to the match, don't swap if c is kByteEndText. 10455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The resulting state wouldn't be correct for further processing 10465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // of the string, but we're at the end of the text so that's okay. 10475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Leaving q0_ alone preseves the match instructions that led to 10485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // the current setting of ismatch. 10495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (c != kByteEndText || kind_ != Prog::kManyMatch) 10505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) swap(q0_, q1_); 10515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Save afterflag along with ismatch and isword in new state. 10535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint flag = afterflag; 10545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ismatch) 10555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) flag |= kFlagMatch; 10565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (isword) 10575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) flag |= kFlagLastWord; 10585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ns = WorkqToCachedState(q0_, flag); 10605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Write barrier before updating state->next_ so that the 10625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // main search loop can proceed without any locking, for speed. 10635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (Otherwise it would need one mutex operation per input byte.) 10645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The annotations below tell race detectors that: 10655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // a) the access to next_ should be ignored, 10665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // b) 'ns' is properly published. 10675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) WriteMemoryBarrier(); // Flush ns before linking to it. 10685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ANNOTATE_IGNORE_WRITES_BEGIN(); 10705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ANNOTATE_HAPPENS_BEFORE(ns); 10715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) state->next_[ByteMap(c)] = ns; 10725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ANNOTATE_IGNORE_WRITES_END(); 10735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return ns; 10745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 10755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)////////////////////////////////////////////////////////////////////// 10785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// DFA cache reset. 10795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Reader-writer lock helper. 10815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 10825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The DFA uses a reader-writer mutex to protect the state graph itself. 10835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Traversing the state graph requires holding the mutex for reading, 10845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and discarding the state graph and starting over requires holding the 10855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// lock for writing. If a search needs to expand the graph but is out 10865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// of memory, it will need to drop its read lock and then acquire the 10875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// write lock. Since it cannot then atomically downgrade from write lock 10885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// to read lock, it runs the rest of the search holding the write lock. 10895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (This probably helps avoid repeated contention, but really the decision 10905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// is forced by the Mutex interface.) It's a bit complicated to keep 10915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// track of whether the lock is held for reading or writing and thread 10925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// that through the search, so instead we encapsulate it in the RWLocker 10935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and pass that around. 10945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 10955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class DFA::RWLocker { 10965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 10975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) explicit RWLocker(Mutex* mu); 10985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ~RWLocker(); 10995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If the lock is only held for reading right now, 11015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // drop the read lock and re-acquire for writing. 11025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Subsequent calls to LockForWriting are no-ops. 11035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Notice that the lock is *released* temporarily. 11045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void LockForWriting(); 11055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns whether the lock is already held for writing. 11075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool IsLockedForWriting() { 11085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return writing_; 11095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 11105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 11125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Mutex* mu_; 11135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool writing_; 11145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DISALLOW_EVIL_CONSTRUCTORS(RWLocker); 11165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 11175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DFA::RWLocker::RWLocker(Mutex* mu) 11195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : mu_(mu), writing_(false) { 11205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) mu_->ReaderLock(); 11225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 11235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This function is marked as NO_THREAD_SAFETY_ANALYSIS because the annotations 11255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// does not support lock upgrade. 11265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void DFA::RWLocker::LockForWriting() NO_THREAD_SAFETY_ANALYSIS { 11275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!writing_) { 11285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) mu_->ReaderUnlock(); 11295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) mu_->Lock(); 11305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) writing_ = true; 11315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 11325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 11335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DFA::RWLocker::~RWLocker() { 11355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (writing_) 11365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) mu_->WriterUnlock(); 11375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 11385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) mu_->ReaderUnlock(); 11395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 11405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// When the DFA's State cache fills, we discard all the states in the 11435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// cache and start over. Many threads can be using and adding to the 11445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// cache at the same time, so we synchronize using the cache_mutex_ 11455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// to keep from stepping on other threads. Specifically, all the 11465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// threads using the current cache hold cache_mutex_ for reading. 11475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// When a thread decides to flush the cache, it drops cache_mutex_ 11485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and then re-acquires it for writing. That ensures there are no 11495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// other threads accessing the cache anymore. The rest of the search 11505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// runs holding cache_mutex_ for writing, avoiding any contention 11515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// with or cache pollution caused by other threads. 11525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void DFA::ResetCache(RWLocker* cache_lock) { 11545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Re-acquire the cache_mutex_ for writing (exclusive use). 11555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool was_writing = cache_lock->IsLockedForWriting(); 11565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cache_lock->LockForWriting(); 11575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If we already held cache_mutex_ for writing, it means 11595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // this invocation of Search() has already reset the 11605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // cache once already. That's a pretty clear indication 11615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // that the cache is too small. Warn about that, once. 11625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // TODO(rsc): Only warn if state_cache_.size() < some threshold. 11635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (was_writing && !cache_warned_) { 11645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(INFO) << "DFA memory cache could be too small: " 11655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) << "only room for " << state_cache_.size() << " states."; 11665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cache_warned_ = true; 11675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 11685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Clear the cache, reset the memory budget. 11705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < kMaxStart; i++) { 11715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start_[i].start = NULL; 11725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start_[i].firstbyte = kFbUnknown; 11735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 11745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ClearCache(); 11755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) mem_budget_ = state_budget_; 11765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 11775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Typically, a couple States do need to be preserved across a cache 11795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// reset, like the State at the current point in the search. 11805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The StateSaver class helps keep States across cache resets. 11815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// It makes a copy of the state's guts outside the cache (before the reset) 11825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and then can be asked, after the reset, to recreate the State 11835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// in the new cache. For example, in a DFA method ("this" is a DFA): 11845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 11855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// StateSaver saver(this, s); 11865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ResetCache(cache_lock); 11875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// s = saver.Restore(); 11885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 11895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The saver should always have room in the cache to re-create the state, 11905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// because resetting the cache locks out all other threads, and the cache 11915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// is known to have room for at least a couple states (otherwise the DFA 11925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// constructor fails). 11935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class DFA::StateSaver { 11955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 11965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) explicit StateSaver(DFA* dfa, State* state); 11975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ~StateSaver(); 11985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 11995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Recreates and returns a state equivalent to the 12005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // original state passed to the constructor. 12015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Returns NULL if the cache has filled, but 12025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // since the DFA guarantees to have room in the cache 12035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // for a couple states, should never return NULL 12045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // if used right after ResetCache. 12055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State* Restore(); 12065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 12075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 12085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DFA* dfa_; // the DFA to use 12095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int* inst_; // saved info from State 12105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int ninst_; 12115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint flag_; 12125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool is_special_; // whether original state was special 12135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State* special_; // if is_special_, the original state 12145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 12155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DISALLOW_EVIL_CONSTRUCTORS(StateSaver); 12165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 12175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 12185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DFA::StateSaver::StateSaver(DFA* dfa, State* state) { 12195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) dfa_ = dfa; 12205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (state <= SpecialStateMax) { 12215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_ = NULL; 12225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ninst_ = 0; 12235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) flag_ = 0; 12245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) is_special_ = true; 12255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) special_ = state; 12265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return; 12275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 12285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) is_special_ = false; 12295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) special_ = NULL; 12305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) flag_ = state->flag_; 12315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ninst_ = state->ninst_; 12325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) inst_ = new int[ninst_]; 12335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) memmove(inst_, state->inst_, ninst_*sizeof inst_[0]); 12345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 12355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 12365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DFA::StateSaver::~StateSaver() { 12375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!is_special_) 12385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete[] inst_; 12395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 12405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 12415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DFA::State* DFA::StateSaver::Restore() { 12425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (is_special_) 12435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return special_; 12445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) MutexLock l(&dfa_->mutex_); 12455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State* s = dfa_->CachedState(inst_, ninst_, flag_); 12465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s == NULL) 12475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(DFATAL) << "StateSaver failed to restore state."; 12485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return s; 12495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 12505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 12515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 12525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)////////////////////////////////////////////////////////////////////// 12535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 12545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// DFA execution. 12555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 12565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The basic search loop is easy: start in a state s and then for each 12575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// byte c in the input, s = s->next[c]. 12585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 12595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This simple description omits a few efficiency-driven complications. 12605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 12615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// First, the State graph is constructed incrementally: it is possible 12625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// that s->next[c] is null, indicating that that state has not been 12635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// fully explored. In this case, RunStateOnByte must be invoked to 12645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// determine the next state, which is cached in s->next[c] to save 12655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// future effort. An alternative reason for s->next[c] to be null is 12665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// that the DFA has reached a so-called "dead state", in which any match 12675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// is no longer possible. In this case RunStateOnByte will return NULL 12685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and the processing of the string can stop early. 12695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 12705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Second, a 256-element pointer array for s->next_ makes each State 12715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// quite large (2kB on 64-bit machines). Instead, dfa->bytemap_[] 12725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// maps from bytes to "byte classes" and then next_ only needs to have 12735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// as many pointers as there are byte classes. A byte class is simply a 12745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// range of bytes that the regexp never distinguishes between. 12755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// A regexp looking for a[abc] would have four byte ranges -- 0 to 'a'-1, 12765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 'a', 'b' to 'c', and 'c' to 0xFF. The bytemap slows us a little bit 12775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// but in exchange we typically cut the size of a State (and thus our 12785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// memory footprint) by about 5-10x. The comments still refer to 12795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// s->next[c] for simplicity, but code should refer to s->next_[bytemap_[c]]. 12805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 12815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Third, it is common for a DFA for an unanchored match to begin in a 12825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// state in which only one particular byte value can take the DFA to a 12835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// different state. That is, s->next[c] != s for only one c. In this 12845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// situation, the DFA can do better than executing the simple loop. 12855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Instead, it can call memchr to search very quickly for the byte c. 12865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Whether the start state has this property is determined during a 12875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// pre-compilation pass, and if so, the byte b is passed to the search 12885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// loop as the "firstbyte" argument, along with a boolean "have_firstbyte". 12895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 12905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Fourth, the desired behavior is to search for the leftmost-best match 12915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (approximately, the same one that Perl would find), which is not 12925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// necessarily the match ending earliest in the string. Each time a 12935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// match is found, it must be noted, but the DFA must continue on in 12945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// hope of finding a higher-priority match. In some cases, the caller only 12955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// cares whether there is any match at all, not which one is found. 12965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The "want_earliest_match" flag causes the search to stop at the first 12975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// match found. 12985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 12995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Fifth, one algorithm that uses the DFA needs it to run over the 13005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// input string backward, beginning at the end and ending at the beginning. 13015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Passing false for the "run_forward" flag causes the DFA to run backward. 13025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 13035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The checks for these last three cases, which in a naive implementation 13045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// would be performed once per input byte, slow the general loop enough 13055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// to merit specialized versions of the search loop for each of the 13065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// eight possible settings of the three booleans. Rather than write 13075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// eight different functions, we write one general implementation and then 13085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// inline it to create the specialized ones. 13095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 13105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Note that matches are delayed by one byte, to make it easier to 13115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// accomodate match conditions depending on the next input byte (like $ and \b). 13125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// When s->next[c]->IsMatch(), it means that there is a match ending just 13135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// *before* byte c. 13145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 13155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The generic search loop. Searches text for a match, returning 13165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the pointer to the end of the chosen match, or NULL if no match. 13175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The bools are equal to the same-named variables in params, but 13185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// making them function arguments lets the inliner specialize 13195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// this function to each combination (see two paragraphs above). 13205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)inline bool DFA::InlinedSearchLoop(SearchParams* params, 13215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool have_firstbyte, 13225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool want_earliest_match, 13235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool run_forward) { 13245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State* start = params->start; 13255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const uint8* bp = BytePtr(params->text.begin()); // start of text 13265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const uint8* p = bp; // text scanning point 13275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const uint8* ep = BytePtr(params->text.end()); // end of text 13285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const uint8* resetp = NULL; // p at last cache reset 13295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!run_forward) 13305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) swap(p, ep); 13315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 13325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const uint8* bytemap = prog_->bytemap(); 13335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const uint8* lastmatch = NULL; // most recent matching position in text 13345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool matched = false; 13355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State* s = start; 13365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 13375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s->IsMatch()) { 13385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) matched = true; 13395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) lastmatch = p; 13405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (want_earliest_match) { 13415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params->ep = reinterpret_cast<const char*>(lastmatch); 13425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 13435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 13445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 13455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 13465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (p != ep) { 13475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (DebugDFA) 13485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fprintf(stderr, "@%d: %s\n", static_cast<int>(p - bp), 13495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DumpState(s).c_str()); 13505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (have_firstbyte && s == start) { 13515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // In start state, only way out is to find firstbyte, 13525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // so use optimized assembly in memchr to skip ahead. 13535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If firstbyte isn't found, we can skip to the end 13545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // of the string. 13555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (run_forward) { 13565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((p = BytePtr(memchr(p, params->firstbyte, ep - p))) == NULL) { 13575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) p = ep; 13585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 13595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 13605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 13615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((p = BytePtr(memrchr(ep, params->firstbyte, p - ep))) == NULL) { 13625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) p = ep; 13635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 13645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 13655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) p++; 13665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 13675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 13685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 13695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int c; 13705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (run_forward) 13715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c = *p++; 13725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 13735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c = *--p; 13745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 13755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Note that multiple threads might be consulting 13765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // s->next_[bytemap[c]] simultaneously. 13775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // RunStateOnByte takes care of the appropriate locking, 13785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // including a memory barrier so that the unlocked access 13795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (sometimes known as "double-checked locking") is safe. 13805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The alternative would be either one DFA per thread 13815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // or one mutex operation per input byte. 13825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 13835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ns == DeadState means the state is known to be dead 13845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (no more matches are possible). 13855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ns == NULL means the state has not yet been computed 13865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (need to call RunStateOnByteUnlocked). 13875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // RunStateOnByte returns ns == NULL if it is out of memory. 13885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ns == FullMatchState means the rest of the string matches. 13895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 13905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Okay to use bytemap[] not ByteMap() here, because 13915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // c is known to be an actual byte and not kByteEndText. 13925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 13935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) MaybeReadMemoryBarrier(); // On alpha we need to ensure read ordering 13945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State* ns = s->next_[bytemap[c]]; 13955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ANNOTATE_HAPPENS_AFTER(ns); 13965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ns == NULL) { 13975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ns = RunStateOnByteUnlocked(s, c); 13985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ns == NULL) { 13995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // After we reset the cache, we hold cache_mutex exclusively, 14005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // so if resetp != NULL, it means we filled the DFA state 14015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // cache with this search alone (without any other threads). 14025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Benchmarks show that doing a state computation on every 14035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // byte runs at about 0.2 MB/s, while the NFA (nfa.cc) can do the 14045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // same at about 2 MB/s. Unless we're processing an average 14055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // of 10 bytes per state computation, fail so that RE2 can 14065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // fall back to the NFA. 14075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (FLAGS_re2_dfa_bail_when_slow && resetp != NULL && 14085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (p - resetp) < 10*state_cache_.size()) { 14095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params->failed = true; 14105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 14115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 14125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) resetp = p; 14135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 14145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Prepare to save start and s across the reset. 14155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StateSaver save_start(this, start); 14165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StateSaver save_s(this, s); 14175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 14185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Discard all the States in the cache. 14195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ResetCache(params->cache_lock); 14205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 14215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Restore start and s so we can continue. 14225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((start = save_start.Restore()) == NULL || 14235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (s = save_s.Restore()) == NULL) { 14245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Restore already did LOG(DFATAL). 14255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params->failed = true; 14265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 14275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 14285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ns = RunStateOnByteUnlocked(s, c); 14295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ns == NULL) { 14305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(DFATAL) << "RunStateOnByteUnlocked failed after ResetCache"; 14315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params->failed = true; 14325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 14335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 14345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 14355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 14365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ns <= SpecialStateMax) { 14375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ns == DeadState) { 14385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params->ep = reinterpret_cast<const char*>(lastmatch); 14395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return matched; 14405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 14415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // FullMatchState 14425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params->ep = reinterpret_cast<const char*>(ep); 14435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 14445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 14455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s = ns; 14465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 14475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s->IsMatch()) { 14485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) matched = true; 14495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The DFA notices the match one byte late, 14505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // so adjust p before using it in the match. 14515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (run_forward) 14525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) lastmatch = p - 1; 14535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 14545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) lastmatch = p + 1; 14555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (DebugDFA) 14565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fprintf(stderr, "match @%d! [%s]\n", 14575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static_cast<int>(lastmatch - bp), 14585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DumpState(s).c_str()); 14595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 14605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (want_earliest_match) { 14615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params->ep = reinterpret_cast<const char*>(lastmatch); 14625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 14635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 14645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 14655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 14665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 14675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Process one more byte to see if it triggers a match. 14685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (Remember, matches are delayed one byte.) 14695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int lastbyte; 14705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (run_forward) { 14715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (params->text.end() == params->context.end()) 14725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) lastbyte = kByteEndText; 14735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 14745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) lastbyte = params->text.end()[0] & 0xFF; 14755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 14765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (params->text.begin() == params->context.begin()) 14775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) lastbyte = kByteEndText; 14785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 14795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) lastbyte = params->text.begin()[-1] & 0xFF; 14805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 14815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 14825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) MaybeReadMemoryBarrier(); // On alpha we need to ensure read ordering 14835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State* ns = s->next_[ByteMap(lastbyte)]; 14845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ANNOTATE_HAPPENS_AFTER(ns); 14855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ns == NULL) { 14865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ns = RunStateOnByteUnlocked(s, lastbyte); 14875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ns == NULL) { 14885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StateSaver save_s(this, s); 14895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ResetCache(params->cache_lock); 14905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((s = save_s.Restore()) == NULL) { 14915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params->failed = true; 14925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 14935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 14945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ns = RunStateOnByteUnlocked(s, lastbyte); 14955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ns == NULL) { 14965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(DFATAL) << "RunStateOnByteUnlocked failed after Reset"; 14975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params->failed = true; 14985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 14995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 15005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 15015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 15025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s = ns; 15035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (DebugDFA) 15045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fprintf(stderr, "@_: %s\n", DumpState(s).c_str()); 15055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s == FullMatchState) { 15065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params->ep = reinterpret_cast<const char*>(ep); 15075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 15085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 15095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s > SpecialStateMax && s->IsMatch()) { 15105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) matched = true; 15115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) lastmatch = p; 15125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (params->matches && kind_ == Prog::kManyMatch) { 15135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) vector<int>* v = params->matches; 15145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) v->clear(); 15155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < s->ninst_; i++) { 15165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Prog::Inst* ip = prog_->inst(s->inst_[i]); 15175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ip->opcode() == kInstMatch) 15185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) v->push_back(ip->match_id()); 15195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 15205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 15215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (DebugDFA) 15225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fprintf(stderr, "match @%d! [%s]\n", static_cast<int>(lastmatch - bp), 15235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DumpState(s).c_str()); 15245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 15255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params->ep = reinterpret_cast<const char*>(lastmatch); 15265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return matched; 15275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 15285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 15295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Inline specializations of the general loop. 15305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool DFA::SearchFFF(SearchParams* params) { 15315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return InlinedSearchLoop(params, 0, 0, 0); 15325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 15335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool DFA::SearchFFT(SearchParams* params) { 15345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return InlinedSearchLoop(params, 0, 0, 1); 15355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 15365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool DFA::SearchFTF(SearchParams* params) { 15375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return InlinedSearchLoop(params, 0, 1, 0); 15385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 15395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool DFA::SearchFTT(SearchParams* params) { 15405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return InlinedSearchLoop(params, 0, 1, 1); 15415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 15425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool DFA::SearchTFF(SearchParams* params) { 15435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return InlinedSearchLoop(params, 1, 0, 0); 15445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 15455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool DFA::SearchTFT(SearchParams* params) { 15465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return InlinedSearchLoop(params, 1, 0, 1); 15475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 15485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool DFA::SearchTTF(SearchParams* params) { 15495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return InlinedSearchLoop(params, 1, 1, 0); 15505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 15515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool DFA::SearchTTT(SearchParams* params) { 15525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return InlinedSearchLoop(params, 1, 1, 1); 15535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 15545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 15555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For debugging, calls the general code directly. 15565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool DFA::SlowSearchLoop(SearchParams* params) { 15575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return InlinedSearchLoop(params, 15585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params->firstbyte >= 0, 15595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params->want_earliest_match, 15605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params->run_forward); 15615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 15625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 15635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For performance, calls the appropriate specialized version 15645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// of InlinedSearchLoop. 15655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool DFA::FastSearchLoop(SearchParams* params) { 15665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Because the methods are private, the Searches array 15675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // cannot be declared at top level. 15685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static bool (DFA::*Searches[])(SearchParams*) = { 15695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) &DFA::SearchFFF, 15705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) &DFA::SearchFFT, 15715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) &DFA::SearchFTF, 15725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) &DFA::SearchFTT, 15735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) &DFA::SearchTFF, 15745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) &DFA::SearchTFT, 15755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) &DFA::SearchTTF, 15765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) &DFA::SearchTTT, 15775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) }; 15785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 15795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool have_firstbyte = (params->firstbyte >= 0); 15805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int index = 4 * have_firstbyte + 15815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2 * params->want_earliest_match + 15825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1 * params->run_forward; 15835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return (this->*Searches[index])(params); 15845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 15855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 15865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 15875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The discussion of DFA execution above ignored the question of how 15885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// to determine the initial state for the search loop. There are two 15895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// factors that influence the choice of start state. 15905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 15915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The first factor is whether the search is anchored or not. 15925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The regexp program (Prog*) itself has 15935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// two different entry points: one for anchored searches and one for 15945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// unanchored searches. (The unanchored version starts with a leading ".*?" 15955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and then jumps to the anchored one.) 15965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 15975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The second factor is where text appears in the larger context, which 15985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// determines which empty-string operators can be matched at the beginning 15995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// of execution. If text is at the very beginning of context, \A and ^ match. 16005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Otherwise if text is at the beginning of a line, then ^ matches. 16015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Otherwise it matters whether the character before text is a word character 16025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// or a non-word character. 16035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 16045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The two cases (unanchored vs not) and four cases (empty-string flags) 16055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// combine to make the eight cases recorded in the DFA's begin_text_[2], 16065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// begin_line_[2], after_wordchar_[2], and after_nonwordchar_[2] cached 16075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// StartInfos. The start state for each is filled in the first time it 16085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// is used for an actual search. 16095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 16105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Examines text, context, and anchored to determine the right start 16115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// state for the DFA search loop. Fills in params and returns true on success. 16125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns false on failure. 16135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool DFA::AnalyzeSearch(SearchParams* params) { 16145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const StringPiece& text = params->text; 16155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const StringPiece& context = params->context; 16165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 16175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Sanity check: make sure that text lies within context. 16185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (text.begin() < context.begin() || text.end() > context.end()) { 16195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(DFATAL) << "Text is not inside context."; 16205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params->start = DeadState; 16215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 16225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 16235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 16245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Determine correct search type. 16255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int start; 16265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint flags; 16275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (params->run_forward) { 16285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (text.begin() == context.begin()) { 16295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start = kStartBeginText; 16305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) flags = kEmptyBeginText|kEmptyBeginLine; 16315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (text.begin()[-1] == '\n') { 16325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start = kStartBeginLine; 16335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) flags = kEmptyBeginLine; 16345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (Prog::IsWordChar(text.begin()[-1] & 0xFF)) { 16355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start = kStartAfterWordChar; 16365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) flags = kFlagLastWord; 16375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 16385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start = kStartAfterNonWordChar; 16395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) flags = 0; 16405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 16415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 16425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (text.end() == context.end()) { 16435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start = kStartBeginText; 16445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) flags = kEmptyBeginText|kEmptyBeginLine; 16455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (text.end()[0] == '\n') { 16465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start = kStartBeginLine; 16475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) flags = kEmptyBeginLine; 16485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (Prog::IsWordChar(text.end()[0] & 0xFF)) { 16495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start = kStartAfterWordChar; 16505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) flags = kFlagLastWord; 16515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 16525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start = kStartAfterNonWordChar; 16535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) flags = 0; 16545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 16555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 16565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (params->anchored || prog_->anchor_start()) 16575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) start |= kStartAnchored; 16585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StartInfo* info = &start_[start]; 16595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 16605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Try once without cache_lock for writing. 16615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Try again after resetting the cache 16625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (ResetCache will relock cache_lock for writing). 16635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!AnalyzeSearchHelper(params, info, flags)) { 16645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ResetCache(params->cache_lock); 16655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!AnalyzeSearchHelper(params, info, flags)) { 16665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(DFATAL) << "Failed to analyze start state."; 16675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params->failed = true; 16685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 16695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 16705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 16715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 16725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (DebugDFA) 16735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fprintf(stderr, "anchored=%d fwd=%d flags=%#x state=%s firstbyte=%d\n", 16745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params->anchored, params->run_forward, flags, 16755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DumpState(info->start).c_str(), info->firstbyte); 16765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 16775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params->start = info->start; 16785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params->firstbyte = ANNOTATE_UNPROTECTED_READ(info->firstbyte); 16795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 16805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 16815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 16825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 16835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Fills in info if needed. Returns true on success, false on failure. 16845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool DFA::AnalyzeSearchHelper(SearchParams* params, StartInfo* info, 16855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint flags) { 16865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Quick check; okay because of memory barriers below. 16875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ANNOTATE_UNPROTECTED_READ(info->firstbyte) != kFbUnknown) { 16885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ANNOTATE_HAPPENS_AFTER(&info->firstbyte); 16895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 16905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 16915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 16925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) MutexLock l(&mutex_); 16935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (info->firstbyte != kFbUnknown) { 16945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ANNOTATE_HAPPENS_AFTER(&info->firstbyte); 16955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 16965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 16975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 16985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) q0_->clear(); 16995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddToQueue(q0_, 17005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params->anchored ? prog_->start() : prog_->start_unanchored(), 17015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) flags); 17025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) info->start = WorkqToCachedState(q0_, flags); 17035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (info->start == NULL) 17045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 17055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 17065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (info->start == DeadState) { 17075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ANNOTATE_HAPPENS_BEFORE(&info->firstbyte); 17085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) WriteMemoryBarrier(); // Synchronize with "quick check" above. 17095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) info->firstbyte = kFbNone; 17105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 17115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 17125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 17135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (info->start == FullMatchState) { 17145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ANNOTATE_HAPPENS_BEFORE(&info->firstbyte); 17155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) WriteMemoryBarrier(); // Synchronize with "quick check" above. 17165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) info->firstbyte = kFbNone; // will be ignored 17175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 17185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 17195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 17205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Compute info->firstbyte by running state on all 17215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // possible byte values, looking for a single one that 17225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // leads to a different state. 17235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int firstbyte = kFbNone; 17245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < 256; i++) { 17255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State* s = RunStateOnByte(info->start, i); 17265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s == NULL) { 17275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ANNOTATE_HAPPENS_BEFORE(&info->firstbyte); 17285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) WriteMemoryBarrier(); // Synchronize with "quick check" above. 17295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) info->firstbyte = firstbyte; 17305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 17315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 17325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (s == info->start) 17335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 17345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Goes to new state... 17355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (firstbyte == kFbNone) { 17365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) firstbyte = i; // ... first one 17375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 17385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) firstbyte = kFbMany; // ... too many 17395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 17405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 17415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 17425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ANNOTATE_HAPPENS_BEFORE(&info->firstbyte); 17435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) WriteMemoryBarrier(); // Synchronize with "quick check" above. 17445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) info->firstbyte = firstbyte; 17455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 17465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 17475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 17485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The actual DFA search: calls AnalyzeSearch and then FastSearchLoop. 17495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool DFA::Search(const StringPiece& text, 17505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const StringPiece& context, 17515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool anchored, 17525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool want_earliest_match, 17535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool run_forward, 17545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool* failed, 17555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char** epp, 17565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) vector<int>* matches) { 17575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *epp = NULL; 17585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ok()) { 17595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *failed = true; 17605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 17615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 17625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *failed = false; 17635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 17645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (DebugDFA) { 17655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fprintf(stderr, "\nprogram:\n%s\n", prog_->DumpUnanchored().c_str()); 17665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fprintf(stderr, "text %s anchored=%d earliest=%d fwd=%d kind %d\n", 17675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) text.as_string().c_str(), anchored, want_earliest_match, 17685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) run_forward, kind_); 17695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 17705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 17715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RWLocker l(&cache_mutex_); 17725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) SearchParams params(text, context, &l); 17735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params.anchored = anchored; 17745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params.want_earliest_match = want_earliest_match; 17755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params.run_forward = run_forward; 17765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params.matches = matches; 17775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 17785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!AnalyzeSearch(¶ms)) { 17795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *failed = true; 17805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 17815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 17825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (params.start == DeadState) 17835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 17845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (params.start == FullMatchState) { 17855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (run_forward == want_earliest_match) 17865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *epp = text.begin(); 17875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 17885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *epp = text.end(); 17895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 17905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 17915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (DebugDFA) 17925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) fprintf(stderr, "start %s\n", DumpState(params.start).c_str()); 17935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool ret = FastSearchLoop(¶ms); 17945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (params.failed) { 17955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *failed = true; 17965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 17975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 17985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *epp = params.ep; 17995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return ret; 18005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 18015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 18025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Deletes dfa. 18035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 18045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This is a separate function so that 18055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// prog.h can be used without moving the definition of 18065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// class DFA out of this file. If you set 18075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// prog->dfa_ = dfa; 18085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// then you also have to set 18095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// prog->delete_dfa_ = DeleteDFA; 18105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// so that ~Prog can delete the dfa. 18115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static void DeleteDFA(DFA* dfa) { 18125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete dfa; 18135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 18145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 18155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)DFA* Prog::GetDFA(MatchKind kind) { 18165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DFA*volatile* pdfa; 18175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (kind == kFirstMatch || kind == kManyMatch) { 18185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) pdfa = &dfa_first_; 18195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 18205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kind = kLongestMatch; 18215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) pdfa = &dfa_longest_; 18225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 18235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 18245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Quick check; okay because of memory barrier below. 18255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DFA *dfa = ANNOTATE_UNPROTECTED_READ(*pdfa); 18265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (dfa != NULL) { 18275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ANNOTATE_HAPPENS_AFTER(dfa); 18285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return dfa; 18295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 18305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 18315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) MutexLock l(&dfa_mutex_); 18325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) dfa = *pdfa; 18335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (dfa != NULL) { 18345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ANNOTATE_HAPPENS_AFTER(dfa); 18355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return dfa; 18365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 18375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 18385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // For a forward DFA, half the memory goes to each DFA. 18395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // For a reverse DFA, all the memory goes to the 18405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // "longest match" DFA, because RE2 never does reverse 18415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // "first match" searches. 18425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int64 m = dfa_mem_/2; 18435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (reversed_) { 18445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (kind == kLongestMatch || kind == kManyMatch) 18455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) m = dfa_mem_; 18465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 18475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) m = 0; 18485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 18495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) dfa = new DFA(this, kind, m); 18505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete_dfa_ = DeleteDFA; 18515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 18525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Synchronize with "quick check" above. 18535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ANNOTATE_HAPPENS_BEFORE(dfa); 18545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) WriteMemoryBarrier(); 18555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *pdfa = dfa; 18565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 18575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return dfa; 18585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 18595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 18605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 18615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Executes the regexp program to search in text, 18625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// which itself is inside the larger context. (As a convenience, 18635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// passing a NULL context is equivalent to passing text.) 18645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns true if a match is found, false if not. 18655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If a match is found, fills in match0->end() to point at the end of the match 18665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and sets match0->begin() to text.begin(), since the DFA can't track 18675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// where the match actually began. 18685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 18695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This is the only external interface (class DFA only exists in this file). 18705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 18715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Prog::SearchDFA(const StringPiece& text, const StringPiece& const_context, 18725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Anchor anchor, MatchKind kind, 18735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece* match0, bool* failed, vector<int>* matches) { 18745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *failed = false; 18755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 18765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece context = const_context; 18775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (context.begin() == NULL) 18785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) context = text; 18795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool carat = anchor_start(); 18805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool dollar = anchor_end(); 18815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (reversed_) { 18825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool t = carat; 18835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) carat = dollar; 18845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) dollar = t; 18855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 18865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (carat && context.begin() != text.begin()) 18875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 18885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (dollar && context.end() != text.end()) 18895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 18905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 18915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Handle full match by running an anchored longest match 18925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // and then checking if it covers all of text. 18935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool anchored = anchor == kAnchored || anchor_start() || kind == kFullMatch; 18945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool endmatch = false; 18955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (kind == kManyMatch) { 18965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) endmatch = true; 18975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (kind == kFullMatch || anchor_end()) { 18985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) endmatch = true; 18995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kind = kLongestMatch; 19005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 19015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If the caller doesn't care where the match is (just whether one exists), 19035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // then we can stop at the very first match we find, the so-called 19045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // "shortest match". 19055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool want_shortest_match = false; 19065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (match0 == NULL && !endmatch) { 19075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) want_shortest_match = true; 19085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kind = kLongestMatch; 19095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 19105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DFA* dfa = GetDFA(kind); 19125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* ep; 19135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool matched = dfa->Search(text, context, anchored, 19145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) want_shortest_match, !reversed_, 19155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) failed, &ep, matches); 19165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (*failed) 19175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 19185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!matched) 19195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 19205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (endmatch && ep != (reversed_ ? text.begin() : text.end())) 19215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 19225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If caller cares, record the boundary of the match. 19245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We only know where it ends, so use the boundary of text 19255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // as the beginning. 19265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (match0) { 19275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (reversed_) 19285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *match0 = StringPiece(ep, text.end() - ep); 19295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) else 19305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *match0 = StringPiece(text.begin(), ep - text.begin()); 19315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 19325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 19335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 19345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Build out all states in DFA. Returns number of states. 19365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int DFA::BuildAllStates() { 19375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ok()) 19385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return 0; 19395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Pick out start state for unanchored search 19415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // at beginning of text. 19425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RWLocker l(&cache_mutex_); 19435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) SearchParams params(NULL, NULL, &l); 19445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params.anchored = false; 19455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!AnalyzeSearch(¶ms) || params.start <= SpecialStateMax) 19465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return 0; 19475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Add start state to work queue. 19495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StateSet queued; 19505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) vector<State*> q; 19515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) queued.insert(params.start); 19525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) q.push_back(params.start); 19535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Flood to expand every state. 19555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < q.size(); i++) { 19565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State* s = q[i]; 19575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int c = 0; c < 257; c++) { 19585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State* ns = RunStateOnByteUnlocked(s, c); 19595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ns > SpecialStateMax && queued.find(ns) == queued.end()) { 19605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) queued.insert(ns); 19615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) q.push_back(ns); 19625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 19635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 19645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 19655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return q.size(); 19675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 19685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Build out all states in DFA for kind. Returns number of states. 19705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)int Prog::BuildEntireDFA(MatchKind kind) { 19715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) //LOG(ERROR) << "BuildEntireDFA is only for testing."; 19725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return GetDFA(kind)->BuildAllStates(); 19735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 19745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Computes min and max for matching string. 19765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Won't return strings bigger than maxlen. 19775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool DFA::PossibleMatchRange(string* min, string* max, int maxlen) { 19785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!ok()) 19795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 19805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // NOTE: if future users of PossibleMatchRange want more precision when 19825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // presented with infinitely repeated elements, consider making this a 19835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // parameter to PossibleMatchRange. 19845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static int kMaxEltRepetitions = 0; 19855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Keep track of the number of times we've visited states previously. We only 19875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // revisit a given state if it's part of a repeated group, so if the value 19885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // portion of the map tuple exceeds kMaxEltRepetitions we bail out and set 19895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // |*max| to |PrefixSuccessor(*max)|. 19905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 19915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Also note that previously_visited_states[UnseenStatePtr] will, in the STL 19925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // tradition, implicitly insert a '0' value at first use. We take advantage 19935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // of that property below. 19945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) map<State*, int> previously_visited_states; 19955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 19965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Pick out start state for anchored search at beginning of text. 19975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) RWLocker l(&cache_mutex_); 19985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) SearchParams params(NULL, NULL, &l); 19995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) params.anchored = true; 20005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!AnalyzeSearch(¶ms)) 20015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 20025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (params.start == DeadState) { // No matching strings 20035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *min = ""; 20045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *max = ""; 20055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 20065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 20075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (params.start == FullMatchState) // Every string matches: no max 20085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 20095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 20105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The DFA is essentially a big graph rooted at params.start, 20115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // and paths in the graph correspond to accepted strings. 20125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Each node in the graph has potentially 256+1 arrows 20135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // coming out, one for each byte plus the magic end of 20145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // text character kByteEndText. 20155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 20165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // To find the smallest possible prefix of an accepted 20175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // string, we just walk the graph preferring to follow 20185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // arrows with the lowest bytes possible. To find the 20195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // largest possible prefix, we follow the largest bytes 20205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // possible. 20215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 20225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The test for whether there is an arrow from s on byte j is 20235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ns = RunStateOnByteUnlocked(s, j); 20245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // if (ns == NULL) 20255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // return false; 20265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // if (ns != DeadState && ns->ninst > 0) 20275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The RunStateOnByteUnlocked call asks the DFA to build out the graph. 20285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // It returns NULL only if the DFA has run out of memory, 20295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // in which case we can't be sure of anything. 20305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The second check sees whether there was graph built 20315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // and whether it is interesting graph. Nodes might have 20325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ns->ninst == 0 if they exist only to represent the fact 20335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // that a match was found on the previous byte. 20345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 20355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Build minimum prefix. 20365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State* s = params.start; 20375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) min->clear(); 20385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < maxlen; i++) { 20395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (previously_visited_states[s] > kMaxEltRepetitions) { 20405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) VLOG(2) << "Hit kMaxEltRepetitions=" << kMaxEltRepetitions 20415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) << " for state s=" << s << " and min=" << CEscape(*min); 20425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 20435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 20445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) previously_visited_states[s]++; 20455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 20465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Stop if min is a match. 20475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State* ns = RunStateOnByteUnlocked(s, kByteEndText); 20485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ns == NULL) // DFA out of memory 20495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 20505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ns != DeadState && (ns == FullMatchState || ns->IsMatch())) 20515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 20525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 20535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Try to extend the string with low bytes. 20545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool extended = false; 20555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int j = 0; j < 256; j++) { 20565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ns = RunStateOnByteUnlocked(s, j); 20575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ns == NULL) // DFA out of memory 20585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 20595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ns == FullMatchState || 20605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (ns > SpecialStateMax && ns->ninst_ > 0)) { 20615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) extended = true; 20625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) min->append(1, j); 20635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s = ns; 20645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 20655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 20665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 20675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!extended) 20685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 20695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 20705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 20715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Build maximum prefix. 20725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) previously_visited_states.clear(); 20735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s = params.start; 20745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) max->clear(); 20755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < maxlen; i++) { 20765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (previously_visited_states[s] > kMaxEltRepetitions) { 20775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) VLOG(2) << "Hit kMaxEltRepetitions=" << kMaxEltRepetitions 20785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) << " for state s=" << s << " and max=" << CEscape(*max); 20795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 20805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 20815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) previously_visited_states[s] += 1; 20825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 20835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Try to extend the string with high bytes. 20845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool extended = false; 20855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int j = 255; j >= 0; j--) { 20865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) State* ns = RunStateOnByteUnlocked(s, j); 20875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ns == NULL) 20885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 20895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ns == FullMatchState || 20905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) (ns > SpecialStateMax && ns->ninst_ > 0)) { 20915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) extended = true; 20925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) max->append(1, j); 20935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) s = ns; 20945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 20955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 20965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 20975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!extended) { 20985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Done, no need for PrefixSuccessor. 20995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 21005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 21015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 21025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 21035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Stopped while still adding to *max - round aaaaaaaaaa... to aaaa...b 21045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *max = PrefixSuccessor(*max); 21055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 21065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If there are no bytes left, we have no way to say "there is no maximum 21075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // string". We could make the interface more complicated and be able to 21085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // return "there is no maximum but here is a minimum", but that seems like 21095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // overkill -- the most common no-max case is all possible strings, so not 21105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // telling the caller that the empty string is the minimum match isn't a 21115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // great loss. 21125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (max->empty()) 21135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 21145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 21155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 21165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 21175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 21185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// PossibleMatchRange for a Prog. 21195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Prog::PossibleMatchRange(string* min, string* max, int maxlen) { 21205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DFA* dfa = NULL; 21215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { 21225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) MutexLock l(&dfa_mutex_); 21235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Have to use dfa_longest_ to get all strings for full matches. 21245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // For example, (a|aa) never matches aa in first-match mode. 21255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (dfa_longest_ == NULL) { 21265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) dfa_longest_ = new DFA(this, Prog::kLongestMatch, dfa_mem_/2); 21275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete_dfa_ = DeleteDFA; 21285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 21295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) dfa = dfa_longest_; 21305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 21315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return dfa->PossibleMatchRange(min, max, maxlen); 21325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 21335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 21345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace re2 2135