15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright 2008 The RE2 Authors. All Rights Reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// license that can be found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Tested by search_test.cc. 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Prog::SearchOnePass is an efficient implementation of 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// regular expression search with submatch tracking for 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// what I call "one-pass regular expressions". (An alternate 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// name might be "backtracking-free regular expressions".) 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// One-pass regular expressions have the property that 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// at each input byte during an anchored match, there may be 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// multiple alternatives but only one can proceed for any 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// given input byte. 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// For example, the regexp /x*yx*/ is one-pass: you read 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// x's until a y, then you read the y, then you keep reading x's. 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// At no point do you have to guess what to do or back up 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and try a different guess. 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// On the other hand, /x*x/ is not one-pass: when you're 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// looking at an input "x", it's not clear whether you should 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// use it to extend the x* or as the final x. 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not. 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not. 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// A simple intuition for identifying one-pass regular expressions 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// is that it's always immediately obvious when a repetition ends. 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// It must also be immediately obvious which branch of an | to take: 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// /x(y|z)/ is one-pass, but /(xy|xz)/ is not. 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The NFA-based search in nfa.cc does some bookkeeping to 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// avoid the need for backtracking and its associated exponential blowup. 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// But if we have a one-pass regular expression, there is no 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// possibility of backtracking, so there is no need for the 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// extra bookkeeping. Hence, this code. 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// On a one-pass regular expression, the NFA code in nfa.cc 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// runs at about 1/20 of the backtracking-based PCRE speed. 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// In contrast, the code in this file runs at about the same 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// speed as PCRE. 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// One-pass regular expressions get used a lot when RE is 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// used for parsing simple strings, so it pays off to 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// notice them and handle them efficiently. 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// See also Anne Brüggemann-Klein and Derick Wood, 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// "One-unambiguous regular languages", Information and Computation 142(2). 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <string.h> 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include <map> 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "util/util.h" 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "util/arena.h" 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "util/sparse_set.h" 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/prog.h" 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "re2/stringpiece.h" 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace re2 { 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const int Debug = 0; 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The key insight behind this implementation is that the 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// non-determinism in an NFA for a one-pass regular expression 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// is contained. To explain what that means, first a 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// refresher about what regular expression programs look like 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and how the usual NFA execution runs. 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// In a regular expression program, only the kInstByteRange 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// instruction processes an input byte c and moves on to the 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// next byte in the string (it does so if c is in the given range). 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The kInstByteRange instructions correspond to literal characters 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and character classes in the regular expression. 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The kInstAlt instructions are used as wiring to connect the 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// kInstByteRange instructions together in interesting ways when 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// implementing | + and *. 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The kInstAlt instruction forks execution, like a goto that 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// jumps to ip->out() and ip->out1() in parallel. Each of the 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// resulting computation paths is called a thread. 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture -- 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// are interesting in their own right but like kInstAlt they don't 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// advance the input pointer. Only kInstByteRange does. 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The automaton execution in nfa.cc runs all the possible 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// threads of execution in lock-step over the input. To process 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// a particular byte, each thread gets run until it either dies 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// or finds a kInstByteRange instruction matching the byte. 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If the latter happens, the thread stops just past the 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// kInstByteRange instruction (at ip->out()) and waits for 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the other threads to finish processing the input byte. 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Then, once all the threads have processed that input byte, 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the whole process repeats. The kInstAlt state instruction 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// might create new threads during input processing, but no 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// matter what, all the threads stop after a kInstByteRange 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and wait for the other threads to "catch up". 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Running in lock step like this ensures that the NFA reads 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the input string only once. 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Each thread maintains its own set of capture registers 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (the string positions at which it executed the kInstCapture 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// instructions corresponding to capturing parentheses in the 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// regular expression). Repeated copying of the capture registers 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// is the main performance bottleneck in the NFA implementation. 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// A regular expression program is "one-pass" if, no matter what 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the input string, there is only one thread that makes it 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// past a kInstByteRange instruction at each input byte. This means 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// that there is in some sense only one active thread throughout 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the execution. Other threads might be created during the 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// processing of an input byte, but they are ephemeral: only one 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// thread is left to start processing the next input byte. 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This is what I meant above when I said the non-determinism 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// was "contained". 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// To execute a one-pass regular expression program, we can build 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// a DFA (no non-determinism) that has at most as many states as 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the NFA (compare this to the possibly exponential number of states 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// in the general case). Each state records, for each possible 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// input byte, the next state along with the conditions required 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// before entering that state -- empty-width flags that must be true 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and capture operations that must be performed. It also records 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// whether a set of conditions required to finish a match at that 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// point in the input rather than process the next byte. 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// A state in the one-pass NFA (aka DFA) - just an array of actions. 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct OneState; 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// A state in the one-pass NFA - just an array of actions indexed 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// by the bytemap_[] of the next input byte. (The bytemap 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// maps next input bytes into equivalence classes, to reduce 1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the memory footprint.) 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct OneState { 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 matchcond; // conditions to match right now. 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 action[1]; 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The uint32 conditions in the action are a combination of 1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// condition and capture bits and the next state. The bottom 16 bits 1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// are the condition and capture bits, and the top 16 are the index of 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// the next state. 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Bits 0-5 are the empty-width flags from prog.h. 1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Bit 6 is kMatchWins, which means the match takes 1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// priority over moving to next in a first-match search. 1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// The remaining bits mark capture registers that should 1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// be set to the current input position. The capture bits 1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// start at index 2, since the search loop can take care of 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// cap[0], cap[1] (the overall match position). 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// That means we can handle up to 5 capturing parens: $1 through $4, plus $0. 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// No input position can satisfy both kEmptyWordBoundary 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// and kEmptyNonWordBoundary, so we can use that as a sentinel 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// instead of needing an extra bit. 1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const int kIndexShift = 16; // number of bits below index 1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const int kEmptyShift = 6; // number of empty flags in prog.h 1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const int kRealCapShift = kEmptyShift + 1; 1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2; 1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Parameters used to skip over cap[0], cap[1]. 1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const int kCapShift = kRealCapShift - 2; 1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const int kMaxCap = kRealMaxCap + 2; 1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const uint32 kMatchWins = 1 << kEmptyShift; 1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const uint32 kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift; 1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static const uint32 kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary; 1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Check, at compile time, that prog.h agrees with math above. 1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This function is never called. 1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void OnePass_Checks() { 1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) COMPILE_ASSERT((1<<kEmptyShift)-1 == kEmptyAllFlags, 1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kEmptyShift_disagrees_with_kEmptyAllFlags); 1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // kMaxCap counts pointers, kMaxOnePassCapture counts pairs. 1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) COMPILE_ASSERT(kMaxCap == Prog::kMaxOnePassCapture*2, 1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kMaxCap_disagrees_with_kMaxOnePassCapture); 1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static bool Satisfy(uint32 cond, const StringPiece& context, const char* p) { 1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 satisfied = Prog::EmptyFlags(context, p); 1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (cond & kEmptyAllFlags & ~satisfied) 1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Apply the capture bits in cond, saving p to the appropriate 1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// locations in cap[]. 1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static void ApplyCaptures(uint32 cond, const char* p, 1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char** cap, int ncap) { 1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 2; i < ncap; i++) 1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (cond & (1 << kCapShift << i)) 1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cap[i] = p; 1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Compute a node pointer. 1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Basically (OneState*)(nodes + statesize*nodeindex) 2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// but the version with the C++ casts overflows 80 characters (and is ugly). 2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static inline OneState* IndexToNode(volatile uint8* nodes, int statesize, 2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int nodeindex) { 2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return reinterpret_cast<OneState*>( 2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const_cast<uint8*>(nodes + statesize*nodeindex)); 2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Prog::SearchOnePass(const StringPiece& text, 2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const StringPiece& const_context, 2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Anchor anchor, MatchKind kind, 2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece* match, int nmatch) { 2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (anchor != kAnchored && kind != kFullMatch) { 2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches."; 2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Make sure we have at least cap[1], 2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // because we use it to tell if we matched. 2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int ncap = 2*nmatch; 2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ncap < 2) 2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ncap = 2; 2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* cap[kMaxCap]; 2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < ncap; i++) 2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cap[i] = NULL; 2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* matchcap[kMaxCap]; 2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < ncap; i++) 2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) matchcap[i] = NULL; 2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringPiece context = const_context; 2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (context.begin() == NULL) 2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) context = text; 2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (anchor_start() && context.begin() != text.begin()) 2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (anchor_end() && context.end() != text.end()) 2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (anchor_end()) 2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kind = kFullMatch; 2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // State and act are marked volatile to 2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // keep the compiler from re-ordering the 2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // memory accesses walking over the NFA. 2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // This is worth about 5%. 2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) volatile OneState* state = onepass_start_; 2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) volatile uint8* nodes = onepass_nodes_; 2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) volatile uint32 statesize = onepass_statesize_; 2475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint8* bytemap = bytemap_; 2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* bp = text.begin(); 2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* ep = text.end(); 2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* p; 2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool matched = false; 2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) matchcap[0] = bp; 2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cap[0] = bp; 2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 nextmatchcond = state->matchcond; 2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (p = bp; p < ep; p++) { 2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int c = bytemap[*p & 0xFF]; 2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 matchcond = nextmatchcond; 2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 cond = state->action[c]; 2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Determine whether we can reach act->next. 2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If so, advance state and nextmatchcond. 2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) { 2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 nextindex = cond >> kIndexShift; 2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) state = IndexToNode(nodes, statesize, nextindex); 2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nextmatchcond = state->matchcond; 2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else { 2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) state = NULL; 2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nextmatchcond = kImpossible; 2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // This code section is carefully tuned. 2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The goto sequence is about 10% faster than the 2735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // obvious rewrite as a large if statement in the 2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // ASCIIMatchRE2 and DotMatchRE2 benchmarks. 2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Saving the match capture registers is expensive. 2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Is this intermediate match worth thinking about? 2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Not if we want a full match. 2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (kind == kFullMatch) 2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto skipmatch; 2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Not if it's impossible. 2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (matchcond == kImpossible) 2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto skipmatch; 2865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Not if the possible match is beaten by the certain 2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // match at the next byte. When this test is useless 2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (e.g., HTTPPartialMatchRE2) it slows the loop by 2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // about 10%, but when it avoids work (e.g., DotMatchRE2), 2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // it cuts the loop execution by about 45%. 2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0) 2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto skipmatch; 2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Finally, the match conditions must be satisfied. 2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) { 2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 2; i < 2*nmatch; i++) 2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) matchcap[i] = cap[i]; 2995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (nmatch > 1 && (matchcond & kCapMask)) 3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ApplyCaptures(matchcond, p, matchcap, ncap); 3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) matchcap[1] = p; 3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) matched = true; 3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If we're in longest match mode, we have to keep 3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // going and see if we find a longer match. 3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // In first match mode, we can stop if the match 3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // takes priority over the next state for this input byte. 3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // That bit is per-input byte and thus in cond, not matchcond. 3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (kind == kFirstMatch && (cond & kMatchWins)) 3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto done; 3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) skipmatch: 3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (state == NULL) 3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto done; 3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((cond & kCapMask) && nmatch > 1) 3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ApplyCaptures(cond, p, cap, ncap); 3185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Look for match at end of input. 3215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) { 3225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 matchcond = state->matchcond; 3235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (matchcond != kImpossible && 3245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) { 3255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (nmatch > 1 && (matchcond & kCapMask)) 3265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ApplyCaptures(matchcond, p, cap, ncap); 3275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 2; i < ncap; i++) 3285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) matchcap[i] = cap[i]; 3295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) matchcap[1] = p; 3305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) matched = true; 3315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 3335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)done: 3355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!matched) 3365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 3375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < nmatch; i++) 3385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) match[i].set(matchcap[2*i], matchcap[2*i+1] - matchcap[2*i]); 3395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 3405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Analysis to determine whether a given regexp program is one-pass. 3445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If ip is not on workq, adds ip to work queue and returns true. 3465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If ip is already on work queue, does nothing and returns false. 3475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// If ip is NULL, does nothing and returns true (pretends to add it). 3485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)typedef SparseSet Instq; 3495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)static bool AddQ(Instq *q, int id) { 3505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (id == 0) 3515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 3525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (q->contains(id)) 3535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 3545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) q->insert(id); 3555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 3565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 3575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)struct InstCond { 3595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int id; 3605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 cond; 3615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 3625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Returns whether this is a one-pass program; that is, 3645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// returns whether it is safe to use SearchOnePass on this program. 3655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// These conditions must be true for any instruction ip: 3665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 3675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (1) for any other Inst nip, there is at most one input-free 3685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// path from ip to nip. 3695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (2) there is at most one kInstByte instruction reachable from 3705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// ip that matches any particular byte c. 3715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// (3) there is at most one input-free path from ip to a kInstMatch 3725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// instruction. 3735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// 3745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// This is actually just a conservative approximation: it might 3755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// return false when the answer is true, when kInstEmptyWidth 3765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// instructions are involved. 3775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Constructs and saves corresponding one-pass NFA on success. 3785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)bool Prog::IsOnePass() { 3795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (did_onepass_) 3805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return onepass_start_ != NULL; 3815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) did_onepass_ = true; 3825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (start() == 0) // no match 3845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 3855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Steal memory for the one-pass NFA from the overall DFA budget. 3875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Willing to use at most 1/4 of the DFA budget (heuristic). 3885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Limit max node count to 65000 as a conservative estimate to 3895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // avoid overflowing 16-bit node index in encoding. 3905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int maxnodes = 2 + byte_inst_count_; 3915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int statesize = sizeof(OneState) + (bytemap_range_-1)*sizeof(uint32); 3925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes) 3935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 3945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 3955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Flood the graph starting at the start state, and check 3965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // that in each reachable state, each possible byte leads 3975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // to a unique next state. 3985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int size = this->size(); 3995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) InstCond *stack = new InstCond[size]; 4005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int* nodebyid = new int[size]; // indexed by ip 4025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) memset(nodebyid, 0xFF, size*sizeof nodebyid[0]); 4035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint8* nodes = new uint8[maxnodes*statesize]; 4055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint8* nodep = nodes; 4065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Instq tovisit(size), workq(size); 4085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddQ(&tovisit, start()); 4095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nodebyid[start()] = 0; 4105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nodep += statesize; 4115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int nalloc = 1; 4125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) { 4135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int id = *it; 4145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int nodeindex = nodebyid[id]; 4155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OneState* node = IndexToNode(nodes, statesize, nodeindex); 4165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Flood graph using manual stack, filling in actions as found. 4185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Default is none. 4195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int b = 0; b < bytemap_range_; b++) 4205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) node->action[b] = kImpossible; 4215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) node->matchcond = kImpossible; 4225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) workq.clear(); 4245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool matched = false; 4255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int nstack = 0; 4265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stack[nstack].id = id; 4275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stack[nstack++].cond = 0; 4285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (nstack > 0) { 4295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int id = stack[--nstack].id; 4305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Prog::Inst* ip = inst(id); 4315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 cond = stack[nstack].cond; 4325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) switch (ip->opcode()) { 4335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstAltMatch: 4345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // TODO(rsc): Ignoring kInstAltMatch optimization. 4355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Should implement it in this engine, but it's subtle. 4365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Fall through. 4375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstAlt: 4385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If already on work queue, (1) is violated: bail out. 4395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!AddQ(&workq, ip->out()) || !AddQ(&workq, ip->out1())) 4405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto fail; 4415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stack[nstack].id = ip->out1(); 4425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stack[nstack++].cond = cond; 4435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stack[nstack].id = ip->out(); 4445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stack[nstack++].cond = cond; 4455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 4465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 4475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstByteRange: { 4485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int nextindex = nodebyid[ip->out()]; 4495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (nextindex == -1) { 4505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (nalloc >= maxnodes) { 4515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (Debug) 4525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(ERROR) 4535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) << StringPrintf("Not OnePass: hit node limit %d > %d", 4545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nalloc, maxnodes); 4555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto fail; 4565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nextindex = nalloc; 4585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nodep += statesize; 4595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nodebyid[ip->out()] = nextindex; 4605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nalloc++; 4615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) AddQ(&tovisit, ip->out()); 4625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (matched) 4645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cond |= kMatchWins; 4655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int c = ip->lo(); c <= ip->hi(); c++) { 4665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int b = bytemap_[c]; 4675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c = unbytemap_[b]; // last c in byte class 4685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 act = node->action[b]; 4695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 newact = (nextindex << kIndexShift) | cond; 4705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((act & kImpossible) == kImpossible) { 4715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) node->action[b] = newact; 4725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (act != newact) { 4735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (Debug) { 4745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(ERROR) 4755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) << StringPrintf("Not OnePass: conflict on byte " 4765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "%#x at state %d", 4775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c, *it); 4785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto fail; 4805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ip->foldcase()) { 4835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune lo = max<Rune>(ip->lo(), 'a') + 'A' - 'a'; 4845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Rune hi = min<Rune>(ip->hi(), 'z') + 'A' - 'a'; 4855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int c = lo; c <= hi; c++) { 4865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int b = bytemap_[c]; 4875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c = unbytemap_[b]; // last c in class 4885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 act = node->action[b]; 4895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32 newact = (nextindex << kIndexShift) | cond; 4905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((act & kImpossible) == kImpossible) { 4915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) node->action[b] = newact; 4925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } else if (act != newact) { 4935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (Debug) { 4945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(ERROR) 4955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) << StringPrintf("Not OnePass: conflict on byte " 4965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) "%#x at state %d", 4975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) c, *it); 4985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 4995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto fail; 5005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 5045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstCapture: 5075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (ip->cap() < kMaxCap) 5085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cond |= (1 << kCapShift) << ip->cap(); 5095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto QueueEmpty; 5105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstEmptyWidth: 5125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) cond |= ip->empty(); 5135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto QueueEmpty; 5145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstNop: 5165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) QueueEmpty: 5175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // kInstCapture and kInstNop always proceed to ip->out(). 5185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // kInstEmptyWidth only sometimes proceeds to ip->out(), 5195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // but as a conservative approximation we assume it always does. 5205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // We could be a little more precise by looking at what c 5215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // is, but that seems like overkill. 5225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // If already on work queue, (1) is violated: bail out. 5245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!AddQ(&workq, ip->out())) { 5255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (Debug) { 5265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(ERROR) << StringPrintf("Not OnePass: multiple paths" 5275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) " %d -> %d\n", 5285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) *it, ip->out()); 5295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto fail; 5315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stack[nstack].id = ip->out(); 5335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) stack[nstack++].cond = cond; 5345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 5355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstMatch: 5375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (matched) { 5385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // (3) is violated 5395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (Debug) { 5405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(ERROR) << StringPrintf("Not OnePass: multiple matches" 5415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) " from %d\n", *it); 5425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) goto fail; 5445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) matched = true; 5465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) node->matchcond = cond; 5475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 5485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) case kInstFail: 5505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) break; 5515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (Debug) { // For debugging, dump one-pass NFA to LOG(ERROR). 5565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) string dump = "prog dump:\n" + Dump() + "node dump\n"; 5575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) map<int, int> idmap; 5585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < size; i++) 5595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (nodebyid[i] != -1) 5605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) idmap[nodebyid[i]] = i; 5615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringAppendF(&dump, "byte ranges:\n"); 5635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int i = 0; 5645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int b = 0; b < bytemap_range_; b++) { 5655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int lo = i; 5665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) while (bytemap_[i] == b) 5675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) i++; 5685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringAppendF(&dump, "\t%d: %#x-%#x\n", b, lo, i - 1); 5695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) { 5725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int id = *it; 5735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int nodeindex = nodebyid[id]; 5745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (nodeindex == -1) 5755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 5765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) OneState* node = IndexToNode(nodes, statesize, nodeindex); 5775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) string s; 5785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n", 5795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nodeindex, id, node->matchcond); 5805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < bytemap_range_; i++) { 5815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if ((node->action[i] & kImpossible) == kImpossible) 5825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) continue; 5835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) StringAppendF(&dump, " %d cond %#x -> %d id=%d\n", 5845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) i, node->action[i] & 0xFFFF, 5855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) node->action[i] >> kIndexShift, 5865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) idmap[node->action[i] >> kIndexShift]); 5875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LOG(ERROR) << dump; 5905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 5915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Overallocated earlier; cut down to actual size. 5935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nodep = new uint8[nalloc*statesize]; 5945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) memmove(nodep, nodes, nalloc*statesize); 5955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete[] nodes; 5965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) nodes = nodep; 5975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 5985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) onepass_start_ = IndexToNode(nodes, statesize, nodebyid[start()]); 5995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) onepass_nodes_ = nodes; 6005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) onepass_statesize_ = statesize; 6015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) dfa_mem_ -= nalloc*statesize; 6025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete[] stack; 6045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete[] nodebyid; 6055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return true; 6065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)fail: 6085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete[] stack; 6095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete[] nodebyid; 6105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) delete[] nodes; 6115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return false; 6125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 6135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 6145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace re2 615