1// Copyright 2007 The RE2 Authors. All Rights Reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// Compile regular expression to Prog. 6// 7// Prog and Inst are defined in prog.h. 8// This file's external interface is just Regexp::CompileToProg. 9// The Compiler class defined in this file is private. 10 11#include "re2/prog.h" 12#include "re2/re2.h" 13#include "re2/regexp.h" 14#include "re2/walker-inl.h" 15 16namespace re2 { 17 18// List of pointers to Inst* that need to be filled in (patched). 19// Because the Inst* haven't been filled in yet, 20// we can use the Inst* word to hold the list's "next" pointer. 21// It's kind of sleazy, but it works well in practice. 22// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration. 23// 24// Because the out and out1 fields in Inst are no longer pointers, 25// we can't use pointers directly here either. Instead, p refers 26// to inst_[p>>1].out (p&1 == 0) or inst_[p>>1].out1 (p&1 == 1). 27// p == 0 represents the NULL list. This is okay because instruction #0 28// is always the fail instruction, which never appears on a list. 29 30struct PatchList { 31 uint32 p; 32 33 // Returns patch list containing just p. 34 static PatchList Mk(uint32 p); 35 36 // Patches all the entries on l to have value v. 37 // Caller must not ever use patch list again. 38 static void Patch(Prog::Inst *inst0, PatchList l, uint32 v); 39 40 // Deref returns the next pointer pointed at by p. 41 static PatchList Deref(Prog::Inst *inst0, PatchList l); 42 43 // Appends two patch lists and returns result. 44 static PatchList Append(Prog::Inst *inst0, PatchList l1, PatchList l2); 45}; 46 47static PatchList nullPatchList = { 0 }; 48 49// Returns patch list containing just p. 50PatchList PatchList::Mk(uint32 p) { 51 PatchList l; 52 l.p = p; 53 return l; 54} 55 56// Returns the next pointer pointed at by l. 57PatchList PatchList::Deref(Prog::Inst* inst0, PatchList l) { 58 Prog::Inst* ip = &inst0[l.p>>1]; 59 if (l.p&1) 60 l.p = ip->out1(); 61 else 62 l.p = ip->out(); 63 return l; 64} 65 66// Patches all the entries on l to have value v. 67void PatchList::Patch(Prog::Inst *inst0, PatchList l, uint32 val) { 68 while (l.p != 0) { 69 Prog::Inst* ip = &inst0[l.p>>1]; 70 if (l.p&1) { 71 l.p = ip->out1(); 72 ip->out1_ = val; 73 } else { 74 l.p = ip->out(); 75 ip->set_out(val); 76 } 77 } 78} 79 80// Appends two patch lists and returns result. 81PatchList PatchList::Append(Prog::Inst* inst0, PatchList l1, PatchList l2) { 82 if (l1.p == 0) 83 return l2; 84 if (l2.p == 0) 85 return l1; 86 87 PatchList l = l1; 88 for (;;) { 89 PatchList next = PatchList::Deref(inst0, l); 90 if (next.p == 0) 91 break; 92 l = next; 93 } 94 95 Prog::Inst* ip = &inst0[l.p>>1]; 96 if (l.p&1) 97 ip->out1_ = l2.p; 98 else 99 ip->set_out(l2.p); 100 101 return l1; 102} 103 104// Compiled program fragment. 105struct Frag { 106 uint32 begin; 107 PatchList end; 108 109 Frag() : begin(0) { end.p = 0; } // needed so Frag can go in vector 110 Frag(uint32 begin, PatchList end) : begin(begin), end(end) {} 111}; 112 113static Frag NullFrag() { 114 return Frag(); 115} 116 117// Input encodings. 118enum Encoding { 119 kEncodingUTF8 = 1, // UTF-8 (0-10FFFF) 120 kEncodingLatin1, // Latin1 (0-FF) 121}; 122 123class Compiler : public Regexp::Walker<Frag> { 124 public: 125 explicit Compiler(); 126 ~Compiler(); 127 128 // Compiles Regexp to a new Prog. 129 // Caller is responsible for deleting Prog when finished with it. 130 // If reversed is true, compiles for walking over the input 131 // string backward (reverses all concatenations). 132 static Prog *Compile(Regexp* re, bool reversed, int64 max_mem); 133 134 // Compiles alternation of all the re to a new Prog. 135 // Each re has a match with an id equal to its index in the vector. 136 static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, 137 Regexp* re); 138 139 // Interface for Regexp::Walker, which helps traverse the Regexp. 140 // The walk is purely post-recursive: given the machines for the 141 // children, PostVisit combines them to create the machine for 142 // the current node. The child_args are Frags. 143 // The Compiler traverses the Regexp parse tree, visiting 144 // each node in depth-first order. It invokes PreVisit before 145 // visiting the node's children and PostVisit after visiting 146 // the children. 147 Frag PreVisit(Regexp* re, Frag parent_arg, bool* stop); 148 Frag PostVisit(Regexp* re, Frag parent_arg, Frag pre_arg, Frag* child_args, 149 int nchild_args); 150 Frag ShortVisit(Regexp* re, Frag parent_arg); 151 Frag Copy(Frag arg); 152 153 // Given fragment a, returns a+ or a+?; a* or a*?; a? or a?? 154 Frag Plus(Frag a, bool nongreedy); 155 Frag Star(Frag a, bool nongreedy); 156 Frag Quest(Frag a, bool nongreedy); 157 158 // Given fragment a, returns (a) capturing as \n. 159 Frag Capture(Frag a, int n); 160 161 // Given fragments a and b, returns ab; a|b 162 Frag Cat(Frag a, Frag b); 163 Frag Alt(Frag a, Frag b); 164 165 // Returns a fragment that can't match anything. 166 Frag NoMatch(); 167 168 // Returns a fragment that matches the empty string. 169 Frag Match(int32 id); 170 171 // Returns a no-op fragment. 172 Frag Nop(); 173 174 // Returns a fragment matching the byte range lo-hi. 175 Frag ByteRange(int lo, int hi, bool foldcase); 176 177 // Returns a fragment matching an empty-width special op. 178 Frag EmptyWidth(EmptyOp op); 179 180 // Adds n instructions to the program. 181 // Returns the index of the first one. 182 // Returns -1 if no more instructions are available. 183 int AllocInst(int n); 184 185 // Deletes unused instructions. 186 void Trim(); 187 188 // Rune range compiler. 189 190 // Begins a new alternation. 191 void BeginRange(); 192 193 // Adds a fragment matching the rune range lo-hi. 194 void AddRuneRange(Rune lo, Rune hi, bool foldcase); 195 void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase); 196 void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase); 197 void Add_80_10ffff(); 198 199 // New suffix that matches the byte range lo-hi, then goes to next. 200 int RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); 201 int UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next); 202 203 // Adds a suffix to alternation. 204 void AddSuffix(int id); 205 206 // Returns the alternation of all the added suffixes. 207 Frag EndRange(); 208 209 // Single rune. 210 Frag Literal(Rune r, bool foldcase); 211 212 void Setup(Regexp::ParseFlags, int64, RE2::Anchor); 213 Prog* Finish(); 214 215 // Returns .* where dot = any byte 216 Frag DotStar(); 217 218 private: 219 Prog* prog_; // Program being built. 220 bool failed_; // Did we give up compiling? 221 Encoding encoding_; // Input encoding 222 bool reversed_; // Should program run backward over text? 223 224 int max_inst_; // Maximum number of instructions. 225 226 Prog::Inst* inst_; // Pointer to first instruction. 227 int inst_len_; // Number of instructions used. 228 int inst_cap_; // Number of instructions allocated. 229 230 int64 max_mem_; // Total memory budget. 231 232 map<uint64, int> rune_cache_; 233 Frag rune_range_; 234 235 RE2::Anchor anchor_; // anchor mode for RE2::Set 236 237 DISALLOW_EVIL_CONSTRUCTORS(Compiler); 238}; 239 240Compiler::Compiler() { 241 prog_ = new Prog(); 242 failed_ = false; 243 encoding_ = kEncodingUTF8; 244 reversed_ = false; 245 inst_ = NULL; 246 inst_len_ = 0; 247 inst_cap_ = 0; 248 max_inst_ = 1; // make AllocInst for fail instruction okay 249 max_mem_ = 0; 250 int fail = AllocInst(1); 251 inst_[fail].InitFail(); 252 max_inst_ = 0; // Caller must change 253} 254 255Compiler::~Compiler() { 256 delete prog_; 257 delete[] inst_; 258} 259 260int Compiler::AllocInst(int n) { 261 if (failed_ || inst_len_ + n > max_inst_) { 262 failed_ = true; 263 return -1; 264 } 265 266 if (inst_len_ + n > inst_cap_) { 267 if (inst_cap_ == 0) 268 inst_cap_ = 8; 269 while (inst_len_ + n > inst_cap_) 270 inst_cap_ *= 2; 271 Prog::Inst* ip = new Prog::Inst[inst_cap_]; 272 memmove(ip, inst_, inst_len_ * sizeof ip[0]); 273 memset(ip + inst_len_, 0, (inst_cap_ - inst_len_) * sizeof ip[0]); 274 delete[] inst_; 275 inst_ = ip; 276 } 277 int id = inst_len_; 278 inst_len_ += n; 279 return id; 280} 281 282void Compiler::Trim() { 283 if (inst_len_ < inst_cap_) { 284 Prog::Inst* ip = new Prog::Inst[inst_len_]; 285 memmove(ip, inst_, inst_len_ * sizeof ip[0]); 286 delete[] inst_; 287 inst_ = ip; 288 inst_cap_ = inst_len_; 289 } 290} 291 292// These routines are somewhat hard to visualize in text -- 293// see http://swtch.com/~rsc/regexp/regexp1.html for 294// pictures explaining what is going on here. 295 296// Returns an unmatchable fragment. 297Frag Compiler::NoMatch() { 298 return Frag(0, nullPatchList); 299} 300 301// Is a an unmatchable fragment? 302static bool IsNoMatch(Frag a) { 303 return a.begin == 0; 304} 305 306// Given fragments a and b, returns fragment for ab. 307Frag Compiler::Cat(Frag a, Frag b) { 308 if (IsNoMatch(a) || IsNoMatch(b)) 309 return NoMatch(); 310 311 // Elide no-op. 312 Prog::Inst* begin = &inst_[a.begin]; 313 if (begin->opcode() == kInstNop && 314 a.end.p == (a.begin << 1) && 315 begin->out() == 0) { 316 PatchList::Patch(inst_, a.end, b.begin); // in case refs to a somewhere 317 return b; 318 } 319 320 // To run backward over string, reverse all concatenations. 321 if (reversed_) { 322 PatchList::Patch(inst_, b.end, a.begin); 323 return Frag(b.begin, a.end); 324 } 325 326 PatchList::Patch(inst_, a.end, b.begin); 327 return Frag(a.begin, b.end); 328} 329 330// Given fragments for a and b, returns fragment for a|b. 331Frag Compiler::Alt(Frag a, Frag b) { 332 // Special case for convenience in loops. 333 if (IsNoMatch(a)) 334 return b; 335 if (IsNoMatch(b)) 336 return a; 337 338 int id = AllocInst(1); 339 if (id < 0) 340 return NoMatch(); 341 342 inst_[id].InitAlt(a.begin, b.begin); 343 return Frag(id, PatchList::Append(inst_, a.end, b.end)); 344} 345 346// When capturing submatches in like-Perl mode, a kOpAlt Inst 347// treats out_ as the first choice, out1_ as the second. 348// 349// For *, +, and ?, if out_ causes another repetition, 350// then the operator is greedy. If out1_ is the repetition 351// (and out_ moves forward), then the operator is non-greedy. 352 353// Given a fragment a, returns a fragment for a* or a*? (if nongreedy) 354Frag Compiler::Star(Frag a, bool nongreedy) { 355 int id = AllocInst(1); 356 if (id < 0) 357 return NoMatch(); 358 inst_[id].InitAlt(0, 0); 359 PatchList::Patch(inst_, a.end, id); 360 if (nongreedy) { 361 inst_[id].out1_ = a.begin; 362 return Frag(id, PatchList::Mk(id << 1)); 363 } else { 364 inst_[id].set_out(a.begin); 365 return Frag(id, PatchList::Mk((id << 1) | 1)); 366 } 367} 368 369// Given a fragment for a, returns a fragment for a+ or a+? (if nongreedy) 370Frag Compiler::Plus(Frag a, bool nongreedy) { 371 // a+ is just a* with a different entry point. 372 Frag f = Star(a, nongreedy); 373 return Frag(a.begin, f.end); 374} 375 376// Given a fragment for a, returns a fragment for a? or a?? (if nongreedy) 377Frag Compiler::Quest(Frag a, bool nongreedy) { 378 int id = AllocInst(1); 379 if (id < 0) 380 return NoMatch(); 381 PatchList pl; 382 if (nongreedy) { 383 inst_[id].InitAlt(0, a.begin); 384 pl = PatchList::Mk(id << 1); 385 } else { 386 inst_[id].InitAlt(a.begin, 0); 387 pl = PatchList::Mk((id << 1) | 1); 388 } 389 return Frag(id, PatchList::Append(inst_, pl, a.end)); 390} 391 392// Returns a fragment for the byte range lo-hi. 393Frag Compiler::ByteRange(int lo, int hi, bool foldcase) { 394 int id = AllocInst(1); 395 if (id < 0) 396 return NoMatch(); 397 inst_[id].InitByteRange(lo, hi, foldcase, 0); 398 prog_->byte_inst_count_++; 399 prog_->MarkByteRange(lo, hi); 400 if (foldcase && lo <= 'z' && hi >= 'a') { 401 if (lo < 'a') 402 lo = 'a'; 403 if (hi > 'z') 404 hi = 'z'; 405 if (lo <= hi) 406 prog_->MarkByteRange(lo + 'A' - 'a', hi + 'A' - 'a'); 407 } 408 return Frag(id, PatchList::Mk(id << 1)); 409} 410 411// Returns a no-op fragment. Sometimes unavoidable. 412Frag Compiler::Nop() { 413 int id = AllocInst(1); 414 if (id < 0) 415 return NoMatch(); 416 inst_[id].InitNop(0); 417 return Frag(id, PatchList::Mk(id << 1)); 418} 419 420// Returns a fragment that signals a match. 421Frag Compiler::Match(int32 match_id) { 422 int id = AllocInst(1); 423 if (id < 0) 424 return NoMatch(); 425 inst_[id].InitMatch(match_id); 426 return Frag(id, nullPatchList); 427} 428 429// Returns a fragment matching a particular empty-width op (like ^ or $) 430Frag Compiler::EmptyWidth(EmptyOp empty) { 431 int id = AllocInst(1); 432 if (id < 0) 433 return NoMatch(); 434 inst_[id].InitEmptyWidth(empty, 0); 435 if (empty & (kEmptyBeginLine|kEmptyEndLine)) 436 prog_->MarkByteRange('\n', '\n'); 437 if (empty & (kEmptyWordBoundary|kEmptyNonWordBoundary)) { 438 int j; 439 for (int i = 0; i < 256; i = j) { 440 for (j = i+1; j < 256 && Prog::IsWordChar(i) == Prog::IsWordChar(j); j++) 441 ; 442 prog_->MarkByteRange(i, j-1); 443 } 444 } 445 return Frag(id, PatchList::Mk(id << 1)); 446} 447 448// Given a fragment a, returns a fragment with capturing parens around a. 449Frag Compiler::Capture(Frag a, int n) { 450 int id = AllocInst(2); 451 if (id < 0) 452 return NoMatch(); 453 inst_[id].InitCapture(2*n, a.begin); 454 inst_[id+1].InitCapture(2*n+1, 0); 455 PatchList::Patch(inst_, a.end, id+1); 456 457 return Frag(id, PatchList::Mk((id+1) << 1)); 458} 459 460// A Rune is a name for a Unicode code point. 461// Returns maximum rune encoded by UTF-8 sequence of length len. 462static int MaxRune(int len) { 463 int b; // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax) 464 if (len == 1) 465 b = 7; 466 else 467 b = 8-(len+1) + 6*(len-1); 468 return (1<<b) - 1; // maximum Rune for b bits. 469} 470 471// The rune range compiler caches common suffix fragments, 472// which are very common in UTF-8 (e.g., [80-bf]). 473// The fragment suffixes are identified by their start 474// instructions. NULL denotes the eventual end match. 475// The Frag accumulates in rune_range_. Caching common 476// suffixes reduces the UTF-8 "." from 32 to 24 instructions, 477// and it reduces the corresponding one-pass NFA from 16 nodes to 8. 478 479void Compiler::BeginRange() { 480 rune_cache_.clear(); 481 rune_range_.begin = 0; 482 rune_range_.end = nullPatchList; 483} 484 485int Compiler::UncachedRuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, 486 int next) { 487 Frag f = ByteRange(lo, hi, foldcase); 488 if (next != 0) { 489 PatchList::Patch(inst_, f.end, next); 490 } else { 491 rune_range_.end = PatchList::Append(inst_, rune_range_.end, f.end); 492 } 493 return f.begin; 494} 495 496int Compiler::RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, int next) { 497 // In Latin1 mode, there's no point in caching. 498 // In forward UTF-8 mode, only need to cache continuation bytes. 499 if (encoding_ == kEncodingLatin1 || 500 (encoding_ == kEncodingUTF8 && 501 !reversed_ && 502 !(0x80 <= lo && hi <= 0xbf))) { 503 return UncachedRuneByteSuffix(lo, hi, foldcase, next); 504 } 505 506 uint64 key = ((uint64)next << 17) | (lo<<9) | (hi<<1) | (foldcase ? 1ULL : 0ULL); 507 map<uint64, int>::iterator it = rune_cache_.find(key); 508 if (it != rune_cache_.end()) 509 return it->second; 510 int id = UncachedRuneByteSuffix(lo, hi, foldcase, next); 511 rune_cache_[key] = id; 512 return id; 513} 514 515void Compiler::AddSuffix(int id) { 516 if (rune_range_.begin == 0) { 517 rune_range_.begin = id; 518 return; 519 } 520 521 int alt = AllocInst(1); 522 if (alt < 0) { 523 rune_range_.begin = 0; 524 return; 525 } 526 inst_[alt].InitAlt(rune_range_.begin, id); 527 rune_range_.begin = alt; 528} 529 530Frag Compiler::EndRange() { 531 return rune_range_; 532} 533 534// Converts rune range lo-hi into a fragment that recognizes 535// the bytes that would make up those runes in the current 536// encoding (Latin 1 or UTF-8). 537// This lets the machine work byte-by-byte even when 538// using multibyte encodings. 539 540void Compiler::AddRuneRange(Rune lo, Rune hi, bool foldcase) { 541 switch (encoding_) { 542 default: 543 case kEncodingUTF8: 544 AddRuneRangeUTF8(lo, hi, foldcase); 545 break; 546 case kEncodingLatin1: 547 AddRuneRangeLatin1(lo, hi, foldcase); 548 break; 549 } 550} 551 552void Compiler::AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase) { 553 // Latin1 is easy: runes *are* bytes. 554 if (lo > hi || lo > 0xFF) 555 return; 556 if (hi > 0xFF) 557 hi = 0xFF; 558 AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0)); 559} 560 561// Table describing how to make a UTF-8 matching machine 562// for the rune range 80-10FFFF (Runeself-Runemax). 563// This range happens frequently enough (for example /./ and /[^a-z]/) 564// and the rune_cache_ map is slow enough that this is worth 565// special handling. Makes compilation of a small expression 566// with a dot in it about 10% faster. 567// The * in the comments below mark whole sequences. 568static struct ByteRangeProg { 569 int next; 570 int lo; 571 int hi; 572} prog_80_10ffff[] = { 573 // Two-byte 574 { -1, 0x80, 0xBF, }, // 0: 80-BF 575 { 0, 0xC2, 0xDF, }, // 1: C2-DF 80-BF* 576 577 // Three-byte 578 { 0, 0xA0, 0xBF, }, // 2: A0-BF 80-BF 579 { 2, 0xE0, 0xE0, }, // 3: E0 A0-BF 80-BF* 580 { 0, 0x80, 0xBF, }, // 4: 80-BF 80-BF 581 { 4, 0xE1, 0xEF, }, // 5: E1-EF 80-BF 80-BF* 582 583 // Four-byte 584 { 4, 0x90, 0xBF, }, // 6: 90-BF 80-BF 80-BF 585 { 6, 0xF0, 0xF0, }, // 7: F0 90-BF 80-BF 80-BF* 586 { 4, 0x80, 0xBF, }, // 8: 80-BF 80-BF 80-BF 587 { 8, 0xF1, 0xF3, }, // 9: F1-F3 80-BF 80-BF 80-BF* 588 { 4, 0x80, 0x8F, }, // 10: 80-8F 80-BF 80-BF 589 { 10, 0xF4, 0xF4, }, // 11: F4 80-8F 80-BF 80-BF* 590}; 591 592void Compiler::Add_80_10ffff() { 593 int inst[arraysize(prog_80_10ffff)] = { 0 }; // does not need to be initialized; silences gcc warning 594 for (int i = 0; i < arraysize(prog_80_10ffff); i++) { 595 const ByteRangeProg& p = prog_80_10ffff[i]; 596 int next = 0; 597 if (p.next >= 0) 598 next = inst[p.next]; 599 inst[i] = UncachedRuneByteSuffix(p.lo, p.hi, false, next); 600 if ((p.lo & 0xC0) != 0x80) 601 AddSuffix(inst[i]); 602 } 603} 604 605void Compiler::AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase) { 606 if (lo > hi) 607 return; 608 609 // Pick off 80-10FFFF as a common special case 610 // that can bypass the slow rune_cache_. 611 if (lo == 0x80 && hi == 0x10ffff && !reversed_) { 612 Add_80_10ffff(); 613 return; 614 } 615 616 // Split range into same-length sized ranges. 617 for (int i = 1; i < UTFmax; i++) { 618 Rune max = MaxRune(i); 619 if (lo <= max && max < hi) { 620 AddRuneRangeUTF8(lo, max, foldcase); 621 AddRuneRangeUTF8(max+1, hi, foldcase); 622 return; 623 } 624 } 625 626 // ASCII range is always a special case. 627 if (hi < Runeself) { 628 AddSuffix(RuneByteSuffix(lo, hi, foldcase, 0)); 629 return; 630 } 631 632 // Split range into sections that agree on leading bytes. 633 for (int i = 1; i < UTFmax; i++) { 634 uint m = (1<<(6*i)) - 1; // last i bytes of a UTF-8 sequence 635 if ((lo & ~m) != (hi & ~m)) { 636 if ((lo & m) != 0) { 637 AddRuneRangeUTF8(lo, lo|m, foldcase); 638 AddRuneRangeUTF8((lo|m)+1, hi, foldcase); 639 return; 640 } 641 if ((hi & m) != m) { 642 AddRuneRangeUTF8(lo, (hi&~m)-1, foldcase); 643 AddRuneRangeUTF8(hi&~m, hi, foldcase); 644 return; 645 } 646 } 647 } 648 649 // Finally. Generate byte matching equivalent for lo-hi. 650 uint8 ulo[UTFmax], uhi[UTFmax]; 651 int n = runetochar(reinterpret_cast<char*>(ulo), &lo); 652 int m = runetochar(reinterpret_cast<char*>(uhi), &hi); 653 (void)m; // USED(m) 654 DCHECK_EQ(n, m); 655 656 int id = 0; 657 if (reversed_) { 658 for (int i = 0; i < n; i++) 659 id = RuneByteSuffix(ulo[i], uhi[i], false, id); 660 } else { 661 for (int i = n-1; i >= 0; i--) 662 id = RuneByteSuffix(ulo[i], uhi[i], false, id); 663 } 664 AddSuffix(id); 665} 666 667// Should not be called. 668Frag Compiler::Copy(Frag arg) { 669 // We're using WalkExponential; there should be no copying. 670 LOG(DFATAL) << "Compiler::Copy called!"; 671 failed_ = true; 672 return NoMatch(); 673} 674 675// Visits a node quickly; called once WalkExponential has 676// decided to cut this walk short. 677Frag Compiler::ShortVisit(Regexp* re, Frag) { 678 failed_ = true; 679 return NoMatch(); 680} 681 682// Called before traversing a node's children during the walk. 683Frag Compiler::PreVisit(Regexp* re, Frag, bool* stop) { 684 // Cut off walk if we've already failed. 685 if (failed_) 686 *stop = true; 687 688 return NullFrag(); // not used by caller 689} 690 691Frag Compiler::Literal(Rune r, bool foldcase) { 692 switch (encoding_) { 693 default: 694 return NullFrag(); 695 696 case kEncodingLatin1: 697 return ByteRange(r, r, foldcase); 698 699 case kEncodingUTF8: { 700 if (r < Runeself) // Make common case fast. 701 return ByteRange(r, r, foldcase); 702 uint8 buf[UTFmax]; 703 int n = runetochar(reinterpret_cast<char*>(buf), &r); 704 Frag f = ByteRange((uint8)buf[0], buf[0], false); 705 for (int i = 1; i < n; i++) 706 f = Cat(f, ByteRange((uint8)buf[i], buf[i], false)); 707 return f; 708 } 709 } 710} 711 712// Called after traversing the node's children during the walk. 713// Given their frags, build and return the frag for this re. 714Frag Compiler::PostVisit(Regexp* re, Frag, Frag, Frag* child_frags, 715 int nchild_frags) { 716 // If a child failed, don't bother going forward, especially 717 // since the child_frags might contain Frags with NULLs in them. 718 if (failed_) 719 return NoMatch(); 720 721 // Given the child fragments, return the fragment for this node. 722 switch (re->op()) { 723 case kRegexpRepeat: 724 // Should not see; code at bottom of function will print error 725 break; 726 727 case kRegexpNoMatch: 728 return NoMatch(); 729 730 case kRegexpEmptyMatch: 731 return Nop(); 732 733 case kRegexpHaveMatch: { 734 Frag f = Match(re->match_id()); 735 // Remember unanchored match to end of string. 736 if (anchor_ != RE2::ANCHOR_BOTH) 737 f = Cat(DotStar(), Cat(EmptyWidth(kEmptyEndText), f)); 738 return f; 739 } 740 741 case kRegexpConcat: { 742 Frag f = child_frags[0]; 743 for (int i = 1; i < nchild_frags; i++) 744 f = Cat(f, child_frags[i]); 745 return f; 746 } 747 748 case kRegexpAlternate: { 749 Frag f = child_frags[0]; 750 for (int i = 1; i < nchild_frags; i++) 751 f = Alt(f, child_frags[i]); 752 return f; 753 } 754 755 case kRegexpStar: 756 return Star(child_frags[0], re->parse_flags()&Regexp::NonGreedy); 757 758 case kRegexpPlus: 759 return Plus(child_frags[0], re->parse_flags()&Regexp::NonGreedy); 760 761 case kRegexpQuest: 762 return Quest(child_frags[0], re->parse_flags()&Regexp::NonGreedy); 763 764 case kRegexpLiteral: 765 return Literal(re->rune(), re->parse_flags()&Regexp::FoldCase); 766 767 case kRegexpLiteralString: { 768 // Concatenation of literals. 769 if (re->nrunes() == 0) 770 return Nop(); 771 Frag f; 772 for (int i = 0; i < re->nrunes(); i++) { 773 Frag f1 = Literal(re->runes()[i], re->parse_flags()&Regexp::FoldCase); 774 if (i == 0) 775 f = f1; 776 else 777 f = Cat(f, f1); 778 } 779 return f; 780 } 781 782 case kRegexpAnyChar: 783 BeginRange(); 784 AddRuneRange(0, Runemax, false); 785 return EndRange(); 786 787 case kRegexpAnyByte: 788 return ByteRange(0x00, 0xFF, false); 789 790 case kRegexpCharClass: { 791 CharClass* cc = re->cc(); 792 if (cc->empty()) { 793 // This can't happen. 794 LOG(DFATAL) << "No ranges in char class"; 795 failed_ = true; 796 return NoMatch(); 797 } 798 799 // ASCII case-folding optimization: if the char class 800 // behaves the same on A-Z as it does on a-z, 801 // discard any ranges wholly contained in A-Z 802 // and mark the other ranges as foldascii. 803 // This reduces the size of a program for 804 // (?i)abc from 3 insts per letter to 1 per letter. 805 bool foldascii = cc->FoldsASCII(); 806 807 // Character class is just a big OR of the different 808 // character ranges in the class. 809 BeginRange(); 810 for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) { 811 // ASCII case-folding optimization (see above). 812 if (foldascii && 'A' <= i->lo && i->hi <= 'Z') 813 continue; 814 815 // If this range contains all of A-Za-z or none of it, 816 // the fold flag is unnecessary; don't bother. 817 bool fold = foldascii; 818 if ((i->lo <= 'A' && 'z' <= i->hi) || i->hi < 'A' || 'z' < i->lo) 819 fold = false; 820 821 AddRuneRange(i->lo, i->hi, fold); 822 } 823 return EndRange(); 824 } 825 826 case kRegexpCapture: 827 // If this is a non-capturing parenthesis -- (?:foo) -- 828 // just use the inner expression. 829 if (re->cap() < 0) 830 return child_frags[0]; 831 return Capture(child_frags[0], re->cap()); 832 833 case kRegexpBeginLine: 834 return EmptyWidth(reversed_ ? kEmptyEndLine : kEmptyBeginLine); 835 836 case kRegexpEndLine: 837 return EmptyWidth(reversed_ ? kEmptyBeginLine : kEmptyEndLine); 838 839 case kRegexpBeginText: 840 return EmptyWidth(reversed_ ? kEmptyEndText : kEmptyBeginText); 841 842 case kRegexpEndText: 843 return EmptyWidth(reversed_ ? kEmptyBeginText : kEmptyEndText); 844 845 case kRegexpWordBoundary: 846 return EmptyWidth(kEmptyWordBoundary); 847 848 case kRegexpNoWordBoundary: 849 return EmptyWidth(kEmptyNonWordBoundary); 850 } 851 LOG(DFATAL) << "Missing case in Compiler: " << re->op(); 852 failed_ = true; 853 return NoMatch(); 854} 855 856// Is this regexp required to start at the beginning of the text? 857// Only approximate; can return false for complicated regexps like (\Aa|\Ab), 858// but handles (\A(a|b)). Could use the Walker to write a more exact one. 859static bool IsAnchorStart(Regexp** pre, int depth) { 860 Regexp* re = *pre; 861 Regexp* sub; 862 // The depth limit makes sure that we don't overflow 863 // the stack on a deeply nested regexp. As the comment 864 // above says, IsAnchorStart is conservative, so returning 865 // a false negative is okay. The exact limit is somewhat arbitrary. 866 if (re == NULL || depth >= 4) 867 return false; 868 switch (re->op()) { 869 default: 870 break; 871 case kRegexpConcat: 872 if (re->nsub() > 0) { 873 sub = re->sub()[0]->Incref(); 874 if (IsAnchorStart(&sub, depth+1)) { 875 Regexp** subcopy = new Regexp*[re->nsub()]; 876 subcopy[0] = sub; // already have reference 877 for (int i = 1; i < re->nsub(); i++) 878 subcopy[i] = re->sub()[i]->Incref(); 879 *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags()); 880 delete[] subcopy; 881 re->Decref(); 882 return true; 883 } 884 sub->Decref(); 885 } 886 break; 887 case kRegexpCapture: 888 sub = re->sub()[0]->Incref(); 889 if (IsAnchorStart(&sub, depth+1)) { 890 *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); 891 re->Decref(); 892 return true; 893 } 894 sub->Decref(); 895 break; 896 case kRegexpBeginText: 897 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); 898 re->Decref(); 899 return true; 900 } 901 return false; 902} 903 904// Is this regexp required to start at the end of the text? 905// Only approximate; can return false for complicated regexps like (a\z|b\z), 906// but handles ((a|b)\z). Could use the Walker to write a more exact one. 907static bool IsAnchorEnd(Regexp** pre, int depth) { 908 Regexp* re = *pre; 909 Regexp* sub; 910 // The depth limit makes sure that we don't overflow 911 // the stack on a deeply nested regexp. As the comment 912 // above says, IsAnchorEnd is conservative, so returning 913 // a false negative is okay. The exact limit is somewhat arbitrary. 914 if (re == NULL || depth >= 4) 915 return false; 916 switch (re->op()) { 917 default: 918 break; 919 case kRegexpConcat: 920 if (re->nsub() > 0) { 921 sub = re->sub()[re->nsub() - 1]->Incref(); 922 if (IsAnchorEnd(&sub, depth+1)) { 923 Regexp** subcopy = new Regexp*[re->nsub()]; 924 subcopy[re->nsub() - 1] = sub; // already have reference 925 for (int i = 0; i < re->nsub() - 1; i++) 926 subcopy[i] = re->sub()[i]->Incref(); 927 *pre = Regexp::Concat(subcopy, re->nsub(), re->parse_flags()); 928 delete[] subcopy; 929 re->Decref(); 930 return true; 931 } 932 sub->Decref(); 933 } 934 break; 935 case kRegexpCapture: 936 sub = re->sub()[0]->Incref(); 937 if (IsAnchorEnd(&sub, depth+1)) { 938 *pre = Regexp::Capture(sub, re->parse_flags(), re->cap()); 939 re->Decref(); 940 return true; 941 } 942 sub->Decref(); 943 break; 944 case kRegexpEndText: 945 *pre = Regexp::LiteralString(NULL, 0, re->parse_flags()); 946 re->Decref(); 947 return true; 948 } 949 return false; 950} 951 952void Compiler::Setup(Regexp::ParseFlags flags, int64 max_mem, 953 RE2::Anchor anchor) { 954 prog_->set_flags(flags); 955 956 if (flags & Regexp::Latin1) 957 encoding_ = kEncodingLatin1; 958 max_mem_ = max_mem; 959 if (max_mem <= 0) { 960 max_inst_ = 100000; // more than enough 961 } else if (max_mem <= sizeof(Prog)) { 962 // No room for anything. 963 max_inst_ = 0; 964 } else { 965 int64 m = (max_mem - sizeof(Prog)) / sizeof(Prog::Inst); 966 // Limit instruction count so that inst->id() fits nicely in an int. 967 // SparseArray also assumes that the indices (inst->id()) are ints. 968 // The call to WalkExponential uses 2*max_inst_ below, 969 // and other places in the code use 2 or 3 * prog->size(). 970 // Limiting to 2^24 should avoid overflow in those places. 971 // (The point of allowing more than 32 bits of memory is to 972 // have plenty of room for the DFA states, not to use it up 973 // on the program.) 974 if (m >= 1<<24) 975 m = 1<<24; 976 977 // Inst imposes its own limit (currently bigger than 2^24 but be safe). 978 if (m > Prog::Inst::kMaxInst) 979 m = Prog::Inst::kMaxInst; 980 981 max_inst_ = m; 982 } 983 984 anchor_ = anchor; 985} 986 987// Compiles re, returning program. 988// Caller is responsible for deleting prog_. 989// If reversed is true, compiles a program that expects 990// to run over the input string backward (reverses all concatenations). 991// The reversed flag is also recorded in the returned program. 992Prog* Compiler::Compile(Regexp* re, bool reversed, int64 max_mem) { 993 Compiler c; 994 995 c.Setup(re->parse_flags(), max_mem, RE2::ANCHOR_BOTH /* unused */); 996 c.reversed_ = reversed; 997 998 // Simplify to remove things like counted repetitions 999 // and character classes like \d. 1000 Regexp* sre = re->Simplify(); 1001 if (sre == NULL) 1002 return NULL; 1003 1004 // Record whether prog is anchored, removing the anchors. 1005 // (They get in the way of other optimizations.) 1006 bool is_anchor_start = IsAnchorStart(&sre, 0); 1007 bool is_anchor_end = IsAnchorEnd(&sre, 0); 1008 1009 // Generate fragment for entire regexp. 1010 Frag f = c.WalkExponential(sre, NullFrag(), 2*c.max_inst_); 1011 sre->Decref(); 1012 if (c.failed_) 1013 return NULL; 1014 1015 // Success! Finish by putting Match node at end, and record start. 1016 // Turn off c.reversed_ (if it is set) to force the remaining concatenations 1017 // to behave normally. 1018 c.reversed_ = false; 1019 Frag all = c.Cat(f, c.Match(0)); 1020 c.prog_->set_start(all.begin); 1021 1022 if (reversed) { 1023 c.prog_->set_anchor_start(is_anchor_end); 1024 c.prog_->set_anchor_end(is_anchor_start); 1025 } else { 1026 c.prog_->set_anchor_start(is_anchor_start); 1027 c.prog_->set_anchor_end(is_anchor_end); 1028 } 1029 1030 // Also create unanchored version, which starts with a .*? loop. 1031 if (c.prog_->anchor_start()) { 1032 c.prog_->set_start_unanchored(c.prog_->start()); 1033 } else { 1034 Frag unanchored = c.Cat(c.DotStar(), all); 1035 c.prog_->set_start_unanchored(unanchored.begin); 1036 } 1037 1038 c.prog_->set_reversed(reversed); 1039 1040 // Hand ownership of prog_ to caller. 1041 return c.Finish(); 1042} 1043 1044Prog* Compiler::Finish() { 1045 if (failed_) 1046 return NULL; 1047 1048 if (prog_->start() == 0 && prog_->start_unanchored() == 0) { 1049 // No possible matches; keep Fail instruction only. 1050 inst_len_ = 1; 1051 } 1052 1053 // Trim instruction to minimum array and transfer to Prog. 1054 Trim(); 1055 prog_->inst_ = inst_; 1056 prog_->size_ = inst_len_; 1057 inst_ = NULL; 1058 1059 // Compute byte map. 1060 prog_->ComputeByteMap(); 1061 1062 prog_->Optimize(); 1063 1064 // Record remaining memory for DFA. 1065 if (max_mem_ <= 0) { 1066 prog_->set_dfa_mem(1<<20); 1067 } else { 1068 int64 m = max_mem_ - sizeof(Prog) - inst_len_*sizeof(Prog::Inst); 1069 if (m < 0) 1070 m = 0; 1071 prog_->set_dfa_mem(m); 1072 } 1073 1074 Prog* p = prog_; 1075 prog_ = NULL; 1076 return p; 1077} 1078 1079// Converts Regexp to Prog. 1080Prog* Regexp::CompileToProg(int64 max_mem) { 1081 return Compiler::Compile(this, false, max_mem); 1082} 1083 1084Prog* Regexp::CompileToReverseProg(int64 max_mem) { 1085 return Compiler::Compile(this, true, max_mem); 1086} 1087 1088Frag Compiler::DotStar() { 1089 return Star(ByteRange(0x00, 0xff, false), true); 1090} 1091 1092// Compiles RE set to Prog. 1093Prog* Compiler::CompileSet(const RE2::Options& options, RE2::Anchor anchor, 1094 Regexp* re) { 1095 Compiler c; 1096 1097 Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(options.ParseFlags()); 1098 c.Setup(pf, options.max_mem(), anchor); 1099 1100 // Compile alternation of fragments. 1101 Frag all = c.WalkExponential(re, NullFrag(), 2*c.max_inst_); 1102 re->Decref(); 1103 if (c.failed_) 1104 return NULL; 1105 1106 if (anchor == RE2::UNANCHORED) { 1107 // The trailing .* was added while handling kRegexpHaveMatch. 1108 // We just have to add the leading one. 1109 all = c.Cat(c.DotStar(), all); 1110 } 1111 1112 c.prog_->set_start(all.begin); 1113 c.prog_->set_start_unanchored(all.begin); 1114 c.prog_->set_anchor_start(true); 1115 c.prog_->set_anchor_end(true); 1116 1117 Prog* prog = c.Finish(); 1118 if (prog == NULL) 1119 return NULL; 1120 1121 // Make sure DFA has enough memory to operate, 1122 // since we're not going to fall back to the NFA. 1123 bool failed; 1124 StringPiece sp = "hello, world"; 1125 prog->SearchDFA(sp, sp, Prog::kAnchored, Prog::kManyMatch, 1126 NULL, &failed, NULL); 1127 if (failed) { 1128 delete prog; 1129 return NULL; 1130 } 1131 1132 return prog; 1133} 1134 1135Prog* Prog::CompileSet(const RE2::Options& options, RE2::Anchor anchor, 1136 Regexp* re) { 1137 return Compiler::CompileSet(options, anchor, re); 1138} 1139 1140} // namespace re2 1141