1/* $NetBSD: engine.c,v 1.24 2012/03/13 21:13:42 christos Exp $ */ 2 3/*- 4 * Copyright (c) 1992, 1993, 1994 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Henry Spencer. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)engine.c 8.5 (Berkeley) 3/20/94 35 */ 36 37/*- 38 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 39 * 40 * This code is derived from software contributed to Berkeley by 41 * Henry Spencer. 42 * 43 * Redistribution and use in source and binary forms, with or without 44 * modification, are permitted provided that the following conditions 45 * are met: 46 * 1. Redistributions of source code must retain the above copyright 47 * notice, this list of conditions and the following disclaimer. 48 * 2. Redistributions in binary form must reproduce the above copyright 49 * notice, this list of conditions and the following disclaimer in the 50 * documentation and/or other materials provided with the distribution. 51 * 3. All advertising materials mentioning features or use of this software 52 * must display the following acknowledgement: 53 * This product includes software developed by the University of 54 * California, Berkeley and its contributors. 55 * 4. Neither the name of the University nor the names of its contributors 56 * may be used to endorse or promote products derived from this software 57 * without specific prior written permission. 58 * 59 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 60 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 62 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 69 * SUCH DAMAGE. 70 * 71 * @(#)engine.c 8.5 (Berkeley) 3/20/94 72 */ 73 74/* 75 * The matching engine and friends. This file is #included by regexec.c 76 * after suitable #defines of a variety of macros used herein, so that 77 * different state representations can be used without duplicating masses 78 * of code. 79 */ 80 81#ifdef SNAMES 82#define matcher smatcher 83#define fast sfast 84#define slow sslow 85#define dissect sdissect 86#define backref sbackref 87#define step sstep 88#define print sprint 89#define at sat 90#define match smat 91#define nope snope 92#endif 93#ifdef LNAMES 94#define matcher lmatcher 95#define fast lfast 96#define slow lslow 97#define dissect ldissect 98#define backref lbackref 99#define step lstep 100#define print lprint 101#define at lat 102#define match lmat 103#define nope lnope 104#endif 105 106/* another structure passed up and down to avoid zillions of parameters */ 107struct match { 108 struct re_guts *g; 109 int eflags; 110 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ 111 const char *offp; /* offsets work from here */ 112 const char *beginp; /* start of string -- virtual NUL precedes */ 113 const char *endp; /* end of string -- virtual NUL here */ 114 const char *coldp; /* can be no match starting before here */ 115 const char **lastpos; /* [nplus+1] */ 116 STATEVARS; 117 states st; /* current states */ 118 states fresh; /* states for a fresh start */ 119 states tmp; /* temporary */ 120 states empty; /* empty set of states */ 121}; 122 123/* ========= begin header generated by ./mkh ========= */ 124#ifdef __cplusplus 125extern "C" { 126#endif 127 128/* === engine.c === */ 129static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags); 130static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); 131static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev); 132static const char *fast(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); 133static const char *slow(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst); 134static states step(struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft); 135#define BOL (OUT+1) 136#define EOL (BOL+1) 137#define BOLEOL (BOL+2) 138#define NOTHING (BOL+3) 139#define BOW (BOL+4) 140#define EOW (BOL+5) 141#define CODEMAX (BOL+5) /* highest code used */ 142#define NONCHAR(c) ((c) > CHAR_MAX) 143#define NNONCHAR (CODEMAX-CHAR_MAX) 144#ifdef REDEBUG 145static void print(struct match *m, char *caption, states st, int ch, FILE *d); 146#endif 147#ifdef REDEBUG 148static void at(struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst); 149#endif 150#ifdef REDEBUG 151static char *pchar(int ch); 152#endif 153 154#ifdef __cplusplus 155} 156#endif 157/* ========= end header generated by ./mkh ========= */ 158 159#ifdef REDEBUG 160#define SP(t, s, c) print(m, t, s, c, stdout) 161#define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) 162#define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } 163static int nope = 0; 164#else 165#define SP(t, s, c) /* nothing */ 166#define AT(t, p1, p2, s1, s2) /* nothing */ 167#define NOTE(s) /* nothing */ 168#endif 169 170/* 171 - matcher - the actual matching engine 172 == static int matcher(struct re_guts *g, char *string, \ 173 == size_t nmatch, regmatch_t pmatch[], int eflags); 174 */ 175static int /* 0 success, REG_NOMATCH failure */ 176matcher( 177 struct re_guts *g, 178 const char *string, 179 size_t nmatch, 180 regmatch_t pmatch[], 181 int eflags) 182{ 183 const char *endp; 184 size_t i; 185 struct match mv; 186 struct match *m = &mv; 187 const char *dp; 188 const sopno gf = g->firststate+1; /* +1 for OEND */ 189 const sopno gl = g->laststate; 190 const char *start; 191 const char *stop; 192 int error = 0; 193 194 _DIAGASSERT(g != NULL); 195 _DIAGASSERT(string != NULL); 196 /* pmatch checked below */ 197 198 /* simplify the situation where possible */ 199 if (g->cflags®_NOSUB) 200 nmatch = 0; 201 if (eflags®_STARTEND) { 202 _DIAGASSERT(pmatch != NULL); 203 start = string + (size_t)pmatch[0].rm_so; 204 stop = string + (size_t)pmatch[0].rm_eo; 205 } else { 206 start = string; 207 stop = start + strlen(start); 208 } 209 if (stop < start) 210 return(REG_INVARG); 211 212 /* prescreening; this does wonders for this rather slow code */ 213 if (g->must != NULL) { 214 for (dp = start; dp < stop; dp++) 215 if (*dp == g->must[0] && (size_t)(stop - dp) >= g->mlen && 216 memcmp(dp, g->must, g->mlen) == 0) 217 break; 218 if (dp == stop) /* we didn't find g->must */ 219 return(REG_NOMATCH); 220 } 221 222 /* match struct setup */ 223 m->g = g; 224 m->eflags = eflags; 225 m->pmatch = NULL; 226 m->lastpos = NULL; 227 m->offp = string; 228 m->beginp = start; 229 m->endp = stop; 230 STATESETUP(m, 4); 231 SETUP(m->st); 232 SETUP(m->fresh); 233 SETUP(m->tmp); 234 SETUP(m->empty); 235 CLEAR(m->empty); 236 237 /* this loop does only one repetition except for backrefs */ 238 for (;;) { 239 endp = fast(m, start, stop, gf, gl); 240 if (endp == NULL) { /* a miss */ 241 error = REG_NOMATCH; 242 goto done; 243 } 244 if (nmatch == 0 && !g->backrefs) 245 break; /* no further info needed */ 246 247 /* where? */ 248 assert(m->coldp != NULL); 249 for (;;) { 250 NOTE("finding start"); 251 endp = slow(m, m->coldp, stop, gf, gl); 252 if (endp != NULL) 253 break; 254 assert(m->coldp < m->endp); 255 m->coldp++; 256 } 257 if (nmatch == 1 && !g->backrefs) 258 break; /* no further info needed */ 259 260 /* oh my, he wants the subexpressions... */ 261 if (m->pmatch == NULL) 262 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * 263 sizeof(regmatch_t)); 264 if (m->pmatch == NULL) { 265 error = REG_ESPACE; 266 goto done; 267 } 268 for (i = 1; i <= m->g->nsub; i++) 269 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = (regoff_t)-1; 270 if (!g->backrefs && !(m->eflags®_BACKR)) { 271 NOTE("dissecting"); 272 dp = dissect(m, m->coldp, endp, gf, gl); 273 } else { 274 if (g->nplus > 0 && m->lastpos == NULL) 275 m->lastpos = malloc((g->nplus+1) * 276 sizeof(const char *)); 277 if (g->nplus > 0 && m->lastpos == NULL) { 278 error = REG_ESPACE; 279 goto done; 280 } 281 NOTE("backref dissect"); 282 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 283 } 284 if (dp != NULL) 285 break; 286 287 /* uh-oh... we couldn't find a subexpression-level match */ 288 assert(g->backrefs); /* must be back references doing it */ 289 assert(g->nplus == 0 || m->lastpos != NULL); 290 for (;;) { 291 if (dp != NULL || endp <= m->coldp) 292 break; /* defeat */ 293 NOTE("backoff"); 294 endp = slow(m, m->coldp, endp-1, gf, gl); 295 if (endp == NULL) 296 break; /* defeat */ 297 /* try it on a shorter possibility */ 298#ifndef NDEBUG 299 for (i = 1; i <= m->g->nsub; i++) { 300 assert(m->pmatch[i].rm_so == (regoff_t)-1); 301 assert(m->pmatch[i].rm_eo == (regoff_t)-1); 302 } 303#endif 304 NOTE("backoff dissect"); 305 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); 306 } 307 assert(dp == NULL || dp == endp); 308 if (dp != NULL) /* found a shorter one */ 309 break; 310 311 /* despite initial appearances, there is no match here */ 312 NOTE("false alarm"); 313 start = m->coldp + 1; /* recycle starting later */ 314 assert(start <= stop); 315 } 316 317 /* fill in the details if requested */ 318 if (nmatch > 0) { 319 _DIAGASSERT(pmatch != NULL); 320 pmatch[0].rm_so = m->coldp - m->offp; 321 pmatch[0].rm_eo = endp - m->offp; 322 } 323 if (nmatch > 1) { 324 assert(m->pmatch != NULL); 325 for (i = 1; i < nmatch; i++) 326 if (i <= m->g->nsub) 327 pmatch[i] = m->pmatch[i]; 328 else { 329 pmatch[i].rm_so = (regoff_t)-1; 330 pmatch[i].rm_eo = (regoff_t)-1; 331 } 332 } 333 334done: 335 if (m->pmatch != NULL) { 336 free(m->pmatch); 337 m->pmatch = NULL; 338 } 339 if (m->lastpos != NULL) { 340 free(m->lastpos); 341 m->lastpos = NULL; 342 } 343 STATETEARDOWN(m); 344 return error; 345} 346 347/* 348 - dissect - figure out what matched what, no back references 349 == static const char *dissect(struct match *m, const char *start, \ 350 == const char *stop, sopno startst, sopno stopst); 351 */ 352static const char * /* == stop (success) always */ 353dissect( 354 struct match *m, 355 const char *start, 356 const char *stop, 357 sopno startst, 358 sopno stopst) 359{ 360 int i; 361 sopno ss; /* start sop of current subRE */ 362 sopno es; /* end sop of current subRE */ 363 const char *sp; /* start of string matched by it */ 364 const char *stp; /* string matched by it cannot pass here */ 365 const char *rest; /* start of rest of string */ 366 const char *tail; /* string unmatched by rest of RE */ 367 sopno ssub; /* start sop of subsubRE */ 368 sopno esub; /* end sop of subsubRE */ 369 const char *ssp; /* start of string matched by subsubRE */ 370 const char *sep; /* end of string matched by subsubRE */ 371 const char *oldssp; /* previous ssp */ 372#ifndef NDEBUG 373 const char *dp; 374#endif 375 376 _DIAGASSERT(m != NULL); 377 _DIAGASSERT(start != NULL); 378 _DIAGASSERT(stop != NULL); 379 380 AT("diss", start, stop, startst, stopst); 381 sp = start; 382 for (ss = startst; ss < stopst; ss = es) { 383 /* identify end of subRE */ 384 es = ss; 385 switch (OP(m->g->strip[es])) { 386 case OPLUS_: 387 case OQUEST_: 388 es += OPND(m->g->strip[es]); 389 break; 390 case OCH_: 391 while (OP(m->g->strip[es]) != O_CH) 392 es += OPND(m->g->strip[es]); 393 break; 394 } 395 es++; 396 397 /* figure out what it matched */ 398 switch (OP(m->g->strip[ss])) { 399 case OEND: 400 assert(nope); 401 break; 402 case OCHAR: 403 sp++; 404 break; 405 case OBOL: 406 case OEOL: 407 case OBOW: 408 case OEOW: 409 break; 410 case OANY: 411 case OANYOF: 412 sp++; 413 break; 414 case OBACK_: 415 case O_BACK: 416 assert(nope); 417 break; 418 /* cases where length of match is hard to find */ 419 case OQUEST_: 420 stp = stop; 421 for (;;) { 422 /* how long could this one be? */ 423 rest = slow(m, sp, stp, ss, es); 424 assert(rest != NULL); /* it did match */ 425 /* could the rest match the rest? */ 426 tail = slow(m, rest, stop, es, stopst); 427 if (tail == stop) 428 break; /* yes! */ 429 /* no -- try a shorter match for this one */ 430 stp = rest - 1; 431 assert(stp >= sp); /* it did work */ 432 } 433 ssub = ss + 1; 434 esub = es - 1; 435 /* did innards match? */ 436 if (slow(m, sp, rest, ssub, esub) != NULL) { 437#ifdef NDEBUG 438 (void) 439#else 440 dp = 441#endif 442 dissect(m, sp, rest, ssub, esub); 443 assert(dp == rest); 444 } else /* no */ 445 assert(sp == rest); 446 sp = rest; 447 break; 448 case OPLUS_: 449 stp = stop; 450 for (;;) { 451 /* how long could this one be? */ 452 rest = slow(m, sp, stp, ss, es); 453 assert(rest != NULL); /* it did match */ 454 /* could the rest match the rest? */ 455 tail = slow(m, rest, stop, es, stopst); 456 if (tail == stop) 457 break; /* yes! */ 458 /* no -- try a shorter match for this one */ 459 stp = rest - 1; 460 assert(stp >= sp); /* it did work */ 461 } 462 ssub = ss + 1; 463 esub = es - 1; 464 ssp = sp; 465 oldssp = ssp; 466 for (;;) { /* find last match of innards */ 467 sep = slow(m, ssp, rest, ssub, esub); 468 if (sep == NULL || sep == ssp) 469 break; /* failed or matched null */ 470 oldssp = ssp; /* on to next try */ 471 ssp = sep; 472 } 473 if (sep == NULL) { 474 /* last successful match */ 475 sep = ssp; 476 ssp = oldssp; 477 } 478 assert(sep == rest); /* must exhaust substring */ 479 assert(slow(m, ssp, sep, ssub, esub) == rest); 480#ifdef NDEBUG 481 (void) 482#else 483 dp = 484#endif 485 dissect(m, ssp, sep, ssub, esub); 486 assert(dp == sep); 487 sp = rest; 488 break; 489 case OCH_: 490 stp = stop; 491 for (;;) { 492 /* how long could this one be? */ 493 rest = slow(m, sp, stp, ss, es); 494 assert(rest != NULL); /* it did match */ 495 /* could the rest match the rest? */ 496 tail = slow(m, rest, stop, es, stopst); 497 if (tail == stop) 498 break; /* yes! */ 499 /* no -- try a shorter match for this one */ 500 stp = rest - 1; 501 assert(stp >= sp); /* it did work */ 502 } 503 ssub = ss + 1; 504 esub = ss + OPND(m->g->strip[ss]) - 1; 505 assert(OP(m->g->strip[esub]) == OOR1); 506 for (;;) { /* find first matching branch */ 507 if (slow(m, sp, rest, ssub, esub) == rest) 508 break; /* it matched all of it */ 509 /* that one missed, try next one */ 510 assert(OP(m->g->strip[esub]) == OOR1); 511 esub++; 512 assert(OP(m->g->strip[esub]) == OOR2); 513 ssub = esub + 1; 514 esub += OPND(m->g->strip[esub]); 515 if (OP(m->g->strip[esub]) == OOR2) 516 esub--; 517 else 518 assert(OP(m->g->strip[esub]) == O_CH); 519 } 520#ifdef NDEBUG 521 (void) 522#else 523 dp = 524#endif 525 dissect(m, sp, rest, ssub, esub); 526 assert(dp == rest); 527 sp = rest; 528 break; 529 case O_PLUS: 530 case O_QUEST: 531 case OOR1: 532 case OOR2: 533 case O_CH: 534 assert(nope); 535 break; 536 case OLPAREN: 537 i = OPND(m->g->strip[ss]); 538 assert(0 < i && i <= m->g->nsub); 539 m->pmatch[i].rm_so = sp - m->offp; 540 break; 541 case ORPAREN: 542 i = OPND(m->g->strip[ss]); 543 assert(0 < i && i <= m->g->nsub); 544 m->pmatch[i].rm_eo = sp - m->offp; 545 break; 546 default: /* uh oh */ 547 assert(nope); 548 break; 549 } 550 } 551 552 assert(sp == stop); 553 return(sp); 554} 555 556/* 557 - backref - figure out what matched what, figuring in back references 558 == static const char *backref(struct match *m, const char *start, \ 559 == const char *stop, sopno startst, sopno stopst, sopno lev); 560 */ 561static const char * /* == stop (success) or NULL (failure) */ 562backref( 563 struct match *m, 564 const char *start, 565 const char *stop, 566 sopno startst, 567 sopno stopst, 568 sopno lev) /* PLUS nesting level */ 569{ 570 int i; 571 sopno ss; /* start sop of current subRE */ 572 const char *sp; /* start of string matched by it */ 573 sopno ssub; /* start sop of subsubRE */ 574 sopno esub; /* end sop of subsubRE */ 575 const char *ssp; /* start of string matched by subsubRE */ 576 const char *dp; 577 size_t len; 578 int hard; 579 sop s; 580 regoff_t offsave; 581 cset *cs; 582 583 _DIAGASSERT(m != NULL); 584 _DIAGASSERT(start != NULL); 585 _DIAGASSERT(stop != NULL); 586 587 AT("back", start, stop, startst, stopst); 588 sp = start; 589 590 /* get as far as we can with easy stuff */ 591 hard = 0; 592 for (ss = startst; !hard && ss < stopst; ss++) 593 switch (OP(s = m->g->strip[ss])) { 594 case OCHAR: 595 if (sp == stop || *sp++ != (char)OPND(s)) 596 return(NULL); 597 break; 598 case OANY: 599 if (sp == stop) 600 return(NULL); 601 sp++; 602 break; 603 case OANYOF: 604 cs = &m->g->sets[OPND(s)]; 605 if (sp == stop || !CHIN(cs, *sp++)) 606 return(NULL); 607 break; 608 case OBOL: 609 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 610 (sp < m->endp && *(sp-1) == '\n' && 611 (m->g->cflags®_NEWLINE)) ) 612 { /* yes */ } 613 else 614 return(NULL); 615 break; 616 case OEOL: 617 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || 618 (sp < m->endp && *sp == '\n' && 619 (m->g->cflags®_NEWLINE)) ) 620 { /* yes */ } 621 else 622 return(NULL); 623 break; 624 case OBOW: 625 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || 626 (sp < m->endp && *(sp-1) == '\n' && 627 (m->g->cflags®_NEWLINE)) || 628 (sp > m->beginp && 629 !ISWORD(*(sp-1))) ) && 630 (sp < m->endp && ISWORD(*sp)) ) 631 { /* yes */ } 632 else 633 return(NULL); 634 break; 635 case OEOW: 636 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || 637 (sp < m->endp && *sp == '\n' && 638 (m->g->cflags®_NEWLINE)) || 639 (sp < m->endp && !ISWORD(*sp)) ) && 640 (sp > m->beginp && ISWORD(*(sp-1))) ) 641 { /* yes */ } 642 else 643 return(NULL); 644 break; 645 case O_QUEST: 646 break; 647 case OOR1: /* matches null but needs to skip */ 648 ss++; 649 s = m->g->strip[ss]; 650 do { 651 assert(OP(s) == OOR2); 652 ss += OPND(s); 653 } while (OP(s = m->g->strip[ss]) != O_CH); 654 /* note that the ss++ gets us past the O_CH */ 655 break; 656 default: /* have to make a choice */ 657 hard = 1; 658 break; 659 } 660 if (!hard) { /* that was it! */ 661 if (sp != stop) 662 return(NULL); 663 return(sp); 664 } 665 ss--; /* adjust for the for's final increment */ 666 667 /* the hard stuff */ 668 AT("hard", sp, stop, ss, stopst); 669 s = m->g->strip[ss]; 670 switch (OP(s)) { 671 case OBACK_: /* the vilest depths */ 672 i = OPND(s); 673 assert(0 < i && i <= m->g->nsub); 674 if (m->pmatch[i].rm_eo == (regoff_t)-1) 675 return(NULL); 676 assert(m->pmatch[i].rm_so != (regoff_t)-1); 677 len = (size_t)(m->pmatch[i].rm_eo - m->pmatch[i].rm_so); 678 if (len == 0) 679 return(NULL); 680 assert(stop - m->beginp >= len); 681 if (sp > stop - len) 682 return(NULL); /* not enough left to match */ 683 ssp = m->offp + (size_t)m->pmatch[i].rm_so; 684 if (memcmp(sp, ssp, len) != 0) 685 return(NULL); 686 while (m->g->strip[ss] != SOP(O_BACK, i)) 687 ss++; 688 return(backref(m, sp+len, stop, ss+1, stopst, lev)); 689 690 case OQUEST_: /* to null or not */ 691 dp = backref(m, sp, stop, ss+1, stopst, lev); 692 if (dp != NULL) 693 return(dp); /* not */ 694 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev)); 695 696 case OPLUS_: 697 assert(m->lastpos != NULL); 698 assert(lev+1 <= m->g->nplus); 699 m->lastpos[lev+1] = sp; 700 return(backref(m, sp, stop, ss+1, stopst, lev+1)); 701 702 case O_PLUS: 703 if (sp == m->lastpos[lev]) /* last pass matched null */ 704 return(backref(m, sp, stop, ss+1, stopst, lev-1)); 705 /* try another pass */ 706 m->lastpos[lev] = sp; 707 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev); 708 if (dp == NULL) 709 dp = backref(m, sp, stop, ss+1, stopst, lev-1); 710 return(dp); 711 712 case OCH_: /* find the right one, if any */ 713 ssub = ss + 1; 714 esub = ss + OPND(s) - 1; 715 assert(OP(m->g->strip[esub]) == OOR1); 716 for (;;) { /* find first matching branch */ 717 dp = backref(m, sp, stop, ssub, esub, lev); 718 if (dp != NULL) 719 return(dp); 720 /* that one missed, try next one */ 721 if (OP(m->g->strip[esub]) == O_CH) 722 return(NULL); /* there is none */ 723 esub++; 724 assert(OP(m->g->strip[esub]) == OOR2); 725 ssub = esub + 1; 726 esub += OPND(m->g->strip[esub]); 727 if (OP(m->g->strip[esub]) == OOR2) 728 esub--; 729 else 730 assert(OP(m->g->strip[esub]) == O_CH); 731 } 732 733 case OLPAREN: /* must undo assignment if rest fails */ 734 i = OPND(s); 735 assert(0 < i && i <= m->g->nsub); 736 offsave = m->pmatch[i].rm_so; 737 m->pmatch[i].rm_so = sp - m->offp; 738 dp = backref(m, sp, stop, ss+1, stopst, lev); 739 if (dp != NULL) 740 return(dp); 741 m->pmatch[i].rm_so = offsave; 742 return(NULL); 743 744 case ORPAREN: /* must undo assignment if rest fails */ 745 i = OPND(s); 746 assert(0 < i && i <= m->g->nsub); 747 offsave = m->pmatch[i].rm_eo; 748 m->pmatch[i].rm_eo = sp - m->offp; 749 dp = backref(m, sp, stop, ss+1, stopst, lev); 750 if (dp != NULL) 751 return(dp); 752 m->pmatch[i].rm_eo = offsave; 753 return(NULL); 754 755 default: /* uh oh */ 756 assert(nope); 757 break; 758 } 759 760 /* "can't happen" */ 761 assert(nope); 762 /* NOTREACHED */ 763 return NULL; 764} 765 766/* 767 - fast - step through the string at top speed 768 == static const char *fast(struct match *m, const char *start, \ 769 == const char *stop, sopno startst, sopno stopst); 770 */ 771static const char * /* where tentative match ended, or NULL */ 772fast( 773 struct match *m, 774 const char *start, 775 const char *stop, 776 sopno startst, 777 sopno stopst) 778{ 779 states st = m->st; 780 states fresh = m->fresh; 781 states tmp = m->tmp; 782 const char *p = start; 783 int c = (start == m->beginp) ? OUT : *(start-1); 784 int lastc; /* previous c */ 785 int flagch; 786 size_t i; 787 const char *coldp; /* last p after which no match was underway */ 788 789 _DIAGASSERT(m != NULL); 790 _DIAGASSERT(start != NULL); 791 _DIAGASSERT(stop != NULL); 792 793 CLEAR(st); 794 SET1(st, startst); 795 st = step(m->g, startst, stopst, st, NOTHING, st); 796 ASSIGN(fresh, st); 797 SP("start", st, *p); 798 coldp = NULL; 799 for (;;) { 800 /* next character */ 801 lastc = c; 802 c = (p == m->endp) ? OUT : *p; 803 if (EQ(st, fresh)) 804 coldp = p; 805 806 /* is there an EOL and/or BOL between lastc and c? */ 807 flagch = '\0'; 808 i = 0; 809 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 810 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 811 flagch = BOL; 812 i = m->g->nbol; 813 } 814 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 815 (c == OUT && !(m->eflags®_NOTEOL)) ) { 816 flagch = (flagch == BOL) ? BOLEOL : EOL; 817 i += m->g->neol; 818 } 819 if (i != 0) { 820 for (; i > 0; i--) 821 st = step(m->g, startst, stopst, st, flagch, st); 822 SP("boleol", st, c); 823 } 824 825 /* how about a word boundary? */ 826 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 827 (c != OUT && ISWORD(c)) ) { 828 flagch = BOW; 829 } 830 if ( (lastc != OUT && ISWORD(lastc)) && 831 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 832 flagch = EOW; 833 } 834 if (flagch == BOW || flagch == EOW) { 835 st = step(m->g, startst, stopst, st, flagch, st); 836 SP("boweow", st, c); 837 } 838 839 /* are we done? */ 840 if (ISSET(st, stopst) || p == stop) 841 break; /* NOTE BREAK OUT */ 842 843 /* no, we must deal with this character */ 844 ASSIGN(tmp, st); 845 ASSIGN(st, fresh); 846 assert(c != OUT); 847 st = step(m->g, startst, stopst, tmp, c, st); 848 SP("aft", st, c); 849 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 850 p++; 851 } 852 853 assert(coldp != NULL); 854 m->coldp = coldp; 855 if (ISSET(st, stopst)) 856 return(p+1); 857 else 858 return(NULL); 859} 860 861/* 862 - slow - step through the string more deliberately 863 == static const char *slow(struct match *m, const char *start, \ 864 == const char *stop, sopno startst, sopno stopst); 865 */ 866static const char * /* where it ended */ 867slow( 868 struct match *m, 869 const char *start, 870 const char *stop, 871 sopno startst, 872 sopno stopst) 873{ 874 states st = m->st; 875 states empty = m->empty; 876 states tmp = m->tmp; 877 const char *p = start; 878 int c = (start == m->beginp) ? OUT : *(start-1); 879 int lastc; /* previous c */ 880 int flagch; 881 size_t i; 882 const char *matchp; /* last p at which a match ended */ 883 884 _DIAGASSERT(m != NULL); 885 _DIAGASSERT(start != NULL); 886 _DIAGASSERT(stop != NULL); 887 888 AT("slow", start, stop, startst, stopst); 889 CLEAR(st); 890 SET1(st, startst); 891 SP("sstart", st, *p); 892 st = step(m->g, startst, stopst, st, NOTHING, st); 893 matchp = NULL; 894 for (;;) { 895 /* next character */ 896 lastc = c; 897 c = (p == m->endp) ? OUT : *p; 898 899 /* is there an EOL and/or BOL between lastc and c? */ 900 flagch = '\0'; 901 i = 0; 902 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || 903 (lastc == OUT && !(m->eflags®_NOTBOL)) ) { 904 flagch = BOL; 905 i = m->g->nbol; 906 } 907 if ( (c == '\n' && m->g->cflags®_NEWLINE) || 908 (c == OUT && !(m->eflags®_NOTEOL)) ) { 909 flagch = (flagch == BOL) ? BOLEOL : EOL; 910 i += m->g->neol; 911 } 912 if (i != 0) { 913 for (; i > 0; i--) 914 st = step(m->g, startst, stopst, st, flagch, st); 915 SP("sboleol", st, c); 916 } 917 918 /* how about a word boundary? */ 919 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && 920 (c != OUT && ISWORD(c)) ) { 921 flagch = BOW; 922 } 923 if ( (lastc != OUT && ISWORD(lastc)) && 924 (flagch == EOL || (c != OUT && !ISWORD(c))) ) { 925 flagch = EOW; 926 } 927 if (flagch == BOW || flagch == EOW) { 928 st = step(m->g, startst, stopst, st, flagch, st); 929 SP("sboweow", st, c); 930 } 931 932 /* are we done? */ 933 if (ISSET(st, stopst)) 934 matchp = p; 935 if (EQ(st, empty) || p == stop) 936 break; /* NOTE BREAK OUT */ 937 938 /* no, we must deal with this character */ 939 ASSIGN(tmp, st); 940 ASSIGN(st, empty); 941 assert(c != OUT); 942 st = step(m->g, startst, stopst, tmp, c, st); 943 SP("saft", st, c); 944 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); 945 p++; 946 } 947 948 return(matchp); 949} 950 951 952/* 953 - step - map set of states reachable before char to set reachable after 954 == static states step(struct re_guts *g, sopno start, sopno stop, \ 955 == states bef, int ch, states aft); 956 == #define BOL (OUT+1) 957 == #define EOL (BOL+1) 958 == #define BOLEOL (BOL+2) 959 == #define NOTHING (BOL+3) 960 == #define BOW (BOL+4) 961 == #define EOW (BOL+5) 962 == #define CODEMAX (BOL+5) // highest code used 963 == #define NONCHAR(c) ((c) > CHAR_MAX) 964 == #define NNONCHAR (CODEMAX-CHAR_MAX) 965 */ 966static states 967step( 968 struct re_guts *g, 969 sopno start, /* start state within strip */ 970 sopno stop, /* state after stop state within strip */ 971 states bef, /* states reachable before */ 972 int ch, /* character or NONCHAR code */ 973 states aft) /* states already known reachable after */ 974{ 975 cset *cs; 976 sop s; 977 sopno pc; 978 onestate here; /* note, macros know this name */ 979 sopno look; 980 int i; 981 982 _DIAGASSERT(g != NULL); 983 984 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { 985 s = g->strip[pc]; 986 switch (OP(s)) { 987 case OEND: 988 assert(pc == stop-1); 989 break; 990 case OCHAR: 991 /* only characters can match */ 992 assert(!NONCHAR(ch) || ch != (char)OPND(s)); 993 if (ch == (char)OPND(s)) 994 FWD(aft, bef, 1); 995 break; 996 case OBOL: 997 if (ch == BOL || ch == BOLEOL) 998 FWD(aft, bef, 1); 999 break; 1000 case OEOL: 1001 if (ch == EOL || ch == BOLEOL) 1002 FWD(aft, bef, 1); 1003 break; 1004 case OBOW: 1005 if (ch == BOW) 1006 FWD(aft, bef, 1); 1007 break; 1008 case OEOW: 1009 if (ch == EOW) 1010 FWD(aft, bef, 1); 1011 break; 1012 case OANY: 1013 if (!NONCHAR(ch)) 1014 FWD(aft, bef, 1); 1015 break; 1016 case OANYOF: 1017 cs = &g->sets[OPND(s)]; 1018 if (!NONCHAR(ch) && CHIN(cs, ch)) 1019 FWD(aft, bef, 1); 1020 break; 1021 case OBACK_: /* ignored here */ 1022 case O_BACK: 1023 FWD(aft, aft, 1); 1024 break; 1025 case OPLUS_: /* forward, this is just an empty */ 1026 FWD(aft, aft, 1); 1027 break; 1028 case O_PLUS: /* both forward and back */ 1029 FWD(aft, aft, 1); 1030 i = ISSETBACK(aft, OPND(s)); 1031 BACK(aft, aft, OPND(s)); 1032 if (!i && ISSETBACK(aft, OPND(s))) { 1033 /* oho, must reconsider loop body */ 1034 pc -= OPND(s) + 1; 1035 INIT(here, pc); 1036 } 1037 break; 1038 case OQUEST_: /* two branches, both forward */ 1039 FWD(aft, aft, 1); 1040 FWD(aft, aft, OPND(s)); 1041 break; 1042 case O_QUEST: /* just an empty */ 1043 FWD(aft, aft, 1); 1044 break; 1045 case OLPAREN: /* not significant here */ 1046 case ORPAREN: 1047 FWD(aft, aft, 1); 1048 break; 1049 case OCH_: /* mark the first two branches */ 1050 FWD(aft, aft, 1); 1051 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1052 FWD(aft, aft, OPND(s)); 1053 break; 1054 case OOR1: /* done a branch, find the O_CH */ 1055 if (ISSTATEIN(aft, here)) { 1056 for (look = 1; 1057 OP(s = g->strip[pc+look]) != O_CH; 1058 look += OPND(s)) 1059 assert(OP(s) == OOR2); 1060 FWD(aft, aft, look); 1061 } 1062 break; 1063 case OOR2: /* propagate OCH_'s marking */ 1064 FWD(aft, aft, 1); 1065 if (OP(g->strip[pc+OPND(s)]) != O_CH) { 1066 assert(OP(g->strip[pc+OPND(s)]) == OOR2); 1067 FWD(aft, aft, OPND(s)); 1068 } 1069 break; 1070 case O_CH: /* just empty */ 1071 FWD(aft, aft, 1); 1072 break; 1073 default: /* ooooops... */ 1074 assert(nope); 1075 break; 1076 } 1077 } 1078 1079 return(aft); 1080} 1081 1082#ifdef REDEBUG 1083/* 1084 - print - print a set of states 1085 == #ifdef REDEBUG 1086 == static void print(struct match *m, char *caption, states st, \ 1087 == int ch, FILE *d); 1088 == #endif 1089 */ 1090static void 1091print( 1092 struct match *m, 1093 char *caption, 1094 states st, 1095 int ch, 1096 FILE *d) 1097{ 1098 struct re_guts *g = m->g; 1099 int i; 1100 int first = 1; 1101 1102 _DIAGASSERT(m != NULL); 1103 _DIAGASSERT(caption != NULL); 1104 1105 if (!(m->eflags®_TRACE)) 1106 return; 1107 1108 _DIAGASSERT(d != NULL); 1109 1110 fprintf(d, "%s", caption); 1111 if (ch != '\0') 1112 fprintf(d, " %s", pchar(ch)); 1113 for (i = 0; i < g->nstates; i++) 1114 if (ISSET(st, i)) { 1115 fprintf(d, "%s%d", (first) ? "\t" : ", ", i); 1116 first = 0; 1117 } 1118 fprintf(d, "\n"); 1119} 1120 1121/* 1122 - at - print current situation 1123 == #ifdef REDEBUG 1124 == static void at(struct match *m, char *title, char *start, char *stop, \ 1125 == sopno startst, sopno stopst); 1126 == #endif 1127 */ 1128static void 1129at( 1130 struct match *m, 1131 char *title, 1132 char *start, 1133 char *stop, 1134 sopno startst, 1135 sopno stopst) 1136{ 1137 1138 _DIAGASSERT(m != NULL); 1139 _DIAGASSERT(title != NULL); 1140 _DIAGASSERT(start != NULL); 1141 _DIAGASSERT(stop != NULL); 1142 1143 if (!(m->eflags®_TRACE)) 1144 return; 1145 1146 printf("%s %s-", title, pchar(*start)); 1147 printf("%s ", pchar(*stop)); 1148 printf("%ld-%ld\n", (long)startst, (long)stopst); 1149} 1150 1151#ifndef PCHARDONE 1152#define PCHARDONE /* never again */ 1153/* 1154 - pchar - make a character printable 1155 == #ifdef REDEBUG 1156 == static char *pchar(int ch); 1157 == #endif 1158 * 1159 * Is this identical to regchar() over in debug.c? Well, yes. But a 1160 * duplicate here avoids having a debugging-capable regexec.o tied to 1161 * a matching debug.o, and this is convenient. It all disappears in 1162 * the non-debug compilation anyway, so it doesn't matter much. 1163 */ 1164static char * /* -> representation */ 1165pchar( 1166 int ch) 1167{ 1168 static char pbuf[10]; 1169 1170 if (isprint(ch) || ch == ' ') 1171 (void)snprintf(pbuf, sizeof pbuf, "%c", ch); 1172 else 1173 (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch); 1174 return(pbuf); 1175} 1176#endif 1177#endif 1178 1179#undef matcher 1180#undef fast 1181#undef slow 1182#undef dissect 1183#undef backref 1184#undef step 1185#undef print 1186#undef at 1187#undef match 1188#undef nope 1189