1/* $NetBSD: regcomp.c,v 1.33 2012/03/13 21:13:43 christos Exp $ */ 2 3/*- 4 * Copyright (c) 1992, 1993, 1994 5 * The Regents of the University of California. All rights reserved. 6 * 7 * This code is derived from software contributed to Berkeley by 8 * Henry Spencer. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted provided that the following conditions 12 * are met: 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 3. Neither the name of the University nor the names of its contributors 19 * may be used to endorse or promote products derived from this software 20 * without specific prior written permission. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 35 */ 36 37/*- 38 * Copyright (c) 1992, 1993, 1994 Henry Spencer. 39 * 40 * This code is derived from software contributed to Berkeley by 41 * Henry Spencer. 42 * 43 * Redistribution and use in source and binary forms, with or without 44 * modification, are permitted provided that the following conditions 45 * are met: 46 * 1. Redistributions of source code must retain the above copyright 47 * notice, this list of conditions and the following disclaimer. 48 * 2. Redistributions in binary form must reproduce the above copyright 49 * notice, this list of conditions and the following disclaimer in the 50 * documentation and/or other materials provided with the distribution. 51 * 3. All advertising materials mentioning features or use of this software 52 * must display the following acknowledgement: 53 * This product includes software developed by the University of 54 * California, Berkeley and its contributors. 55 * 4. Neither the name of the University nor the names of its contributors 56 * may be used to endorse or promote products derived from this software 57 * without specific prior written permission. 58 * 59 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 60 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 62 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 69 * SUCH DAMAGE. 70 * 71 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 72 */ 73 74#include <sys/cdefs.h> 75#if defined(LIBC_SCCS) && !defined(lint) 76#if 0 77static char sccsid[] = "@(#)regcomp.c 8.5 (Berkeley) 3/20/94"; 78#else 79__RCSID("$NetBSD: regcomp.c,v 1.33 2012/03/13 21:13:43 christos Exp $"); 80#endif 81#endif /* LIBC_SCCS and not lint */ 82 83#include "namespace.h" 84#include <sys/types.h> 85 86#include <assert.h> 87#include <ctype.h> 88#include <limits.h> 89#include <stdio.h> 90#include <stdlib.h> 91#include <string.h> 92#include <regex.h> 93 94#ifdef __weak_alias 95__weak_alias(regcomp,_regcomp) 96#endif 97 98#include "utils.h" 99#include "regex2.h" 100 101#include "cclass.h" 102#include "cname.h" 103 104/* 105 * parse structure, passed up and down to avoid global variables and 106 * other clumsinesses 107 */ 108struct parse { 109 const char *next; /* next character in RE */ 110 const char *end; /* end of string (-> NUL normally) */ 111 int error; /* has an error been seen? */ 112 sop *strip; /* malloced strip */ 113 sopno ssize; /* malloced strip size (allocated) */ 114 sopno slen; /* malloced strip length (used) */ 115 size_t ncsalloc; /* number of csets allocated */ 116 struct re_guts *g; 117# define NPAREN 10 /* we need to remember () 1-9 for back refs */ 118 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ 119 sopno pend[NPAREN]; /* -> ) ([0] unused) */ 120}; 121 122/* ========= begin header generated by ./mkh ========= */ 123#ifdef __cplusplus 124extern "C" { 125#endif 126 127/* === regcomp.c === */ 128static void p_ere(struct parse *p, int stop, size_t reclimit); 129static void p_ere_exp(struct parse *p, size_t reclimit); 130static void p_str(struct parse *p); 131static void p_bre(struct parse *p, int end1, int end2, size_t reclimit); 132static int p_simp_re(struct parse *p, int starordinary, size_t reclimit); 133static int p_count(struct parse *p); 134static void p_bracket(struct parse *p); 135static void p_b_term(struct parse *p, cset *cs); 136static void p_b_cclass(struct parse *p, cset *cs); 137static void p_b_eclass(struct parse *p, cset *cs); 138static char p_b_symbol(struct parse *p); 139static char p_b_coll_elem(struct parse *p, int endc); 140static int othercase(int ch); 141static void bothcases(struct parse *p, int ch); 142static void ordinary(struct parse *p, int ch); 143static void nonnewline(struct parse *p); 144static void repeat(struct parse *p, sopno start, int from, int to, size_t reclimit); 145static int seterr(struct parse *p, int e); 146static cset *allocset(struct parse *p); 147static void freeset(struct parse *p, cset *cs); 148static sopno freezeset(struct parse *p, cset *cs); 149static int firstch(struct parse *p, cset *cs); 150static int nch(struct parse *p, cset *cs); 151static void mcadd(struct parse *p, cset *cs, const char *cp); 152#if 0 153static void mcsub(cset *cs, char *cp); 154static int mcin(cset *cs, char *cp); 155static char *mcfind(cset *cs, char *cp); 156#endif 157static void mcinvert(struct parse *p, cset *cs); 158static void mccase(struct parse *p, cset *cs); 159static int isinsets(struct re_guts *g, int c); 160static int samesets(struct re_guts *g, int c1, int c2); 161static void categorize(struct parse *p, struct re_guts *g); 162static sopno dupl(struct parse *p, sopno start, sopno finish); 163static void doemit(struct parse *p, sop op, sopno opnd); 164static void doinsert(struct parse *p, sop op, sopno opnd, sopno pos); 165static void dofwd(struct parse *p, sopno pos, sopno value); 166static int enlarge(struct parse *p, sopno size); 167static void stripsnug(struct parse *p, struct re_guts *g); 168static void findmust(struct parse *p, struct re_guts *g); 169static sopno pluscount(struct parse *p, struct re_guts *g); 170 171#ifdef __cplusplus 172} 173#endif 174/* ========= end header generated by ./mkh ========= */ 175 176static char nuls[10]; /* place to point scanner in event of error */ 177 178/* 179 * macros for use with parse structure 180 * BEWARE: these know that the parse structure is named `p' !!! 181 */ 182#define PEEK() (*p->next) 183#define PEEK2() (*(p->next+1)) 184#define MORE() (p->next < p->end) 185#define MORE2() (p->next+1 < p->end) 186#define SEE(c) (MORE() && PEEK() == (c)) 187#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b)) 188#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) 189#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) 190#define NEXT() (p->next++) 191#define NEXT2() (p->next += 2) 192#define NEXTn(n) (p->next += (n)) 193#define GETNEXT() (*p->next++) 194#define SETERROR(e) seterr(p, (e)) 195#define REQUIRE(co, e) (void) ((co) || SETERROR(e)) 196#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) 197#define MUSTEAT(c, e) (void) (REQUIRE(MORE() && GETNEXT() == (c), e)) 198#define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) 199#define EMIT(op, sopnd) doemit(p, (sop)(op), sopnd) 200#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) 201#define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) 202#define ASTERN(sop, pos) EMIT(sop, HERE()-pos) 203#define HERE() (p->slen) 204#define THERE() (p->slen - 1) 205#define THERETHERE() (p->slen - 2) 206#define DROP(n) (p->slen -= (n)) 207 208#ifndef NDEBUG 209static int never = 0; /* for use in asserts; shuts lint up */ 210#else 211#define never 0 /* some <assert.h>s have bugs too */ 212#endif 213 214#define MEMLIMIT 0x8000000 215#define MEMSIZE(p) \ 216 ((p)->ncsalloc / CHAR_BIT * (p)->g->csetsize + \ 217 (p)->ncsalloc * sizeof(cset) + \ 218 (p)->ssize * sizeof(sop)) 219#define RECLIMIT 256 220 221/* 222 - regcomp - interface for parser and compilation 223 = extern int regcomp(regex_t *, const char *, int); 224 = #define REG_BASIC 0000 225 = #define REG_EXTENDED 0001 226 = #define REG_ICASE 0002 227 = #define REG_NOSUB 0004 228 = #define REG_NEWLINE 0010 229 = #define REG_NOSPEC 0020 230 = #define REG_PEND 0040 231 = #define REG_DUMP 0200 232 */ 233int /* 0 success, otherwise REG_something */ 234regcomp( 235 regex_t *preg, 236 const char *pattern, 237 int cflags) 238{ 239 struct parse pa; 240 struct re_guts *g; 241 struct parse *p = &pa; 242 int i; 243 size_t len; 244#ifdef REDEBUG 245# define GOODFLAGS(f) (f) 246#else 247# define GOODFLAGS(f) ((f)&~REG_DUMP) 248#endif 249 250 _DIAGASSERT(preg != NULL); 251 _DIAGASSERT(pattern != NULL); 252 253 cflags = GOODFLAGS(cflags); 254 if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) 255 return(REG_INVARG); 256 257 if (cflags®_PEND) { 258 if (preg->re_endp < pattern) 259 return(REG_INVARG); 260 len = preg->re_endp - pattern; 261 } else 262 len = strlen(pattern); 263 264 /* do the mallocs early so failure handling is easy */ 265 g = (struct re_guts *)malloc(sizeof(struct re_guts) + 266 (NC-1)*sizeof(cat_t)); 267 if (g == NULL) 268 return(REG_ESPACE); 269 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ 270 p->strip = malloc(p->ssize * sizeof(sop)); 271 p->slen = 0; 272 if (p->strip == NULL) { 273 free(g); 274 return(REG_ESPACE); 275 } 276 277 /* set things up */ 278 p->g = g; 279 p->next = pattern; 280 p->end = p->next + len; 281 p->error = 0; 282 p->ncsalloc = 0; 283 for (i = 0; i < NPAREN; i++) { 284 p->pbegin[i] = 0; 285 p->pend[i] = 0; 286 } 287 g->csetsize = NC; 288 g->sets = NULL; 289 g->setbits = NULL; 290 g->ncsets = 0; 291 g->cflags = cflags; 292 g->iflags = 0; 293 g->nbol = 0; 294 g->neol = 0; 295 g->must = NULL; 296 g->mlen = 0; 297 g->nsub = 0; 298 g->ncategories = 1; /* category 0 is "everything else" */ 299 g->categories = &g->catspace[-(CHAR_MIN)]; 300 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); 301 g->backrefs = 0; 302 303 /* do it */ 304 EMIT(OEND, 0); 305 g->firststate = THERE(); 306 if (cflags®_EXTENDED) 307 p_ere(p, OUT, 0); 308 else if (cflags®_NOSPEC) 309 p_str(p); 310 else 311 p_bre(p, OUT, OUT, 0); 312 EMIT(OEND, 0); 313 g->laststate = THERE(); 314 315 /* tidy up loose ends and fill things in */ 316 categorize(p, g); 317 stripsnug(p, g); 318 findmust(p, g); 319 g->nplus = pluscount(p, g); 320 g->magic = MAGIC2; 321 preg->re_nsub = g->nsub; 322 preg->re_g = g; 323 preg->re_magic = MAGIC1; 324#ifndef REDEBUG 325 /* not debugging, so can't rely on the assert() in regexec() */ 326 if (g->iflags&BAD) 327 SETERROR(REG_ASSERT); 328#endif 329 330 /* win or lose, we're done */ 331 if (p->error != 0) /* lose */ 332 regfree(preg); 333 return(p->error); 334} 335 336/* 337 - p_ere - ERE parser top level, concatenation and alternation 338 == static void p_ere(struct parse *p, int stop, size_t reclimit); 339 */ 340static void 341p_ere( 342 struct parse *p, 343 int stop, /* character this ERE should end at */ 344 size_t reclimit) 345{ 346 char c; 347 sopno prevback = 0; /* pacify gcc */ 348 sopno prevfwd = 0; /* pacify gcc */ 349 sopno conc; 350 int first = 1; /* is this the first alternative? */ 351 352 _DIAGASSERT(p != NULL); 353 354 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) { 355 p->error = REG_ESPACE; 356 return; 357 } 358 359 for (;;) { 360 /* do a bunch of concatenated expressions */ 361 conc = HERE(); 362 while (MORE() && (c = PEEK()) != '|' && c != stop) 363 p_ere_exp(p, reclimit); 364 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ 365 366 if (!EAT('|')) 367 break; /* NOTE BREAK OUT */ 368 369 if (first) { 370 INSERT(OCH_, conc); /* offset is wrong */ 371 prevfwd = conc; 372 prevback = conc; 373 first = 0; 374 } 375 ASTERN(OOR1, prevback); 376 prevback = THERE(); 377 AHEAD(prevfwd); /* fix previous offset */ 378 prevfwd = HERE(); 379 EMIT(OOR2, 0); /* offset is very wrong */ 380 } 381 382 if (!first) { /* tail-end fixups */ 383 AHEAD(prevfwd); 384 ASTERN(O_CH, prevback); 385 } 386 387 assert(!MORE() || SEE(stop)); 388} 389 390/* 391 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op 392 == static void p_ere_exp(struct parse *p, size_t reclimit); 393 */ 394static void 395p_ere_exp( 396 struct parse *p, 397 size_t reclimit) 398{ 399 char c; 400 sopno pos; 401 int count; 402 int count2; 403 sopno subno; 404 int wascaret = 0; 405 406 _DIAGASSERT(p != NULL); 407 408 assert(MORE()); /* caller should have ensured this */ 409 c = GETNEXT(); 410 411 pos = HERE(); 412 switch (c) { 413 case '(': 414 REQUIRE(MORE(), REG_EPAREN); 415 p->g->nsub++; 416 subno = p->g->nsub; 417 if (subno < NPAREN) 418 p->pbegin[subno] = HERE(); 419 EMIT(OLPAREN, subno); 420 if (!SEE(')')) 421 p_ere(p, ')', reclimit); 422 if (subno < NPAREN) { 423 p->pend[subno] = HERE(); 424 assert(p->pend[subno] != 0); 425 } 426 EMIT(ORPAREN, subno); 427 MUSTEAT(')', REG_EPAREN); 428 break; 429#ifndef POSIX_MISTAKE 430 case ')': /* happens only if no current unmatched ( */ 431 /* 432 * You may ask, why the ifndef? Because I didn't notice 433 * this until slightly too late for 1003.2, and none of the 434 * other 1003.2 regular-expression reviewers noticed it at 435 * all. So an unmatched ) is legal POSIX, at least until 436 * we can get it fixed. 437 */ 438 SETERROR(REG_EPAREN); 439 break; 440#endif 441 case '^': 442 EMIT(OBOL, 0); 443 p->g->iflags |= USEBOL; 444 p->g->nbol++; 445 wascaret = 1; 446 break; 447 case '$': 448 EMIT(OEOL, 0); 449 p->g->iflags |= USEEOL; 450 p->g->neol++; 451 break; 452 case '|': 453 SETERROR(REG_EMPTY); 454 break; 455 case '*': 456 case '+': 457 case '?': 458 SETERROR(REG_BADRPT); 459 break; 460 case '.': 461 if (p->g->cflags®_NEWLINE) 462 nonnewline(p); 463 else 464 EMIT(OANY, 0); 465 break; 466 case '[': 467 p_bracket(p); 468 break; 469 case '\\': 470 REQUIRE(MORE(), REG_EESCAPE); 471 c = GETNEXT(); 472 ordinary(p, c); 473 break; 474 case '{': /* okay as ordinary except if digit follows */ 475 REQUIRE(!MORE() || !isdigit((unsigned char)PEEK()), REG_BADRPT); 476 /* FALLTHROUGH */ 477 default: 478 ordinary(p, c); 479 break; 480 } 481 482 if (!MORE()) 483 return; 484 c = PEEK(); 485 /* we call { a repetition if followed by a digit */ 486 if (!( c == '*' || c == '+' || c == '?' || 487 (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) )) 488 return; /* no repetition, we're done */ 489 NEXT(); 490 491 REQUIRE(!wascaret, REG_BADRPT); 492 switch (c) { 493 case '*': /* implemented as +? */ 494 /* this case does not require the (y|) trick, noKLUDGE */ 495 INSERT(OPLUS_, pos); 496 ASTERN(O_PLUS, pos); 497 INSERT(OQUEST_, pos); 498 ASTERN(O_QUEST, pos); 499 break; 500 case '+': 501 INSERT(OPLUS_, pos); 502 ASTERN(O_PLUS, pos); 503 break; 504 case '?': 505 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 506 INSERT(OCH_, pos); /* offset slightly wrong */ 507 ASTERN(OOR1, pos); /* this one's right */ 508 AHEAD(pos); /* fix the OCH_ */ 509 EMIT(OOR2, 0); /* offset very wrong... */ 510 AHEAD(THERE()); /* ...so fix it */ 511 ASTERN(O_CH, THERETHERE()); 512 break; 513 case '{': 514 count = p_count(p); 515 if (EAT(',')) { 516 if (isdigit((unsigned char)PEEK())) { 517 count2 = p_count(p); 518 REQUIRE(count <= count2, REG_BADBR); 519 } else /* single number with comma */ 520 count2 = INFINITY; 521 } else /* just a single number */ 522 count2 = count; 523 repeat(p, pos, count, count2, 0); 524 if (!EAT('}')) { /* error heuristics */ 525 while (MORE() && PEEK() != '}') 526 NEXT(); 527 REQUIRE(MORE(), REG_EBRACE); 528 SETERROR(REG_BADBR); 529 } 530 break; 531 } 532 533 if (!MORE()) 534 return; 535 c = PEEK(); 536 if (!( c == '*' || c == '+' || c == '?' || 537 (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) ) ) 538 return; 539 SETERROR(REG_BADRPT); 540} 541 542/* 543 - p_str - string (no metacharacters) "parser" 544 == static void p_str(struct parse *p); 545 */ 546static void 547p_str( 548 struct parse *p) 549{ 550 551 _DIAGASSERT(p != NULL); 552 553 REQUIRE(MORE(), REG_EMPTY); 554 while (MORE()) 555 ordinary(p, GETNEXT()); 556} 557 558/* 559 - p_bre - BRE parser top level, anchoring and concatenation 560 == static void p_bre(struct parse *p, int end1, \ 561 == int end2, size_t reclimit); 562 * Giving end1 as OUT essentially eliminates the end1/end2 check. 563 * 564 * This implementation is a bit of a kludge, in that a trailing $ is first 565 * taken as an ordinary character and then revised to be an anchor. The 566 * only undesirable side effect is that '$' gets included as a character 567 * category in such cases. This is fairly harmless; not worth fixing. 568 * The amount of lookahead needed to avoid this kludge is excessive. 569 */ 570static void 571p_bre( 572 struct parse *p, 573 int end1, /* first terminating character */ 574 int end2, /* second terminating character */ 575 size_t reclimit) 576{ 577 sopno start; 578 int first = 1; /* first subexpression? */ 579 int wasdollar = 0; 580 581 _DIAGASSERT(p != NULL); 582 583 if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) { 584 p->error = REG_ESPACE; 585 return; 586 } 587 588 start = HERE(); 589 590 if (EAT('^')) { 591 EMIT(OBOL, 0); 592 p->g->iflags |= USEBOL; 593 p->g->nbol++; 594 } 595 while (MORE() && !SEETWO(end1, end2)) { 596 wasdollar = p_simp_re(p, first, reclimit); 597 first = 0; 598 } 599 if (wasdollar) { /* oops, that was a trailing anchor */ 600 DROP(1); 601 EMIT(OEOL, 0); 602 p->g->iflags |= USEEOL; 603 p->g->neol++; 604 } 605 606 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ 607} 608 609/* 610 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition 611 == static int p_simp_re(struct parse *p, int starordinary, size_t reclimit); 612 */ 613static int /* was the simple RE an unbackslashed $? */ 614p_simp_re( 615 struct parse *p, 616 int starordinary, /* is a leading * an ordinary character? */ 617 size_t reclimit) 618{ 619 int c; 620 int count; 621 int count2; 622 sopno pos, i; 623 sopno subno; 624# define BACKSL (1<<CHAR_BIT) 625 626 _DIAGASSERT(p != NULL); 627 628 pos = HERE(); /* repetion op, if any, covers from here */ 629 630 assert(MORE()); /* caller should have ensured this */ 631 c = GETNEXT(); 632 if (c == '\\') { 633 REQUIRE(MORE(), REG_EESCAPE); 634 c = BACKSL | (unsigned char)GETNEXT(); 635 } 636 switch (c) { 637 case '.': 638 if (p->g->cflags®_NEWLINE) 639 nonnewline(p); 640 else 641 EMIT(OANY, 0); 642 break; 643 case '[': 644 p_bracket(p); 645 break; 646 case BACKSL|'{': 647 SETERROR(REG_BADRPT); 648 break; 649 case BACKSL|'(': 650 p->g->nsub++; 651 subno = p->g->nsub; 652 if (subno < NPAREN) 653 p->pbegin[subno] = HERE(); 654 EMIT(OLPAREN, subno); 655 /* the MORE here is an error heuristic */ 656 if (MORE() && !SEETWO('\\', ')')) 657 p_bre(p, '\\', ')', reclimit); 658 if (subno < NPAREN) { 659 p->pend[subno] = HERE(); 660 assert(p->pend[subno] != 0); 661 } 662 EMIT(ORPAREN, subno); 663 REQUIRE(EATTWO('\\', ')'), REG_EPAREN); 664 break; 665 case BACKSL|')': /* should not get here -- must be user */ 666 case BACKSL|'}': 667 SETERROR(REG_EPAREN); 668 break; 669 case BACKSL|'1': 670 case BACKSL|'2': 671 case BACKSL|'3': 672 case BACKSL|'4': 673 case BACKSL|'5': 674 case BACKSL|'6': 675 case BACKSL|'7': 676 case BACKSL|'8': 677 case BACKSL|'9': 678 i = (c&~BACKSL) - '0'; 679 assert(i < NPAREN); 680 if (p->pend[i] != 0) { 681 assert(i <= p->g->nsub); 682 EMIT(OBACK_, i); 683 assert(p->pbegin[i] != 0); 684 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); 685 assert(OP(p->strip[p->pend[i]]) == ORPAREN); 686 (void) dupl(p, p->pbegin[i]+1, p->pend[i]); 687 EMIT(O_BACK, i); 688 } else 689 SETERROR(REG_ESUBREG); 690 p->g->backrefs = 1; 691 break; 692 case '*': 693 REQUIRE(starordinary, REG_BADRPT); 694 /* FALLTHROUGH */ 695 default: 696 ordinary(p, c &~ BACKSL); 697 break; 698 } 699 700 if (EAT('*')) { /* implemented as +? */ 701 /* this case does not require the (y|) trick, noKLUDGE */ 702 INSERT(OPLUS_, pos); 703 ASTERN(O_PLUS, pos); 704 INSERT(OQUEST_, pos); 705 ASTERN(O_QUEST, pos); 706 } else if (EATTWO('\\', '{')) { 707 count = p_count(p); 708 if (EAT(',')) { 709 if (MORE() && isdigit((unsigned char)PEEK())) { 710 count2 = p_count(p); 711 REQUIRE(count <= count2, REG_BADBR); 712 } else /* single number with comma */ 713 count2 = INFINITY; 714 } else /* just a single number */ 715 count2 = count; 716 repeat(p, pos, count, count2, 0); 717 if (!EATTWO('\\', '}')) { /* error heuristics */ 718 while (MORE() && !SEETWO('\\', '}')) 719 NEXT(); 720 REQUIRE(MORE(), REG_EBRACE); 721 SETERROR(REG_BADBR); 722 } 723 } else if (c == (unsigned char)'$') /* $ (but not \$) ends it */ 724 return(1); 725 726 return(0); 727} 728 729/* 730 - p_count - parse a repetition count 731 == static int p_count(struct parse *p); 732 */ 733static int /* the value */ 734p_count( 735 struct parse *p) 736{ 737 int count = 0; 738 int ndigits = 0; 739 740 _DIAGASSERT(p != NULL); 741 742 while (MORE() && isdigit((unsigned char)PEEK()) && count <= DUPMAX) { 743 count = count*10 + (GETNEXT() - '0'); 744 ndigits++; 745 } 746 747 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); 748 return(count); 749} 750 751/* 752 - p_bracket - parse a bracketed character list 753 == static void p_bracket(struct parse *p); 754 * 755 * Note a significant property of this code: if the allocset() did SETERROR, 756 * no set operations are done. 757 */ 758static void 759p_bracket( 760 struct parse *p) 761{ 762 cset *cs; 763 int invert = 0; 764 _DIAGASSERT(p != NULL); 765 766 cs = allocset(p); 767 if (cs == NULL) 768 return; 769 770 /* Dept of Truly Sickening Special-Case Kludges */ 771 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 772 (size_t)6) == 0) { 773 EMIT(OBOW, 0); 774 NEXTn(6); 775 return; 776 } 777 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 778 (size_t)6) == 0) { 779 EMIT(OEOW, 0); 780 NEXTn(6); 781 return; 782 } 783 784 if (EAT('^')) 785 invert++; /* make note to invert set at end */ 786 if (EAT(']')) 787 CHadd(cs, ']'); 788 else if (EAT('-')) 789 CHadd(cs, '-'); 790 while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) 791 p_b_term(p, cs); 792 if (EAT('-')) 793 CHadd(cs, '-'); 794 MUSTEAT(']', REG_EBRACK); 795 796 if (p->error != 0) /* don't mess things up further */ 797 return; 798 799 if (p->g->cflags®_ICASE) { 800 ssize_t i; 801 int ci; 802 803 for (i = p->g->csetsize - 1; i >= 0; i--) 804 if (CHIN(cs, i) && isalpha(i)) { 805 ci = othercase((int)i); 806 if (ci != i) 807 CHadd(cs, ci); 808 } 809 if (cs->multis != NULL) 810 mccase(p, cs); 811 } 812 if (invert) { 813 ssize_t i; 814 815 for (i = p->g->csetsize - 1; i >= 0; i--) 816 if (CHIN(cs, i)) 817 CHsub(cs, (int)i); 818 else 819 CHadd(cs, (int)i); 820 if (p->g->cflags®_NEWLINE) 821 CHsub(cs, '\n'); 822 if (cs->multis != NULL) 823 mcinvert(p, cs); 824 } 825 826 assert(cs->multis == NULL); /* xxx */ 827 828 if (nch(p, cs) == 1) { /* optimize singleton sets */ 829 ordinary(p, firstch(p, cs)); 830 freeset(p, cs); 831 } else 832 EMIT(OANYOF, freezeset(p, cs)); 833} 834 835/* 836 - p_b_term - parse one term of a bracketed character list 837 == static void p_b_term(struct parse *p, cset *cs); 838 */ 839static void 840p_b_term( 841 struct parse *p, 842 cset *cs) 843{ 844 char c; 845 char start, finish; 846 int i; 847 848 _DIAGASSERT(p != NULL); 849 _DIAGASSERT(cs != NULL); 850 851 /* classify what we've got */ 852 switch ((MORE()) ? PEEK() : '\0') { 853 case '[': 854 c = (MORE2()) ? PEEK2() : '\0'; 855 break; 856 857 case '-': 858 SETERROR(REG_ERANGE); 859 return; /* NOTE RETURN */ 860 861 default: 862 c = '\0'; 863 break; 864 } 865 866 switch (c) { 867 case ':': /* character class */ 868 NEXT2(); 869 REQUIRE(MORE(), REG_EBRACK); 870 c = PEEK(); 871 REQUIRE(c != '-' && c != ']', REG_ECTYPE); 872 p_b_cclass(p, cs); 873 REQUIRE(MORE(), REG_EBRACK); 874 REQUIRE(EATTWO(':', ']'), REG_ECTYPE); 875 break; 876 case '=': /* equivalence class */ 877 NEXT2(); 878 REQUIRE(MORE(), REG_EBRACK); 879 c = PEEK(); 880 REQUIRE(c != '-' && c != ']', REG_ECOLLATE); 881 p_b_eclass(p, cs); 882 REQUIRE(MORE(), REG_EBRACK); 883 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); 884 break; 885 default: /* symbol, ordinary character, or range */ 886/* xxx revision needed for multichar stuff */ 887 start = p_b_symbol(p); 888 if (SEE('-') && MORE2() && PEEK2() != ']') { 889 /* range */ 890 NEXT(); 891 if (EAT('-')) 892 finish = '-'; 893 else 894 finish = p_b_symbol(p); 895 } else 896 finish = start; 897/* xxx what about signed chars here... */ 898 REQUIRE(start <= finish, REG_ERANGE); 899 for (i = start; i <= finish; i++) 900 CHadd(cs, i); 901 break; 902 } 903} 904 905/* 906 - p_b_cclass - parse a character-class name and deal with it 907 == static void p_b_cclass(struct parse *p, cset *cs); 908 */ 909static void 910p_b_cclass( 911 struct parse *p, 912 cset *cs) 913{ 914 const char *sp; 915 const struct cclass *cp; 916 size_t len; 917 const char *u; 918 char c; 919 920 _DIAGASSERT(p != NULL); 921 _DIAGASSERT(cs != NULL); 922 923 sp = p->next; 924 925 while (MORE() && isalpha((unsigned char)PEEK())) 926 NEXT(); 927 len = p->next - sp; 928 for (cp = cclasses; cp->name != NULL; cp++) 929 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 930 break; 931 if (cp->name == NULL) { 932 /* oops, didn't find it */ 933 SETERROR(REG_ECTYPE); 934 return; 935 } 936 937 u = cp->chars; 938 while ((c = *u++) != '\0') 939 CHadd(cs, c); 940 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) 941 MCadd(p, cs, u); 942} 943 944/* 945 - p_b_eclass - parse an equivalence-class name and deal with it 946 == static void p_b_eclass(struct parse *p, cset *cs); 947 * 948 * This implementation is incomplete. xxx 949 */ 950static void 951p_b_eclass( 952 struct parse *p, 953 cset *cs) 954{ 955 char c; 956 957 _DIAGASSERT(p != NULL); 958 _DIAGASSERT(cs != NULL); 959 960 c = p_b_coll_elem(p, '='); 961 CHadd(cs, c); 962} 963 964/* 965 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol 966 == static char p_b_symbol(struct parse *p); 967 */ 968static char /* value of symbol */ 969p_b_symbol( 970 struct parse *p) 971{ 972 char value; 973 974 _DIAGASSERT(p != NULL); 975 976 REQUIRE(MORE(), REG_EBRACK); 977 if (!EATTWO('[', '.')) 978 return(GETNEXT()); 979 980 /* collating symbol */ 981 value = p_b_coll_elem(p, '.'); 982 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); 983 return(value); 984} 985 986/* 987 - p_b_coll_elem - parse a collating-element name and look it up 988 == static char p_b_coll_elem(struct parse *p, int endc); 989 */ 990static char /* value of collating element */ 991p_b_coll_elem( 992 struct parse *p, 993 int endc) /* name ended by endc,']' */ 994{ 995 const char *sp; 996 const struct cname *cp; 997 size_t len; 998 999 _DIAGASSERT(p != NULL); 1000 1001 sp = p->next; 1002 1003 while (MORE() && !SEETWO(endc, ']')) 1004 NEXT(); 1005 if (!MORE()) { 1006 SETERROR(REG_EBRACK); 1007 return(0); 1008 } 1009 len = p->next - sp; 1010 for (cp = cnames; cp->name != NULL; cp++) 1011 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') 1012 return(cp->code); /* known name */ 1013 if (len == 1) 1014 return(*sp); /* single character */ 1015 SETERROR(REG_ECOLLATE); /* neither */ 1016 return(0); 1017} 1018 1019/* 1020 - othercase - return the case counterpart of an alphabetic 1021 == static int othercase(int ch); 1022 */ 1023static int /* if no counterpart, return ch */ 1024othercase( 1025 int ch) 1026{ 1027 assert(isalpha(ch)); 1028 if (isupper(ch)) 1029 return(tolower(ch)); 1030 else if (islower(ch)) 1031 return(toupper(ch)); 1032 else /* peculiar, but could happen */ 1033 return(ch); 1034} 1035 1036/* 1037 - bothcases - emit a dualcase version of a two-case character 1038 == static void bothcases(struct parse *p, int ch); 1039 * 1040 * Boy, is this implementation ever a kludge... 1041 */ 1042static void 1043bothcases( 1044 struct parse *p, 1045 int ch) 1046{ 1047 const char *oldnext; 1048 const char *oldend; 1049 char bracket[3]; 1050 1051 _DIAGASSERT(p != NULL); 1052 1053 oldnext = p->next; 1054 oldend = p->end; 1055 1056 assert(othercase(ch) != ch); /* p_bracket() would recurse */ 1057 p->next = bracket; 1058 p->end = bracket+2; 1059 bracket[0] = ch; 1060 bracket[1] = ']'; 1061 bracket[2] = '\0'; 1062 p_bracket(p); 1063 assert(p->next == bracket+2); 1064 p->next = oldnext; 1065 p->end = oldend; 1066} 1067 1068/* 1069 - ordinary - emit an ordinary character 1070 == static void ordinary(struct parse *p, int ch); 1071 */ 1072static void 1073ordinary( 1074 struct parse *p, 1075 int ch) 1076{ 1077 cat_t *cap; 1078 1079 _DIAGASSERT(p != NULL); 1080 1081 cap = p->g->categories; 1082 if ((p->g->cflags®_ICASE) && isalpha((unsigned char) ch) 1083 && othercase((unsigned char) ch) != (unsigned char) ch) 1084 bothcases(p, (unsigned char) ch); 1085 else { 1086 EMIT(OCHAR, (sopno)(unsigned char)ch); 1087 if (cap[ch] == 0) { 1088 _DIAGASSERT(__type_fit(unsigned char, 1089 p->g->ncategories + 1)); 1090 cap[ch] = (unsigned char)p->g->ncategories++; 1091 } 1092 } 1093} 1094 1095/* 1096 - nonnewline - emit REG_NEWLINE version of OANY 1097 == static void nonnewline(struct parse *p); 1098 * 1099 * Boy, is this implementation ever a kludge... 1100 */ 1101static void 1102nonnewline( 1103 struct parse *p) 1104{ 1105 const char *oldnext; 1106 const char *oldend; 1107 char bracket[4]; 1108 1109 _DIAGASSERT(p != NULL); 1110 1111 oldnext = p->next; 1112 oldend = p->end; 1113 1114 p->next = bracket; 1115 p->end = bracket+3; 1116 bracket[0] = '^'; 1117 bracket[1] = '\n'; 1118 bracket[2] = ']'; 1119 bracket[3] = '\0'; 1120 p_bracket(p); 1121 assert(p->next == bracket+3); 1122 p->next = oldnext; 1123 p->end = oldend; 1124} 1125 1126/* 1127 - repeat - generate code for a bounded repetition, recursively if needed 1128 == static void repeat(struct parse *p, sopno start, int from, int to, 1129 == size_t reclimit); 1130 */ 1131static void 1132repeat( 1133 struct parse *p, 1134 sopno start, /* operand from here to end of strip */ 1135 int from, /* repeated from this number */ 1136 int to, /* to this number of times (maybe INFINITY) */ 1137 size_t reclimit) 1138{ 1139 sopno finish; 1140# define N 2 1141# define INF 3 1142# define REP(f, t) ((f)*8 + (t)) 1143# define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) 1144 sopno copy; 1145 1146 _DIAGASSERT(p != NULL); 1147 1148 if (reclimit++ > RECLIMIT) 1149 p->error = REG_ESPACE; 1150 if (p->error) 1151 return; 1152 1153 finish = HERE(); 1154 1155 assert(from <= to); 1156 1157 switch (REP(MAP(from), MAP(to))) { 1158 case REP(0, 0): /* must be user doing this */ 1159 DROP(finish-start); /* drop the operand */ 1160 break; 1161 case REP(0, 1): /* as x{1,1}? */ 1162 case REP(0, N): /* as x{1,n}? */ 1163 case REP(0, INF): /* as x{1,}? */ 1164 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1165 INSERT(OCH_, start); /* offset is wrong... */ 1166 repeat(p, start+1, 1, to, reclimit); 1167 ASTERN(OOR1, start); 1168 AHEAD(start); /* ... fix it */ 1169 EMIT(OOR2, 0); 1170 AHEAD(THERE()); 1171 ASTERN(O_CH, THERETHERE()); 1172 break; 1173 case REP(1, 1): /* trivial case */ 1174 /* done */ 1175 break; 1176 case REP(1, N): /* as x?x{1,n-1} */ 1177 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ 1178 INSERT(OCH_, start); 1179 ASTERN(OOR1, start); 1180 AHEAD(start); 1181 EMIT(OOR2, 0); /* offset very wrong... */ 1182 AHEAD(THERE()); /* ...so fix it */ 1183 ASTERN(O_CH, THERETHERE()); 1184 copy = dupl(p, start+1, finish+1); 1185 assert(copy == finish+4); 1186 repeat(p, copy, 1, to-1, reclimit); 1187 break; 1188 case REP(1, INF): /* as x+ */ 1189 INSERT(OPLUS_, start); 1190 ASTERN(O_PLUS, start); 1191 break; 1192 case REP(N, N): /* as xx{m-1,n-1} */ 1193 copy = dupl(p, start, finish); 1194 repeat(p, copy, from-1, to-1, reclimit); 1195 break; 1196 case REP(N, INF): /* as xx{n-1,INF} */ 1197 copy = dupl(p, start, finish); 1198 repeat(p, copy, from-1, to, reclimit); 1199 break; 1200 default: /* "can't happen" */ 1201 SETERROR(REG_ASSERT); /* just in case */ 1202 break; 1203 } 1204} 1205 1206/* 1207 - seterr - set an error condition 1208 == static int seterr(struct parse *p, int e); 1209 */ 1210static int /* useless but makes type checking happy */ 1211seterr( 1212 struct parse *p, 1213 int e) 1214{ 1215 1216 _DIAGASSERT(p != NULL); 1217 1218 if (p->error == 0) /* keep earliest error condition */ 1219 p->error = e; 1220 p->next = nuls; /* try to bring things to a halt */ 1221 p->end = nuls; 1222 return(0); /* make the return value well-defined */ 1223} 1224 1225/* 1226 - allocset - allocate a set of characters for [] 1227 == static cset *allocset(struct parse *p); 1228 */ 1229static cset * 1230allocset( 1231 struct parse *p) 1232{ 1233 size_t no; 1234 size_t nc; 1235 size_t nbytes; 1236 cset *cs; 1237 size_t css; 1238 size_t i; 1239 1240 _DIAGASSERT(p != NULL); 1241 1242 no = p->g->ncsets++; 1243 css = (size_t)p->g->csetsize; 1244 if (no >= p->ncsalloc) { /* need another column of space */ 1245 p->ncsalloc += CHAR_BIT; 1246 nc = p->ncsalloc; 1247 assert(nc % CHAR_BIT == 0); 1248 nbytes = nc / CHAR_BIT * css; 1249 if (MEMSIZE(p) > MEMLIMIT) 1250 goto oomem; 1251 if (p->g->sets == NULL) 1252 p->g->sets = malloc(nc * sizeof(cset)); 1253 else 1254 p->g->sets = realloc(p->g->sets, nc * sizeof(cset)); 1255 if (p->g->setbits == NULL) 1256 p->g->setbits = malloc(nbytes); 1257 else { 1258 p->g->setbits = realloc(p->g->setbits, nbytes); 1259 /* xxx this isn't right if setbits is now NULL */ 1260 for (i = 0; i < no; i++) 1261 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); 1262 } 1263 if (p->g->sets != NULL && p->g->setbits != NULL) 1264 (void) memset((char *)p->g->setbits + (nbytes - css), 1265 0, css); 1266 else { 1267oomem: 1268 no = 0; 1269 SETERROR(REG_ESPACE); 1270 /* caller's responsibility not to do set ops */ 1271 return NULL; 1272 } 1273 } 1274 1275 cs = &p->g->sets[no]; 1276 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); 1277 cs->mask = 1 << (unsigned int)((no) % CHAR_BIT); 1278 cs->hash = 0; 1279 cs->smultis = 0; 1280 cs->multis = NULL; 1281 1282 return(cs); 1283} 1284 1285/* 1286 - freeset - free a now-unused set 1287 == static void freeset(struct parse *p, cset *cs); 1288 */ 1289static void 1290freeset( 1291 struct parse *p, 1292 cset *cs) 1293{ 1294 size_t i; 1295 cset *top; 1296 size_t css; 1297 1298 _DIAGASSERT(p != NULL); 1299 _DIAGASSERT(cs != NULL); 1300 1301 top = &p->g->sets[p->g->ncsets]; 1302 css = (size_t)p->g->csetsize; 1303 1304 for (i = 0; i < css; i++) 1305 CHsub(cs, (int)i); 1306 if (cs == top-1) /* recover only the easy case */ 1307 p->g->ncsets--; 1308} 1309 1310/* 1311 - freezeset - final processing on a set of characters 1312 == static int freezeset(struct parse *p, cset *cs); 1313 * 1314 * The main task here is merging identical sets. This is usually a waste 1315 * of time (although the hash code minimizes the overhead), but can win 1316 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash 1317 * is done using addition rather than xor -- all ASCII [aA] sets xor to 1318 * the same value! 1319 */ 1320static sopno /* set number */ 1321freezeset( 1322 struct parse *p, 1323 cset *cs) 1324{ 1325 uch h; 1326 size_t i; 1327 cset *top; 1328 cset *cs2; 1329 size_t css; 1330 1331 _DIAGASSERT(p != NULL); 1332 _DIAGASSERT(cs != NULL); 1333 1334 h = cs->hash; 1335 top = &p->g->sets[p->g->ncsets]; 1336 css = (size_t)p->g->csetsize; 1337 1338 /* look for an earlier one which is the same */ 1339 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) 1340 if (cs2->hash == h && cs2 != cs) { 1341 /* maybe */ 1342 for (i = 0; i < css; i++) 1343 if (!!CHIN(cs2, i) != !!CHIN(cs, i)) 1344 break; /* no */ 1345 if (i == css) 1346 break; /* yes */ 1347 } 1348 1349 if (cs2 < top) { /* found one */ 1350 freeset(p, cs); 1351 cs = cs2; 1352 } 1353 1354 return (sopno)(cs - p->g->sets); 1355} 1356 1357/* 1358 - firstch - return first character in a set (which must have at least one) 1359 == static int firstch(struct parse *p, cset *cs); 1360 */ 1361static int /* character; there is no "none" value */ 1362firstch( 1363 struct parse *p, 1364 cset *cs) 1365{ 1366 size_t i; 1367 size_t css; 1368 1369 _DIAGASSERT(p != NULL); 1370 _DIAGASSERT(cs != NULL); 1371 1372 css = (size_t)p->g->csetsize; 1373 1374 for (i = 0; i < css; i++) 1375 if (CHIN(cs, i)) 1376 return((char)i); 1377 assert(never); 1378 return(0); /* arbitrary */ 1379} 1380 1381/* 1382 - nch - number of characters in a set 1383 == static int nch(struct parse *p, cset *cs); 1384 */ 1385static int 1386nch( 1387 struct parse *p, 1388 cset *cs) 1389{ 1390 size_t i; 1391 size_t css; 1392 int n = 0; 1393 1394 _DIAGASSERT(p != NULL); 1395 _DIAGASSERT(cs != NULL); 1396 1397 css = (size_t)p->g->csetsize; 1398 1399 for (i = 0; i < css; i++) 1400 if (CHIN(cs, i)) 1401 n++; 1402 return(n); 1403} 1404 1405/* 1406 - mcadd - add a collating element to a cset 1407 == static void mcadd(struct parse *p, cset *cs, \ 1408 == char *cp); 1409 */ 1410static void 1411mcadd( 1412 struct parse *p, 1413 cset *cs, 1414 const char *cp) 1415{ 1416 size_t oldend; 1417 1418 _DIAGASSERT(p != NULL); 1419 _DIAGASSERT(cs != NULL); 1420 _DIAGASSERT(cp != NULL); 1421 1422 oldend = cs->smultis; 1423 1424 cs->smultis += strlen(cp) + 1; 1425 if (cs->multis == NULL) 1426 cs->multis = malloc(cs->smultis); 1427 else 1428 cs->multis = realloc(cs->multis, cs->smultis); 1429 if (cs->multis == NULL) { 1430 SETERROR(REG_ESPACE); 1431 return; 1432 } 1433 1434 (void) strcpy(cs->multis + oldend - 1, cp); 1435 cs->multis[cs->smultis - 1] = '\0'; 1436} 1437 1438#if 0 1439/* 1440 - mcsub - subtract a collating element from a cset 1441 == static void mcsub(cset *cs, char *cp); 1442 */ 1443static void 1444mcsub( 1445 cset *cs, 1446 char *cp) 1447{ 1448 char *fp; 1449 size_t len; 1450 1451 _DIAGASSERT(cs != NULL); 1452 _DIAGASSERT(cp != NULL); 1453 1454 fp = mcfind(cs, cp); 1455 len = strlen(fp); 1456 1457 assert(fp != NULL); 1458 (void) memmove(fp, fp + len + 1, 1459 cs->smultis - (fp + len + 1 - cs->multis)); 1460 cs->smultis -= len; 1461 1462 if (cs->smultis == 0) { 1463 free(cs->multis); 1464 cs->multis = NULL; 1465 return; 1466 } 1467 1468 cs->multis = realloc(cs->multis, cs->smultis); 1469 assert(cs->multis != NULL); 1470} 1471 1472/* 1473 - mcin - is a collating element in a cset? 1474 == static int mcin(cset *cs, char *cp); 1475 */ 1476static int 1477mcin( 1478 cset *cs, 1479 char *cp) 1480{ 1481 1482 _DIAGASSERT(cs != NULL); 1483 _DIAGASSERT(cp != NULL); 1484 1485 return(mcfind(cs, cp) != NULL); 1486} 1487 1488/* 1489 - mcfind - find a collating element in a cset 1490 == static char *mcfind(cset *cs, char *cp); 1491 */ 1492static char * 1493mcfind( 1494 cset *cs, 1495 char *cp) 1496{ 1497 char *p; 1498 1499 _DIAGASSERT(cs != NULL); 1500 _DIAGASSERT(cp != NULL); 1501 1502 if (cs->multis == NULL) 1503 return(NULL); 1504 for (p = cs->multis; *p != '\0'; p += strlen(p) + 1) 1505 if (strcmp(cp, p) == 0) 1506 return(p); 1507 return(NULL); 1508} 1509#endif 1510 1511/* 1512 - mcinvert - invert the list of collating elements in a cset 1513 == static void mcinvert(struct parse *p, cset *cs); 1514 * 1515 * This would have to know the set of possibilities. Implementation 1516 * is deferred. 1517 */ 1518/* ARGSUSED */ 1519static void 1520mcinvert( 1521 struct parse *p, 1522 cset *cs) 1523{ 1524 1525 _DIAGASSERT(p != NULL); 1526 _DIAGASSERT(cs != NULL); 1527 1528 assert(cs->multis == NULL); /* xxx */ 1529} 1530 1531/* 1532 - mccase - add case counterparts of the list of collating elements in a cset 1533 == static void mccase(struct parse *p, cset *cs); 1534 * 1535 * This would have to know the set of possibilities. Implementation 1536 * is deferred. 1537 */ 1538/* ARGSUSED */ 1539static void 1540mccase( 1541 struct parse *p, 1542 cset *cs) 1543{ 1544 1545 _DIAGASSERT(p != NULL); 1546 _DIAGASSERT(cs != NULL); 1547 1548 assert(cs->multis == NULL); /* xxx */ 1549} 1550 1551/* 1552 - isinsets - is this character in any sets? 1553 == static int isinsets(struct re_guts *g, int c); 1554 */ 1555static int /* predicate */ 1556isinsets( 1557 struct re_guts *g, 1558 int c) 1559{ 1560 uch *col; 1561 size_t i; 1562 size_t ncols; 1563 unsigned uc = (unsigned char)c; 1564 1565 _DIAGASSERT(g != NULL); 1566 1567 if (g->setbits == NULL) 1568 return 0; 1569 1570 ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1571 1572 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1573 if (col[uc] != 0) 1574 return(1); 1575 return(0); 1576} 1577 1578/* 1579 - samesets - are these two characters in exactly the same sets? 1580 == static int samesets(struct re_guts *g, int c1, int c2); 1581 */ 1582static int /* predicate */ 1583samesets( 1584 struct re_guts *g, 1585 int c1, 1586 int c2) 1587{ 1588 uch *col; 1589 size_t i; 1590 size_t ncols; 1591 unsigned uc1 = (unsigned char)c1; 1592 unsigned uc2 = (unsigned char)c2; 1593 1594 _DIAGASSERT(g != NULL); 1595 1596 ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; 1597 1598 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) 1599 if (col[uc1] != col[uc2]) 1600 return(0); 1601 return(1); 1602} 1603 1604/* 1605 - categorize - sort out character categories 1606 == static void categorize(struct parse *p, struct re_guts *g); 1607 */ 1608static void 1609categorize( 1610 struct parse *p, 1611 struct re_guts *g) 1612{ 1613 cat_t *cats; 1614 int c; 1615 int c2; 1616 cat_t cat; 1617 1618 _DIAGASSERT(p != NULL); 1619 _DIAGASSERT(g != NULL); 1620 1621 cats = g->categories; 1622 1623 /* avoid making error situations worse */ 1624 if (p->error != 0) 1625 return; 1626 1627 for (c = CHAR_MIN; c <= CHAR_MAX; c++) 1628 if (cats[c] == 0 && isinsets(g, c)) { 1629 _DIAGASSERT(__type_fit(unsigned char, 1630 g->ncategories + 1)); 1631 cat = g->ncategories++; 1632 cats[c] = cat; 1633 for (c2 = c+1; c2 <= CHAR_MAX; c2++) 1634 if (cats[c2] == 0 && samesets(g, c, c2)) 1635 cats[c2] = cat; 1636 } 1637} 1638 1639/* 1640 - dupl - emit a duplicate of a bunch of sops 1641 == static sopno dupl(struct parse *p, sopno start, sopno finish); 1642 */ 1643static sopno /* start of duplicate */ 1644dupl( 1645 struct parse *p, 1646 sopno start, /* from here */ 1647 sopno finish) /* to this less one */ 1648{ 1649 sopno ret; 1650 sopno len = finish - start; 1651 1652 _DIAGASSERT(p != NULL); 1653 1654 ret = HERE(); 1655 1656 assert(finish >= start); 1657 if (len == 0) 1658 return(ret); 1659 if (!enlarge(p, p->ssize + len))/* this many unexpected additions */ 1660 return ret; 1661 (void)memcpy(p->strip + p->slen, p->strip + start, 1662 (size_t)len * sizeof(sop)); 1663 p->slen += len; 1664 return(ret); 1665} 1666 1667/* 1668 - doemit - emit a strip operator 1669 == static void doemit(struct parse *p, sop op, size_t opnd); 1670 * 1671 * It might seem better to implement this as a macro with a function as 1672 * hard-case backup, but it's just too big and messy unless there are 1673 * some changes to the data structures. Maybe later. 1674 */ 1675static void 1676doemit( 1677 struct parse *p, 1678 sop op, 1679 sopno opnd) 1680{ 1681 _DIAGASSERT(p != NULL); 1682 1683 /* avoid making error situations worse */ 1684 if (p->error != 0) 1685 return; 1686 1687 /* deal with oversize operands ("can't happen", more or less) */ 1688 assert(opnd < 1<<OPSHIFT); 1689 1690 /* deal with undersized strip */ 1691 if (p->slen >= p->ssize) 1692 if (!enlarge(p, (p->ssize+1) / 2 * 3)) /* +50% */ 1693 return; 1694 1695 /* finally, it's all reduced to the easy case */ 1696 p->strip[p->slen++] = (sop)SOP(op, opnd); 1697} 1698 1699/* 1700 - doinsert - insert a sop into the strip 1701 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos); 1702 */ 1703static void 1704doinsert( 1705 struct parse *p, 1706 sop op, 1707 sopno opnd, 1708 sopno pos) 1709{ 1710 sopno sn; 1711 sop s; 1712 int i; 1713 1714 _DIAGASSERT(p != NULL); 1715 1716 /* avoid making error situations worse */ 1717 if (p->error != 0) 1718 return; 1719 1720 sn = HERE(); 1721 EMIT(op, opnd); /* do checks, ensure space */ 1722 assert(HERE() == sn+1); 1723 s = p->strip[sn]; 1724 1725 /* adjust paren pointers */ 1726 assert(pos > 0); 1727 for (i = 1; i < NPAREN; i++) { 1728 if (p->pbegin[i] >= pos) { 1729 p->pbegin[i]++; 1730 } 1731 if (p->pend[i] >= pos) { 1732 p->pend[i]++; 1733 } 1734 } 1735 1736 memmove(&p->strip[pos+1], &p->strip[pos], (HERE()-pos-1)*sizeof(sop)); 1737 p->strip[pos] = s; 1738} 1739 1740/* 1741 - dofwd - complete a forward reference 1742 == static void dofwd(struct parse *p, sopno pos, sop value); 1743 */ 1744static void 1745dofwd( 1746 struct parse *p, 1747 sopno pos, 1748 sopno value) 1749{ 1750 1751 _DIAGASSERT(p != NULL); 1752 1753 /* avoid making error situations worse */ 1754 if (p->error != 0) 1755 return; 1756 1757 assert(value < 1<<OPSHIFT); 1758 p->strip[pos] = (sop)(OP(p->strip[pos]) | value); 1759} 1760 1761/* 1762 - enlarge - enlarge the strip 1763 == static void enlarge(struct parse *p, sopno size); 1764 */ 1765static int 1766enlarge( 1767 struct parse *p, 1768 sopno size) 1769{ 1770 sop *sp; 1771 sopno osize; 1772 1773 _DIAGASSERT(p != NULL); 1774 1775 if (p->ssize >= size) 1776 return 1; 1777 1778 osize = p->ssize; 1779 p->ssize = size; 1780 if (MEMSIZE(p) > MEMLIMIT) 1781 goto oomem; 1782 sp = realloc(p->strip, p->ssize * sizeof(sop)); 1783 if (sp == NULL) { 1784oomem: 1785 p->ssize = osize; 1786 SETERROR(REG_ESPACE); 1787 return 0; 1788 } 1789 p->strip = sp; 1790 return 1; 1791} 1792 1793/* 1794 - stripsnug - compact the strip 1795 == static void stripsnug(struct parse *p, struct re_guts *g); 1796 */ 1797static void 1798stripsnug( 1799 struct parse *p, 1800 struct re_guts *g) 1801{ 1802 1803 _DIAGASSERT(p != NULL); 1804 _DIAGASSERT(g != NULL); 1805 1806 g->nstates = p->slen; 1807 g->strip = realloc(p->strip, p->slen * sizeof(sop)); 1808 if (g->strip == NULL) { 1809 SETERROR(REG_ESPACE); 1810 g->strip = p->strip; 1811 } 1812} 1813 1814/* 1815 - findmust - fill in must and mlen with longest mandatory literal string 1816 == static void findmust(struct parse *p, struct re_guts *g); 1817 * 1818 * This algorithm could do fancy things like analyzing the operands of | 1819 * for common subsequences. Someday. This code is simple and finds most 1820 * of the interesting cases. 1821 * 1822 * Note that must and mlen got initialized during setup. 1823 */ 1824static void 1825findmust( 1826 struct parse *p, 1827 struct re_guts *g) 1828{ 1829 sop *scan; 1830 sop *start = NULL; 1831 sop *newstart = NULL; 1832 sopno newlen; 1833 sop s; 1834 char *cp; 1835 sopno i; 1836 1837 _DIAGASSERT(p != NULL); 1838 _DIAGASSERT(g != NULL); 1839 1840 /* avoid making error situations worse */ 1841 if (p->error != 0) 1842 return; 1843 1844 /* find the longest OCHAR sequence in strip */ 1845 newlen = 0; 1846 scan = g->strip + 1; 1847 do { 1848 s = *scan++; 1849 switch (OP(s)) { 1850 case OCHAR: /* sequence member */ 1851 if (newlen == 0) /* new sequence */ 1852 newstart = scan - 1; 1853 newlen++; 1854 break; 1855 case OPLUS_: /* things that don't break one */ 1856 case OLPAREN: 1857 case ORPAREN: 1858 break; 1859 case OQUEST_: /* things that must be skipped */ 1860 case OCH_: 1861 scan--; 1862 do { 1863 scan += OPND(s); 1864 s = *scan; 1865 /* assert() interferes w debug printouts */ 1866 if (OP(s) != O_QUEST && OP(s) != O_CH && 1867 OP(s) != OOR2) { 1868 g->iflags |= BAD; 1869 return; 1870 } 1871 } while (OP(s) != O_QUEST && OP(s) != O_CH); 1872 /* FALLTHROUGH */ 1873 default: /* things that break a sequence */ 1874 if (newlen > g->mlen) { /* ends one */ 1875 start = newstart; 1876 g->mlen = newlen; 1877 } 1878 newlen = 0; 1879 break; 1880 } 1881 } while (OP(s) != OEND); 1882 1883 if (start == NULL) 1884 g->mlen = 0; 1885 1886 if (g->mlen == 0) /* there isn't one */ 1887 return; 1888 1889 /* turn it into a character string */ 1890 g->must = malloc((size_t)g->mlen + 1); 1891 if (g->must == NULL) { /* argh; just forget it */ 1892 g->mlen = 0; 1893 return; 1894 } 1895 cp = g->must; 1896 scan = start; 1897 for (i = g->mlen; i > 0; i--) { 1898 while (OP(s = *scan++) != OCHAR) 1899 continue; 1900 assert(cp < g->must + g->mlen); 1901 *cp++ = (char)OPND(s); 1902 } 1903 assert(cp == g->must + g->mlen); 1904 *cp++ = '\0'; /* just on general principles */ 1905} 1906 1907/* 1908 - pluscount - count + nesting 1909 == static sopno pluscount(struct parse *p, struct re_guts *g); 1910 */ 1911static sopno /* nesting depth */ 1912pluscount( 1913 struct parse *p, 1914 struct re_guts *g) 1915{ 1916 sop *scan; 1917 sop s; 1918 sopno plusnest = 0; 1919 sopno maxnest = 0; 1920 1921 _DIAGASSERT(p != NULL); 1922 _DIAGASSERT(g != NULL); 1923 1924 if (p->error != 0) 1925 return(0); /* there may not be an OEND */ 1926 1927 scan = g->strip + 1; 1928 do { 1929 s = *scan++; 1930 switch (OP(s)) { 1931 case OPLUS_: 1932 plusnest++; 1933 break; 1934 case O_PLUS: 1935 if (plusnest > maxnest) 1936 maxnest = plusnest; 1937 plusnest--; 1938 break; 1939 } 1940 } while (OP(s) != OEND); 1941 if (plusnest != 0) 1942 g->iflags |= BAD; 1943 return(maxnest); 1944} 1945