1/*	$NetBSD: engine.c,v 1.24 2012/03/13 21:13:42 christos Exp $	*/
2
3/*-
4 * Copyright (c) 1992, 1993, 1994
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Henry Spencer.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)engine.c	8.5 (Berkeley) 3/20/94
35 */
36
37/*-
38 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
39 *
40 * This code is derived from software contributed to Berkeley by
41 * Henry Spencer.
42 *
43 * Redistribution and use in source and binary forms, with or without
44 * modification, are permitted provided that the following conditions
45 * are met:
46 * 1. Redistributions of source code must retain the above copyright
47 *    notice, this list of conditions and the following disclaimer.
48 * 2. Redistributions in binary form must reproduce the above copyright
49 *    notice, this list of conditions and the following disclaimer in the
50 *    documentation and/or other materials provided with the distribution.
51 * 3. All advertising materials mentioning features or use of this software
52 *    must display the following acknowledgement:
53 *	This product includes software developed by the University of
54 *	California, Berkeley and its contributors.
55 * 4. Neither the name of the University nor the names of its contributors
56 *    may be used to endorse or promote products derived from this software
57 *    without specific prior written permission.
58 *
59 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
60 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
62 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
69 * SUCH DAMAGE.
70 *
71 *	@(#)engine.c	8.5 (Berkeley) 3/20/94
72 */
73
74/*
75 * The matching engine and friends.  This file is #included by regexec.c
76 * after suitable #defines of a variety of macros used herein, so that
77 * different state representations can be used without duplicating masses
78 * of code.
79 */
80
81#ifdef SNAMES
82#define	matcher	smatcher
83#define	fast	sfast
84#define	slow	sslow
85#define	dissect	sdissect
86#define	backref	sbackref
87#define	step	sstep
88#define	print	sprint
89#define	at	sat
90#define	match	smat
91#define	nope	snope
92#endif
93#ifdef LNAMES
94#define	matcher	lmatcher
95#define	fast	lfast
96#define	slow	lslow
97#define	dissect	ldissect
98#define	backref	lbackref
99#define	step	lstep
100#define	print	lprint
101#define	at	lat
102#define	match	lmat
103#define	nope	lnope
104#endif
105
106/* another structure passed up and down to avoid zillions of parameters */
107struct match {
108	struct re_guts *g;
109	int eflags;
110	regmatch_t *pmatch;	/* [nsub+1] (0 element unused) */
111	const char *offp;	/* offsets work from here */
112	const char *beginp;	/* start of string -- virtual NUL precedes */
113	const char *endp;	/* end of string -- virtual NUL here */
114	const char *coldp;	/* can be no match starting before here */
115	const char **lastpos;	/* [nplus+1] */
116	STATEVARS;
117	states st;		/* current states */
118	states fresh;		/* states for a fresh start */
119	states tmp;		/* temporary */
120	states empty;		/* empty set of states */
121};
122
123/* ========= begin header generated by ./mkh ========= */
124#ifdef __cplusplus
125extern "C" {
126#endif
127
128/* === engine.c === */
129static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
130static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
131static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev);
132static const char *fast(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
133static const char *slow(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
134static states step(struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft);
135#define	BOL	(OUT+1)
136#define	EOL	(BOL+1)
137#define	BOLEOL	(BOL+2)
138#define	NOTHING	(BOL+3)
139#define	BOW	(BOL+4)
140#define	EOW	(BOL+5)
141#define	CODEMAX	(BOL+5)		/* highest code used */
142#define	NONCHAR(c)	((c) > CHAR_MAX)
143#define	NNONCHAR	(CODEMAX-CHAR_MAX)
144#ifdef REDEBUG
145static void print(struct match *m, char *caption, states st, int ch, FILE *d);
146#endif
147#ifdef REDEBUG
148static void at(struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst);
149#endif
150#ifdef REDEBUG
151static char *pchar(int ch);
152#endif
153
154#ifdef __cplusplus
155}
156#endif
157/* ========= end header generated by ./mkh ========= */
158
159#ifdef REDEBUG
160#define	SP(t, s, c)	print(m, t, s, c, stdout)
161#define	AT(t, p1, p2, s1, s2)	at(m, t, p1, p2, s1, s2)
162#define	NOTE(str)	{ if (m->eflags&REG_TRACE) printf("=%s\n", (str)); }
163static int nope = 0;
164#else
165#define	SP(t, s, c)	/* nothing */
166#define	AT(t, p1, p2, s1, s2)	/* nothing */
167#define	NOTE(s)	/* nothing */
168#endif
169
170/*
171 - matcher - the actual matching engine
172 == static int matcher(struct re_guts *g, char *string, \
173 ==	size_t nmatch, regmatch_t pmatch[], int eflags);
174 */
175static int			/* 0 success, REG_NOMATCH failure */
176matcher(
177    struct re_guts *g,
178    const char *string,
179    size_t nmatch,
180    regmatch_t pmatch[],
181    int eflags)
182{
183	const char *endp;
184	size_t i;
185	struct match mv;
186	struct match *m = &mv;
187	const char *dp;
188	const sopno gf = g->firststate+1;	/* +1 for OEND */
189	const sopno gl = g->laststate;
190	const char *start;
191	const char *stop;
192	int error = 0;
193
194	_DIAGASSERT(g != NULL);
195	_DIAGASSERT(string != NULL);
196	/* pmatch checked below */
197
198	/* simplify the situation where possible */
199	if (g->cflags&REG_NOSUB)
200		nmatch = 0;
201	if (eflags&REG_STARTEND) {
202		_DIAGASSERT(pmatch != NULL);
203		start = string + (size_t)pmatch[0].rm_so;
204		stop = string + (size_t)pmatch[0].rm_eo;
205	} else {
206		start = string;
207		stop = start + strlen(start);
208	}
209	if (stop < start)
210		return(REG_INVARG);
211
212	/* prescreening; this does wonders for this rather slow code */
213	if (g->must != NULL) {
214		for (dp = start; dp < stop; dp++)
215			if (*dp == g->must[0] && (size_t)(stop - dp) >= g->mlen &&
216				memcmp(dp, g->must, g->mlen) == 0)
217				break;
218		if (dp == stop)		/* we didn't find g->must */
219			return(REG_NOMATCH);
220	}
221
222	/* match struct setup */
223	m->g = g;
224	m->eflags = eflags;
225	m->pmatch = NULL;
226	m->lastpos = NULL;
227	m->offp = string;
228	m->beginp = start;
229	m->endp = stop;
230	STATESETUP(m, 4);
231	SETUP(m->st);
232	SETUP(m->fresh);
233	SETUP(m->tmp);
234	SETUP(m->empty);
235	CLEAR(m->empty);
236
237	/* this loop does only one repetition except for backrefs */
238	for (;;) {
239		endp = fast(m, start, stop, gf, gl);
240		if (endp == NULL) {		/* a miss */
241			error = REG_NOMATCH;
242			goto done;
243		}
244		if (nmatch == 0 && !g->backrefs)
245			break;		/* no further info needed */
246
247		/* where? */
248		assert(m->coldp != NULL);
249		for (;;) {
250			NOTE("finding start");
251			endp = slow(m, m->coldp, stop, gf, gl);
252			if (endp != NULL)
253				break;
254			assert(m->coldp < m->endp);
255			m->coldp++;
256		}
257		if (nmatch == 1 && !g->backrefs)
258			break;		/* no further info needed */
259
260		/* oh my, he wants the subexpressions... */
261		if (m->pmatch == NULL)
262			m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
263							sizeof(regmatch_t));
264		if (m->pmatch == NULL) {
265			error = REG_ESPACE;
266			goto done;
267		}
268		for (i = 1; i <= m->g->nsub; i++)
269			m->pmatch[i].rm_so = m->pmatch[i].rm_eo = (regoff_t)-1;
270		if (!g->backrefs && !(m->eflags&REG_BACKR)) {
271			NOTE("dissecting");
272			dp = dissect(m, m->coldp, endp, gf, gl);
273		} else {
274			if (g->nplus > 0 && m->lastpos == NULL)
275				m->lastpos = malloc((g->nplus+1) *
276							sizeof(const char *));
277			if (g->nplus > 0 && m->lastpos == NULL) {
278				error = REG_ESPACE;
279				goto done;
280			}
281			NOTE("backref dissect");
282			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
283		}
284		if (dp != NULL)
285			break;
286
287		/* uh-oh... we couldn't find a subexpression-level match */
288		assert(g->backrefs);	/* must be back references doing it */
289		assert(g->nplus == 0 || m->lastpos != NULL);
290		for (;;) {
291			if (dp != NULL || endp <= m->coldp)
292				break;		/* defeat */
293			NOTE("backoff");
294			endp = slow(m, m->coldp, endp-1, gf, gl);
295			if (endp == NULL)
296				break;		/* defeat */
297			/* try it on a shorter possibility */
298#ifndef NDEBUG
299			for (i = 1; i <= m->g->nsub; i++) {
300				assert(m->pmatch[i].rm_so == (regoff_t)-1);
301				assert(m->pmatch[i].rm_eo == (regoff_t)-1);
302			}
303#endif
304			NOTE("backoff dissect");
305			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
306		}
307		assert(dp == NULL || dp == endp);
308		if (dp != NULL)		/* found a shorter one */
309			break;
310
311		/* despite initial appearances, there is no match here */
312		NOTE("false alarm");
313		start = m->coldp + 1;	/* recycle starting later */
314		assert(start <= stop);
315	}
316
317	/* fill in the details if requested */
318	if (nmatch > 0) {
319		_DIAGASSERT(pmatch != NULL);
320		pmatch[0].rm_so = m->coldp - m->offp;
321		pmatch[0].rm_eo = endp - m->offp;
322	}
323	if (nmatch > 1) {
324		assert(m->pmatch != NULL);
325		for (i = 1; i < nmatch; i++)
326			if (i <= m->g->nsub)
327				pmatch[i] = m->pmatch[i];
328			else {
329				pmatch[i].rm_so = (regoff_t)-1;
330				pmatch[i].rm_eo = (regoff_t)-1;
331			}
332	}
333
334done:
335	if (m->pmatch != NULL) {
336		free(m->pmatch);
337		m->pmatch = NULL;
338	}
339	if (m->lastpos != NULL) {
340		free(m->lastpos);
341		m->lastpos = NULL;
342	}
343	STATETEARDOWN(m);
344	return error;
345}
346
347/*
348 - dissect - figure out what matched what, no back references
349 == static const char *dissect(struct match *m, const char *start, \
350 ==	const char *stop, sopno startst, sopno stopst);
351 */
352static const char *			/* == stop (success) always */
353dissect(
354    struct match *m,
355    const char *start,
356    const char *stop,
357    sopno startst,
358    sopno stopst)
359{
360	int i;
361	sopno ss;	/* start sop of current subRE */
362	sopno es;	/* end sop of current subRE */
363	const char *sp;	/* start of string matched by it */
364	const char *stp; /* string matched by it cannot pass here */
365	const char *rest; /* start of rest of string */
366	const char *tail; /* string unmatched by rest of RE */
367	sopno ssub;	/* start sop of subsubRE */
368	sopno esub;	/* end sop of subsubRE */
369	const char *ssp; /* start of string matched by subsubRE */
370	const char *sep; /* end of string matched by subsubRE */
371	const char *oldssp; /* previous ssp */
372#ifndef NDEBUG
373	const char *dp;
374#endif
375
376	_DIAGASSERT(m != NULL);
377	_DIAGASSERT(start != NULL);
378	_DIAGASSERT(stop != NULL);
379
380	AT("diss", start, stop, startst, stopst);
381	sp = start;
382	for (ss = startst; ss < stopst; ss = es) {
383		/* identify end of subRE */
384		es = ss;
385		switch (OP(m->g->strip[es])) {
386		case OPLUS_:
387		case OQUEST_:
388			es += OPND(m->g->strip[es]);
389			break;
390		case OCH_:
391			while (OP(m->g->strip[es]) != O_CH)
392				es += OPND(m->g->strip[es]);
393			break;
394		}
395		es++;
396
397		/* figure out what it matched */
398		switch (OP(m->g->strip[ss])) {
399		case OEND:
400			assert(nope);
401			break;
402		case OCHAR:
403			sp++;
404			break;
405		case OBOL:
406		case OEOL:
407		case OBOW:
408		case OEOW:
409			break;
410		case OANY:
411		case OANYOF:
412			sp++;
413			break;
414		case OBACK_:
415		case O_BACK:
416			assert(nope);
417			break;
418		/* cases where length of match is hard to find */
419		case OQUEST_:
420			stp = stop;
421			for (;;) {
422				/* how long could this one be? */
423				rest = slow(m, sp, stp, ss, es);
424				assert(rest != NULL);	/* it did match */
425				/* could the rest match the rest? */
426				tail = slow(m, rest, stop, es, stopst);
427				if (tail == stop)
428					break;		/* yes! */
429				/* no -- try a shorter match for this one */
430				stp = rest - 1;
431				assert(stp >= sp);	/* it did work */
432			}
433			ssub = ss + 1;
434			esub = es - 1;
435			/* did innards match? */
436			if (slow(m, sp, rest, ssub, esub) != NULL) {
437#ifdef NDEBUG
438				(void)
439#else
440				dp =
441#endif
442				    dissect(m, sp, rest, ssub, esub);
443				assert(dp == rest);
444			} else		/* no */
445				assert(sp == rest);
446			sp = rest;
447			break;
448		case OPLUS_:
449			stp = stop;
450			for (;;) {
451				/* how long could this one be? */
452				rest = slow(m, sp, stp, ss, es);
453				assert(rest != NULL);	/* it did match */
454				/* could the rest match the rest? */
455				tail = slow(m, rest, stop, es, stopst);
456				if (tail == stop)
457					break;		/* yes! */
458				/* no -- try a shorter match for this one */
459				stp = rest - 1;
460				assert(stp >= sp);	/* it did work */
461			}
462			ssub = ss + 1;
463			esub = es - 1;
464			ssp = sp;
465			oldssp = ssp;
466			for (;;) {	/* find last match of innards */
467				sep = slow(m, ssp, rest, ssub, esub);
468				if (sep == NULL || sep == ssp)
469					break;	/* failed or matched null */
470				oldssp = ssp;	/* on to next try */
471				ssp = sep;
472			}
473			if (sep == NULL) {
474				/* last successful match */
475				sep = ssp;
476				ssp = oldssp;
477			}
478			assert(sep == rest);	/* must exhaust substring */
479			assert(slow(m, ssp, sep, ssub, esub) == rest);
480#ifdef NDEBUG
481			(void)
482#else
483			dp =
484#endif
485			    dissect(m, ssp, sep, ssub, esub);
486			assert(dp == sep);
487			sp = rest;
488			break;
489		case OCH_:
490			stp = stop;
491			for (;;) {
492				/* how long could this one be? */
493				rest = slow(m, sp, stp, ss, es);
494				assert(rest != NULL);	/* it did match */
495				/* could the rest match the rest? */
496				tail = slow(m, rest, stop, es, stopst);
497				if (tail == stop)
498					break;		/* yes! */
499				/* no -- try a shorter match for this one */
500				stp = rest - 1;
501				assert(stp >= sp);	/* it did work */
502			}
503			ssub = ss + 1;
504			esub = ss + OPND(m->g->strip[ss]) - 1;
505			assert(OP(m->g->strip[esub]) == OOR1);
506			for (;;) {	/* find first matching branch */
507				if (slow(m, sp, rest, ssub, esub) == rest)
508					break;	/* it matched all of it */
509				/* that one missed, try next one */
510				assert(OP(m->g->strip[esub]) == OOR1);
511				esub++;
512				assert(OP(m->g->strip[esub]) == OOR2);
513				ssub = esub + 1;
514				esub += OPND(m->g->strip[esub]);
515				if (OP(m->g->strip[esub]) == OOR2)
516					esub--;
517				else
518					assert(OP(m->g->strip[esub]) == O_CH);
519			}
520#ifdef NDEBUG
521			(void)
522#else
523			dp =
524#endif
525			    dissect(m, sp, rest, ssub, esub);
526			assert(dp == rest);
527			sp = rest;
528			break;
529		case O_PLUS:
530		case O_QUEST:
531		case OOR1:
532		case OOR2:
533		case O_CH:
534			assert(nope);
535			break;
536		case OLPAREN:
537			i = OPND(m->g->strip[ss]);
538			assert(0 < i && i <= m->g->nsub);
539			m->pmatch[i].rm_so = sp - m->offp;
540			break;
541		case ORPAREN:
542			i = OPND(m->g->strip[ss]);
543			assert(0 < i && i <= m->g->nsub);
544			m->pmatch[i].rm_eo = sp - m->offp;
545			break;
546		default:		/* uh oh */
547			assert(nope);
548			break;
549		}
550	}
551
552	assert(sp == stop);
553	return(sp);
554}
555
556/*
557 - backref - figure out what matched what, figuring in back references
558 == static const char *backref(struct match *m, const char *start, \
559 ==	const char *stop, sopno startst, sopno stopst, sopno lev);
560 */
561static const char *		/* == stop (success) or NULL (failure) */
562backref(
563    struct match *m,
564    const char *start,
565    const char *stop,
566    sopno startst,
567    sopno stopst,
568    sopno lev)			/* PLUS nesting level */
569{
570	int i;
571	sopno ss;	/* start sop of current subRE */
572	const char *sp;	/* start of string matched by it */
573	sopno ssub;	/* start sop of subsubRE */
574	sopno esub;	/* end sop of subsubRE */
575	const char *ssp; /* start of string matched by subsubRE */
576	const char *dp;
577	size_t len;
578	int hard;
579	sop s;
580	regoff_t offsave;
581	cset *cs;
582
583	_DIAGASSERT(m != NULL);
584	_DIAGASSERT(start != NULL);
585	_DIAGASSERT(stop != NULL);
586
587	AT("back", start, stop, startst, stopst);
588	sp = start;
589
590	/* get as far as we can with easy stuff */
591	hard = 0;
592	for (ss = startst; !hard && ss < stopst; ss++)
593		switch (OP(s = m->g->strip[ss])) {
594		case OCHAR:
595			if (sp == stop || *sp++ != (char)OPND(s))
596				return(NULL);
597			break;
598		case OANY:
599			if (sp == stop)
600				return(NULL);
601			sp++;
602			break;
603		case OANYOF:
604			cs = &m->g->sets[OPND(s)];
605			if (sp == stop || !CHIN(cs, *sp++))
606				return(NULL);
607			break;
608		case OBOL:
609			if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
610					(sp < m->endp && *(sp-1) == '\n' &&
611						(m->g->cflags&REG_NEWLINE)) )
612				{ /* yes */ }
613			else
614				return(NULL);
615			break;
616		case OEOL:
617			if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
618					(sp < m->endp && *sp == '\n' &&
619						(m->g->cflags&REG_NEWLINE)) )
620				{ /* yes */ }
621			else
622				return(NULL);
623			break;
624		case OBOW:
625			if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
626					(sp < m->endp && *(sp-1) == '\n' &&
627						(m->g->cflags&REG_NEWLINE)) ||
628					(sp > m->beginp &&
629							!ISWORD(*(sp-1))) ) &&
630					(sp < m->endp && ISWORD(*sp)) )
631				{ /* yes */ }
632			else
633				return(NULL);
634			break;
635		case OEOW:
636			if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
637					(sp < m->endp && *sp == '\n' &&
638						(m->g->cflags&REG_NEWLINE)) ||
639					(sp < m->endp && !ISWORD(*sp)) ) &&
640					(sp > m->beginp && ISWORD(*(sp-1))) )
641				{ /* yes */ }
642			else
643				return(NULL);
644			break;
645		case O_QUEST:
646			break;
647		case OOR1:	/* matches null but needs to skip */
648			ss++;
649			s = m->g->strip[ss];
650			do {
651				assert(OP(s) == OOR2);
652				ss += OPND(s);
653			} while (OP(s = m->g->strip[ss]) != O_CH);
654			/* note that the ss++ gets us past the O_CH */
655			break;
656		default:	/* have to make a choice */
657			hard = 1;
658			break;
659		}
660	if (!hard) {		/* that was it! */
661		if (sp != stop)
662			return(NULL);
663		return(sp);
664	}
665	ss--;			/* adjust for the for's final increment */
666
667	/* the hard stuff */
668	AT("hard", sp, stop, ss, stopst);
669	s = m->g->strip[ss];
670	switch (OP(s)) {
671	case OBACK_:		/* the vilest depths */
672		i = OPND(s);
673		assert(0 < i && i <= m->g->nsub);
674		if (m->pmatch[i].rm_eo == (regoff_t)-1)
675			return(NULL);
676		assert(m->pmatch[i].rm_so != (regoff_t)-1);
677		len = (size_t)(m->pmatch[i].rm_eo - m->pmatch[i].rm_so);
678		if (len == 0)
679			return(NULL);
680		assert(stop - m->beginp >= len);
681		if (sp > stop - len)
682			return(NULL);	/* not enough left to match */
683		ssp = m->offp + (size_t)m->pmatch[i].rm_so;
684		if (memcmp(sp, ssp, len) != 0)
685			return(NULL);
686		while (m->g->strip[ss] != SOP(O_BACK, i))
687			ss++;
688		return(backref(m, sp+len, stop, ss+1, stopst, lev));
689
690	case OQUEST_:		/* to null or not */
691		dp = backref(m, sp, stop, ss+1, stopst, lev);
692		if (dp != NULL)
693			return(dp);	/* not */
694		return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
695
696	case OPLUS_:
697		assert(m->lastpos != NULL);
698		assert(lev+1 <= m->g->nplus);
699		m->lastpos[lev+1] = sp;
700		return(backref(m, sp, stop, ss+1, stopst, lev+1));
701
702	case O_PLUS:
703		if (sp == m->lastpos[lev])	/* last pass matched null */
704			return(backref(m, sp, stop, ss+1, stopst, lev-1));
705		/* try another pass */
706		m->lastpos[lev] = sp;
707		dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
708		if (dp == NULL)
709			dp = backref(m, sp, stop, ss+1, stopst, lev-1);
710		return(dp);
711
712	case OCH_:		/* find the right one, if any */
713		ssub = ss + 1;
714		esub = ss + OPND(s) - 1;
715		assert(OP(m->g->strip[esub]) == OOR1);
716		for (;;) {	/* find first matching branch */
717			dp = backref(m, sp, stop, ssub, esub, lev);
718			if (dp != NULL)
719				return(dp);
720			/* that one missed, try next one */
721			if (OP(m->g->strip[esub]) == O_CH)
722				return(NULL);	/* there is none */
723			esub++;
724			assert(OP(m->g->strip[esub]) == OOR2);
725			ssub = esub + 1;
726			esub += OPND(m->g->strip[esub]);
727			if (OP(m->g->strip[esub]) == OOR2)
728				esub--;
729			else
730				assert(OP(m->g->strip[esub]) == O_CH);
731		}
732
733	case OLPAREN:		/* must undo assignment if rest fails */
734		i = OPND(s);
735		assert(0 < i && i <= m->g->nsub);
736		offsave = m->pmatch[i].rm_so;
737		m->pmatch[i].rm_so = sp - m->offp;
738		dp = backref(m, sp, stop, ss+1, stopst, lev);
739		if (dp != NULL)
740			return(dp);
741		m->pmatch[i].rm_so = offsave;
742		return(NULL);
743
744	case ORPAREN:		/* must undo assignment if rest fails */
745		i = OPND(s);
746		assert(0 < i && i <= m->g->nsub);
747		offsave = m->pmatch[i].rm_eo;
748		m->pmatch[i].rm_eo = sp - m->offp;
749		dp = backref(m, sp, stop, ss+1, stopst, lev);
750		if (dp != NULL)
751			return(dp);
752		m->pmatch[i].rm_eo = offsave;
753		return(NULL);
754
755	default:		/* uh oh */
756		assert(nope);
757		break;
758	}
759
760	/* "can't happen" */
761	assert(nope);
762	/* NOTREACHED */
763	return NULL;
764}
765
766/*
767 - fast - step through the string at top speed
768 == static const char *fast(struct match *m, const char *start, \
769 ==	const char *stop, sopno startst, sopno stopst);
770 */
771static const char *		/* where tentative match ended, or NULL */
772fast(
773    struct match *m,
774    const char *start,
775    const char *stop,
776    sopno startst,
777    sopno stopst)
778{
779	states st = m->st;
780	states fresh = m->fresh;
781	states tmp = m->tmp;
782	const char *p = start;
783	int c = (start == m->beginp) ? OUT : *(start-1);
784	int lastc;	/* previous c */
785	int flagch;
786	size_t i;
787	const char *coldp; /* last p after which no match was underway */
788
789	_DIAGASSERT(m != NULL);
790	_DIAGASSERT(start != NULL);
791	_DIAGASSERT(stop != NULL);
792
793	CLEAR(st);
794	SET1(st, startst);
795	st = step(m->g, startst, stopst, st, NOTHING, st);
796	ASSIGN(fresh, st);
797	SP("start", st, *p);
798	coldp = NULL;
799	for (;;) {
800		/* next character */
801		lastc = c;
802		c = (p == m->endp) ? OUT : *p;
803		if (EQ(st, fresh))
804			coldp = p;
805
806		/* is there an EOL and/or BOL between lastc and c? */
807		flagch = '\0';
808		i = 0;
809		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
810				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
811			flagch = BOL;
812			i = m->g->nbol;
813		}
814		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
815				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
816			flagch = (flagch == BOL) ? BOLEOL : EOL;
817			i += m->g->neol;
818		}
819		if (i != 0) {
820			for (; i > 0; i--)
821				st = step(m->g, startst, stopst, st, flagch, st);
822			SP("boleol", st, c);
823		}
824
825		/* how about a word boundary? */
826		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
827					(c != OUT && ISWORD(c)) ) {
828			flagch = BOW;
829		}
830		if ( (lastc != OUT && ISWORD(lastc)) &&
831				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
832			flagch = EOW;
833		}
834		if (flagch == BOW || flagch == EOW) {
835			st = step(m->g, startst, stopst, st, flagch, st);
836			SP("boweow", st, c);
837		}
838
839		/* are we done? */
840		if (ISSET(st, stopst) || p == stop)
841			break;		/* NOTE BREAK OUT */
842
843		/* no, we must deal with this character */
844		ASSIGN(tmp, st);
845		ASSIGN(st, fresh);
846		assert(c != OUT);
847		st = step(m->g, startst, stopst, tmp, c, st);
848		SP("aft", st, c);
849		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
850		p++;
851	}
852
853	assert(coldp != NULL);
854	m->coldp = coldp;
855	if (ISSET(st, stopst))
856		return(p+1);
857	else
858		return(NULL);
859}
860
861/*
862 - slow - step through the string more deliberately
863 == static const char *slow(struct match *m, const char *start, \
864 ==	const char *stop, sopno startst, sopno stopst);
865 */
866static const char *			/* where it ended */
867slow(
868    struct match *m,
869    const char *start,
870    const char *stop,
871    sopno startst,
872    sopno stopst)
873{
874	states st = m->st;
875	states empty = m->empty;
876	states tmp = m->tmp;
877	const char *p = start;
878	int c = (start == m->beginp) ? OUT : *(start-1);
879	int lastc;	/* previous c */
880	int flagch;
881	size_t i;
882	const char *matchp;	/* last p at which a match ended */
883
884	_DIAGASSERT(m != NULL);
885	_DIAGASSERT(start != NULL);
886	_DIAGASSERT(stop != NULL);
887
888	AT("slow", start, stop, startst, stopst);
889	CLEAR(st);
890	SET1(st, startst);
891	SP("sstart", st, *p);
892	st = step(m->g, startst, stopst, st, NOTHING, st);
893	matchp = NULL;
894	for (;;) {
895		/* next character */
896		lastc = c;
897		c = (p == m->endp) ? OUT : *p;
898
899		/* is there an EOL and/or BOL between lastc and c? */
900		flagch = '\0';
901		i = 0;
902		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
903				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
904			flagch = BOL;
905			i = m->g->nbol;
906		}
907		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
908				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
909			flagch = (flagch == BOL) ? BOLEOL : EOL;
910			i += m->g->neol;
911		}
912		if (i != 0) {
913			for (; i > 0; i--)
914				st = step(m->g, startst, stopst, st, flagch, st);
915			SP("sboleol", st, c);
916		}
917
918		/* how about a word boundary? */
919		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
920					(c != OUT && ISWORD(c)) ) {
921			flagch = BOW;
922		}
923		if ( (lastc != OUT && ISWORD(lastc)) &&
924				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
925			flagch = EOW;
926		}
927		if (flagch == BOW || flagch == EOW) {
928			st = step(m->g, startst, stopst, st, flagch, st);
929			SP("sboweow", st, c);
930		}
931
932		/* are we done? */
933		if (ISSET(st, stopst))
934			matchp = p;
935		if (EQ(st, empty) || p == stop)
936			break;		/* NOTE BREAK OUT */
937
938		/* no, we must deal with this character */
939		ASSIGN(tmp, st);
940		ASSIGN(st, empty);
941		assert(c != OUT);
942		st = step(m->g, startst, stopst, tmp, c, st);
943		SP("saft", st, c);
944		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
945		p++;
946	}
947
948	return(matchp);
949}
950
951
952/*
953 - step - map set of states reachable before char to set reachable after
954 == static states step(struct re_guts *g, sopno start, sopno stop, \
955 ==	states bef, int ch, states aft);
956 == #define	BOL	(OUT+1)
957 == #define	EOL	(BOL+1)
958 == #define	BOLEOL	(BOL+2)
959 == #define	NOTHING	(BOL+3)
960 == #define	BOW	(BOL+4)
961 == #define	EOW	(BOL+5)
962 == #define	CODEMAX	(BOL+5)		// highest code used
963 == #define	NONCHAR(c)	((c) > CHAR_MAX)
964 == #define	NNONCHAR	(CODEMAX-CHAR_MAX)
965 */
966static states
967step(
968    struct re_guts *g,
969    sopno start,		/* start state within strip */
970    sopno stop,			/* state after stop state within strip */
971    states bef,			/* states reachable before */
972    int ch,			/* character or NONCHAR code */
973    states aft)			/* states already known reachable after */
974{
975	cset *cs;
976	sop s;
977	sopno pc;
978	onestate here;		/* note, macros know this name */
979	sopno look;
980	int i;
981
982	_DIAGASSERT(g != NULL);
983
984	for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
985		s = g->strip[pc];
986		switch (OP(s)) {
987		case OEND:
988			assert(pc == stop-1);
989			break;
990		case OCHAR:
991			/* only characters can match */
992			assert(!NONCHAR(ch) || ch != (char)OPND(s));
993			if (ch == (char)OPND(s))
994				FWD(aft, bef, 1);
995			break;
996		case OBOL:
997			if (ch == BOL || ch == BOLEOL)
998				FWD(aft, bef, 1);
999			break;
1000		case OEOL:
1001			if (ch == EOL || ch == BOLEOL)
1002				FWD(aft, bef, 1);
1003			break;
1004		case OBOW:
1005			if (ch == BOW)
1006				FWD(aft, bef, 1);
1007			break;
1008		case OEOW:
1009			if (ch == EOW)
1010				FWD(aft, bef, 1);
1011			break;
1012		case OANY:
1013			if (!NONCHAR(ch))
1014				FWD(aft, bef, 1);
1015			break;
1016		case OANYOF:
1017			cs = &g->sets[OPND(s)];
1018			if (!NONCHAR(ch) && CHIN(cs, ch))
1019				FWD(aft, bef, 1);
1020			break;
1021		case OBACK_:		/* ignored here */
1022		case O_BACK:
1023			FWD(aft, aft, 1);
1024			break;
1025		case OPLUS_:		/* forward, this is just an empty */
1026			FWD(aft, aft, 1);
1027			break;
1028		case O_PLUS:		/* both forward and back */
1029			FWD(aft, aft, 1);
1030			i = ISSETBACK(aft, OPND(s));
1031			BACK(aft, aft, OPND(s));
1032			if (!i && ISSETBACK(aft, OPND(s))) {
1033				/* oho, must reconsider loop body */
1034				pc -= OPND(s) + 1;
1035				INIT(here, pc);
1036			}
1037			break;
1038		case OQUEST_:		/* two branches, both forward */
1039			FWD(aft, aft, 1);
1040			FWD(aft, aft, OPND(s));
1041			break;
1042		case O_QUEST:		/* just an empty */
1043			FWD(aft, aft, 1);
1044			break;
1045		case OLPAREN:		/* not significant here */
1046		case ORPAREN:
1047			FWD(aft, aft, 1);
1048			break;
1049		case OCH_:		/* mark the first two branches */
1050			FWD(aft, aft, 1);
1051			assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1052			FWD(aft, aft, OPND(s));
1053			break;
1054		case OOR1:		/* done a branch, find the O_CH */
1055			if (ISSTATEIN(aft, here)) {
1056				for (look = 1;
1057						OP(s = g->strip[pc+look]) != O_CH;
1058						look += OPND(s))
1059					assert(OP(s) == OOR2);
1060				FWD(aft, aft, look);
1061			}
1062			break;
1063		case OOR2:		/* propagate OCH_'s marking */
1064			FWD(aft, aft, 1);
1065			if (OP(g->strip[pc+OPND(s)]) != O_CH) {
1066				assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1067				FWD(aft, aft, OPND(s));
1068			}
1069			break;
1070		case O_CH:		/* just empty */
1071			FWD(aft, aft, 1);
1072			break;
1073		default:		/* ooooops... */
1074			assert(nope);
1075			break;
1076		}
1077	}
1078
1079	return(aft);
1080}
1081
1082#ifdef REDEBUG
1083/*
1084 - print - print a set of states
1085 == #ifdef REDEBUG
1086 == static void print(struct match *m, char *caption, states st, \
1087 ==	int ch, FILE *d);
1088 == #endif
1089 */
1090static void
1091print(
1092    struct match *m,
1093    char *caption,
1094    states st,
1095    int ch,
1096    FILE *d)
1097{
1098	struct re_guts *g = m->g;
1099	int i;
1100	int first = 1;
1101
1102	_DIAGASSERT(m != NULL);
1103	_DIAGASSERT(caption != NULL);
1104
1105	if (!(m->eflags&REG_TRACE))
1106		return;
1107
1108	_DIAGASSERT(d != NULL);
1109
1110	fprintf(d, "%s", caption);
1111	if (ch != '\0')
1112		fprintf(d, " %s", pchar(ch));
1113	for (i = 0; i < g->nstates; i++)
1114		if (ISSET(st, i)) {
1115			fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
1116			first = 0;
1117		}
1118	fprintf(d, "\n");
1119}
1120
1121/*
1122 - at - print current situation
1123 == #ifdef REDEBUG
1124 == static void at(struct match *m, char *title, char *start, char *stop, \
1125 ==						sopno startst, sopno stopst);
1126 == #endif
1127 */
1128static void
1129at(
1130    struct match *m,
1131    char *title,
1132    char *start,
1133    char *stop,
1134    sopno startst,
1135    sopno stopst)
1136{
1137
1138	_DIAGASSERT(m != NULL);
1139	_DIAGASSERT(title != NULL);
1140	_DIAGASSERT(start != NULL);
1141	_DIAGASSERT(stop != NULL);
1142
1143	if (!(m->eflags&REG_TRACE))
1144		return;
1145
1146	printf("%s %s-", title, pchar(*start));
1147	printf("%s ", pchar(*stop));
1148	printf("%ld-%ld\n", (long)startst, (long)stopst);
1149}
1150
1151#ifndef PCHARDONE
1152#define	PCHARDONE	/* never again */
1153/*
1154 - pchar - make a character printable
1155 == #ifdef REDEBUG
1156 == static char *pchar(int ch);
1157 == #endif
1158 *
1159 * Is this identical to regchar() over in debug.c?  Well, yes.  But a
1160 * duplicate here avoids having a debugging-capable regexec.o tied to
1161 * a matching debug.o, and this is convenient.  It all disappears in
1162 * the non-debug compilation anyway, so it doesn't matter much.
1163 */
1164static char *			/* -> representation */
1165pchar(
1166    int ch)
1167{
1168	static char pbuf[10];
1169
1170	if (isprint(ch) || ch == ' ')
1171		(void)snprintf(pbuf, sizeof pbuf, "%c", ch);
1172	else
1173		(void)snprintf(pbuf, sizeof pbuf, "\\%o", ch);
1174	return(pbuf);
1175}
1176#endif
1177#endif
1178
1179#undef	matcher
1180#undef	fast
1181#undef	slow
1182#undef	dissect
1183#undef	backref
1184#undef	step
1185#undef	print
1186#undef	at
1187#undef	match
1188#undef	nope
1189