1/*	$NetBSD: regcomp.c,v 1.36 2015/09/12 19:08:47 christos Exp $	*/
2
3/*-
4 * Copyright (c) 1992, 1993, 1994
5 *	The Regents of the University of California.  All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Henry Spencer.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
35 */
36
37/*-
38 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
39 *
40 * This code is derived from software contributed to Berkeley by
41 * Henry Spencer.
42 *
43 * Redistribution and use in source and binary forms, with or without
44 * modification, are permitted provided that the following conditions
45 * are met:
46 * 1. Redistributions of source code must retain the above copyright
47 *    notice, this list of conditions and the following disclaimer.
48 * 2. Redistributions in binary form must reproduce the above copyright
49 *    notice, this list of conditions and the following disclaimer in the
50 *    documentation and/or other materials provided with the distribution.
51 * 3. All advertising materials mentioning features or use of this software
52 *    must display the following acknowledgement:
53 *	This product includes software developed by the University of
54 *	California, Berkeley and its contributors.
55 * 4. Neither the name of the University nor the names of its contributors
56 *    may be used to endorse or promote products derived from this software
57 *    without specific prior written permission.
58 *
59 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
60 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
62 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
69 * SUCH DAMAGE.
70 *
71 *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
72 */
73
74#include <sys/cdefs.h>
75#if defined(LIBC_SCCS) && !defined(lint)
76#if 0
77static char sccsid[] = "@(#)regcomp.c	8.5 (Berkeley) 3/20/94";
78#else
79__RCSID("$NetBSD: regcomp.c,v 1.36 2015/09/12 19:08:47 christos Exp $");
80#endif
81#endif /* LIBC_SCCS and not lint */
82
83#include "namespace.h"
84#include <sys/types.h>
85
86#include <assert.h>
87#include <ctype.h>
88#include <limits.h>
89#include <stdio.h>
90#include <stdlib.h>
91#include <string.h>
92#include <regex.h>
93
94#ifdef __weak_alias
95__weak_alias(regcomp,_regcomp)
96#endif
97
98#include "utils.h"
99#include "regex2.h"
100
101#include "cclass.h"
102#include "cname.h"
103
104/*
105 * parse structure, passed up and down to avoid global variables and
106 * other clumsinesses
107 */
108struct parse {
109	const char *next;	/* next character in RE */
110	const char *end;	/* end of string (-> NUL normally) */
111	int error;		/* has an error been seen? */
112	sop *strip;		/* malloced strip */
113	sopno ssize;		/* malloced strip size (allocated) */
114	sopno slen;		/* malloced strip length (used) */
115	size_t ncsalloc;	/* number of csets allocated */
116	struct re_guts *g;
117#	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
118	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
119	sopno pend[NPAREN];	/* -> ) ([0] unused) */
120};
121
122/* ========= begin header generated by ./mkh ========= */
123#ifdef __cplusplus
124extern "C" {
125#endif
126
127/* === regcomp.c === */
128static void p_ere(struct parse *p, int stop, size_t reclimit);
129static void p_ere_exp(struct parse *p, size_t reclimit);
130static void p_str(struct parse *p);
131static void p_bre(struct parse *p, int end1, int end2, size_t reclimit);
132static int p_simp_re(struct parse *p, int starordinary, size_t reclimit);
133static int p_count(struct parse *p);
134static void p_bracket(struct parse *p);
135static void p_b_term(struct parse *p, cset *cs);
136static void p_b_cclass(struct parse *p, cset *cs);
137static void p_b_eclass(struct parse *p, cset *cs);
138static char p_b_symbol(struct parse *p);
139static char p_b_coll_elem(struct parse *p, int endc);
140static int othercase(int ch);
141static void bothcases(struct parse *p, int ch);
142static void ordinary(struct parse *p, int ch);
143static void nonnewline(struct parse *p);
144static void repeat(struct parse *p, sopno start, int from, int to, size_t reclimit);
145static int seterr(struct parse *p, int e);
146static cset *allocset(struct parse *p);
147static void freeset(struct parse *p, cset *cs);
148static sopno freezeset(struct parse *p, cset *cs);
149static int firstch(struct parse *p, cset *cs);
150static int nch(struct parse *p, cset *cs);
151static void mcadd(struct parse *p, cset *cs, const char *cp);
152#if 0
153static void mcsub(cset *cs, char *cp);
154static int mcin(cset *cs, char *cp);
155static char *mcfind(cset *cs, char *cp);
156#endif
157static void mcinvert(struct parse *p, cset *cs);
158static void mccase(struct parse *p, cset *cs);
159static int isinsets(struct re_guts *g, int c);
160static int samesets(struct re_guts *g, int c1, int c2);
161static void categorize(struct parse *p, struct re_guts *g);
162static sopno dupl(struct parse *p, sopno start, sopno finish);
163static void doemit(struct parse *p, sop op, sopno opnd);
164static void doinsert(struct parse *p, sop op, sopno opnd, sopno pos);
165static void dofwd(struct parse *p, sopno pos, sopno value);
166static int enlarge(struct parse *p, sopno size);
167static void stripsnug(struct parse *p, struct re_guts *g);
168static void findmust(struct parse *p, struct re_guts *g);
169static sopno pluscount(struct parse *p, struct re_guts *g);
170
171#ifdef __cplusplus
172}
173#endif
174/* ========= end header generated by ./mkh ========= */
175
176static char nuls[10];		/* place to point scanner in event of error */
177
178/*
179 * macros for use with parse structure
180 * BEWARE:  these know that the parse structure is named `p' !!!
181 */
182#define	PEEK()	(*p->next)
183#define	PEEK2()	(*(p->next+1))
184#define	MORE()	(p->next < p->end)
185#define	MORE2()	(p->next+1 < p->end)
186#define	SEE(c)	(MORE() && PEEK() == (c))
187#define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
188#define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
189#define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
190#define	NEXT()	(p->next++)
191#define	NEXT2()	(p->next += 2)
192#define	NEXTn(n)	(p->next += (n))
193#define	GETNEXT()	(*p->next++)
194#define	SETERROR(e)	seterr(p, (e))
195#define	REQUIRE(co, e)	(void) ((co) || SETERROR(e))
196#define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
197#define	MUSTEAT(c, e)	(void) (REQUIRE(MORE() && GETNEXT() == (c), e))
198#define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
199#define	EMIT(op, sopnd)	doemit(p, (sop)(op), sopnd)
200#define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
201#define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
202#define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
203#define	HERE()		(p->slen)
204#define	THERE()		(p->slen - 1)
205#define	THERETHERE()	(p->slen - 2)
206#define	DROP(n)	(p->slen -= (n))
207
208#ifndef NDEBUG
209static int never = 0;		/* for use in asserts; shuts lint up */
210#else
211#define	never	0		/* some <assert.h>s have bugs too */
212#endif
213
214#define	MEMLIMIT	0x8000000
215#define MEMSIZE(p) \
216	((p)->ncsalloc / CHAR_BIT * (p)->g->csetsize + \
217	(p)->ncsalloc * sizeof(cset) + \
218	(p)->ssize * sizeof(sop))
219#define	RECLIMIT	256
220
221/*
222 - regcomp - interface for parser and compilation
223 = extern int regcomp(regex_t *, const char *, int);
224 = #define	REG_BASIC	0000
225 = #define	REG_EXTENDED	0001
226 = #define	REG_ICASE	0002
227 = #define	REG_NOSUB	0004
228 = #define	REG_NEWLINE	0010
229 = #define	REG_NOSPEC	0020
230 = #define	REG_PEND	0040
231 = #define	REG_DUMP	0200
232 */
233int				/* 0 success, otherwise REG_something */
234regcomp(
235    regex_t *preg,
236    const char *pattern,
237    int cflags)
238{
239	struct parse pa;
240	struct re_guts *g;
241	struct parse *p = &pa;
242	int i;
243	size_t len;
244#ifdef REDEBUG
245#	define	GOODFLAGS(f)	(f)
246#else
247#	define	GOODFLAGS(f)	((f)&~REG_DUMP)
248#endif
249
250	_DIAGASSERT(preg != NULL);
251	_DIAGASSERT(pattern != NULL);
252
253	cflags = GOODFLAGS(cflags);
254	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
255		return(REG_INVARG);
256
257	if (cflags&REG_PEND) {
258		if (preg->re_endp < pattern)
259			return(REG_INVARG);
260		len = preg->re_endp - pattern;
261	} else
262		len = strlen(pattern);
263
264	/* do the mallocs early so failure handling is easy */
265	g = malloc(sizeof(struct re_guts) + (NC - 1) * sizeof(cat_t));
266	if (g == NULL)
267		return(REG_ESPACE);
268	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
269	p->strip = calloc(p->ssize, sizeof(sop));
270	p->slen = 0;
271	if (p->strip == NULL) {
272		free(g);
273		return(REG_ESPACE);
274	}
275
276	/* set things up */
277	p->g = g;
278	p->next = pattern;
279	p->end = p->next + len;
280	p->error = 0;
281	p->ncsalloc = 0;
282	for (i = 0; i < NPAREN; i++) {
283		p->pbegin[i] = 0;
284		p->pend[i] = 0;
285	}
286	g->csetsize = NC;
287	g->sets = NULL;
288	g->setbits = NULL;
289	g->ncsets = 0;
290	g->cflags = cflags;
291	g->iflags = 0;
292	g->nbol = 0;
293	g->neol = 0;
294	g->must = NULL;
295	g->mlen = 0;
296	g->nsub = 0;
297	g->ncategories = 1;	/* category 0 is "everything else" */
298	g->categories = &g->catspace[-(CHAR_MIN)];
299	(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
300	g->backrefs = 0;
301
302	/* do it */
303	EMIT(OEND, 0);
304	g->firststate = THERE();
305	if (cflags&REG_EXTENDED)
306		p_ere(p, OUT, 0);
307	else if (cflags&REG_NOSPEC)
308		p_str(p);
309	else
310		p_bre(p, OUT, OUT, 0);
311	EMIT(OEND, 0);
312	g->laststate = THERE();
313
314	/* tidy up loose ends and fill things in */
315	categorize(p, g);
316	stripsnug(p, g);
317	findmust(p, g);
318	g->nplus = pluscount(p, g);
319	g->magic = MAGIC2;
320	preg->re_nsub = g->nsub;
321	preg->re_g = g;
322	preg->re_magic = MAGIC1;
323#ifndef REDEBUG
324	/* not debugging, so can't rely on the assert() in regexec() */
325	if (g->iflags&BAD)
326		SETERROR(REG_ASSERT);
327#endif
328
329	/* win or lose, we're done */
330	if (p->error != 0)	/* lose */
331		regfree(preg);
332	return(p->error);
333}
334
335/*
336 - p_ere - ERE parser top level, concatenation and alternation
337 == static void p_ere(struct parse *p, int stop, size_t reclimit);
338 */
339static void
340p_ere(
341    struct parse *p,
342    int stop,			/* character this ERE should end at */
343    size_t reclimit)
344{
345	char c;
346	sopno prevback = 0;	/* pacify gcc */
347	sopno prevfwd = 0; 	/* pacify gcc */
348	sopno conc;
349	int first = 1;		/* is this the first alternative? */
350
351	_DIAGASSERT(p != NULL);
352
353	if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) {
354		p->error = REG_ESPACE;
355		return;
356	}
357
358	for (;;) {
359		/* do a bunch of concatenated expressions */
360		conc = HERE();
361		while (MORE() && (c = PEEK()) != '|' && c != stop)
362			p_ere_exp(p, reclimit);
363		REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
364
365		if (!EAT('|'))
366			break;		/* NOTE BREAK OUT */
367
368		if (first) {
369			INSERT(OCH_, conc);	/* offset is wrong */
370			prevfwd = conc;
371			prevback = conc;
372			first = 0;
373		}
374		ASTERN(OOR1, prevback);
375		prevback = THERE();
376		AHEAD(prevfwd);			/* fix previous offset */
377		prevfwd = HERE();
378		EMIT(OOR2, 0);			/* offset is very wrong */
379	}
380
381	if (!first) {		/* tail-end fixups */
382		AHEAD(prevfwd);
383		ASTERN(O_CH, prevback);
384	}
385
386	assert(!MORE() || SEE(stop));
387}
388
389/*
390 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
391 == static void p_ere_exp(struct parse *p, size_t reclimit);
392 */
393static void
394p_ere_exp(
395    struct parse *p,
396    size_t reclimit)
397{
398	char c;
399	sopno pos;
400	int count;
401	int count2;
402	sopno subno;
403	int wascaret = 0;
404
405	_DIAGASSERT(p != NULL);
406
407	assert(MORE());		/* caller should have ensured this */
408	c = GETNEXT();
409
410	pos = HERE();
411	switch (c) {
412	case '(':
413		REQUIRE(MORE(), REG_EPAREN);
414		p->g->nsub++;
415		subno = p->g->nsub;
416		if (subno < NPAREN)
417			p->pbegin[subno] = HERE();
418		EMIT(OLPAREN, subno);
419		if (!SEE(')'))
420			p_ere(p, ')', reclimit);
421		if (subno < NPAREN) {
422			p->pend[subno] = HERE();
423			assert(p->pend[subno] != 0);
424		}
425		EMIT(ORPAREN, subno);
426		MUSTEAT(')', REG_EPAREN);
427		break;
428#ifndef POSIX_MISTAKE
429	case ')':		/* happens only if no current unmatched ( */
430		/*
431		 * You may ask, why the ifndef?  Because I didn't notice
432		 * this until slightly too late for 1003.2, and none of the
433		 * other 1003.2 regular-expression reviewers noticed it at
434		 * all.  So an unmatched ) is legal POSIX, at least until
435		 * we can get it fixed.
436		 */
437		SETERROR(REG_EPAREN);
438		break;
439#endif
440	case '^':
441		EMIT(OBOL, 0);
442		p->g->iflags |= USEBOL;
443		p->g->nbol++;
444		wascaret = 1;
445		break;
446	case '$':
447		EMIT(OEOL, 0);
448		p->g->iflags |= USEEOL;
449		p->g->neol++;
450		break;
451	case '|':
452		SETERROR(REG_EMPTY);
453		break;
454	case '*':
455	case '+':
456	case '?':
457		SETERROR(REG_BADRPT);
458		break;
459	case '.':
460		if (p->g->cflags&REG_NEWLINE)
461			nonnewline(p);
462		else
463			EMIT(OANY, 0);
464		break;
465	case '[':
466		p_bracket(p);
467		break;
468	case '\\':
469		REQUIRE(MORE(), REG_EESCAPE);
470		c = GETNEXT();
471		ordinary(p, c);
472		break;
473	case '{':		/* okay as ordinary except if digit follows */
474		REQUIRE(!MORE() || !isdigit((unsigned char)PEEK()), REG_BADRPT);
475		/* FALLTHROUGH */
476	default:
477		ordinary(p, c);
478		break;
479	}
480
481	if (!MORE())
482		return;
483	c = PEEK();
484	/* we call { a repetition if followed by a digit */
485	if (!( c == '*' || c == '+' || c == '?' ||
486	    (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) ))
487		return;		/* no repetition, we're done */
488	NEXT();
489
490	REQUIRE(!wascaret, REG_BADRPT);
491	switch (c) {
492	case '*':	/* implemented as +? */
493		/* this case does not require the (y|) trick, noKLUDGE */
494		INSERT(OPLUS_, pos);
495		ASTERN(O_PLUS, pos);
496		INSERT(OQUEST_, pos);
497		ASTERN(O_QUEST, pos);
498		break;
499	case '+':
500		INSERT(OPLUS_, pos);
501		ASTERN(O_PLUS, pos);
502		break;
503	case '?':
504		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
505		INSERT(OCH_, pos);		/* offset slightly wrong */
506		ASTERN(OOR1, pos);		/* this one's right */
507		AHEAD(pos);			/* fix the OCH_ */
508		EMIT(OOR2, 0);			/* offset very wrong... */
509		AHEAD(THERE());			/* ...so fix it */
510		ASTERN(O_CH, THERETHERE());
511		break;
512	case '{':
513		count = p_count(p);
514		if (EAT(',')) {
515			if (isdigit((unsigned char)PEEK())) {
516				count2 = p_count(p);
517				REQUIRE(count <= count2, REG_BADBR);
518			} else		/* single number with comma */
519				count2 = INFINITY;
520		} else		/* just a single number */
521			count2 = count;
522		repeat(p, pos, count, count2, 0);
523		if (!EAT('}')) {	/* error heuristics */
524			while (MORE() && PEEK() != '}')
525				NEXT();
526			REQUIRE(MORE(), REG_EBRACE);
527			SETERROR(REG_BADBR);
528		}
529		break;
530	}
531
532	if (!MORE())
533		return;
534	c = PEEK();
535	if (!( c == '*' || c == '+' || c == '?' ||
536	    (c == '{' && MORE2() && isdigit((unsigned char)PEEK2())) ) )
537		return;
538	SETERROR(REG_BADRPT);
539}
540
541/*
542 - p_str - string (no metacharacters) "parser"
543 == static void p_str(struct parse *p);
544 */
545static void
546p_str(
547    struct parse *p)
548{
549
550	_DIAGASSERT(p != NULL);
551
552	REQUIRE(MORE(), REG_EMPTY);
553	while (MORE())
554		ordinary(p, GETNEXT());
555}
556
557/*
558 - p_bre - BRE parser top level, anchoring and concatenation
559 == static void p_bre(struct parse *p, int end1, \
560 ==	int end2, size_t reclimit);
561 * Giving end1 as OUT essentially eliminates the end1/end2 check.
562 *
563 * This implementation is a bit of a kludge, in that a trailing $ is first
564 * taken as an ordinary character and then revised to be an anchor.  The
565 * only undesirable side effect is that '$' gets included as a character
566 * category in such cases.  This is fairly harmless; not worth fixing.
567 * The amount of lookahead needed to avoid this kludge is excessive.
568 */
569static void
570p_bre(
571    struct parse *p,
572    int end1,		/* first terminating character */
573    int end2,		/* second terminating character */
574    size_t reclimit)
575{
576	sopno start;
577	int first = 1;			/* first subexpression? */
578	int wasdollar = 0;
579
580	_DIAGASSERT(p != NULL);
581
582	if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) {
583		p->error = REG_ESPACE;
584		return;
585	}
586
587	start = HERE();
588
589	if (EAT('^')) {
590		EMIT(OBOL, 0);
591		p->g->iflags |= USEBOL;
592		p->g->nbol++;
593	}
594	while (MORE() && !SEETWO(end1, end2)) {
595		wasdollar = p_simp_re(p, first, reclimit);
596		first = 0;
597	}
598	if (wasdollar) {	/* oops, that was a trailing anchor */
599		DROP(1);
600		EMIT(OEOL, 0);
601		p->g->iflags |= USEEOL;
602		p->g->neol++;
603	}
604
605	REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
606}
607
608/*
609 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
610 == static int p_simp_re(struct parse *p, int starordinary, size_t reclimit);
611 */
612static int			/* was the simple RE an unbackslashed $? */
613p_simp_re(
614    struct parse *p,
615    int starordinary,		/* is a leading * an ordinary character? */
616    size_t reclimit)
617{
618	int c;
619	int count;
620	int count2;
621	sopno pos, i;
622	sopno subno;
623#	define	BACKSL	(1<<CHAR_BIT)
624
625	_DIAGASSERT(p != NULL);
626
627	pos = HERE();		/* repetion op, if any, covers from here */
628
629	assert(MORE());		/* caller should have ensured this */
630	c = GETNEXT();
631	if (c == '\\') {
632		REQUIRE(MORE(), REG_EESCAPE);
633		c = BACKSL | (unsigned char)GETNEXT();
634	}
635	switch (c) {
636	case '.':
637		if (p->g->cflags&REG_NEWLINE)
638			nonnewline(p);
639		else
640			EMIT(OANY, 0);
641		break;
642	case '[':
643		p_bracket(p);
644		break;
645	case BACKSL|'{':
646		SETERROR(REG_BADRPT);
647		break;
648	case BACKSL|'(':
649		p->g->nsub++;
650		subno = p->g->nsub;
651		if (subno < NPAREN)
652			p->pbegin[subno] = HERE();
653		EMIT(OLPAREN, subno);
654		/* the MORE here is an error heuristic */
655		if (MORE() && !SEETWO('\\', ')'))
656			p_bre(p, '\\', ')', reclimit);
657		if (subno < NPAREN) {
658			p->pend[subno] = HERE();
659			assert(p->pend[subno] != 0);
660		}
661		EMIT(ORPAREN, subno);
662		REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
663		break;
664	case BACKSL|')':	/* should not get here -- must be user */
665	case BACKSL|'}':
666		SETERROR(REG_EPAREN);
667		break;
668	case BACKSL|'1':
669	case BACKSL|'2':
670	case BACKSL|'3':
671	case BACKSL|'4':
672	case BACKSL|'5':
673	case BACKSL|'6':
674	case BACKSL|'7':
675	case BACKSL|'8':
676	case BACKSL|'9':
677		i = (c&~BACKSL) - '0';
678		assert(i < NPAREN);
679		if (p->pend[i] != 0) {
680			assert(i <= p->g->nsub);
681			EMIT(OBACK_, i);
682			assert(p->pbegin[i] != 0);
683			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
684			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
685			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
686			EMIT(O_BACK, i);
687		} else
688			SETERROR(REG_ESUBREG);
689		p->g->backrefs = 1;
690		break;
691	case '*':
692		REQUIRE(starordinary, REG_BADRPT);
693		/* FALLTHROUGH */
694	default:
695		ordinary(p, c &~ BACKSL);
696		break;
697	}
698
699	if (EAT('*')) {		/* implemented as +? */
700		/* this case does not require the (y|) trick, noKLUDGE */
701		INSERT(OPLUS_, pos);
702		ASTERN(O_PLUS, pos);
703		INSERT(OQUEST_, pos);
704		ASTERN(O_QUEST, pos);
705	} else if (EATTWO('\\', '{')) {
706		count = p_count(p);
707		if (EAT(',')) {
708			if (MORE() && isdigit((unsigned char)PEEK())) {
709				count2 = p_count(p);
710				REQUIRE(count <= count2, REG_BADBR);
711			} else		/* single number with comma */
712				count2 = INFINITY;
713		} else		/* just a single number */
714			count2 = count;
715		repeat(p, pos, count, count2, 0);
716		if (!EATTWO('\\', '}')) {	/* error heuristics */
717			while (MORE() && !SEETWO('\\', '}'))
718				NEXT();
719			REQUIRE(MORE(), REG_EBRACE);
720			SETERROR(REG_BADBR);
721		}
722	} else if (c == (unsigned char)'$')	/* $ (but not \$) ends it */
723		return(1);
724
725	return(0);
726}
727
728/*
729 - p_count - parse a repetition count
730 == static int p_count(struct parse *p);
731 */
732static int			/* the value */
733p_count(
734    struct parse *p)
735{
736	int count = 0;
737	int ndigits = 0;
738
739	_DIAGASSERT(p != NULL);
740
741	while (MORE() && isdigit((unsigned char)PEEK()) && count <= DUPMAX) {
742		count = count*10 + (GETNEXT() - '0');
743		ndigits++;
744	}
745
746	REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
747	return(count);
748}
749
750/*
751 - p_bracket - parse a bracketed character list
752 == static void p_bracket(struct parse *p);
753 *
754 * Note a significant property of this code:  if the allocset() did SETERROR,
755 * no set operations are done.
756 */
757static void
758p_bracket(
759    struct parse *p)
760{
761	cset *cs;
762	int invert = 0;
763	_DIAGASSERT(p != NULL);
764
765	cs = allocset(p);
766	if (cs == NULL)
767		return;
768
769	/* Dept of Truly Sickening Special-Case Kludges */
770	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]",
771					    (size_t)6) == 0) {
772		EMIT(OBOW, 0);
773		NEXTn(6);
774		return;
775	}
776	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]",
777					    (size_t)6) == 0) {
778		EMIT(OEOW, 0);
779		NEXTn(6);
780		return;
781	}
782
783	if (EAT('^'))
784		invert++;	/* make note to invert set at end */
785	if (EAT(']'))
786		CHadd(cs, ']');
787	else if (EAT('-'))
788		CHadd(cs, '-');
789	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
790		p_b_term(p, cs);
791	if (EAT('-'))
792		CHadd(cs, '-');
793	MUSTEAT(']', REG_EBRACK);
794
795	if (p->error != 0)	/* don't mess things up further */
796		return;
797
798	if (p->g->cflags&REG_ICASE) {
799		ssize_t i;
800		int ci;
801
802		for (i = p->g->csetsize - 1; i >= 0; i--)
803			if (CHIN(cs, i) && isalpha(i)) {
804				ci = othercase((int)i);
805				if (ci != i)
806					CHadd(cs, ci);
807			}
808		if (cs->multis != NULL)
809			mccase(p, cs);
810	}
811	if (invert) {
812		ssize_t i;
813
814		for (i = p->g->csetsize - 1; i >= 0; i--)
815			if (CHIN(cs, i))
816				CHsub(cs, (int)i);
817			else
818				CHadd(cs, (int)i);
819		if (p->g->cflags&REG_NEWLINE)
820			CHsub(cs, '\n');
821		if (cs->multis != NULL)
822			mcinvert(p, cs);
823	}
824
825	assert(cs->multis == NULL);		/* xxx */
826
827	if (nch(p, cs) == 1) {		/* optimize singleton sets */
828		ordinary(p, firstch(p, cs));
829		freeset(p, cs);
830	} else
831		EMIT(OANYOF, freezeset(p, cs));
832}
833
834/*
835 - p_b_term - parse one term of a bracketed character list
836 == static void p_b_term(struct parse *p, cset *cs);
837 */
838static void
839p_b_term(
840    struct parse *p,
841    cset *cs)
842{
843	char c;
844	char start, finish;
845	int i;
846
847	_DIAGASSERT(p != NULL);
848	_DIAGASSERT(cs != NULL);
849
850	/* classify what we've got */
851	switch ((MORE()) ? PEEK() : '\0') {
852	case '[':
853		c = (MORE2()) ? PEEK2() : '\0';
854		break;
855
856	case '-':
857		SETERROR(REG_ERANGE);
858		return;			/* NOTE RETURN */
859
860	default:
861		c = '\0';
862		break;
863	}
864
865	switch (c) {
866	case ':':		/* character class */
867		NEXT2();
868		REQUIRE(MORE(), REG_EBRACK);
869		c = PEEK();
870		REQUIRE(c != '-' && c != ']', REG_ECTYPE);
871		p_b_cclass(p, cs);
872		REQUIRE(MORE(), REG_EBRACK);
873		REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
874		break;
875	case '=':		/* equivalence class */
876		NEXT2();
877		REQUIRE(MORE(), REG_EBRACK);
878		c = PEEK();
879		REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
880		p_b_eclass(p, cs);
881		REQUIRE(MORE(), REG_EBRACK);
882		REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
883		break;
884	default:		/* symbol, ordinary character, or range */
885/* xxx revision needed for multichar stuff */
886		start = p_b_symbol(p);
887		if (SEE('-') && MORE2() && PEEK2() != ']') {
888			/* range */
889			NEXT();
890			if (EAT('-'))
891				finish = '-';
892			else
893				finish = p_b_symbol(p);
894		} else
895			finish = start;
896/* xxx what about signed chars here... */
897		REQUIRE(start <= finish, REG_ERANGE);
898		for (i = start; i <= finish; i++)
899			CHadd(cs, i);
900		break;
901	}
902}
903
904/*
905 - p_b_cclass - parse a character-class name and deal with it
906 == static void p_b_cclass(struct parse *p, cset *cs);
907 */
908static void
909p_b_cclass(
910    struct parse *p,
911    cset *cs)
912{
913	const char *sp;
914	const struct cclass *cp;
915	size_t len;
916	const char *u;
917	char c;
918
919	_DIAGASSERT(p != NULL);
920	_DIAGASSERT(cs != NULL);
921
922	sp = p->next;
923
924	while (MORE() && isalpha((unsigned char)PEEK()))
925		NEXT();
926	len = p->next - sp;
927	for (cp = cclasses; cp->name != NULL; cp++)
928		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
929			break;
930	if (cp->name == NULL) {
931		/* oops, didn't find it */
932		SETERROR(REG_ECTYPE);
933		return;
934	}
935
936	u = cp->chars;
937	while ((c = *u++) != '\0')
938		CHadd(cs, c);
939	for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
940		MCadd(p, cs, u);
941}
942
943/*
944 - p_b_eclass - parse an equivalence-class name and deal with it
945 == static void p_b_eclass(struct parse *p, cset *cs);
946 *
947 * This implementation is incomplete. xxx
948 */
949static void
950p_b_eclass(
951    struct parse *p,
952    cset *cs)
953{
954	char c;
955
956	_DIAGASSERT(p != NULL);
957	_DIAGASSERT(cs != NULL);
958
959	c = p_b_coll_elem(p, '=');
960	CHadd(cs, c);
961}
962
963/*
964 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
965 == static char p_b_symbol(struct parse *p);
966 */
967static char			/* value of symbol */
968p_b_symbol(
969    struct parse *p)
970{
971	char value;
972
973	_DIAGASSERT(p != NULL);
974
975	REQUIRE(MORE(), REG_EBRACK);
976	if (!EATTWO('[', '.'))
977		return(GETNEXT());
978
979	/* collating symbol */
980	value = p_b_coll_elem(p, '.');
981	REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
982	return(value);
983}
984
985/*
986 - p_b_coll_elem - parse a collating-element name and look it up
987 == static char p_b_coll_elem(struct parse *p, int endc);
988 */
989static char			/* value of collating element */
990p_b_coll_elem(
991    struct parse *p,
992    int endc)			/* name ended by endc,']' */
993{
994	const char *sp;
995	const struct cname *cp;
996	size_t len;
997
998	_DIAGASSERT(p != NULL);
999
1000	sp = p->next;
1001
1002	while (MORE() && !SEETWO(endc, ']'))
1003		NEXT();
1004	if (!MORE()) {
1005		SETERROR(REG_EBRACK);
1006		return(0);
1007	}
1008	len = p->next - sp;
1009	for (cp = cnames; cp->name != NULL; cp++)
1010		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
1011			return(cp->code);	/* known name */
1012	if (len == 1)
1013		return(*sp);	/* single character */
1014	SETERROR(REG_ECOLLATE);			/* neither */
1015	return(0);
1016}
1017
1018/*
1019 - othercase - return the case counterpart of an alphabetic
1020 == static int othercase(int ch);
1021 */
1022static int			/* if no counterpart, return ch */
1023othercase(
1024    int ch)
1025{
1026	assert(isalpha(ch));
1027	if (isupper(ch))
1028		return(tolower(ch));
1029	else if (islower(ch))
1030		return(toupper(ch));
1031	else			/* peculiar, but could happen */
1032		return(ch);
1033}
1034
1035/*
1036 - bothcases - emit a dualcase version of a two-case character
1037 == static void bothcases(struct parse *p, int ch);
1038 *
1039 * Boy, is this implementation ever a kludge...
1040 */
1041static void
1042bothcases(
1043    struct parse *p,
1044    int ch)
1045{
1046	const char *oldnext;
1047	const char *oldend;
1048	char bracket[3];
1049
1050	_DIAGASSERT(p != NULL);
1051
1052	oldnext = p->next;
1053	oldend = p->end;
1054
1055	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
1056	p->next = bracket;
1057	p->end = bracket+2;
1058	bracket[0] = ch;
1059	bracket[1] = ']';
1060	bracket[2] = '\0';
1061	p_bracket(p);
1062	assert(p->next == bracket+2);
1063	p->next = oldnext;
1064	p->end = oldend;
1065}
1066
1067/*
1068 - ordinary - emit an ordinary character
1069 == static void ordinary(struct parse *p, int ch);
1070 */
1071static void
1072ordinary(
1073    struct parse *p,
1074    int ch)
1075{
1076	cat_t *cap;
1077	unsigned char uc = (unsigned char)ch;
1078
1079	_DIAGASSERT(p != NULL);
1080
1081	cap = p->g->categories;
1082	if ((p->g->cflags & REG_ICASE) && isalpha(uc) && othercase(uc) != uc)
1083		bothcases(p, uc);
1084	else {
1085		EMIT(OCHAR, (sopno)uc);
1086		if (cap[uc] == 0) {
1087			_DIAGASSERT(__type_fit(unsigned char,
1088			    p->g->ncategories + 1));
1089			cap[uc] = (unsigned char)p->g->ncategories++;
1090		}
1091	}
1092}
1093
1094/*
1095 - nonnewline - emit REG_NEWLINE version of OANY
1096 == static void nonnewline(struct parse *p);
1097 *
1098 * Boy, is this implementation ever a kludge...
1099 */
1100static void
1101nonnewline(
1102    struct parse *p)
1103{
1104	const char *oldnext;
1105	const char *oldend;
1106	char bracket[4];
1107
1108	_DIAGASSERT(p != NULL);
1109
1110	oldnext = p->next;
1111	oldend = p->end;
1112
1113	p->next = bracket;
1114	p->end = bracket+3;
1115	bracket[0] = '^';
1116	bracket[1] = '\n';
1117	bracket[2] = ']';
1118	bracket[3] = '\0';
1119	p_bracket(p);
1120	assert(p->next == bracket+3);
1121	p->next = oldnext;
1122	p->end = oldend;
1123}
1124
1125/*
1126 - repeat - generate code for a bounded repetition, recursively if needed
1127 == static void repeat(struct parse *p, sopno start, int from, int to,
1128 == size_t reclimit);
1129 */
1130static void
1131repeat(
1132    struct parse *p,
1133    sopno start,		/* operand from here to end of strip */
1134    int from,			/* repeated from this number */
1135    int to,			/* to this number of times (maybe INFINITY) */
1136    size_t reclimit)
1137{
1138	sopno finish;
1139#	define	N	2
1140#	define	INF	3
1141#	define	REP(f, t)	((f)*8 + (t))
1142#	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1143	sopno copy;
1144
1145	_DIAGASSERT(p != NULL);
1146
1147	if (reclimit++ > RECLIMIT)
1148		p->error = REG_ESPACE;
1149	if (p->error)
1150		return;
1151
1152	finish = HERE();
1153
1154	assert(from <= to);
1155
1156	switch (REP(MAP(from), MAP(to))) {
1157	case REP(0, 0):			/* must be user doing this */
1158		DROP(finish-start);	/* drop the operand */
1159		break;
1160	case REP(0, 1):			/* as x{1,1}? */
1161	case REP(0, N):			/* as x{1,n}? */
1162	case REP(0, INF):		/* as x{1,}? */
1163		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1164		INSERT(OCH_, start);		/* offset is wrong... */
1165		repeat(p, start+1, 1, to, reclimit);
1166		ASTERN(OOR1, start);
1167		AHEAD(start);			/* ... fix it */
1168		EMIT(OOR2, 0);
1169		AHEAD(THERE());
1170		ASTERN(O_CH, THERETHERE());
1171		break;
1172	case REP(1, 1):			/* trivial case */
1173		/* done */
1174		break;
1175	case REP(1, N):			/* as x?x{1,n-1} */
1176		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1177		INSERT(OCH_, start);
1178		ASTERN(OOR1, start);
1179		AHEAD(start);
1180		EMIT(OOR2, 0);			/* offset very wrong... */
1181		AHEAD(THERE());			/* ...so fix it */
1182		ASTERN(O_CH, THERETHERE());
1183		copy = dupl(p, start+1, finish+1);
1184		assert(copy == finish+4);
1185		repeat(p, copy, 1, to-1, reclimit);
1186		break;
1187	case REP(1, INF):		/* as x+ */
1188		INSERT(OPLUS_, start);
1189		ASTERN(O_PLUS, start);
1190		break;
1191	case REP(N, N):			/* as xx{m-1,n-1} */
1192		copy = dupl(p, start, finish);
1193		repeat(p, copy, from-1, to-1, reclimit);
1194		break;
1195	case REP(N, INF):		/* as xx{n-1,INF} */
1196		copy = dupl(p, start, finish);
1197		repeat(p, copy, from-1, to, reclimit);
1198		break;
1199	default:			/* "can't happen" */
1200		SETERROR(REG_ASSERT);	/* just in case */
1201		break;
1202	}
1203}
1204
1205/*
1206 - seterr - set an error condition
1207 == static int seterr(struct parse *p, int e);
1208 */
1209static int			/* useless but makes type checking happy */
1210seterr(
1211    struct parse *p,
1212    int e)
1213{
1214
1215	_DIAGASSERT(p != NULL);
1216
1217	if (p->error == 0)	/* keep earliest error condition */
1218		p->error = e;
1219	p->next = nuls;		/* try to bring things to a halt */
1220	p->end = nuls;
1221	return(0);		/* make the return value well-defined */
1222}
1223
1224/*
1225 - allocset - allocate a set of characters for []
1226 == static cset *allocset(struct parse *p);
1227 */
1228static cset *
1229allocset(
1230    struct parse *p)
1231{
1232	size_t no;
1233	size_t nc;
1234	size_t nbytes;
1235	cset *cs;
1236	size_t css;
1237	size_t i;
1238	void *old_ptr;
1239
1240	_DIAGASSERT(p != NULL);
1241
1242	no = p->g->ncsets++;
1243	css = (size_t)p->g->csetsize;
1244	if (no >= p->ncsalloc) {	/* need another column of space */
1245		p->ncsalloc += CHAR_BIT;
1246		nc = p->ncsalloc;
1247		assert(nc % CHAR_BIT == 0);
1248		nbytes = nc / CHAR_BIT * css;
1249		if (MEMSIZE(p) > MEMLIMIT)
1250			goto oomem;
1251		if (reallocarr(&p->g->sets, nc, sizeof(cset)))
1252			goto oomem;
1253		old_ptr = p->g->setbits;
1254		if (reallocarr(&p->g->setbits, nc / CHAR_BIT, css)) {
1255			free(old_ptr);
1256			goto oomem;
1257		}
1258		if (old_ptr != p->g->setbits) {
1259			for (i = 0; i < no; i++)
1260				p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1261		}
1262		(void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
1263	}
1264
1265	cs = &p->g->sets[no];
1266	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1267	cs->mask = 1 << (unsigned int)((no) % CHAR_BIT);
1268	cs->hash = 0;
1269	cs->smultis = 0;
1270	cs->multis = NULL;
1271
1272	return(cs);
1273
1274oomem:
1275	SETERROR(REG_ESPACE);
1276	/* caller's responsibility not to do set ops */
1277	return NULL;
1278}
1279
1280/*
1281 - freeset - free a now-unused set
1282 == static void freeset(struct parse *p, cset *cs);
1283 */
1284static void
1285freeset(
1286    struct parse *p,
1287    cset *cs)
1288{
1289	size_t i;
1290	cset *top;
1291	size_t css;
1292
1293	_DIAGASSERT(p != NULL);
1294	_DIAGASSERT(cs != NULL);
1295
1296	top = &p->g->sets[p->g->ncsets];
1297	css = (size_t)p->g->csetsize;
1298
1299	for (i = 0; i < css; i++)
1300		CHsub(cs, (int)i);
1301	if (cs == top-1)	/* recover only the easy case */
1302		p->g->ncsets--;
1303}
1304
1305/*
1306 - freezeset - final processing on a set of characters
1307 == static int freezeset(struct parse *p, cset *cs);
1308 *
1309 * The main task here is merging identical sets.  This is usually a waste
1310 * of time (although the hash code minimizes the overhead), but can win
1311 * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
1312 * is done using addition rather than xor -- all ASCII [aA] sets xor to
1313 * the same value!
1314 */
1315static sopno			/* set number */
1316freezeset(
1317    struct parse *p,
1318    cset *cs)
1319{
1320	uch h;
1321	size_t i;
1322	cset *top;
1323	cset *cs2;
1324	size_t css;
1325
1326	_DIAGASSERT(p != NULL);
1327	_DIAGASSERT(cs != NULL);
1328
1329	h = cs->hash;
1330	top = &p->g->sets[p->g->ncsets];
1331	css = (size_t)p->g->csetsize;
1332
1333	/* look for an earlier one which is the same */
1334	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1335		if (cs2->hash == h && cs2 != cs) {
1336			/* maybe */
1337			for (i = 0; i < css; i++)
1338				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1339					break;		/* no */
1340			if (i == css)
1341				break;			/* yes */
1342		}
1343
1344	if (cs2 < top) {	/* found one */
1345		freeset(p, cs);
1346		cs = cs2;
1347	}
1348
1349	return (sopno)(cs - p->g->sets);
1350}
1351
1352/*
1353 - firstch - return first character in a set (which must have at least one)
1354 == static int firstch(struct parse *p, cset *cs);
1355 */
1356static int			/* character; there is no "none" value */
1357firstch(
1358    struct parse *p,
1359    cset *cs)
1360{
1361	size_t i;
1362	size_t css;
1363
1364	_DIAGASSERT(p != NULL);
1365	_DIAGASSERT(cs != NULL);
1366
1367	css = (size_t)p->g->csetsize;
1368
1369	for (i = 0; i < css; i++)
1370		if (CHIN(cs, i))
1371			return((char)i);
1372	assert(never);
1373	return(0);		/* arbitrary */
1374}
1375
1376/*
1377 - nch - number of characters in a set
1378 == static int nch(struct parse *p, cset *cs);
1379 */
1380static int
1381nch(
1382    struct parse *p,
1383    cset *cs)
1384{
1385	size_t i;
1386	size_t css;
1387	int n = 0;
1388
1389	_DIAGASSERT(p != NULL);
1390	_DIAGASSERT(cs != NULL);
1391
1392	css = (size_t)p->g->csetsize;
1393
1394	for (i = 0; i < css; i++)
1395		if (CHIN(cs, i))
1396			n++;
1397	return(n);
1398}
1399
1400/*
1401 - mcadd - add a collating element to a cset
1402 == static void mcadd(struct parse *p, cset *cs, \
1403 ==	char *cp);
1404 */
1405static void
1406mcadd(
1407    struct parse *p,
1408    cset *cs,
1409    const char *cp)
1410{
1411	size_t oldend;
1412
1413	_DIAGASSERT(p != NULL);
1414	_DIAGASSERT(cs != NULL);
1415	_DIAGASSERT(cp != NULL);
1416
1417	oldend = cs->smultis;
1418
1419	cs->smultis += strlen(cp) + 1;
1420	if (cs->multis == NULL)
1421		cs->multis = malloc(cs->smultis);
1422	else
1423		cs->multis = realloc(cs->multis, cs->smultis);
1424	if (cs->multis == NULL) {
1425		SETERROR(REG_ESPACE);
1426		return;
1427	}
1428
1429	(void) strcpy(cs->multis + oldend - 1, cp);
1430	cs->multis[cs->smultis - 1] = '\0';
1431}
1432
1433#if 0
1434/*
1435 - mcsub - subtract a collating element from a cset
1436 == static void mcsub(cset *cs, char *cp);
1437 */
1438static void
1439mcsub(
1440    cset *cs,
1441    char *cp)
1442{
1443	char *fp;
1444	size_t len;
1445
1446	_DIAGASSERT(cs != NULL);
1447	_DIAGASSERT(cp != NULL);
1448
1449	fp = mcfind(cs, cp);
1450	len = strlen(fp);
1451
1452	assert(fp != NULL);
1453	(void) memmove(fp, fp + len + 1,
1454				cs->smultis - (fp + len + 1 - cs->multis));
1455	cs->smultis -= len;
1456
1457	if (cs->smultis == 0) {
1458		free(cs->multis);
1459		cs->multis = NULL;
1460		return;
1461	}
1462
1463	cs->multis = realloc(cs->multis, cs->smultis);
1464	assert(cs->multis != NULL);
1465}
1466
1467/*
1468 - mcin - is a collating element in a cset?
1469 == static int mcin(cset *cs, char *cp);
1470 */
1471static int
1472mcin(
1473    cset *cs,
1474    char *cp)
1475{
1476
1477	_DIAGASSERT(cs != NULL);
1478	_DIAGASSERT(cp != NULL);
1479
1480	return(mcfind(cs, cp) != NULL);
1481}
1482
1483/*
1484 - mcfind - find a collating element in a cset
1485 == static char *mcfind(cset *cs, char *cp);
1486 */
1487static char *
1488mcfind(
1489    cset *cs,
1490    char *cp)
1491{
1492	char *p;
1493
1494	_DIAGASSERT(cs != NULL);
1495	_DIAGASSERT(cp != NULL);
1496
1497	if (cs->multis == NULL)
1498		return(NULL);
1499	for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1500		if (strcmp(cp, p) == 0)
1501			return(p);
1502	return(NULL);
1503}
1504#endif
1505
1506/*
1507 - mcinvert - invert the list of collating elements in a cset
1508 == static void mcinvert(struct parse *p, cset *cs);
1509 *
1510 * This would have to know the set of possibilities.  Implementation
1511 * is deferred.
1512 */
1513/* ARGSUSED */
1514static void
1515mcinvert(
1516    struct parse *p,
1517    cset *cs)
1518{
1519
1520	_DIAGASSERT(p != NULL);
1521	_DIAGASSERT(cs != NULL);
1522
1523	assert(cs->multis == NULL);	/* xxx */
1524}
1525
1526/*
1527 - mccase - add case counterparts of the list of collating elements in a cset
1528 == static void mccase(struct parse *p, cset *cs);
1529 *
1530 * This would have to know the set of possibilities.  Implementation
1531 * is deferred.
1532 */
1533/* ARGSUSED */
1534static void
1535mccase(
1536    struct parse *p,
1537    cset *cs)
1538{
1539
1540	_DIAGASSERT(p != NULL);
1541	_DIAGASSERT(cs != NULL);
1542
1543	assert(cs->multis == NULL);	/* xxx */
1544}
1545
1546/*
1547 - isinsets - is this character in any sets?
1548 == static int isinsets(struct re_guts *g, int c);
1549 */
1550static int			/* predicate */
1551isinsets(
1552    struct re_guts *g,
1553    int c)
1554{
1555	uch *col;
1556	size_t i;
1557	size_t ncols;
1558	unsigned uc = (unsigned char)c;
1559
1560	_DIAGASSERT(g != NULL);
1561
1562	if (g->setbits == NULL)
1563		return 0;
1564
1565	ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1566
1567	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1568		if (col[uc] != 0)
1569			return(1);
1570	return(0);
1571}
1572
1573/*
1574 - samesets - are these two characters in exactly the same sets?
1575 == static int samesets(struct re_guts *g, int c1, int c2);
1576 */
1577static int			/* predicate */
1578samesets(
1579    struct re_guts *g,
1580    int c1,
1581    int c2)
1582{
1583	uch *col;
1584	size_t i;
1585	size_t ncols;
1586	unsigned uc1 = (unsigned char)c1;
1587	unsigned uc2 = (unsigned char)c2;
1588
1589	_DIAGASSERT(g != NULL);
1590
1591	ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1592
1593	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1594		if (col[uc1] != col[uc2])
1595			return(0);
1596	return(1);
1597}
1598
1599/*
1600 - categorize - sort out character categories
1601 == static void categorize(struct parse *p, struct re_guts *g);
1602 */
1603static void
1604categorize(
1605    struct parse *p,
1606    struct re_guts *g)
1607{
1608	cat_t *cats;
1609	int c;
1610	int c2;
1611	cat_t cat;
1612
1613	_DIAGASSERT(p != NULL);
1614	_DIAGASSERT(g != NULL);
1615
1616	cats = g->categories;
1617
1618	/* avoid making error situations worse */
1619	if (p->error != 0)
1620		return;
1621
1622	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1623		if (cats[c] == 0 && isinsets(g, c)) {
1624			_DIAGASSERT(__type_fit(unsigned char,
1625			    g->ncategories + 1));
1626			cat = g->ncategories++;
1627			cats[c] = cat;
1628			for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1629				if (cats[c2] == 0 && samesets(g, c, c2))
1630					cats[c2] = cat;
1631		}
1632}
1633
1634/*
1635 - dupl - emit a duplicate of a bunch of sops
1636 == static sopno dupl(struct parse *p, sopno start, sopno finish);
1637 */
1638static sopno			/* start of duplicate */
1639dupl(
1640    struct parse *p,
1641    sopno start,			/* from here */
1642    sopno finish)			/* to this less one */
1643{
1644	sopno ret;
1645	sopno len = finish - start;
1646
1647	_DIAGASSERT(p != NULL);
1648
1649	ret = HERE();
1650
1651	assert(finish >= start);
1652	if (len == 0)
1653		return(ret);
1654	if (!enlarge(p, p->ssize + len))/* this many unexpected additions */
1655		return ret;
1656	(void)memcpy(p->strip + p->slen, p->strip + start,
1657	    (size_t)len * sizeof(sop));
1658	p->slen += len;
1659	return(ret);
1660}
1661
1662/*
1663 - doemit - emit a strip operator
1664 == static void doemit(struct parse *p, sop op, size_t opnd);
1665 *
1666 * It might seem better to implement this as a macro with a function as
1667 * hard-case backup, but it's just too big and messy unless there are
1668 * some changes to the data structures.  Maybe later.
1669 */
1670static void
1671doemit(
1672    struct parse *p,
1673    sop op,
1674    sopno opnd)
1675{
1676	_DIAGASSERT(p != NULL);
1677
1678	/* avoid making error situations worse */
1679	if (p->error != 0)
1680		return;
1681
1682	/* deal with oversize operands ("can't happen", more or less) */
1683	assert(opnd < 1<<OPSHIFT);
1684
1685	/* deal with undersized strip */
1686	if (p->slen >= p->ssize)
1687		if (!enlarge(p, (p->ssize+1) / 2 * 3))	/* +50% */
1688			return;
1689
1690	/* finally, it's all reduced to the easy case */
1691	p->strip[p->slen++] = (sop)SOP(op, opnd);
1692}
1693
1694/*
1695 - doinsert - insert a sop into the strip
1696 == static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
1697 */
1698static void
1699doinsert(
1700    struct parse *p,
1701    sop op,
1702    sopno opnd,
1703    sopno pos)
1704{
1705	sopno sn;
1706	sop s;
1707	int i;
1708
1709	_DIAGASSERT(p != NULL);
1710
1711	/* avoid making error situations worse */
1712	if (p->error != 0)
1713		return;
1714
1715	sn = HERE();
1716	EMIT(op, opnd);		/* do checks, ensure space */
1717	assert(HERE() == sn+1);
1718	s = p->strip[sn];
1719
1720	/* adjust paren pointers */
1721	assert(pos > 0);
1722	for (i = 1; i < NPAREN; i++) {
1723		if (p->pbegin[i] >= pos) {
1724			p->pbegin[i]++;
1725		}
1726		if (p->pend[i] >= pos) {
1727			p->pend[i]++;
1728		}
1729	}
1730
1731	memmove(&p->strip[pos+1], &p->strip[pos], (HERE()-pos-1)*sizeof(sop));
1732	p->strip[pos] = s;
1733}
1734
1735/*
1736 - dofwd - complete a forward reference
1737 == static void dofwd(struct parse *p, sopno pos, sop value);
1738 */
1739static void
1740dofwd(
1741    struct parse *p,
1742    sopno pos,
1743    sopno value)
1744{
1745
1746	_DIAGASSERT(p != NULL);
1747
1748	/* avoid making error situations worse */
1749	if (p->error != 0)
1750		return;
1751
1752	assert(value < 1<<OPSHIFT);
1753	p->strip[pos] = (sop)(OP(p->strip[pos]) | value);
1754}
1755
1756/*
1757 - enlarge - enlarge the strip
1758 == static void enlarge(struct parse *p, sopno size);
1759 */
1760static int
1761enlarge(struct parse *p, sopno size)
1762{
1763	_DIAGASSERT(p != NULL);
1764
1765	if (p->ssize >= size)
1766		return 1;
1767
1768	if (MEMSIZE(p) > MEMLIMIT || reallocarr(&p->strip, size, sizeof(sop))) {
1769		SETERROR(REG_ESPACE);
1770		return 0;
1771	}
1772	p->ssize = size;
1773	return 1;
1774}
1775
1776/*
1777 - stripsnug - compact the strip
1778 == static void stripsnug(struct parse *p, struct re_guts *g);
1779 */
1780static void
1781stripsnug(
1782    struct parse *p,
1783    struct re_guts *g)
1784{
1785
1786	_DIAGASSERT(p != NULL);
1787	_DIAGASSERT(g != NULL);
1788
1789	g->nstates = p->slen;
1790	g->strip = p->strip;
1791	reallocarr(&g->strip, p->slen, sizeof(sop));
1792	/* Ignore error as tries to free memory only. */
1793}
1794
1795/*
1796 - findmust - fill in must and mlen with longest mandatory literal string
1797 == static void findmust(struct parse *p, struct re_guts *g);
1798 *
1799 * This algorithm could do fancy things like analyzing the operands of |
1800 * for common subsequences.  Someday.  This code is simple and finds most
1801 * of the interesting cases.
1802 *
1803 * Note that must and mlen got initialized during setup.
1804 */
1805static void
1806findmust(
1807    struct parse *p,
1808    struct re_guts *g)
1809{
1810	sop *scan;
1811	sop *start = NULL;
1812	sop *newstart = NULL;
1813	sopno newlen;
1814	sop s;
1815	char *cp;
1816	sopno i;
1817
1818	_DIAGASSERT(p != NULL);
1819	_DIAGASSERT(g != NULL);
1820
1821	/* avoid making error situations worse */
1822	if (p->error != 0)
1823		return;
1824
1825	/* find the longest OCHAR sequence in strip */
1826	newlen = 0;
1827	scan = g->strip + 1;
1828	do {
1829		s = *scan++;
1830		switch (OP(s)) {
1831		case OCHAR:		/* sequence member */
1832			if (newlen == 0)		/* new sequence */
1833				newstart = scan - 1;
1834			newlen++;
1835			break;
1836		case OPLUS_:		/* things that don't break one */
1837		case OLPAREN:
1838		case ORPAREN:
1839			break;
1840		case OQUEST_:		/* things that must be skipped */
1841		case OCH_:
1842			scan--;
1843			do {
1844				scan += OPND(s);
1845				s = *scan;
1846				/* assert() interferes w debug printouts */
1847				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1848							OP(s) != OOR2) {
1849					g->iflags |= BAD;
1850					return;
1851				}
1852			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1853			/* FALLTHROUGH */
1854		default:		/* things that break a sequence */
1855			if (newlen > g->mlen) {		/* ends one */
1856				start = newstart;
1857				g->mlen = newlen;
1858			}
1859			newlen = 0;
1860			break;
1861		}
1862	} while (OP(s) != OEND);
1863
1864	if (start == NULL)
1865		g->mlen = 0;
1866
1867	if (g->mlen == 0)	/* there isn't one */
1868		return;
1869
1870	/* turn it into a character string */
1871	g->must = malloc((size_t)g->mlen + 1);
1872	if (g->must == NULL) {		/* argh; just forget it */
1873		g->mlen = 0;
1874		return;
1875	}
1876	cp = g->must;
1877	scan = start;
1878	for (i = g->mlen; i > 0; i--) {
1879		while (OP(s = *scan++) != OCHAR)
1880			continue;
1881		assert(cp < g->must + g->mlen);
1882		*cp++ = (char)OPND(s);
1883	}
1884	assert(cp == g->must + g->mlen);
1885	*cp++ = '\0';		/* just on general principles */
1886}
1887
1888/*
1889 - pluscount - count + nesting
1890 == static sopno pluscount(struct parse *p, struct re_guts *g);
1891 */
1892static sopno			/* nesting depth */
1893pluscount(
1894    struct parse *p,
1895    struct re_guts *g)
1896{
1897	sop *scan;
1898	sop s;
1899	sopno plusnest = 0;
1900	sopno maxnest = 0;
1901
1902	_DIAGASSERT(p != NULL);
1903	_DIAGASSERT(g != NULL);
1904
1905	if (p->error != 0)
1906		return(0);	/* there may not be an OEND */
1907
1908	scan = g->strip + 1;
1909	do {
1910		s = *scan++;
1911		switch (OP(s)) {
1912		case OPLUS_:
1913			plusnest++;
1914			break;
1915		case O_PLUS:
1916			if (plusnest > maxnest)
1917				maxnest = plusnest;
1918			plusnest--;
1919			break;
1920		}
1921	} while (OP(s) != OEND);
1922	if (plusnest != 0)
1923		g->iflags |= BAD;
1924	return(maxnest);
1925}
1926