1#include <time.h>
2#include <string.h>
3#include <stdio.h>
4#include <ctype.h>
5
6#include "tools/re2c/globals.h"
7#include "tools/re2c/parse.h"
8#include "tools/re2c/dfa.h"
9
10static Symbol *first = NULL;
11
12void
13Symbol_init(Symbol *r, const SubStr *str)
14{
15    r->next = first;
16    Str_init(&r->name, str);
17    r->re = NULL;
18    first = r;
19}
20
21Symbol *
22Symbol_find(const SubStr *str)
23{
24    Symbol *sym;
25    for(sym = first; sym; sym = sym->next)
26	if(SubStr_eq(&sym->name, str)) return sym;
27    return Symbol_new(str);
28}
29
30/*
31void showIns(FILE *o, const Ins *i, const Ins *base){
32    o.width(3);
33    o << &i - &base << ": ";
34    switch(i.i.tag){
35    case CHAR: {
36	o << "match ";
37	for(const Ins *j = &(&i)[1]; j < (Ins*) i.i.link; ++j)
38	    prtCh(o, j->c.value);
39	break;
40    } case GOTO:
41	o << "goto " << ((Ins*) i.i.link - &base);
42	break;
43    case FORK:
44	o << "fork " << ((Ins*) i.i.link - &base);
45	break;
46    case CTXT:
47	o << "term " << ((RuleOp*) i.i.link)->accept;
48	break;
49    case TERM:
50	o << "term " << ((RuleOp*) i.i.link)->accept;
51	break;
52    }
53    o << "\n";
54}
55*/
56
57static unsigned int
58AltOp_fixedLength(RegExp *r)
59{
60    unsigned int l1 = RegExp_fixedLength(r->d.AltCatOp.exp1);
61    /* XXX? Should be exp2? */
62    unsigned int l2 = RegExp_fixedLength(r->d.AltCatOp.exp1);
63    if(l1 != l2 || l1 == ~0u)
64	return ~0u;
65    return l1;
66}
67
68static unsigned int
69CatOp_fixedLength(RegExp *r)
70{
71    unsigned int l1, l2;
72    if((l1 = RegExp_fixedLength(r->d.AltCatOp.exp1)) != ~0u )
73        if((l2 = RegExp_fixedLength(r->d.AltCatOp.exp2)) != ~0u)
74	    return l1+l2;
75    return ~0u;
76}
77
78unsigned int
79RegExp_fixedLength(RegExp *r)
80{
81    switch (r->type) {
82	case NULLOP:
83	    return 0;
84	case MATCHOP:
85	    return 1;
86	case ALTOP:
87	    return AltOp_fixedLength(r);
88	case CATOP:
89	    return CatOp_fixedLength(r);
90	default:
91	    return ~0u;
92    }
93    return ~0u;
94}
95
96void
97RegExp_calcSize(RegExp *re, Char *rep)
98{
99    Range *r;
100    unsigned int c;
101
102    switch (re->type) {
103	case NULLOP:
104	    re->size = 0;
105	    break;
106	case MATCHOP:
107	    re->size = 1;
108	    for(r = re->d.match; r; r = r->next)
109		for(c = r->lb; c < r->ub; ++c)
110		    if(rep[c] == c)
111			++re->size;
112	    break;
113	case RULEOP:
114	    RegExp_calcSize(re->d.RuleOp.exp, rep);
115	    RegExp_calcSize(re->d.RuleOp.ctx, rep);
116	    re->size = re->d.RuleOp.exp->size + re->d.RuleOp.ctx->size + 1;
117	    break;
118	case ALTOP:
119	    RegExp_calcSize(re->d.AltCatOp.exp1, rep);
120	    RegExp_calcSize(re->d.AltCatOp.exp2, rep);
121	    re->size = re->d.AltCatOp.exp1->size
122		       + re->d.AltCatOp.exp2->size + 2;
123	    break;
124	case CATOP:
125	    RegExp_calcSize(re->d.AltCatOp.exp1, rep);
126	    RegExp_calcSize(re->d.AltCatOp.exp2, rep);
127	    re->size = re->d.AltCatOp.exp1->size + re->d.AltCatOp.exp2->size;
128	    break;
129	case CLOSEOP:
130	    RegExp_calcSize(re->d.exp, rep);
131	    re->size = re->d.exp->size + 1;
132	    break;
133	case CLOSEVOP:
134	    RegExp_calcSize(re->d.CloseVOp.exp, rep);
135
136	    if (re->d.CloseVOp.max >= 0)
137		re->size = (re->d.CloseVOp.exp->size * re->d.CloseVOp.min) +
138		    ((1 + re->d.CloseVOp.exp->size) *
139		     (re->d.CloseVOp.max - re->d.CloseVOp.min));
140	    else
141		re->size = (re->d.CloseVOp.exp->size * re->d.CloseVOp.min) + 1;
142	    break;
143    }
144}
145
146static void
147MatchOp_compile(RegExp *re, Char *rep, Ins *i)
148{
149    Ins *j;
150    unsigned int bump;
151    Range *r;
152    unsigned int c;
153
154    i->i.tag = CHAR;
155    i->i.link = &i[re->size];
156    j = &i[1];
157    bump = re->size;
158    for(r = re->d.match; r; r = r->next){
159	for(c = r->lb; c < r->ub; ++c){
160	    if(rep[c] == c){
161		j->c.value = c;
162		j->c.bump = --bump;
163		j++;
164	    }
165	}
166    }
167}
168
169static void
170AltOp_compile(RegExp *re, Char *rep, Ins *i){
171    Ins *j;
172
173    i->i.tag = FORK;
174    j = &i[re->d.AltCatOp.exp1->size + 1];
175    i->i.link = &j[1];
176    RegExp_compile(re->d.AltCatOp.exp1, rep, &i[1]);
177    j->i.tag = GOTO;
178    j->i.link = &j[re->d.AltCatOp.exp2->size + 1];
179    RegExp_compile(re->d.AltCatOp.exp2, rep, &j[1]);
180}
181
182void
183RegExp_compile(RegExp *re, Char *rep, Ins *i)
184{
185    Ins *jumppoint;
186    int st = 0;
187
188    switch (re->type) {
189	case NULLOP:
190	    break;
191	case MATCHOP:
192	    MatchOp_compile(re, rep, i);
193	    break;
194	case RULEOP:
195	    re->d.RuleOp.ins = i;
196	    RegExp_compile(re->d.RuleOp.exp, rep, &i[0]);
197	    i += re->d.RuleOp.exp->size;
198	    RegExp_compile(re->d.RuleOp.ctx, rep, &i[0]);
199	    i += re->d.RuleOp.ctx->size;
200	    i->i.tag = TERM;
201	    i->i.link = re;
202	    break;
203	case ALTOP:
204	    AltOp_compile(re, rep, i);
205	    break;
206	case CATOP:
207	    RegExp_compile(re->d.AltCatOp.exp1, rep, &i[0]);
208	    RegExp_compile(re->d.AltCatOp.exp2, rep,
209			   &i[re->d.AltCatOp.exp1->size]);
210	    break;
211	case CLOSEOP:
212	    RegExp_compile(re->d.exp, rep, &i[0]);
213	    i += re->d.exp->size;
214	    i->i.tag = FORK;
215	    i->i.link = i - re->d.exp->size;
216	    break;
217	case CLOSEVOP:
218	    jumppoint = i + ((1 + re->d.CloseVOp.exp->size) *
219			     (re->d.CloseVOp.max - re->d.CloseVOp.min));
220	    for(st = re->d.CloseVOp.min; st < re->d.CloseVOp.max; st++) {
221		i->i.tag = FORK;
222		i->i.link = jumppoint;
223		i+=1;
224		RegExp_compile(re->d.CloseVOp.exp, rep, &i[0]);
225		i += re->d.CloseVOp.exp->size;
226	    }
227	    for(st = 0; st < re->d.CloseVOp.min; st++) {
228		RegExp_compile(re->d.CloseVOp.exp, rep, &i[0]);
229		i += re->d.CloseVOp.exp->size;
230		if(re->d.CloseVOp.max < 0 && st == 0) {
231		    i->i.tag = FORK;
232		    i->i.link = i - re->d.CloseVOp.exp->size;
233		    i++;
234		}
235	    }
236	    break;
237    }
238}
239
240static void
241MatchOp_split(RegExp *re, CharSet *s)
242{
243    Range *r;
244    unsigned int c;
245
246    for(r = re->d.match; r; r = r->next){
247	for(c = r->lb; c < r->ub; ++c){
248	    CharPtn *x = s->rep[c], *a = x->nxt;
249	    if(!a){
250		if(x->card == 1)
251		    continue;
252		x->nxt = a = s->freeHead;
253		if(!(s->freeHead = s->freeHead->nxt))
254		    s->freeTail = &s->freeHead;
255		a->nxt = NULL;
256		x->fix = s->fix;
257		s->fix = x;
258	    }
259	    if(--(x->card) == 0){
260		*s->freeTail = x;
261		*(s->freeTail = &x->nxt) = NULL;
262	    }
263	    s->rep[c] = a;
264	    ++(a->card);
265	}
266    }
267    for(; s->fix; s->fix = s->fix->fix)
268	if(s->fix->card)
269	    s->fix->nxt = NULL;
270}
271
272void
273RegExp_split(RegExp *re, CharSet *s)
274{
275    switch (re->type) {
276	case NULLOP:
277	    break;
278	case MATCHOP:
279	    MatchOp_split(re, s);
280	    break;
281	case RULEOP:
282	    RegExp_split(re->d.RuleOp.exp, s);
283	    RegExp_split(re->d.RuleOp.ctx, s);
284	    break;
285	case ALTOP:
286	    /* FALLTHROUGH */
287	case CATOP:
288	    RegExp_split(re->d.AltCatOp.exp1, s);
289	    RegExp_split(re->d.AltCatOp.exp2, s);
290	    break;
291	case CLOSEOP:
292	    RegExp_split(re->d.exp, s);
293	    break;
294	case CLOSEVOP:
295	    RegExp_split(re->d.CloseVOp.exp, s);
296	    break;
297    }
298}
299
300void
301RegExp_display(RegExp *re, FILE *o)
302{
303    switch (re->type) {
304	case NULLOP:
305	    fputc('_', o);
306	    break;
307	case MATCHOP:
308	    Range_out(o, re->d.match);
309	    break;
310	case RULEOP:
311	    RegExp_display(re->d.RuleOp.exp, o);
312	    fputc('/', o);
313	    RegExp_display(re->d.RuleOp.ctx, o);
314	    fputc(';', o);
315	    break;
316	case ALTOP:
317	    RegExp_display(re->d.AltCatOp.exp1, o);
318	    fputc('|', o);
319	    RegExp_display(re->d.AltCatOp.exp2, o);
320	    break;
321	case CATOP:
322	    RegExp_display(re->d.AltCatOp.exp1, o);
323	    RegExp_display(re->d.AltCatOp.exp2, o);
324	    break;
325	case CLOSEOP:
326	    RegExp_display(re->d.exp, o);
327	    fputc('+', o);
328	    break;
329    }
330}
331
332void
333Range_out(FILE *o, const Range *r)
334{
335    if(!r)
336	return;
337
338    if((r->ub - r->lb) == 1){
339	prtCh(o, r->lb);
340    } else {
341	prtCh(o, r->lb);
342	fputc('-', o);
343	prtCh(o, r->ub-1);
344    }
345    Range_out(o, r->next);
346}
347
348static Range *doUnion(Range *r1, Range *r2){
349    Range *r, **rP = &r;
350    for(;;){
351	Range *s;
352	if(r1->lb <= r2->lb){
353	    s = Range_new_copy(r1);
354	} else {
355	    s = Range_new_copy(r2);
356	}
357	*rP = s;
358	rP = &s->next;
359	for(;;){
360	    if(r1->lb <= r2->lb){
361		if(r1->lb > s->ub)
362		    break;
363		if(r1->ub > s->ub)
364		    s->ub = r1->ub;
365		if(!(r1 = r1->next)){
366		    unsigned int ub = 0;
367		    for(; r2 && r2->lb <= s->ub; r2 = r2->next)
368			ub = r2->ub;
369		    if(ub > s->ub)
370			s->ub = ub;
371		    *rP = r2;
372		    return r;
373		}
374	    } else {
375		if(r2->lb > s->ub)
376		    break;
377		if(r2->ub > s->ub)
378		    s->ub = r2->ub;
379		if(!(r2 = r2->next)){
380		    unsigned int ub = 0;
381		    for(; r1 && r1->lb <= s->ub; r1 = r1->next)
382			ub = r1->ub;
383		    if(ub > s->ub)
384			s->ub = ub;
385		    *rP = r1;
386		    return r;
387		}
388	    }
389	}
390    }
391    *rP = NULL;
392    return r;
393}
394
395static Range *doDiff(Range *r1, Range *r2){
396    Range *r, *s, **rP = &r;
397    for(; r1; r1 = r1->next){
398	unsigned int lb = r1->lb;
399	for(; r2 && r2->ub <= r1->lb; r2 = r2->next);
400	for(; r2 && r2->lb <  r1->ub; r2 = r2->next){
401	    if(lb < r2->lb){
402		*rP = s = Range_new(lb, r2->lb);
403		rP = &s->next;
404	    }
405	    if((lb = r2->ub) >= r1->ub)
406		goto noMore;
407	}
408	*rP = s = Range_new(lb, r1->ub);
409	rP = &s->next;
410    noMore:;
411    }
412    *rP = NULL;
413    return r;
414}
415
416static RegExp *merge(RegExp *m1, RegExp *m2){
417    if(!m1)
418	return m2;
419    if(!m2)
420	return m1;
421    return RegExp_new_MatchOp(doUnion(m1->d.match, m2->d.match));
422}
423
424RegExp *mkDiff(RegExp *e1, RegExp *e2){
425    RegExp *m1, *m2;
426    Range *r;
427    if(!(m1 = RegExp_isA(e1, MATCHOP)))
428	return NULL;
429    if(!(m2 = RegExp_isA(e2, MATCHOP)))
430	return NULL;
431    r = doDiff(m1->d.match, m2->d.match);
432    return r? RegExp_new_MatchOp(r) : RegExp_new_NullOp();
433}
434
435static RegExp *doAlt(RegExp *e1, RegExp *e2){
436    if(!e1)
437	return e2;
438    if(!e2)
439	return e1;
440    return RegExp_new_AltOp(e1, e2);
441}
442
443RegExp *mkAlt(RegExp *e1, RegExp *e2){
444    RegExp *a;
445    RegExp *m1, *m2;
446    if((a = RegExp_isA(e1, ALTOP))){
447	if((m1 = RegExp_isA(a->d.AltCatOp.exp1, MATCHOP)))
448	    e1 = a->d.AltCatOp.exp2;
449    } else if((m1 = RegExp_isA(e1, MATCHOP))){
450	    e1 = NULL;
451    }
452    if((a = RegExp_isA(e2, ALTOP))){
453	if((m2 = RegExp_isA(a->d.AltCatOp.exp1, MATCHOP)))
454	    e2 = a->d.AltCatOp.exp2;
455    } else if((m2 = RegExp_isA(e2, MATCHOP))){
456	    e2 = NULL;
457    }
458    return doAlt(merge(m1, m2), doAlt(e1, e2));
459}
460
461static unsigned char unescape(SubStr *s){
462    unsigned char c;
463    unsigned char v;
464    s->len--;
465    if((c = *s->str++) != '\\' || s->len == 0)
466	return xlat[c];
467    s->len--;
468    switch(c = *s->str++){
469    case 'n':
470	return xlat['\n'];
471    case 't':
472	return xlat['\t'];
473    case 'v':
474	return xlat['\v'];
475    case 'b':
476	return xlat['\b'];
477    case 'r':
478	return xlat['\r'];
479    case 'f':
480	return xlat['\f'];
481    case 'a':
482	return xlat['\a'];
483    case '0': case '1': case '2': case '3':
484    case '4': case '5': case '6': case '7': {
485	v = c - '0';
486	for(; s->len != 0 && '0' <= (c = *s->str) && c <= '7'; s->len--, s->str++)
487	    v = v*8 + (c - '0');
488	return v;
489    } default:
490	return xlat[c];
491    }
492}
493
494static Range *getRange(SubStr *s){
495    unsigned char lb = unescape(s), ub;
496    if(s->len < 2 || *s->str != '-'){
497	ub = lb;
498    } else {
499	s->len--; s->str++;
500	ub = unescape(s);
501	if(ub < lb){
502	    unsigned char tmp;
503	    tmp = lb; lb = ub; ub = tmp;
504	}
505    }
506    return Range_new(lb, ub+1);
507}
508
509static RegExp *matchChar(unsigned int c){
510    return RegExp_new_MatchOp(Range_new(c, c+1));
511}
512
513RegExp *strToRE(SubStr s){
514    RegExp *re;
515    s.len -= 2; s.str += 1;
516    if(s.len == 0)
517	return RegExp_new_NullOp();
518    re = matchChar(unescape(&s));
519    while(s.len > 0)
520	re = RegExp_new_CatOp(re, matchChar(unescape(&s)));
521    return re;
522}
523
524RegExp *strToCaseInsensitiveRE(SubStr s){
525    unsigned char c;
526    RegExp *re, *reL, *reU;
527    s.len -= 2; s.str += 1;
528    if(s.len == 0)
529	return RegExp_new_NullOp();
530    c = unescape(&s);
531    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
532	reL = matchChar(tolower(c));
533	reU = matchChar(toupper(c));
534	re = mkAlt(reL, reU);
535    } else {
536	re = matchChar(c);
537    }
538    while(s.len > 0) {
539	c = unescape(&s);
540	if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
541	    reL = matchChar(tolower(c));
542	    reU = matchChar(toupper(c));
543	    re = RegExp_new_CatOp(re, mkAlt(reL, reU));
544    	} else {
545	    re = RegExp_new_CatOp(re, matchChar(c));
546	}
547    }
548    return re;
549}
550
551RegExp *ranToRE(SubStr s){
552    Range *r;
553    s.len -= 2; s.str += 1;
554    if(s.len == 0)
555	return RegExp_new_NullOp();
556    r = getRange(&s);
557    while(s.len > 0)
558	r = doUnion(r, getRange(&s));
559    return RegExp_new_MatchOp(r);
560}
561
562RegExp *invToRE(SubStr s)
563{
564    RegExp *any, *ran, *inv;
565    SubStr *ss;
566
567
568    s.len--;
569    s.str++;
570
571    ss = SubStr_new("[\\000-\\377]", strlen("[\\000-\\377]"));
572    any = ranToRE(*ss);
573    free(ss);
574    if (s.len <= 2)
575	return any;
576
577    ran = ranToRE(s);
578    inv = mkDiff(any, ran);
579
580    free(ran);
581    free(any);
582
583    return inv;
584}
585
586RegExp *mkDot()
587{
588    SubStr *ss = SubStr_new("[\\000-\\377]", strlen("[\\000-\\377]"));
589    RegExp * any = ranToRE(*ss);
590    RegExp * ran = matchChar('\n');
591    RegExp * inv = mkDiff(any, ran);
592
593    free(ss);
594    free(ran);
595    free(any);
596
597    return inv;
598}
599
600RegExp *
601RegExp_new_RuleOp(RegExp *e, RegExp *c, Token *t, unsigned int a)
602{
603    RegExp *r = malloc(sizeof(RegExp));
604    r->type = RULEOP;
605    r->d.RuleOp.exp = e;
606    r->d.RuleOp.ctx = c;
607    r->d.RuleOp.ins = NULL;
608    r->d.RuleOp.accept = a;
609    r->d.RuleOp.code = t;
610    return r;
611}
612
613static void optimize(Ins *i){
614    while(!isMarked(i)){
615	mark(i);
616	if(i->i.tag == CHAR){
617	    i = (Ins*) i->i.link;
618	} else if(i->i.tag == GOTO || i->i.tag == FORK){
619	    Ins *target = (Ins*) i->i.link;
620	    optimize(target);
621	    if(target->i.tag == GOTO)
622		i->i.link = target->i.link == target? i : target;
623	    if(i->i.tag == FORK){
624		Ins *follow = (Ins*) &i[1];
625		optimize(follow);
626		if(follow->i.tag == GOTO && follow->i.link == follow){
627		    i->i.tag = GOTO;
628		} else if(i->i.link == i){
629		    i->i.tag = GOTO;
630		    i->i.link = follow;
631		}
632	    }
633	    return;
634	} else {
635	    ++i;
636	}
637    }
638}
639
640void genCode(FILE *o, RegExp *re){
641    CharSet cs;
642    unsigned int j;
643    Char rep[nChars];
644    Ins *ins, *eoi;
645    DFA *dfa;
646
647    memset(&cs, 0, sizeof(cs));
648    for(j = 0; j < nChars; ++j){
649	cs.rep[j] = &cs.ptn[0];
650	cs.ptn[j].nxt = &cs.ptn[j+1];
651    }
652    cs.freeHead = &cs.ptn[1];
653    *(cs.freeTail = &cs.ptn[nChars-1].nxt) = NULL;
654    cs.ptn[0].card = nChars;
655    cs.ptn[0].nxt = NULL;
656    RegExp_split(re, &cs);
657/*
658    for(unsigned int k = 0; k < nChars;){
659	for(j = k; ++k < nChars && cs.rep[k] == cs.rep[j];);
660	printSpan(cerr, j, k);
661	cerr << "\t" << cs.rep[j] - &cs.ptn[0] << endl;
662    }
663*/
664    for(j = 0; j < nChars; ++j){
665	if(!cs.rep[j]->nxt)
666	    cs.rep[j]->nxt = &cs.ptn[j];
667	rep[j] = (Char) (cs.rep[j]->nxt - &cs.ptn[0]);
668    }
669
670    RegExp_calcSize(re, rep);
671    ins = malloc(sizeof(Ins)*(re->size+1));
672    memset(ins, 0, (re->size+1)*sizeof(Ins));
673    RegExp_compile(re, rep, ins);
674    eoi = &ins[re->size];
675    eoi->i.tag = GOTO;
676    eoi->i.link = eoi;
677
678    optimize(ins);
679    for(j = 0; j < re->size;){
680	unmark(&ins[j]);
681	if(ins[j].i.tag == CHAR){
682	    j = (Ins*) ins[j].i.link - ins;
683	} else {
684	    j++;
685	}
686    }
687
688    dfa = DFA_new(ins, re->size, 0, 256, rep);
689    DFA_emit(dfa, o);
690    DFA_delete(dfa);
691    free(ins);
692}
693