xmlregexp.c revision 0ddb21c46ca6ac5297ff5f6537480de8463223ea
1/*
2 * regexp.c: generic and extensible Regular Expression engine
3 *
4 * Basically designed with the purpose of compiling regexps for
5 * the variety of validation/shemas mechanisms now available in
6 * XML related specifications thise includes:
7 *    - XML-1.0 DTD validation
8 *    - XML Schemas structure part 1
9 *    - XML Schemas Datatypes part 2 especially Appendix F
10 *    - RELAX-NG/TREX i.e. the counter proposal
11 *
12 * See Copyright for the status of this software.
13 *
14 * Daniel Veillard <veillard@redhat.com>
15 */
16
17#define IN_LIBXML
18#include "libxml.h"
19
20#ifdef LIBXML_REGEXP_ENABLED
21
22#include <stdio.h>
23#include <string.h>
24#ifdef HAVE_LIMITS_H
25#include <limits.h>
26#endif
27
28#include <libxml/tree.h>
29#include <libxml/parserInternals.h>
30#include <libxml/xmlregexp.h>
31#include <libxml/xmlautomata.h>
32#include <libxml/xmlunicode.h>
33
34#ifndef INT_MAX
35#define INT_MAX 123456789 /* easy to flag and big enough for our needs */
36#endif
37
38/* #define DEBUG_REGEXP_GRAPH  */
39/* #define DEBUG_REGEXP_EXEC */
40/* #define DEBUG_PUSH */
41/* #define DEBUG_COMPACTION */
42
43#define ERROR(str)							\
44    ctxt->error = XML_REGEXP_COMPILE_ERROR;				\
45    xmlRegexpErrCompile(ctxt, str);
46#define NEXT ctxt->cur++
47#define CUR (*(ctxt->cur))
48#define NXT(index) (ctxt->cur[index])
49
50#define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
51#define NEXTL(l) ctxt->cur += l;
52
53/**
54 * TODO:
55 *
56 * macro to flag unimplemented blocks
57 */
58#define TODO 								\
59    xmlGenericError(xmlGenericErrorContext,				\
60	    "Unimplemented block at %s:%d\n",				\
61            __FILE__, __LINE__);
62
63/************************************************************************
64 * 									*
65 * 			Datatypes and structures			*
66 * 									*
67 ************************************************************************/
68
69typedef enum {
70    XML_REGEXP_EPSILON = 1,
71    XML_REGEXP_CHARVAL,
72    XML_REGEXP_RANGES,
73    XML_REGEXP_SUBREG,
74    XML_REGEXP_STRING,
75    XML_REGEXP_ANYCHAR, /* . */
76    XML_REGEXP_ANYSPACE, /* \s */
77    XML_REGEXP_NOTSPACE, /* \S */
78    XML_REGEXP_INITNAME, /* \l */
79    XML_REGEXP_NOTINITNAME, /* \l */
80    XML_REGEXP_NAMECHAR, /* \c */
81    XML_REGEXP_NOTNAMECHAR, /* \C */
82    XML_REGEXP_DECIMAL, /* \d */
83    XML_REGEXP_NOTDECIMAL, /* \d */
84    XML_REGEXP_REALCHAR, /* \w */
85    XML_REGEXP_NOTREALCHAR, /* \w */
86    XML_REGEXP_LETTER,
87    XML_REGEXP_LETTER_UPPERCASE,
88    XML_REGEXP_LETTER_LOWERCASE,
89    XML_REGEXP_LETTER_TITLECASE,
90    XML_REGEXP_LETTER_MODIFIER,
91    XML_REGEXP_LETTER_OTHERS,
92    XML_REGEXP_MARK,
93    XML_REGEXP_MARK_NONSPACING,
94    XML_REGEXP_MARK_SPACECOMBINING,
95    XML_REGEXP_MARK_ENCLOSING,
96    XML_REGEXP_NUMBER,
97    XML_REGEXP_NUMBER_DECIMAL,
98    XML_REGEXP_NUMBER_LETTER,
99    XML_REGEXP_NUMBER_OTHERS,
100    XML_REGEXP_PUNCT,
101    XML_REGEXP_PUNCT_CONNECTOR,
102    XML_REGEXP_PUNCT_DASH,
103    XML_REGEXP_PUNCT_OPEN,
104    XML_REGEXP_PUNCT_CLOSE,
105    XML_REGEXP_PUNCT_INITQUOTE,
106    XML_REGEXP_PUNCT_FINQUOTE,
107    XML_REGEXP_PUNCT_OTHERS,
108    XML_REGEXP_SEPAR,
109    XML_REGEXP_SEPAR_SPACE,
110    XML_REGEXP_SEPAR_LINE,
111    XML_REGEXP_SEPAR_PARA,
112    XML_REGEXP_SYMBOL,
113    XML_REGEXP_SYMBOL_MATH,
114    XML_REGEXP_SYMBOL_CURRENCY,
115    XML_REGEXP_SYMBOL_MODIFIER,
116    XML_REGEXP_SYMBOL_OTHERS,
117    XML_REGEXP_OTHER,
118    XML_REGEXP_OTHER_CONTROL,
119    XML_REGEXP_OTHER_FORMAT,
120    XML_REGEXP_OTHER_PRIVATE,
121    XML_REGEXP_OTHER_NA,
122    XML_REGEXP_BLOCK_NAME
123} xmlRegAtomType;
124
125typedef enum {
126    XML_REGEXP_QUANT_EPSILON = 1,
127    XML_REGEXP_QUANT_ONCE,
128    XML_REGEXP_QUANT_OPT,
129    XML_REGEXP_QUANT_MULT,
130    XML_REGEXP_QUANT_PLUS,
131    XML_REGEXP_QUANT_ONCEONLY,
132    XML_REGEXP_QUANT_ALL,
133    XML_REGEXP_QUANT_RANGE
134} xmlRegQuantType;
135
136typedef enum {
137    XML_REGEXP_START_STATE = 1,
138    XML_REGEXP_FINAL_STATE,
139    XML_REGEXP_TRANS_STATE
140} xmlRegStateType;
141
142typedef enum {
143    XML_REGEXP_MARK_NORMAL = 0,
144    XML_REGEXP_MARK_START,
145    XML_REGEXP_MARK_VISITED
146} xmlRegMarkedType;
147
148typedef struct _xmlRegRange xmlRegRange;
149typedef xmlRegRange *xmlRegRangePtr;
150
151struct _xmlRegRange {
152    int neg;		/* 0 normal, 1 not, 2 exclude */
153    xmlRegAtomType type;
154    int start;
155    int end;
156    xmlChar *blockName;
157};
158
159typedef struct _xmlRegAtom xmlRegAtom;
160typedef xmlRegAtom *xmlRegAtomPtr;
161
162typedef struct _xmlAutomataState xmlRegState;
163typedef xmlRegState *xmlRegStatePtr;
164
165struct _xmlRegAtom {
166    int no;
167    xmlRegAtomType type;
168    xmlRegQuantType quant;
169    int min;
170    int max;
171
172    void *valuep;
173    void *valuep2;
174    int neg;
175    int codepoint;
176    xmlRegStatePtr start;
177    xmlRegStatePtr stop;
178    int maxRanges;
179    int nbRanges;
180    xmlRegRangePtr *ranges;
181    void *data;
182};
183
184typedef struct _xmlRegCounter xmlRegCounter;
185typedef xmlRegCounter *xmlRegCounterPtr;
186
187struct _xmlRegCounter {
188    int min;
189    int max;
190};
191
192typedef struct _xmlRegTrans xmlRegTrans;
193typedef xmlRegTrans *xmlRegTransPtr;
194
195struct _xmlRegTrans {
196    xmlRegAtomPtr atom;
197    int to;
198    int counter;
199    int count;
200};
201
202struct _xmlAutomataState {
203    xmlRegStateType type;
204    xmlRegMarkedType mark;
205    xmlRegMarkedType reached;
206    int no;
207
208    int maxTrans;
209    int nbTrans;
210    xmlRegTrans *trans;
211};
212
213typedef struct _xmlAutomata xmlRegParserCtxt;
214typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
215
216struct _xmlAutomata {
217    xmlChar *string;
218    xmlChar *cur;
219
220    int error;
221    int neg;
222
223    xmlRegStatePtr start;
224    xmlRegStatePtr end;
225    xmlRegStatePtr state;
226
227    xmlRegAtomPtr atom;
228
229    int maxAtoms;
230    int nbAtoms;
231    xmlRegAtomPtr *atoms;
232
233    int maxStates;
234    int nbStates;
235    xmlRegStatePtr *states;
236
237    int maxCounters;
238    int nbCounters;
239    xmlRegCounter *counters;
240
241    int determinist;
242};
243
244struct _xmlRegexp {
245    xmlChar *string;
246    int nbStates;
247    xmlRegStatePtr *states;
248    int nbAtoms;
249    xmlRegAtomPtr *atoms;
250    int nbCounters;
251    xmlRegCounter *counters;
252    int determinist;
253    /*
254     * That's the compact form for determinists automatas
255     */
256    int nbstates;
257    int *compact;
258    void **transdata;
259    int nbstrings;
260    xmlChar **stringMap;
261};
262
263typedef struct _xmlRegExecRollback xmlRegExecRollback;
264typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
265
266struct _xmlRegExecRollback {
267    xmlRegStatePtr state;/* the current state */
268    int index;		/* the index in the input stack */
269    int nextbranch;	/* the next transition to explore in that state */
270    int *counts;	/* save the automate state if it has some */
271};
272
273typedef struct _xmlRegInputToken xmlRegInputToken;
274typedef xmlRegInputToken *xmlRegInputTokenPtr;
275
276struct _xmlRegInputToken {
277    xmlChar *value;
278    void *data;
279};
280
281struct _xmlRegExecCtxt {
282    int status;		/* execution status != 0 indicate an error */
283    int determinist;	/* did we found an inderterministic behaviour */
284    xmlRegexpPtr comp;	/* the compiled regexp */
285    xmlRegExecCallbacks callback;
286    void *data;
287
288    xmlRegStatePtr state;/* the current state */
289    int transno;	/* the current transition on that state */
290    int transcount;	/* the number of char in char counted transitions */
291
292    /*
293     * A stack of rollback states
294     */
295    int maxRollbacks;
296    int nbRollbacks;
297    xmlRegExecRollback *rollbacks;
298
299    /*
300     * The state of the automata if any
301     */
302    int *counts;
303
304    /*
305     * The input stack
306     */
307    int inputStackMax;
308    int inputStackNr;
309    int index;
310    int *charStack;
311    const xmlChar *inputString; /* when operating on characters */
312    xmlRegInputTokenPtr inputStack;/* when operating on strings */
313
314};
315
316#define REGEXP_ALL_COUNTER	0x123456
317#define REGEXP_ALL_LAX_COUNTER	0x123457
318
319static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
320static void xmlRegFreeState(xmlRegStatePtr state);
321static void xmlRegFreeAtom(xmlRegAtomPtr atom);
322
323/************************************************************************
324 *									*
325 * 		Regexp memory error handler				*
326 *									*
327 ************************************************************************/
328/**
329 * xmlRegexpErrMemory:
330 * @extra:  extra informations
331 *
332 * Handle an out of memory condition
333 */
334static void
335xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
336{
337    const char *regexp = NULL;
338    if (ctxt != NULL) {
339        regexp = (const char *) ctxt->string;
340	ctxt->error = XML_ERR_NO_MEMORY;
341    }
342    __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
343		    XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
344		    regexp, NULL, 0, 0,
345		    "Memory allocation failed : %s\n", extra);
346}
347
348/**
349 * xmlRegexpErrCompile:
350 * @extra:  extra informations
351 *
352 * Handle an compilation failure
353 */
354static void
355xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
356{
357    const char *regexp = NULL;
358    int idx = 0;
359
360    if (ctxt != NULL) {
361        regexp = (const char *) ctxt->string;
362	idx = ctxt->cur - ctxt->string;
363	ctxt->error = XML_REGEXP_COMPILE_ERROR;
364    }
365    __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
366		    XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra,
367		    regexp, NULL, idx, 0,
368		    "failed to compile: %s\n", extra);
369}
370
371/************************************************************************
372 * 									*
373 * 			Allocation/Deallocation				*
374 * 									*
375 ************************************************************************/
376
377static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
378/**
379 * xmlRegEpxFromParse:
380 * @ctxt:  the parser context used to build it
381 *
382 * Allocate a new regexp and fill it with the reult from the parser
383 *
384 * Returns the new regexp or NULL in case of error
385 */
386static xmlRegexpPtr
387xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
388    xmlRegexpPtr ret;
389
390    ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
391    if (ret == NULL) {
392	xmlRegexpErrMemory(ctxt, "compiling regexp");
393	return(NULL);
394    }
395    memset(ret, 0, sizeof(xmlRegexp));
396    ret->string = ctxt->string;
397    ret->nbStates = ctxt->nbStates;
398    ret->states = ctxt->states;
399    ret->nbAtoms = ctxt->nbAtoms;
400    ret->atoms = ctxt->atoms;
401    ret->nbCounters = ctxt->nbCounters;
402    ret->counters = ctxt->counters;
403    ret->determinist = ctxt->determinist;
404
405    if ((ret->determinist != 0) &&
406	(ret->nbCounters == 0) &&
407	(ret->atoms != NULL) &&
408	(ret->atoms[0] != NULL) &&
409	(ret->atoms[0]->type == XML_REGEXP_STRING)) {
410	int i, j, nbstates = 0, nbatoms = 0;
411	int *stateRemap;
412	int *stringRemap;
413	int *transitions;
414	void **transdata;
415	xmlChar **stringMap;
416        xmlChar *value;
417
418	/*
419	 * Switch to a compact representation
420	 * 1/ counting the effective number of states left
421	 * 2/ conting the unique number of atoms, and check that
422	 *    they are all of the string type
423	 * 3/ build a table state x atom for the transitions
424	 */
425
426	stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
427	if (stateRemap == NULL) {
428	    xmlRegexpErrMemory(ctxt, "compiling regexp");
429	    xmlFree(ret);
430	    return(NULL);
431	}
432	for (i = 0;i < ret->nbStates;i++) {
433	    if (ret->states[i] != NULL) {
434		stateRemap[i] = nbstates;
435		nbstates++;
436	    } else {
437		stateRemap[i] = -1;
438	    }
439	}
440#ifdef DEBUG_COMPACTION
441	printf("Final: %d states\n", nbstates);
442#endif
443	stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
444	if (stringMap == NULL) {
445	    xmlRegexpErrMemory(ctxt, "compiling regexp");
446	    xmlFree(stateRemap);
447	    xmlFree(ret);
448	    return(NULL);
449	}
450	stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
451	if (stringRemap == NULL) {
452	    xmlRegexpErrMemory(ctxt, "compiling regexp");
453	    xmlFree(stringMap);
454	    xmlFree(stateRemap);
455	    xmlFree(ret);
456	    return(NULL);
457	}
458	for (i = 0;i < ret->nbAtoms;i++) {
459	    if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
460		(ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
461		value = ret->atoms[i]->valuep;
462                for (j = 0;j < nbatoms;j++) {
463		    if (xmlStrEqual(stringMap[j], value)) {
464			stringRemap[i] = j;
465			break;
466		    }
467		}
468		if (j >= nbatoms) {
469		    stringRemap[i] = nbatoms;
470		    stringMap[nbatoms] = xmlStrdup(value);
471		    if (stringMap[nbatoms] == NULL) {
472			for (i = 0;i < nbatoms;i++)
473			    xmlFree(stringMap[i]);
474			xmlFree(stringRemap);
475			xmlFree(stringMap);
476			xmlFree(stateRemap);
477			xmlFree(ret);
478			return(NULL);
479		    }
480		    nbatoms++;
481		}
482	    } else {
483		xmlFree(stateRemap);
484		xmlFree(stringRemap);
485		for (i = 0;i < nbatoms;i++)
486		    xmlFree(stringMap[i]);
487		xmlFree(stringMap);
488		xmlFree(ret);
489		return(NULL);
490	    }
491	}
492#ifdef DEBUG_COMPACTION
493	printf("Final: %d atoms\n", nbatoms);
494#endif
495	transitions = (int *) xmlMalloc((nbstates + 1) *
496	                                (nbatoms + 1) * sizeof(int));
497	if (transitions == NULL) {
498	    xmlFree(stateRemap);
499	    xmlFree(stringRemap);
500	    xmlFree(stringMap);
501	    xmlFree(ret);
502	    return(NULL);
503	}
504	memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int));
505
506	/*
507	 * Allocate the transition table. The first entry for each
508	 * state correspond to the state type.
509	 */
510	transdata = NULL;
511
512	for (i = 0;i < ret->nbStates;i++) {
513	    int stateno, atomno, targetno, prev;
514	    xmlRegStatePtr state;
515	    xmlRegTransPtr trans;
516
517	    stateno = stateRemap[i];
518	    if (stateno == -1)
519		continue;
520	    state = ret->states[i];
521
522	    transitions[stateno * (nbatoms + 1)] = state->type;
523
524	    for (j = 0;j < state->nbTrans;j++) {
525		trans = &(state->trans[j]);
526		if ((trans->to == -1) || (trans->atom == NULL))
527		    continue;
528                atomno = stringRemap[trans->atom->no];
529		if ((trans->atom->data != NULL) && (transdata == NULL)) {
530		    transdata = (void **) xmlMalloc(nbstates * nbatoms *
531			                            sizeof(void *));
532		    if (transdata != NULL)
533			memset(transdata, 0,
534			       nbstates * nbatoms * sizeof(void *));
535		    else {
536			xmlRegexpErrMemory(ctxt, "compiling regexp");
537			break;
538		    }
539		}
540		targetno = stateRemap[trans->to];
541		/*
542		 * if the same atome can generate transition to 2 different
543		 * states then it means the automata is not determinist and
544		 * the compact form can't be used !
545		 */
546		prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
547		if (prev != 0) {
548		    if (prev != targetno + 1) {
549			ret->determinist = 0;
550#ifdef DEBUG_COMPACTION
551			printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
552			       i, j, trans->atom->no, trans->to, atomno, targetno);
553			printf("       previous to is %d\n", prev);
554#endif
555			ret->determinist = 0;
556			if (transdata != NULL)
557			    xmlFree(transdata);
558			xmlFree(transitions);
559			xmlFree(stateRemap);
560			xmlFree(stringRemap);
561			for (i = 0;i < nbatoms;i++)
562			    xmlFree(stringMap[i]);
563			xmlFree(stringMap);
564			goto not_determ;
565		    }
566		} else {
567#if 0
568		    printf("State %d trans %d: atom %d to %d : %d to %d\n",
569			   i, j, trans->atom->no, trans->to, atomno, targetno);
570#endif
571		    transitions[stateno * (nbatoms + 1) + atomno + 1] =
572			targetno + 1; /* to avoid 0 */
573		    if (transdata != NULL)
574			transdata[stateno * nbatoms + atomno] =
575			    trans->atom->data;
576		}
577	    }
578	}
579	ret->determinist = 1;
580#ifdef DEBUG_COMPACTION
581	/*
582	 * Debug
583	 */
584	for (i = 0;i < nbstates;i++) {
585	    for (j = 0;j < nbatoms + 1;j++) {
586                printf("%02d ", transitions[i * (nbatoms + 1) + j]);
587	    }
588	    printf("\n");
589	}
590	printf("\n");
591#endif
592	/*
593	 * Cleanup of the old data
594	 */
595	if (ret->states != NULL) {
596	    for (i = 0;i < ret->nbStates;i++)
597		xmlRegFreeState(ret->states[i]);
598	    xmlFree(ret->states);
599	}
600	ret->states = NULL;
601	ret->nbStates = 0;
602	if (ret->atoms != NULL) {
603	    for (i = 0;i < ret->nbAtoms;i++)
604		xmlRegFreeAtom(ret->atoms[i]);
605	    xmlFree(ret->atoms);
606	}
607	ret->atoms = NULL;
608	ret->nbAtoms = 0;
609
610	ret->compact = transitions;
611	ret->transdata = transdata;
612	ret->stringMap = stringMap;
613	ret->nbstrings = nbatoms;
614	ret->nbstates = nbstates;
615	xmlFree(stateRemap);
616	xmlFree(stringRemap);
617    }
618not_determ:
619    ctxt->string = NULL;
620    ctxt->nbStates = 0;
621    ctxt->states = NULL;
622    ctxt->nbAtoms = 0;
623    ctxt->atoms = NULL;
624    ctxt->nbCounters = 0;
625    ctxt->counters = NULL;
626    return(ret);
627}
628
629/**
630 * xmlRegNewParserCtxt:
631 * @string:  the string to parse
632 *
633 * Allocate a new regexp parser context
634 *
635 * Returns the new context or NULL in case of error
636 */
637static xmlRegParserCtxtPtr
638xmlRegNewParserCtxt(const xmlChar *string) {
639    xmlRegParserCtxtPtr ret;
640
641    ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
642    if (ret == NULL)
643	return(NULL);
644    memset(ret, 0, sizeof(xmlRegParserCtxt));
645    if (string != NULL)
646	ret->string = xmlStrdup(string);
647    ret->cur = ret->string;
648    ret->neg = 0;
649    ret->error = 0;
650    ret->determinist = -1;
651    return(ret);
652}
653
654/**
655 * xmlRegNewRange:
656 * @ctxt:  the regexp parser context
657 * @neg:  is that negative
658 * @type:  the type of range
659 * @start:  the start codepoint
660 * @end:  the end codepoint
661 *
662 * Allocate a new regexp range
663 *
664 * Returns the new range or NULL in case of error
665 */
666static xmlRegRangePtr
667xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
668	       int neg, xmlRegAtomType type, int start, int end) {
669    xmlRegRangePtr ret;
670
671    ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
672    if (ret == NULL) {
673	xmlRegexpErrMemory(ctxt, "allocating range");
674	return(NULL);
675    }
676    ret->neg = neg;
677    ret->type = type;
678    ret->start = start;
679    ret->end = end;
680    return(ret);
681}
682
683/**
684 * xmlRegFreeRange:
685 * @range:  the regexp range
686 *
687 * Free a regexp range
688 */
689static void
690xmlRegFreeRange(xmlRegRangePtr range) {
691    if (range == NULL)
692	return;
693
694    if (range->blockName != NULL)
695	xmlFree(range->blockName);
696    xmlFree(range);
697}
698
699/**
700 * xmlRegNewAtom:
701 * @ctxt:  the regexp parser context
702 * @type:  the type of atom
703 *
704 * Allocate a new regexp range
705 *
706 * Returns the new atom or NULL in case of error
707 */
708static xmlRegAtomPtr
709xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
710    xmlRegAtomPtr ret;
711
712    ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
713    if (ret == NULL) {
714	xmlRegexpErrMemory(ctxt, "allocating atom");
715	return(NULL);
716    }
717    memset(ret, 0, sizeof(xmlRegAtom));
718    ret->type = type;
719    ret->quant = XML_REGEXP_QUANT_ONCE;
720    ret->min = 0;
721    ret->max = 0;
722    return(ret);
723}
724
725/**
726 * xmlRegFreeAtom:
727 * @atom:  the regexp atom
728 *
729 * Free a regexp atom
730 */
731static void
732xmlRegFreeAtom(xmlRegAtomPtr atom) {
733    int i;
734
735    if (atom == NULL)
736	return;
737
738    for (i = 0;i < atom->nbRanges;i++)
739	xmlRegFreeRange(atom->ranges[i]);
740    if (atom->ranges != NULL)
741	xmlFree(atom->ranges);
742    if (atom->type == XML_REGEXP_STRING)
743	xmlFree(atom->valuep);
744    xmlFree(atom);
745}
746
747static xmlRegStatePtr
748xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
749    xmlRegStatePtr ret;
750
751    ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
752    if (ret == NULL) {
753	xmlRegexpErrMemory(ctxt, "allocating state");
754	return(NULL);
755    }
756    memset(ret, 0, sizeof(xmlRegState));
757    ret->type = XML_REGEXP_TRANS_STATE;
758    ret->mark = XML_REGEXP_MARK_NORMAL;
759    return(ret);
760}
761
762/**
763 * xmlRegFreeState:
764 * @state:  the regexp state
765 *
766 * Free a regexp state
767 */
768static void
769xmlRegFreeState(xmlRegStatePtr state) {
770    if (state == NULL)
771	return;
772
773    if (state->trans != NULL)
774	xmlFree(state->trans);
775    xmlFree(state);
776}
777
778/**
779 * xmlRegFreeParserCtxt:
780 * @ctxt:  the regexp parser context
781 *
782 * Free a regexp parser context
783 */
784static void
785xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
786    int i;
787    if (ctxt == NULL)
788	return;
789
790    if (ctxt->string != NULL)
791	xmlFree(ctxt->string);
792    if (ctxt->states != NULL) {
793	for (i = 0;i < ctxt->nbStates;i++)
794	    xmlRegFreeState(ctxt->states[i]);
795	xmlFree(ctxt->states);
796    }
797    if (ctxt->atoms != NULL) {
798	for (i = 0;i < ctxt->nbAtoms;i++)
799	    xmlRegFreeAtom(ctxt->atoms[i]);
800	xmlFree(ctxt->atoms);
801    }
802    if (ctxt->counters != NULL)
803	xmlFree(ctxt->counters);
804    xmlFree(ctxt);
805}
806
807/************************************************************************
808 * 									*
809 * 			Display of Data structures			*
810 * 									*
811 ************************************************************************/
812
813static void
814xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
815    switch (type) {
816        case XML_REGEXP_EPSILON:
817	    fprintf(output, "epsilon "); break;
818        case XML_REGEXP_CHARVAL:
819	    fprintf(output, "charval "); break;
820        case XML_REGEXP_RANGES:
821	    fprintf(output, "ranges "); break;
822        case XML_REGEXP_SUBREG:
823	    fprintf(output, "subexpr "); break;
824        case XML_REGEXP_STRING:
825	    fprintf(output, "string "); break;
826        case XML_REGEXP_ANYCHAR:
827	    fprintf(output, "anychar "); break;
828        case XML_REGEXP_ANYSPACE:
829	    fprintf(output, "anyspace "); break;
830        case XML_REGEXP_NOTSPACE:
831	    fprintf(output, "notspace "); break;
832        case XML_REGEXP_INITNAME:
833	    fprintf(output, "initname "); break;
834        case XML_REGEXP_NOTINITNAME:
835	    fprintf(output, "notinitname "); break;
836        case XML_REGEXP_NAMECHAR:
837	    fprintf(output, "namechar "); break;
838        case XML_REGEXP_NOTNAMECHAR:
839	    fprintf(output, "notnamechar "); break;
840        case XML_REGEXP_DECIMAL:
841	    fprintf(output, "decimal "); break;
842        case XML_REGEXP_NOTDECIMAL:
843	    fprintf(output, "notdecimal "); break;
844        case XML_REGEXP_REALCHAR:
845	    fprintf(output, "realchar "); break;
846        case XML_REGEXP_NOTREALCHAR:
847	    fprintf(output, "notrealchar "); break;
848        case XML_REGEXP_LETTER:
849            fprintf(output, "LETTER "); break;
850        case XML_REGEXP_LETTER_UPPERCASE:
851            fprintf(output, "LETTER_UPPERCASE "); break;
852        case XML_REGEXP_LETTER_LOWERCASE:
853            fprintf(output, "LETTER_LOWERCASE "); break;
854        case XML_REGEXP_LETTER_TITLECASE:
855            fprintf(output, "LETTER_TITLECASE "); break;
856        case XML_REGEXP_LETTER_MODIFIER:
857            fprintf(output, "LETTER_MODIFIER "); break;
858        case XML_REGEXP_LETTER_OTHERS:
859            fprintf(output, "LETTER_OTHERS "); break;
860        case XML_REGEXP_MARK:
861            fprintf(output, "MARK "); break;
862        case XML_REGEXP_MARK_NONSPACING:
863            fprintf(output, "MARK_NONSPACING "); break;
864        case XML_REGEXP_MARK_SPACECOMBINING:
865            fprintf(output, "MARK_SPACECOMBINING "); break;
866        case XML_REGEXP_MARK_ENCLOSING:
867            fprintf(output, "MARK_ENCLOSING "); break;
868        case XML_REGEXP_NUMBER:
869            fprintf(output, "NUMBER "); break;
870        case XML_REGEXP_NUMBER_DECIMAL:
871            fprintf(output, "NUMBER_DECIMAL "); break;
872        case XML_REGEXP_NUMBER_LETTER:
873            fprintf(output, "NUMBER_LETTER "); break;
874        case XML_REGEXP_NUMBER_OTHERS:
875            fprintf(output, "NUMBER_OTHERS "); break;
876        case XML_REGEXP_PUNCT:
877            fprintf(output, "PUNCT "); break;
878        case XML_REGEXP_PUNCT_CONNECTOR:
879            fprintf(output, "PUNCT_CONNECTOR "); break;
880        case XML_REGEXP_PUNCT_DASH:
881            fprintf(output, "PUNCT_DASH "); break;
882        case XML_REGEXP_PUNCT_OPEN:
883            fprintf(output, "PUNCT_OPEN "); break;
884        case XML_REGEXP_PUNCT_CLOSE:
885            fprintf(output, "PUNCT_CLOSE "); break;
886        case XML_REGEXP_PUNCT_INITQUOTE:
887            fprintf(output, "PUNCT_INITQUOTE "); break;
888        case XML_REGEXP_PUNCT_FINQUOTE:
889            fprintf(output, "PUNCT_FINQUOTE "); break;
890        case XML_REGEXP_PUNCT_OTHERS:
891            fprintf(output, "PUNCT_OTHERS "); break;
892        case XML_REGEXP_SEPAR:
893            fprintf(output, "SEPAR "); break;
894        case XML_REGEXP_SEPAR_SPACE:
895            fprintf(output, "SEPAR_SPACE "); break;
896        case XML_REGEXP_SEPAR_LINE:
897            fprintf(output, "SEPAR_LINE "); break;
898        case XML_REGEXP_SEPAR_PARA:
899            fprintf(output, "SEPAR_PARA "); break;
900        case XML_REGEXP_SYMBOL:
901            fprintf(output, "SYMBOL "); break;
902        case XML_REGEXP_SYMBOL_MATH:
903            fprintf(output, "SYMBOL_MATH "); break;
904        case XML_REGEXP_SYMBOL_CURRENCY:
905            fprintf(output, "SYMBOL_CURRENCY "); break;
906        case XML_REGEXP_SYMBOL_MODIFIER:
907            fprintf(output, "SYMBOL_MODIFIER "); break;
908        case XML_REGEXP_SYMBOL_OTHERS:
909            fprintf(output, "SYMBOL_OTHERS "); break;
910        case XML_REGEXP_OTHER:
911            fprintf(output, "OTHER "); break;
912        case XML_REGEXP_OTHER_CONTROL:
913            fprintf(output, "OTHER_CONTROL "); break;
914        case XML_REGEXP_OTHER_FORMAT:
915            fprintf(output, "OTHER_FORMAT "); break;
916        case XML_REGEXP_OTHER_PRIVATE:
917            fprintf(output, "OTHER_PRIVATE "); break;
918        case XML_REGEXP_OTHER_NA:
919            fprintf(output, "OTHER_NA "); break;
920        case XML_REGEXP_BLOCK_NAME:
921	    fprintf(output, "BLOCK "); break;
922    }
923}
924
925static void
926xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
927    switch (type) {
928        case XML_REGEXP_QUANT_EPSILON:
929	    fprintf(output, "epsilon "); break;
930        case XML_REGEXP_QUANT_ONCE:
931	    fprintf(output, "once "); break;
932        case XML_REGEXP_QUANT_OPT:
933	    fprintf(output, "? "); break;
934        case XML_REGEXP_QUANT_MULT:
935	    fprintf(output, "* "); break;
936        case XML_REGEXP_QUANT_PLUS:
937	    fprintf(output, "+ "); break;
938	case XML_REGEXP_QUANT_RANGE:
939	    fprintf(output, "range "); break;
940	case XML_REGEXP_QUANT_ONCEONLY:
941	    fprintf(output, "onceonly "); break;
942	case XML_REGEXP_QUANT_ALL:
943	    fprintf(output, "all "); break;
944    }
945}
946static void
947xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
948    fprintf(output, "  range: ");
949    if (range->neg)
950	fprintf(output, "negative ");
951    xmlRegPrintAtomType(output, range->type);
952    fprintf(output, "%c - %c\n", range->start, range->end);
953}
954
955static void
956xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
957    fprintf(output, " atom: ");
958    if (atom == NULL) {
959	fprintf(output, "NULL\n");
960	return;
961    }
962    xmlRegPrintAtomType(output, atom->type);
963    xmlRegPrintQuantType(output, atom->quant);
964    if (atom->quant == XML_REGEXP_QUANT_RANGE)
965	fprintf(output, "%d-%d ", atom->min, atom->max);
966    if (atom->type == XML_REGEXP_STRING)
967	fprintf(output, "'%s' ", (char *) atom->valuep);
968    if (atom->type == XML_REGEXP_CHARVAL)
969	fprintf(output, "char %c\n", atom->codepoint);
970    else if (atom->type == XML_REGEXP_RANGES) {
971	int i;
972	fprintf(output, "%d entries\n", atom->nbRanges);
973	for (i = 0; i < atom->nbRanges;i++)
974	    xmlRegPrintRange(output, atom->ranges[i]);
975    } else if (atom->type == XML_REGEXP_SUBREG) {
976	fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
977    } else {
978	fprintf(output, "\n");
979    }
980}
981
982static void
983xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
984    fprintf(output, "  trans: ");
985    if (trans == NULL) {
986	fprintf(output, "NULL\n");
987	return;
988    }
989    if (trans->to < 0) {
990	fprintf(output, "removed\n");
991	return;
992    }
993    if (trans->counter >= 0) {
994	fprintf(output, "counted %d, ", trans->counter);
995    }
996    if (trans->count == REGEXP_ALL_COUNTER) {
997	fprintf(output, "all transition, ");
998    } else if (trans->count >= 0) {
999	fprintf(output, "count based %d, ", trans->count);
1000    }
1001    if (trans->atom == NULL) {
1002	fprintf(output, "epsilon to %d\n", trans->to);
1003	return;
1004    }
1005    if (trans->atom->type == XML_REGEXP_CHARVAL)
1006	fprintf(output, "char %c ", trans->atom->codepoint);
1007    fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1008}
1009
1010static void
1011xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1012    int i;
1013
1014    fprintf(output, " state: ");
1015    if (state == NULL) {
1016	fprintf(output, "NULL\n");
1017	return;
1018    }
1019    if (state->type == XML_REGEXP_START_STATE)
1020	fprintf(output, "START ");
1021    if (state->type == XML_REGEXP_FINAL_STATE)
1022	fprintf(output, "FINAL ");
1023
1024    fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1025    for (i = 0;i < state->nbTrans; i++) {
1026	xmlRegPrintTrans(output, &(state->trans[i]));
1027    }
1028}
1029
1030#ifdef DEBUG_REGEXP_GRAPH
1031static void
1032xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
1033    int i;
1034
1035    fprintf(output, " ctxt: ");
1036    if (ctxt == NULL) {
1037	fprintf(output, "NULL\n");
1038	return;
1039    }
1040    fprintf(output, "'%s' ", ctxt->string);
1041    if (ctxt->error)
1042	fprintf(output, "error ");
1043    if (ctxt->neg)
1044	fprintf(output, "neg ");
1045    fprintf(output, "\n");
1046    fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
1047    for (i = 0;i < ctxt->nbAtoms; i++) {
1048	fprintf(output, " %02d ", i);
1049	xmlRegPrintAtom(output, ctxt->atoms[i]);
1050    }
1051    if (ctxt->atom != NULL) {
1052	fprintf(output, "current atom:\n");
1053	xmlRegPrintAtom(output, ctxt->atom);
1054    }
1055    fprintf(output, "%d states:", ctxt->nbStates);
1056    if (ctxt->start != NULL)
1057	fprintf(output, " start: %d", ctxt->start->no);
1058    if (ctxt->end != NULL)
1059	fprintf(output, " end: %d", ctxt->end->no);
1060    fprintf(output, "\n");
1061    for (i = 0;i < ctxt->nbStates; i++) {
1062	xmlRegPrintState(output, ctxt->states[i]);
1063    }
1064    fprintf(output, "%d counters:\n", ctxt->nbCounters);
1065    for (i = 0;i < ctxt->nbCounters; i++) {
1066	fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min,
1067		                                ctxt->counters[i].max);
1068    }
1069}
1070#endif
1071
1072/************************************************************************
1073 * 									*
1074 *		 Finite Automata structures manipulations		*
1075 * 									*
1076 ************************************************************************/
1077
1078static void
1079xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1080	           int neg, xmlRegAtomType type, int start, int end,
1081		   xmlChar *blockName) {
1082    xmlRegRangePtr range;
1083
1084    if (atom == NULL) {
1085	ERROR("add range: atom is NULL");
1086	return;
1087    }
1088    if (atom->type != XML_REGEXP_RANGES) {
1089	ERROR("add range: atom is not ranges");
1090	return;
1091    }
1092    if (atom->maxRanges == 0) {
1093	atom->maxRanges = 4;
1094	atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1095		                             sizeof(xmlRegRangePtr));
1096	if (atom->ranges == NULL) {
1097	    xmlRegexpErrMemory(ctxt, "adding ranges");
1098	    atom->maxRanges = 0;
1099	    return;
1100	}
1101    } else if (atom->nbRanges >= atom->maxRanges) {
1102	xmlRegRangePtr *tmp;
1103	atom->maxRanges *= 2;
1104	tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1105		                             sizeof(xmlRegRangePtr));
1106	if (tmp == NULL) {
1107	    xmlRegexpErrMemory(ctxt, "adding ranges");
1108	    atom->maxRanges /= 2;
1109	    return;
1110	}
1111	atom->ranges = tmp;
1112    }
1113    range = xmlRegNewRange(ctxt, neg, type, start, end);
1114    if (range == NULL)
1115	return;
1116    range->blockName = blockName;
1117    atom->ranges[atom->nbRanges++] = range;
1118
1119}
1120
1121static int
1122xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1123    if (ctxt->maxCounters == 0) {
1124	ctxt->maxCounters = 4;
1125	ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1126		                             sizeof(xmlRegCounter));
1127	if (ctxt->counters == NULL) {
1128	    xmlRegexpErrMemory(ctxt, "allocating counter");
1129	    ctxt->maxCounters = 0;
1130	    return(-1);
1131	}
1132    } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1133	xmlRegCounter *tmp;
1134	ctxt->maxCounters *= 2;
1135	tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1136		                           sizeof(xmlRegCounter));
1137	if (tmp == NULL) {
1138	    xmlRegexpErrMemory(ctxt, "allocating counter");
1139	    ctxt->maxCounters /= 2;
1140	    return(-1);
1141	}
1142	ctxt->counters = tmp;
1143    }
1144    ctxt->counters[ctxt->nbCounters].min = -1;
1145    ctxt->counters[ctxt->nbCounters].max = -1;
1146    return(ctxt->nbCounters++);
1147}
1148
1149static int
1150xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1151    if (atom == NULL) {
1152	ERROR("atom push: atom is NULL");
1153	return(-1);
1154    }
1155    if (ctxt->maxAtoms == 0) {
1156	ctxt->maxAtoms = 4;
1157	ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1158		                             sizeof(xmlRegAtomPtr));
1159	if (ctxt->atoms == NULL) {
1160	    xmlRegexpErrMemory(ctxt, "pushing atom");
1161	    ctxt->maxAtoms = 0;
1162	    return(-1);
1163	}
1164    } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1165	xmlRegAtomPtr *tmp;
1166	ctxt->maxAtoms *= 2;
1167	tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1168		                             sizeof(xmlRegAtomPtr));
1169	if (tmp == NULL) {
1170	    xmlRegexpErrMemory(ctxt, "allocating counter");
1171	    ctxt->maxAtoms /= 2;
1172	    return(-1);
1173	}
1174	ctxt->atoms = tmp;
1175    }
1176    atom->no = ctxt->nbAtoms;
1177    ctxt->atoms[ctxt->nbAtoms++] = atom;
1178    return(0);
1179}
1180
1181static void
1182xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1183	            xmlRegAtomPtr atom, xmlRegStatePtr target,
1184		    int counter, int count) {
1185    if (state == NULL) {
1186	ERROR("add state: state is NULL");
1187	return;
1188    }
1189    if (target == NULL) {
1190	ERROR("add state: target is NULL");
1191	return;
1192    }
1193    if (state->maxTrans == 0) {
1194	state->maxTrans = 4;
1195	state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1196		                             sizeof(xmlRegTrans));
1197	if (state->trans == NULL) {
1198	    xmlRegexpErrMemory(ctxt, "adding transition");
1199	    state->maxTrans = 0;
1200	    return;
1201	}
1202    } else if (state->nbTrans >= state->maxTrans) {
1203	xmlRegTrans *tmp;
1204	state->maxTrans *= 2;
1205	tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1206		                             sizeof(xmlRegTrans));
1207	if (tmp == NULL) {
1208	    xmlRegexpErrMemory(ctxt, "adding transition");
1209	    state->maxTrans /= 2;
1210	    return;
1211	}
1212	state->trans = tmp;
1213    }
1214#ifdef DEBUG_REGEXP_GRAPH
1215    printf("Add trans from %d to %d ", state->no, target->no);
1216    if (count == REGEXP_ALL_COUNTER)
1217	printf("all transition");
1218    else if (count >= 0)
1219	printf("count based %d", count);
1220    else if (counter >= 0)
1221	printf("counted %d", counter);
1222    else if (atom == NULL)
1223	printf("epsilon transition");
1224    printf("\n");
1225#endif
1226
1227    state->trans[state->nbTrans].atom = atom;
1228    state->trans[state->nbTrans].to = target->no;
1229    state->trans[state->nbTrans].counter = counter;
1230    state->trans[state->nbTrans].count = count;
1231    state->nbTrans++;
1232}
1233
1234static int
1235xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
1236    if (state == NULL) return(-1);
1237    if (ctxt->maxStates == 0) {
1238	ctxt->maxStates = 4;
1239	ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1240		                             sizeof(xmlRegStatePtr));
1241	if (ctxt->states == NULL) {
1242	    xmlRegexpErrMemory(ctxt, "adding state");
1243	    ctxt->maxStates = 0;
1244	    return(-1);
1245	}
1246    } else if (ctxt->nbStates >= ctxt->maxStates) {
1247	xmlRegStatePtr *tmp;
1248	ctxt->maxStates *= 2;
1249	tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1250		                             sizeof(xmlRegStatePtr));
1251	if (tmp == NULL) {
1252	    xmlRegexpErrMemory(ctxt, "adding state");
1253	    ctxt->maxStates /= 2;
1254	    return(-1);
1255	}
1256	ctxt->states = tmp;
1257    }
1258    state->no = ctxt->nbStates;
1259    ctxt->states[ctxt->nbStates++] = state;
1260    return(0);
1261}
1262
1263/**
1264 * xmlFAGenerateAllTransition:
1265 * @ctxt:  a regexp parser context
1266 * @from:  the from state
1267 * @to:  the target state or NULL for building a new one
1268 * @lax:
1269 *
1270 */
1271static void
1272xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
1273			   xmlRegStatePtr from, xmlRegStatePtr to,
1274			   int lax) {
1275    if (to == NULL) {
1276	to = xmlRegNewState(ctxt);
1277	xmlRegStatePush(ctxt, to);
1278	ctxt->state = to;
1279    }
1280    if (lax)
1281	xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
1282    else
1283	xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
1284}
1285
1286/**
1287 * xmlFAGenerateEpsilonTransition:
1288 * @ctxt:  a regexp parser context
1289 * @from:  the from state
1290 * @to:  the target state or NULL for building a new one
1291 *
1292 */
1293static void
1294xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1295			       xmlRegStatePtr from, xmlRegStatePtr to) {
1296    if (to == NULL) {
1297	to = xmlRegNewState(ctxt);
1298	xmlRegStatePush(ctxt, to);
1299	ctxt->state = to;
1300    }
1301    xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
1302}
1303
1304/**
1305 * xmlFAGenerateCountedEpsilonTransition:
1306 * @ctxt:  a regexp parser context
1307 * @from:  the from state
1308 * @to:  the target state or NULL for building a new one
1309 * counter:  the counter for that transition
1310 *
1311 */
1312static void
1313xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1314	    xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1315    if (to == NULL) {
1316	to = xmlRegNewState(ctxt);
1317	xmlRegStatePush(ctxt, to);
1318	ctxt->state = to;
1319    }
1320    xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
1321}
1322
1323/**
1324 * xmlFAGenerateCountedTransition:
1325 * @ctxt:  a regexp parser context
1326 * @from:  the from state
1327 * @to:  the target state or NULL for building a new one
1328 * counter:  the counter for that transition
1329 *
1330 */
1331static void
1332xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1333	    xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1334    if (to == NULL) {
1335	to = xmlRegNewState(ctxt);
1336	xmlRegStatePush(ctxt, to);
1337	ctxt->state = to;
1338    }
1339    xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
1340}
1341
1342/**
1343 * xmlFAGenerateTransitions:
1344 * @ctxt:  a regexp parser context
1345 * @from:  the from state
1346 * @to:  the target state or NULL for building a new one
1347 * @atom:  the atom generating the transition
1348 *
1349 * Returns 0 if succes and -1 in case of error.
1350 */
1351static int
1352xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1353	                 xmlRegStatePtr to, xmlRegAtomPtr atom) {
1354    if (atom == NULL) {
1355	ERROR("genrate transition: atom == NULL");
1356	return(-1);
1357    }
1358    if (atom->type == XML_REGEXP_SUBREG) {
1359	/*
1360	 * this is a subexpression handling one should not need to
1361	 * create a new node excep for XML_REGEXP_QUANT_RANGE.
1362	 */
1363	if (xmlRegAtomPush(ctxt, atom) < 0) {
1364	    return(-1);
1365	}
1366	if ((to != NULL) && (atom->stop != to) &&
1367	    (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1368	    /*
1369	     * Generate an epsilon transition to link to the target
1370	     */
1371	    xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1372	}
1373	switch (atom->quant) {
1374	    case XML_REGEXP_QUANT_OPT:
1375		atom->quant = XML_REGEXP_QUANT_ONCE;
1376		xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1377		break;
1378	    case XML_REGEXP_QUANT_MULT:
1379		atom->quant = XML_REGEXP_QUANT_ONCE;
1380		xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1381		xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1382		break;
1383	    case XML_REGEXP_QUANT_PLUS:
1384		atom->quant = XML_REGEXP_QUANT_ONCE;
1385		xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1386		break;
1387	    case XML_REGEXP_QUANT_RANGE: {
1388		int counter;
1389		xmlRegStatePtr newstate;
1390
1391		/*
1392		 * This one is nasty:
1393		 *   1/ register a new counter
1394		 *   2/ register an epsilon transition associated to
1395		 *      this counter going from atom->stop to atom->start
1396		 *   3/ create a new state
1397		 *   4/ generate a counted transition from atom->stop to
1398		 *      that state
1399		 */
1400		counter = xmlRegGetCounter(ctxt);
1401		ctxt->counters[counter].min = atom->min - 1;
1402		ctxt->counters[counter].max = atom->max - 1;
1403		atom->min = 0;
1404		atom->max = 0;
1405		atom->quant = XML_REGEXP_QUANT_ONCE;
1406		xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1407			                              atom->start, counter);
1408		if (to != NULL) {
1409		    newstate = to;
1410		} else {
1411		    newstate = xmlRegNewState(ctxt);
1412		    xmlRegStatePush(ctxt, newstate);
1413		    ctxt->state = newstate;
1414		}
1415		xmlFAGenerateCountedTransition(ctxt, atom->stop,
1416			                       newstate, counter);
1417	    }
1418	    default:
1419		break;
1420	}
1421	return(0);
1422    } else {
1423	if (to == NULL) {
1424	    to = xmlRegNewState(ctxt);
1425	    if (to != NULL)
1426		xmlRegStatePush(ctxt, to);
1427	    else {
1428		return(-1);
1429	    }
1430	}
1431	if (xmlRegAtomPush(ctxt, atom) < 0) {
1432	    return(-1);
1433	}
1434	xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
1435	ctxt->state = to;
1436    }
1437    switch (atom->quant) {
1438	case XML_REGEXP_QUANT_OPT:
1439	    atom->quant = XML_REGEXP_QUANT_ONCE;
1440	    xmlFAGenerateEpsilonTransition(ctxt, from, to);
1441	    break;
1442	case XML_REGEXP_QUANT_MULT:
1443	    atom->quant = XML_REGEXP_QUANT_ONCE;
1444	    xmlFAGenerateEpsilonTransition(ctxt, from, to);
1445	    xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1446	    break;
1447	case XML_REGEXP_QUANT_PLUS:
1448	    atom->quant = XML_REGEXP_QUANT_ONCE;
1449	    xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1450	    break;
1451	default:
1452	    break;
1453    }
1454    return(0);
1455}
1456
1457/**
1458 * xmlFAReduceEpsilonTransitions:
1459 * @ctxt:  a regexp parser context
1460 * @fromnr:  the from state
1461 * @tonr:  the to state
1462 * @cpunter:  should that transition be associted to a counted
1463 *
1464 */
1465static void
1466xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1467	                      int tonr, int counter) {
1468    int transnr;
1469    xmlRegStatePtr from;
1470    xmlRegStatePtr to;
1471
1472#ifdef DEBUG_REGEXP_GRAPH
1473    printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1474#endif
1475    from = ctxt->states[fromnr];
1476    if (from == NULL)
1477	return;
1478    to = ctxt->states[tonr];
1479    if (to == NULL)
1480	return;
1481    if ((to->mark == XML_REGEXP_MARK_START) ||
1482	(to->mark == XML_REGEXP_MARK_VISITED))
1483	return;
1484
1485    to->mark = XML_REGEXP_MARK_VISITED;
1486    if (to->type == XML_REGEXP_FINAL_STATE) {
1487#ifdef DEBUG_REGEXP_GRAPH
1488	printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1489#endif
1490	from->type = XML_REGEXP_FINAL_STATE;
1491    }
1492    for (transnr = 0;transnr < to->nbTrans;transnr++) {
1493	if (to->trans[transnr].atom == NULL) {
1494	    /*
1495	     * Don't remove counted transitions
1496	     * Don't loop either
1497	     */
1498	    if (to->trans[transnr].to != fromnr) {
1499		if (to->trans[transnr].count >= 0) {
1500		    int newto = to->trans[transnr].to;
1501
1502		    xmlRegStateAddTrans(ctxt, from, NULL,
1503					ctxt->states[newto],
1504					-1, to->trans[transnr].count);
1505		} else {
1506#ifdef DEBUG_REGEXP_GRAPH
1507		    printf("Found epsilon trans %d from %d to %d\n",
1508			   transnr, tonr, to->trans[transnr].to);
1509#endif
1510		    if (to->trans[transnr].counter >= 0) {
1511			xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1512					      to->trans[transnr].to,
1513					      to->trans[transnr].counter);
1514		    } else {
1515			xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1516					      to->trans[transnr].to,
1517					      counter);
1518		    }
1519		}
1520	    }
1521	} else {
1522	    int newto = to->trans[transnr].to;
1523
1524	    if (to->trans[transnr].counter >= 0) {
1525		xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1526				    ctxt->states[newto],
1527				    to->trans[transnr].counter, -1);
1528	    } else {
1529		xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1530				    ctxt->states[newto], counter, -1);
1531	    }
1532	}
1533    }
1534    to->mark = XML_REGEXP_MARK_NORMAL;
1535}
1536
1537/**
1538 * xmlFAEliminateEpsilonTransitions:
1539 * @ctxt:  a regexp parser context
1540 *
1541 */
1542static void
1543xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1544    int statenr, transnr;
1545    xmlRegStatePtr state;
1546
1547    if (ctxt->states == NULL) return;
1548
1549
1550    /*
1551     * build the completed transitions bypassing the epsilons
1552     * Use a marking algorithm to avoid loops
1553     */
1554    for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1555	state = ctxt->states[statenr];
1556	if (state == NULL)
1557	    continue;
1558	for (transnr = 0;transnr < state->nbTrans;transnr++) {
1559	    if ((state->trans[transnr].atom == NULL) &&
1560		(state->trans[transnr].to >= 0)) {
1561		if (state->trans[transnr].to == statenr) {
1562		    state->trans[transnr].to = -1;
1563#ifdef DEBUG_REGEXP_GRAPH
1564		    printf("Removed loopback epsilon trans %d on %d\n",
1565			   transnr, statenr);
1566#endif
1567		} else if (state->trans[transnr].count < 0) {
1568		    int newto = state->trans[transnr].to;
1569
1570#ifdef DEBUG_REGEXP_GRAPH
1571		    printf("Found epsilon trans %d from %d to %d\n",
1572			   transnr, statenr, newto);
1573#endif
1574		    state->mark = XML_REGEXP_MARK_START;
1575		    xmlFAReduceEpsilonTransitions(ctxt, statenr,
1576				      newto, state->trans[transnr].counter);
1577		    state->mark = XML_REGEXP_MARK_NORMAL;
1578#ifdef DEBUG_REGEXP_GRAPH
1579		} else {
1580		    printf("Found counted transition %d on %d\n",
1581			   transnr, statenr);
1582#endif
1583	        }
1584	    }
1585	}
1586    }
1587    /*
1588     * Eliminate the epsilon transitions
1589     */
1590    for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1591	state = ctxt->states[statenr];
1592	if (state == NULL)
1593	    continue;
1594	for (transnr = 0;transnr < state->nbTrans;transnr++) {
1595	    if ((state->trans[transnr].atom == NULL) &&
1596		(state->trans[transnr].count < 0) &&
1597		(state->trans[transnr].to >= 0)) {
1598		state->trans[transnr].to = -1;
1599	    }
1600	}
1601    }
1602
1603    /*
1604     * Use this pass to detect unreachable states too
1605     */
1606    for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1607	state = ctxt->states[statenr];
1608	if (state != NULL)
1609	    state->reached = XML_REGEXP_MARK_NORMAL;
1610    }
1611    state = ctxt->states[0];
1612    if (state != NULL)
1613	state->reached = XML_REGEXP_MARK_START;
1614    while (state != NULL) {
1615	xmlRegStatePtr target = NULL;
1616	state->reached = XML_REGEXP_MARK_VISITED;
1617	/*
1618	 * Mark all state reachable from the current reachable state
1619	 */
1620	for (transnr = 0;transnr < state->nbTrans;transnr++) {
1621	    if ((state->trans[transnr].to >= 0) &&
1622		((state->trans[transnr].atom != NULL) ||
1623		 (state->trans[transnr].count >= 0))) {
1624		int newto = state->trans[transnr].to;
1625
1626		if (ctxt->states[newto] == NULL)
1627		    continue;
1628		if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
1629		    ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
1630		    target = ctxt->states[newto];
1631		}
1632	    }
1633	}
1634	/*
1635	 * find the next accessible state not explored
1636	 */
1637	if (target == NULL) {
1638	    for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
1639		state = ctxt->states[statenr];
1640		if ((state != NULL) && (state->reached ==
1641			XML_REGEXP_MARK_START)) {
1642		    target = state;
1643		    break;
1644		}
1645	    }
1646	}
1647	state = target;
1648    }
1649    for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1650	state = ctxt->states[statenr];
1651	if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
1652#ifdef DEBUG_REGEXP_GRAPH
1653	    printf("Removed unreachable state %d\n", statenr);
1654#endif
1655	    xmlRegFreeState(state);
1656	    ctxt->states[statenr] = NULL;
1657	}
1658    }
1659
1660}
1661
1662/**
1663 * xmlFACompareAtoms:
1664 * @atom1:  an atom
1665 * @atom2:  an atom
1666 *
1667 * Compares two atoms to check whether they are equivatents
1668 *
1669 * Returns 1 if yes and 0 otherwise
1670 */
1671static int
1672xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) {
1673    if (atom1 == atom2)
1674	return(1);
1675    if ((atom1 == NULL) || (atom2 == NULL))
1676	return(0);
1677
1678    if (atom1->type != atom2->type)
1679	return(0);
1680    switch (atom1->type) {
1681        case XML_REGEXP_STRING:
1682	    return(xmlStrEqual((xmlChar *)atom1->valuep,
1683			       (xmlChar *)atom2->valuep));
1684        case XML_REGEXP_EPSILON:
1685	    return(1);
1686        case XML_REGEXP_CHARVAL:
1687	    return(atom1->codepoint == atom2->codepoint);
1688        case XML_REGEXP_RANGES:
1689	    TODO;
1690	    return(0);
1691	default:
1692	    break;
1693    }
1694    return(1);
1695}
1696
1697/**
1698 * xmlFARecurseDeterminism:
1699 * @ctxt:  a regexp parser context
1700 *
1701 * Check whether the associated regexp is determinist,
1702 * should be called after xmlFAEliminateEpsilonTransitions()
1703 *
1704 */
1705static int
1706xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1707	                 int to, xmlRegAtomPtr atom) {
1708    int ret = 1;
1709    int transnr;
1710    xmlRegTransPtr t1;
1711
1712    if (state == NULL)
1713	return(ret);
1714    for (transnr = 0;transnr < state->nbTrans;transnr++) {
1715	t1 = &(state->trans[transnr]);
1716	/*
1717	 * check transitions conflicting with the one looked at
1718	 */
1719	if (t1->atom == NULL) {
1720	    if (t1->to == -1)
1721		continue;
1722	    ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
1723		                           to, atom);
1724	    if (ret == 0)
1725		return(0);
1726	    continue;
1727	}
1728	if (t1->to != to)
1729	    continue;
1730	if (xmlFACompareAtoms(t1->atom, atom))
1731	    return(0);
1732    }
1733    return(ret);
1734}
1735
1736/**
1737 * xmlFAComputesDeterminism:
1738 * @ctxt:  a regexp parser context
1739 *
1740 * Check whether the associated regexp is determinist,
1741 * should be called after xmlFAEliminateEpsilonTransitions()
1742 *
1743 */
1744static int
1745xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
1746    int statenr, transnr;
1747    xmlRegStatePtr state;
1748    xmlRegTransPtr t1, t2;
1749    int i;
1750    int ret = 1;
1751
1752#ifdef DEBUG_REGEXP_GRAPH
1753    printf("xmlFAComputesDeterminism\n");
1754    xmlRegPrintCtxt(stdout, ctxt);
1755#endif
1756    if (ctxt->determinist != -1)
1757	return(ctxt->determinist);
1758
1759    /*
1760     * Check for all states that there isn't 2 transitions
1761     * with the same atom and a different target.
1762     */
1763    for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1764	state = ctxt->states[statenr];
1765	if (state == NULL)
1766	    continue;
1767	for (transnr = 0;transnr < state->nbTrans;transnr++) {
1768	    t1 = &(state->trans[transnr]);
1769	    /*
1770	     * Determinism checks in case of counted or all transitions
1771	     * will have to be handled separately
1772	     */
1773	    if (t1->atom == NULL)
1774		continue;
1775	    if (t1->to == -1) /* eliminated */
1776		continue;
1777	    for (i = 0;i < transnr;i++) {
1778		t2 = &(state->trans[i]);
1779		if (t2->to == -1) /* eliminated */
1780		    continue;
1781		if (t2->atom != NULL) {
1782		    if (t1->to == t2->to) {
1783			if (xmlFACompareAtoms(t1->atom, t2->atom))
1784			    t2->to = -1; /* eliminate */
1785		    } else {
1786			/* not determinist ! */
1787			if (xmlFACompareAtoms(t1->atom, t2->atom))
1788			    ret = 0;
1789		    }
1790		} else if (t1->to != -1) {
1791		    /*
1792		     * do the closure in case of remaining specific
1793		     * epsilon transitions like choices or all
1794		     */
1795		    ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
1796						   t2->to, t2->atom);
1797		    if (ret == 0)
1798			return(0);
1799		}
1800	    }
1801	    if (ret == 0)
1802		break;
1803	}
1804	if (ret == 0)
1805	    break;
1806    }
1807    ctxt->determinist = ret;
1808    return(ret);
1809}
1810
1811/************************************************************************
1812 * 									*
1813 *	Routines to check input against transition atoms		*
1814 * 									*
1815 ************************************************************************/
1816
1817static int
1818xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
1819	                  int start, int end, const xmlChar *blockName) {
1820    int ret = 0;
1821
1822    switch (type) {
1823        case XML_REGEXP_STRING:
1824        case XML_REGEXP_SUBREG:
1825        case XML_REGEXP_RANGES:
1826        case XML_REGEXP_EPSILON:
1827	    return(-1);
1828        case XML_REGEXP_ANYCHAR:
1829	    ret = ((codepoint != '\n') && (codepoint != '\r'));
1830	    break;
1831        case XML_REGEXP_CHARVAL:
1832	    ret = ((codepoint >= start) && (codepoint <= end));
1833	    break;
1834        case XML_REGEXP_NOTSPACE:
1835	    neg = !neg;
1836        case XML_REGEXP_ANYSPACE:
1837	    ret = ((codepoint == '\n') || (codepoint == '\r') ||
1838		   (codepoint == '\t') || (codepoint == ' '));
1839	    break;
1840        case XML_REGEXP_NOTINITNAME:
1841	    neg = !neg;
1842        case XML_REGEXP_INITNAME:
1843	    ret = (IS_LETTER(codepoint) ||
1844		   (codepoint == '_') || (codepoint == ':'));
1845	    break;
1846        case XML_REGEXP_NOTNAMECHAR:
1847	    neg = !neg;
1848        case XML_REGEXP_NAMECHAR:
1849	    ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
1850		   (codepoint == '.') || (codepoint == '-') ||
1851		   (codepoint == '_') || (codepoint == ':') ||
1852		   IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
1853	    break;
1854        case XML_REGEXP_NOTDECIMAL:
1855	    neg = !neg;
1856        case XML_REGEXP_DECIMAL:
1857	    ret = xmlUCSIsCatNd(codepoint);
1858	    break;
1859        case XML_REGEXP_REALCHAR:
1860	    neg = !neg;
1861        case XML_REGEXP_NOTREALCHAR:
1862	    ret = xmlUCSIsCatP(codepoint);
1863	    if (ret == 0)
1864		ret = xmlUCSIsCatZ(codepoint);
1865	    if (ret == 0)
1866		ret = xmlUCSIsCatC(codepoint);
1867	    break;
1868        case XML_REGEXP_LETTER:
1869	    ret = xmlUCSIsCatL(codepoint);
1870	    break;
1871        case XML_REGEXP_LETTER_UPPERCASE:
1872	    ret = xmlUCSIsCatLu(codepoint);
1873	    break;
1874        case XML_REGEXP_LETTER_LOWERCASE:
1875	    ret = xmlUCSIsCatLl(codepoint);
1876	    break;
1877        case XML_REGEXP_LETTER_TITLECASE:
1878	    ret = xmlUCSIsCatLt(codepoint);
1879	    break;
1880        case XML_REGEXP_LETTER_MODIFIER:
1881	    ret = xmlUCSIsCatLm(codepoint);
1882	    break;
1883        case XML_REGEXP_LETTER_OTHERS:
1884	    ret = xmlUCSIsCatLo(codepoint);
1885	    break;
1886        case XML_REGEXP_MARK:
1887	    ret = xmlUCSIsCatM(codepoint);
1888	    break;
1889        case XML_REGEXP_MARK_NONSPACING:
1890	    ret = xmlUCSIsCatMn(codepoint);
1891	    break;
1892        case XML_REGEXP_MARK_SPACECOMBINING:
1893	    ret = xmlUCSIsCatMc(codepoint);
1894	    break;
1895        case XML_REGEXP_MARK_ENCLOSING:
1896	    ret = xmlUCSIsCatMe(codepoint);
1897	    break;
1898        case XML_REGEXP_NUMBER:
1899	    ret = xmlUCSIsCatN(codepoint);
1900	    break;
1901        case XML_REGEXP_NUMBER_DECIMAL:
1902	    ret = xmlUCSIsCatNd(codepoint);
1903	    break;
1904        case XML_REGEXP_NUMBER_LETTER:
1905	    ret = xmlUCSIsCatNl(codepoint);
1906	    break;
1907        case XML_REGEXP_NUMBER_OTHERS:
1908	    ret = xmlUCSIsCatNo(codepoint);
1909	    break;
1910        case XML_REGEXP_PUNCT:
1911	    ret = xmlUCSIsCatP(codepoint);
1912	    break;
1913        case XML_REGEXP_PUNCT_CONNECTOR:
1914	    ret = xmlUCSIsCatPc(codepoint);
1915	    break;
1916        case XML_REGEXP_PUNCT_DASH:
1917	    ret = xmlUCSIsCatPd(codepoint);
1918	    break;
1919        case XML_REGEXP_PUNCT_OPEN:
1920	    ret = xmlUCSIsCatPs(codepoint);
1921	    break;
1922        case XML_REGEXP_PUNCT_CLOSE:
1923	    ret = xmlUCSIsCatPe(codepoint);
1924	    break;
1925        case XML_REGEXP_PUNCT_INITQUOTE:
1926	    ret = xmlUCSIsCatPi(codepoint);
1927	    break;
1928        case XML_REGEXP_PUNCT_FINQUOTE:
1929	    ret = xmlUCSIsCatPf(codepoint);
1930	    break;
1931        case XML_REGEXP_PUNCT_OTHERS:
1932	    ret = xmlUCSIsCatPo(codepoint);
1933	    break;
1934        case XML_REGEXP_SEPAR:
1935	    ret = xmlUCSIsCatZ(codepoint);
1936	    break;
1937        case XML_REGEXP_SEPAR_SPACE:
1938	    ret = xmlUCSIsCatZs(codepoint);
1939	    break;
1940        case XML_REGEXP_SEPAR_LINE:
1941	    ret = xmlUCSIsCatZl(codepoint);
1942	    break;
1943        case XML_REGEXP_SEPAR_PARA:
1944	    ret = xmlUCSIsCatZp(codepoint);
1945	    break;
1946        case XML_REGEXP_SYMBOL:
1947	    ret = xmlUCSIsCatS(codepoint);
1948	    break;
1949        case XML_REGEXP_SYMBOL_MATH:
1950	    ret = xmlUCSIsCatSm(codepoint);
1951	    break;
1952        case XML_REGEXP_SYMBOL_CURRENCY:
1953	    ret = xmlUCSIsCatSc(codepoint);
1954	    break;
1955        case XML_REGEXP_SYMBOL_MODIFIER:
1956	    ret = xmlUCSIsCatSk(codepoint);
1957	    break;
1958        case XML_REGEXP_SYMBOL_OTHERS:
1959	    ret = xmlUCSIsCatSo(codepoint);
1960	    break;
1961        case XML_REGEXP_OTHER:
1962	    ret = xmlUCSIsCatC(codepoint);
1963	    break;
1964        case XML_REGEXP_OTHER_CONTROL:
1965	    ret = xmlUCSIsCatCc(codepoint);
1966	    break;
1967        case XML_REGEXP_OTHER_FORMAT:
1968	    ret = xmlUCSIsCatCf(codepoint);
1969	    break;
1970        case XML_REGEXP_OTHER_PRIVATE:
1971	    ret = xmlUCSIsCatCo(codepoint);
1972	    break;
1973        case XML_REGEXP_OTHER_NA:
1974	    /* ret = xmlUCSIsCatCn(codepoint); */
1975	    /* Seems it doesn't exist anymore in recent Unicode releases */
1976	    ret = 0;
1977	    break;
1978        case XML_REGEXP_BLOCK_NAME:
1979	    ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
1980	    break;
1981    }
1982    if (neg)
1983	return(!ret);
1984    return(ret);
1985}
1986
1987static int
1988xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
1989    int i, ret = 0;
1990    xmlRegRangePtr range;
1991
1992    if ((atom == NULL) || (!IS_CHAR(codepoint)))
1993	return(-1);
1994
1995    switch (atom->type) {
1996        case XML_REGEXP_SUBREG:
1997        case XML_REGEXP_EPSILON:
1998	    return(-1);
1999        case XML_REGEXP_CHARVAL:
2000            return(codepoint == atom->codepoint);
2001        case XML_REGEXP_RANGES: {
2002	    int accept = 0;
2003
2004	    for (i = 0;i < atom->nbRanges;i++) {
2005		range = atom->ranges[i];
2006		if (range->neg == 2) {
2007		    ret = xmlRegCheckCharacterRange(range->type, codepoint,
2008						0, range->start, range->end,
2009						range->blockName);
2010		    if (ret != 0)
2011			return(0); /* excluded char */
2012		} else if (range->neg) {
2013		    ret = xmlRegCheckCharacterRange(range->type, codepoint,
2014						0, range->start, range->end,
2015						range->blockName);
2016		    if (ret == 0)
2017		        accept = 1;
2018		    else
2019		        return(0);
2020		} else {
2021		    ret = xmlRegCheckCharacterRange(range->type, codepoint,
2022						0, range->start, range->end,
2023						range->blockName);
2024		    if (ret != 0)
2025			accept = 1; /* might still be excluded */
2026		}
2027	    }
2028	    return(accept);
2029	}
2030        case XML_REGEXP_STRING:
2031	    printf("TODO: XML_REGEXP_STRING\n");
2032	    return(-1);
2033        case XML_REGEXP_ANYCHAR:
2034        case XML_REGEXP_ANYSPACE:
2035        case XML_REGEXP_NOTSPACE:
2036        case XML_REGEXP_INITNAME:
2037        case XML_REGEXP_NOTINITNAME:
2038        case XML_REGEXP_NAMECHAR:
2039        case XML_REGEXP_NOTNAMECHAR:
2040        case XML_REGEXP_DECIMAL:
2041        case XML_REGEXP_NOTDECIMAL:
2042        case XML_REGEXP_REALCHAR:
2043        case XML_REGEXP_NOTREALCHAR:
2044        case XML_REGEXP_LETTER:
2045        case XML_REGEXP_LETTER_UPPERCASE:
2046        case XML_REGEXP_LETTER_LOWERCASE:
2047        case XML_REGEXP_LETTER_TITLECASE:
2048        case XML_REGEXP_LETTER_MODIFIER:
2049        case XML_REGEXP_LETTER_OTHERS:
2050        case XML_REGEXP_MARK:
2051        case XML_REGEXP_MARK_NONSPACING:
2052        case XML_REGEXP_MARK_SPACECOMBINING:
2053        case XML_REGEXP_MARK_ENCLOSING:
2054        case XML_REGEXP_NUMBER:
2055        case XML_REGEXP_NUMBER_DECIMAL:
2056        case XML_REGEXP_NUMBER_LETTER:
2057        case XML_REGEXP_NUMBER_OTHERS:
2058        case XML_REGEXP_PUNCT:
2059        case XML_REGEXP_PUNCT_CONNECTOR:
2060        case XML_REGEXP_PUNCT_DASH:
2061        case XML_REGEXP_PUNCT_OPEN:
2062        case XML_REGEXP_PUNCT_CLOSE:
2063        case XML_REGEXP_PUNCT_INITQUOTE:
2064        case XML_REGEXP_PUNCT_FINQUOTE:
2065        case XML_REGEXP_PUNCT_OTHERS:
2066        case XML_REGEXP_SEPAR:
2067        case XML_REGEXP_SEPAR_SPACE:
2068        case XML_REGEXP_SEPAR_LINE:
2069        case XML_REGEXP_SEPAR_PARA:
2070        case XML_REGEXP_SYMBOL:
2071        case XML_REGEXP_SYMBOL_MATH:
2072        case XML_REGEXP_SYMBOL_CURRENCY:
2073        case XML_REGEXP_SYMBOL_MODIFIER:
2074        case XML_REGEXP_SYMBOL_OTHERS:
2075        case XML_REGEXP_OTHER:
2076        case XML_REGEXP_OTHER_CONTROL:
2077        case XML_REGEXP_OTHER_FORMAT:
2078        case XML_REGEXP_OTHER_PRIVATE:
2079        case XML_REGEXP_OTHER_NA:
2080	case XML_REGEXP_BLOCK_NAME:
2081	    ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
2082		                            (const xmlChar *)atom->valuep);
2083	    if (atom->neg)
2084		ret = !ret;
2085	    break;
2086    }
2087    return(ret);
2088}
2089
2090/************************************************************************
2091 * 									*
2092 *	Saving an restoring state of an execution context		*
2093 * 									*
2094 ************************************************************************/
2095
2096#ifdef DEBUG_REGEXP_EXEC
2097static void
2098xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
2099    printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
2100    if (exec->inputStack != NULL) {
2101	int i;
2102	printf(": ");
2103	for (i = 0;(i < 3) && (i < exec->inputStackNr);i++)
2104	    printf("%s ", exec->inputStack[exec->inputStackNr - (i + 1)]);
2105    } else {
2106	printf(": %s", &(exec->inputString[exec->index]));
2107    }
2108    printf("\n");
2109}
2110#endif
2111
2112static void
2113xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
2114#ifdef DEBUG_REGEXP_EXEC
2115    printf("saving ");
2116    exec->transno++;
2117    xmlFARegDebugExec(exec);
2118    exec->transno--;
2119#endif
2120
2121    if (exec->maxRollbacks == 0) {
2122	exec->maxRollbacks = 4;
2123	exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
2124		                             sizeof(xmlRegExecRollback));
2125	if (exec->rollbacks == NULL) {
2126	    xmlRegexpErrMemory(NULL, "saving regexp");
2127	    exec->maxRollbacks = 0;
2128	    return;
2129	}
2130	memset(exec->rollbacks, 0,
2131	       exec->maxRollbacks * sizeof(xmlRegExecRollback));
2132    } else if (exec->nbRollbacks >= exec->maxRollbacks) {
2133	xmlRegExecRollback *tmp;
2134	int len = exec->maxRollbacks;
2135
2136	exec->maxRollbacks *= 2;
2137	tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
2138			exec->maxRollbacks * sizeof(xmlRegExecRollback));
2139	if (tmp == NULL) {
2140	    xmlRegexpErrMemory(NULL, "saving regexp");
2141	    exec->maxRollbacks /= 2;
2142	    return;
2143	}
2144	exec->rollbacks = tmp;
2145	tmp = &exec->rollbacks[len];
2146	memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
2147    }
2148    exec->rollbacks[exec->nbRollbacks].state = exec->state;
2149    exec->rollbacks[exec->nbRollbacks].index = exec->index;
2150    exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
2151    if (exec->comp->nbCounters > 0) {
2152	if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2153	    exec->rollbacks[exec->nbRollbacks].counts = (int *)
2154		xmlMalloc(exec->comp->nbCounters * sizeof(int));
2155	    if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2156		xmlRegexpErrMemory(NULL, "saving regexp");
2157		exec->status = -5;
2158		return;
2159	    }
2160	}
2161	memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
2162	       exec->comp->nbCounters * sizeof(int));
2163    }
2164    exec->nbRollbacks++;
2165}
2166
2167static void
2168xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
2169    if (exec->nbRollbacks <= 0) {
2170	exec->status = -1;
2171#ifdef DEBUG_REGEXP_EXEC
2172	printf("rollback failed on empty stack\n");
2173#endif
2174	return;
2175    }
2176    exec->nbRollbacks--;
2177    exec->state = exec->rollbacks[exec->nbRollbacks].state;
2178    exec->index = exec->rollbacks[exec->nbRollbacks].index;
2179    exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
2180    if (exec->comp->nbCounters > 0) {
2181	if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
2182	    fprintf(stderr, "exec save: allocation failed");
2183	    exec->status = -6;
2184	    return;
2185	}
2186	memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
2187	       exec->comp->nbCounters * sizeof(int));
2188    }
2189
2190#ifdef DEBUG_REGEXP_EXEC
2191    printf("restored ");
2192    xmlFARegDebugExec(exec);
2193#endif
2194}
2195
2196/************************************************************************
2197 * 									*
2198 *	Verifyer, running an input against a compiled regexp		*
2199 * 									*
2200 ************************************************************************/
2201
2202static int
2203xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
2204    xmlRegExecCtxt execval;
2205    xmlRegExecCtxtPtr exec = &execval;
2206    int ret, codepoint, len;
2207
2208    exec->inputString = content;
2209    exec->index = 0;
2210    exec->determinist = 1;
2211    exec->maxRollbacks = 0;
2212    exec->nbRollbacks = 0;
2213    exec->rollbacks = NULL;
2214    exec->status = 0;
2215    exec->comp = comp;
2216    exec->state = comp->states[0];
2217    exec->transno = 0;
2218    exec->transcount = 0;
2219    exec->inputStack = NULL;
2220    exec->inputStackMax = 0;
2221    if (comp->nbCounters > 0) {
2222	exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
2223	if (exec->counts == NULL) {
2224	    xmlRegexpErrMemory(NULL, "running regexp");
2225	    return(-1);
2226	}
2227        memset(exec->counts, 0, comp->nbCounters * sizeof(int));
2228    } else
2229	exec->counts = NULL;
2230    while ((exec->status == 0) &&
2231	   ((exec->inputString[exec->index] != 0) ||
2232	    (exec->state->type != XML_REGEXP_FINAL_STATE))) {
2233	xmlRegTransPtr trans;
2234	xmlRegAtomPtr atom;
2235
2236	/*
2237	 * End of input on non-terminal state, rollback, however we may
2238	 * still have epsilon like transition for counted transitions
2239	 * on counters, in that case don't break too early.
2240	 */
2241	if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
2242	    goto rollback;
2243
2244	exec->transcount = 0;
2245	for (;exec->transno < exec->state->nbTrans;exec->transno++) {
2246	    trans = &exec->state->trans[exec->transno];
2247	    if (trans->to < 0)
2248		continue;
2249	    atom = trans->atom;
2250	    ret = 0;
2251	    if (trans->count >= 0) {
2252		int count;
2253		xmlRegCounterPtr counter;
2254
2255		/*
2256		 * A counted transition.
2257		 */
2258
2259		count = exec->counts[trans->count];
2260		counter = &exec->comp->counters[trans->count];
2261#ifdef DEBUG_REGEXP_EXEC
2262		printf("testing count %d: val %d, min %d, max %d\n",
2263		       trans->count, count, counter->min,  counter->max);
2264#endif
2265		ret = ((count >= counter->min) && (count <= counter->max));
2266	    } else if (atom == NULL) {
2267		fprintf(stderr, "epsilon transition left at runtime\n");
2268		exec->status = -2;
2269		break;
2270	    } else if (exec->inputString[exec->index] != 0) {
2271                codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
2272		ret = xmlRegCheckCharacter(atom, codepoint);
2273		if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
2274		    xmlRegStatePtr to = comp->states[trans->to];
2275
2276		    /*
2277		     * this is a multiple input sequence
2278		     */
2279		    if (exec->state->nbTrans > exec->transno + 1) {
2280			xmlFARegExecSave(exec);
2281		    }
2282		    exec->transcount = 1;
2283		    do {
2284			/*
2285			 * Try to progress as much as possible on the input
2286			 */
2287			if (exec->transcount == atom->max) {
2288			    break;
2289			}
2290			exec->index += len;
2291			/*
2292			 * End of input: stop here
2293			 */
2294			if (exec->inputString[exec->index] == 0) {
2295			    exec->index -= len;
2296			    break;
2297			}
2298			if (exec->transcount >= atom->min) {
2299			    int transno = exec->transno;
2300			    xmlRegStatePtr state = exec->state;
2301
2302			    /*
2303			     * The transition is acceptable save it
2304			     */
2305			    exec->transno = -1; /* trick */
2306			    exec->state = to;
2307			    xmlFARegExecSave(exec);
2308			    exec->transno = transno;
2309			    exec->state = state;
2310			}
2311			codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
2312				              len);
2313			ret = xmlRegCheckCharacter(atom, codepoint);
2314			exec->transcount++;
2315		    } while (ret == 1);
2316		    if (exec->transcount < atom->min)
2317			ret = 0;
2318
2319		    /*
2320		     * If the last check failed but one transition was found
2321		     * possible, rollback
2322		     */
2323		    if (ret < 0)
2324			ret = 0;
2325		    if (ret == 0) {
2326			goto rollback;
2327		    }
2328		}
2329	    }
2330	    if (ret == 1) {
2331		if (exec->state->nbTrans > exec->transno + 1) {
2332		    xmlFARegExecSave(exec);
2333		}
2334		if (trans->counter >= 0) {
2335#ifdef DEBUG_REGEXP_EXEC
2336		    printf("Increasing count %d\n", trans->counter);
2337#endif
2338		    exec->counts[trans->counter]++;
2339		}
2340#ifdef DEBUG_REGEXP_EXEC
2341		printf("entering state %d\n", trans->to);
2342#endif
2343		exec->state = comp->states[trans->to];
2344		exec->transno = 0;
2345		if (trans->atom != NULL) {
2346		    exec->index += len;
2347		}
2348		goto progress;
2349	    } else if (ret < 0) {
2350		exec->status = -4;
2351		break;
2352	    }
2353	}
2354	if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
2355rollback:
2356	    /*
2357	     * Failed to find a way out
2358	     */
2359	    exec->determinist = 0;
2360	    xmlFARegExecRollBack(exec);
2361	}
2362progress:
2363	continue;
2364    }
2365    if (exec->rollbacks != NULL) {
2366	if (exec->counts != NULL) {
2367	    int i;
2368
2369	    for (i = 0;i < exec->maxRollbacks;i++)
2370		if (exec->rollbacks[i].counts != NULL)
2371		    xmlFree(exec->rollbacks[i].counts);
2372	}
2373	xmlFree(exec->rollbacks);
2374    }
2375    if (exec->counts != NULL)
2376	xmlFree(exec->counts);
2377    if (exec->status == 0)
2378	return(1);
2379    if (exec->status == -1)
2380	return(0);
2381    return(exec->status);
2382}
2383
2384/************************************************************************
2385 * 									*
2386 *	Progressive interface to the verifyer one atom at a time	*
2387 * 									*
2388 ************************************************************************/
2389
2390/**
2391 * xmlRegNewExecCtxt:
2392 * @comp: a precompiled regular expression
2393 * @callback: a callback function used for handling progresses in the
2394 *            automata matching phase
2395 * @data: the context data associated to the callback in this context
2396 *
2397 * Build a context used for progressive evaluation of a regexp.
2398 *
2399 * Returns the new context
2400 */
2401xmlRegExecCtxtPtr
2402xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
2403    xmlRegExecCtxtPtr exec;
2404
2405    if (comp == NULL)
2406	return(NULL);
2407    if ((comp->compact == NULL) && (comp->states == NULL))
2408        return(NULL);
2409    exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
2410    if (exec == NULL) {
2411	xmlRegexpErrMemory(NULL, "creating execution context");
2412	return(NULL);
2413    }
2414    memset(exec, 0, sizeof(xmlRegExecCtxt));
2415    exec->inputString = NULL;
2416    exec->index = 0;
2417    exec->determinist = 1;
2418    exec->maxRollbacks = 0;
2419    exec->nbRollbacks = 0;
2420    exec->rollbacks = NULL;
2421    exec->status = 0;
2422    exec->comp = comp;
2423    if (comp->compact == NULL)
2424	exec->state = comp->states[0];
2425    exec->transno = 0;
2426    exec->transcount = 0;
2427    exec->callback = callback;
2428    exec->data = data;
2429    if (comp->nbCounters > 0) {
2430	exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
2431	if (exec->counts == NULL) {
2432	    xmlRegexpErrMemory(NULL, "creating execution context");
2433	    xmlFree(exec);
2434	    return(NULL);
2435	}
2436        memset(exec->counts, 0, comp->nbCounters * sizeof(int));
2437    } else
2438	exec->counts = NULL;
2439    exec->inputStackMax = 0;
2440    exec->inputStackNr = 0;
2441    exec->inputStack = NULL;
2442    return(exec);
2443}
2444
2445/**
2446 * xmlRegFreeExecCtxt:
2447 * @exec: a regular expression evaulation context
2448 *
2449 * Free the structures associated to a regular expression evaulation context.
2450 */
2451void
2452xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
2453    if (exec == NULL)
2454	return;
2455
2456    if (exec->rollbacks != NULL) {
2457	if (exec->counts != NULL) {
2458	    int i;
2459
2460	    for (i = 0;i < exec->maxRollbacks;i++)
2461		if (exec->rollbacks[i].counts != NULL)
2462		    xmlFree(exec->rollbacks[i].counts);
2463	}
2464	xmlFree(exec->rollbacks);
2465    }
2466    if (exec->counts != NULL)
2467	xmlFree(exec->counts);
2468    if (exec->inputStack != NULL) {
2469	int i;
2470
2471	for (i = 0;i < exec->inputStackNr;i++) {
2472	    if (exec->inputStack[i].value != NULL)
2473		xmlFree(exec->inputStack[i].value);
2474	}
2475	xmlFree(exec->inputStack);
2476    }
2477    xmlFree(exec);
2478}
2479
2480static void
2481xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
2482	                    void *data) {
2483#ifdef DEBUG_PUSH
2484    printf("saving value: %d:%s\n", exec->inputStackNr, value);
2485#endif
2486    if (exec->inputStackMax == 0) {
2487	exec->inputStackMax = 4;
2488	exec->inputStack = (xmlRegInputTokenPtr)
2489	    xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
2490	if (exec->inputStack == NULL) {
2491	    xmlRegexpErrMemory(NULL, "pushing input string");
2492	    exec->inputStackMax = 0;
2493	    return;
2494	}
2495    } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
2496	xmlRegInputTokenPtr tmp;
2497
2498	exec->inputStackMax *= 2;
2499	tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
2500			exec->inputStackMax * sizeof(xmlRegInputToken));
2501	if (tmp == NULL) {
2502	    xmlRegexpErrMemory(NULL, "pushing input string");
2503	    exec->inputStackMax /= 2;
2504	    return;
2505	}
2506	exec->inputStack = tmp;
2507    }
2508    exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
2509    exec->inputStack[exec->inputStackNr].data = data;
2510    exec->inputStackNr++;
2511    exec->inputStack[exec->inputStackNr].value = NULL;
2512    exec->inputStack[exec->inputStackNr].data = NULL;
2513}
2514
2515
2516/**
2517 * xmlRegCompactPushString:
2518 * @exec: a regexp execution context
2519 * @comp:  the precompiled exec with a compact table
2520 * @value: a string token input
2521 * @data: data associated to the token to reuse in callbacks
2522 *
2523 * Push one input token in the execution context
2524 *
2525 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
2526 *     a negative value in case of error.
2527 */
2528static int
2529xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
2530	                xmlRegexpPtr comp,
2531	                const xmlChar *value,
2532	                void *data) {
2533    int state = exec->index;
2534    int i, target;
2535
2536    if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
2537	return(-1);
2538
2539    if (value == NULL) {
2540	/*
2541	 * are we at a final state ?
2542	 */
2543	if (comp->compact[state * (comp->nbstrings + 1)] ==
2544            XML_REGEXP_FINAL_STATE)
2545	    return(1);
2546	return(0);
2547    }
2548
2549#ifdef DEBUG_PUSH
2550    printf("value pushed: %s\n", value);
2551#endif
2552
2553    /*
2554     * Examine all outside transition from current state
2555     */
2556    for (i = 0;i < comp->nbstrings;i++) {
2557	target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
2558	if ((target > 0) && (target <= comp->nbstates)) {
2559	    target--; /* to avoid 0 */
2560	    if (xmlStrEqual(comp->stringMap[i], value)) {
2561		exec->index = target;
2562		if ((exec->callback != NULL) && (comp->transdata != NULL)) {
2563		    exec->callback(exec->data, value,
2564			  comp->transdata[state * comp->nbstrings + i], data);
2565		}
2566#ifdef DEBUG_PUSH
2567		printf("entering state %d\n", target);
2568#endif
2569		if (comp->compact[target * (comp->nbstrings + 1)] ==
2570		    XML_REGEXP_FINAL_STATE)
2571		    return(1);
2572		return(0);
2573	    }
2574	}
2575    }
2576    /*
2577     * Failed to find an exit transition out from current state for the
2578     * current token
2579     */
2580#ifdef DEBUG_PUSH
2581    printf("failed to find a transition for %s on state %d\n", value, state);
2582#endif
2583    exec->status = -1;
2584    return(-1);
2585}
2586
2587/**
2588 * xmlRegExecPushString:
2589 * @exec: a regexp execution context or NULL to indicate the end
2590 * @value: a string token input
2591 * @data: data associated to the token to reuse in callbacks
2592 *
2593 * Push one input token in the execution context
2594 *
2595 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
2596 *     a negative value in case of error.
2597 */
2598int
2599xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
2600	             void *data) {
2601    xmlRegTransPtr trans;
2602    xmlRegAtomPtr atom;
2603    int ret;
2604    int final = 0;
2605
2606    if (exec == NULL)
2607	return(-1);
2608    if (exec->comp == NULL)
2609	return(-1);
2610    if (exec->status != 0)
2611	return(exec->status);
2612
2613    if (exec->comp->compact != NULL)
2614	return(xmlRegCompactPushString(exec, exec->comp, value, data));
2615
2616    if (value == NULL) {
2617        if (exec->state->type == XML_REGEXP_FINAL_STATE)
2618	    return(1);
2619	final = 1;
2620    }
2621
2622#ifdef DEBUG_PUSH
2623    printf("value pushed: %s\n", value);
2624#endif
2625    /*
2626     * If we have an active rollback stack push the new value there
2627     * and get back to where we were left
2628     */
2629    if ((value != NULL) && (exec->inputStackNr > 0)) {
2630	xmlFARegExecSaveInputString(exec, value, data);
2631	value = exec->inputStack[exec->index].value;
2632	data = exec->inputStack[exec->index].data;
2633#ifdef DEBUG_PUSH
2634	printf("value loaded: %s\n", value);
2635#endif
2636    }
2637
2638    while ((exec->status == 0) &&
2639	   ((value != NULL) ||
2640	    ((final == 1) &&
2641	     (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
2642
2643	/*
2644	 * End of input on non-terminal state, rollback, however we may
2645	 * still have epsilon like transition for counted transitions
2646	 * on counters, in that case don't break too early.
2647	 */
2648	if ((value == NULL) && (exec->counts == NULL))
2649	    goto rollback;
2650
2651	exec->transcount = 0;
2652	for (;exec->transno < exec->state->nbTrans;exec->transno++) {
2653	    trans = &exec->state->trans[exec->transno];
2654	    if (trans->to < 0)
2655		continue;
2656	    atom = trans->atom;
2657	    ret = 0;
2658	    if (trans->count == REGEXP_ALL_LAX_COUNTER) {
2659		int i;
2660		int count;
2661		xmlRegTransPtr t;
2662		xmlRegCounterPtr counter;
2663
2664		ret = 0;
2665
2666#ifdef DEBUG_PUSH
2667		printf("testing all lax %d\n", trans->count);
2668#endif
2669		/*
2670		 * Check all counted transitions from the current state
2671		 */
2672		if ((value == NULL) && (final)) {
2673		    ret = 1;
2674		} else if (value != NULL) {
2675		    for (i = 0;i < exec->state->nbTrans;i++) {
2676			t = &exec->state->trans[i];
2677			if ((t->counter < 0) || (t == trans))
2678			    continue;
2679			counter = &exec->comp->counters[t->counter];
2680			count = exec->counts[t->counter];
2681			if ((count < counter->max) &&
2682		            (t->atom != NULL) &&
2683			    (xmlStrEqual(value, t->atom->valuep))) {
2684			    ret = 0;
2685			    break;
2686			}
2687			if ((count >= counter->min) &&
2688			    (count < counter->max) &&
2689			    (xmlStrEqual(value, t->atom->valuep))) {
2690			    ret = 1;
2691			    break;
2692			}
2693		    }
2694		}
2695	    } else if (trans->count == REGEXP_ALL_COUNTER) {
2696		int i;
2697		int count;
2698		xmlRegTransPtr t;
2699		xmlRegCounterPtr counter;
2700
2701		ret = 1;
2702
2703#ifdef DEBUG_PUSH
2704		printf("testing all %d\n", trans->count);
2705#endif
2706		/*
2707		 * Check all counted transitions from the current state
2708		 */
2709		for (i = 0;i < exec->state->nbTrans;i++) {
2710                    t = &exec->state->trans[i];
2711		    if ((t->counter < 0) || (t == trans))
2712			continue;
2713                    counter = &exec->comp->counters[t->counter];
2714		    count = exec->counts[t->counter];
2715		    if ((count < counter->min) || (count > counter->max)) {
2716			ret = 0;
2717			break;
2718		    }
2719		}
2720	    } else if (trans->count >= 0) {
2721		int count;
2722		xmlRegCounterPtr counter;
2723
2724		/*
2725		 * A counted transition.
2726		 */
2727
2728		count = exec->counts[trans->count];
2729		counter = &exec->comp->counters[trans->count];
2730#ifdef DEBUG_PUSH
2731		printf("testing count %d: val %d, min %d, max %d\n",
2732		       trans->count, count, counter->min,  counter->max);
2733#endif
2734		ret = ((count >= counter->min) && (count <= counter->max));
2735	    } else if (atom == NULL) {
2736		fprintf(stderr, "epsilon transition left at runtime\n");
2737		exec->status = -2;
2738		break;
2739	    } else if (value != NULL) {
2740		ret = xmlStrEqual(value, atom->valuep);
2741		if ((ret == 1) && (trans->counter >= 0)) {
2742		    xmlRegCounterPtr counter;
2743		    int count;
2744
2745		    count = exec->counts[trans->counter];
2746		    counter = &exec->comp->counters[trans->counter];
2747		    if (count >= counter->max)
2748			ret = 0;
2749		}
2750
2751		if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
2752		    xmlRegStatePtr to = exec->comp->states[trans->to];
2753
2754		    /*
2755		     * this is a multiple input sequence
2756		     */
2757		    if (exec->state->nbTrans > exec->transno + 1) {
2758			if (exec->inputStackNr <= 0) {
2759			    xmlFARegExecSaveInputString(exec, value, data);
2760			}
2761			xmlFARegExecSave(exec);
2762		    }
2763		    exec->transcount = 1;
2764		    do {
2765			/*
2766			 * Try to progress as much as possible on the input
2767			 */
2768			if (exec->transcount == atom->max) {
2769			    break;
2770			}
2771			exec->index++;
2772			value = exec->inputStack[exec->index].value;
2773			data = exec->inputStack[exec->index].data;
2774#ifdef DEBUG_PUSH
2775			printf("value loaded: %s\n", value);
2776#endif
2777
2778			/*
2779			 * End of input: stop here
2780			 */
2781			if (value == NULL) {
2782			    exec->index --;
2783			    break;
2784			}
2785			if (exec->transcount >= atom->min) {
2786			    int transno = exec->transno;
2787			    xmlRegStatePtr state = exec->state;
2788
2789			    /*
2790			     * The transition is acceptable save it
2791			     */
2792			    exec->transno = -1; /* trick */
2793			    exec->state = to;
2794			    if (exec->inputStackNr <= 0) {
2795				xmlFARegExecSaveInputString(exec, value, data);
2796			    }
2797			    xmlFARegExecSave(exec);
2798			    exec->transno = transno;
2799			    exec->state = state;
2800			}
2801			ret = xmlStrEqual(value, atom->valuep);
2802			exec->transcount++;
2803		    } while (ret == 1);
2804		    if (exec->transcount < atom->min)
2805			ret = 0;
2806
2807		    /*
2808		     * If the last check failed but one transition was found
2809		     * possible, rollback
2810		     */
2811		    if (ret < 0)
2812			ret = 0;
2813		    if (ret == 0) {
2814			goto rollback;
2815		    }
2816		}
2817	    }
2818	    if (ret == 1) {
2819		if ((exec->callback != NULL) && (atom != NULL) &&
2820			(data != NULL)) {
2821		    exec->callback(exec->data, atom->valuep,
2822			           atom->data, data);
2823		}
2824		if (exec->state->nbTrans > exec->transno + 1) {
2825		    if (exec->inputStackNr <= 0) {
2826			xmlFARegExecSaveInputString(exec, value, data);
2827		    }
2828		    xmlFARegExecSave(exec);
2829		}
2830		if (trans->counter >= 0) {
2831#ifdef DEBUG_PUSH
2832		    printf("Increasing count %d\n", trans->counter);
2833#endif
2834		    exec->counts[trans->counter]++;
2835		}
2836#ifdef DEBUG_PUSH
2837		printf("entering state %d\n", trans->to);
2838#endif
2839		exec->state = exec->comp->states[trans->to];
2840		exec->transno = 0;
2841		if (trans->atom != NULL) {
2842		    if (exec->inputStack != NULL) {
2843			exec->index++;
2844			if (exec->index < exec->inputStackNr) {
2845			    value = exec->inputStack[exec->index].value;
2846			    data = exec->inputStack[exec->index].data;
2847#ifdef DEBUG_PUSH
2848			    printf("value loaded: %s\n", value);
2849#endif
2850			} else {
2851			    value = NULL;
2852			    data = NULL;
2853#ifdef DEBUG_PUSH
2854			    printf("end of input\n");
2855#endif
2856			}
2857		    } else {
2858			value = NULL;
2859			data = NULL;
2860#ifdef DEBUG_PUSH
2861			printf("end of input\n");
2862#endif
2863		    }
2864		}
2865		goto progress;
2866	    } else if (ret < 0) {
2867		exec->status = -4;
2868		break;
2869	    }
2870	}
2871	if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
2872rollback:
2873	    /*
2874	     * Failed to find a way out
2875	     */
2876	    exec->determinist = 0;
2877	    xmlFARegExecRollBack(exec);
2878	    if (exec->status == 0) {
2879		value = exec->inputStack[exec->index].value;
2880		data = exec->inputStack[exec->index].data;
2881#ifdef DEBUG_PUSH
2882		printf("value loaded: %s\n", value);
2883#endif
2884	    }
2885	}
2886progress:
2887	continue;
2888    }
2889    if (exec->status == 0) {
2890        return(exec->state->type == XML_REGEXP_FINAL_STATE);
2891    }
2892    return(exec->status);
2893}
2894
2895/**
2896 * xmlRegExecPushString2:
2897 * @exec: a regexp execution context or NULL to indicate the end
2898 * @value: the first string token input
2899 * @value2: the second string token input
2900 * @data: data associated to the token to reuse in callbacks
2901 *
2902 * Push one input token in the execution context
2903 *
2904 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
2905 *     a negative value in case of error.
2906 */
2907int
2908xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
2909                      const xmlChar *value2, void *data) {
2910    xmlChar buf[150];
2911    int lenn, lenp, ret;
2912    xmlChar *str;
2913
2914    if (exec == NULL)
2915	return(-1);
2916    if (exec->comp == NULL)
2917	return(-1);
2918    if (exec->status != 0)
2919	return(exec->status);
2920
2921    if (value2 == NULL)
2922        return(xmlRegExecPushString(exec, value, data));
2923
2924    lenn = strlen((char *) value2);
2925    lenp = strlen((char *) value);
2926
2927    if (150 < lenn + lenp + 2) {
2928	str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
2929	if (str == NULL) {
2930	    exec->status = -1;
2931	    return(-1);
2932	}
2933    } else {
2934	str = buf;
2935    }
2936    memcpy(&str[0], value, lenp);
2937    str[lenp] = '|';
2938    memcpy(&str[lenp + 1], value2, lenn);
2939    str[lenn + lenp + 1] = 0;
2940
2941    if (exec->comp->compact != NULL)
2942	ret = xmlRegCompactPushString(exec, exec->comp, str, data);
2943    else
2944        ret = xmlRegExecPushString(exec, str, data);
2945
2946    if (str != buf)
2947        xmlFree(buf);
2948    return(ret);
2949}
2950
2951#if 0
2952static int
2953xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
2954    xmlRegTransPtr trans;
2955    xmlRegAtomPtr atom;
2956    int ret;
2957    int codepoint, len;
2958
2959    if (exec == NULL)
2960	return(-1);
2961    if (exec->status != 0)
2962	return(exec->status);
2963
2964    while ((exec->status == 0) &&
2965	   ((exec->inputString[exec->index] != 0) ||
2966	    (exec->state->type != XML_REGEXP_FINAL_STATE))) {
2967
2968	/*
2969	 * End of input on non-terminal state, rollback, however we may
2970	 * still have epsilon like transition for counted transitions
2971	 * on counters, in that case don't break too early.
2972	 */
2973	if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
2974	    goto rollback;
2975
2976	exec->transcount = 0;
2977	for (;exec->transno < exec->state->nbTrans;exec->transno++) {
2978	    trans = &exec->state->trans[exec->transno];
2979	    if (trans->to < 0)
2980		continue;
2981	    atom = trans->atom;
2982	    ret = 0;
2983	    if (trans->count >= 0) {
2984		int count;
2985		xmlRegCounterPtr counter;
2986
2987		/*
2988		 * A counted transition.
2989		 */
2990
2991		count = exec->counts[trans->count];
2992		counter = &exec->comp->counters[trans->count];
2993#ifdef DEBUG_REGEXP_EXEC
2994		printf("testing count %d: val %d, min %d, max %d\n",
2995		       trans->count, count, counter->min,  counter->max);
2996#endif
2997		ret = ((count >= counter->min) && (count <= counter->max));
2998	    } else if (atom == NULL) {
2999		fprintf(stderr, "epsilon transition left at runtime\n");
3000		exec->status = -2;
3001		break;
3002	    } else if (exec->inputString[exec->index] != 0) {
3003                codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
3004		ret = xmlRegCheckCharacter(atom, codepoint);
3005		if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
3006		    xmlRegStatePtr to = exec->comp->states[trans->to];
3007
3008		    /*
3009		     * this is a multiple input sequence
3010		     */
3011		    if (exec->state->nbTrans > exec->transno + 1) {
3012			xmlFARegExecSave(exec);
3013		    }
3014		    exec->transcount = 1;
3015		    do {
3016			/*
3017			 * Try to progress as much as possible on the input
3018			 */
3019			if (exec->transcount == atom->max) {
3020			    break;
3021			}
3022			exec->index += len;
3023			/*
3024			 * End of input: stop here
3025			 */
3026			if (exec->inputString[exec->index] == 0) {
3027			    exec->index -= len;
3028			    break;
3029			}
3030			if (exec->transcount >= atom->min) {
3031			    int transno = exec->transno;
3032			    xmlRegStatePtr state = exec->state;
3033
3034			    /*
3035			     * The transition is acceptable save it
3036			     */
3037			    exec->transno = -1; /* trick */
3038			    exec->state = to;
3039			    xmlFARegExecSave(exec);
3040			    exec->transno = transno;
3041			    exec->state = state;
3042			}
3043			codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
3044				              len);
3045			ret = xmlRegCheckCharacter(atom, codepoint);
3046			exec->transcount++;
3047		    } while (ret == 1);
3048		    if (exec->transcount < atom->min)
3049			ret = 0;
3050
3051		    /*
3052		     * If the last check failed but one transition was found
3053		     * possible, rollback
3054		     */
3055		    if (ret < 0)
3056			ret = 0;
3057		    if (ret == 0) {
3058			goto rollback;
3059		    }
3060		}
3061	    }
3062	    if (ret == 1) {
3063		if (exec->state->nbTrans > exec->transno + 1) {
3064		    xmlFARegExecSave(exec);
3065		}
3066		if (trans->counter >= 0) {
3067#ifdef DEBUG_REGEXP_EXEC
3068		    printf("Increasing count %d\n", trans->counter);
3069#endif
3070		    exec->counts[trans->counter]++;
3071		}
3072#ifdef DEBUG_REGEXP_EXEC
3073		printf("entering state %d\n", trans->to);
3074#endif
3075		exec->state = exec->comp->states[trans->to];
3076		exec->transno = 0;
3077		if (trans->atom != NULL) {
3078		    exec->index += len;
3079		}
3080		goto progress;
3081	    } else if (ret < 0) {
3082		exec->status = -4;
3083		break;
3084	    }
3085	}
3086	if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3087rollback:
3088	    /*
3089	     * Failed to find a way out
3090	     */
3091	    exec->determinist = 0;
3092	    xmlFARegExecRollBack(exec);
3093	}
3094progress:
3095	continue;
3096    }
3097}
3098#endif
3099/************************************************************************
3100 * 									*
3101 *	Parser for the Shemas Datatype Regular Expressions		*
3102 *	http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs	*
3103 * 									*
3104 ************************************************************************/
3105
3106/**
3107 * xmlFAIsChar:
3108 * @ctxt:  a regexp parser context
3109 *
3110 * [10]   Char   ::=   [^.\?*+()|#x5B#x5D]
3111 */
3112static int
3113xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
3114    int cur;
3115    int len;
3116
3117    cur = CUR_SCHAR(ctxt->cur, len);
3118    if ((cur == '.') || (cur == '\\') || (cur == '?') ||
3119	(cur == '*') || (cur == '+') || (cur == '(') ||
3120	(cur == ')') || (cur == '|') || (cur == 0x5B) ||
3121	(cur == 0x5D) || (cur == 0))
3122	return(-1);
3123    return(cur);
3124}
3125
3126/**
3127 * xmlFAParseCharProp:
3128 * @ctxt:  a regexp parser context
3129 *
3130 * [27]   charProp   ::=   IsCategory | IsBlock
3131 * [28]   IsCategory ::= Letters | Marks | Numbers | Punctuation |
3132 *                       Separators | Symbols | Others
3133 * [29]   Letters   ::=   'L' [ultmo]?
3134 * [30]   Marks   ::=   'M' [nce]?
3135 * [31]   Numbers   ::=   'N' [dlo]?
3136 * [32]   Punctuation   ::=   'P' [cdseifo]?
3137 * [33]   Separators   ::=   'Z' [slp]?
3138 * [34]   Symbols   ::=   'S' [mcko]?
3139 * [35]   Others   ::=   'C' [cfon]?
3140 * [36]   IsBlock   ::=   'Is' [a-zA-Z0-9#x2D]+
3141 */
3142static void
3143xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
3144    int cur;
3145    xmlRegAtomType type = (xmlRegAtomType) 0;
3146    xmlChar *blockName = NULL;
3147
3148    cur = CUR;
3149    if (cur == 'L') {
3150	NEXT;
3151	cur = CUR;
3152	if (cur == 'u') {
3153	    NEXT;
3154	    type = XML_REGEXP_LETTER_UPPERCASE;
3155	} else if (cur == 'l') {
3156	    NEXT;
3157	    type = XML_REGEXP_LETTER_LOWERCASE;
3158	} else if (cur == 't') {
3159	    NEXT;
3160	    type = XML_REGEXP_LETTER_TITLECASE;
3161	} else if (cur == 'm') {
3162	    NEXT;
3163	    type = XML_REGEXP_LETTER_MODIFIER;
3164	} else if (cur == 'o') {
3165	    NEXT;
3166	    type = XML_REGEXP_LETTER_OTHERS;
3167	} else {
3168	    type = XML_REGEXP_LETTER;
3169	}
3170    } else if (cur == 'M') {
3171	NEXT;
3172	cur = CUR;
3173	if (cur == 'n') {
3174	    NEXT;
3175	    /* nonspacing */
3176	    type = XML_REGEXP_MARK_NONSPACING;
3177	} else if (cur == 'c') {
3178	    NEXT;
3179	    /* spacing combining */
3180	    type = XML_REGEXP_MARK_SPACECOMBINING;
3181	} else if (cur == 'e') {
3182	    NEXT;
3183	    /* enclosing */
3184	    type = XML_REGEXP_MARK_ENCLOSING;
3185	} else {
3186	    /* all marks */
3187	    type = XML_REGEXP_MARK;
3188	}
3189    } else if (cur == 'N') {
3190	NEXT;
3191	cur = CUR;
3192	if (cur == 'd') {
3193	    NEXT;
3194	    /* digital */
3195	    type = XML_REGEXP_NUMBER_DECIMAL;
3196	} else if (cur == 'l') {
3197	    NEXT;
3198	    /* letter */
3199	    type = XML_REGEXP_NUMBER_LETTER;
3200	} else if (cur == 'o') {
3201	    NEXT;
3202	    /* other */
3203	    type = XML_REGEXP_NUMBER_OTHERS;
3204	} else {
3205	    /* all numbers */
3206	    type = XML_REGEXP_NUMBER;
3207	}
3208    } else if (cur == 'P') {
3209	NEXT;
3210	cur = CUR;
3211	if (cur == 'c') {
3212	    NEXT;
3213	    /* connector */
3214	    type = XML_REGEXP_PUNCT_CONNECTOR;
3215	} else if (cur == 'd') {
3216	    NEXT;
3217	    /* dash */
3218	    type = XML_REGEXP_PUNCT_DASH;
3219	} else if (cur == 's') {
3220	    NEXT;
3221	    /* open */
3222	    type = XML_REGEXP_PUNCT_OPEN;
3223	} else if (cur == 'e') {
3224	    NEXT;
3225	    /* close */
3226	    type = XML_REGEXP_PUNCT_CLOSE;
3227	} else if (cur == 'i') {
3228	    NEXT;
3229	    /* initial quote */
3230	    type = XML_REGEXP_PUNCT_INITQUOTE;
3231	} else if (cur == 'f') {
3232	    NEXT;
3233	    /* final quote */
3234	    type = XML_REGEXP_PUNCT_FINQUOTE;
3235	} else if (cur == 'o') {
3236	    NEXT;
3237	    /* other */
3238	    type = XML_REGEXP_PUNCT_OTHERS;
3239	} else {
3240	    /* all punctuation */
3241	    type = XML_REGEXP_PUNCT;
3242	}
3243    } else if (cur == 'Z') {
3244	NEXT;
3245	cur = CUR;
3246	if (cur == 's') {
3247	    NEXT;
3248	    /* space */
3249	    type = XML_REGEXP_SEPAR_SPACE;
3250	} else if (cur == 'l') {
3251	    NEXT;
3252	    /* line */
3253	    type = XML_REGEXP_SEPAR_LINE;
3254	} else if (cur == 'p') {
3255	    NEXT;
3256	    /* paragraph */
3257	    type = XML_REGEXP_SEPAR_PARA;
3258	} else {
3259	    /* all separators */
3260	    type = XML_REGEXP_SEPAR;
3261	}
3262    } else if (cur == 'S') {
3263	NEXT;
3264	cur = CUR;
3265	if (cur == 'm') {
3266	    NEXT;
3267	    type = XML_REGEXP_SYMBOL_MATH;
3268	    /* math */
3269	} else if (cur == 'c') {
3270	    NEXT;
3271	    type = XML_REGEXP_SYMBOL_CURRENCY;
3272	    /* currency */
3273	} else if (cur == 'k') {
3274	    NEXT;
3275	    type = XML_REGEXP_SYMBOL_MODIFIER;
3276	    /* modifiers */
3277	} else if (cur == 'o') {
3278	    NEXT;
3279	    type = XML_REGEXP_SYMBOL_OTHERS;
3280	    /* other */
3281	} else {
3282	    /* all symbols */
3283	    type = XML_REGEXP_SYMBOL;
3284	}
3285    } else if (cur == 'C') {
3286	NEXT;
3287	cur = CUR;
3288	if (cur == 'c') {
3289	    NEXT;
3290	    /* control */
3291	    type = XML_REGEXP_OTHER_CONTROL;
3292	} else if (cur == 'f') {
3293	    NEXT;
3294	    /* format */
3295	    type = XML_REGEXP_OTHER_FORMAT;
3296	} else if (cur == 'o') {
3297	    NEXT;
3298	    /* private use */
3299	    type = XML_REGEXP_OTHER_PRIVATE;
3300	} else if (cur == 'n') {
3301	    NEXT;
3302	    /* not assigned */
3303	    type = XML_REGEXP_OTHER_NA;
3304	} else {
3305	    /* all others */
3306	    type = XML_REGEXP_OTHER;
3307	}
3308    } else if (cur == 'I') {
3309	const xmlChar *start;
3310	NEXT;
3311	cur = CUR;
3312	if (cur != 's') {
3313	    ERROR("IsXXXX expected");
3314	    return;
3315	}
3316	NEXT;
3317	start = ctxt->cur;
3318	cur = CUR;
3319	if (((cur >= 'a') && (cur <= 'z')) ||
3320	    ((cur >= 'A') && (cur <= 'Z')) ||
3321	    ((cur >= '0') && (cur <= '9')) ||
3322	    (cur == 0x2D)) {
3323	    NEXT;
3324	    cur = CUR;
3325	    while (((cur >= 'a') && (cur <= 'z')) ||
3326		((cur >= 'A') && (cur <= 'Z')) ||
3327		((cur >= '0') && (cur <= '9')) ||
3328		(cur == 0x2D)) {
3329		NEXT;
3330		cur = CUR;
3331	    }
3332	}
3333	type = XML_REGEXP_BLOCK_NAME;
3334	blockName = xmlStrndup(start, ctxt->cur - start);
3335    } else {
3336	ERROR("Unknown char property");
3337	return;
3338    }
3339    if (ctxt->atom == NULL) {
3340	ctxt->atom = xmlRegNewAtom(ctxt, type);
3341	if (ctxt->atom != NULL)
3342	    ctxt->atom->valuep = blockName;
3343    } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3344        xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3345		           type, 0, 0, blockName);
3346    }
3347}
3348
3349/**
3350 * xmlFAParseCharClassEsc:
3351 * @ctxt:  a regexp parser context
3352 *
3353 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
3354 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
3355 * [25] catEsc   ::=   '\p{' charProp '}'
3356 * [26] complEsc ::=   '\P{' charProp '}'
3357 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
3358 */
3359static void
3360xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
3361    int cur;
3362
3363    if (CUR == '.') {
3364	if (ctxt->atom == NULL) {
3365	    ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
3366	} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3367	    xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3368			       XML_REGEXP_ANYCHAR, 0, 0, NULL);
3369	}
3370	NEXT;
3371	return;
3372    }
3373    if (CUR != '\\') {
3374	ERROR("Escaped sequence: expecting \\");
3375	return;
3376    }
3377    NEXT;
3378    cur = CUR;
3379    if (cur == 'p') {
3380	NEXT;
3381	if (CUR != '{') {
3382	    ERROR("Expecting '{'");
3383	    return;
3384	}
3385	NEXT;
3386	xmlFAParseCharProp(ctxt);
3387	if (CUR != '}') {
3388	    ERROR("Expecting '}'");
3389	    return;
3390	}
3391	NEXT;
3392    } else if (cur == 'P') {
3393	NEXT;
3394	if (CUR != '{') {
3395	    ERROR("Expecting '{'");
3396	    return;
3397	}
3398	NEXT;
3399	xmlFAParseCharProp(ctxt);
3400	ctxt->atom->neg = 1;
3401	if (CUR != '}') {
3402	    ERROR("Expecting '}'");
3403	    return;
3404	}
3405	NEXT;
3406    } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
3407	(cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
3408	(cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
3409	(cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
3410	(cur == 0x5E)) {
3411	if (ctxt->atom == NULL) {
3412	    ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
3413	    if (ctxt->atom != NULL)
3414		ctxt->atom->codepoint = cur;
3415	} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3416	    xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3417			       XML_REGEXP_CHARVAL, cur, cur, NULL);
3418	}
3419	NEXT;
3420    } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
3421	(cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
3422	(cur == 'w') || (cur == 'W')) {
3423	xmlRegAtomType type = XML_REGEXP_ANYSPACE;
3424
3425	switch (cur) {
3426	    case 's':
3427		type = XML_REGEXP_ANYSPACE;
3428		break;
3429	    case 'S':
3430		type = XML_REGEXP_NOTSPACE;
3431		break;
3432	    case 'i':
3433		type = XML_REGEXP_INITNAME;
3434		break;
3435	    case 'I':
3436		type = XML_REGEXP_NOTINITNAME;
3437		break;
3438	    case 'c':
3439		type = XML_REGEXP_NAMECHAR;
3440		break;
3441	    case 'C':
3442		type = XML_REGEXP_NOTNAMECHAR;
3443		break;
3444	    case 'd':
3445		type = XML_REGEXP_DECIMAL;
3446		break;
3447	    case 'D':
3448		type = XML_REGEXP_NOTDECIMAL;
3449		break;
3450	    case 'w':
3451		type = XML_REGEXP_REALCHAR;
3452		break;
3453	    case 'W':
3454		type = XML_REGEXP_NOTREALCHAR;
3455		break;
3456	}
3457	NEXT;
3458	if (ctxt->atom == NULL) {
3459	    ctxt->atom = xmlRegNewAtom(ctxt, type);
3460	} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
3461	    xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3462			       type, 0, 0, NULL);
3463	}
3464    }
3465}
3466
3467/**
3468 * xmlFAParseCharRef:
3469 * @ctxt:  a regexp parser context
3470 *
3471 * [19]   XmlCharRef   ::=   ( '&#' [0-9]+ ';' ) | (' &#x' [0-9a-fA-F]+ ';' )
3472 */
3473static int
3474xmlFAParseCharRef(xmlRegParserCtxtPtr ctxt) {
3475    int ret = 0, cur;
3476
3477    if ((CUR != '&') || (NXT(1) != '#'))
3478	return(-1);
3479    NEXT;
3480    NEXT;
3481    cur = CUR;
3482    if (cur == 'x') {
3483	NEXT;
3484	cur = CUR;
3485	if (((cur >= '0') && (cur <= '9')) ||
3486	    ((cur >= 'a') && (cur <= 'f')) ||
3487	    ((cur >= 'A') && (cur <= 'F'))) {
3488	    while (((cur >= '0') && (cur <= '9')) ||
3489		   ((cur >= 'A') && (cur <= 'F'))) {
3490		if ((cur >= '0') && (cur <= '9'))
3491		    ret = ret * 16 + cur - '0';
3492		else if ((cur >= 'a') && (cur <= 'f'))
3493		    ret = ret * 16 + 10 + (cur - 'a');
3494		else
3495		    ret = ret * 16 + 10 + (cur - 'A');
3496		NEXT;
3497		cur = CUR;
3498	    }
3499	} else {
3500	    ERROR("Char ref: expecting [0-9A-F]");
3501	    return(-1);
3502	}
3503    } else {
3504	if ((cur >= '0') && (cur <= '9')) {
3505	    while ((cur >= '0') && (cur <= '9')) {
3506		ret = ret * 10 + cur - '0';
3507		NEXT;
3508		cur = CUR;
3509	    }
3510	} else {
3511	    ERROR("Char ref: expecting [0-9]");
3512	    return(-1);
3513	}
3514    }
3515    if (cur != ';') {
3516	ERROR("Char ref: expecting ';'");
3517	return(-1);
3518    } else {
3519	NEXT;
3520    }
3521    return(ret);
3522}
3523
3524/**
3525 * xmlFAParseCharRange:
3526 * @ctxt:  a regexp parser context
3527 *
3528 * [17]   charRange   ::=     seRange | XmlCharRef | XmlCharIncDash
3529 * [18]   seRange   ::=   charOrEsc '-' charOrEsc
3530 * [20]   charOrEsc   ::=   XmlChar | SingleCharEsc
3531 * [21]   XmlChar   ::=   [^\#x2D#x5B#x5D]
3532 * [22]   XmlCharIncDash   ::=   [^\#x5B#x5D]
3533 */
3534static void
3535xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
3536    int cur, len;
3537    int start = -1;
3538    int end = -1;
3539
3540    if ((CUR == '&') && (NXT(1) == '#')) {
3541	end = start = xmlFAParseCharRef(ctxt);
3542        xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3543	                   XML_REGEXP_CHARVAL, start, end, NULL);
3544	return;
3545    }
3546    cur = CUR;
3547    if (cur == '\\') {
3548	NEXT;
3549	cur = CUR;
3550	switch (cur) {
3551	    case 'n': start = 0xA; break;
3552	    case 'r': start = 0xD; break;
3553	    case 't': start = 0x9; break;
3554	    case '\\': case '|': case '.': case '-': case '^': case '?':
3555	    case '*': case '+': case '{': case '}': case '(': case ')':
3556	    case '[': case ']':
3557		start = cur; break;
3558	    default:
3559		ERROR("Invalid escape value");
3560		return;
3561	}
3562	end = start;
3563        len = 1;
3564    } else if ((cur != 0x5B) && (cur != 0x5D)) {
3565        end = start = CUR_SCHAR(ctxt->cur, len);
3566    } else {
3567	ERROR("Expecting a char range");
3568	return;
3569    }
3570    NEXTL(len);
3571    if (start == '-') {
3572	return;
3573    }
3574    cur = CUR;
3575    if (cur != '-') {
3576        xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3577		              XML_REGEXP_CHARVAL, start, end, NULL);
3578	return;
3579    }
3580    NEXT;
3581    cur = CUR;
3582    if (cur == '\\') {
3583	NEXT;
3584	cur = CUR;
3585	switch (cur) {
3586	    case 'n': end = 0xA; break;
3587	    case 'r': end = 0xD; break;
3588	    case 't': end = 0x9; break;
3589	    case '\\': case '|': case '.': case '-': case '^': case '?':
3590	    case '*': case '+': case '{': case '}': case '(': case ')':
3591	    case '[': case ']':
3592		end = cur; break;
3593	    default:
3594		ERROR("Invalid escape value");
3595		return;
3596	}
3597        len = 1;
3598    } else if ((cur != 0x5B) && (cur != 0x5D)) {
3599        end = CUR_SCHAR(ctxt->cur, len);
3600    } else {
3601	ERROR("Expecting the end of a char range");
3602	return;
3603    }
3604    NEXTL(len);
3605    /* TODO check that the values are acceptable character ranges for XML */
3606    if (end < start) {
3607	ERROR("End of range is before start of range");
3608    } else {
3609        xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
3610		           XML_REGEXP_CHARVAL, start, end, NULL);
3611    }
3612    return;
3613}
3614
3615/**
3616 * xmlFAParsePosCharGroup:
3617 * @ctxt:  a regexp parser context
3618 *
3619 * [14]   posCharGroup ::= ( charRange | charClassEsc  )+
3620 */
3621static void
3622xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
3623    do {
3624	if ((CUR == '\\') || (CUR == '.')) {
3625	    xmlFAParseCharClassEsc(ctxt);
3626	} else {
3627	    xmlFAParseCharRange(ctxt);
3628	}
3629    } while ((CUR != ']') && (CUR != '^') && (CUR != '-') &&
3630	     (ctxt->error == 0));
3631}
3632
3633/**
3634 * xmlFAParseCharGroup:
3635 * @ctxt:  a regexp parser context
3636 *
3637 * [13]   charGroup    ::= posCharGroup | negCharGroup | charClassSub
3638 * [15]   negCharGroup ::= '^' posCharGroup
3639 * [16]   charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
3640 * [12]   charClassExpr ::= '[' charGroup ']'
3641 */
3642static void
3643xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
3644    int n = ctxt->neg;
3645    while ((CUR != ']') && (ctxt->error == 0)) {
3646	if (CUR == '^') {
3647	    int neg = ctxt->neg;
3648
3649	    NEXT;
3650	    ctxt->neg = !ctxt->neg;
3651	    xmlFAParsePosCharGroup(ctxt);
3652	    ctxt->neg = neg;
3653	} else if (CUR == '-') {
3654	    int neg = ctxt->neg;
3655	    NEXT;
3656	    ctxt->neg = 2;
3657	    if (CUR != '[') {
3658		ERROR("charClassExpr: '[' expected");
3659		break;
3660	    }
3661	    NEXT;
3662	    xmlFAParseCharGroup(ctxt);
3663	    if (CUR == ']') {
3664		NEXT;
3665	    } else {
3666		ERROR("charClassExpr: ']' expected");
3667		break;
3668	    }
3669	    ctxt->neg = neg;
3670	    break;
3671	} else if (CUR != ']') {
3672	    xmlFAParsePosCharGroup(ctxt);
3673	}
3674    }
3675    ctxt->neg = n;
3676}
3677
3678/**
3679 * xmlFAParseCharClass:
3680 * @ctxt:  a regexp parser context
3681 *
3682 * [11]   charClass   ::=     charClassEsc | charClassExpr
3683 * [12]   charClassExpr   ::=   '[' charGroup ']'
3684 */
3685static void
3686xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
3687    if (CUR == '[') {
3688	NEXT;
3689	ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
3690	if (ctxt->atom == NULL)
3691	    return;
3692	xmlFAParseCharGroup(ctxt);
3693	if (CUR == ']') {
3694	    NEXT;
3695	} else {
3696	    ERROR("xmlFAParseCharClass: ']' expected");
3697	}
3698    } else {
3699	xmlFAParseCharClassEsc(ctxt);
3700    }
3701}
3702
3703/**
3704 * xmlFAParseQuantExact:
3705 * @ctxt:  a regexp parser context
3706 *
3707 * [8]   QuantExact   ::=   [0-9]+
3708 *
3709 * Returns 0 if success or -1 in case of error
3710 */
3711static int
3712xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
3713    int ret = 0;
3714    int ok = 0;
3715
3716    while ((CUR >= '0') && (CUR <= '9')) {
3717	ret = ret * 10 + (CUR - '0');
3718	ok = 1;
3719	NEXT;
3720    }
3721    if (ok != 1) {
3722	return(-1);
3723    }
3724    return(ret);
3725}
3726
3727/**
3728 * xmlFAParseQuantifier:
3729 * @ctxt:  a regexp parser context
3730 *
3731 * [4]   quantifier   ::=   [?*+] | ( '{' quantity '}' )
3732 * [5]   quantity   ::=   quantRange | quantMin | QuantExact
3733 * [6]   quantRange   ::=   QuantExact ',' QuantExact
3734 * [7]   quantMin   ::=   QuantExact ','
3735 * [8]   QuantExact   ::=   [0-9]+
3736 */
3737static int
3738xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
3739    int cur;
3740
3741    cur = CUR;
3742    if ((cur == '?') || (cur == '*') || (cur == '+')) {
3743	if (ctxt->atom != NULL) {
3744	    if (cur == '?')
3745		ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
3746	    else if (cur == '*')
3747		ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
3748	    else if (cur == '+')
3749		ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
3750	}
3751	NEXT;
3752	return(1);
3753    }
3754    if (cur == '{') {
3755	int min = 0, max = 0;
3756
3757	NEXT;
3758	cur = xmlFAParseQuantExact(ctxt);
3759	if (cur >= 0)
3760	    min = cur;
3761	if (CUR == ',') {
3762	    NEXT;
3763	    if (CUR == '}')
3764	        max = INT_MAX;
3765	    else {
3766	        cur = xmlFAParseQuantExact(ctxt);
3767	        if (cur >= 0)
3768		    max = cur;
3769		else {
3770		    ERROR("Improper quantifier");
3771		}
3772	    }
3773	}
3774	if (CUR == '}') {
3775	    NEXT;
3776	} else {
3777	    ERROR("Unterminated quantifier");
3778	}
3779	if (max == 0)
3780	    max = min;
3781	if (ctxt->atom != NULL) {
3782	    ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
3783	    ctxt->atom->min = min;
3784	    ctxt->atom->max = max;
3785	}
3786	return(1);
3787    }
3788    return(0);
3789}
3790
3791/**
3792 * xmlFAParseAtom:
3793 * @ctxt:  a regexp parser context
3794 *
3795 * [9]   atom   ::=   Char | charClass | ( '(' regExp ')' )
3796 */
3797static int
3798xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
3799    int codepoint, len;
3800
3801    codepoint = xmlFAIsChar(ctxt);
3802    if (codepoint > 0) {
3803	ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
3804	if (ctxt->atom == NULL)
3805	    return(-1);
3806	codepoint = CUR_SCHAR(ctxt->cur, len);
3807	ctxt->atom->codepoint = codepoint;
3808	NEXTL(len);
3809	return(1);
3810    } else if (CUR == '|') {
3811	return(0);
3812    } else if (CUR == 0) {
3813	return(0);
3814    } else if (CUR == ')') {
3815	return(0);
3816    } else if (CUR == '(') {
3817	xmlRegStatePtr start, oldend;
3818
3819	NEXT;
3820	xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
3821	start = ctxt->state;
3822	oldend = ctxt->end;
3823	ctxt->end = NULL;
3824	ctxt->atom = NULL;
3825	xmlFAParseRegExp(ctxt, 0);
3826	if (CUR == ')') {
3827	    NEXT;
3828	} else {
3829	    ERROR("xmlFAParseAtom: expecting ')'");
3830	}
3831	ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
3832	if (ctxt->atom == NULL)
3833	    return(-1);
3834	ctxt->atom->start = start;
3835	ctxt->atom->stop = ctxt->state;
3836	ctxt->end = oldend;
3837	return(1);
3838    } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
3839	xmlFAParseCharClass(ctxt);
3840	return(1);
3841    }
3842    return(0);
3843}
3844
3845/**
3846 * xmlFAParsePiece:
3847 * @ctxt:  a regexp parser context
3848 *
3849 * [3]   piece   ::=   atom quantifier?
3850 */
3851static int
3852xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
3853    int ret;
3854
3855    ctxt->atom = NULL;
3856    ret = xmlFAParseAtom(ctxt);
3857    if (ret == 0)
3858	return(0);
3859    if (ctxt->atom == NULL) {
3860	ERROR("internal: no atom generated");
3861    }
3862    xmlFAParseQuantifier(ctxt);
3863    return(1);
3864}
3865
3866/**
3867 * xmlFAParseBranch:
3868 * @ctxt:  a regexp parser context
3869 * @first:  is taht the first
3870 *
3871 * [2]   branch   ::=   piece*
3872 8
3873 */
3874static int
3875xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, int first) {
3876    xmlRegStatePtr previous;
3877    xmlRegAtomPtr prevatom = NULL;
3878    int ret;
3879
3880    previous = ctxt->state;
3881    ret = xmlFAParsePiece(ctxt);
3882    if (ret != 0) {
3883	if (first) {
3884	    if (xmlFAGenerateTransitions(ctxt, previous, NULL, ctxt->atom) < 0)
3885	        return(-1);
3886	    previous = ctxt->state;
3887	} else {
3888	    prevatom = ctxt->atom;
3889	}
3890	ctxt->atom = NULL;
3891    }
3892    while ((ret != 0) && (ctxt->error == 0)) {
3893	ret = xmlFAParsePiece(ctxt);
3894	if (ret != 0) {
3895	    if (first) {
3896		if (xmlFAGenerateTransitions(ctxt, previous, NULL,
3897		                             ctxt->atom) < 0)
3898		    return(-1);
3899	    } else {
3900		if (xmlFAGenerateTransitions(ctxt, previous, NULL,
3901		                             prevatom) < 0)
3902		    return(-1);
3903		prevatom = ctxt->atom;
3904	    }
3905	    previous = ctxt->state;
3906	    ctxt->atom = NULL;
3907	}
3908    }
3909    if (!first) {
3910	if (xmlFAGenerateTransitions(ctxt, previous, ctxt->end, prevatom) < 0)
3911	    return(-1);
3912    }
3913    return(0);
3914}
3915
3916/**
3917 * xmlFAParseRegExp:
3918 * @ctxt:  a regexp parser context
3919 * @top:  is that the top-level expressions ?
3920 *
3921 * [1]   regExp   ::=     branch  ( '|' branch )*
3922 */
3923static void
3924xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
3925    xmlRegStatePtr start, end, oldend;
3926
3927    oldend = ctxt->end;
3928
3929    start = ctxt->state;
3930    xmlFAParseBranch(ctxt, (ctxt->end == NULL));
3931    if (CUR != '|') {
3932	ctxt->end = ctxt->state;
3933	return;
3934    }
3935    end = ctxt->state;
3936    while ((CUR == '|') && (ctxt->error == 0)) {
3937	NEXT;
3938	ctxt->state = start;
3939	ctxt->end = end;
3940	xmlFAParseBranch(ctxt, 0);
3941    }
3942    if (!top)
3943	ctxt->end = oldend;
3944}
3945
3946/************************************************************************
3947 * 									*
3948 * 			The basic API					*
3949 * 									*
3950 ************************************************************************/
3951
3952/**
3953 * xmlRegexpPrint:
3954 * @output: the file for the output debug
3955 * @regexp: the compiled regexp
3956 *
3957 * Print the content of the compiled regular expression
3958 */
3959void
3960xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
3961    int i;
3962
3963    fprintf(output, " regexp: ");
3964    if (regexp == NULL) {
3965	fprintf(output, "NULL\n");
3966	return;
3967    }
3968    fprintf(output, "'%s' ", regexp->string);
3969    fprintf(output, "\n");
3970    fprintf(output, "%d atoms:\n", regexp->nbAtoms);
3971    for (i = 0;i < regexp->nbAtoms; i++) {
3972	fprintf(output, " %02d ", i);
3973	xmlRegPrintAtom(output, regexp->atoms[i]);
3974    }
3975    fprintf(output, "%d states:", regexp->nbStates);
3976    fprintf(output, "\n");
3977    for (i = 0;i < regexp->nbStates; i++) {
3978	xmlRegPrintState(output, regexp->states[i]);
3979    }
3980    fprintf(output, "%d counters:\n", regexp->nbCounters);
3981    for (i = 0;i < regexp->nbCounters; i++) {
3982	fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
3983		                                regexp->counters[i].max);
3984    }
3985}
3986
3987/**
3988 * xmlRegexpCompile:
3989 * @regexp:  a regular expression string
3990 *
3991 * Parses a regular expression conforming to XML Schemas Part 2 Datatype
3992 * Appendix F and build an automata suitable for testing strings against
3993 * that regular expression
3994 *
3995 * Returns the compiled expression or NULL in case of error
3996 */
3997xmlRegexpPtr
3998xmlRegexpCompile(const xmlChar *regexp) {
3999    xmlRegexpPtr ret;
4000    xmlRegParserCtxtPtr ctxt;
4001
4002    ctxt = xmlRegNewParserCtxt(regexp);
4003    if (ctxt == NULL)
4004	return(NULL);
4005
4006    /* initialize the parser */
4007    ctxt->end = NULL;
4008    ctxt->start = ctxt->state = xmlRegNewState(ctxt);
4009    xmlRegStatePush(ctxt, ctxt->start);
4010
4011    /* parse the expression building an automata */
4012    xmlFAParseRegExp(ctxt, 1);
4013    if (CUR != 0) {
4014	ERROR("xmlFAParseRegExp: extra characters");
4015    }
4016    ctxt->end = ctxt->state;
4017    ctxt->start->type = XML_REGEXP_START_STATE;
4018    ctxt->end->type = XML_REGEXP_FINAL_STATE;
4019
4020    /* remove the Epsilon except for counted transitions */
4021    xmlFAEliminateEpsilonTransitions(ctxt);
4022
4023
4024    if (ctxt->error != 0) {
4025	xmlRegFreeParserCtxt(ctxt);
4026	return(NULL);
4027    }
4028    ret = xmlRegEpxFromParse(ctxt);
4029    xmlRegFreeParserCtxt(ctxt);
4030    return(ret);
4031}
4032
4033/**
4034 * xmlRegexpExec:
4035 * @comp:  the compiled regular expression
4036 * @content:  the value to check against the regular expression
4037 *
4038 * Check if the regular expression generate the value
4039 *
4040 * Returns 1 if it matches, 0 if not and a negativa value in case of error
4041 */
4042int
4043xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
4044    if ((comp == NULL) || (content == NULL))
4045	return(-1);
4046    return(xmlFARegExec(comp, content));
4047}
4048
4049/**
4050 * xmlRegexpIsDeterminist:
4051 * @comp:  the compiled regular expression
4052 *
4053 * Check if the regular expression is determinist
4054 *
4055 * Returns 1 if it yes, 0 if not and a negativa value in case of error
4056 */
4057int
4058xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
4059    xmlAutomataPtr am;
4060    int ret;
4061
4062    if (comp == NULL)
4063	return(-1);
4064    if (comp->determinist != -1)
4065	return(comp->determinist);
4066
4067    am = xmlNewAutomata();
4068    if (am->states != NULL) {
4069	int i;
4070
4071	for (i = 0;i < am->nbStates;i++)
4072	    xmlRegFreeState(am->states[i]);
4073	xmlFree(am->states);
4074    }
4075    am->nbAtoms = comp->nbAtoms;
4076    am->atoms = comp->atoms;
4077    am->nbStates = comp->nbStates;
4078    am->states = comp->states;
4079    am->determinist = -1;
4080    ret = xmlFAComputesDeterminism(am);
4081    am->atoms = NULL;
4082    am->states = NULL;
4083    xmlFreeAutomata(am);
4084    return(ret);
4085}
4086
4087/**
4088 * xmlRegFreeRegexp:
4089 * @regexp:  the regexp
4090 *
4091 * Free a regexp
4092 */
4093void
4094xmlRegFreeRegexp(xmlRegexpPtr regexp) {
4095    int i;
4096    if (regexp == NULL)
4097	return;
4098
4099    if (regexp->string != NULL)
4100	xmlFree(regexp->string);
4101    if (regexp->states != NULL) {
4102	for (i = 0;i < regexp->nbStates;i++)
4103	    xmlRegFreeState(regexp->states[i]);
4104	xmlFree(regexp->states);
4105    }
4106    if (regexp->atoms != NULL) {
4107	for (i = 0;i < regexp->nbAtoms;i++)
4108	    xmlRegFreeAtom(regexp->atoms[i]);
4109	xmlFree(regexp->atoms);
4110    }
4111    if (regexp->counters != NULL)
4112	xmlFree(regexp->counters);
4113    if (regexp->compact != NULL)
4114	xmlFree(regexp->compact);
4115    if (regexp->transdata != NULL)
4116	xmlFree(regexp->transdata);
4117    if (regexp->stringMap != NULL) {
4118	for (i = 0; i < regexp->nbstrings;i++)
4119	    xmlFree(regexp->stringMap[i]);
4120	xmlFree(regexp->stringMap);
4121    }
4122
4123    xmlFree(regexp);
4124}
4125
4126#ifdef LIBXML_AUTOMATA_ENABLED
4127/************************************************************************
4128 * 									*
4129 * 			The Automata interface				*
4130 * 									*
4131 ************************************************************************/
4132
4133/**
4134 * xmlNewAutomata:
4135 *
4136 * Create a new automata
4137 *
4138 * Returns the new object or NULL in case of failure
4139 */
4140xmlAutomataPtr
4141xmlNewAutomata(void) {
4142    xmlAutomataPtr ctxt;
4143
4144    ctxt = xmlRegNewParserCtxt(NULL);
4145    if (ctxt == NULL)
4146	return(NULL);
4147
4148    /* initialize the parser */
4149    ctxt->end = NULL;
4150    ctxt->start = ctxt->state = xmlRegNewState(ctxt);
4151    if (ctxt->start == NULL) {
4152	xmlFreeAutomata(ctxt);
4153	return(NULL);
4154    }
4155    if (xmlRegStatePush(ctxt, ctxt->start) < 0) {
4156        xmlRegFreeState(ctxt->start);
4157	xmlFreeAutomata(ctxt);
4158	return(NULL);
4159    }
4160
4161    return(ctxt);
4162}
4163
4164/**
4165 * xmlFreeAutomata:
4166 * @am: an automata
4167 *
4168 * Free an automata
4169 */
4170void
4171xmlFreeAutomata(xmlAutomataPtr am) {
4172    if (am == NULL)
4173	return;
4174    xmlRegFreeParserCtxt(am);
4175}
4176
4177/**
4178 * xmlAutomataGetInitState:
4179 * @am: an automata
4180 *
4181 * Initial state lookup
4182 *
4183 * Returns the initial state of the automata
4184 */
4185xmlAutomataStatePtr
4186xmlAutomataGetInitState(xmlAutomataPtr am) {
4187    if (am == NULL)
4188	return(NULL);
4189    return(am->start);
4190}
4191
4192/**
4193 * xmlAutomataSetFinalState:
4194 * @am: an automata
4195 * @state: a state in this automata
4196 *
4197 * Makes that state a final state
4198 *
4199 * Returns 0 or -1 in case of error
4200 */
4201int
4202xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
4203    if ((am == NULL) || (state == NULL))
4204	return(-1);
4205    state->type = XML_REGEXP_FINAL_STATE;
4206    return(0);
4207}
4208
4209/**
4210 * xmlAutomataNewTransition:
4211 * @am: an automata
4212 * @from: the starting point of the transition
4213 * @to: the target point of the transition or NULL
4214 * @token: the input string associated to that transition
4215 * @data: data passed to the callback function if the transition is activated
4216 *
4217 * If @to is NULL, this create first a new target state in the automata
4218 * and then adds a transition from the @from state to the target state
4219 * activated by the value of @token
4220 *
4221 * Returns the target state or NULL in case of error
4222 */
4223xmlAutomataStatePtr
4224xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
4225			 xmlAutomataStatePtr to, const xmlChar *token,
4226			 void *data) {
4227    xmlRegAtomPtr atom;
4228
4229    if ((am == NULL) || (from == NULL) || (token == NULL))
4230	return(NULL);
4231    atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4232    if (atom == NULL)
4233        return(NULL);
4234    atom->data = data;
4235    if (atom == NULL)
4236	return(NULL);
4237    atom->valuep = xmlStrdup(token);
4238
4239    if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
4240        xmlRegFreeAtom(atom);
4241	return(NULL);
4242    }
4243    if (to == NULL)
4244	return(am->state);
4245    return(to);
4246}
4247
4248/**
4249 * xmlAutomataNewTransition2:
4250 * @am: an automata
4251 * @from: the starting point of the transition
4252 * @to: the target point of the transition or NULL
4253 * @token: the first input string associated to that transition
4254 * @token2: the second input string associated to that transition
4255 * @data: data passed to the callback function if the transition is activated
4256 *
4257 * If @to is NULL, this create first a new target state in the automata
4258 * and then adds a transition from the @from state to the target state
4259 * activated by the value of @token
4260 *
4261 * Returns the target state or NULL in case of error
4262 */
4263xmlAutomataStatePtr
4264xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
4265			  xmlAutomataStatePtr to, const xmlChar *token,
4266			  const xmlChar *token2, void *data) {
4267    xmlRegAtomPtr atom;
4268
4269    if ((am == NULL) || (from == NULL) || (token == NULL))
4270	return(NULL);
4271    atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4272    atom->data = data;
4273    if (atom == NULL)
4274	return(NULL);
4275    if ((token2 == NULL) || (*token2 == 0)) {
4276	atom->valuep = xmlStrdup(token);
4277    } else {
4278	int lenn, lenp;
4279	xmlChar *str;
4280
4281	lenn = strlen((char *) token2);
4282	lenp = strlen((char *) token);
4283
4284	str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
4285	if (str == NULL) {
4286	    xmlRegFreeAtom(atom);
4287	    return(NULL);
4288	}
4289	memcpy(&str[0], token, lenp);
4290	str[lenp] = '|';
4291	memcpy(&str[lenp + 1], token2, lenn);
4292	str[lenn + lenp + 1] = 0;
4293
4294	atom->valuep = str;
4295    }
4296
4297    if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
4298        xmlRegFreeAtom(atom);
4299	return(NULL);
4300    }
4301    if (to == NULL)
4302	return(am->state);
4303    return(to);
4304}
4305
4306/**
4307 * xmlAutomataNewCountTrans:
4308 * @am: an automata
4309 * @from: the starting point of the transition
4310 * @to: the target point of the transition or NULL
4311 * @token: the input string associated to that transition
4312 * @min:  the minimum successive occurences of token
4313 * @max:  the maximum successive occurences of token
4314 * @data:  data associated to the transition
4315 *
4316 * If @to is NULL, this create first a new target state in the automata
4317 * and then adds a transition from the @from state to the target state
4318 * activated by a succession of input of value @token and whose number
4319 * is between @min and @max
4320 *
4321 * Returns the target state or NULL in case of error
4322 */
4323xmlAutomataStatePtr
4324xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4325			 xmlAutomataStatePtr to, const xmlChar *token,
4326			 int min, int max, void *data) {
4327    xmlRegAtomPtr atom;
4328    int counter;
4329
4330    if ((am == NULL) || (from == NULL) || (token == NULL))
4331	return(NULL);
4332    if (min < 0)
4333	return(NULL);
4334    if ((max < min) || (max < 1))
4335	return(NULL);
4336    atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4337    if (atom == NULL)
4338	return(NULL);
4339    atom->valuep = xmlStrdup(token);
4340    atom->data = data;
4341    if (min == 0)
4342	atom->min = 1;
4343    else
4344	atom->min = min;
4345    atom->max = max;
4346
4347    /*
4348     * associate a counter to the transition.
4349     */
4350    counter = xmlRegGetCounter(am);
4351    am->counters[counter].min = min;
4352    am->counters[counter].max = max;
4353
4354    /* xmlFAGenerateTransitions(am, from, to, atom); */
4355    if (to == NULL) {
4356        to = xmlRegNewState(am);
4357	xmlRegStatePush(am, to);
4358    }
4359    xmlRegStateAddTrans(am, from, atom, to, counter, -1);
4360    xmlRegAtomPush(am, atom);
4361    am->state = to;
4362
4363    if (to == NULL)
4364	to = am->state;
4365    if (to == NULL)
4366	return(NULL);
4367    if (min == 0)
4368	xmlFAGenerateEpsilonTransition(am, from, to);
4369    return(to);
4370}
4371
4372/**
4373 * xmlAutomataNewOnceTrans:
4374 * @am: an automata
4375 * @from: the starting point of the transition
4376 * @to: the target point of the transition or NULL
4377 * @token: the input string associated to that transition
4378 * @min:  the minimum successive occurences of token
4379 * @max:  the maximum successive occurences of token
4380 * @data:  data associated to the transition
4381 *
4382 * If @to is NULL, this create first a new target state in the automata
4383 * and then adds a transition from the @from state to the target state
4384 * activated by a succession of input of value @token and whose number
4385 * is between @min and @max, moreover that transistion can only be crossed
4386 * once.
4387 *
4388 * Returns the target state or NULL in case of error
4389 */
4390xmlAutomataStatePtr
4391xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4392			 xmlAutomataStatePtr to, const xmlChar *token,
4393			 int min, int max, void *data) {
4394    xmlRegAtomPtr atom;
4395    int counter;
4396
4397    if ((am == NULL) || (from == NULL) || (token == NULL))
4398	return(NULL);
4399    if (min < 1)
4400	return(NULL);
4401    if ((max < min) || (max < 1))
4402	return(NULL);
4403    atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
4404    if (atom == NULL)
4405	return(NULL);
4406    atom->valuep = xmlStrdup(token);
4407    atom->data = data;
4408    atom->quant = XML_REGEXP_QUANT_ONCEONLY;
4409    if (min == 0)
4410	atom->min = 1;
4411    else
4412	atom->min = min;
4413    atom->max = max;
4414    /*
4415     * associate a counter to the transition.
4416     */
4417    counter = xmlRegGetCounter(am);
4418    am->counters[counter].min = 1;
4419    am->counters[counter].max = 1;
4420
4421    /* xmlFAGenerateTransitions(am, from, to, atom); */
4422    if (to == NULL) {
4423	to = xmlRegNewState(am);
4424	xmlRegStatePush(am, to);
4425    }
4426    xmlRegStateAddTrans(am, from, atom, to, counter, -1);
4427    xmlRegAtomPush(am, atom);
4428    am->state = to;
4429    if (to == NULL)
4430	to = am->state;
4431    if (to == NULL)
4432	return(NULL);
4433    return(to);
4434}
4435
4436/**
4437 * xmlAutomataNewState:
4438 * @am: an automata
4439 *
4440 * Create a new disconnected state in the automata
4441 *
4442 * Returns the new state or NULL in case of error
4443 */
4444xmlAutomataStatePtr
4445xmlAutomataNewState(xmlAutomataPtr am) {
4446    xmlAutomataStatePtr to;
4447
4448    if (am == NULL)
4449	return(NULL);
4450    to = xmlRegNewState(am);
4451    xmlRegStatePush(am, to);
4452    return(to);
4453}
4454
4455/**
4456 * xmlAutomataNewEpsilon:
4457 * @am: an automata
4458 * @from: the starting point of the transition
4459 * @to: the target point of the transition or NULL
4460 *
4461 * If @to is NULL, this create first a new target state in the automata
4462 * and then adds a an epsilon transition from the @from state to the
4463 * target state
4464 *
4465 * Returns the target state or NULL in case of error
4466 */
4467xmlAutomataStatePtr
4468xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
4469		      xmlAutomataStatePtr to) {
4470    if ((am == NULL) || (from == NULL))
4471	return(NULL);
4472    xmlFAGenerateEpsilonTransition(am, from, to);
4473    if (to == NULL)
4474	return(am->state);
4475    return(to);
4476}
4477
4478/**
4479 * xmlAutomataNewAllTrans:
4480 * @am: an automata
4481 * @from: the starting point of the transition
4482 * @to: the target point of the transition or NULL
4483 * @lax: allow to transition if not all all transitions have been activated
4484 *
4485 * If @to is NULL, this create first a new target state in the automata
4486 * and then adds a an ALL transition from the @from state to the
4487 * target state. That transition is an epsilon transition allowed only when
4488 * all transitions from the @from node have been activated.
4489 *
4490 * Returns the target state or NULL in case of error
4491 */
4492xmlAutomataStatePtr
4493xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4494		       xmlAutomataStatePtr to, int lax) {
4495    if ((am == NULL) || (from == NULL))
4496	return(NULL);
4497    xmlFAGenerateAllTransition(am, from, to, lax);
4498    if (to == NULL)
4499	return(am->state);
4500    return(to);
4501}
4502
4503/**
4504 * xmlAutomataNewCounter:
4505 * @am: an automata
4506 * @min:  the minimal value on the counter
4507 * @max:  the maximal value on the counter
4508 *
4509 * Create a new counter
4510 *
4511 * Returns the counter number or -1 in case of error
4512 */
4513int
4514xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
4515    int ret;
4516
4517    if (am == NULL)
4518	return(-1);
4519
4520    ret = xmlRegGetCounter(am);
4521    if (ret < 0)
4522	return(-1);
4523    am->counters[ret].min = min;
4524    am->counters[ret].max = max;
4525    return(ret);
4526}
4527
4528/**
4529 * xmlAutomataNewCountedTrans:
4530 * @am: an automata
4531 * @from: the starting point of the transition
4532 * @to: the target point of the transition or NULL
4533 * @counter: the counter associated to that transition
4534 *
4535 * If @to is NULL, this create first a new target state in the automata
4536 * and then adds an epsilon transition from the @from state to the target state
4537 * which will increment the counter provided
4538 *
4539 * Returns the target state or NULL in case of error
4540 */
4541xmlAutomataStatePtr
4542xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4543		xmlAutomataStatePtr to, int counter) {
4544    if ((am == NULL) || (from == NULL) || (counter < 0))
4545	return(NULL);
4546    xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
4547    if (to == NULL)
4548	return(am->state);
4549    return(to);
4550}
4551
4552/**
4553 * xmlAutomataNewCounterTrans:
4554 * @am: an automata
4555 * @from: the starting point of the transition
4556 * @to: the target point of the transition or NULL
4557 * @counter: the counter associated to that transition
4558 *
4559 * If @to is NULL, this create first a new target state in the automata
4560 * and then adds an epsilon transition from the @from state to the target state
4561 * which will be allowed only if the counter is within the right range.
4562 *
4563 * Returns the target state or NULL in case of error
4564 */
4565xmlAutomataStatePtr
4566xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
4567		xmlAutomataStatePtr to, int counter) {
4568    if ((am == NULL) || (from == NULL) || (counter < 0))
4569	return(NULL);
4570    xmlFAGenerateCountedTransition(am, from, to, counter);
4571    if (to == NULL)
4572	return(am->state);
4573    return(to);
4574}
4575
4576/**
4577 * xmlAutomataCompile:
4578 * @am: an automata
4579 *
4580 * Compile the automata into a Reg Exp ready for being executed.
4581 * The automata should be free after this point.
4582 *
4583 * Returns the compiled regexp or NULL in case of error
4584 */
4585xmlRegexpPtr
4586xmlAutomataCompile(xmlAutomataPtr am) {
4587    xmlRegexpPtr ret;
4588
4589    if ((am == NULL) || (am->error != 0)) return(NULL);
4590    xmlFAEliminateEpsilonTransitions(am);
4591    /* xmlFAComputesDeterminism(am); */
4592    ret = xmlRegEpxFromParse(am);
4593
4594    return(ret);
4595}
4596
4597/**
4598 * xmlAutomataIsDeterminist:
4599 * @am: an automata
4600 *
4601 * Checks if an automata is determinist.
4602 *
4603 * Returns 1 if true, 0 if not, and -1 in case of error
4604 */
4605int
4606xmlAutomataIsDeterminist(xmlAutomataPtr am) {
4607    int ret;
4608
4609    if (am == NULL)
4610	return(-1);
4611
4612    ret = xmlFAComputesDeterminism(am);
4613    return(ret);
4614}
4615#endif /* LIBXML_AUTOMATA_ENABLED */
4616#endif /* LIBXML_REGEXP_ENABLED */
4617